<a href="https://colab.research.google.com/github/INA-95/H-M-Personalized-Fashion-Recommendations/blob/main/Movie_Recommender_System_230310.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import json
import os
import pickle
from collections import defaultdict
from glob import glob
from typing import Any, Dict, Iterable, List, Tuple

import numpy as np
import pandas as pd
import requests
from sklearn.model_selection import train_test_split
from sortedcontainers import SortedList
from tqdm import tqdm

In [38]:
# Mount Google Drive (Colab only) and switch into the project's data directory,
# so that the relative paths used by the following cells (e.g. './*.csv')
# resolve against it.
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/Project/Side_Project/H&M_recommendation_system/data

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/Project/Side_Project/H&M_recommendation_system/data


In [39]:
# Collect every CSV file that sits directly in the current working directory.
csv_pattern = './*.csv'
files = glob(csv_pattern)

In [40]:
# Read each discovered CSV into a DataFrame, keyed by the path glob returned.
dfs = {}
for csv_path in files:
    dfs[csv_path] = pd.read_csv(csv_path)

In [41]:
# Work with the reduced ratings table only.
small_ratings_key = './df_small.csv'
df = dfs[small_ratings_key]

In [42]:
# Preview only: show the frame without the bookkeeping columns.
# The result is displayed, not assigned, so `df` itself is unchanged.
df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'movieId', 'timestamp'])

Unnamed: 0,userId,rating,movie_id_idx
0,5,4.5,0
1,5,2.5,9
2,5,3.5,18
3,5,5.0,29
4,5,4.5,33
...,...,...,...
5392020,2704,5.0,1730
5392021,2704,3.0,1769
5392022,2704,4.0,1772
5392023,2704,4.0,1780


In [43]:
# Keep just the three columns the recommender needs, in this order.
df = df.loc[:, ['userId', 'movie_id_idx', 'rating']]

In [44]:
# Information of Dataset: number of distinct users and distinct movies.

num_of_users, num_of_movies = (
    df[column].nunique() for column in ('userId', 'movie_id_idx')
)
print('num_of_users:', num_of_users)
print('num_of_movies:', num_of_movies)

num_of_users: 10000
num_of_movies: 2000


In [45]:
# Split into train (80%) and test (20%).
# A fixed random_state makes the shuffle reproducible, so that
# "Restart kernel -> Run all" yields the same split every time.

train, test = train_test_split(df, test_size = 0.2, random_state = 42)

In [46]:
# user_movie = {user_id : [movie1, movie2, ...]}
# movie_user = {movie_idx : [user1, user2, ...]}
# user_movie_rating = {(user, movie) : rating, (user2, movie2) : rating, ...}

def create_dict_1(target_1:List) -> Dict:
    """Group the second element of each pair under the first element.

    Args:
        target_1: iterable of ``(key, value)`` pairs.

    Returns:
        A ``defaultdict(list)`` mapping each key to the list of its values,
        in the order they were encountered. Looking up a missing key yields
        (and inserts) an empty list.
    """
    grouped = defaultdict(list)
    for key, value in target_1:
        bucket = grouped[key]
        bucket.append(value)
    return grouped

def create_dict_2(target_2: Iterable[Tuple[Any, Any, Any]]) -> Dict:
    """Build a ``{(first, second): third}`` lookup from an iterable of triples.

    Args:
        target_2: iterable of 3-tuples, e.g. ``zip(users, movies, ratings)``.
            It is consumed once, so a one-shot iterator such as ``zip`` is fine.

    Returns:
        A ``defaultdict(int)`` keyed by ``(first, second)``. If the same key
        occurs more than once, the last value wins. Looking up a missing key
        yields (and inserts) ``0``.
    """
    res_dict_2 = defaultdict(int)
    for v1, v2, v3 in target_2:
        res_dict_2[(v1, v2)] = v3
    return res_dict_2

In [47]:
# Pair up the columns of the ratings table.
# NOTE(review): these lookups are built from the full `df`, not from `train`,
# so they also contain the ratings that end up in `test` - confirm that this
# is intended before using them to evaluate predictions.
user_movie = [*zip(df.userId, df.movie_id_idx)]
movie_user = [*zip(df.movie_id_idx, df.userId)]
user_movie_rating = zip(df.userId, df.movie_id_idx, df.rating)

# user -> [movies], movie -> [users], (user, movie) -> rating
user_movie_dict = create_dict_1(user_movie)
movie_user_dict = create_dict_1(movie_user)
user_movie_rating_dict = create_dict_2(user_movie_rating)

In [48]:
# Attach to every training row the list of movies its user rated and
# the list of users who rated its movie.
train['user_movie'] = train['userId'].map(lambda user_id: user_movie_dict[user_id])
train['movie_user'] = train['movie_id_idx'].map(lambda movie_idx: movie_user_dict[movie_idx])

In [49]:
# One (user, movie) tuple per test row.
test['user_movie'] = [*zip(test['userId'], test['movie_id_idx'])]

In [50]:
# Look up the rating stored for each (user, movie) pair of the test rows.
test['user_movie_rating'] = test['user_movie'].map(lambda pair: user_movie_rating_dict[pair])

In [51]:
# (user, movie) -> rating lookup restricted to the test rows.
target_2 = zip(*(test[column] for column in ('userId', 'movie_id_idx', 'rating')))
test_uesr_movie_rating_dict = create_dict_2(target_2)

In [52]:
# Persist the lookup dicts to disk.
# Despite the '.json' extension these files are written with pickle (binary).

pickle_targets = {
    'user_movie_dict.json': user_movie_dict,
    'movie_user_dict.json': movie_user_dict,
    'user_movie_rating_dict.json': user_movie_rating_dict,
    'test_uesr_movie_rating_dict.json': test_uesr_movie_rating_dict,
}

for file_name, payload in pickle_targets.items():
    with open(file_name, 'wb') as f:
        pickle.dump(payload, f)

In [53]:
# Reload the pickled lookup dicts (files carry a '.json' extension but are pickles).

json_files = glob('./*.json')


def _load_pickle(path):
    """Return the object unpickled from the file at `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)


user_movie = _load_pickle('user_movie_dict.json')
movie_user = _load_pickle('movie_user_dict.json')
user_movie_rating = _load_pickle('user_movie_rating_dict.json')
test_user_movie_rating = _load_pickle('test_uesr_movie_rating_dict.json')

In [54]:
# N: one past the largest user id seen in the user -> movies lookup.
N = np.max(list(user_movie)) + 1

# M: one past the largest movie id seen in either the movie -> users lookup
# or the test (user, movie) -> rating lookup.
m1 = np.max(list(movie_user))
m2 = np.max([movie for (_, movie) in test_user_movie_rating])

M = max(m1, m2) + 1
print('N:', N, 'M:', M)

N: 10000 M: 2000


In [55]:
from sortedcontainers import SortedList


# Number of nearest neighbours kept per user.
K = 25

# Two users are only compared when they share MORE than `limit` movies.
limit = 5

# neighbors[i]: SortedList of (-correlation, other_user); at most K entries,
# best (highest correlation) first because the weight is stored negated.
neighbors = []

# averages[i]: mean rating of user i (NaN if the user has no ratings).
averages = []

# deviations[i]: {movie: rating - averages[i]} for user i.
deviations = []

# --- Pass 1: per-user statistics --------------------------------------------
# These depend on a single user only, so they are computed once here instead
# of being recomputed for every (i, j) pair inside the pairwise loop.
movie_sets = []  # movie_sets[i]: set of movies rated by user i
sigmas = []      # sigmas[i]: sqrt of the sum of squared deviations of user i

for i in range(N):
    # .get() avoids inserting an empty entry when `user_movie` is a defaultdict
    # and user i has no ratings.
    movies_i = user_movie.get(i, [])

    # (user, movie) : rating  ->  movie : rating
    ratings_i = {movie: user_movie_rating[(i, movie)] for movie in movies_i}
    avg_i = np.mean(list(ratings_i.values())) if ratings_i else np.nan
    dev_i = {movie: (rating - avg_i) for movie, rating in ratings_i.items()}
    dev_i_values = np.array(list(dev_i.values()))

    movie_sets.append(set(movies_i))
    # Denominator term of the Pearson correlation for this user.
    sigmas.append(np.sqrt(dev_i_values.dot(dev_i_values)))
    averages.append(avg_i)
    deviations.append(dev_i)

# --- Pass 2: pairwise correlations -------------------------------------------
for i in tqdm(range(N), desc='finding neighbors'):
    sl = SortedList()
    movies_i_set = movie_sets[i]
    dev_i = deviations[i]
    sigma_i = sigmas[i]

    # A user with too few movies, or whose ratings are all identical
    # (sigma == 0), cannot have a defined correlation with anyone.
    if sigma_i == 0 or len(movies_i_set) <= limit:
        neighbors.append(sl)
        continue

    for j in range(N):
        if j == i:
            continue

        common_movies = (movie_sets[j] & movies_i_set)
        if len(common_movies) <= limit:
            continue

        sigma_j = sigmas[j]
        if sigma_j == 0:
            # Would produce NaN/inf, which breaks the SortedList ordering.
            continue

        # calculate correlation coefficient
        dev_j = deviations[j]
        numerator = sum(dev_i[m] * dev_j[m] for m in common_movies)
        w_ij = numerator / (sigma_i * sigma_j)

        # Negate so the strongest correlation sorts first; drop the weakest
        # entry once more than K are held.
        sl.add((-w_ij, j))
        if len(sl) > K:
            del sl[-1]

    neighbors.append(sl) # neighbors = [(correlation, user)]

KeyboardInterrupt: ignored

In [25]:
def predict(i, m):
    """Predict the rating user `i` would give movie `m`.

    The prediction is user i's average rating plus the correlation-weighted
    mean of the neighbours' deviations for movie `m`, clipped to [0.5, 5].
    Reads the module-level `neighbors`, `deviations` and `averages`.

    Args:
        i: index of the user into `neighbors` / `averages`.
        m: movie key as used in the `deviations` dicts.

    Returns:
        The clipped prediction. Falls back to user i's average when no
        neighbour with a usable weight has rated movie `m`.
    """
    from math import isfinite  # local import keeps this cell self-contained

    numerator = 0
    denominator = 0

    for neg_w, j in neighbors[i]:
        neighbor_devs = deviations[j]
        # Skip neighbours that did not rate this movie, and NaN/inf weights:
        # a NaN would poison both sums and then be clipped to 5 below.
        if m not in neighbor_devs or not isfinite(neg_w):
            continue
        # Weights are stored negated, so flip the sign back.
        numerator += -neg_w * neighbor_devs[m]
        denominator += abs(neg_w)

    if denominator == 0:
        prediction = averages[i]
    else:
        prediction = numerator / denominator + averages[i]

    # Clip to the range used here for ratings.
    prediction = min(5, prediction)
    prediction = max(0.5, prediction)
    return prediction

In [34]:
# Peek at the first ((user, movie), rating) entry of the lookup, if any.
first_entry = next(iter(user_movie_rating.items()), None)
if first_entry is not None:
    (i, m), target = first_entry
    print((i, m), target)

(5, 0) 4.5
