<a href="https://colab.research.google.com/github/INA-95/H-M-Personalized-Fashion-Recommendations/blob/main/Movie_Recommender_System_230308.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import requests
import os
import pickle
import json

from glob import glob
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import tqdm
from typing import Dict, List, Any

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/Project/Side_Project/H&M_recommendation_system/data

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/Project/Side_Project/H&M_recommendation_system/data


In [3]:
files = glob('./*.csv')

In [4]:
dfs = {file:pd.read_csv(file) for file in files}
dfs

{'./df_small.csv':          Unnamed: 0  Unnamed: 0.1  userId  movieId  rating  \
 0               960           960       5        1     4.5   
 1               961           961       5       10     2.5   
 2               962           962       5       19     3.5   
 3               963           963       5       32     5.0   
 4               964           964       5       39     4.5   
 ...             ...           ...     ...      ...     ...   
 5392020    19998291      19998291    2704     4993     5.0   
 5392021    19998292      19998292    2704     5349     3.0   
 5392022    19998293      19998293    2704     5378     4.0   
 5392023    19998295      19998295    2704     5449     4.0   
 5392024    19998296      19998296    2704     5459     4.0   
 
                    timestamp  movie_id_idx  
 0        2009-01-02 01:13:41             0  
 1        2009-01-02 01:15:59             9  
 2        2009-01-01 04:21:44            18  
 3        2009-01-01 04:11:35           

In [5]:
df = dfs['./df_small.csv']

In [6]:
df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'movieId', 'timestamp'], axis = 1)

Unnamed: 0,userId,rating,movie_id_idx
0,5,4.5,0
1,5,2.5,9
2,5,3.5,18
3,5,5.0,29
4,5,4.5,33
...,...,...,...
5392020,2704,5.0,1730
5392021,2704,3.0,1769
5392022,2704,4.0,1772
5392023,2704,4.0,1780


In [7]:
df = df[['userId', 'movie_id_idx', 'rating']]

In [8]:
# Information of Dataset

num_of_users = df['userId'].nunique()
num_of_movies = df['movie_id_idx'].nunique()
print('num_of_users:', num_of_users)
print('num_of_movies:', num_of_movies)

num_of_users: 10000
num_of_movies: 2000


In [9]:
# Split into train and test

train, test = train_test_split(df, test_size = 0.2)

In [10]:
# user_movie = {user_id : [movie1, movie2, ...]}
# movie_user = {movie_idx : [user1, user2, ...]}
# user_movie_rating = {(user, movie) : rating, (user2, movie2) : rating, ...}

def create_dict_1(target_1:List) -> Dict:
    res_dict_1 = defaultdict(list)
    for v1, v2 in target_1:
        res_dict_1[v1].append(v2)
    return res_dict_1

def create_dict_2(target_2:zip()) -> Dict:
    res_dict_2 = defaultdict(int)
    for v1, v2, v3 in target_2:
        res_dict_2[(v1, v2)] = v3
    return res_dict_2

In [11]:
user_movie = list(zip(df['userId'], df['movie_id_idx']))
movie_user = list(zip(df['movie_id_idx'], df['userId']))
user_movie_rating = zip(df['userId'], df['movie_id_idx'], df['rating'])

user_movie_dict = create_dict_1(user_movie)
movie_user_dict = create_dict_1(movie_user)
user_movie_rating_dict = create_dict_2(user_movie_rating)

In [12]:
train['user_movie'] = train['userId'].apply(lambda x : user_movie_dict[x])
train['movie_user'] = train['movie_id_idx'].apply(lambda x : movie_user_dict[x])

In [13]:
train

Unnamed: 0,userId,movie_id_idx,rating,user_movie,movie_user
3988789,1327,1774,4.0,"[0, 2, 12, 5, 6, 9, 10, 13, 15, 16, 20, 24, 25...","[5, 30, 39, 86, 119, 122, 147, 176, 192, 209, ..."
2074682,6434,162,2.5,"[5, 15, 16, 20, 29, 30, 34, 43, 85, 103, 109, ...","[5, 10, 15, 27, 30, 33, 39, 47, 78, 86, 88, 92..."
469781,3902,1756,2.5,"[11, 18, 27, 29, 37, 40, 43, 56, 85, 126, 150,...","[5, 39, 86, 95, 131, 145, 147, 157, 199, 214, ..."
1835592,4634,1591,1.0,"[0, 5, 6, 9, 10, 13, 15, 16, 19, 20, 24, 25, 2...","[10, 17, 27, 30, 44, 56, 86, 92, 95, 100, 102,..."
1985974,5808,1526,3.5,"[0, 1, 4, 6, 10, 16, 18, 21, 28, 31, 33, 46, 4...","[44, 56, 86, 102, 131, 161, 207, 215, 259, 260..."
...,...,...,...,...,...
1658810,3279,1981,3.5,"[0, 1, 5, 15, 16, 18, 29, 33, 40, 41, 43, 56, ...","[5, 30, 39, 92, 125, 144, 176, 192, 199, 215, ..."
4987632,9085,557,4.0,"[0, 2, 5, 6, 10, 15, 16, 20, 21, 24, 28, 29, 3...","[15, 17, 27, 33, 47, 78, 100, 102, 115, 122, 1..."
2644928,928,175,5.0,"[0, 5, 6, 9, 15, 23, 29, 31, 40, 43, 47, 69, 7...","[10, 15, 17, 33, 39, 47, 56, 78, 86, 97, 100, ..."
776376,6056,1574,4.5,"[0, 1, 2, 4, 5, 8, 9, 10, 11, 15, 18, 19, 20, ...","[5, 10, 17, 27, 39, 44, 56, 86, 100, 107, 115,..."


In [14]:
test.head()

Unnamed: 0,userId,movie_id_idx,rating
5138804,227,102,2.5
3801983,9700,1388,4.0
4373141,4491,1483,3.0
2215131,7526,541,5.0
1742706,3856,545,5.0


In [15]:
test['user_movie'] = list(zip(test.userId, test.movie_id_idx))

In [16]:
test['user_movie_rating'] = test['user_movie'].apply(lambda x: user_movie_rating_dict[x])

In [17]:
test

Unnamed: 0,userId,movie_id_idx,rating,user_movie,user_movie_rating
5138804,227,102,2.5,"(227, 102)",2.5
3801983,9700,1388,4.0,"(9700, 1388)",4.0
4373141,4491,1483,3.0,"(4491, 1483)",3.0
2215131,7526,541,5.0,"(7526, 541)",5.0
1742706,3856,545,5.0,"(3856, 545)",5.0
...,...,...,...,...,...
2961825,3630,1582,4.5,"(3630, 1582)",4.5
4166000,2883,563,5.0,"(2883, 563)",5.0
3838160,9951,605,5.0,"(9951, 605)",5.0
689761,5466,116,2.0,"(5466, 116)",2.0


In [18]:
target_2 = zip(test['userId'], test['movie_id_idx'], test['rating'])
test_uesr_movie_rating_dict = create_dict_2(target_2)
test_uesr_movie_rating_dict

defaultdict(int,
            {(227, 102): 2.5,
             (9700, 1388): 4.0,
             (4491, 1483): 3.0,
             (7526, 541): 5.0,
             (3856, 545): 5.0,
             (3417, 43): 4.0,
             (2101, 289): 4.0,
             (7731, 1543): 3.0,
             (1904, 714): 1.5,
             (9721, 1885): 4.5,
             (4371, 658): 4.0,
             (3617, 1036): 5.0,
             (9747, 1595): 4.0,
             (818, 918): 3.0,
             (3794, 1041): 1.0,
             (3196, 1835): 2.0,
             (5900, 509): 3.5,
             (9559, 162): 4.0,
             (2472, 729): 3.0,
             (5802, 1466): 4.0,
             (33, 507): 2.0,
             (4606, 518): 4.0,
             (1118, 118): 3.5,
             (9156, 1020): 3.5,
             (2738, 213): 4.0,
             (8264, 103): 4.0,
             (4899, 1055): 1.0,
             (5981, 627): 3.5,
             (275, 1474): 4.0,
             (4333, 205): 5.0,
             (8630, 162): 4.0,
             (36

In [19]:
# temp json file

with open('user_movie_dict.json', 'wb') as f:
    pickle.dump(user_movie_dict, f)

with open('movie_user_dict.json', 'wb') as f:
    pickle.dump(movie_user_dict, f)

with open('user_movie_rating_dict.json', 'wb') as f:
    pickle.dump(user_movie_rating_dict, f)

with open('test_uesr_movie_rating_dict.json', 'wb') as f:
    pickle.dump(test_uesr_movie_rating_dict, f)

In [20]:
# load data from pickle json

json_files = glob('./*.json')

with open('user_movie_dict.json', 'rb') as f:
    user_movie = pickle.load(f)

with open('movie_user_dict.json', 'rb') as f:
    movie_user = pickle.load(f)

with open('user_movie_rating_dict.json', 'rb') as f:
    user_movie_rating = pickle.load(f)

with open('test_uesr_movie_rating_dict.json', 'rb') as f:
    test_user_movie_rating = pickle.load(f)

In [21]:
N = np.max(list(user_movie.keys())) + 1

m1 = np.max(list(movie_user.keys()))
m2 = np.max([m for (u, m), r in test_user_movie_rating.items()])

M = max(m1, m2) + 1
print('N:', N, 'M:', M)

N: 10000 M: 2000


In [22]:
from sortedcontainers import SortedList


# number of user we'd like to consider
K = 25

# number of common movies to consider
limit = 5

# store neighbors
neighbors = []

# each user's average rating for later use
averages = []

# each user's deviation for later use
deviations = []

for i in range(N):
    # user : [m1, m2, ...]
    movies_i = user_movie[i]
    movies_i_set = set(movies_i) # unique movieid for each user
    
    ratings_i = {movie:user_movie_rating[(i, movie)] for movie in movies_i} # (user, movie) : rating dict -> movie : rating
    avg_i = np.mean(list(ratings_i.values())) # mean of ratings of all the movies
    dev_i = {movie:(rating - avg_i) for movie, rating in ratings_i.items()} # movie : rating(calculate between rating and mean rating = deviation)
    dev_i_values = np.array(list(dev_i.values())) # array of deviation values
    sigma_i = np.sqrt(dev_i_values.dot(dev_i_values)) # square root of the sum of squares of deviations : denominator in the pearson correlation

    averages.append(avg_i)
    deviations.append(dev_i)

    sl = SortedList()

In [23]:
for j in range(N):
    if j != i:
        movies_j = user_movie[j]
        movies_j_set = set(movies_j)
        common_movies = (movies_j_set & movies_i_set)

        if len(common_movies) > limit:
            # user_movie_rating -> user_movie_rating[(user, movie)] rating
            ratings_j = {movie : user_movie_rating[(j, movie)] for movie in movies_j}

            # average of ratings
            avg_j = np.mean(list(ratings_j.values()))
            
            # deviation of each of rating and average of ratings
            dev_j = {movie : (rating - avg_j) for movie, rating in ratings_j.items()}
            
            dev_j_values = np.array(list(dev_j.values()))
            sigma_j = np.sqrt(dev_j_values.dot(dev_j_values))

            # calculate correlation coefficient
            numerator = sum(dev_i[m] * dev_j[m] for m in common_movies)
            w_ij = numerator / (sigma_i * sigma_j)

            sl.add((-w_ij, j)) 
            if len(sl) > K:
                del sl[-1]

neighbors.append(sl) # neighbors = [(correlation, user)]

In [25]:
neighbors

[SortedList([(-0.27948226096971623, 274), (-0.27199814560741403, 860), (-0.26862819650143055, 2932), (-0.26810725746682207, 8780), (-0.2670925051826279, 5826), (-0.26412702813774, 8380), (-0.2592848111317966, 5383), (-0.258550458161959, 6516), (-0.25717076082912327, 4144), (-0.2559747481933539, 9143), (-0.25379934138308735, 8592), (-0.2536841990270254, 5705), (-0.2535673814295177, 547), (-0.25351795300284446, 4480), (-0.25036802858016804, 9089), (-0.25015660676189355, 5450), (-0.2500947314334512, 6058), (-0.24980473696007877, 8731), (-0.24880124702864095, 9154), (-0.24827503890996522, 4895), (-0.24788943627329948, 3152), (-0.24631391825916135, 8581), (-0.24476734545800044, 4090), (-0.24319264511826436, 6717), (-0.2415945319017646, 5438)])]

In [36]:
def predict(i, m):
    numerator = 0
    denominator = 0

    for neg_w, j in neighbors[i]:
        try:
            numerator += -neg_w * deviations[j][m]
            denominator += abs(neg_w)
        except KeyError:
            pass
    
    if denominator == 0:
        prediction = averages[i]
    else:
        prediction = numerator / denominator + averages[i]
    prediction = min(5, prediction)
    prediction = max(0.5, prediction)
    return prediction        

In [37]:
for (i, m), target in user_movie_rating.items():
    print(predict(i, m))
    break

IndexError: ignored

In [34]:
neighbors[5]

IndexError: ignored