<a href="https://colab.research.google.com/github/INA-95/H-M-Personalized-Fashion-Recommendations/blob/main/Movie_Recommender_System_230308.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import requests
import os
import pickle

from glob import glob
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import tqdm
from typing import Dict, List, Any

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/Project/Side_Project/H&M_recommendation_system/data

Mounted at /content/gdrive
/content/gdrive/MyDrive/Project/Side_Project/H&M_recommendation_system/data


In [3]:
files = glob('./*.csv')

In [4]:
dfs = {file:pd.read_csv(file) for file in files}
dfs

{'./df_small.csv':          Unnamed: 0  Unnamed: 0.1  userId  movieId  rating  \
 0               960           960       5        1     4.5   
 1               961           961       5       10     2.5   
 2               962           962       5       19     3.5   
 3               963           963       5       32     5.0   
 4               964           964       5       39     4.5   
 ...             ...           ...     ...      ...     ...   
 5392020    19998291      19998291    2704     4993     5.0   
 5392021    19998292      19998292    2704     5349     3.0   
 5392022    19998293      19998293    2704     5378     4.0   
 5392023    19998295      19998295    2704     5449     4.0   
 5392024    19998296      19998296    2704     5459     4.0   
 
                    timestamp  movie_id_idx  
 0        2009-01-02 01:13:41             0  
 1        2009-01-02 01:15:59             9  
 2        2009-01-01 04:21:44            18  
 3        2009-01-01 04:11:35           

In [5]:
df = dfs['./df_small.csv']

In [6]:
df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'movieId', 'timestamp'], axis = 1)

Unnamed: 0,userId,rating,movie_id_idx
0,5,4.5,0
1,5,2.5,9
2,5,3.5,18
3,5,5.0,29
4,5,4.5,33
...,...,...,...
5392020,2704,5.0,1730
5392021,2704,3.0,1769
5392022,2704,4.0,1772
5392023,2704,4.0,1780


In [7]:
df = df[['userId', 'movie_id_idx', 'rating']]

In [8]:
# Information of Dataset

num_of_users = df['userId'].nunique()
num_of_movies = df['movie_id_idx'].nunique()
print('num_of_users:', num_of_users)
print('num_of_movies:', num_of_movies)

num_of_users: 10000
num_of_movies: 2000


In [9]:
# Split into train and test

train, test = train_test_split(df, test_size = 0.2)

In [10]:
# user_movie = {user_id : [movie1, movie2, ...]}
# movie_user = {movie_idx : [user1, user2, ...]}
# user_movie_rating = {(user, movie) : rating, (user2, movie2) : rating, ...}

def create_dict_1(target_1:List) -> Dict:
    res_dict_1 = defaultdict(list)
    for v1, v2 in target_1:
        res_dict_1[v1].append(v2)
    return res_dict_1

def create_dict_2(target_2:zip()) -> Dict:
    res_dict_2 = defaultdict(int)
    for v1, v2, v3 in target_2:
        res_dict_2[(v1, v2)] = v3
    return res_dict_2

In [11]:
user_movie = list(zip(df['userId'], df['movie_id_idx']))
movie_user = list(zip(df['movie_id_idx'], df['userId']))
user_movie_rating = zip(df['userId'], df['movie_id_idx'], df['rating'])

user_movie_dict = create_dict_1(user_movie)
movie_user_dict = create_dict_1(movie_user)
user_movie_rating_dict = create_dict_2(user_movie_rating)

In [12]:
train['user_movie'] = train['userId'].apply(lambda x : user_movie_dict[x])
train['movie_user'] = train['movie_id_idx'].apply(lambda x : movie_user_dict[x])

In [13]:
train

Unnamed: 0,userId,movie_id_idx,rating,user_movie,movie_user
5068834,9692,250,5.0,"[0, 1, 2, 4, 6, 8, 9, 10, 11, 13, 14, 15, 16, ...","[17, 27, 39, 44, 56, 86, 88, 92, 100, 102, 119..."
3517253,7637,1305,3.0,"[0, 1, 5, 15, 18, 27, 29, 30, 33, 40, 41, 43, ...","[5, 30, 78, 92, 95, 107, 116, 119, 133, 139, 1..."
5082046,9804,490,5.0,"[0, 9, 13, 15, 20, 21, 24, 25, 27, 29, 30, 31,...","[17, 27, 33, 78, 88, 95, 100, 107, 115, 116, 1..."
5063392,9641,1394,5.0,"[0, 1, 2, 5, 6, 10, 15, 16, 17, 20, 23, 24, 25...","[17, 27, 78, 102, 115, 122, 133, 139, 209, 215..."
4743663,7251,751,2.0,"[0, 12, 5, 8, 9, 10, 11, 13, 15, 16, 18, 19, 2...","[10, 15, 39, 56, 78, 86, 88, 102, 139, 147, 16..."
...,...,...,...,...,...
2343555,8516,1461,3.0,"[0, 2, 5, 10, 15, 20, 24, 29, 30, 40, 43, 50, ...","[15, 56, 78, 86, 133, 215, 226, 248, 259, 268,..."
1075691,8308,354,2.5,"[0, 1, 2, 9, 15, 17, 18, 20, 21, 23, 28, 29, 3...","[15, 27, 33, 47, 56, 78, 88, 116, 119, 147, 19..."
2012505,6021,1545,4.0,"[0, 1, 16, 20, 21, 27, 29, 30, 31, 42, 37, 40,...","[17, 27, 30, 39, 56, 78, 86, 102, 116, 133, 13..."
5211075,1023,774,4.0,"[0, 1, 2, 5, 6, 9, 10, 13, 15, 16, 18, 19, 20,...","[5, 10, 15, 27, 39, 47, 56, 86, 100, 102, 115,..."


In [14]:
test.head()

Unnamed: 0,userId,movie_id_idx,rating
289700,2574,221,2.5
4565777,5860,1047,2.0
4268096,3650,422,5.0
4272347,3680,37,2.0
3113089,4659,1607,4.5


In [15]:
test['user_movie'] = list(zip(test.userId, test.movie_id_idx))

In [16]:
test['user_movie_rating'] = test['user_movie'].apply(lambda x: user_movie_rating_dict[x])

In [17]:
test

Unnamed: 0,userId,movie_id_idx,rating,user_movie,user_movie_rating
289700,2574,221,2.5,"(2574, 221)",2.5
4565777,5860,1047,2.0,"(5860, 1047)",2.0
4268096,3650,422,5.0,"(3650, 422)",5.0
4272347,3680,37,2.0,"(3680, 37)",2.0
3113089,4659,1607,4.5,"(4659, 1607)",4.5
...,...,...,...,...,...
3827604,9862,1458,2.5,"(9862, 1458)",2.5
3596303,8243,1666,3.0,"(8243, 1666)",3.0
4408296,4757,1734,2.5,"(4757, 1734)",2.5
3442637,7106,1586,5.0,"(7106, 1586)",5.0


In [None]:
target_2 = zip(test['userId'], test['movie_id_idx'], test['rating'])
test_uesr_movie_rating_dict = create_dict_2(target_2)
test_uesr_movie_rating_dict

In [20]:
# temp json file

with open('user_movie_dict.json', 'wb') as f:
    pickle.dump(user_movie_dict, f)

with open('movie_user_dict.json', 'wb') as f:
    pickle.dump(movie_user_dict, f)

with open('user_movie_rating_dict.json', 'wb') as f:
    pickle.dump(user_movie_rating_dict, f)

with open('test_uesr_movie_rating_dict.json', 'wb') as f:
    pickle.dump(test_uesr_movie_rating_dict, f)

In [30]:
# load data from pickle json

import json
json_files = glob('./*.json')

with open('user_movie_dict.json', 'rb') as f:
    user_movie = pickle.load(f)

with open('movie_user_dict.json', 'rb') as f:
    movie_user = pickle.load(f)

with open('user_movie_rating_dict.json', 'rb') as f:
    user_movie_rating = pickle.load(f)

with open('test_uesr_movie_rating_dict.json', 'rb') as f:
    test_user_movie_rating = pickle.load(f)

In [40]:
N = np.max(list(user_movie.keys())) + 1

m1 = np.max(list(movie_user.keys()))
m2 = np.max([m for (u, m), r in test_user_movie_rating.items()])

M = max(m1, m2) + 1
print('N:', N, 'M:', M)

N: 10000 M: 2000


In [45]:
# number of user we'd like to consider
K = 25

# number of common movies to consider
limit = 5

# store neighbors
neighbors = []

# each user's average rating for later use
averages = []

# each user's deviation for later use
deviations = []

for i in range(N):
    # user : [m1, m2, ...]
    movies_i = user_movie[i]
    movies_i_set = set(movies_i)
    
    ratings_i = {movie:user_movie_rating[(i, movie)] for movie in movies_i}
    avg_i = np.mean(list(ratings_i.values()))
    dev_i = {movie:(rating - avg_i) for movie, rating in ratings_i.items()}
    dev_i_values = np.array(list(dev_i.values()))
    sigma_i = np.sqrt(dev_i_values.dot(dev_i_values))
    print(sigma_i)


17.444196742756603
