# Movie Recommendations with User-Based Collaborative Filtering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from collections import defaultdict
from typing import List


### Read data

In [2]:
# Load only the first 10,000 rows of the ratings CSV (nrows=10000) rather than the whole file.
train_data = pd.read_csv('./data/ml-25m/ratings.csv', nrows=10000)
# Print the first ten rows as a quick look at the loaded columns.
print(train_data.head(10))


   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510
5       1     1088     4.0  1147868495
6       1     1175     3.5  1147868826
7       1     1217     3.5  1147878326
8       1     1237     5.0  1147868839
9       1     1250     4.0  1147868414


### Analyze data

In [3]:
# Number of rating rows per user, indexed by userId.
popularity = train_data.groupby('userId').size()
print(popularity)

userId
1      70
2     184
3     656
4     242
5     101
     ... 
71     36
72    813
73     39
74     22
75     61
Length: 75, dtype: int64


### Train test split

In [16]:
# user to movies dictionary
def get_user_movie_dict(data: pd.DataFrame):
    """Map every userId found in ``data`` to the sub-DataFrame of that user's rows.

    Args:
        data: Frame that has a ``userId`` column.

    Returns:
        dict of ``{user_id: DataFrame}``, one entry per distinct ``userId``.
    """
    frames_by_user = {}
    for user_id, user_rows in data.groupby('userId'):
        frames_by_user[user_id] = user_rows
    return frames_by_user

# Distinct user ids in ascending order. The order of this list decides the order
# in which user pairs are scored (and therefore how ties are broken later), so it
# is sorted explicitly instead of relying on set iteration order.
users = sorted(train_data['userId'].unique().tolist())
user_movie_dict = get_user_movie_dict(train_data)
print(len(users))

KeyboardInterrupt: 

In [18]:
def calculate_user_similarity(user_movie_dict, users) -> dict:
    """Compute a symmetric similarity score for every pair of users.

    The score for users ``u`` and ``v`` is
    ``|M_u & M_v| / sqrt(|M_u| * |M_v|)`` where ``M_x`` is the set of movie ids
    rated by ``x`` (cosine similarity of the binary "has rated" vectors).

    Args:
        user_movie_dict: Maps each user id to a DataFrame with a ``movieId`` column.
        users: Indexable sequence of user ids to compare; each must be a key of
            ``user_movie_dict``.

    Returns:
        ``defaultdict(dict)`` shaped ``{u: {v: score}}`` holding both ``[u][v]``
        and ``[v][u]`` for every pair. A pair where either user has no movies
        scores ``0.0``.
    """
    # Build each user's movie set once instead of once per pair.
    movie_sets = {
        user: set(user_movie_dict[user]['movieId'].to_list())
        for user in users
    }

    similarity_scores_dict = defaultdict(dict)
    for i in range(len(users)):
        u = users[i]
        u_movies = movie_sets[u]
        for j in range(i + 1, len(users)):
            v = users[j]
            v_movies = movie_sets[v]

            denominator = np.sqrt(len(u_movies) * len(v_movies))
            if denominator:
                score = len(u_movies & v_movies) / denominator
            else:
                # An empty movie set would give 0/0 (NaN); treat it as "no overlap".
                score = 0.0

            similarity_scores_dict[u][v] = score
            similarity_scores_dict[v][u] = score
    return similarity_scores_dict

# Pairwise similarity between all users, as a nested {u: {v: score}} mapping.
user_similarity_score = calculate_user_similarity(
    user_movie_dict=user_movie_dict,
    users=users,
)

In [19]:

def get_top_k_similar_users(user_id: int|str, user_similarity_score : dict, k: int = 10) -> List: 
    'return user, movies, similarity score'
    similar_users = user_similarity_score[user_id] # {v1:0.2, v3: 0.3}
    res = sorted(similar_users.items(), key=lambda x: x[1], reverse=True)
    return [x for x in res[:k]]
    
# Example: the five users most similar to user 1.
similar_users = get_top_k_similar_users(
    user_id=1,
    user_similarity_score=user_similarity_score,
    k=5,
)
print(similar_users)

[(55, 0.13522468075656266), (68, 0.1119980089419836), (50, 0.10624254305194611), (38, 0.09089344255870231), (37, 0.08964214570007951)]


In [26]:

def recommended_movies(user_id, user_movie_dict, user_similarity_score,  k, num_similar_users=None):
    """Recommend up to ``k`` movies that ``user_id`` has not rated yet.

    Every movie rated by one of the most similar users, and not already rated
    by ``user_id``, accumulates ``similarity * rating`` over those users. The
    ``k`` movies with the highest accumulated score are returned.

    Args:
        user_id: User to produce recommendations for; looked up in
            ``user_movie_dict`` and ``user_similarity_score``.
        user_movie_dict: Maps each user id to a DataFrame with ``movieId`` and
            ``rating`` columns.
        user_similarity_score: Nested mapping ``{u: {v: score}}``.
        k: Maximum number of movie ids to return.
        num_similar_users: Number of nearest neighbours to aggregate over.
            Defaults to ``k``.

    Returns:
        List of movie ids as ``int``, ordered by descending score.
    """
    if num_similar_users is None:
        num_similar_users = k

    user_watched_movies = set(user_movie_dict[user_id]['movieId'].to_list())
    similar_users = get_top_k_similar_users(user_id, user_similarity_score, num_similar_users)

    candidate_movies = defaultdict(float)  # movie id -> accumulated recommendation score
    for user_v, similarity_score in similar_users:
        user_v_movies = user_movie_dict[user_v]
        # Iterate the two columns directly: iterrows() creates a Series per row
        # and upcasts the integer movie ids to float along the way.
        movie_ids = user_v_movies['movieId'].to_list()
        ratings = user_v_movies['rating'].to_list()
        for movie_id, rating in zip(movie_ids, ratings):
            if movie_id not in user_watched_movies:
                candidate_movies[movie_id] += similarity_score * rating

    recommendation = sorted(candidate_movies.items(), key=lambda mv: mv[1], reverse=True)
    return [int(movie_id) for movie_id, _ in recommendation[:k]]

# Example: top 20 recommendations for user 1.
suggestions = recommended_movies(
    user_id=1,
    user_movie_dict=user_movie_dict,
    user_similarity_score=user_similarity_score,
    k=20,
)
print(suggestions)

# NOTE(review): `m` is not referenced anywhere in the visible notebook; it looks
# like a leftover debugging anchor. Confirm and remove.
m = 1

[593, 2858, 2959, 4226, 2571, 6874, 4993, 356, 260, 318, 2329, 50, 46578, 60069, 527, 7153, 7438, 110, 3578, 58559]


### Transform data

In [8]:
# find user similarities
# given user U, find top K similar users
# for each similar user v, collect the movies that U hasn't watched and accumulate a recommendation score for each of them
# sort candidate movies by recommendation scores and pick the top K


### Train model

### Evaluate model

### Predict on test dataset


### Output

In [9]:

# df = pd.DataFrame()
# df.to_csv('submission.csv',index = False, header=True)