In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
#importing movie metadata
meta= pd.read_csv("./data/movies_metadata.csv")
meta= meta[["id","imdb_id","original_language","original_title","popularity","vote_average"]]
meta= meta.rename(columns={'id':'movieId'})
meta = meta[meta['original_language']== 'en'] #just want movies in English
meta.head(10)
print("Dataset has {} samples with {} features each.".format(*meta.shape))

Dataset has 32269 samples with 6 features each.


In [3]:
#importing movie ratings
ratings= pd.read_csv("./data/ratings_processed.csv")
ratings= ratings[['userId', 'movieId', 'rating']]
ratings=ratings.head(1000000)

In [4]:
#convert data types before merging
meta.movieId =pd.to_numeric(meta.movieId, errors='coerce')
ratings.movieId = pd.to_numeric(ratings.movieId, errors= 'coerce')


In [5]:
#create a single dataset merging the previous 2 files
data= pd.merge(ratings, meta, on='movieId', how='inner')
data.head()
print("Dataset has {} samples with {} features each.".format(*data.shape))

Dataset has 322350 samples with 8 features each.


In [6]:
#movie matrix so that I can use the recommender function later
matrix= data.pivot_table(index='userId', columns='original_title', values='rating')
print("Matrix has {} samples with {} features each.".format(*matrix.shape))
matrix.head(10)

Matrix has 9875 samples with 3615 features each.


original_title,!Women Art Revolution,$5 a Day,'Gator Bait,'R Xmas,'Twas the Night Before Christmas,...And the Pursuit of Happiness,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,...,xXx,¡Three Amigos!,Æon Flux,Бабник,Грозовые ворота,Дневник его жены,Мой сводный брат Франкенштейн,"Цирк сгорел, и клоуны разбежались",مارمولک,黑太陽731
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [7]:
import math

matrix_dic = matrix.to_dict('index')
matrix_dic_edit = {}

from itertools import chain

for user_id in matrix_dic.keys(): #get rid of NaN reviews and sort by user
    matrix_dic_edit[user_id] = [[k, v] for k, v in matrix_dic[user_id].items() if not math.isnan(v)]

from scipy import spatial

def compute_user_similarity(user1_reviews, user2_reviews):
    similarity_score = 0
    user1_rate = []
    user2_rate = []
    overlaps = 0
    for i in user1_reviews:
        for j in user2_reviews:
            if i[0] == j[0]:
                user1_rate.append(i[1])
                #print(i[1])
                user2_rate.append(j[1])
                #print(j[1])
                overlaps += 1
    return 0 if overlaps <= 2 else 1 - spatial.distance.cosine(user1_rate, user2_rate) #cosine similarity

def get_most_similar_users(dictionary, userid):
    scores = []
    reviews = dictionary[userid]
    for i in dictionary.keys():
        score = compute_user_similarity(reviews, dictionary[i])
        scores.append([i, score]) #user_id, similarity score
    scores.sort(key=lambda x: x[1], reverse=True) #sort to get the best scores
    return scores[:20]


print(get_most_similar_users(matrix_dic_edit, 30))

[[30, 1.0], [87, 1.0], [103, 1.0], [302, 1.0], [343, 1.0], [354, 1.0], [419, 1.0], [829, 1.0], [851, 1.0], [1194, 1.0], [1228, 1.0], [1395, 1.0], [1493, 1.0], [1715, 1.0], [1809, 1.0], [1835, 1.0], [1942, 1.0], [2083, 1.0], [2134, 1.0], [2201, 1.0]]


In [8]:
most_similar = get_most_similar_users(matrix_dic_edit, 30)

def insertIntoDict(movie, score, Dict):
    if not movie in Dict:
        Dict[movie] = score
    else:
        Dict[movie] += score

from collections import Counter

def compute_users_favorite_movies(users, target_user):
    fave_movies = {}
    target_user_seen = []
    for i in range(len(matrix_dic_edit[target_user])):
        target_user_seen.append(matrix_dic_edit[target_user][i][0])
    for user in users:
        for review in matrix_dic_edit[user[0]]:
            if review[0] not in target_user_seen: #make sure we don't give them movies they've already seen
                insertIntoDict(review[0], review[1]**2, fave_movies) #square scores to weight positive reviews heavier
    recs = Counter(fave_movies)
    
    for k, v in recs.most_common(10):
        print('%s' % k)

In [9]:
# A function to make top 10 recommendations based on what similar users liked
def recommend(user_id):
    most_similar = get_most_similar_users(matrix_dic_edit, user_id)
    compute_users_favorite_movies(most_similar, user_id)

In [10]:
recommend(83) #top 10 recommendations for user number 83 that they haven't seen

License to Wed
Terminator 3: Rise of the Machines
The Million Dollar Hotel
Once Were Warriors
Boogie Nights
And Then There Were None
The 39 Steps
The Garden of Eden
Say Anything...
K-19: The Widowmaker


Thanks for checking this out!