In [1]:
# !pip install scikit-surprise

In [2]:
import pandas as pd

import surprise
from surprise import *

In [3]:
df = pd.read_csv('Combined.csv')
df.head()

Unnamed: 0,User First Name,User Last Name,The Social Network,A Prophet,Amour,The King's Speech,La La Land,Boyhood,Inception,A Separation,...,Call Me by Your Name,Winter's Bone,The Grand Budapest Hotel,Dunkirk,Inside Llewyn Davis,A Serious Man,Toy Story 3,Beasts of the Southern Wild,The Imitation Game,The Fighter
0,Arvind,Jeyakumar,3.0,,,,,,5.0,,...,,,,5.0,,,,,4.0,
1,Naga Nidhi,Loath,3.0,,,5.0,,4.0,4.0,,...,3.0,,,3.0,,,,,4.0,
2,Kexin,Fu,,,,3.0,3.0,,3.0,,...,4.0,,4.0,,,,,,,
3,Bokun,Chen,5.0,5.0,,,5.0,4.0,,5.0,...,5.0,,5.0,,,,5.0,3.0,5.0,5.0
4,Kavish,Hukmani,,,,,,,4.0,,...,,,,,,,,,5.0,


In [4]:
df = df.melt(id_vars=['User First Name', 'User Last Name'])
df.rename(columns={'variable':'Movie Name', 'value':'Rating'}, inplace=True)

df['Movie Name'] = df['Movie Name'].str.strip()
df.dropna(inplace=True)
df = df.reset_index(drop=True)

In [5]:
movies_drop = ["The Social Network", "Mad Max: Fury Road", "Inception", "Winter’s Bone", "A Serious Man", "Son of Saul"]
users_drop = ["Kavish", "Jeet", "Neon", "Keshore", "Asha"]

hidden_ratings = {}

for user in users_drop:
    for movie in movies_drop:
        filter = df.loc[(df['User First Name'] == user) & (df['Movie Name'] == movie), "Rating"]
        if len(filter.values) > 0:
            hidden_ratings[(user, movie)] = filter.values[0]
            print(f"{user}, {movie}: {filter.values}")
            df.drop(filter.index, inplace=True)

Kavish, Inception: [4.]
Jeet, The Social Network: [5.]
Jeet, Mad Max: Fury Road: [4.]
Jeet, Inception: [5.]
Neon, Inception: [5.]
Keshore, The Social Network: [4.]
Keshore, Mad Max: Fury Road: [3.]
Keshore, Inception: [5.]
Asha, The Social Network: [4.]
Asha, Mad Max: Fury Road: [5.]
Asha, Inception: [5.]


In [6]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['User First Name', 'Movie Name', 'Rating']], reader)
trainset = data.build_full_trainset()

In [7]:
# # Using SVD

# # algo = SVD()

# # model_selection.cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# param_grid = {'n_epochs': [5, 10, 20, 30, 50], 
#               'n_factors': [5, 10, 20, 40, 75, 100, 125],
#               'lr_all': [0.002, 0.005],
#               'reg_all': [0.02, 0.04, 0.1, 0.2, 0.4, 0.6]}
# gs = model_selection.GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=8)

# gs.fit(data)

# # best RMSE score
# print(gs.best_score['rmse'])

# # combination of parameters that gave the best RMSE score
# print(gs.best_params['rmse'])

0.8688161351176816  
{'n_epochs': 50, 'n_factors': 10, 'lr_all': 0.005, 'reg_all': 0.2}


In [8]:
# Using SVD

algo = SVD(n_epochs= 50, n_factors= 10, lr_all= 0.005, reg_all= 0.2)
algo.fit(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)

surprise.accuracy.rmse(predictions, verbose=True);
surprise.accuracy.mae(predictions, verbose=True);
op_list = []

for user in users_drop:
    for movie in movies_drop:
        op_list.append(list(algo.predict(user, movie, hidden_ratings.get((user, movie), None))))
output_df = pd.DataFrame(op_list, columns=["User First Name", "Movie Name", "Original Rating", "Predicted Rating", "_"])
output_df.drop(columns=['_'], inplace=True)

output_df

RMSE: 0.8058
MAE:  0.6533


Unnamed: 0,User First Name,Movie Name,Original Rating,Predicted Rating
0,Kavish,The Social Network,,3.710761
1,Kavish,Mad Max: Fury Road,,3.448112
2,Kavish,Inception,4.0,4.211207
3,Kavish,Winter’s Bone,,3.77708
4,Kavish,A Serious Man,,3.508827
5,Kavish,Son of Saul,,3.646906
6,Jeet,The Social Network,5.0,4.356538
7,Jeet,Mad Max: Fury Road,4.0,4.08248
8,Jeet,Inception,5.0,4.849899
9,Jeet,Winter’s Bone,,4.42055


In [9]:
# User bias

user_bias_dict = {}

for inner_uid, user_bias in enumerate(algo.bu):
    user_bias_dict[trainset.to_raw_uid(inner_uid)]= user_bias

for user in users_drop:
    print(f"{user}: {user_bias_dict[user]}")

Kavish: -0.20650571144468266
Jeet: 0.4369634144218904
Neon: 0.03567892291208846
Keshore: 0.0591495891994447
Asha: 0.3205312842502365


In [10]:
# Item bias

item_bias_dict = {}

for inner_iid, item_bias in enumerate(algo.bi):
    item_bias_dict[trainset.to_raw_iid(inner_iid)]= item_bias

item_bias_series = pd.Series(item_bias_dict)
item_bias_series.sort_values(ascending=False).head(5)

Inception                   0.437079
The Secret in Their Eyes    0.225529
Spotlight                   0.211809
Inglourious Basterds        0.201582
The Imitation Game          0.201401
dtype: float64

In [11]:
# Popular items

df['Movie Name'].value_counts().head(5)

Avatar                     246
Inception                  225
The Wolf of Wall Street    210
Life of Pi                 201
La La Land                 194
Name: Movie Name, dtype: int64

In [12]:
# Item score (Popularity * Item Bias)

popularity_df = df['Movie Name'].value_counts()

# Min Max Normalization
popularity_df = (popularity_df-popularity_df.min())/(popularity_df.max()-popularity_df.min())
popularity_df = popularity_df.reset_index()

popularity_df.columns =['Movie Name', 'Popularity Score']

item_bias_df = item_bias_series.reset_index()
item_bias_df.columns =['Movie Name', 'Item Bias']

combined_df = popularity_df.merge(item_bias_df, on='Movie Name', how='inner')

combined_df['Combined Score'] = combined_df['Popularity Score']*combined_df['Item Bias']
combined_df.sort_values('Combined Score', ascending=False).head(5)

Unnamed: 0,Movie Name,Popularity Score,Item Bias,Combined Score
1,Inception,0.91358,0.437079,0.399307
5,The Imitation Game,0.674897,0.201401,0.135925
13,The Grand Budapest Hotel,0.514403,0.161296,0.082971
15,Inglourious Basterds,0.399177,0.201582,0.080467
10,The King's Speech,0.55144,0.130851,0.072156


In [13]:
for user in users_drop:
    rating_unbiased = []
    for movie in item_bias_dict.keys():
        rating_unbiased.append(algo.predict(user, movie, hidden_ratings.get((user, movie), None))[3] - item_bias_dict[movie])
    print(f"{user}: {round(sum(rating_unbiased)/len(rating_unbiased), 4)}")

Kavish: 3.779
Jeet: 4.4187
Neon: 4.0196
Keshore: 4.0384
Asha: 4.3093


In [14]:
for movie in ["Avatar", "The Wolf of Wall Street", "Inception"]:
    rating_unbiased = []
    for user in user_bias_dict.keys():
        rating_unbiased.append(algo.predict(user, movie, hidden_ratings.get((user, movie), None))[3] - user_bias_dict[user])
    print(f"{movie}: {round(sum(rating_unbiased)/len(rating_unbiased), 4)}")

Avatar: 3.9987
The Wolf of Wall Street: 4.0327
Inception: 4.4181


In [15]:
# Adding info about Camille, Shachi and Amy

df.loc[-1] = ["Camille", "Mack", "Precious", 2]
df.loc[-1] = ["Shachi", "Govil", "Precious", 2]
df.loc[-1] = ["Amy", "Russell", "Precious", 4]

df.loc[-1] = ["Camille", "Mack", "12 Years a Slave", 2]
df.loc[-1] = ["Shachi", "Govil", "12 Years a Slave", 5]
df.loc[-1] = ["Amy", "Russell", "12 Years a Slave", 3]

df.loc[-1] = ["Camille", "Mack", "Mad Max Fury Road", 4]
df.loc[-1] = ["Shachi", "Govil", "Mad Max Fury Road", 5]
df.loc[-1] = ["Amy", "Russell", "Mad Max Fury Road", 4]

df.loc[-1] = ["Camille", "Mack", "Black Swan", 3]
df.loc[-1] = ["Shachi", "Govil", "Black Swan", 4]
df.loc[-1] = ["Amy", "Russell", "Black Swan", 1]

df.loc[-1] = ["Camille", "Mack", "Toy Story 3", 3]
df.loc[-1] = ["Shachi", "Govil", "Toy Story 3", 3]
df.loc[-1] = ["Amy", "Russell", "Toy Story 3", 4]

In [16]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['User First Name', 'Movie Name', 'Rating']], reader)
trainset = data.build_full_trainset()

# Using SVD

algo = SVD(n_epochs= 50, n_factors= 10, lr_all= 0.005, reg_all= 0.2)
algo.fit(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)

surprise.accuracy.rmse(predictions, verbose=True);
surprise.accuracy.mae(predictions, verbose=True);
op_list = []

for user in ["Camille", "Shachi", "Amy"]:
    for movie in ["Avatar", "The Wolf of Wall Street", "Inception"]:
        op_list.append(list(algo.predict(user, movie, hidden_ratings.get((user, movie), None))))
output_df = pd.DataFrame(op_list, columns=["User First Name", "Movie Name", "Original Rating", "Predicted Rating", "_"])
output_df.drop(columns=['_'], inplace=True)

output_df

RMSE: 0.8057
MAE:  0.6533


Unnamed: 0,User First Name,Movie Name,Original Rating,Predicted Rating
0,Camille,Avatar,,3.998001
1,Camille,The Wolf of Wall Street,,4.032847
2,Camille,Inception,,4.420452
3,Shachi,Avatar,,3.998001
4,Shachi,The Wolf of Wall Street,,4.032847
5,Shachi,Inception,,4.420452
6,Amy,Avatar,,3.996018
7,Amy,The Wolf of Wall Street,,4.034041
8,Amy,Inception,,4.414806
