### 10 Recomender System for movies

In [1]:
import pandas as pd
import numpy as np
import pickle
import random
from sklearn.decomposition import NMF
from sklearn import decomposition, datasets, model_selection, preprocessing, metrics

##### 10.1: import and prep Data

In [2]:
# import data
ratings = pd.read_csv('data/ratings.csv')        # Ratings Data
movies = pd.read_csv('data/movies.csv')     # Movie table

In [3]:
# create column number of Ratings per Movie
ratings['n_ratings'] = ratings.groupby('movieId').transform('count')['rating']

In [4]:
# take into df only movies with more than 20 reviews
df = ratings[ratings['n_ratings']>20]

In [5]:
# create movie name dictionary from Movies table
movie_dict = dict(zip(movies.movieId , movies.title))

In [6]:
# create list of Movie titles used in each table
list_df = list(df['movieId'].unique())
list_dict = list(movie_dict.keys())

In [7]:
len(list_df), len(list_dict)

(1235, 9742)

In [8]:
# find Ids that are only in dict_list (movies that were rated by LESS than 20 users)
dict_only = set(list_dict).difference(list_df)

In [9]:
len(dict_only)

8507

In [10]:
# remove those titles from Movie_dictionarry
movie_dict_final = movie_dict
for i, id in enumerate(dict_only):
    del movie_dict_final[id]

In [11]:
len(list_df), len(movie_dict_final)

(1235, 1235)

In [12]:
# save Dictionary
with open(r"movie_dict.pickle", "wb") as output_file:
    pickle.dump(movie_dict_final, output_file)

##### 10.2: create a Base-Line Recommendor

In [13]:
base_line = df

In [14]:
# Average Rating per Movie
base_line['average_rating'] = base_line.groupby('movieId').transform(np.average)['rating']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_line['average_rating'] = base_line.groupby('movieId').transform(np.average)['rating']


In [15]:
# final df - MovieID, Movie name, Average Rating per Movie
base_line = base_line.drop_duplicates(subset='movieId', keep='first')

In [16]:
base_line = base_line[['movieId', 'average_rating']]

In [17]:
base_line = base_line.reset_index()
base_line.drop('index', axis=1, inplace=True)
base_line

Unnamed: 0,movieId,average_rating
0,1,3.920930
1,3,3.259615
2,6,3.946078
3,47,3.975369
4,50,4.237745
...,...,...
1230,1958,3.795455
1231,3979,2.500000
1232,4247,2.380952
1233,2986,2.640000


In [18]:
# add Movie name
base_line['name'] = 'a'
for i in range(1235):
    base_line['name'][i] = movie_dict_final[base_line['movieId'][i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_line['name'][i] = movie_dict_final[base_line['movieId'][i]]


In [21]:
# create Movie-Name:Avergae-Rating Dictionary
average_dict = dict(zip(base_line.name , base_line.average_rating))

In [None]:
# save Dictionary
with open(r"average_dict.pickle", "wb") as output_file:
    pickle.dump(average_dict, output_file)

In [22]:
# extract top-10 rated movies
top_10 = base_line.sort_values(by='average_rating', ascending=False).iloc[:10,:]

In [23]:
# create Top-10 Dictionary
top10_dict = dict(zip(top_10.name , top_10.average_rating))
top10_dict

{'Shawshank Redemption, The (1994)': 4.429022082018927,
 'Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)': 4.333333333333333,
 'Philadelphia Story, The (1940)': 4.310344827586207,
 'Lawrence of Arabia (1962)': 4.3,
 'In the Name of the Father (1993)': 4.3,
 'Hoop Dreams (1994)': 4.293103448275862,
 'Godfather, The (1972)': 4.2890625,
 'Harold and Maude (1971)': 4.288461538461538,
 'Logan (2017)': 4.28,
 'Fight Club (1999)': 4.272935779816514}

In [None]:
# save Dictionary
with open(r"top10_dict.pickle", "wb") as output_file:
    pickle.dump(top10_dict, output_file)

##### 10.3: Create a Predictive Recomendor (NMF)

Create R Matrix

In [24]:
# create R Matrix
r_true = df.pivot(index='userId', columns='movieId', values='rating')

In [25]:
r_true

movieId,1,2,3,5,6,7,10,11,16,17,...,122920,122922,134130,134853,139385,148626,152081,164179,166528,168252
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,4.0,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,2.5,,2.5,,4.0,...,,,,,,,,,,
607,4.0,,,,,,,3.0,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,4.0,,4.5,,...,,,,,,,,,,
609,3.0,,,,,,4.0,,,,...,,,,,,,,,,


Fill NaNs

In [26]:
# fill missing Values (with movie Average)
r_true = r_true.fillna(r_true.mean())

In [27]:
r_true

movieId,1,2,3,5,6,7,10,11,16,17,...,122920,122922,134130,134853,139385,148626,152081,164179,166528,168252
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.00000,3.431818,4.000000,3.071429,4.000000,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
2,3.92093,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
3,3.92093,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
4,3.92093,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
5,4.00000,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.50000,3.431818,3.259615,3.071429,3.946078,2.500000,3.496212,2.500000,3.926829,4.000000,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
607,4.00000,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.000000,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
608,2.50000,2.000000,2.000000,3.071429,3.946078,3.185185,4.000000,3.671429,4.500000,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
609,3.00000,3.431818,3.259615,3.071429,3.946078,3.185185,4.000000,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28


Create and train model

In [28]:
# create model
model = NMF(n_components=10, max_iter= 500)

In [29]:
model.fit(r_true)



NMF(max_iter=500, n_components=10)

In [30]:
# get Q and P
Q = model.components_
P = model.transform(r_true)

In [31]:
# check error
model.reconstruction_err_

214.61856776547143

In [32]:
# Explained Variance [0..1]
scorer=metrics.explained_variance_score

In [33]:
prediction = model.inverse_transform(model.transform(r_true))
scorer(r_true, prediction)

0.15295576859360813

-----> Explained variance is low. try different n_components in order to increase it

In [34]:
# a function that calculates explained_variance
def get_score(model, data, scorer=metrics.explained_variance_score):
    """ Estimate performance of the model on the data """
    prediction = model.inverse_transform(model.transform(data))
    return scorer(data, prediction)

In [35]:
# test different n_components and calculate explained variance
k = [40, 70, 100, 130]
perfs_train = []
for k in k:
    m2 = NMF(n_components=k).fit(r_true)
    perfs_train.append(get_score(m2, r_true))
print(perfs_train)



[0.3731945908656665, 0.5068384174025595, 0.6016656956878967, 0.6732131913837525]




In [37]:
# try higher values
k = [150, 170, 190, 210, 250]
perfs_train = []
for k in k:
    m2 = NMF(n_components=k).fit(r_true)
    perfs_train.append(get_score(m2, r_true))
print(perfs_train)



[0.712467623529412, 0.7465954777113228, 0.7763858526403236, 0.8031271825525008, 0.8461030906775071]




In [38]:
print(m2)

NMF(n_components=250)


In [None]:
# Save last model with chosen k_value (250)
with open(r"nmf_movierecomendor250.pickle", "wb") as output_file:
    pickle.dump(m2, output_file)

In [39]:
# get P and Q for m2 model
# get Q and P
Q2 = m2.components_
P2 = m2.transform(r_true)



##### 10.4: Create re-constructed R (predicted)

In [40]:
R = pd.DataFrame(np.dot(P2, Q2).round(), index=r_true.index, columns=movie_dict_final.values())

In [41]:
R

Unnamed: 0_level_0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.0,4.0,3.0,4.0,3.0,3.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
2,4.0,3.0,3.0,3.0,4.0,3.0,3.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
3,4.0,3.0,3.0,3.0,4.0,3.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
4,4.0,3.0,3.0,3.0,4.0,3.0,3.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
5,4.0,3.0,3.0,3.0,4.0,3.0,3.0,4.0,4.0,3.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.0,3.0,3.0,3.0,4.0,2.0,4.0,2.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
607,4.0,3.0,3.0,3.0,4.0,3.0,4.0,3.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
608,2.0,2.0,2.0,3.0,4.0,3.0,4.0,4.0,5.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
609,3.0,3.0,3.0,3.0,4.0,3.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0


##### 10.5: Create a functions that predicts for a new user

In [67]:
# get recomendation from user for 5 movies (here randomly chosen, in webappf user chooses)
def get_reviews():
    user_reviews_dict = {}
    for i in range(5):
        movie_t = movie_dict[random.choice(list(movie_dict))]
        score = input(f'please rate this movie: {movie_t}')
        user_reviews_dict[movie_t] = score
    return user_reviews_dict

In [68]:
# create user data frame with NaNs and add user reviews for 5 movies
def create_table(user_reviews, movie_dict_final):
    user_df = pd.DataFrame(np.nan, index=[0], columns=movie_dict.values())
    for i, title in enumerate(user_reviews.keys()):
        user_df[title][0] = user_reviews[title]
    return user_df

In [69]:
def calculate_recom(user_df, movie_dict, average_dict, Q2):
    user_array = user_df.copy(deep=True).values
    for title in movie_dict.values():
        user_df.loc[user_df[title].isnull(),title] = average_dict[title]
    user_imp_array = user_df.values
    user_P = m2.transform(user_imp_array)
    user_R = np.dot(user_P, Q2)
    user_recom = pd.DataFrame({'user_input':user_array[0], 'predicted_ratings':user_R[0]}, index = user_df.columns)
    return user_recom

In [70]:
def recommend_10():
    user_reviews = get_reviews()
    user_df = create_table(user_reviews, movie_dict)
    user_recom = calculate_recom(user_df, movie_dict, average_dict, Q2)
    recom_10 = user_recom[user_recom['user_input'].isna()].sort_values(by = 'predicted_ratings', ascending= False).iloc[:10,:]
    return recom_10

In [71]:
recom_10 = recommend_10()
recom_10



Unnamed: 0,user_input,predicted_ratings
"Shawshank Redemption, The (1994)",,4.435571
Fight Club (1999),,4.325953
"Godfather: Part II, The (1974)",,4.308605
Schindler's List (1993),,4.306149
In the Name of the Father (1993),,4.301071
Hoop Dreams (1994),,4.297369
"Third Man, The (1949)",,4.29716
Casablanca (1942),,4.296808
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),,4.29428
"Matrix, The (1999)",,4.289288


##### 10.6: Create a flask web interface + connect the model (see web_app folder)