This notebook seeks to explore various techniques for implementing recommender systems, namely

* Popularity-based - recommend items with high rating 
    * weighted mean item ratings
    * trending, last-watched
    
    
* Content-based - recommend similar items
    * Cosine similarity of item metadata 
    
    
* Collaborative Filtering - recommend items that similar users also like
    * Matrix Factorization
    * Nearest Neighbours
    * Deep learning approaches
    

Other Challenges
* Cold-Start Problem
* Efficiency vs Accuracy

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# To create plots
import matplotlib.pyplot as plt

# To create interactive plots
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# To shift lists
from collections import deque

# To compute similarities between vectors
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# To use recommender systems
import surprise as sp
from surprise.model_selection import cross_validate

# To create deep learning models
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

# To create sparse matrices
from scipy.sparse import coo_matrix

# To light fm
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

# To stack sparse matrices
from scipy.sparse import vstack
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found,
# so the dataset paths used by the loading cells can be checked at a glance
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/the-movies-dataset/links.csv
/kaggle/input/the-movies-dataset/ratings_small.csv
/kaggle/input/the-movies-dataset/links_small.csv
/kaggle/input/the-movies-dataset/credits.csv
/kaggle/input/the-movies-dataset/ratings.csv
/kaggle/input/the-movies-dataset/movies_metadata.csv
/kaggle/input/the-movies-dataset/keywords.csv
/kaggle/input/netflix-prize-data/qualifying.txt
/kaggle/input/netflix-prize-data/probe.txt
/kaggle/input/netflix-prize-data/movie_titles.csv
/kaggle/input/netflix-prize-data/combined_data_3.txt
/kaggle/input/netflix-prize-data/combined_data_2.txt
/kaggle/input/netflix-prize-data/README
/kaggle/input/netflix-prize-data/combined_data_1.txt
/kaggle/input/netflix-prize-data/combined_data_4.txt


### Dataset Preprocessing

Lets start by using the Netflix prize datasets
* 17K+ Movies
* Descriptions of each movie
* 24M movie ratings from users

In [2]:
# Read the Netflix title catalogue. The file ships without a header row, so the
# column names are supplied here and the movie id is promoted to the index.
movie_titles = (
    pd.read_csv(
        '../input/netflix-prize-data/movie_titles.csv',
        header=None,
        names=['Id', 'Year', 'Name'],
        encoding='ISO-8859-1',
    )
    .set_index('Id')
)

print('Shape Movie-Titles:\t{}'.format(movie_titles.shape))
movie_titles.head()

Shape Movie-Titles:	(17770, 2)


Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


In [3]:
# Load title, overview and vote count from the movie metadata dataset, index by
# title, drop incomplete rows, then remove the long tail of rarely rated movies
# (10 votes or fewer). The vote count is only needed for that filter.
movie_metadata = (
    pd.read_csv('../input/the-movies-dataset/movies_metadata.csv', low_memory=False)
    .loc[:, ['original_title', 'overview', 'vote_count']]
    .set_index('original_title')
    .dropna()
    .loc[lambda frame: frame['vote_count'] > 10]
    .drop('vote_count', axis=1)
)

print('Shape Movie-Metadata:\t{}'.format(movie_metadata.shape))
movie_metadata.head()

Shape Movie-Metadata:	(21604, 1)


Unnamed: 0_level_0,overview
original_title,Unnamed: 1_level_1
Toy Story,"Led by Woody, Andy's toys live happily in his ..."
Jumanji,When siblings Judy and Peter discover an encha...
Grumpier Old Men,A family wedding reignites the ancient feud be...
Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
Father of the Bride Part II,Just when George Banks has recovered from his ...


In [4]:
# Load single data-file
# Only the first three comma-separated fields of every line are read. Lines that
# carry no second field end up with Rating = NaN; the code below treats those as
# movie header lines (presumably of the form "<movie id>:" -- TODO confirm against
# the dataset README).
df_raw = pd.read_csv('../input/netflix-prize-data/combined_data_1.txt', header=None, names=['User', 'Rating', 'Date'], usecols=[0, 1, 2])

# Find empty rows to slice dataframe for each movie
# tmp_movies holds (row position in df_raw, header text) for every header line
tmp_movies = df_raw[df_raw['Rating'].isna()]['User'].reset_index()
# movie[:-1] strips the last character of the header text before converting to int
movie_indices = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
# After rotate(-1) entry i of the deque is the header of movie i+1; the last entry
# wraps around to the first header, which is what the `else` branch below detects.
shifted_movie_indices = deque(movie_indices)
shifted_movie_indices.rotate(-1)
user_data = []

# Iterate over all movies 
for [df_id_1, movie_id], [df_id_2, next_movie_id] in zip(movie_indices, shifted_movie_indices):
    if df_id_1<df_id_2: # Next header lies further down the file: take the rows strictly between the two headers
        tmp_df = df_raw.loc[df_id_1+1:df_id_2-1].copy()
    else: # Wrapped around, so this is the last movie: take everything up to the end of the file
        tmp_df = df_raw.loc[df_id_1+1:].copy()
        
    # Create movie_id column and append df
    tmp_df['Movie'] = movie_id
    user_data.append(tmp_df)

# Combine all dataframes
df = pd.concat(user_data)
# Free the intermediates (including the leaked loop variables) to save kernel memory
del user_data, df_raw, tmp_movies, tmp_df, shifted_movie_indices, movie_indices, df_id_1, movie_id, df_id_2, next_movie_id
print('Shape User-Ratings:\t{}'.format(df.shape))
df.sample(5)

Shape User-Ratings:	(24053764, 4)


Unnamed: 0,User,Rating,Date,Movie
15320052,1437491,5.0,2005-06-19,2942
13748072,1585984,4.0,2003-06-08,2617
11701589,1582756,5.0,2005-06-19,2212
15339400,1497697,4.0,2005-09-01,2944
5783496,1289706,3.0,2005-06-10,1145


For efficiency, this demonstration keeps only the roughly 500 most-rated movies and the most active users.

In [5]:
# Thresholds: a movie / user must have MORE than this many ratings to be kept
min_movie_ratings = 10000
min_user_ratings = 200

# Ids of the movies and users that clear their threshold
filter_movies = (
    df['Movie'].value_counts()
    .loc[lambda counts: counts > min_movie_ratings]
    .index.tolist()
)
filter_users = (
    df['User'].value_counts()
    .loc[lambda counts: counts > min_user_ratings]
    .index.tolist()
)

# Keep only ratings whose movie AND user both survived the filter (~ 4M ratings)
df_filterd = df.loc[df['Movie'].isin(filter_movies) & df['User'].isin(filter_users)]
del filter_movies, filter_users, min_movie_ratings, min_user_ratings
print('Shape User-Ratings unfiltered:\t{}'.format(df.shape))
print('Shape User-Ratings filtered:\t{}'.format(df_filterd.shape))

Shape User-Ratings unfiltered:	(24053764, 4)
Shape User-Ratings filtered:	(4178032, 4)


In [6]:
# Shuffle the filtered ratings and split them into training and testing data.
# - errors='ignore' lets the cell be re-run without a KeyError once 'Date' has
#   already been dropped from df_filterd.
# - A fixed random_state makes the shuffle, and therefore the split and every
#   RMSE reported further down, reproducible across kernel restarts.
df_filterd = (
    df_filterd
    .drop('Date', axis=1, errors='ignore')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# The last `test_size` shuffled rows are held out for testing
test_size = 100000
df_train = df_filterd[:-test_size]
df_test = df_filterd[-test_size:]
df_train.head()

Unnamed: 0,User,Rating,Movie
0,509987,3.0,811
1,581517,3.0,1180
2,1320147,5.0,3860
3,1170130,5.0,1144
4,337793,4.0,2866


In [7]:
df_test.head()

Unnamed: 0,User,Rating,Movie
4078032,1191964,4.0,4157
4078033,1889384,3.0,3725
4078034,1056238,3.0,3320
4078035,253434,5.0,1174
4078036,322078,4.0,2782


Next we want to create a large, sparse matrix to facilitate the recommendation algorithms that we will be building. It has one row per user and one column per movie: roughly 20K users by 490+ movies.

In [8]:
# Pivot the training ratings into a user x movie matrix; pairs without a rating stay NaN
df_p = pd.pivot_table(df_train, values='Rating', index='User', columns='Movie')
print('Shape User-Movie-Matrix:\t{}'.format(df_p.shape))
df_p.head()

Shape User-Movie-Matrix:	(20828, 491)


Movie,8,18,28,30,58,77,83,97,108,111,...,4392,4393,4402,4418,4420,4432,4472,4479,4488,4490
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000079,,,2.0,4.0,,,3.0,,,,...,,,,,3.0,3.0,3.0,4.0,2.0,
1000192,,,2.0,4.0,3.0,2.0,3.0,,,,...,,4.0,,,,4.0,,5.0,,
1000301,,4.0,3.0,,,,,,,,...,4.0,,3.0,,,3.0,4.0,,4.0,
1000387,,,4.0,,2.0,4.0,,,2.0,,...,4.0,2.0,2.0,4.0,,3.0,2.0,,2.0,
1000410,,4.0,,4.0,,,,,,3.0,...,,3.0,3.0,,,4.0,4.0,4.0,3.0,3.0


### Popularity-Based Recommendation

Computing the mean rating for all movies creates a ranking. The recommendation will be the same for all users and can be used if there is no information on the user. Variations of this approach are separate rankings for each country/year/gender/..., used individually to recommend movies/items to the user. However, using the mean rating of a movie alone is biased and favours movies with few ratings, since the mean of a large number of ratings tends to be less extreme. To tackle the problem of the unstable mean with few ratings, IMDb, for example, uses a weighted rating $WR = \frac{v}{v+m} R + \frac{m}{v+m} C$, where $R$ is the movie's mean rating, $v$ its number of ratings, $m$ the minimum number of votes to be considered and $C$ the mean rating over all movies. Many good ratings outweigh few in this algorithm.

Questions
* Why don't we use the original count and rating? Because we only want to include active users.
* How do we deal with NaN values? We don't impute them; the means and counts simply ignore them.

In [9]:
# Popularity baseline: rank movies by a weighted rating that shrinks each movie's
# mean towards the global mean, so movies with few ratings cannot dominate:
#     weighted = v/(v+m) * R + m/(v+m) * C
m = 1000 # Number of minimum votes to be considered (weight given to the global mean)
n = 10 # Number of top-ranked movies shown in the plot
C = df_p.stack().mean() # Mean over all observed ratings (stack() drops the NaNs)
R = df_p.mean(axis=0).values # Mean rating for all movies separately
v = df_p.count().values # Rating count for all movies separately
weighted_score = (v/ (v+m) *R) + (m/ (v+m) *C)
# Column positions ordered best-first, and the scores in that same order
weighted_ranking = np.argsort(weighted_score)[::-1]
weighted_score = np.sort(weighted_score)[::-1]
weighted_movie_ids = df_p.columns[weighted_ranking]
ratings_count = df_p.count(axis=0).rename('Rating-Count').to_frame()

# Join labels and predictions: every test rating is predicted by its movie's weighted score
df_prediction = df_test.set_index('Movie').join(pd.DataFrame(weighted_score, index=weighted_movie_ids, columns=['Prediction']))[['Rating', 'Prediction']]
y_true = df_prediction['Rating']
y_pred = df_prediction['Prediction']
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))

# Create DataFrame for plotting
df_plot = pd.DataFrame(weighted_score[:n], columns=['Rating'])
# Use n here (previously a hard-coded 10) so the index always matches the n rows above
df_plot.index = weighted_movie_ids[:n]
ranking_weighted_rating = df_plot.join(ratings_count).join(movie_titles)


# Create trace
trace = go.Bar(x = ranking_weighted_rating['Rating'],
               text = ranking_weighted_rating['Name'].astype(str) +': '+ ranking_weighted_rating['Rating-Count'].astype(str) + ' Ratings',
               textposition = 'outside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n+1)),
               marker = dict(color = '#db0000'))
# Create layout
layout = dict(title = 'Ranking Of Top {} Weighted-Movie-Ratings: {:.4f} RMSE'.format(n, rmse),
              xaxis = dict(title = 'Weighted Rating',
                          range = (4.15, 4.6)),
              yaxis = dict(title = 'Movie'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [10]:
ranking_weighted_rating.head()

Unnamed: 0_level_0,Rating,Rating-Count,Year,Name
Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2452,4.379357,18439,2001.0,Lord of the Rings: The Fellowship of the Ring
3962,4.323747,17395,2003.0,Finding Nemo (Widescreen)
4306,4.29287,19150,1999.0,The Sixth Sense
2862,4.284901,18808,1991.0,The Silence of the Lambs
3290,4.2683,13116,1974.0,The Godfather


In [11]:
#movie_metadata = pd.read_csv('../input/the-movies-dataset/movies_metadata.csv', low_memory=False)
#movie_metadata.head()

In [12]:
del df_plot, weighted_ranking, weighted_score, weighted_movie_ids, ratings_count

### Collaborative Filtering: User Similarity

"Other users are also watching"

This recommendation strategy recommends movies that other similar users are also interested in. 

Interpreting each row of the matrix as a vector, a similarity between all user-vectors can be computed. This enables us to find all similar users and to work on user-specific recommendations. Recommending high rated movies of similar users to a specific user seems reasonable.

Since there are still empty values left in the matrix, we have to use a reliable way to impute a decent value. A simple first approach is to fill in the mean of each user into the empty values.

Afterwards the ratings of all similar users will be weighted with their similarity score and the mean will be computed. Filtering for the unrated movies of a user reveals the best recommendations.

You can easily adapt this process to find similar items by computing the item-item similarity the same way. Since the matrix is mostly sparse and there are more users than items, this could be better for the RMSE score.

In [13]:
# Defaults for the user-similarity demo
user_index, n_recommendation, n_plot = 0, 100, 10

# Replace every missing rating with the mean rating of the user (row) it belongs to;
# axis=0 aligns the per-user means with the row index
df_p_imputed = df_p.where(df_p.notna(), df_p.mean(axis=1), axis=0)
df_p_imputed.head()

Movie,8,18,28,30,58,77,83,97,108,111,...,4392,4393,4402,4418,4420,4432,4472,4479,4488,4490
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000079,2.852071,2.852071,2.0,4.0,2.852071,2.852071,3.0,2.852071,2.852071,2.852071,...,2.852071,2.852071,2.852071,2.852071,3.0,3.0,3.0,4.0,2.0,2.852071
1000192,3.135246,3.135246,2.0,4.0,3.0,2.0,3.0,3.135246,3.135246,3.135246,...,3.135246,4.0,3.135246,3.135246,3.135246,4.0,3.135246,5.0,3.135246,3.135246
1000301,3.361111,4.0,3.0,3.361111,3.361111,3.361111,3.361111,3.361111,3.361111,3.361111,...,4.0,3.361111,3.0,3.361111,3.361111,3.0,4.0,3.361111,4.0,3.361111
1000387,2.878049,2.878049,4.0,2.878049,2.0,4.0,2.878049,2.878049,2.0,2.878049,...,4.0,2.0,2.0,4.0,2.878049,3.0,2.0,2.878049,2.0,2.878049
1000410,3.323671,4.0,3.323671,4.0,3.323671,3.323671,3.323671,3.323671,3.323671,3.0,...,3.323671,3.0,3.0,3.323671,3.323671,4.0,4.0,4.0,3.0,3.0


In [14]:
# Compute the cosine similarity between all pairs of user rating vectors.
# The result is a dense N x N matrix, where N is the number of users.
similarity = cosine_similarity(df_p_imputed.values)
# Remove self-similarity so a user is never selected as their own neighbour.
# fill_diagonal works in place; subtracting np.eye(N) would allocate a second
# dense N x N array just to clear N entries.
np.fill_diagonal(similarity, 0)
print(np.shape(similarity))
# similarity # An NxN matrix of similarity score for each user

(20828, 20828)


In [15]:
def get_movies_of_top_n_similar_users(similarity, user_index=0, n_recommendation=100):
    """Recommend unrated movies to one user from the ratings of their most similar users.

    Reads the notebook-level frames ``df_p`` (raw user x movie ratings),
    ``df_p_imputed`` (same matrix with gaps filled) and ``movie_titles``.

    Args:
        similarity: square user-user similarity matrix, indexed by row position.
        user_index: row position of the target user.
        n_recommendation: number of most similar users whose ratings are averaged.

    Returns:
        DataFrame indexed by movie id, sorted by descending score (column ``0``),
        joined with the columns of ``movie_titles``. Only movies the target user
        has not rated are included.
    """
    user_similarities = similarity[user_index]

    # Row positions of the closest users, most similar first, and their scores
    neighbour_positions = np.argsort(user_similarities)[::-1][:n_recommendation]
    neighbour_scores = user_similarities[neighbour_positions]

    # Movies the target user has no rating for in the raw matrix
    own_ratings = df_p.iloc[user_index]
    unrated_movies = own_ratings[own_ratings.isna()].index

    # Scale every neighbour's (imputed) ratings by that neighbour's similarity,
    # then average over the neighbours to get one score per movie
    neighbour_ratings = df_p_imputed.iloc[neighbour_positions]
    mean_movie_recommendations = neighbour_ratings.mul(neighbour_scores, axis=0).mean(axis=0)

    # Restrict to unrated movies, rank them and attach the title information
    ranked = mean_movie_recommendations[unrated_movies].sort_values(ascending=False)
    best_movie_recommendations = ranked.to_frame().join(movie_titles)

    return best_movie_recommendations

get_movies_of_top_n_similar_users(similarity)

Unnamed: 0_level_0,0,Year,Name
Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3938,4.498563,2004.0,Shrek 2
4306,4.496051,1999.0,The Sixth Sense
3962,4.443040,2003.0,Finding Nemo (Widescreen)
1144,4.406760,1991.0,Fried Green Tomatoes
191,4.405994,2003.0,X2: X-Men United
...,...,...,...
1289,4.094712,1990.0,Look Who's Talking Too
3385,4.085870,2002.0,28 Days Later
1267,4.082044,2001.0,Dr. Dolittle 2
3254,3.993826,2003.0,Daredevil


In [16]:
# clear up ram
del similarity

### Content-Based: TFIDF Movie Metadata Similarity

"Because you watched this" 

If there is no historical data for a user or there is reliable metadata for each movie, it can be useful to compare the metadata of the movies to find similar ones.
In this approach I will use the movie description to create a TF-IDF matrix, which counts and weights words in all descriptions, and compute the cosine similarity between those sparse text vectors. This can easily be extended to more or different features if you like.
Unfortunately it is not possible to compute an RMSE score for this model, since it does not predict ratings directly.
In this way it is possible to find movies closely related to each other, but it is hard to find movies of different genres/categories.

In [17]:
def get_top_n_similar_movies(movie_metadata, n=10, movie='Batman Begins', max_features=100):
    """Return the titles of the ``n`` movies whose overview is most similar to ``movie``.

    Overviews are turned into TF-IDF vectors and compared by cosine similarity.

    Args:
        movie_metadata: DataFrame indexed by movie title with an ``overview`` column.
        n: number of similar titles to return.
        movie: title to look up in the index; the first match is used.
        max_features: vocabulary size limit passed to ``TfidfVectorizer``.

    Returns:
        pandas Index with the titles of the most similar movies, best first.

    Raises:
        IndexError: if ``movie`` does not occur in ``movie_metadata.index``.
    """
    # Positional index of the query movie (first occurrence of the title)
    matches = np.flatnonzero(movie_metadata.index == movie)
    if matches.size == 0:
        raise IndexError('movie {!r} not found in movie_metadata index'.format(movie))
    index = matches[0]

    # Fill missing overviews instead of dropping them, so that row i of the tf-idf
    # matrix always describes row i of movie_metadata
    overviews = movie_metadata['overview'].fillna('')
    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
    tfidf_matrix = tfidf.fit_transform(overviews)

    # Only the query row is needed: compare it against all movies instead of
    # building the full dense N x N similarity matrix
    scores = cosine_similarity(tfidf_matrix[index], tfidf_matrix).ravel()
    # Exclude the query movie itself (tf-idf cosine similarities are never negative)
    scores[index] = -1.0

    # Get titles of the n most similar movies
    similar_movies_index = np.argsort(scores)[::-1][:n]
    similar_movie_titles = movie_metadata.iloc[similar_movies_index].index

    return similar_movie_titles

get_top_n_similar_movies(movie_metadata)

Index(['Stir Crazy', 'Bound for Glory', 'All of My Heart', 'Import/Export',
       'FearDotCom', 'Mr. Blandings Builds His Dream House',
       'Hunt for the Wilderpeople', 'As Good as It Gets',
       'The Internet's Own Boy: The Story of Aaron Swartz',
       ''Tis the Season for Love'],
      dtype='object', name='original_title')

### Collaborative Filtering: Matrix Factorization with Gradient Descent

The user-movie rating matrix is high dimensional and sparse, therefore I am going to reduce the dimensionality to represent the data in a dense form.
Using matrix factorisation a large matrix can be estimated/decomposed into two long but slim matrices. With gradient descent it is possible to adjust these matrices to represent the given ratings. The gradient descent algorithm finds latent variables which represent the underlying structure of the dataset. Afterwards these latent variables can be used to reconstruct the original matrix and to predict the missing ratings for each user.
In this case the model has not been trained to convergence and is not hyperparameter optimized.

In [18]:
# Translate raw user / movie ids into contiguous indices usable by embedding layers
user_id_mapping = {raw_id: position for position, raw_id in enumerate(df_filterd['User'].unique())}
movie_id_mapping = {raw_id: position for position, raw_id in enumerate(df_filterd['Movie'].unique())}


# Index-encoded model inputs for the training and the test split
train_user_data = df_train['User'].map(user_id_mapping)
test_user_data = df_test['User'].map(user_id_mapping)

train_movie_data = df_train['Movie'].map(movie_id_mapping)
test_movie_data = df_test['Movie'].map(movie_id_mapping)


# Embedding table sizes and the number of latent factors per user / movie
users, movies = len(user_id_mapping), len(movie_id_mapping)
embedding_size = 10


##### Create model
# One integer id per sample for each of the two inputs
user_id_input = Input(shape=(1,), name='user')
movie_id_input = Input(shape=(1,), name='movie')

# Learnable latent vectors for users and movies
user_embedding = Embedding(input_dim=users,
                           output_dim=embedding_size,
                           input_length=1,
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(input_dim=movies,
                            output_dim=embedding_size,
                            input_length=1,
                            name='item_embedding')(movie_id_input)

# Drop the length-1 sequence axis so each sample is a flat latent vector
user_vector = Reshape((embedding_size,))(user_embedding)
movie_vector = Reshape((embedding_size,))(movie_embedding)

# The predicted rating is the dot product of the user and movie vectors
y = Dot(axes=1, normalize=False)([user_vector, movie_vector])

# Setup model
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
model.compile(optimizer='adam', loss='mse')


# Fit model for a single epoch, holding out 10% of the training rows for validation
model.fit([train_user_data, train_movie_data],
          df_train['Rating'],
          epochs=1,
          batch_size=256,
          shuffle=True,
          validation_split=0.1)

# Test model
y_true = df_test['Rating'].values
y_pred = model.predict([test_user_data, test_movie_data])

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
print('\n\nTesting Result With Keras Matrix-Factorization: {:.4f} RMSE'.format(rmse))



Testing Result With Keras Matrix-Factorization: 0.9275 RMSE


### Matrix Factorization: Deep Learning using various MetaData

With its embedding layers this is similar to the matrix factorization approach above, but instead of using a fixed dot-product as recommendation we will utilize some dense layers so the network can find better combinations. One advantage of deep learning models is, that movie-metadata can easily be added to the model.

I will tf-idf transform the short description of all movies to a sparse vector. The model will learn to reduce the dimensionality of this vector and how to combine metadata with the embedding of the user-id and the movie-id. In this way you can add any additional metadata to your own recommender.

These kind of hybrid systems can learn how to reduce the impact of the cold start problem.

In [19]:
# Re-create the id mappings, this time over ALL ratings in df (not only the filtered subset)
user_id_mapping = {id: i for i, id in enumerate(df['User'].unique())}
movie_id_mapping = {id: i for i, id in enumerate(df['Movie'].unique())}
# NOTE(review): the next two lines overwrite df in place, so from here on df['User'] and
# df['Movie'] hold embedding indices instead of raw ids. Re-running this cell without
# reloading df maps the already-mapped values a second time.
df['User'] = df['User'].map(user_id_mapping)
df['Movie'] = df['Movie'].map(movie_id_mapping)

# Preprocess metadata
# Lower-case the title index so titles can be matched case-insensitively
tmp_metadata = movie_metadata.copy()
tmp_metadata.index = tmp_metadata.index.str.lower()

# Preprocess titles
# Index the Netflix titles by lower-cased name, keeping the Netflix 'Id' as a column
tmp_titles = movie_titles.drop('Year', axis=1).copy()
tmp_titles = tmp_titles.reset_index().set_index('Name')
tmp_titles.index = tmp_titles.index.str.lower()

# Combine titles and metadata
# Join on the lower-cased title and keep only titles that have an overview. The result is
# indexed by the RAW Netflix 'Id' from movie_titles, not by the indices in movie_id_mapping.
df_id_descriptions = tmp_titles.join(tmp_metadata).dropna().set_index('Id')
df_id_descriptions['overview'] = df_id_descriptions['overview'].str.lower()
del tmp_metadata,tmp_titles
df_id_descriptions.head()


Unnamed: 0_level_0,overview
Id,Unnamed: 1_level_1
7756,an ethical baltimore defense lawyer disgusted ...
2945,a hollywood songwriter goes through a mid-life...
14249,a hollywood songwriter goes through a mid-life...
3463,"bianca, a tenth grader, has never gone on a da..."
11972,"based on the real-life richard speck murders, ..."


In [20]:
# df['Movie'] holds embedding indices (values of movie_id_mapping) at this point,
# while df_id_descriptions is still keyed by the raw Netflix id. Re-key the
# descriptions of the rated movies to the embedding index before joining, so that
# every rating is paired with the overview of its own movie.
descriptions_by_index = (
    df_id_descriptions[df_id_descriptions.index.isin(list(movie_id_mapping))]
    .rename(index=movie_id_mapping)
)

# Filter all ratings with metadata, split into training and testing sets
df_hybrid = (
    df.drop('Date', axis=1)
    .set_index('Movie')
    .join(descriptions_by_index)
    .dropna()
    .drop('overview', axis=1)
    .reset_index()
    .rename({'index':'Movie'}, axis=1)
)
del descriptions_by_index
n = 100000
# Fixed seed so the split and the reported RMSE are reproducible
df_hybrid = df_hybrid.sample(frac=1, random_state=42).reset_index(drop=True)
# Hold out the last n rows for testing and train on (at most) the first 1.5M of
# the remaining rows, so the two sets can never overlap
df_hybrid_test = df_hybrid[-n:]
df_hybrid_train = df_hybrid[:-n][:1500000]


# Create tf-idf matrix for text comparison (fitted on all available descriptions)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_hybrid = tfidf.fit_transform(df_id_descriptions['overview'])
# Embedding index of a movie -> row of tfidf_hybrid holding its description
mapping = {movie_id_mapping[raw_id]: row
           for row, raw_id in enumerate(df_id_descriptions.index)
           if raw_id in movie_id_mapping}

# Select the tf-idf row of every rated movie with a single sparse row-indexing
# operation instead of appending one row at a time and stacking them
train_tfidf = tfidf_hybrid[df_hybrid_train['Movie'].map(mapping).values]
test_tfidf = tfidf_hybrid[df_hybrid_test['Movie'].map(mapping).values]

In [21]:
##### Setup the network variables
# Size of the latent vectors learned for users and movies
user_embed = 10
movie_embed = 10

# Three inputs: the user index, the movie index and the movie's tf-idf vector
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='movie')
# Take the input width from the fitted tf-idf matrix instead of hard-coding the
# vocabulary size, which changes whenever the descriptions change
tfidf_input = Input(shape=[train_tfidf.shape[1]], name='tfidf', sparse=True)

# Create separate embeddings for users and movies
user_embedding = Embedding(output_dim=user_embed,
                           input_dim=len(user_id_mapping),
                           input_length=1,
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=movie_embed,
                            input_dim=len(movie_id_mapping),
                            input_length=1,
                            name='movie_embedding')(movie_id_input)

# Compress the sparse tf-idf vector with two dense layers
tfidf_vectors = Dense(128, activation='relu')(tfidf_input)
tfidf_vectors = Dense(32, activation='relu')(tfidf_vectors)
# Flatten the embeddings and concatenate all three representations
user_vectors = Reshape([user_embed])(user_embedding)
movie_vectors = Reshape([movie_embed])(movie_embedding)
both = Concatenate()([user_vectors, movie_vectors, tfidf_vectors])
# Dense head with dropout that maps the combined vector to one predicted rating
dense = Dense(512, activation='relu')(both)
dense = Dropout(0.2)(dense)
output = Dense(1)(dense)

In [None]:
# Assemble the three-input hybrid model and train it on mean squared error
model = Model(inputs=[user_id_input, movie_id_input, tfidf_input], outputs=output)
model.compile(optimizer='adam', loss='mse')


# Two epochs, holding out 10% of the training rows for validation
model.fit(
    [df_hybrid_train['User'], df_hybrid_train['Movie'], train_tfidf],
    df_hybrid_train['Rating'],
    epochs=2,
    batch_size=1024,
    shuffle=True,
    validation_split=0.1,
)

# Score the held-out test rows
y_true = df_hybrid_test['Rating'].values
y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf])

rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

### Surprise Library

The surprise library was built for creating and analyzing recommender systems.
It has to be mentioned that most of the built-in algorithms use some variant of the above approaches. I am going to compare these algorithms to each other in this section using 5-fold cross-validation. Since the algorithms and the dataset have a large memory footprint, the comparison will be executed on a subsampled dataset, so the scores are not comparable to those of the above models.

In [24]:
# Build the Surprise dataset used by the cross-validation cells below from a
# 50,000-rating subsample of the filtered ratings. The fixed random_state makes
# the subsample, and therefore the cross-validation scores, reproducible.
movies = sp.Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']].sample(50000, random_state=42), sp.Reader())
movies

<surprise.dataset.DatasetAutoFolds at 0x7feb843d7350>

First, let's test out the famous [SVD algorithm](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD) by Simon Funk, used in the Netflix Prize Competition.

In [None]:
cross_validate(algo=sp.SVD(), data=movies, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
from surprise import accuracy
from surprise import Dataset
from surprise.model_selection import train_test_split, KFold
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` best-scored items.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_rating, estimate, details)``.
        n: maximum number of items kept per user.

    Returns:
        defaultdict mapping each user id to a list of ``(iid, estimate)`` pairs,
        ordered by descending estimate. Ties keep their original order.
    """
    top_n = defaultdict(list)

    # Collect every (item, estimate) pair under the user it was predicted for
    for uid, iid, _true_r, est, _details in predictions:
        top_n[uid].append((iid, est))

    # Rank each user's pairs by estimate and truncate to the n highest
    for uid in top_n:
        ranked = sorted(top_n[uid], key=lambda pair: pair[1], reverse=True)
        top_n[uid] = ranked[:n]

    return top_n

# Fit SVD on an 80/20 split of the built-in MovieLens-100k dataset
movies = Dataset.load_builtin('ml-100k')
train_data, test_data = train_test_split(movies, test_size=0.2)
model = sp.SVD()
model.fit(train_data)

# Score the held-out ratings and report their RMSE
predictions = model.test(test_data)
accuracy.rmse(predictions)

# Predict the rating that one specific user would give one specific item
model.predict(uid='196', iid='302')

# Print the ids of the 10 highest-scored test items of every user
top_n = get_top_n(predictions, n=10)
for uid, user_ratings in top_n.items():
    print(uid, [pair[0] for pair in user_ratings])


### Precision@K and Recall@K

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    precisions, recalls = {}, {} # Map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    for uid, user_ratings in user_est_true.items():
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Count number of relevant items and recommended items in top k
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

# 5-fold evaluation: refit the model on each training fold and report the
# user-averaged precision@5 and recall@5 (relevance threshold 4) on the test fold
for trainset, testset in KFold(n_splits=5).split(movies):
    model.fit(trainset)
    predictions = model.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Average the per-user metrics over all users of this fold
    print(sum(precisions.values()) / len(precisions))
    print(sum(recalls.values()) / len(recalls))

Next, lets try a [K Nearest Neighbours](https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline) basic collaborative filtering algorithm taking into account a baseline rating.

In [None]:
# Run 5 fold validation on KNN approach using baseline
cross_validate(algo=sp.KNNBaseline(), data=movies, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
import io 

def read_item_names():
    """Build lookup tables between raw item ids and movie names.

    Reads the ``u.item`` file of the MovieLens-100k dataset from the Surprise
    dataset directory. Each line is split on ``|``: the first field is used as
    the raw id and the second as the movie name.

    Returns:
        Tuple ``(rid_to_name, name_to_rid)`` of dicts mapping raw id to name
        and name to raw id.
    """
    file_name = sp.get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name, name_to_rid = {}, {}

    with open(file_name, 'r', encoding='ISO-8859-1') as item_file:
        for record in item_file:
            fields = record.split('|')
            raw_id, title = fields[0], fields[1]
            rid_to_name[raw_id] = title
            name_to_rid[title] = raw_id

    return rid_to_name, name_to_rid

# Train the KNN algorithm on the full MovieLens-100k data to compute item-item
# similarities ('user_based': False) with the pearson_baseline measure
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = sp.KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Look up Toy Story: movie name -> raw id -> inner id used by the trained model
rid_to_name, name_to_rid = read_item_names()
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Inner ids of the 10 nearest neighbours, lazily converted: inner id -> raw id -> name
toy_story_neighbors = (
    rid_to_name[algo.trainset.to_raw_iid(inner_id)]
    for inner_id in algo.get_neighbors(toy_story_inner_id, k=10)
)

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)