# Netflix Recommender System

## Recommendation Systems Overview

One common architecture for recommendation systems consists of the following components:


*   Candidate generation
*   Scoring
*   Re-ranking




### Import Libraries

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162664 sha256=1d6eef794b934351ba7a0a34edce884762cdfb7d1b02ed39f69ccf9c7bd70835
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [32]:
# To store the data
import pandas as pd

# To do linear algebra
import numpy as np

# To create plots
import matplotlib.pyplot as plt

# To create interactive plots
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# To shift lists
from collections import deque

# To compute similarities between vectors
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# To use recommender systems
import surprise as sp
from surprise.model_selection import cross_validate

# To create deep learning models
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

# To create sparse matrices
#from scipy.sparse import coo_matrix
from surprise import Reader, Dataset, SVD

# To stack sparse matrices
from scipy.sparse import vstack

# To implement SVD
from scipy.sparse.linalg import svds

## Load and Process Movie Data

In [5]:
# Load data for all movies
movie_titles = pd.read_csv('drive/MyDrive/movie_titles.csv', encoding = "ISO-8859-1",
                           header = None,
                           names = ['Id', 'Year', 'Name'], usecols=lambda x: x != 0).set_index('Id')
#df_title = pd.read_csv('movie_titles.csv', encoding = "ISO-8859-1", header = None, names = ['Movie_Id', 'Year', 'Name'])

print('Shape Movie-Titles:\t{}'.format(movie_titles.shape))
movie_titles.sample(5)

Shape Movie-Titles:	(17770, 2)


Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
7055,1995.0,Get Shorty
5553,2000.0,Best of Travels in Europe with Rick Steves: Ge...
6885,1987.0,Seasons: IMAX
2987,2004.0,Brothers in Arms
7462,2001.0,Dancing at the Blue Iguana


### Load User-Data And Preprocess Data-Structure

In [6]:
# Load single data-file
df_raw = pd.read_csv('drive/MyDrive/combined_data_1.txt', header=None, names=['User', 'Rating', 'Date'], usecols=[0, 1, 2])


# Find empty rows to slice dataframe for each movie
tmp_movies = df_raw[df_raw['Rating'].isna()]['User'].reset_index()
movie_indices = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
shifted_movie_indices = deque(movie_indices)
shifted_movie_indices.rotate(-1)


# Gather all dataframes
user_data = []

# Iterate over all movies
for [df_id_1, movie_id], [df_id_2, next_movie_id] in zip(movie_indices, shifted_movie_indices):

    # Check if it is the last movie in the file
    if df_id_1<df_id_2:
        tmp_df = df_raw.loc[df_id_1+1:df_id_2-1].copy()
    else:
        tmp_df = df_raw.loc[df_id_1+1:].copy()

    # Create movie_id column
    tmp_df['Movie'] = movie_id

    # Append dataframe to list
    user_data.append(tmp_df)

# Combine all dataframes
df = pd.concat(user_data)
#del user_data, df_raw, tmp_movies, tmp_df, shifted_movie_indices, movie_indices, df_id_1, movie_id, df_id_2, next_movie_id
print('Shape User-Ratings:\t{}'.format(df.shape))
df.sample(5)



Shape User-Ratings:	(24053764, 4)


Unnamed: 0,User,Rating,Date,Movie
11994910,467918,4.0,2005-02-24,2339
5997989,1200414,3.0,2002-11-11,1180
12258224,1851757,3.0,2005-02-03,2372
11100002,1035893,3.0,2005-07-25,2152
23352548,1247877,5.0,2005-11-17,4369


### Filter Sparse Movies And Users

In [7]:
# Filter sparse movies
min_movie_ratings = 10000
filter_movies = (df['Movie'].value_counts()>min_movie_ratings)
filter_movies = filter_movies[filter_movies].index.tolist()

# Filter sparse users
min_user_ratings = 200
filter_users = (df['User'].value_counts()>min_user_ratings)
filter_users = filter_users[filter_users].index.tolist()

# Actual filtering
df_filterd = df[(df['Movie'].isin(filter_movies)) & (df['User'].isin(filter_users))]
del filter_users, min_movie_ratings, min_user_ratings
print('Shape User-Ratings unfiltered:\t{}'.format(df.shape))
print('Shape User-Ratings filtered:\t{}'.format(df_filterd.shape))

Shape User-Ratings unfiltered:	(24053764, 4)
Shape User-Ratings filtered:	(4178032, 4)


### Create Train- And Testset

In [8]:
# Shuffle DataFrame
df_filterd = df_filterd.drop('Date', axis=1).sample(frac=1).reset_index(drop=True)

# Testingsize
n = 100000

# Split train- & testset
df_train = df_filterd[:-n]
df_test = df_filterd[-n:]

### Transform The User-Ratings To User-Movie-Matrix

In [9]:
# Create a user-movie matrix with empty values
df_p = df_train.pivot_table(index='User', columns='Movie', values='Rating')
print('Shape User-Movie-Matrix:\t{}'.format(df_p.shape))
df_p.sample(3)

Shape User-Movie-Matrix:	(20828, 491)


Movie,8,18,28,30,58,77,83,97,108,111,...,4392,4393,4402,4418,4420,4432,4472,4479,4488,4490
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2247402,,,,,,,,,,,...,3.0,,4.0,,,4.0,3.0,,,
1876896,,4.0,,4.0,,,,,,,...,,,,,,,,,4.0,
60529,,,,4.0,,,,,,,...,,1.0,,,,3.0,5.0,,,


In [10]:
print(df_p.columns)
print(df.columns)
print(df_train.columns)

Int64Index([   8,   18,   28,   30,   58,   77,   83,   97,  108,  111,
            ...
            4392, 4393, 4402, 4418, 4420, 4432, 4472, 4479, 4488, 4490],
           dtype='int64', name='Movie', length=491)
Index(['User', 'Rating', 'Date', 'Movie'], dtype='object')
Index(['User', 'Rating', 'Movie'], dtype='object')


## Collaborative Filtering using SVD

Collaborative filtering uses similarities between users and items simultaneously to provide recommendations.

### SVD using surprise library

In [11]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']], reader)
#data = Dataset.load_from_df(df[['User', 'Movie', 'Rating']], reader)

# Build a Surprise trainset
trainset = data.build_full_trainset()
svd = SVD()

svd.fit(trainset)

# Perform cross-validation
results = cross_validate(svd, data, measures=['RMSE', 'MSE'], cv=5, verbose=True)

# Print the results
print(f'RMSE: {results["test_rmse"].mean()}')
print(f'MSE: {results["test_mse"].mean()}')


Evaluating RMSE, MSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8168  0.8178  0.8186  0.8172  0.8178  0.8176  0.0006  
MSE (testset)     0.6672  0.6688  0.6700  0.6679  0.6688  0.6685  0.0010  
Fit time          112.75  120.04  116.86  116.98  118.77  117.08  2.47    
Test time         25.25   19.56   19.64   20.62   19.13   20.84   2.26    
RMSE: 0.8176472561499903
MSE: 0.6685473804520343


In [12]:
user_785314 = movie_titles.copy()
user_785314 = user_785314.reset_index()
user_785314 = user_785314[~user_785314['Id'].isin(filter_movies)]

user_785314['Estimate_Score'] = user_785314['Id'].apply(lambda x: svd.predict(785314, x).est)
user_785314 = user_785314.drop('Id', axis = 1)

user_785314 = user_785314.sort_values('Estimate_Score', ascending=False)
print(user_785314.head(10))

         Year                                Name  Estimate_Score
0      2003.0                     Dinosaur Planet        3.522171
12016  1979.0                        Quadrophenia        3.522171
12002  1998.0       The X-Files: Fight the Future        3.522171
12003  1989.0                A Fool and His Money        3.522171
12004  1984.0                              Iceman        3.522171
12005  2002.0                       Just One Look        3.522171
12006  1968.0                       The Detective        3.522171
12007  1963.0          The Twilight Zone: Vol. 31        3.522171
12008  1994.0  Space Ghost Coast to Coast: Vol. 1        3.522171
12009  2000.0           Strangers in Good Company        3.522171


### SVD using sparse matrix

In [13]:
from scipy.sparse import csr_matrix

# Function to create a sparse matrix from the DataFrame

def create_sparse_matrix(df):
    users = df['User'].values.astype(int)
    #ratings = df['Rating'].values.astype(float)  # Adjust the data type based on your actual data
    movies = df['Movie'].astype('category').cat.codes.values.astype(int)
    return csr_matrix((ratings, (users, movies)), shape=(users.max() + 1, movies.max() + 1))

#Create sparse matrix
df_p_filled = df_p.fillna(df_p.mean())
sparse_matrix = df_p_filled.values

# Perform matrix factorization using SVD
U, sigma, Vt = svds(sparse_matrix, k=3)

# The Sigma returned is just the values, not a diagonal matrix.
# This can be easily transformed to the diagonal matrix form.
sigma = np.diag(sigma)

# Preview the outputs of SVD
U, sigma, Vt

(array([[-0.00662873, -0.00019821, -0.00652875],
        [-0.00362942, -0.01024045, -0.00652148],
        [ 0.0008258 , -0.00958674, -0.00672707],
        ...,
        [ 0.00598446, -0.00790195, -0.00676789],
        [-0.00341907,  0.00291423, -0.00662865],
        [-0.00083056,  0.00538982, -0.00709865]]),
 array([[  262.69569217,     0.        ,     0.        ],
        [    0.        ,   429.39550473,     0.        ],
        [    0.        ,     0.        , 11240.93253802]]),
 array([[-0.02301866, -0.02036372, -0.0102714 , ..., -0.04114693,
          0.00115971, -0.02248241],
        [-0.02491653, -0.03275093,  0.01802049, ..., -0.04902471,
         -0.06131122, -0.01742754],
        [-0.0389627 , -0.04725104, -0.04825374, ..., -0.0463253 ,
         -0.04475327, -0.04022168]]))

In [14]:
# Predicted ratings
predicted_ratings = np.dot(np.dot(U, sigma), Vt)

num_rows, num_cols = predicted_ratings.shape

# Convert the predicted ratings to a DataFrame for better visualization
predicted_ratings_df = pd.DataFrame(predicted_ratings,
                                    index=[f"User{i+1}" for i in range(num_rows)],
                                    columns=[f"Item {i+1}" for i in range(num_cols)])

predicted_ratings_df.head()

Unnamed: 0,Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,Item 10,...,Item 482,Item 483,Item 484,Item 485,Item 486,Item 487,Item 488,Item 489,Item 490,Item 491
User1,2.901649,3.505968,3.55766,3.554467,3.342319,2.637154,3.502419,3.53873,3.053112,2.868995,...,3.395119,3.038335,3.02409,3.066706,2.993112,3.403944,3.496127,3.475604,3.287609,2.992473
User2,2.987769,3.627285,3.467916,3.337538,3.17688,2.398723,3.583038,3.659004,3.131942,2.758071,...,3.628497,2.815672,3.039128,3.023565,2.974498,3.060178,3.602632,3.650796,3.549243,3.04662
User3,3.043879,3.703458,3.572469,3.383669,3.274779,2.510815,3.631008,3.711612,3.211415,2.843021,...,3.819525,3.01951,3.272215,3.140281,3.029591,3.227635,3.695847,3.695937,3.636817,3.108369
User4,2.849511,3.450762,3.393685,3.332884,3.148098,2.427272,3.429723,3.48414,2.990553,2.7176,...,3.388416,2.815512,2.916834,2.939896,2.88634,3.11312,3.432335,3.450858,3.307648,2.92117
User5,3.031795,3.683221,3.590301,3.44222,3.30871,2.552021,3.626052,3.697434,3.196312,2.865376,...,3.747537,3.038205,3.233735,3.142098,3.040439,3.282982,3.674603,3.67154,3.582821,3.102289


In [15]:
# Assuming df_train_true and df_train_predict are DataFrames with NaN values
# Extract only the non-NaN values
mask = ~np.isnan(df_p.values)
true_values = df_p.values[mask]
predicted_values = predicted_ratings_df.values[mask]

# Calculate RMSE and MSE based only on non-NaN values
rmse_train = np.sqrt(mean_squared_error(true_values, predicted_values))
mse_train = mean_squared_error(true_values, predicted_values)

print("RMSE on non-NaN values in df_train_true and df_train_predict:", rmse_train)
print("MSE on non-NaN values in df_train_true and df_train_predict:", mse_train)

RMSE on non-NaN values in df_train_true and df_train_predict: 0.8884694976222688
MSE on non-NaN values in df_train_true and df_train_predict: 0.7893780482051668


In [16]:
def recommend_items_svd(user_id, original_df, reconstructed_df, num_recommendations=5):

    # Predict ratings for unrated items
    predicted_ratings = predicted_ratings_df.loc[user_id]

    # Recommend items with highest predicted ratings
    recommended_items = predicted_ratings.nlargest(num_recommendations).index.tolist()
    return recommended_items

# Generating recommendations for User_1 using SVD
recommendations_svd = recommend_items_svd('User5', df_p, predicted_ratings_df)
print(recommendations_svd)



['Item 376', 'Item 233', 'Item 355', 'Item 263', 'Item 151']


### With fillna(0)

In [17]:
#Create sparse matrix
df_p_filled = df_p.fillna(0)
sparse_matrix = df_p_filled.values

# Perform matrix factorization using SVD
U, sigma, Vt = svds(sparse_matrix, k=3)

# The Sigma returned is just the values, not a diagonal matrix.
# This can be easily transformed to the diagonal matrix form.
sigma = np.diag(sigma)

# Preview the outputs of SVD
U, sigma, Vt

(array([[-0.00915964,  0.00259541, -0.00487339],
        [-0.00352597, -0.0032241 , -0.0072799 ],
        [ 0.00670825, -0.01158471, -0.0069451 ],
        ...,
        [ 0.00568394, -0.00830516, -0.00492567],
        [ 0.0004839 ,  0.00298604, -0.00683416],
        [-0.00547746,  0.0050728 , -0.00703676]]),
 array([[1188.16548002,    0.        ,    0.        ],
        [   0.        , 1509.55510801,    0.        ],
        [   0.        ,    0.        , 5206.41975223]]),
 array([[-0.00167131, -0.01502479, -0.01901451, ..., -0.07932044,
         -0.02884778, -0.01422346],
        [-0.01996849, -0.04863266,  0.05276976, ..., -0.07689322,
         -0.0930246 ,  0.00387321],
        [-0.00895157, -0.02131361, -0.04982732, ..., -0.03383656,
         -0.04156007, -0.01088119]]))

In [18]:
# Predicted ratings
predicted_ratings = np.dot(np.dot(U, sigma), Vt)

num_rows, num_cols = predicted_ratings.shape

# Convert the predicted ratings to a DataFrame for better visualization
predicted_ratings_df = pd.DataFrame(predicted_ratings,
                                    index=[f"User {i+1}" for i in range(num_rows)],
                                    columns=[f"Item {i+1}" for i in range(num_cols)])

# Clip the predicted ratings from 1 to 5
predicted_ratings_df = predicted_ratings_df.clip(lower=1, upper=5)

predicted_ratings_df.head()

Unnamed: 0,Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,Item 10,...,Item 482,Item 483,Item 484,Item 485,Item 486,Item 487,Item 488,Item 489,Item 490,Item 491
User 1,1.0,1.0,1.67795,2.844768,1.0,1.0,1.446561,1.0,1.0,1.032292,...,1.0,1.318902,1.0,1.0,1.0,2.217588,2.38537,1.42053,1.003994,1.0
User 2,1.0,1.107472,1.711398,3.081632,1.161311,1.0,1.695029,1.0,1.0,1.0,...,1.217698,1.973531,1.687058,1.0,1.0,3.041916,2.941874,1.989025,2.148822,1.0
User 3,1.0,1.501401,1.0,1.76249,1.0,1.0,1.253921,1.0,1.0,1.0,...,3.134242,1.812258,2.523419,1.04731,1.0,2.58607,2.266626,1.935964,2.899634,1.0
User 4,1.0,1.0,1.696669,2.373788,1.322852,1.0,1.0,1.0,1.0,1.007788,...,1.434046,2.026699,1.889129,1.010291,1.0,2.761241,2.053442,1.007627,1.271931,1.0
User 5,1.0,1.620254,1.0,2.70415,1.0,1.0,2.126044,1.0,1.0,1.0,...,1.45193,1.342308,1.356972,1.0,1.0,2.469851,3.121837,2.800213,3.126697,1.0


In [19]:
def recommend_items_svd(user_id, original_df, reconstructed_df, num_recommendations=5):

    # Predict ratings for unrated items
    predicted_ratings = reconstructed_df.loc[user_id]

    # Recommend items with the highest predicted ratings
    recommended_items = predicted_ratings.nlargest(num_recommendations).index.tolist()
    return recommended_items

recommendations_svd = recommend_items_svd('User 5', df_p, predicted_ratings_df)
print(recommendations_svd)



['Item 67', 'Item 304', 'Item 222', 'Item 469', 'Item 263']


In [20]:
# Assuming df_train_true and df_train_predict are DataFrames with NaN values
# Extract only the non-NaN values
mask = ~np.isnan(df_p.values)
true_values = df_p.values[mask]
predicted_values = predicted_ratings_df.values[mask]

# Calculate RMSE and MSE based only on non-NaN values
rmse_train = np.sqrt(mean_squared_error(true_values, predicted_values))
mse_train = mean_squared_error(true_values, predicted_values)

print("RMSE on non-NaN values in df_train_true and df_train_predict:", rmse_train)
print("MSE on non-NaN values in df_train_true and df_train_predict:", mse_train)

RMSE on non-NaN values in df_train_true and df_train_predict: 1.8311881974525723
MSE on non-NaN values in df_train_true and df_train_predict: 3.353250214489601


## Content-based filtering using Cosine Similarity

It uses similarity between items to recommend items similar to what the user likes.

For this method, the dataset netflix_titles.csv was used. That dataset contains a total of 8807 registrations for netflix series and movies. Some of the most important features are:

1.  Titles
2.  Directors
3.  Cast
4.  Country
5.  Category (listed_in)
6.  Description

In [21]:
import pandas as pd
df_cosine = pd.read_csv("netflix_titles.csv")
from sklearn.feature_extraction.text import TfidfVectorizer

# removing stopwords
tfidf = TfidfVectorizer(stop_words="english")

# Replace NaN with an empty string
df_cosine["description"] = df_cosine["description"].fillna("")

# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df_cosine["description"])

# Output the shape of tfidf_matrix
tfidf_matrix.shape

(8807, 18895)

In [22]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

indices = pd.Series(df_cosine.index, index=df_cosine["title"]).drop_duplicates()

filledna = df_cosine.fillna("")
filledna.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


Now, after transforming all the words to lower-case, we create a vector that contains the title, the cast, the category and the description of every movie.

In [23]:
# make all words lower-case
def clean_data(x):
    return str.lower(x.replace(" ", ""))

In [24]:
features = ["title", "director", "cast", "listed_in", "description"]
filledna = filledna[features]

for feature in features:
    filledna[feature] = filledna[feature].apply(clean_data)

filledna.head(4)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,title,director,cast,listed_in,description
0,dickjohnsonisdead,kirstenjohnson,,documentaries,"asherfathernearstheendofhislife,filmmakerkirst..."
1,blood&water,,"amaqamata,khosingema,gailmabalane,thabangmolab...","internationaltvshows,tvdramas,tvmysteries","aftercrossingpathsataparty,acapetownteensetsou..."
2,ganglands,julienleclercq,"samibouajila,tracygotoas,samueljouy,nabihaakka...","crimetvshows,internationaltvshows,tvaction&adv...","toprotecthisfamilyfromapowerfuldruglord,skille..."
3,jailbirdsneworleans,,,"docuseries,realitytv","feuds,flirtationsandtoilettalkgodownamongthein..."


In [25]:
def create_soup(x):
    return (
        x["title"]
        + " "
        + x["director"]
        + " "
        + x["cast"]
        + " "
        + x["listed_in"]
        + " "
        + x["description"]
    )


filledna["soup"] = filledna.apply(create_soup, axis=1)
filledna["soup"]

0       dickjohnsonisdead kirstenjohnson  documentarie...
1       blood&water  amaqamata,khosingema,gailmabalane...
2       ganglands julienleclercq samibouajila,tracygot...
3       jailbirdsneworleans   docuseries,realitytv feu...
4       kotafactory  mayurmore,jitendrakumar,ranjanraj...
                              ...                        
8802    zodiac davidfincher markruffalo,jakegyllenhaal...
8803    zombiedumb   kids'tv,koreantvshows,tvcomedies ...
8804    zombieland rubenfleischer jesseeisenberg,woody...
8805    zoom peterhewitt timallen,courteneycox,chevych...
8806    zubaan mozezsingh vickykaushal,sarah-janedias,...
Name: soup, Length: 8807, dtype: object

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

count = CountVectorizer(stop_words="english")
count_matrix = count.fit_transform(filledna["soup"])

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
cosine_sim2

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.04583492, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.04583492, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.06933752,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06933752, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [27]:
filledna = filledna.reset_index()
indices = pd.Series(filledna.index, index=filledna["title"])

In [28]:
def get_recommendations_new(title, cosine_sim=cosine_sim):
    title = title.replace(" ", "").lower()
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df_cosine["title"].iloc[movie_indices]

In [29]:
get_recommendations_new("Better Call Saul", cosine_sim2)

5940             Breaking Bad
6841               Get Shorty
4632             The Good Cop
5922               Lilyhammer
1306               Good Girls
1535    How To Ruin Christmas
15          Dear White People
6252             Battle Creek
7856        Republic of Doyle
4079                 Unsolved
Name: title, dtype: object

## Implementation with Neural Network using Keras

The user-movie rating matrix is high dimensional and sparse, therefore I am going to reduce the dimensionality to represent the data in a dense form.

In [33]:
# Create user- & movie-id mapping
user_id_mapping = {id:i for i, id in enumerate(df_filterd['User'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(df_filterd['Movie'].unique())}


# Create correctly mapped train- & testset
train_user_data = df_train['User'].map(user_id_mapping)
train_movie_data = df_train['Movie'].map(movie_id_mapping)

test_user_data = df_test['User'].map(user_id_mapping)
test_movie_data = df_test['Movie'].map(movie_id_mapping)


# Get input variable-sizes
users = len(user_id_mapping)
movies = len(movie_id_mapping)
embedding_size = 10


##### Create model
# Set input layers
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='movie')

# Create embedding layers for users and movies
user_embedding = Embedding(output_dim=embedding_size,
                           input_dim=users,
                           input_length=1,
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=embedding_size,
                            input_dim=movies,
                            input_length=1,
                            name='item_embedding')(movie_id_input)

# Reshape the embedding layers
user_vector = Reshape([embedding_size])(user_embedding)
movie_vector = Reshape([embedding_size])(movie_embedding)


Here we use Keras to build a collaborative filtering model for movie recommendations. We define embedding layers for user and movie IDs, concatenates and process the vectors with dense layers, and then train the model on user-movie rating data. Then we evaluate the performance using root mean squared error (RMSE) on a test dataset.

In [34]:
# Setup variables
user_embedding_size = 20
movie_embedding_size = 10


##### Create model
# Set input layers
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='movie')

# Create embedding layers for users and movies
user_embedding = Embedding(output_dim=user_embedding_size,
                           input_dim=users,
                           input_length=1,
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=movie_embedding_size,
                            input_dim=movies,
                            input_length=1,
                            name='item_embedding')(movie_id_input)

# Reshape the embedding layers
user_vector = Reshape([user_embedding_size])(user_embedding)
movie_vector = Reshape([movie_embedding_size])(movie_embedding)

# Concatenate the reshaped embedding layers
concat = Concatenate()([user_vector, movie_vector])

# Combine with dense layers
dense = Dense(256)(concat)
y = Dense(1)(dense)

# Setup model
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
model.compile(loss='mse', optimizer='adam')


# Fit model
model.fit([train_user_data, train_movie_data],
          df_train['Rating'],
          batch_size=256,
          epochs=1,
          validation_split=0.1,
          shuffle=True)

# Test model
y_pred = model.predict([test_user_data, test_movie_data])
y_true = df_test['Rating'].values

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Deep Learning: {:.4f} RMSE'.format(rmse))



Testing Result With Keras Deep Learning: 0.9069 RMSE
