In [1]:
pip install scikit-surprise

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [353]:
# Install packages here
# Packages for data processing
import numpy as np
import pandas as pd
import datetime
from sklearn import preprocessing
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from scipy.sparse import csr_matrix
import scipy as sp


# Packages for visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Packages for modeling
from surprise import Reader
from surprise import Dataset
from surprise import KNNWithMeans
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
import heapq

# Packages for model evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from time import time

# Package to suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Packages for saving models
import pickle

In [354]:
# Load the necessary datasets
movies_df = pd.read_csv('movies.csv')
imdb_df = pd.read_csv('imdb_data.csv')
links_df = pd.read_csv('links.csv')
tags = pd.read_csv('tags.csv')
genome_scores = pd.read_csv('genome_scores.csv')
genome_tags = pd.read_csv('genome_tags.csv')

In [356]:
# Preview datasets
print('The Shape of the movies data is: ', movies_df.shape)
print(movies_df.head(3))

print("********************************************************")
print('The Shape of the  IMDB data is: ', imdb_df.shape)
print(imdb_df.head(3))

print("********************************************************")
print('The Shape of the links data is: ', links_df.shape)
print(links_df.head(3))

print("********************************************************")
print('The Shape of the tags data is: ', tags.shape)
print(tags.head(3))

print("********************************************************")
print('The Shape of the genome data is: ', genome_scores.shape)
print(genome_scores.head(3))

print("********************************************************")
print('The Shape of the genome_tags data is: ', genome_tags.shape)
print(genome_tags.head(3))

The Shape of the movies data is:  (62423, 3)
   movieId                    title  \
0        1         Toy Story (1995)   
1        2           Jumanji (1995)   
2        3  Grumpier Old Men (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
********************************************************
The Shape of the  IMDB data is:  (27278, 6)
   movieId                                         title_cast  \
0        1  Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...   
1        2  Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...   
2        3  Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...   

              director  runtime       budget                     plot_keywords  
0        John Lasseter     81.0  $30,000,000  toy|rivalry|cowboy|cgi animation  
1   Jonathan Hensleigh    104.0  $65,000,000  board game|adventurer|fight|game

In [357]:
# Count the number of unique items in each dataset
print("Movies: ")
print(movies_df.nunique())
print("************")
print("Links: ")
print(links_df.nunique())
print("************")
print("IMDB: ")
print(imdb_df.nunique())
print("************")
print("Tags: ")
print(tags.nunique())
print("************")
print("Genome scores: ")
print(genome_scores.nunique())
print("************")
print("Genome tags: ")
print(genome_tags.nunique())

Movies: 
movieId    62423
title      62325
genres      1639
dtype: int64
************
Links: 
movieId    62423
imdbId     62423
tmdbId     62281
dtype: int64
************
IMDB: 
movieId          27278
title_cast       17143
director         11786
runtime            274
budget            1362
plot_keywords    16008
dtype: int64
************
Tags: 
userId        14592
movieId       45251
tag           73050
timestamp    907730
dtype: int64
************
Genome scores: 
movieId      13816
tagId         1128
relevance     4000
dtype: int64
************
Genome tags: 
tagId    1128
tag      1128
dtype: int64


In [358]:
# checking for null values
print("Movies: ")
print(str(movies_df.isnull().sum()))
print("************")
print("Links: ")
print(str(links_df.isnull().sum()))
print("************")
print("IMDB: ")
print(str(imdb_df.isnull().sum()))
print("************")
print("Tags: ")
print(str(tags.isnull().sum()))

Movies: 
movieId    0
title      0
genres     0
dtype: int64
************
Links: 
movieId      0
imdbId       0
tmdbId     107
dtype: int64
************
IMDB: 
movieId              0
title_cast       10068
director          9874
runtime          12089
budget           19372
plot_keywords    11078
dtype: int64
************
Tags: 
userId        0
movieId       0
tag          16
timestamp     0
dtype: int64


In [359]:
# Merge Four different datasets with important data
merged_df = tags.merge(movies_df, on=['movieId', 'movieId'], how='inner')
merged_df = merged_df.merge(links_df, on=['movieId', 'movieId'], how='inner')
merged_df = merged_df.merge(imdb_df, on=['movieId', 'movieId'], how='inner')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903067 entries, 0 to 903066
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   userId         903067 non-null  int64  
 1   movieId        903067 non-null  int64  
 2   tag            903051 non-null  object 
 3   timestamp      903067 non-null  int64  
 4   title          903067 non-null  object 
 5   genres         903067 non-null  object 
 6   imdbId         903067 non-null  int64  
 7   tmdbId         902800 non-null  float64
 8   title_cast     676067 non-null  object 
 9   director       676944 non-null  object 
 10  runtime        662744 non-null  float64
 11  budget         584539 non-null  object 
 12  plot_keywords  673753 non-null  object 
dtypes: float64(2), int64(4), object(7)
memory usage: 89.6+ MB


In [360]:
merged_df.isnull().sum()

userId                0
movieId               0
tag                  16
timestamp             0
title                 0
genres                0
imdbId                0
tmdbId              267
title_cast       227000
director         226123
runtime          240323
budget           318528
plot_keywords    229314
dtype: int64

In [361]:
# Sort the merged dataframe by 'userId'
merged_df.sort_values(by='userId', inplace=True)
merged_df.head()

Unnamed: 0,userId,movieId,tag,timestamp,title,genres,imdbId,tmdbId,title_cast,director,runtime,budget,plot_keywords
0,3,260,classic,1439472355,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0,,,,,
1,3,260,sci-fi,1439472256,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0,,,,,
8044,4,115569,tense,1573943077,Nightcrawler (2014),Crime|Drama|Thriller,2872718,242582.0,Jake Gyllenhaal|Michael Papajohn|Marco Rodrígu...,Dan Gilroy,117.0,"$8,500,000",sociopath|tv news|ethics|journalism ethics
7596,4,44665,unreliable narrators,1573943619,Lucky Number Slevin (2006),Crime|Drama|Mystery,425210,186.0,Josh Hartnett|Bruce Willis|Lucy Liu|Morgan Fre...,Jason Smilovic,110.0,"$27,000,000",original story|mistaken identity|assassin|jewi...
7483,4,7569,so bad it's good,1573943455,You Only Live Twice (1967),Action|Adventure|Sci-Fi|Thriller,62512,667.0,,,,,


In [362]:
# Extract the columns in 'merged_df' and drop unwanted columns
removed_cols = [ 'timestamp', 'imdbId', 'tmdbId', 'runtime', 'budget', 'plot_keywords']
# Drop colulmns
merged_df.drop(columns=removed_cols, axis=1, inplace=True)
merged_df.head()

Unnamed: 0,userId,movieId,tag,title,genres,title_cast,director
0,3,260,classic,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,,
1,3,260,sci-fi,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,,
8044,4,115569,tense,Nightcrawler (2014),Crime|Drama|Thriller,Jake Gyllenhaal|Michael Papajohn|Marco Rodrígu...,Dan Gilroy
7596,4,44665,unreliable narrators,Lucky Number Slevin (2006),Crime|Drama|Mystery,Josh Hartnett|Bruce Willis|Lucy Liu|Morgan Fre...,Jason Smilovic
7483,4,7569,so bad it's good,You Only Live Twice (1967),Action|Adventure|Sci-Fi|Thriller,,


In [363]:
# Fill NaN values with empty cells
merged_df.fillna('', inplace=True)

In [364]:
# Clean text data by removing '|' and extra spaces to ensure consistency in text data
merged_df['genres'] = merged_df['genres'].apply(lambda x: re.sub(r'\|', ' ', x))
merged_df['title_cast'] = merged_df['title_cast'].apply(lambda x: re.sub(r'\|', ' ', x))

In [365]:
merged_df.head()

Unnamed: 0,userId,movieId,tag,title,genres,title_cast,director
0,3,260,classic,Star Wars: Episode IV - A New Hope (1977),Action Adventure Sci-Fi,,
1,3,260,sci-fi,Star Wars: Episode IV - A New Hope (1977),Action Adventure Sci-Fi,,
8044,4,115569,tense,Nightcrawler (2014),Crime Drama Thriller,Jake Gyllenhaal Michael Papajohn Marco Rodrígu...,Dan Gilroy
7596,4,44665,unreliable narrators,Lucky Number Slevin (2006),Crime Drama Mystery,Josh Hartnett Bruce Willis Lucy Liu Morgan Fre...,Jason Smilovic
7483,4,7569,so bad it's good,You Only Live Twice (1967),Action Adventure Sci-Fi Thriller,,


In [366]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 903067 entries, 0 to 369525
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   userId      903067 non-null  int64 
 1   movieId     903067 non-null  int64 
 2   tag         903067 non-null  object
 3   title       903067 non-null  object
 4   genres      903067 non-null  object
 5   title_cast  903067 non-null  object
 6   director    903067 non-null  object
dtypes: int64(2), object(5)
memory usage: 55.1+ MB


In [367]:
# combine the cleaned text data into a single 'combined' feature
merged_df['combined'] = merged_df['tag'] + ' ' + merged_df['genres'] + ' ' + merged_df['title_cast'] + ' ' + merged_df['director'] 

In [368]:
# Remove each single column that is combined in the 'combined' feature
merged_df.drop(columns=['tag', 'genres', 'title_cast', 'director'], axis=1, inplace=True)

In [369]:
# Sample 10,000 rows from the DataFrame
cleaned_sampled_data = merged_df.sample(n=10000, random_state=42).reset_index(drop=True)
cleaned_sampled_data.head(15)

Unnamed: 0,userId,movieId,title,combined
0,141263,5618,Spirited Away (Sen to Chihiro no kamikakushi) ...,spirits Adventure Animation Fantasy Rumi Hiira...
1,30309,1196,Star Wars: Episode V - The Empire Strikes Back...,fantasy Action Adventure Sci-Fi
2,36535,48322,Jackass Number Two (2006),Ryan Dunn Comedy Documentary Johnny Knoxville ...
3,6550,68444,"Great Buck Howard, The (2008)",career Comedy John Malkovich Colin Hanks Emily...
4,62199,8485,Samsara (2001),Nalin Pan Adventure Drama Romance Shawn Ku Chr...
5,104708,4851,Things Behind the Sun (2001),independent film Drama Aria Alpert Adjani Rube...
6,107409,34405,Serenity (2005),contrived Action Adventure Sci-Fi Nathan Filli...
7,12493,4389,Lost and Delirious (2001),poor acting Drama Piper Perabo Jessica Paré Mi...
8,89977,69784,Brüno (Bruno) (2009),social commentary Comedy Sacha Baron Cohen Gus...
9,81363,2707,Arlington Road (1999),spionage Thriller Jeff Bridges Tim Robbins Joa...


In [370]:
# initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')

# Fit and Transform the combined text data into numerical vectors
matrix = vectorizer.fit_transform(cleaned_sampled_data['combined'])


In [371]:
matrix.shape

(10000, 109491)

In [372]:
# compute the similarity between each vector within our matrix
similarities = cosine_similarity(matrix, matrix)

In [373]:
similarities[:5]

array([[1.00000000e+00, 2.74106986e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.74106986e-02, 1.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 8.03952464e-03, 4.10097139e-02],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 6.90289374e-03, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 4.45696720e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.69735658e-03, 8.69789078e-03, 0.00000000e+00, ...,
        7.51292433e-04, 0.00000000e+00, 6.33858965e-04]])

In [374]:
indices = pd.Series(cleaned_sampled_data.index, index=cleaned_sampled_data['title'])
movie_idx = indices['Jackass Number Two (2006)']
movie_idx

2

In [376]:
# Convert the matrix into a list of tuples (index, similarity score) for a given movie index
def flatten_similarity_matrix(similarity_matrix, movie_idx):
    sim_scores = list(enumerate(similarity_matrix[movie_idx]))
    return sim_scores

In [378]:
# Ensure similarity scores are scalars
def filter_scalar_similarity_scores(sim_scores):
    scalar_sim_scores = [(idx, score) for idx, score in sim_scores if np.isscalar(score)]
    return scalar_sim_scores

In [381]:
def content_based_recommender(movie_title, N=10):
    # Ensure movie title is in the DataFrame
    if movie_title not in cleaned_sampled_data['title'].values:
        return f"Movie '{movie_title}' not found in the dataset."

    titles = cleaned_sampled_data['title']
    indices = pd.Series(cleaned_sampled_data.index, index=cleaned_sampled_data['title']).drop_duplicates()

    # Convert the string movie title to a numeric index for similarity matrix
    movie_idx = indices[movie_title]

    # Flatten the similarity matrix for the given movie index
    sim_scores = flatten_similarity_matrix(similarities, movie_idx)

    # Ensure similarity scores are scalars
    sim_scores = filter_scalar_similarity_scores(sim_scores)

    # Sort values, keeping a copy of the original index of each value
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Select the top-N values for recommendation
    sim_scores = sim_scores[1:N+1]

    # Collect indexes
    movie_indices = [i[0] for i in sim_scores]

    # Convert the indexes back into titles
    return titles.iloc[movie_indices]

In [409]:
content_based_recommender("Mon Oncle (My Uncle) (1958)", N=10)

Series([], Name: title, dtype: object)