### Handling the cold-start problem by using content-based filtering

Collaborative filtering relies solely on user-item interactions within the utility matrix. The issue with this approach is that brand-new users or items with no interactions are excluded from the recommendation system. This is called the **cold-start problem**. Content-based filtering handles this problem by generating recommendations based on user and item features.


### EDA

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Location of the movies catalogue on disk.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run elsewhere until it is pointed at a local copy of movies.csv.
MOVIES_CSV = r"C:\Users\nguye\Desktop\Movies Recommendation\data\movies.csv"

# Read the catalogue into a DataFrame and preview the first five rows.
data = pd.read_csv(MOVIES_CSV)
data.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
# Dimensions of the frame as (rows, columns).
data.shape

(9742, 3)

In [11]:
# Summary of column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [13]:
data.isna().sum() # per-column count of missing values (all zero in the recorded output)

movieId    0
title      0
genres     0
dtype: int64

In [17]:
data.duplicated().sum() # number of fully duplicated rows (zero in the recorded output)

0

In [35]:
# Preview the first five rows again (head() defaults to five rows).
data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [73]:
# Replace the '|' separator with ',' and remove '-' so that hyphenated genre
# names collapse into a single token (they appear as 'scifi' and 'filmnoir' in
# the TF-IDF vocabulary).
# regex=False makes both replacements literal: '|' is a regex metacharacter and
# the default value of `regex` differs between pandas versions, so relying on
# the default is version-dependent and can emit a warning.
data['genres_n'] = (
    data['genres']
    .str.replace('|', ',', regex=False)
    .str.replace('-', '', regex=False)
)
data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,movieId,title,genres,genres_n
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"Adventure,Animation,Children,Comedy,Fantasy"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"Adventure,Children,Fantasy"
2,3,Grumpier Old Men (1995),Comedy|Romance,"Comedy,Romance"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"Comedy,Drama,Romance"
4,5,Father of the Bride Part II (1995),Comedy,Comedy


## Build model


In [87]:
# Compute TF-IDF (term frequency - inverse document frequency) features from
# the cleaned genre strings, using single words (unigrams) as terms.
genre_docs = data['genres_n']
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
tfidf_matrix = vectorizer.fit_transform(genre_docs)

# Inspect the learned vocabulary (term -> column index), its size, and the
# shape of the resulting matrix.
vocab_size = len(vectorizer.vocabulary_)
print(f'vocabulary word bags: {vectorizer.vocabulary_}')
print(vocab_size)
print(f'the matrix shape is: {tfidf_matrix.shape}')


vocabulary word bags: {'adventure': 1, 'animation': 2, 'children': 3, 'comedy': 4, 'fantasy': 8, 'romance': 17, 'drama': 7, 'action': 0, 'crime': 5, 'thriller': 19, 'horror': 11, 'mystery': 15, 'scifi': 18, 'war': 20, 'musical': 14, 'documentary': 6, 'imax': 12, 'western': 21, 'filmnoir': 9, 'no': 16, 'genres': 10, 'listed': 13}
22
the matrix shape is: (9742, 22)


In [90]:
# Densify the sparse TF-IDF matrix. toarray() returns a plain numpy.ndarray,
# whereas todense() returns the legacy numpy.matrix type.
dense_matrix = tfidf_matrix.toarray()

# Wrap the dense values in a DataFrame: one row per movie title, one column
# per vocabulary term.
df_tfidf = pd.DataFrame(
    data=dense_matrix,
    columns=vectorizer.get_feature_names_out(),
    index=data['title'],
)
df_tfidf.head(2)  # preview the first two rows


Unnamed: 0_level_0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,...,imax,listed,musical,mystery,no,romance,scifi,thriller,war,western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji (1995),0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:

# Pairwise cosine similarity between every row of `tfidf_matrix` and every
# other row (the matrix is compared against itself).
cosine_sim = cosine_similarity(tfidf_matrix)
print(f'cosine similariy shape {cosine_sim.shape}')

# Label both axes with the movie titles so similarities can be looked up by name.
titles = data['title']
df_cosine_sim = pd.DataFrame(cosine_sim, index=titles, columns=titles)
df_cosine_sim.head()

cosine similariy shape (9742, 9742)


title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.0,0.813578,0.152769,0.135135,0.267586,0.0,0.152769,0.654698,0.0,0.262413,...,0.411168,0.465621,0.196578,0.516225,0.0,0.680258,0.755891,0.0,0.421037,0.267586
Jumanji (1995),0.813578,1.0,0.0,0.0,0.0,0.0,0.0,0.804715,0.0,0.322542,...,0.0,0.0,0.0,0.0,0.0,0.341376,0.379331,0.0,0.0,0.0
Grumpier Old Men (1995),0.152769,0.0,1.0,0.884571,0.570915,0.0,1.0,0.0,0.0,0.0,...,0.18579,0.0,0.419413,0.0,0.0,0.181883,0.202105,0.0,0.0,0.570915
Waiting to Exhale (1995),0.135135,0.0,0.884571,1.0,0.505015,0.0,0.884571,0.0,0.0,0.0,...,0.164344,0.201391,0.68744,0.0,0.0,0.160888,0.178776,0.466405,0.0,0.505015
Father of the Bride Part II (1995),0.267586,0.0,0.570915,0.505015,1.0,0.0,0.570915,0.0,0.0,0.0,...,0.325424,0.0,0.734632,0.0,0.0,0.318581,0.354002,0.0,0.0,1.0


# Implement 

In [123]:
# Title the user searches for, e.g. 'Toy Story (1995)'.
user_input = 'Toy Story (1995)'

# Number of recommendations to return.
top_k = 10

# Similarity of the searched movie to every movie in the catalogue.
# NOTE(review): assumes the title exists and is unique in the index - confirm.
similarities = df_cosine_sim.loc[user_input]

# Rank from most to least similar and remove the searched movie itself.
ranked = similarities.sort_values(ascending=False).drop(user_input)

# Keep the top_k entries and return only their titles (the first column after
# the index is reset).
relevant_data = ranked[:top_k].reset_index().iloc[:, 0]
print(relevant_data)

0    Asterix and the Vikings (Astérix et les Viking...
1                                          Antz (1998)
2                     Emperor's New Groove, The (2000)
3                                   Toy Story 2 (1999)
4                             The Good Dinosaur (2015)
5                               Shrek the Third (2007)
6                                Monsters, Inc. (2001)
7                                         Moana (2016)
8       Adventures of Rocky and Bullwinkle, The (2000)
9                       Tale of Despereaux, The (2008)
Name: title, dtype: object
