In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

## Importing the MovieLens dataset

In [2]:
# Folder holding the MovieLens "ml-latest-small" CSV files.
# Set the MOVIELENS_DIR environment variable to read the data from another
# location; when it is unset the original author's local path is used.
DATA_DIR = Path(os.environ.get(
    'MOVIELENS_DIR',
    r'C:\Users\Shaun Tay\Documents\Movie recommendation\ml-latest-small',
))

movies = pd.read_csv(DATA_DIR / 'movies.csv')
ratings = pd.read_csv(DATA_DIR / 'ratings.csv')

In [3]:
# Preview the first five rows of the movies table (movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Column dtypes, non-null counts and memory usage of the movies table.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [5]:
# Preview the first five rows of the ratings table (userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Column dtypes, non-null counts and memory usage of the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


## Using Cosine Similarity Model

1. Checking number of rows and cols
2. Checking for missing data in df

In [7]:
# DataFrame.shape is (n_rows, n_columns), so label the values in that order.
print(f"movies (rows, cols): {movies.shape}\nratings (rows, cols): {ratings.shape}")

movie col, row: (9742, 3)
ratings col, row: (100836, 4)


In [8]:
# Count missing values per column of the movies table.
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [9]:
# Count missing values per column of the ratings table.
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

#### Feature Engineering:

In [10]:
# Display the ratings table (the rendering is truncated to the first and last rows).
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


Using only the ratings from users with `userId` 1–200 to reduce the data size

In [11]:
# Largest userId kept in the subset; lower it to shrink the pivot table further
# or raise it to use more of the data set.
MAX_USER_ID = 200

# Keep only the ratings made by users whose id is at most MAX_USER_ID.
new_ratings = ratings.loc[ratings['userId'] <= MAX_USER_ID]
new_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
29263,200,60074,3.5,1229887390
29264,200,61024,4.0,1229889835
29265,200,61323,4.0,1229887427
29266,200,62299,3.5,1229876803


In [12]:
# Re-display the first rows of movies as a reminder of its columns before merging on movieId.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [13]:
# Attach title and genres to every rating by matching on the shared movieId key
# (inner join: ratings without a matching movie row are discarded).
movie__ratings = pd.merge(new_ratings, movies, on='movieId', how='inner')
movie__ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
29263,200,27685,2.0,1229878742,Bring It On Again (2004),Comedy
29264,200,34321,3.0,1229877839,Bad News Bears (2005),Children|Comedy
29265,200,45106,3.0,1229877358,American Dreamz (2006),Comedy|Drama
29266,200,50802,3.0,1229878025,Because I Said So (2007),Comedy|Drama|Romance


In [14]:
# Only the user, the movie title and the score are needed to build the pivot table.
wanted_columns = ['userId', 'title', 'rating']
movie_ratings = movie__ratings.loc[:, wanted_columns]
movie_ratings

Unnamed: 0,userId,title,rating
0,1,Toy Story (1995),4.0
1,5,Toy Story (1995),4.0
2,7,Toy Story (1995),4.5
3,15,Toy Story (1995),2.5
4,17,Toy Story (1995),4.5
...,...,...,...
29263,200,Bring It On Again (2004),2.0
29264,200,Bad News Bears (2005),3.0
29265,200,American Dreamz (2006),3.0
29266,200,Because I Said So (2007),3.0


1. Pivot table for similarity

In [15]:
# One row per user, one column per title; cells hold the rating and stay NaN
# where the user has not rated the title (pivot_table averages duplicates).
pivot_movies = movie_ratings.pivot_table(
    index='userId',
    columns='title',
    values='rating',
)
pivot_movies

title,'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",(500) Days of Summer (2009),00 Schneider - Jagd auf Nihil Baxter (1994),1-900 (06) (1994),10 (1979),10 Cent Pistol (2015),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),...,Zombie Strippers! (2008),Zombieland (2009),Zookeeper (2011),Zoolander (2001),Zootopia (2016),Zulu (1964),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,4.0
2,,,,,,,,,,,...,,3.0,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,,,,,,,,,,,...,,,,,,,,,,
197,,,,,,,,,,,...,,,,,,,,,,
198,,,,,,,,,,1.0,...,,,,,,,,5.0,,2.0
199,,,,,,,,,,,...,,,,,,,,,,


### Feature Scaling:
> More on normalisation formula: https://www.analyticsvidhya.com/blog/2020/04/feature-scaling-machine-learning-normalization-standardization/#:~:text=Normalization%20is%20a%20scaling%20technique,values%20of%20the%20feature%20respectively.

1. Normalise values: a further explanation of why normalisation is needed is given at: https://stats.stackexchange.com/questions/292596/is-feature-normalisation-needed-prior-to-computing-cosine-distance



2. Replace Nan with 0

In [16]:
# Min-max scale every user's ratings to [0, 1], rounded to 2 decimals:
#     (rating - user_min) / (user_max - user_min)
# NaN (unrated) cells are ignored when taking the min/max and stay NaN.
# NOTE: a user's lowest rating maps to 0, and a user whose ratings are all
# equal has a zero range, so every one of their cells becomes NaN (0 / 0).
user_min = pivot_movies.min(axis=1)
user_range = pivot_movies.max(axis=1) - user_min
pivot_movies_n = (
    pivot_movies
    .sub(user_min, axis=0)
    .div(user_range, axis=0)
    .round(2)
)

In [17]:
# Replace NaN (unrated or undefined after scaling) with 0, then transpose so
# that each row is a movie title and each column is a userId.
# NOTE: this cell reassigns pivot_movies_n from itself, so running it a second
# time transposes the frame back; re-run from the normalisation cell instead.
pivot_movies_n = pivot_movies_n.fillna(0).T
pivot_movies_n

userId,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
'Tis the Season for Love (2015),0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
"'burbs, The (1989)",0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
(500) Days of Summer (2009),0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
00 Schneider - Jagd auf Nihil Baxter (1994),0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zulu (1964),0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
anohana: The Flower We Saw That Day - The Movie (2013),0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
eXistenZ (1999),0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.00,0.0,0.0
xXx (2002),0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0


In [18]:
# Drop the user columns that contain nothing but 0, so that the result of the
# sparse-matrix / cosine-similarity steps can later be put into a DataFrame
# labelled with this pivot table's index.
column_has_nonzero = pivot_movies_n.ne(0).any(axis=0)
pivot_movies_n = pivot_movies_n.loc[:, column_has_nonzero]
pivot_movies_n.shape

(5594, 199)

In [19]:
# Store the movie-by-user matrix in compressed sparse row (CSR) format before
# computing similarities. csr_matrix is imported in the first cell together
# with the other dependencies.
piv_sparse = csr_matrix(pivot_movies_n)
piv_sparse.shape

(5594, 199)

#### Implementing Cosine Similarity Model 

In [20]:
# Pairwise cosine similarity between the movie rows of the sparse matrix.
# cosine_similarity is already imported in the first cell, so it is not
# re-imported here.
movies_similarity = cosine_similarity(piv_sparse)

In [21]:
# Label both axes of the similarity array with the movie titles so that a
# score can be looked up as movie_sim_df[title_a][title_b].
title_labels = pivot_movies_n.index
movie_sim_df = pd.DataFrame(
    data=movies_similarity,
    index=title_labels,
    columns=title_labels,
)
movie_sim_df.head()

title,'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",(500) Days of Summer (2009),00 Schneider - Jagd auf Nihil Baxter (1994),1-900 (06) (1994),10 (1979),10 Cent Pistol (2015),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),...,Zombie Strippers! (2008),Zombieland (2009),Zookeeper (2011),Zoolander (2001),Zootopia (2016),Zulu (1964),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Tis the Season for Love (2015),0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.281844,0.0,0.0,0.0,0.0,0.0
"'burbs, The (1989)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.307705,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078567,0.128091,0.024001
(500) Days of Summer (2009),0.0,0.0,0.0,1.0,0.262098,0.0,0.0,0.0,0.166377,0.221505,...,0.0,0.490042,0.0,0.240461,0.357248,0.0,0.0,0.01976,0.246352,0.129019
00 Schneider - Jagd auf Nihil Baxter (1994),0.0,0.0,0.0,0.262098,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.213137,0.0,0.0,0.0,0.0,0.0,0.0


### Creating function to recommend movies for users

1st approach -> iterate over each row in the filtered column to find the maximum value. (However, this is not advisable: iterating through rows may change the datatype and is slow, so it should only be used as a last resort.)

Recommended approach: sort the data frame and index the list

In [22]:
def movie_reccommend(movie_name, top_n=5, sim_df=None):
    """Print the titles most similar to ``movie_name``.

    Parameters
    ----------
    movie_name : str
        Title exactly as it appears in the similarity matrix labels,
        e.g. ``'Jumanji (1995)'``.
    top_n : int, default 5
        Number of recommendations to print.
    sim_df : pandas.DataFrame, optional
        Title-by-title similarity matrix (same labels on both axes).
        Defaults to the notebook-level ``movie_sim_df``.

    Returns
    -------
    None
        The recommendations are printed, one per line.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the similarity matrix.
    """
    if sim_df is None:
        sim_df = movie_sim_df
    if movie_name not in sim_df.columns:
        raise KeyError(
            f"{movie_name!r} is not in the similarity matrix; "
            "enter the title exactly as 'Movie Name (Year)'"
        )

    # Remove the queried title by label rather than assuming it sorts first:
    # another movie can tie with it at 1.0, and an all-zero movie has a
    # self-similarity of 0, so slicing off position 0 could drop a real
    # recommendation and list the queried movie itself.
    ranked_titles = (
        sim_df[movie_name]
        .drop(labels=movie_name, errors='ignore')
        .sort_values(ascending=False, kind='stable')
        .head(top_n)
        .index
    )

    print(f"Based on movies since you watched {movie_name}")
    for number, movie in enumerate(ranked_titles, start=1):
        # Same cell the original printed: column `movie`, row `movie_name`.
        print(f"#{number}: {movie}, {sim_df.at[movie_name, movie]*100}% match")


# Titles are matched exactly, so strip stray leading/trailing whitespace from
# the typed text before looking it up.
movie = input('''Enter: 'Movie Name (Year)' ''').strip()
movie_reccommend(movie)

Enter: 'Movie Name (Year)' Jumanji (1995)
Based on movies since you watched Jumanji (1995)
#1: Santa Clause, The (1994), 52.64732328061969% match
#2: Star Wars: Episode I - The Phantom Menace (1999), 50.888167198009334% match
#3: Honey, I Shrunk the Kids (1989), 50.73063365887568% match
#4: Lion King, The (1994), 48.02876565748221% match
#5: Back to the Future Part II (1989), 47.51251371260069% match


# Using K-clustering Model

Finding the average user rating for each Movie 

In [23]:
# Mean of every numeric column per movieId, rounded to 2 decimals. The userId
# and timestamp means carry no meaning and are removed in the following cell.
ratings_by_movie = ratings.groupby('movieId')
avg_rating_movie = ratings_by_movie.mean().round(2)

In [24]:
# Keep only the average rating. Reassigning (instead of inplace=True) together
# with errors='ignore' makes the cell safe to re-run: a second run no longer
# raises KeyError because the columns were already removed.
avg_rating_movie = avg_rating_movie.drop(columns=['userId', 'timestamp'], errors='ignore')

In [25]:
# Display the per-movie average rating, indexed by movieId.
avg_rating_movie

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.92
2,3.43
3,3.26
4,2.36
5,3.07
...,...
193581,4.00
193583,3.50
193585,3.50
193587,3.50


In [26]:
# Left-join the average rating onto the movies table: the movieId column of
# `movies` is matched against the movieId index of `avg_rating_movie`.
# NOTE: this rebinds `movie_ratings`, which earlier held the
# userId/title/rating frame used for the pivot table.
movie_ratings = movies.join(other=avg_rating_movie, on='movieId', how='left')
movie_ratings

Unnamed: 0,movieId,title,genres,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.43
2,3,Grumpier Old Men (1995),Comedy|Romance,3.26
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.36
4,5,Father of the Bride Part II (1995),Comedy,3.07
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,4.00
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,3.50
9739,193585,Flint (2017),Drama,3.50
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,3.50


Choosing to analyse only the comedy movies

In [27]:
# Select every movie whose pipe-separated genres string mentions 'Comedy'.
is_comedy = movie_ratings['genres'].str.contains('Comedy')
movies_comedy = movie_ratings.loc[is_comedy]
movies_comedy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3756 entries, 0 to 9741
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  3756 non-null   int64  
 1   title    3756 non-null   object 
 2   genres   3756 non-null   object 
 3   rating   3753 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 146.7+ KB


In [28]:
# Select every movie whose pipe-separated genres string mentions 'Romance'.
is_romance = movie_ratings['genres'].str.contains('Romance')
movies_romance = movie_ratings.loc[is_romance]
movies_romance

Unnamed: 0,movieId,title,genres,rating
2,3,Grumpier Old Men (1995),Comedy|Romance,3.26
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.36
6,7,Sabrina (1995),Comedy|Romance,3.19
10,11,"American President, The (1995)",Comedy|Drama|Romance,3.67
14,15,Cutthroat Island (1995),Action|Adventure|Romance,3.00
...,...,...,...,...
9639,179511,Emerald Green (2016),Adventure|Drama|Fantasy|Romance,4.00
9660,181315,Phantom Thread (2017),Drama|Romance,3.50
9691,184349,Elsa & Fred (2005),Comedy|Drama|Romance,3.50
9715,188751,Mamma Mia: Here We Go Again! (2018),Comedy|Romance,4.50


## Visualisations