# MOVIE RECOMMENDER SYSTEM

In [1]:
# Import the required libraries.
import pandas as pd

In [2]:
# Load the MovieLens dataset
ratings = pd.read_csv('movielens_dataset/ratings.csv')
movies = pd.read_csv('movielens_dataset/movies.csv')

In [4]:
ratings.head

<bound method NDFrame.head of         userId  movieId  rating   timestamp
0            1        1     4.0   964982703
1            1        3     4.0   964981247
2            1        6     4.0   964982224
3            1       47     5.0   964983815
4            1       50     5.0   964982931
...        ...      ...     ...         ...
100831     610   166534     4.0  1493848402
100832     610   168248     5.0  1493850091
100833     610   168250     5.0  1494273047
100834     610   168252     5.0  1493846352
100835     610   170875     3.0  1493846415

[100836 rows x 4 columns]>

In [5]:
ratings.shape

(100836, 4)

In [7]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [9]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [13]:
ratings.duplicated().sum()

0

In [15]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [20]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [16]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [17]:
movies.duplicated().sum()

0

In [18]:
movies.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [19]:
movies.head

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [24]:
ratings['userId'].value_counts()

414    2698
599    2478
474    2108
448    1864
274    1346
       ... 
406      20
595      20
569      20
431      20
442      20
Name: userId, Length: 610, dtype: int64

In [26]:
#ensuring no user has rated a movie more than once.
# Check for duplicates
duplicates = ratings.duplicated(subset=['userId', 'movieId'])

# Get the entries that are duplicates
duplicate_entries = ratings[duplicates]

# Print the number of duplicate entries
print(f'Number of duplicate ratings: {duplicate_entries.shape[0]}')

Number of duplicate ratings: 0


In [29]:
movies['genres'].value_counts()

Drama                                      1053
Comedy                                      946
Comedy|Drama                                435
Comedy|Romance                              363
Drama|Romance                               349
                                           ... 
Comedy|Drama|Musical|Sci-Fi                   1
Action|Adventure|Romance|Sci-Fi               1
Adventure|Drama|Horror|Mystery|Thriller       1
Mystery|Romance|Sci-Fi|Thriller               1
Drama|Fantasy|Horror|Romance|Thriller         1
Name: genres, Length: 951, dtype: int64

In [30]:
movies['movieId'].value_counts()

86014     1
1282      1
3347      1
1298      1
25870     1
         ..
60072     1
4775      1
50601     1
131749    1
83969     1
Name: movieId, Length: 9742, dtype: int64

In [31]:
ratings['movieId'].value_counts()

356       329
318       317
296       307
593       279
2571      278
         ... 
5986        1
100304      1
34800       1
83976       1
8196        1
Name: movieId, Length: 9724, dtype: int64

In [35]:
# Count the number of ratings for each movie
movie_rating_counts = ratings['movieId'].value_counts().reset_index()

# Rename the columns for clarity
movie_rating_counts.columns = ['movieId', 'ratingCount']

# Print the first few rows to check the counts
print("Movie rating counts:")
print(movie_rating_counts)

Movie rating counts:
      movieId  ratingCount
0         356          329
1         318          317
2         296          307
3         593          279
4        2571          278
...       ...          ...
9719     5986            1
9720   100304            1
9721    34800            1
9722    83976            1
9723     8196            1

[9724 rows x 2 columns]


In [41]:
#Merging the two datasets
merged_df = movies.merge(ratings, on = 'movieId')
merged_df.drop(columns = 'timestamp', inplace= True)
merged_df.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


In [42]:
merged_df.shape

(100836, 5)

In [43]:
merged_df.isna().sum()

movieId    0
title      0
genres     0
userId     0
rating     0
dtype: int64

In [44]:
merged_df.duplicated().sum()

0

In [46]:
movie_pivot = merged_df.pivot_table(columns = 'userId', index = 'title', values = 'rating')
movie_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),,,,,,,,,,,...,,,5.0,,,,,4.5,,
xXx (2002),,,,,,,,,1.0,,...,,,,,,,,3.5,,2.0
xXx: State of the Union (2005),,,,,,,,,,,...,,,,,,,,,,1.5
¡Three Amigos! (1986),4.0,,,,,,,,,,...,,,,,,,,,,


In [47]:
movie_pivot.shape

(9719, 610)

In [49]:
movie_pivot.fillna(0, inplace= True)
movie_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5
¡Three Amigos! (1986),4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
from scipy.sparse import csr_matrix

In [51]:
movie_sparse = csr_matrix(movie_pivot)
movie_sparse

<9719x610 sparse matrix of type '<class 'numpy.float64'>'
	with 100832 stored elements in Compressed Sparse Row format>

In [52]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm= 'brute')


In [53]:
model.fit(movie_sparse)

NearestNeighbors(algorithm='brute')

In [56]:
distance, suggestion = model.kneighbors(movie_pivot.iloc[200,:].values.reshape(1,-1), n_neighbors = 5)


In [57]:
distance

array([[0.        , 8.44097151, 8.45576726, 8.6890736 , 8.74642784]])

In [58]:
suggestion

array([[ 200, 8494, 1126, 2156, 6544]], dtype=int64)

In [59]:
for i in range(len(suggestion)):
    print(movie_pivot.index[suggestion[i]])

Index(['Absolute Power (1997)', 'The Great Raid (2005)', 'Black Robe (1991)',
       'Dark Blue (2003)', 'Penn & Teller Get Killed (1989)'],
      dtype='object', name='title')


In [60]:
movie_pivot.index[200]

'Absolute Power (1997)'

In [61]:
movie_pivot.index

Index([''71 (2014)', ''Hellboy': The Seeds of Creation (2004)',
       ''Round Midnight (1986)', ''Salem's Lot (2004)',
       ''Til There Was You (1997)', ''Tis the Season for Love (2015)',
       ''burbs, The (1989)', ''night Mother (1986)',
       '(500) Days of Summer (2009)', '*batteries not included (1987)',
       ...
       'Zulu (2013)', '[REC] (2007)', '[REC]² (2009)',
       '[REC]³ 3 Génesis (2012)',
       'anohana: The Flower We Saw That Day - The Movie (2013)',
       'eXistenZ (1999)', 'xXx (2002)', 'xXx: State of the Union (2005)',
       '¡Three Amigos! (1986)', 'À nous la liberté (Freedom for Us) (1931)'],
      dtype='object', name='title', length=9719)

In [63]:
movie_titles = movie_pivot.index
movie_titles

Index([''71 (2014)', ''Hellboy': The Seeds of Creation (2004)',
       ''Round Midnight (1986)', ''Salem's Lot (2004)',
       ''Til There Was You (1997)', ''Tis the Season for Love (2015)',
       ''burbs, The (1989)', ''night Mother (1986)',
       '(500) Days of Summer (2009)', '*batteries not included (1987)',
       ...
       'Zulu (2013)', '[REC] (2007)', '[REC]² (2009)',
       '[REC]³ 3 Génesis (2012)',
       'anohana: The Flower We Saw That Day - The Movie (2013)',
       'eXistenZ (1999)', 'xXx (2002)', 'xXx: State of the Union (2005)',
       '¡Three Amigos! (1986)', 'À nous la liberté (Freedom for Us) (1931)'],
      dtype='object', name='title', length=9719)

In [77]:
import pickle
pickle.dump(model, open('pickle_files/model.pkl', 'wb'))
pickle.dump(model, open('pickle_files/movie_titles.pkl', 'wb'))
pickle.dump(model, open('pickle_files/merged_df.pkl', 'wb'))
pickle.dump(model, open('pickle_files/movie_pivot.pkl', 'wb'))

In [78]:
import numpy as np
def recommend_movie(movie_title):
    movie_id = np.where(movie_pivot.index == movie_title)[0][0]
    distance, suggestion = model.kneighbors(movie_pivot.iloc[movie_id,:].values.reshape(1,-1), n_neighbors = 5)
    
    for i in range(len(suggestion)):
        movies = movie_pivot.index[suggestion[i]]
        for j in movies:
            print(j)


In [72]:
np.where(movie_pivot.index == 'Zulu (2013)')[0][0]

9709

In [76]:
movie_title = 'Zulu (2013)'
recommend_movie(movie_title)

Anti-Social (2015)
Our Family Wedding (2010)
Robot Overlords (2014)
Lola Versus (2012)
Drive Hard (2014)
