## Description:
###### Building a movie recommendation system that suggests movies similar to one a user recently watched

* Import all the required libraries 

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

* Loading the datasets needed

In [4]:
# Read the four CSV exports from the working directory, one DataFrame per file.
# NOTE(review): links_df is not referenced by any later cell shown here.
movies_df, ratings_df, tags_df, links_df = (
    pd.read_csv(csv_name)
    for csv_name in ('movies.csv', 'ratings.csv', 'tags.csv', 'links.csv')
)

In [5]:
movies_df.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [6]:
ratings_df.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438


In [7]:
tags_df.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,19,2324,bittersweet,1428651158
1,19,2324,holocaust,1428651112


In [8]:
movies_df['movieId'].value_counts()

2047      1
140846    1
122385    1
138424    1
65045     1
         ..
103143    1
123625    1
64234     1
117484    1
131072    1
Name: movieId, Length: 34208, dtype: int64

In [9]:
# (rows, columns) of the three frames, separated by a star rule.
print(movies_df.shape, ratings_df.shape, tags_df.shape, sep='\n****************\n')

(34208, 3)
****************
(22884377, 4)
****************
(586994, 4)


In [10]:
# Per-column count of missing values in each frame, separated by a star rule.
print(
    movies_df.isnull().sum(),
    ratings_df.isnull().sum(),
    tags_df.isnull().sum(),
    sep='\n****************\n',
)

movieId    0
title      0
genres     0
dtype: int64
****************
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
****************
userId        0
movieId       0
tag          16
timestamp     0
dtype: int64


In [11]:
# Number of fully duplicated rows in each frame, separated by a star rule.
print(
    movies_df.duplicated().sum(),
    ratings_df.duplicated().sum(),
    tags_df.duplicated().sum(),
    sep='\n****************\n',
)

0
****************
0
****************
0


* Merging the ratings and movies dataframes on the basis of movieId

In [12]:
# Attach each movie's title and genres to every rating row by joining on movieId.
ratings_with_name = pd.merge(ratings_df, movies_df, on='movieId')

In [13]:
ratings_with_name

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,169,2.5,1204927694,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
1,13,169,1.0,974868393,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
2,14,169,3.0,845470321,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
3,17,169,1.0,944991371,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
4,68,169,1.0,1011092044,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
...,...,...,...,...,...,...
22884372,247509,95581,4.0,1341630851,"Flying Fleet, The (1929)",Adventure|Drama|Romance
22884373,247534,111838,4.0,1429019913,"Man Named Rocca, A (Nommé La Rocca, Un) (Man C...",Drama
22884374,247662,137437,3.0,1436313628,Son of a Lion (2007),(no genres listed)
22884375,247670,145939,5.0,1446432935,Sandesham (1991),Children|Comedy


* Dropping the timestamp, as it does not play a vital role in building the recommendation system here

In [14]:
# Drop the rating timestamp; no later cell shown in this notebook reads it.
# Rebinding the name with errors='ignore' (instead of inplace=True) keeps this
# cell safe to re-run: the in-place drop raised KeyError on a second execution
# because the column was already gone.
ratings_with_name = ratings_with_name.drop(columns='timestamp', errors='ignore')

In [15]:
ratings_with_name.head(2)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,169,2.5,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
1,13,169,1.0,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama


* Getting the number of ratings given to each title into a new data frame

In [16]:
# Number of ratings each title received.
# 'rating' is selected before aggregating so only that column is counted,
# rather than counting every column of the merged frame and discarding the rest.
num_rating_df = (
    ratings_with_name
    .groupby('title')['rating']
    .count()
    .reset_index()
    .rename(columns={'rating': 'No-of ratings'})
)
num_rating_df.head()

Unnamed: 0,title,No-of ratings
0,"""Great Performances"" Cats (1998)",207
1,#1 Cheerleader Camp (2010),5
2,#chicagoGirl: The Social Network Takes on a Di...,3
3,$ (Dollars) (1971),27
4,$5 a Day (2008),50


* Getting average rating of each title to a new data frame

In [17]:
# Mean rating of each title.
# 'rating' is selected before aggregating: calling .mean() on the whole grouped
# frame also tries to average the string 'genres' column, which raises TypeError
# on pandas >= 2.0 (older versions silently dropped it with a warning).
avg_rating_df = (
    ratings_with_name
    .groupby('title')['rating']
    .mean()
    .reset_index()
    .rename(columns={'rating': 'Average_rating'})
)
avg_rating_df.head()

Unnamed: 0,title,Average_rating
0,"""Great Performances"" Cats (1998)",2.775362
1,#1 Cheerleader Camp (2010),2.5
2,#chicagoGirl: The Social Network Takes on a Di...,3.666667
3,$ (Dollars) (1971),2.740741
4,$5 a Day (2008),2.98


* Merging both average and number of ratings data frame to one

In [18]:
# One row per title carrying both its rating count and its average rating.
popular_df = pd.merge(num_rating_df, avg_rating_df, on='title')

In [19]:
popular_df.head()

Unnamed: 0,title,No-of ratings,Average_rating
0,"""Great Performances"" Cats (1998)",207,2.775362
1,#1 Cheerleader Camp (2010),5,2.5
2,#chicagoGirl: The Social Network Takes on a Di...,3,3.666667
3,$ (Dollars) (1971),27,2.740741
4,$5 a Day (2008),50,2.98


* We only consider movies that received more than 2000 ratings, sort them by average rating in descending order, and keep the top 50

In [20]:
# Titles with more than 2000 ratings, ranked by average rating; keep the best 50.
popular_movies_df = (
    popular_df
    .loc[popular_df['No-of ratings'].gt(2000)]
    .sort_values(by='Average_rating', ascending=False)
    .iloc[:50]
)

In [21]:
popular_movies_df.shape

(50, 3)

In [22]:
popular_movies_df.head()

Unnamed: 0,title,No-of ratings,Average_rating
25369,"Shawshank Redemption, The (1994)",77887,4.44171
11767,"Godfather, The (1972)",49846,4.353639
31510,"Usual Suspects, The (1995)",53195,4.318987
24769,Schindler's List (1993),59857,4.290952
11768,"Godfather: Part II, The (1974)",32247,4.268878


* Merging the top 50 movies dataframe to original movies data frame

In [23]:
# Add the genres of each top-50 title.
# The left side is restricted to its three original columns and movies_df is
# reduced to one (title, genres) row per title before merging. This keeps the
# cell idempotent: previously a second run merged a frame that already had
# 'genres', produced 'genres_x'/'genres_y', and failed with KeyError on 'genres'.
popular_movies_df = (
    popular_movies_df[['title', 'No-of ratings', 'Average_rating']]
    .merge(movies_df[['title', 'genres']].drop_duplicates('title'), on='title')
)

In [24]:
popular_movies_df.head()

Unnamed: 0,title,No-of ratings,Average_rating,genres
0,"Shawshank Redemption, The (1994)",77887,4.44171,Crime|Drama
1,"Godfather, The (1972)",49846,4.353639,Crime|Drama
2,"Usual Suspects, The (1995)",53195,4.318987,Crime|Mystery|Thriller
3,Schindler's List (1993),59857,4.290952,Drama|War
4,"Godfather: Part II, The (1974)",32247,4.268878,Crime|Drama


# Collaborative Filtering Based Recommendation System

In [25]:
ratings_with_name.head(1)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,169,2.5,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama


In [26]:
ratings_with_name.shape

(22884377, 5)

* Grouping the data by userId to find the users who have rated more than 200 movies

In [32]:
# Boolean mask indexed by userId: True where the user has more than 200 ratings.
x = ratings_with_name.groupby('userId')['title'].count().gt(200)
# The userIds for which the mask is True.
rated_users = x.index[x.values]

* User Id who has given more than 200 ratings in count

In [33]:
rated_users

Int64Index([    15,     17,     20,     37,     38,     39,     47,     50,
                59,     68,
            ...
            247624, 247662, 247700, 247702, 247704, 247705, 247725, 247730,
            247732, 247735],
           dtype='int64', name='userId', length=28729)

* Keeping only the ratings in the ratings_with_name dataframe that come from users who have given more than 200 ratings

In [34]:
# Keep only the rating rows written by the heavy raters selected above.
filtered_rating = ratings_with_name.loc[
    lambda frame: frame['userId'].isin(rated_users)
]

In [38]:
filtered_rating.head()

Unnamed: 0,userId,movieId,rating,title,genres
3,17,169,1.0,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
4,68,169,1.0,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
5,178,169,2.5,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
6,217,169,1.5,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
7,432,169,2.0,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama


* Filtering that dataset further to the titles which received more than 50 ratings from these users

In [39]:
# Boolean mask indexed by title: True where the title has more than 50 ratings
# among the already-filtered users.
y = filtered_rating.groupby('title')['rating'].count().gt(50)
# The titles for which the mask is True.
famous_movies = y.index[y.values]

In [40]:
# Keep only the rating rows whose title passed the popularity threshold.
filtered_rating = filtered_rating.loc[
    lambda frame: frame['title'].isin(famous_movies)
]

In [41]:
filtered_rating.head()

Unnamed: 0,userId,movieId,rating,title,genres
3,17,169,1.0,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
4,68,169,1.0,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
5,178,169,2.5,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
6,217,169,1.5,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
7,432,169,2.0,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama


In [42]:
# Distinct users, then distinct titles, left after both filters (one per line).
print(
    filtered_rating['userId'].nunique(),
    filtered_rating['title'].nunique(),
    sep='\n',
)

28729
10650


#### From the above

* We got 28729 users who have each given more than 200 ratings, and 10650 movies that each received more than 50 ratings from those users

* Creating a pivot table with movie title as index, userId as columns and rating as values

In [43]:
# Title-by-user rating matrix: one row per title, one column per userId.
pt = pd.pivot_table(
    filtered_rating,
    values='rating',
    index='title',
    columns='userId',
)

In [44]:
pt

userId,15,17,20,37,38,39,47,50,59,68,...,247624,247662,247700,247702,247704,247705,247725,247730,247732,247735
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
"'burbs, The (1989)",,3.0,,,,,,,,,...,,,,3.0,,4.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
loudQUIETloud: A Film About the Pixies (2006),,,,,,,,,,,...,,,,,,,,,,
xXx (2002),,,,,,,,,,,...,,0.5,,,,,,,,4.0
xXx: State of the Union (2005),,,,,,,,,,,...,,1.0,,,,,,,,
¡Three Amigos! (1986),,2.0,,,,,,,,,...,,3.0,,2.0,,,,,,


In [45]:
# A user who did not rate a title shows up as NaN in the pivot; encode
# "no rating" as 0 so every row is a fully numeric vector for the similarity step.
pt.fillna(0,inplace=True)

In [46]:
pt

userId,15,17,20,37,38,39,47,50,59,68,...,247624,247662,247700,247702,247704,247705,247725,247730,247732,247735
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The (1989)",0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
loudQUIETloud: A Film About the Pixies (2006),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
¡Three Amigos! (1986),0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


* Applying cosine similarity to the rows of the pivot table to get the similarity between every pair of movies, based on the ratings users gave them

In [47]:
# pt has one row per title, so the rows being compared are movies: the result is a
# square title-by-title matrix whose row/column order follows pt.index.
similarity_scores = cosine_similarity(pt)

In [48]:
similarity_scores

array([[1.        , 0.01561635, 0.02341465, ..., 0.02306728, 0.02182749,
        0.02792622],
       [0.01561635, 1.        , 0.03149477, ..., 0.02494127, 0.05100685,
        0.10712142],
       [0.02341465, 0.03149477, 1.        , ..., 0.05909923, 0.03009537,
        0.01415255],
       ...,
       [0.02306728, 0.02494127, 0.05909923, ..., 1.        , 0.07417993,
        0.00828392],
       [0.02182749, 0.05100685, 0.03009537, ..., 0.07417993, 1.        ,
        0.04942021],
       [0.02792622, 0.10712142, 0.01415255, ..., 0.00828392, 0.04942021,
        1.        ]])

In [49]:
movies_df.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [56]:
def recommendation_system(movie_name, top_n=5):
    """Return the movies most similar to ``movie_name``.

    Similarity is read from the module-level ``similarity_scores`` matrix, whose
    rows and columns follow the order of ``pt.index``.

    Parameters
    ----------
    movie_name : str
        Title exactly as it appears in ``pt.index``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list
        One entry per recommended movie, most similar first. Each entry is
        ``[titles, movieIds, genres]``, where each element is a list taken from
        the matching ``movies_df`` row (empty lists if the title is not found
        in ``movies_df``).

    Raises
    ------
    IndexError
        If ``movie_name`` is not present in ``pt.index``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative, got {!r}'.format(top_n))

    # Position of the requested title in the pivot table / similarity matrix.
    matches = np.flatnonzero(pt.index == movie_name)
    if matches.size == 0:
        raise IndexError(
            '{!r} is not among the titles in the similarity matrix'.format(movie_name)
        )
    query_pos = matches[0]

    # Rank all movies by descending similarity; the stable sort keeps ties in
    # pt.index order, as the previous sorted(..., reverse=True) did.
    scores = np.asarray(similarity_scores[query_pos])
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query movie explicitly. Skipping "the first result" is wrong
    # when another movie ties with it at similarity 1.0 and sorts ahead of it.
    neighbours = ranked[ranked != query_pos][:top_n]

    data = []
    for pos in neighbours:
        # Titles can repeat in movies_df; keep the first row for each title.
        movie_rows = movies_df[movies_df['title'] == pt.index[pos]].drop_duplicates('title')
        data.append([
            list(movie_rows['title'].values),
            list(movie_rows['movieId'].values),
            list(movie_rows['genres'].values),
        ])
    return data

In [57]:
recommendation_system('Free Willy 2: The Adventure Home (1995)')

[[['Free Willy 3: The Rescue (1997)'], [1595], ['Adventure|Children|Drama']],
 [['Free Willy (1993)'], [455], ['Adventure|Children|Drama']],
 [['Richie Rich (1994)'], [374], ['Children|Comedy']],
 [['Next Karate Kid, The (1994)'], [502], ['Action|Children|Romance']],
 [['Flintstones, The (1994)'], [355], ['Children|Comedy|Fantasy']]]

In [58]:
recommendation_system('Shawshank Redemption, The (1994)')

[[['Pulp Fiction (1994)'], [296], ['Comedy|Crime|Drama|Thriller']],
 [['Silence of the Lambs, The (1991)'], [593], ['Crime|Horror|Thriller']],
 [['Forrest Gump (1994)'], [356], ['Comedy|Drama|Romance|War']],
 [['Matrix, The (1999)'], [2571], ['Action|Sci-Fi|Thriller']],
 [['Usual Suspects, The (1995)'], [50], ['Crime|Mystery|Thriller']]]