# Movie Recommendation System ##
The data comes from the MovieLens 100K dataset (`ml-100k.zip`) published by GroupLens (grouplens.org). This project uses two of its files: `u.data` (the ratings) and `u.item` (the movie metadata).

In [74]:
# importing necessary modules
import numpy as np
import pandas as pd
import warnings #module used to avoid unnecessary warnings

In [75]:
# No category or module is given, so this silences *every* warning for the rest of the
# session, including deprecation notices raised by pandas and numpy.
warnings.filterwarnings('ignore') # suppress all warnings

#### Reading the movie ratings and user ids from u.data

In [76]:
# Column layout of u.data: user_id | item_id | rating | timestamp
columns_name = ['user_id', 'item_id', 'rating', 'timestamp']
# u.data is tab-separated and has no header row, so the column names are passed in explicitly.
df = pd.read_csv('u.data', names=columns_name, header=None, sep="\t")

In [77]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [78]:
 df.shape

(100000, 4)

In [79]:
# Only the value of the last expression in a cell is displayed; the first two lines are
# evaluated and then discarded.
df['user_id'] # -> the full user_id column (one entry per rating row)
df['user_id'].unique() # -> array of the distinct user ids
df['user_id'].nunique() # -> number of distinct user ids

943

In [80]:
df['item_id'].nunique() # -> returns the no.of unique item id's ie., movies in the data

1682

#### Reading the movie titles from u.item

In [81]:
# u.item is pipe-delimited and has no header row.
# A literal "|" replaces the regex "\|": in a normal (non-raw) string "\|" is an invalid
# escape sequence, and a regex separator forces pandas onto its slower Python parsing engine.
# NOTE(review): the MovieLens 100K u.item file is widely reported to be ISO-8859-1 rather
# than UTF-8 (titles contain accented characters such as "Á"); the encoding is pinned so the
# read does not depend on the platform default. Confirm against your copy of the file.
movies_title = pd.read_csv('u.item', sep="|", header=None, encoding="latin-1")

In [82]:
movies_title.shape

(1682, 24)

In [83]:
movies_title.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


#### Cleaning the data: keeping only the needed columns and storing them in `movies_titles`

In [84]:
# Keep only the first two fields of u.item (the movie id and its title) and give them
# readable column names.
movies_titles = movies_title[[0, 1]].rename(columns={0: "item_id", 1: "title"})
movies_titles

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [85]:
# Attach each rating's movie title via the shared "item_id" key.
# Selecting the original rating columns first keeps this cell safe to re-run: without it,
# a second execution would merge onto a frame that already has "title" and produce
# "title_x"/"title_y" columns instead.
df = pd.merge(df[columns_name], movies_titles, on="item_id")

In [86]:
df

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)
...,...,...,...,...,...
99995,840,1674,4,891211682,Mamma Roma (1962)
99996,655,1640,3,888474646,"Eighth Day, The (1996)"
99997,655,1637,3,888984255,Girls Town (1996)
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1..."


In [87]:
df.tail()

Unnamed: 0,user_id,item_id,rating,timestamp,title
99995,840,1674,4,891211682,Mamma Roma (1962)
99996,655,1640,3,888474646,"Eighth Day, The (1996)"
99997,655,1637,3,888984255,Girls Town (1996)
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1..."
99999,655,1641,3,887427810,Dadetown (1995)


In [88]:
# Build 'ratings': one row per movie title holding the mean of its ratings.
# Selecting 'rating' before aggregating avoids averaging user_id, item_id and timestamp
# only to throw those results away.
ratings = df.groupby('title')['rating'].mean().to_frame()

In [90]:
ratings.head()

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
'Til There Was You (1997),2.333333
1-900 (1994),2.6
101 Dalmatians (1996),2.908257
12 Angry Men (1957),4.344
187 (1997),3.02439


In [93]:
# Add 'num of ratings': how many ratings each title received.
# The grouped count is already a Series indexed by title, so it is assigned directly
# instead of being wrapped in a DataFrame first.
ratings['num of ratings'] = df.groupby('title')['rating'].count()

In [94]:
ratings.head()

Unnamed: 0_level_0,rating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.333333,9
1-900 (1994),2.6,5
101 Dalmatians (1996),2.908257,109
12 Angry Men (1957),4.344,125
187 (1997),3.02439,41


### Data collection and pre-processing are done! 😁 Now we start building our recommendation system! 😊

In [97]:
# Reshape the long ratings table into a user x movie matrix: one row per user_id, one column
# per title, each cell holding that user's (mean) rating and NaN where there is none.
movie_matrix = pd.pivot_table(df, values="rating", index="user_id", columns="title")

In [99]:
movie_matrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


### For example: we will first build a recommendation system for users who watched 'Star Wars (1977)', and then generalize it to other movies. 💖

In [107]:
# Pull out the 'Star Wars (1977)' column: one rating per user_id, NaN where the user has
# not rated it.
starwars_user_ratings = movie_matrix.loc[:, 'Star Wars (1977)']

In [108]:
starwars_user_ratings.head()

user_id
1    5.0
2    5.0
3    NaN
4    5.0
5    4.0
Name: Star Wars (1977), dtype: float64

###### Now we want to find the correlation between the "Star Wars (1977)" ratings and every column of "movie_matrix"!
i.e., this shows how strongly the ratings of each other movie are correlated with those of "Star Wars (1977)".

###### Correlation ranges from -1 to +1, corresponding to negative and positive correlation. To do this we use the built-in pandas function `corrwith`.

In [112]:
# Correlate every movie's rating column with the Star Wars ratings, matching rows by user_id.
# Titles for which no correlation can be computed come back as NaN.
similar_to_starwars =  movie_matrix.corrwith(starwars_user_ratings)

In [117]:
similar_to_starwars

title
'Til There Was You (1997)                0.872872
1-900 (1994)                            -0.645497
101 Dalmatians (1996)                    0.211132
12 Angry Men (1957)                      0.184289
187 (1997)                               0.027398
                                           ...   
Young Guns II (1990)                     0.228615
Young Poisoner's Handbook, The (1995)   -0.007374
Zeus and Roxanne (1997)                  0.818182
unknown                                  0.723123
Á köldum klaka (Cold Fever) (1994)            NaN
Length: 1664, dtype: float64

In [119]:
# Turn the correlation Series into a one-column DataFrame named 'correlation'.
# Series.to_frame names the column directly, whereas pd.DataFrame(series, columns=[...])
# selects by label and only carries the values over when the Series is unnamed.
corr_starwars = similar_to_starwars.to_frame('correlation')

In [122]:
# Discard the titles whose correlation is NaN.
corr_starwars = corr_starwars.dropna()

In [123]:
corr_starwars

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
'Til There Was You (1997),0.872872
1-900 (1994),-0.645497
101 Dalmatians (1996),0.211132
12 Angry Men (1957),0.184289
187 (1997),0.027398
...,...
Young Guns (1988),0.186377
Young Guns II (1990),0.228615
"Young Poisoner's Handbook, The (1995)",-0.007374
Zeus and Roxanne (1997),0.818182


In [134]:
# Rank every title by its correlation with 'Star Wars (1977)', highest first, and show the top 20.
# Many titles tie at exactly 1.0 here; the cells below add a minimum-number-of-ratings filter.
corr_starwars.sort_values('correlation',ascending=False).head(20)

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
Hollow Reed (1996),1.0
Commandments (1997),1.0
Cosi (1996),1.0
No Escape (1994),1.0
Stripes (1981),1.0
Star Wars (1977),1.0
Man of the Year (1995),1.0
"Beans of Egypt, Maine, The (1994)",1.0
"Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)",1.0
"Outlaw, The (1943)",1.0


In [135]:
# Recap of the per-title average rating and rating count. The count is used below as a
# minimum-popularity threshold to improve the recommendations.
ratings

Unnamed: 0_level_0,rating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.333333,9
1-900 (1994),2.600000,5
101 Dalmatians (1996),2.908257,109
12 Angry Men (1957),4.344000,125
187 (1997),3.024390,41
...,...,...
Young Guns II (1990),2.772727,44
"Young Poisoner's Handbook, The (1995)",3.341463,41
Zeus and Roxanne (1997),2.166667,6
unknown,3.444444,9


In [136]:
# Add each title's 'num of ratings' (from the 'ratings' frame) next to its correlation.
# Selecting just the 'correlation' column first keeps this cell safe to re-run: joining onto a
# frame that already contains 'num of ratings' would fail on the overlapping column name.
corr_starwars = corr_starwars[['correlation']].join(ratings['num of ratings'])

In [137]:
corr_starwars

Unnamed: 0_level_0,correlation,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),0.872872,9
1-900 (1994),-0.645497,5
101 Dalmatians (1996),0.211132,109
12 Angry Men (1957),0.184289,125
187 (1997),0.027398,41
...,...,...
Young Guns (1988),0.186377,101
Young Guns II (1990),0.228615,44
"Young Poisoner's Handbook, The (1995)",-0.007374,41
Zeus and Roxanne (1997),0.818182,6


In [139]:
# Keep only the titles rated more than 100 times, then rank them by correlation (highest first).
corr_starwars.loc[lambda frame: frame['num of ratings'] > 100].sort_values(by='correlation', ascending=False)

Unnamed: 0_level_0,correlation,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Star Wars (1977),1.000000,583
"Empire Strikes Back, The (1980)",0.747981,367
Return of the Jedi (1983),0.672556,507
Raiders of the Lost Ark (1981),0.536117,420
Austin Powers: International Man of Mystery (1997),0.377433,130
...,...,...
"Edge, The (1997)",-0.127167,113
As Good As It Gets (1997),-0.130466,112
Crash (1996),-0.148507,128
G.I. Jane (1997),-0.176734,175


### Now we have successfully built the Recommendation System for 'Star Wars (1977)' movie viewers! 🎉

#### Let's proceed with generalized movie recommendation system! ✌ (as a function)

In [142]:
# Generalized function for movie recommendation
def predict_movies(movie_name, min_ratings=100):
    """Rank movies by how strongly their user ratings correlate with `movie_name`.

    Reads two notebook-level frames: `movie_matrix` (user_id x title ratings) and
    `ratings` (per-title statistics with a 'num of ratings' column).

    Parameters
    ----------
    movie_name : str
        Title exactly as it appears in the columns of `movie_matrix`,
        e.g. 'Star Wars (1977)'.
    min_ratings : int, default 100
        Only titles with strictly more than this many ratings are returned.

    Returns
    -------
    pandas.DataFrame
        Indexed by title with columns 'correlation' and 'num of ratings', sorted by
        'correlation' in descending order. The queried movie itself is included when
        it passes the `min_ratings` filter.

    Raises
    ------
    KeyError
        If `movie_name` is not a column of `movie_matrix`.
    """
    if movie_name not in movie_matrix.columns:
        raise KeyError("{!r} is not a title in movie_matrix".format(movie_name))

    # Ratings given to the requested movie, indexed by user_id.
    movie_user_ratings = movie_matrix[movie_name]

    corr_movie = (
        movie_matrix.corrwith(movie_user_ratings)  # correlation of every title with the requested one
        .dropna()                                  # discard titles whose correlation is NaN
        .to_frame('correlation')
        .join(ratings['num of ratings'])           # attach each title's rating count
    )

    # Apply the popularity threshold, then rank from most to least correlated.
    popular = corr_movie[corr_movie['num of ratings'] > min_ratings]
    return popular.sort_values('correlation', ascending=False)

In [148]:
predict_movies('Titanic (1997)').head()

Unnamed: 0_level_0,correlation,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Titanic (1997),1.0,350
"River Wild, The (1994)",0.4976,146
"Abyss, The (1989)",0.472103,151
Bram Stoker's Dracula (1992),0.44356,120
True Lies (1994),0.435104,208


In [147]:
predict_movies('River Wild, The (1994)').head()

Unnamed: 0_level_0,correlation,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"River Wild, The (1994)",1.0,146
Kiss the Girls (1997),0.7445,143
Young Guns (1988),0.662424,101
Seven Years in Tibet (1997),0.640866,155
My Best Friend's Wedding (1997),0.576746,172


## Hereby, We have successfully built our Movie Recommendation System! 🎁✨🎂🎉💖