## Movie Recommendation Notebook
##### This workbook based on ideas collected all over mainly from the IBM Coursera course on Machine Learning to give movie recommendations based on collaborative filtering.

#### Importing, Data Cleaning and Pre-processing

In [1]:
# import the necessary packages for the first instance
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
# for plotting inside in the jupyter notebook
%matplotlib inline

In [2]:
# import datasets
movies_df = pd.read_csv('data/movies.csv')
ratings_df = pd.read_csv('data/ratings.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [4]:
# performing some cleaning on the data, removing the columns needed, etc.
# obtaining the date column from the title column and storing as it's own column
movies_df['year'] = movies_df.title.str.extract('(\(\d+\))',expand=False)
movies_df['year'] = movies_df.year.str.extract('(\d+)',expand=False)

# removing the years part from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d+\))', '')
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())

  movies_df['title'] = movies_df.title.str.replace('(\(\d+\))', '')


In [5]:
# since we dont need genres in this code we take it out
movies_df = movies_df.drop('genres', 1)
movies_df.head()

  movies_df = movies_df.drop('genres', 1)


Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [6]:
# since we equally dont need timestamp in ratings
ratings_df = ratings_df.drop('timestamp', 1)
ratings_df.head()

  ratings_df = ratings_df.drop('timestamp', 1)


Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


### Collaborative Filtering

In [7]:
# Given a user's random input of the movies they have liked and rated
inputMovies = pd.DataFrame([
            {'title':'Superman', 'rating':5},
            {'title':'Django Unchained', 'rating':3.5},
            {'title':'Godfather, The', 'rating':2},
            {'title':"V for Vendetta", 'rating':5},
            {'title':'Incredibles, The', 'rating':4.5},
            {'title':'Waiting to Exhale', 'rating':4},
            {'title':'If Lucy Fell', 'rating':5},
            {'title':'Desperado', 'rating':5},
            {'title':'Expendables, The', 'rating':5},
            {'title':'Apocalypto', 'rating':5}
         ])

In [8]:
# assigning the movie id to the titles in the inputMovies dataframe by matching titles
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies = pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop('year', 1)
#Final input dataframe
#If a movie you added in above isn't here, then it might not be in the original 
#dataframe or it might spelled differently, please check capitalisation.
inputMovies

  inputMovies = inputMovies.drop('year', 1)


Unnamed: 0,movieId,title,rating
0,4,Waiting to Exhale,4.0
1,118,If Lucy Fell,5.0
2,163,Desperado,5.0
3,858,"Godfather, The",2.0
4,2640,Superman,5.0
5,140417,Superman,5.0
6,8961,"Incredibles, The",4.5
7,44191,V for Vendetta,5.0
8,48304,Apocalypto,5.0
9,79695,"Expendables, The",5.0


In [9]:
# since we have the movie IDs we can extract the users that have seen the same movie and reviewed them
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.size)

431403


In [10]:
# we have about 666966 observations of movies watched and reviewed by others which is good 
# now we take a look at the head of ehis dataset
userSubset.head()

Unnamed: 0,userId,movieId,rating
193,4,8961,4.0
241,7,858,3.5
258,8,163,3.5
289,10,163,3.5
302,10,2640,3.5


In [11]:
# the data has information from all over and since we need the particular users, we group the data as such
userSubsetGroup = userSubset.groupby(['userId'])
# pulling out a random group from the dataset
userSubsetGroup.get_group(1130)

Unnamed: 0,userId,movieId,rating
104193,1130,163,3.5
104535,1130,2640,3.0
104961,1130,44191,1.0
105071,1130,79695,0.5
105132,1130,99114,2.5


In [12]:
# now we sort so that the users that are most relevant to the input users data are on top. ie. the ones with movies they have also watched
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)
userSubsetGroup[0:3]

[(36946,
           userId  movieId  rating
  3402638   36946        4     1.0
  3402701   36946      118     2.0
  3402719   36946      163     3.5
  3403010   36946      858     0.5
  3403692   36946     2640     3.0
  3405168   36946     8961     5.0
  3405491   36946    44191     4.5
  3405584   36946    48304     1.5
  3406188   36946    79695     2.0
  3406494   36946    99114     4.0),
 (58040,
           userId  movieId  rating
  5418004   58040        4     0.5
  5418084   58040      118     2.0
  5418108   58040      163     3.0
  5418488   58040      858     4.0
  5419456   58040     2640     3.5
  5421527   58040     8961     1.0
  5421957   58040    44191     3.5
  5422069   58040    48304     1.5
  5422979   58040    79695     2.0
  5423321   58040    99114     2.5),
 (60472,
           userId  movieId  rating
  5646279   60472        4     1.0
  5646324   60472      118     1.5
  5646336   60472      163     2.5
  5646498   60472      858     5.0
  5646877   60472     26

##### Creating a correlation relation beteween the user's information and the groups

In [13]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [14]:
pearsonCorrelationDict.items();

In [15]:
# creating a dataframe form the correlation coefficients with the user ids
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.379461,36946
1,-0.256158,58040
2,-0.754163,60472
3,-0.609118,204165
4,-0.48132,204518


In [16]:
# now we sort according to the users with the high correlation score
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
14443,1.0,222855
11105,1.0,128525
9852,1.0,94617
9197,1.0,76423
8748,1.0,63419


##### Using the selected ratings to all the movies and then using the result for the recommendation

In [17]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,222855,1,3.5
1,1.0,222855,3,2.0
2,1.0,222855,16,3.0
3,1.0,222855,19,1.5
4,1.0,222855,32,3.0


In [18]:
# multiplying the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,222855,1,3.5,3.5
1,1.0,222855,3,2.0,2.0
2,1.0,222855,16,3.0,3.0
3,1.0,222855,19,1.5,1.5
4,1.0,222855,32,3.0,3.0


In [19]:
# applying a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,30.0,117.0
2,20.0,64.5
3,8.0,19.5
4,1.0,1.0
5,7.0,18.0


In [20]:
# now we calculate the weighted average score of each movie and store that for the final recommendation
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.9,1
2,3.225,2
3,2.4375,3
4,1.0,4
5,2.571429,5


In [27]:
# sorting according the weighted average
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4329,5.0,4329
6975,5.0,6975
4453,5.0,4453
4608,5.0,4608
2973,5.0,2973
2969,5.0,2969
59387,5.0,59387
59392,5.0,59392
2945,5.0,2945
4609,5.0,4609


### Recommendation

In [28]:
# this is the final recommendation list based on the filtering results
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())].reset_index(drop = True)

Unnamed: 0,movieId,title,year
0,2945,Mike's Murder,1984
1,2969,"Man and a Woman, A (Un homme et une femme)",1966
2,2973,Crimes and Misdemeanors,1989
3,4329,Rio Bravo,1959
4,4453,Michael Jordan to the Max,2000
5,4608,"Innocent Man, An",1989
6,4609,Jacknife,1989
7,6975,Funny Games,1997
8,59387,"Fall, The",2006
9,59392,Stargate: The Ark of Truth,2008


---

## Original Author
Saeed Aghabozorgi

### Other Contributors
<a href="https://www.linkedin.com/in/joseph-s-50398b136/?utm_medium=Exinfluencer&utm_source=Exinfluencer&utm_content=000026UJ&utm_term=10006555&utm_id=NA-SkillsNetwork-Channel-SkillsNetworkCoursesIBMDeveloperSkillsNetworkML0101ENSkillsNetwork20718538-2022-01-01" target="_blank">Joseph Santarcangelo</a>

## Change Log
| Date (YYYY-MM-DD) | Version | Changed By | Change Description                 |
| ----------------- | ------- | ---------- | ---------------------------------- |
| 2020-11-03        | 2.1     | Lakshmi    | Updated URL of csv                 |
| 2020-08-27        | 2.0     | Lavanya    | Moved lab to course repo in GitLab |
|                   |         |            |                                    |
|                   |         |            |                                    |

## <h3 align="center"> © IBM Corporation 2020. All rights reserved. <h3/>
