# Recommending movies based on user-based Collaborative Filtering!

## Downloading the dataset!

In [None]:
# Download the movie dataset archive and unpack it into the working directory;
# the loading cell below reads movies.csv and ratings.csv from there.
# wget -O writes the download to the given file name.
!wget -O moviedataset.zip https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip
print('unziping ...')
# unzip -o overwrites existing files without prompting; -j discards the
# archive's directory structure so the files are extracted flat.
!unzip -o -j moviedataset.zip 

## Importing the required libraries

In [1]:
# Dataframe manipulation library
import pandas as pd

# Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt

# Numerical python library
import numpy as np

# Visualization Library
import matplotlib.pyplot as plt


## Data Preprocessing! 

In [22]:
# Load the movie catalogue and the user ratings from the CSV files in the
# working directory.
movies, ratings = (pd.read_csv(csv_name) for csv_name in ("movies.csv", "ratings.csv"))

In [23]:
# Preview the first rows of the movie catalogue (movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [24]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [35]:
# The timestamp column is not used by the recommender, so drop it.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts; `errors="ignore"` keeps the cell safe to re-run once the column is gone.
ratings = ratings.drop(columns=["timestamp"], errors="ignore")

In [25]:
# The genres column is not used by the recommender, so drop it.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts; `errors="ignore"` keeps the cell safe to re-run once the column is gone.
movies = movies.drop(columns=["genres"], errors="ignore")

In [26]:
# Confirm that the genres column is gone.
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [27]:
# Pull the "(YYYY)" release-year suffix out of each title into its own column.
# Titles without such a suffix get NaN. The pattern is a raw string so that
# `\(` and `\d` reach the regex engine without invalid-escape warnings.
movies["year"] = movies["title"].str.extract(r"(\(\d\d\d\d\))", expand=False)

In [28]:
# Check the new year column (still wrapped in parentheses at this stage).
movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story (1995),(1995)
1,2,Jumanji (1995),(1995)
2,3,Grumpier Old Men (1995),(1995)
3,4,Waiting to Exhale (1995),(1995)
4,5,Father of the Bride Part II (1995),(1995)


In [29]:
# Remove the "(YYYY)" suffix from the titles now that it lives in `year`.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern
# as a literal string by default, which would leave every title unchanged.
# Any whitespace left behind by the removal is trimmed in the following step.
movies["title"] = movies["title"].str.replace(r"(\(\d\d\d\d\))", "", regex=True)

In [30]:
# Check that the year has been removed from the titles.
movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,(1995)
1,2,Jumanji,(1995)
2,3,Grumpier Old Men,(1995)
3,4,Waiting to Exhale,(1995)
4,5,Father of the Bride Part II,(1995)


In [31]:
# Trim leading/trailing whitespace from the titles.
# The vectorised `.str.strip()` passes missing titles through as NaN, whereas
# calling `x.strip()` per element raises on a non-string value.
movies["title"] = movies["title"].str.strip()

In [32]:
# Check the titles after whitespace trimming.
movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,(1995)
1,2,Jumanji,(1995)
2,3,Grumpier Old Men,(1995)
3,4,Waiting to Exhale,(1995)
4,5,Father of the Bride Part II,(1995)


In [33]:
# Keep only the four digits of the year, discarding the surrounding parentheses.
# The values remain strings. The pattern is a raw string so that `\d` reaches
# the regex engine without an invalid-escape warning.
movies["year"] = movies["year"].str.extract(r"(\d\d\d\d)", expand=False)

In [34]:
# Check that year now holds only the four digits.
movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [36]:
# Confirm that the timestamp column is gone from ratings.
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


## Collaborative Filtering Implementation

In [37]:
# Ratings supplied by the user we want recommendations for, as one
# {'title', 'rating'} record per movie. Titles are written in the same form
# as the cleaned catalogue titles (no year, e.g. "Breakfast Club, The").
userInput = [
    {"title": movie_title, "rating": movie_rating}
    for movie_title, movie_rating in (
        ("Breakfast Club, The", 5),
        ("Toy Story", 3.5),
        ("Jumanji", 2),
        ("Pulp Fiction", 5),
        ("Akira", 4.5),
    )
]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [49]:
# Look up the catalogue rows whose title matches one of the user's movies.
# NOTE(review): matching is by title only, so distinct movies that share a
# title would all be selected — confirm this is acceptable for the dataset.
input_id = movies.loc[movies["title"].isin(inputMovies["title"])]

In [50]:
# Show the catalogue rows that matched the user's titles.
input_id

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
293,296,Pulp Fiction,1994
1246,1274,Akira,1988
1885,1968,"Breakfast Club, The",1985


In [51]:
# Attach movieId and year to the user's ratings. With no explicit key, the
# merge joins on the columns the two frames have in common.
inputMovies = input_id.merge(inputMovies)

In [52]:
# Show the user's movies with movieId and year alongside each rating.
inputMovies

Unnamed: 0,movieId,title,year,rating
0,1,Toy Story,1995,3.5
1,2,Jumanji,1995,2.0
2,296,Pulp Fiction,1994,5.0
3,1274,Akira,1988,4.5
4,1968,"Breakfast Club, The",1985,5.0


## Selecting the users who have rated the movies in inputMovies

In [53]:
# Keep only the rating rows that refer to one of the user's input movies.
subset_of_users = ratings.loc[ratings["movieId"].isin(inputMovies["movieId"])]

In [55]:
# Preview the ratings that reference one of the input movies.
subset_of_users.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0


In [56]:
# Group the filtered ratings per user so each user's overlap with the input
# movies can be inspected as one sub-frame.
grouped_users_subset = subset_of_users.groupby(by="userId")

In [66]:
# Inspect one group as an example: the ratings that userId 1130 gave to the
# input movies. get_group raises KeyError if the id has no group.
grouped_users_subset.get_group(1130)

Unnamed: 0,userId,movieId,rating
104167,1130,1,0.5
104168,1130,2,4.0
104214,1130,296,4.0
104363,1130,1274,4.5
104443,1130,1968,4.5
