# Collaborative Filtering

In [1]:
import pandas as pd 
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the user ratings; the "timestamp" column is not used, and movieId becomes the row index
ratings = (
    pd.read_csv("ratings.csv")
    .drop(columns=["timestamp"])
    .set_index("movieId")
)

ratings.head()

Unnamed: 0_level_0,userId,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,4.0
3,1,4.0
6,1,4.0
47,1,5.0
50,1,5.0


In [3]:
# Load the movie catalogue; the "genres" column is not used, and movieId becomes the row index
movies = (
    pd.read_csv("movies.csv")
    .drop(columns=["genres"])
    .set_index("movieId")
)

movies.head()

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
1,Toy Story (1995)
2,Jumanji (1995)
3,Grumpier Old Men (1995)
4,Waiting to Exhale (1995)
5,Father of the Bride Part II (1995)


In [4]:
# Attach each rating's movie title by matching the movieId indexes of the two frames.
# The inner join keeps only movieIds present in both; movieId is then restored as a column.
df = ratings.merge(
    movies,
    how="inner",
    left_index=True,
    right_index=True,
).reset_index()

df.head()

Unnamed: 0,movieId,userId,rating,title
0,1,1,4.0,Toy Story (1995)
1,1,5,4.0,Toy Story (1995)
2,1,7,4.5,Toy Story (1995)
3,1,15,2.5,Toy Story (1995)
4,1,17,4.5,Toy Story (1995)


In [5]:
# Pivot the long ratings table into a movie-by-user matrix: one row per movie, one column per
# userId, each cell holding the rating; (movie, user) pairs without a rating are filled with 0.
df2 = (
    df.pivot_table(values="rating", index=["movieId", "title"], columns="userId")
    .fillna(0)
    # Move "title" out of the row index so that "movieId" alone indexes the matrix
    .reset_index(level="title")
)

df2.head()

userId,title,1,2,3,4,5,6,7,8,9,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,Jumanji (1995),0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,Grumpier Old Men (1995),4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Create a function that finds the similar movies to a given movie id.
def similar_movies(df, k, movie_id):
    """Return the titles of the ``k`` movies most similar to ``movie_id``.

    Movies are compared by the cosine distance between their rows of the
    movie-user matrix, using a brute-force nearest-neighbours search.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie-user matrix indexed by movie id. It must contain a "title"
        column; every other column is used as a feature. The index is
        assumed to hold each movie id only once.
    k : int
        Number of similar movies to return.
    movie_id : label
        Index label of the movie to find similar movies to.

    Returns
    -------
    list of str
        At most ``k`` titles, nearest first. The queried movie itself is
        never included. Fewer than ``k`` titles are returned only when
        ``df`` has fewer than ``k + 1`` rows.

    Raises
    ------
    KeyError
        If ``movie_id`` is not in ``df.index``.
    """
    # Feature matrix: everything except the title (built once, used for fit and query)
    features = df.drop(columns="title")

    # Row position of the queried movie. Used to exclude it from the results by
    # position rather than by title, so a different movie that happens to share
    # the same title is not dropped by mistake.
    query_pos = df.index.get_loc(movie_id)
    query = features.loc[[movie_id]].to_numpy()

    # Build a NearestNeighbors model. One extra neighbour is requested because the
    # queried movie is normally its own nearest neighbour; the count is capped at
    # the number of rows because kneighbors rejects n_neighbors > n_samples.
    n_neighbors = min(k + 1, len(features))
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric='cosine')

    # Fit the model on the feature matrix
    kNN.fit(features.to_numpy())

    # Find the closest neighbours of the queried row.
    # kneighbors returns row positions (not movie ids), nearest first.
    neighbor_positions = kNN.kneighbors(query, return_distance=False)[0]

    # Map positions to titles, skipping the queried movie. The final slice keeps
    # exactly k results even when the queried movie was not among the neighbours
    # (possible when other rows are tied with it at distance 0).
    titles = df["title"]
    recs_names = [titles.iloc[pos] for pos in neighbor_positions if pos != query_pos]

    # return a list with the recommended movie titles
    return recs_names[:k]

In [7]:
# Title of the movie to base the recommendations on
movie_name = 'Shawshank Redemption, The (1994)'

# movieId of that title: the index label of the first matching row of the movie-user matrix
movie_id = df2.loc[df2["title"] == movie_name].index[0]

# Ask similar_movies() for 10 recommendations
recommended_movies = similar_movies(df2, k=10, movie_id=movie_id)

# Print the recommendations as a numbered list, starting at 1
print("Since you watched %s: \n" % movie_name)
for rank, title in enumerate(recommended_movies, start=1):
    print("%s) %s" % (rank, title))

Since you watched Shawshank Redemption, The (1994): 

1) Forrest Gump (1994)
2) Pulp Fiction (1994)
3) Silence of the Lambs, The (1991)
4) Usual Suspects, The (1995)
5) Schindler's List (1993)
6) Fight Club (1999)
7) Braveheart (1995)
8) Matrix, The (1999)
9) Apollo 13 (1995)
10) Seven (a.k.a. Se7en) (1995)


In [8]:
## Modify the function to also take the user id, and ensure that the recommended movies have not already been watched by the user.

def similar_movies_2(df, k, movie_id, user_id):
    """Return up to ``k`` titles similar to ``movie_id`` that ``user_id`` has not rated.

    Movies are ranked by the cosine distance between their rows of the
    movie-user matrix. A movie counts as "not watched" by the user when the
    user's column holds 0 for that movie.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie-user matrix indexed by movie id. It must contain a "title"
        column and one column per user id; every column except "title" is
        used as a feature. The index is assumed to hold each movie id only once.
    k : int
        Number of recommendations to return.
    movie_id : label
        Index label of the movie to find similar movies to.
    user_id : label
        Column label of the user the recommendations are for.

    Returns
    -------
    list of str
        At most ``k`` titles, nearest first, excluding the queried movie and
        every movie the user has a non-zero rating for. Fewer than ``k``
        titles are returned only when fewer than ``k`` such movies exist.

    Raises
    ------
    KeyError
        If ``movie_id`` is not in ``df.index`` or ``user_id`` is not a column.
    """
    features = df.drop(columns="title")

    # Row position of the queried movie and a per-row "not watched" mask.
    # Filtering by row position instead of by title keeps movies that share a
    # title distinct, and replaces the per-candidate scan of a title list.
    query_pos = df.index.get_loc(movie_id)
    query = features.loc[[movie_id]].to_numpy()
    not_watched = (df[user_id] == 0).to_numpy()

    # Rank every movie rather than a fixed k*3 window: a fixed window comes up
    # short when the user has already rated most of the nearest movies, and
    # kneighbors rejects n_neighbors > n_samples on small matrices. The brute-force
    # search computes the distance to every row in any case.
    kNN = NearestNeighbors(n_neighbors=len(features), algorithm="brute", metric='cosine')
    kNN.fit(features.to_numpy())

    # kneighbors returns row positions (not movie ids), nearest first
    ranked_positions = kNN.kneighbors(query, return_distance=False)[0]

    titles = df["title"]
    recs_names = [
        titles.iloc[pos]
        for pos in ranked_positions
        if pos != query_pos and not_watched[pos]
    ]
    return recs_names[:k]

In [9]:
# Recommend movies similar to `movie_name` that user `user_id` has not rated yet
movie_name = 'Shawshank Redemption, The (1994)'
user_id = 5

# Resolve the title against df2, the matrix that is passed to similar_movies_2, instead of
# `movies`: df2 is built from an inner join with the ratings, so `movies` can contain titles
# that have no row in df2, and an id taken from there would not be found inside the function.
movie_id = df2[df2["title"] == movie_name].index[0]
recommended_movies = similar_movies_2(df2, 10, movie_id, user_id)

# Print the recommendations as a numbered list, starting at 1
print("Since you watched %s: \n" % movie_name)
for i, m in enumerate(recommended_movies):
    print("%s) %s" % (i+1, m))

Since you watched Shawshank Redemption, The (1994): 

1) Forrest Gump (1994)
2) Silence of the Lambs, The (1991)
3) Fight Club (1999)
4) Matrix, The (1999)
5) Seven (a.k.a. Se7en) (1995)
6) Lord of the Rings: The Return of the King, The (2003)
7) Godfather, The (1972)
8) Good Will Hunting (1997)
9) Jurassic Park (1993)
10) Lord of the Rings: The Fellowship of the Ring, The (2001)
