# Notes

Collaborative filters can further be classified into two types:

* `user-based filtering` - recommend products to a user that **similar** users have liked. If tony and jeris have similar taste in games, and tony plays a new game and likes it, it is highly likely that jeris will like it too.

* `item-based filtering` - extremely similar to the content recommendation engine that you built. These systems identify similar items based on how people have rated them in the past. If tony and jeris both gave 5 stars to LOTR and the Hobbit, the system identifies the two items as similar, so if lehcar buys LOTR, the system will recommend the Hobbit.

In [None]:
import numpy as np
import pandas as pd 
from ast import literal_eval

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Directory containing the dataset CSVs. File names are concatenated directly
# onto this string, so it must keep its trailing '/'.
data_path = '/Users/jeremy/data/movie_datasets/'

In [None]:
# Load the user ratings and the movie metadata tables from data_path.
df_ratings = pd.read_csv(data_path + 'ratings.csv')
df_meta = pd.read_csv(data_path + 'movies_metadata.csv')

In [None]:
# Preview the first rows of the ratings table.
df_ratings.head()

# Data Processing

In [None]:
# Discard metadata rows that have no title.
df_meta.dropna(subset=['title'], inplace=True)

# Cast the columns that should be numeric.
for numeric_col in ('id', 'budget', 'popularity'):
    df_meta[numeric_col] = pd.to_numeric(df_meta[numeric_col])

# 'genres' is stored as Python-literal text; parse it into Python objects.
df_meta['genres'] = df_meta['genres'].apply(literal_eval)

In [None]:
# Print a summary of df_meta after the cleaning step.
df_meta.info()

# Collaborative Filtering Recommender

## 1. Cosine Similarities

Using the mean of other users' ratings, weighted by their similarity to the target user (taken from the similarity matrix).

In [None]:
# Join each rating to its movie title (ratings.movieId == metadata.id) and
# drop the metadata key, which duplicates movieId after the join.
data = (
    df_ratings
    .merge(df_meta[['id', 'title']], left_on='movieId', right_on='id')
    .drop(columns=['id'])
)

In [None]:
# Number of ratings each movie received, repeated on every row of that movie.
data['count'] = data.groupby('movieId')['userId'].transform('count')
data.head()

In [None]:
# Ids of the top_n most-rated movies: one row per movie, ordered by its
# rating count, keeping the first top_n.
top_n = 100
top_n_movie_id = (
    data.drop_duplicates('movieId')
    .sort_values('count', ascending=False)
    .head(top_n)['movieId']
)

In [None]:
# Keep only the ratings of the selected movies and renumber the rows.
is_top_movie = data['movieId'].isin(top_n_movie_id)
data = data.loc[is_top_movie].reset_index(drop=True)
data.head()

In [None]:
# fetch top 20000 users based on number of movies rated
# NOTE: the `count` column holds the *movie's* rating count, so sorting the
# first row of each user by it ranks users by the popularity of an arbitrary
# movie they rated, not by their own activity. Tally the rows per user instead.
top_n_users = 20000
ratings_per_user = data['userId'].value_counts()  # sorted, most active first
user_id = pd.Series(ratings_per_user.index[:top_n_users], name='userId')

In [None]:
# Keep only the ratings made by the selected users and renumber the rows.
data = data.loc[data['userId'].isin(user_id)].reset_index(drop=True)
display(data.head())
display(data.shape)

In [None]:
# create a user to movie rating matrix: one row per userId, one column per
# movieId, cell = that user's rating (NaN where the user has not rated the movie)
df = data.pivot(index='userId', columns = 'movieId', values='rating')
df.head()

In [None]:
# Fill every missing rating with the mean rating of its movie (column mean).
movie_mean_rating = df.mean(axis=0)
df_imputed = df.fillna(movie_mean_rating)
display(df_imputed.head())
display(df_imputed.shape)

In [None]:
# Pairwise cosine similarity between users' imputed rating vectors (the rows
# of df_imputed), labelled with userId on both axes.
similarity_matrix = cosine_similarity(df_imputed.to_numpy())
user_labels = df_imputed.index
df_similarity = pd.DataFrame(
    similarity_matrix,
    index=user_labels,
    columns=user_labels,
)
display(df_similarity.head())
display(df_similarity.shape)

In [None]:
def get_cf_recommendation(user_id: int, n_neighbors: int = 100, top_k: int = 10) -> pd.DataFrame:
    """Recommend movies the user has not rated, via user-based collaborative filtering.

    Each unrated movie is scored by the mean, over the ``n_neighbors`` users
    most similar to ``user_id``, of ``neighbour's imputed rating * similarity``.
    The score is a ranking signal; it is not normalised back to the rating scale.

    Relies on the module-level frames ``df`` (user x movie ratings, NaN where
    unrated), ``df_imputed`` (the same matrix with NaN filled),
    ``df_similarity`` (user x user similarity) and ``df_meta`` (movie
    ``id`` / ``title``).

    Args:
        user_id: Label of the target user in ``df`` and ``df_similarity``.
        n_neighbors: Number of most-similar users to average over.
        top_k: Maximum number of recommendations to return.

    Returns:
        DataFrame with columns ``title``, ``id``, ``movieId`` and ``rating``,
        sorted by ``rating`` descending and truncated to ``top_k`` rows.

    Raises:
        KeyError: If ``user_id`` is not a label of the similarity/rating matrices.
    """
    # Similarity of every other user to the target. The target is removed by
    # label: relying on it being first after a sort breaks when another user
    # ties at the maximum similarity.
    sim_scores = df_similarity[user_id].drop(labels=user_id)

    # Pick the neighbourhood *before* weighting, so that only the most similar
    # users contribute to the scores.
    neighbors = sim_scores.nlargest(n_neighbors)

    # Movies the target user has not rated (NaN in the raw rating matrix).
    user_ratings = df.loc[user_id]
    unrated_movies = user_ratings[user_ratings.isnull()].index

    # Scale each neighbour's ratings of those movies by that neighbour's
    # similarity (row-wise, aligned on userId), then average per movie.
    neighbor_ratings = df_imputed.loc[neighbors.index, unrated_movies]
    weighted_ratings = neighbor_ratings.mul(neighbors, axis=0)
    movie_ratings = (
        weighted_ratings.mean(axis=0)
        .rename_axis('movieId')
        .rename('rating')
        .reset_index()
    )

    # Attach titles and rank by score.
    recommended_movies = df_meta[['title', 'id']].merge(
        movie_ratings, left_on='id', right_on='movieId'
    )
    recommended_movies = recommended_movies.sort_values('rating', ascending=False)

    return recommended_movies.head(top_k)

# Example: show the recommendations for the user whose userId is 10.
get_cf_recommendation(10)

## 2. Surprise Package