# Collaborative Filtering

In [10]:
import surprise as sp
import pandas as pd
# read data
rating_df = pd.read_csv('ratings.csv')

# create surprise reader, rating ranges from 1 to 5
reader = sp.Reader(rating_scale=(1, 5))

# load the dataframe to surprise dataset
# order: [user, movie, rating]
data = sp.Dataset.load_from_df(rating_df[['UserID', 'MovieID', 'Rating']], reader)

In [48]:
# create an algorithm object
# use the 10 nearest neighbors
algo = sp.prediction_algorithms.knns.KNNBasic(k=10)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x183d08203c8>

In [None]:
# fit the model
algo.fit(data)

Oops...what goes wrong? Oh, we need to do the train test split before passing the data!

In [None]:
# split the data into training and testing set
train_data, test_data = sp.model_selection.train_test_split(data, test_size=0.1)
# fit the algorithm using the data
algo.fit(train_data)

In [49]:
import numpy as np
# selecte the first user
selected_user = 1
# length of movie id
n_movies = train_data.n_items
# predient user i's rating for every movie
ratings = []
for i in range(n_movies):
    pred = algo.estimate(selected_user, i)[0]
    ratings.append(pred)
# recommend the top 10 movies
recom_movies = np.argsort([i for i in ratings])[-10:]
# id of top 10 movies
recom_movies

array([  70, 3355, 3025, 3685, 3674, 3673, 3650, 3656, 3276, 3052],
      dtype=int64)

In [50]:
# convert surprise inner id to raw id
raw_id = []
for i in recom_movies:
    raw_id.append(train_data.to_raw_iid(i))
# view information of the recommended movies
movie_df = pd.read_csv('movies.csv')
movie_df[movie_df['MovieID'].isin(raw_id)]

Unnamed: 0,MovieID,Title,Genres
435,439,Dangerous Game (1993),Drama
777,787,"Gate of Heavenly Peace, The (1995)",Documentary
977,989,Schlafes Bruder (Brother of Sleep) (1995),Drama
1762,1830,Follow the Bitch (1998),Comedy
2789,2858,American Beauty (1999),Comedy|Drama
3103,3172,Ulysses (Ulisse) (1954),Adventure
3164,3233,Smashing Time (1967),Comedy
3313,3382,Song of Freedom (1936),Drama
3538,3607,One Little Indian (1973),Comedy|Drama|Western
3811,3881,Bittersweet Motel (2000),Documentary


# Content-based Filtering

In [64]:
from sklearn.feature_extraction.text import CountVectorizer
# use genres as movie attributes
count_vector = CountVectorizer(stop_words='english')
# matrix of movie attributes
movie_attr_mat = count_vector.fit_transform(movie_df['Genres'])

In [83]:
# select a user as example
selected_user = 1
selected_rating = rating_df[rating_df['UserID']==selected_user]
# filter out high rated movies
selected_rating = selected_rating[selected_rating['Rating']>=4]
# obtain movie ids
encoding_of_rated_movies = (selected_rating.MovieID-1).values
# use average movie attributes as user profile
user_profile = movie_attr_mat[encoding_of_rated_movies]
user_profile = np.mean(user_profile, axis=0)

In [106]:
from sklearn.metrics.pairwise import cosine_similarity
# compute cosine similarity between user profile and every movie
sim_matrix = cosine_similarity(user_profile, movie_attr_mat)
# convert to one-dimensional vector
sim_vector = np.array(sim_matrix)[0]
# recommend the 10 most similar movie
recom_movies = np.argsort(sim_vector)[-10:]
movie_df[movie_df['MovieID'].isin(recom_movies+1)]

Unnamed: 0,MovieID,Title,Genres
352,356,Forrest Gump (1994),Comedy|Romance|War
489,493,Menace II Society (1993),Action|Crime|Drama
817,828,"Adventures of Pinocchio, The (1996)",Adventure|Children's
1472,1504,Hollow Reed (1996),Drama
1700,1753,Half Baked (1998),Comedy
1928,1997,"Exorcist, The (1973)",Horror
2034,2103,Tall Tale (1994),Adventure|Children's
2125,2194,"Untouchables, The (1987)",Action|Crime|Drama
2289,2358,Savior (1998),Drama
2369,2438,Outside Ozona (1998),Drama|Thriller
