In [32]:
import pandas as pd
import numpy as np

In [2]:
u_cols = ["user_id", "age", "sex", "job", "zip_code"]

In [3]:
user_df = pd.read_csv("../datasets/u.user", sep="|", names=u_cols, encoding="latin-1")

In [4]:
user_df.head()

Unnamed: 0,user_id,age,sex,job,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
user_df.shape

(943, 5)

In [6]:
i_cols = ["movie_id", "title", "release date", "video release date", "imdb url", 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

In [7]:
movie_df = pd.read_csv("../datasets/u.item", sep="|", names=i_cols, encoding="latin-1")

In [8]:
movie_df.head()

Unnamed: 0,movie_id,title,release date,video release date,imdb url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
movie_df.shape

(1682, 24)

In [10]:
movie_df = movie_df[["movie_id", "title"]]

In [11]:
movie_df.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [12]:
r_cols = ["user_id", "movie_id", "rating", "timestamp"]

In [15]:
rating_df = pd.read_csv("../datasets/u.data", sep="\t", names=r_cols, encoding="latin-1")

In [16]:
rating_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [17]:
rating_df = rating_df.drop("timestamp", axis=1)

In [18]:
rating_df.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [19]:
rating_df.shape

(100000, 3)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x = rating_df.copy()

In [22]:
y = rating_df["user_id"]

In [23]:
x_train, x_test, y_train, y_test, = train_test_split(x, y, test_size=0.25, stratify=y, random_state=42)

In [27]:
from sklearn.metrics import mean_squared_error

In [28]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [29]:
def baseline(user_id, movie_id):
    return 3.0

In [33]:
def score(cf_model):
    #Construct a list of user-movie tuples from the testing dataset
    id_pairs = zip(x_test["user_id"], x_test["movie_id"])
    #Predict the rating for every user-movie tuple
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs])
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(x_test["rating"])
    #Return the final RMSE score
    return rmse(y_true, y_pred)

In [34]:
score(baseline)

1.2488234462885457

In [35]:
r_matrix = x_train.pivot_table(values='rating', index='user_id', columns='movie_id')

In [37]:
r_matrix.shape

(943, 1641)

In [38]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, movie_id):
    #Check if movie_id exists in r_matrix
    if movie_id in r_matrix:
    #Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[movie_id].mean()
    else:
    #Default to a rating of 3.0 in the absence of any information
        mean_rating = 3.0
    return mean_rating

In [39]:
score(cf_user_mean)

1.0300824802393536

In [40]:
r_matrix_dummy = r_matrix.copy().fillna(0)

In [41]:
r_matrix_dummy.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,0.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
from sklearn.metrics.pairwise import cosine_similarity

In [43]:
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [44]:
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

In [46]:
cosine_sim.shape

(943, 943)

In [47]:
cosine_sim.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.108361,0.046638,0.029577,0.245753,0.335853,0.344724,0.191582,0.057149,0.251979,...,0.257073,0.069412,0.231643,0.108093,0.176842,0.104799,0.232472,0.051528,0.129555,0.256333
2,0.108361,1.0,0.057613,0.130237,0.054918,0.190552,0.079399,0.076146,0.167992,0.147376,...,0.136993,0.252887,0.255454,0.285193,0.232751,0.149088,0.102807,0.062386,0.109143,0.107686
3,0.046638,0.057613,1.0,0.139805,0.0,0.032485,0.043869,0.080968,0.022263,0.059925,...,0.027402,0.0,0.17506,0.010343,0.105635,0.019052,0.127099,0.023917,0.060392,0.0
4,0.029577,0.130237,0.139805,1.0,0.0,0.04519,0.088586,0.199526,0.135013,0.026919,...,0.055392,0.049773,0.076549,0.139382,0.113886,0.0,0.130343,0.077357,0.15789,0.063911
5,0.245753,0.054918,0.0,0.0,1.0,0.176443,0.28186,0.132205,0.03879,0.1342,...,0.183969,0.019305,0.073714,0.041807,0.081088,0.029743,0.188392,0.068342,0.055557,0.207259


In [51]:
def cf_user_wmean(user_id, movie_id):
    #Check if movie_id exists in r_matrix
    if movie_id in r_matrix:
    #Get the similarity scores for the user in question with every other user
        sim_scores = cosine_sim[user_id]
    #Get the user ratings for the movie in question
        m_ratings = r_matrix[movie_id]
    #Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index
    #Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()
    #Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)
    #Compute the final weighted mean
        wmean_rating = np.dot(sim_scores, m_ratings) / sim_scores.sum()
    else:
    #Default to a rating of 3.0 in the absence of any information
        wmean_rating = 3.0
    if wmean_rating is not np.NAN:
        return wmean_rating
    else:
        return 3.0

In [52]:
score(cf_user_wmean)

  wmean_rating = np.dot(sim_scores, m_ratings) / sim_scores.sum()


ValueError: Input contains NaN.

In [53]:
from sklearn.model_selection import train_test_split

In [57]:
x = rating_df.copy()

In [58]:
y = rating_df["movie_id"]

In [60]:
x_train, x_test, y_train, y_test, = train_test_split(x, y, test_size=0.25, random_state=42)

In [61]:
from sklearn.metrics import mean_squared_error

In [62]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [63]:
def baseline(user_id, movie_id):
    return 3.0

In [64]:
def score(cf_model):
    #Construct a list of user-movie tuples from the testing dataset
    id_pairs = zip(x_test["movie_id"], x_test["user_id"])
    #Predict the rating for every user-movie tuple
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs])
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(x_test["rating"])
    #Return the final RMSE score
    return rmse(y_true, y_pred)

In [65]:
score(baseline)

1.243784547258889

In [66]:
r_matrix = x_train.pivot_table(values='rating', index='movie_id', columns='user_id')

In [67]:
r_matrix.shape

(1642, 943)

In [68]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, movie_id):
    #Check if movie_id exists in r_matrix
    if movie_id in r_matrix:
    #Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[movie_id].mean()
    else:
    #Default to a rating of 3.0 in the absence of any information
        mean_rating = 3.0
    return mean_rating

In [69]:
score(cf_user_mean)

1.044885130655045

In [70]:
r_matrix_dummy = r_matrix.copy().fillna(0)

In [71]:
r_matrix_dummy.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
2,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
5,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
from sklearn.metrics.pairwise import cosine_similarity

In [73]:
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [74]:
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

In [75]:
cosine_sim.shape

(1642, 1642)

In [76]:
cosine_sim.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1668,1670,1671,1672,1673,1676,1678,1679,1680,1681
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.29432,0.252481,0.318105,0.193389,0.098734,0.470498,0.351443,0.366423,0.208826,...,0.0,0.0,0.0,0.0,0.042312,0.0,0.0,0.0,0.0,0.0
2,0.29432,1.0,0.18856,0.358484,0.30246,0.0,0.297852,0.303719,0.208703,0.104786,...,0.0,0.0,0.0,0.064342,0.0,0.0,0.0,0.0,0.0,0.090993
3,0.252481,0.18856,1.0,0.237236,0.149656,0.070271,0.254691,0.178093,0.21833,0.097036,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.318105,0.358484,0.237236,1.0,0.26809,0.019944,0.329948,0.366548,0.24995,0.174417,...,0.0,0.0,0.0,0.045094,0.0,0.106287,0.0,0.0,0.0,0.063772
5,0.193389,0.30246,0.149656,0.26809,1.0,0.021167,0.23482,0.171509,0.174599,0.019225,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
def cf_user_wmean(user_id, movie_id):
    #Check if movie_id exists in r_matrix
    if movie_id in r_matrix:
    #Get the similarity scores for the user in question with every other user
        sim_scores = cosine_sim[user_id]
    #Get the user ratings for the movie in question
        m_ratings = r_matrix[movie_id]
    #Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index
    #Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()
    #Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)
    #Compute the final weighted mean
        wmean_rating = np.dot(sim_scores, m_ratings) / sim_scores.sum()
    else:
    #Default to a rating of 3.0 in the absence of any information
        wmean_rating = 3.0
    if wmean_rating is not np.NAN:
        return wmean_rating
    else:
        return 3.0

In [52]:
score(cf_user_wmean)

  wmean_rating = np.dot(sim_scores, m_ratings) / sim_scores.sum()


ValueError: Input contains NaN.