In [1]:
import numpy as np
import pandas as pd

In [2]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=header)
df.head(10)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [4]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

Number of users = 943 | Number of movies = 1682


In [6]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)

In [7]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]


In [8]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')


In [9]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [10]:
user_similarity

array([[ 0.        ,  0.89435302,  0.97580102, ...,  0.89585846,
         0.83197024,  0.69954136],
       [ 0.89435302,  0.        ,  0.9295624 , ...,  0.81833339,
         0.90756094,  0.9308135 ],
       [ 0.97580102,  0.9295624 ,  0.        , ...,  0.92777578,
         0.93913979,  0.9660639 ],
       ..., 
       [ 0.89585846,  0.81833339,  0.92777578, ...,  0.        ,
         0.93552127,  0.91181683],
       [ 0.83197024,  0.90756094,  0.93913979, ...,  0.93552127,
         0.        ,  0.856848  ],
       [ 0.69954136,  0.9308135 ,  0.9660639 , ...,  0.91181683,
         0.856848  ,  0.        ]])

In [11]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [12]:
user_prediction
x = pd.DataFrame(user_similarity)

writer = pd.ExcelWriter('user_similarity.xlsx')
x.to_excel(writer,'Sheet1')
writer.save()

In [53]:
train_data_matrix

x = pd.DataFrame(test_data_matrix)

writer = pd.ExcelWriter('test_data_matrix.xlsx')
x.to_excel(writer,'Sheet1')
writer.save()

In [15]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [19]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.1307659963536376
Item-based CF RMSE: 3.4582299015386035


In [20]:
sparsity=round(1.0-len(df)/float(n_users*n_items),3)
print('The sparsity level of MovieLens100K is ' +  str(sparsity*100) + '%')

The sparsity level of MovieLens100K is 93.7%


In [21]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))

User-based CF MSE: 2.7229828967640057


In [27]:
# Load in movie data
idx_to_movie = {}
with open('u.item', 'r') as f:
    for line in f.readlines():
        info = line.split('|')
        idx_to_movie[int(info[0])-1] = info[1]
        
def top_k_movies(similarity, mapper, movie_idx, k=5):
    return [mapper[x] for x in np.argsort(similarity[movie_idx,:])[:-k-1:-1]]

In [49]:
idx = 1 # Toy Story
movies = top_k_movies(item_prediction, idx_to_movie, idx)
print(movies)
idx_to_movie[idx]

['Vie est belle, La (Life is Rosey) (1987)', 'Liebelei (1933)', 'Vermont Is For Lovers (1992)', 'It Takes Two (1995)', 'Shadows (Cienie) (1988)']


'GoldenEye (1995)'

In [51]:
idx = 0 # Toy Story
movies = top_k_movies(item_similarity, idx_to_movie, idx)
print(movies)
idx_to_movie[idx]

['Shooter, The (1995)', 'Innocents, The (1961)', 'Mad Dog Time (1996)', 'Getting Even with Dad (1994)', 'Babysitter, The (1995)']


'Toy Story (1995)'