In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD, evaluate
from surprise import accuracy

# Collaborative Filtering 

## Load files

In [4]:
# Column names for u.item: five descriptive fields followed by 19 genre flags.
# Written out as an explicit list; the previous split of a " | "-terminated
# header string produced an extra empty-string name and therefore a spurious,
# all-NaN 25th column.
names = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Same encoding as u.user below; decoding as code page 437 garbles accented
# characters in movie titles.
movies = pd.read_csv("u.item", delimiter="|", encoding="latin-1", names=names)

# Overwrite 'video release date' with the theatrical release date.
# NOTE(review): presumably because the video release field is empty in this
# dataset -- confirm before relying on this column.
movies['video release date'] = movies['release date']

# One rating event per row, tab-separated.
ratings = pd.read_csv("u.data", delimiter="\t",
                      names=["user id", "item id", "rating", "timestamp"])

u_cols = ['user id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols,
                    encoding='latin-1')

ratings.head()

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
# Reader's default rating scale is (1, 5); spelled out so the assumption is visible.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user id', 'item id', 'rating']], reader)

# The model is fit on every available rating, so no fold split is needed here.
# The former `data.split(n_folds=5)` call produced folds that were never used,
# and `Dataset.split` no longer exists in Surprise >= 1.1 (fold splitting lives
# in `surprise.model_selection`).
svd = SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)

# Display the ratings given by user 1.
ratings[ratings['user id'] == 1]

Unnamed: 0,user id,item id,rating,timestamp
202,1,61,4,878542420
305,1,189,3,888732928
333,1,33,4,878542699
334,1,160,4,875072547
478,1,20,4,887431883
639,1,202,5,875072442
687,1,171,5,889751711
820,1,265,4,878542441
933,1,155,2,878542201
972,1,117,3,874965739


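## Make a Prediction

Estimate the rating that user 1 would give to item 94; the third argument (2) is the known true rating, passed along for reference.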

In [7]:
# Estimate user 1's rating for item 94; r_ui is the known true rating and is
# only carried through into the returned Prediction.
svd.predict(uid=1, iid=94, r_ui=2)

Prediction(uid=1, iid=94, r_ui=2, est=2.3049731027502487, details={'was_impossible': False})

## Evaluation

Per-fold precision@5 and recall@5 (averaged over users), followed by 5-fold cross-validated RMSE.

In [6]:
from collections import defaultdict

from surprise import Dataset
from surprise import SVD
from surprise.model_selection import KFold


def precision_recall_at_k(predictions, k=5, threshold=3.5, undefined=1):
    '''Return precision and recall at k metrics for each user.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
            Only ``uid``, ``true_r`` and ``est`` are used.
        k: number of highest-estimated items per user that are considered
            when counting recommended items.
        threshold: a rating (true or estimated) greater than or equal to this
            value counts as relevant / recommended.
        undefined: value stored when a metric's denominator is zero (no
            recommended item in the top k for precision, no relevant item for
            recall). The default of 1 keeps the original behaviour; pass 0 to
            stop such users from being scored as perfect.

    Returns:
        A ``(precisions, recalls)`` tuple of dicts keyed by user id.
    '''

    # Group (estimate, true rating) pairs by user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Highest estimated rating first; the sort is stable, so ties keep
        # their input order.
        ranked = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items among everything this user rated.
        n_rel = sum(1 for _, true_r in ranked if true_r >= threshold)

        # Recommended items in the top k.
        n_rec_k = sum(1 for est, _ in top_k if est >= threshold)

        # Items in the top k that are both relevant and recommended.
        n_rel_and_rec_k = sum(1 for est, true_r in top_k
                              if true_r >= threshold and est >= threshold)

        # float() forces true division even where int / int would truncate
        # (Python 2 without `from __future__ import division`).

        # Precision@K: proportion of recommended items that are relevant.
        precisions[uid] = (float(n_rel_and_rec_k) / n_rec_k
                           if n_rec_k != 0 else undefined)

        # Recall@K: proportion of relevant items that are recommended.
        recalls[uid] = (float(n_rel_and_rec_k) / n_rel
                        if n_rel != 0 else undefined)

    return precisions, recalls


# Evaluate SVD on the built-in 'ml-100k' dataset using 5 cross-validation folds.
data = Dataset.load_builtin('ml-100k')
kf = KFold(n_splits=5)
svd = SVD()

for trainset, testset in kf.split(data):
    # Refit on this fold's training part, then predict its held-out part.
    svd.fit(trainset)
    predictions = svd.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Average the per-user metrics over all users: precision first, then recall.
    mean_precision = sum(precisions.values()) / len(precisions)
    print(mean_precision)
    mean_recall = sum(recalls.values()) / len(recalls)
    print(mean_recall)

0.879720155863
0.249750558313
0.871797593772
0.260068726059
0.876114649682
0.262204756654
0.874443266172
0.263374030678
0.874256900212
0.251886430413


In [8]:
from surprise.model_selection import cross_validate

# 5-fold cross-validation of the SVD model, reporting RMSE for each fold.
# The returned dict of scores and timings is shown as the cell's output.
cross_validate(algo=svd, data=data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9417  0.9340  0.9335  0.9312  0.9419  0.9365  0.0045  
Fit time          5.21    5.25    5.17    5.16    5.19    5.20    0.03    
Test time         0.22    0.15    0.19    0.14    0.15    0.17    0.03    


{'fit_time': (5.208379030227661,
  5.254695892333984,
  5.171963930130005,
  5.160135984420776,
  5.189282178878784),
 'test_rmse': array([ 0.94172334,  0.93402388,  0.93351668,  0.93124798,  0.94192331]),
 'test_time': (0.22307991981506348,
  0.15170836448669434,
  0.19297194480895996,
  0.14153289794921875,
  0.14574408531188965)}