In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate as cv
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from surprise import accuracy
from surprise.model_selection import cross_validate
import random
from datetime import datetime
from sklearn.model_selection import KFold
from collections import defaultdict
#Para garantizar reproducibilidad en resultados
seed = 10
random.seed(seed)
np.random.seed(seed)

# importing the module
import imdb
   
# creating instance of IMDb
ia = imdb.IMDb()

In [11]:
def get_top5_recommendations(predictions, topN = 5):
     
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
     
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]
     
    return top_recs

In [27]:
def get_top5_recommendations_original(predictions, topN = 5):
     
    top_recs = defaultdict(list)
    for uid, iid, true_r in predictions:
        top_recs[uid].append((iid, true_r))
     
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]
     
    return top_recs

In [2]:
ratings=pd.read_csv('ml-latest-small/ratings.csv',sep=',')
ratings['timestamp'] = ratings['timestamp'].apply(datetime.fromtimestamp)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30 13:45:03
1,1,3,4.0,2000-07-30 13:20:47
2,1,6,4.0,2000-07-30 13:37:04
3,1,47,5.0,2000-07-30 14:03:35
4,1,50,5.0,2000-07-30 13:48:51
...,...,...,...,...
100831,610,166534,4.0,2017-05-03 16:53:22
100832,610,168248,5.0,2017-05-03 17:21:31
100833,610,168250,5.0,2017-05-08 14:50:47
100834,610,168252,5.0,2017-05-03 16:19:12


In [5]:
reviews=ratings[['userId','movieId','rating','timestamp']].sort_values(by='timestamp').copy()
ratings['user_id_simple'] = pd.factorize(reviews.userId)[0]
ratings['movie_id_simple'] = pd.factorize(reviews.movieId)[0]
ratings

Unnamed: 0,userId,movieId,rating,timestamp,user_id_simple,movie_id_simple
0,1,1,4.0,2000-07-30 13:45:03,0,0
1,1,3,4.0,2000-07-30 13:20:47,0,1
2,1,6,4.0,2000-07-30 13:37:04,0,2
3,1,47,5.0,2000-07-30 14:03:35,0,3
4,1,50,5.0,2000-07-30 13:48:51,0,4
...,...,...,...,...,...,...
100831,610,166534,4.0,2017-05-03 16:53:22,601,9614
100832,610,168248,5.0,2017-05-03 17:21:31,601,9553
100833,610,168250,5.0,2017-05-08 14:50:47,601,3047
100834,610,168252,5.0,2017-05-03 16:19:12,601,3039


In [8]:
# getting the number unique users and restaurants
unique_users = reviews.userId.unique().shape[0]
unique_restaurants = reviews.movieId.unique().shape[0]

reader = Reader( rating_scale = ( 0, 5 ) )
reviews.head()

Unnamed: 0,userId,movieId,rating,timestamp
66719,429,595,5.0,1996-03-29 13:36:55
66716,429,588,5.0,1996-03-29 13:36:55
66717,429,590,5.0,1996-03-29 13:36:55
66718,429,592,5.0,1996-03-29 13:36:55
66712,429,432,3.0,1996-03-29 13:36:55


In [9]:
surprise_dataset = Dataset.load_from_df( reviews[ ["userId","movieId","rating"] ], reader )
train_set, test_set=  train_test_split(surprise_dataset, test_size=.3)

In [10]:

sim_options = {'name': 'cosine',
               'user_based': True  # calcule similitud user_user
               }
for i in (3,50,100):
    algo1 = KNNBasic(k=i, min_k=3, sim_options=sim_options)
    algo1.fit(train_set)
    predictions1 = algo1.test(test_set)
    print(accuracy.rmse(predictions1))

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0625
1.0625154457380344
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9740
0.9739837297171795
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9751
0.9750903829552365


In [72]:
top10_recommendations_est = get_top5_recommendations(predictions1,10)
top10_recommendations_est[553]
#predictions1

[(50, 4.384847167547031),
 (2571, 4.2257469763632),
 (1704, 4.11749013418424),
 (32, 4.045627969191606),
 (778, 3.974569600220769),
 (27831, 3.9274073895956003),
 (5989, 3.894440319622537),
 (16, 3.868774330550282),
 (49530, 3.837817818129477),
 (3897, 3.817155080016401)]

In [69]:
top10_recommendations_ori=get_top5_recommendations_original(test_set,10)
top10_recommendations_ori[553]

[(32, 5.0),
 (16, 5.0),
 (40148, 5.0),
 (2231, 5.0),
 (5059, 5.0),
 (50851, 5.0),
 (55247, 5.0),
 (5989, 5.0),
 (778, 5.0),
 (1729, 5.0)]

## p@10

In [73]:
# j[0] = id pelicula
# j[i] = rating
total= 0
posicion_est= 1
posicion_ori= 1
num = 0
for i in top10_recommendations_est[553]:
    for j in top10_recommendations_ori[553]:
        #print(i[0])
        if i[0]==j[0]:
            total= total+1
        if posicion_est == posicion_ori:
            num= num + (total/posicion_est)
        posicion_ori = posicion_ori+1
    posicion_est=posicion_est+1    
    #print(total)

In [74]:
if num >0:
    result = 0#(1/total)*num
else:
    result = 0
result

0

## Exportar modelo

In [75]:
import pickle

In [77]:
filename = 'filtrado_colaborativo.sav'
pickle.dump(algo1, open(filename, 'wb'))