In [1]:
import random

from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader

from surprise.model_selection import GridSearchCV

In [2]:

import os
import pandas as pd
import numpy as np
from surprise.model_selection import train_test_split

In [4]:
df_ratings = pd.read_csv('ml-latest-small/ratings.csv')


In [5]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [6]:
# Cargamos el dataset nuestro

reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader=reader)

In [7]:

#Asignamos los datos a una lista raw_ratings, sin indices
raw_ratings = data.raw_ratings


In [8]:
random.shuffle(raw_ratings)

In [9]:
# Separamos en train y test
threshold = int(.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

data.raw_ratings = train_raw_ratings 
# Reemplaza en data con los valores de entrenamiento

In [10]:
# Utilizamos gridsearch para obetener los mejores parametros para el algoritmo
print('Grid Search...')
param_grid = {'n_factors': [50,100,150],'n_epochs':[25,50,75],'lr_all': [0.005,0.01],'reg_all':[0.1,0.5,1]}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_jobs = -1)
grid_search.fit(data)

Grid Search...


In [11]:
algo = grid_search.best_estimator['rmse']

print(grid_search.best_score['rmse'])
print(grid_search.best_params['rmse'])

0.8724699875762031
{'n_factors': 150, 'n_epochs': 75, 'lr_all': 0.01, 'reg_all': 0.1}


In [12]:
print(algo)

<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7faa52ff9340>


In [13]:
# Se reentrena el set de entrenamiento con el mejor conjunto de parametros obtenido

trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7faa52ff9340>

In [17]:
# Con la primer funcion creamos un set de test a partir del set de entrenamiento
predictions = algo.test(trainset.build_testset())
print('Accuracy on Trainset,', end='   ')
accuracy.rmse(predictions)

Accuracy on Trainset,   RMSE: 0.4985


0.4984600557347057

In [18]:
# Prueba el modelo con el set de test
testset = data.construct_testset(test_raw_ratings)  
predictions2 = algo.test(testset)
print('Accuracy on Testset,', end=' ')
accuracy.rmse(predictions2)

Accuracy on Testset, RMSE: 0.8368


0.8367634899030604

In [114]:
p= predictions+predictions2

In [22]:
print(trainset.n_users)
print(trainset.n_items)
print(algo.qi.shape)
print(algo.pu.shape)

610
8989
(8989, 150)
(610, 150)


In [70]:
#predictions (Tiene las predicciones para un usuario y un item, el rating reak y el estimado)

In [76]:
from collections import defaultdict
import os
import io

In [77]:
def get_top_n(predictions, n = 10):
    
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
        
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    
    return top_n

In [124]:
top_n = 10
top_pred = get_top_n(p, n = top_n)


In [80]:
df_movies = pd.read_csv('ml-latest-small/movies.csv')

#rid_to_name, name_to_rid = read_item_names('ml-latest-small/movies.csv')

In [81]:
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [82]:
def tituloPelicula(iid):
    return df_movies.loc[df_movies.movieId == iid, 'title'].values[0]

In [129]:
# User raw Id
uid_list = [100]

# Imprime las 10 recomendaciones de peliculas para un determinado usuario
for uid, user_ratings in top_pred.items():
    if uid in uid_list:
        print("Recomendaciones para el usuario "+ str(uid) + ': ')
        for (iid, rating) in user_ratings:
            movie = tituloPelicula(iid)
            print('Movie:', iid, '-', movie, ', rating:', str(rating))


Recomendaciones para el usuario 100: 
Movie: 1958 - Terms of Endearment (1983) , rating: 4.631887341457577
Movie: 2423 - Christmas Vacation (National Lampoon's Christmas Vacation) (1989) , rating: 4.4445952413381375
Movie: 6183 - Pillow Talk (1959) , rating: 4.346143640794372
Movie: 4995 - Beautiful Mind, A (2001) , rating: 4.340347116081119
Movie: 838 - Emma (1996) , rating: 4.323807198093856
Movie: 932 - Affair to Remember, An (1957) , rating: 4.31134613548217
Movie: 356 - Forrest Gump (1994) , rating: 4.3060533725600365
Movie: 1246 - Dead Poets Society (1989) , rating: 4.30425924175819
Movie: 5380 - Importance of Being Earnest, The (2002) , rating: 4.297965631302695
Movie: 899 - Singin' in the Rain (1952) , rating: 4.28862157176267
