In [8]:
import random

from surprise import NMF
from surprise import Dataset
from surprise import accuracy
from surprise import Reader

from surprise.model_selection import GridSearchCV

In [2]:
# Load the Python libraries
import os
import pandas as pd
import numpy as np
from surprise.model_selection import train_test_split

In [3]:
# Path to the ml-100k `u.data` ratings file, resolved relative to the user's home directory.
file_path = os.path.expanduser('~/PycharmProjects/Tesis1/ml-100k/u.data')

# Each line of the file is tab-separated: user, item, rating, timestamp; ratings lie on a 1-5 scale.
reader = Reader(
    line_format='user item rating timestamp',
    sep='\t',
    rating_scale=(1, 5),
)
data = Dataset.load_from_file(file_path, reader=reader)

In [4]:

# Bind the dataset's raw rating records to `raw_ratings` (a plain list, no index).
raw_ratings = data.raw_ratings



In [5]:
# Seed every RNG before the first stochastic step so that Restart & Run All
# reproduces the same shuffle (and therefore the same train/test split).
# NumPy's global RNG is seeded too because, per the Surprise FAQ, that is what
# makes cross-validation folds and model initialisation repeatable.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# random.shuffle reorders the list in place.
random.shuffle(raw_ratings)

In [6]:
# Split into train and test: the first 90% of the (shuffled) ratings are used
# for training, the remaining 10% are held out for the final evaluation.
threshold = int(.9 * len(raw_ratings))
train_raw_ratings, test_raw_ratings = raw_ratings[:threshold], raw_ratings[threshold:]

# Overwrite the ratings stored in `data` with the training portion only.
data.raw_ratings = train_raw_ratings

In [16]:
# Use grid search to find the best NMF hyper-parameters: every combination of
# n_factors and n_epochs is evaluated on RMSE with cv=3.
print('Grid Search...')
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [25, 50, 75],
}
grid_search = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=3)
grid_search.fit(data)

Grid Search...


In [17]:
# Report the best RMSE found by the grid search and the parameters that
# produced it, and keep the corresponding estimator as `algo`.
best_rmse = grid_search.best_score['rmse']
best_params = grid_search.best_params['rmse']
algo = grid_search.best_estimator['rmse']
print(best_rmse)
print(best_params)

0.9774351799518451
{'n_factors': 100, 'n_epochs': 75}


In [18]:
# Print the string representation of the selected estimator object.
print(algo)

<surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7fb0f8053fd0>


In [19]:

# Retrain on the whole training portion using the best parameter set found.
# `data` holds only the training ratings at this point, so the "full" trainset
# excludes the held-out test ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7fb0f8053fd0>

In [23]:

# Build a testset out of the training set itself and score the model on it.
# This measures the fit on data the model has already seen.
train_testset = trainset.build_testset()
predictions = algo.test(train_testset)
print('Accuracy on Trainset,', end='   ')
accuracy.rmse(predictions)

Accuracy on Trainset,   RMSE: 0.7220


0.7220002371925545

In [24]:
# Evaluate the model on the held-out test ratings, which were removed from
# `data` before the grid search and the final fit.
testset = data.construct_testset(test_raw_ratings)  # testset is built from the held-out raw ratings
predictions = algo.test(testset)
print('Accuracy on Testset,', end=' ')
accuracy.rmse(predictions)

Accuracy on Testset, RMSE: 0.9607


0.9607096199028853

In [25]:
# Print the user and item counts of the trainset followed by the shapes of the
# learned item-factor (qi) and user-factor (pu) matrices, one value per line.
print(
    trainset.n_users,
    trainset.n_items,
    algo.qi.shape,
    algo.pu.shape,
    sep='\n',
)

943
1668
(1668, 100)
(943, 100)
