In [3]:
! pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 KB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=3366446 sha256=4cc9096650d5b380520c42589a064ccaba16b2a431a312529c6b7137e41fc43c
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [4]:
#Conectar drive con google colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise.dump import dump
from surprise.model_selection import cross_validate

In [6]:
#Se cargan solamente las columnas o campos que vamos a trabajar para nuestro modelo
df_ML = pd.read_parquet('/content/drive/MyDrive/Henry/completo.parquet', columns=['id', 'userId', 'calificacion'])

In [7]:
# Revisando si existen valores nulos
df_ML.isna().sum()

id              0
userId          0
calificacion    0
dtype: int64

In [8]:
# Crear un objeto Reader y especificar la escala de las calificaciones
reader = Reader(rating_scale=(1, 5))

In [9]:
# Cargar los datos en un objeto Dataset
data_ML = Dataset.load_from_df(df_ML[['id', 'userId', 'calificacion']], reader)

**Con el 25% de los datos**

In [10]:
# Dividir los datos en conjuntos de entrenamiento y prueba
trainset, testset = train_test_split(data_ML, test_size=0.25)

In [11]:
# Crear un modelo SVD (Singular Value Decomposition)
modelo_SVD = SVD()

In [12]:
# Entrenar el modelo en el conjunto de entrenamiento
modelo_SVD.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb9e35b3880>

In [14]:
# Hacemos una predicción al azar para usuario y película
userId = 10172
movieId = 'ns7455'
calificacion = modelo_SVD.predict(userId, movieId).est
print(calificacion)

3.533281907487661


In [17]:
# guarda el modelo entrenado en un archivo
dump('modelo_SVD_entrenado.plk', algo=modelo_SVD)

**Evaluación**

In [16]:
# Evaluar el modelo en el conjunto de prueba
prediccion_SVD = modelo_SVD.test(testset)

In [18]:
#Predecimos
prediccion_SVD[1]

Prediction(uid='as2061', iid=31554, r_ui=4.0, est=4.2042863621278395, details={'was_impossible': False})

In [19]:
# Imprimir la precisión del modelo
from surprise import accuracy
accuracy.rmse(prediccion_SVD)

RMSE: 1.0016


1.0015599765739638

**Optimización de hiperparámetros**

In [14]:
# train and validate
cross_validate(modelo_SVD, data_ML, measures=['RMSE','MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0020  1.0013  1.0014  1.0016  1.0021  1.0017  0.0003  
MAE (testset)     0.7767  0.7763  0.7766  0.7763  0.7769  0.7766  0.0002  
Fit time          220.08  244.04  235.51  245.88  248.80  238.86  10.38   
Test time         34.79   37.63   39.97   38.07   39.05   37.90   1.76    


{'test_rmse': array([1.0020482 , 1.00134015, 1.00143768, 1.00162754, 1.00213004]),
 'test_mae': array([0.77671331, 0.77626579, 0.77658117, 0.77632648, 0.77690022]),
 'fit_time': (220.076664686203,
  244.0360472202301,
  235.51421213150024,
  245.88076043128967,
  248.80352330207825),
 'test_time': (34.785120487213135,
  37.63435411453247,
  39.97498345375061,
  38.06835436820984,
  39.04879117012024)}