In [2]:
import urllib
import random
from scipy.special import digamma
from math import exp

In [3]:
%pip install scikit-surprise --quiet

Note: you may need to restart the kernel to use updated packages.


**Mejor solución actual**:

- SVD: Mejores parámetros: {'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.06}. Predicciones guardadas en
predictions\svd_gridsearch\predictions_svd.csv. MAE: 1.260


## Reader Dataset 

In [4]:
import pandas as pd

df_train = pd.read_csv('train.csv', sep=',', index_col=False)
df_test = pd.read_csv('test.csv', sep=',', index_col=False)


In [7]:
df_train.head(5)

Unnamed: 0,user,item,rating
0,1,25715,7.0
1,1,25716,10.0
2,5,25851,9.0
3,6,25923,5.0
4,7,25924,6.0


In [5]:
import os
import random
import operator
import requests
import numpy as np
import pandas as pd
from scipy import sparse
import sys
from surprise import Dataset, Reader
from surprise import KNNBasic, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.dataset import DatasetAutoFolds
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

In [8]:
reader = Reader(rating_scale=(0,10)) # rating scale range
data = Dataset.load_from_df(df_train[['user', 'item', 'rating']], reader)
print(type(data))

<class 'surprise.dataset.DatasetAutoFolds'>


In [9]:
trainset, testset = train_test_split(data, test_size=0.25)
print(type(trainset))

<class 'surprise.trainset.Trainset'>


In [30]:
trainset = data.build_full_trainset()

## SVD

In [31]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1fecbe35b20>

In [32]:
predictions = algo.test(testset)

In [33]:
accuracy.mae(predictions)

MAE:  0.7635


0.7634623190271725

> Exportacion de predicciones.csv

In [34]:
testset[0:5]

[(3891, 81714, 10.0),
 (49435, 77613, 8.0),
 (38654, 112871, 6.0),
 (69560, 55505, 10.0),
 (69799, 28638, 9.0)]

In [35]:
solution = []

for _, row in df_test.iterrows():
    user = row['user']
    item = row['item']
    
    pred = algo.predict(user, item).est  # Predicción de rating
    solution.append([row['ID'], pred])

solution_df = pd.DataFrame(solution, columns=["ID", "rating"])

solution_df.to_csv('predictions_svd.csv', index=False)


> Pruebas SVD ajustando hiperparametros

In [None]:
import os
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

output_dir = "predictions/svd_gridsearch"
os.makedirs(output_dir, exist_ok=True)

# Configurar el rango de ratings
reader = Reader(rating_scale=(df_train["rating"].min(), df_train["rating"].max()))
data = Dataset.load_from_df(df_train[['user', 'item', 'rating']], reader)

param_grid = {
    "n_factors": [50, 100, 150],     # Aumentar complejidad
    "n_epochs": [20, 30, 50],        # Más iteraciones para mejor convergencia
    "lr_all": [0.002, 0.005, 0.01],  # Ajuste de tasa de aprendizaje
    "reg_all": [0.02, 0.04, 0.06],   # Regularización para evitar overfitting
}

grid_search = GridSearchCV(SVD, param_grid, measures=["mae"], cv=3, n_jobs=-1, joblib_verbose=1)
grid_search.fit(data)

best_params = grid_search.best_params["mae"]
print(f"Mejores parámetros: {best_params}")

### ENTRENO  EL MEJOR MODELO CON TODOS LOS DATOS
best_svd = SVD(**best_params, random_state=42)
trainset = data.build_full_trainset()
best_svd.fit(trainset)

solution = []
for _, row in df_test.iterrows():
    user = row["user"]
    item = row["item"]
    pred = best_svd.predict(user, item).est
    solution.append([row["ID"], pred])

# Guardar predicciones en CSV
output_file = os.path.join(output_dir, "predictions_svd.csv")
solution_df = pd.DataFrame(solution, columns=["ID", "rating"])
solution_df.to_csv(output_file, index=False)

print(f"Predicciones guardadas en {output_file}")


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed: 15.0min finished


Mejores parámetros: {'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.06}
Predicciones guardadas en svd_gridsearch\predictions_svd.csv


---

## NMF

In [38]:
trainset = data.build_full_trainset()

In [29]:
from surprise import Dataset, Reader, NMF

algo = NMF(n_factors=15, n_epochs=50, biased=False, reg_pu=0.06, reg_qi=0.06, random_state=42, verbose=True)
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.mae(predictions)

solution = []

for _, row in df_test.iterrows():
    user = row['user']
    item = row['item']
    
    pred = algo.predict(user, item).est  # Predicción de rating
    solution.append([row['ID'], pred])

solution_df = pd.DataFrame(solution, columns=["ID", "rating"])

solution_df.to_csv('predictions_nmf.csv', index=False)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
MAE:  1.77

In [42]:
from surprise import Dataset, Reader, NMF

algo = NMF(n_factors=40, n_epochs=75, biased=False, reg_pu=0.06, reg_qi=0.06, random_state=42, verbose=True)
algo.fit(trainset)
predictions = algo.test(testset)

# accuracy.mae(predictions)

solution = []

for _, row in df_test.iterrows():
    user = row['user']
    item = row['item']
    
    pred = algo.predict(user, item).est  # Predicción de rating
    solution.append([row['ID'], pred])

solution_df = pd.DataFrame(solution, columns=["ID", "rating"])

solution_df.to_csv('predictions_nmf_k40_e75.csv', index=False)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
Processing

**GridSearch**

In [None]:
import os
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, NMF
from surprise.model_selection import GridSearchCV

# Crear el directorio donde se guardarán los archivos
output_dir = "nmf_gridsearch"
os.makedirs(output_dir, exist_ok=True)

# Cargar datos
df_train = pd.read_csv("train.csv")  # Reemplaza con tu archivo real
df_test = pd.read_csv("test.csv")  # Reemplaza con tu archivo real

# Configurar el rango de ratings
reader = Reader(rating_scale=(df_train["rating"].min(), df_train["rating"].max()))
data = Dataset.load_from_df(df_train[['user', 'item', 'rating']], reader)

# Definir la malla de hiperparámetros para NMF
param_grid = {
    "n_factors": [10, 15, 20],
    "n_epochs": [30, 50, 70],
    "reg_pu": [0.04, 0.06, 0.08],
    "reg_qi": [0.04, 0.06, 0.08],
}

# Ejecutar Grid Search
grid_search = GridSearchCV(NMF, param_grid, measures=["mae"], cv=3, n_jobs=-1)
grid_search.fit(data)

# Obtener los mejores parámetros
best_params = grid_search.best_params["mae"]
print(f"Mejores parámetros: {best_params}")

# Entrenar el mejor modelo con todos los datos
best_nmf = NMF(**best_params, random_state=42)
trainset = data.build_full_trainset()
best_nmf.fit(trainset)

# Generar predicciones para df_test
solution = []
for _, row in df_test.iterrows():
    user = row["user"]
    item = row["item"]
    pred = best_nmf.predict(user, item).est
    solution.append([row["ID"], pred])

# Guardar predicciones en CSV
solution_df = pd.DataFrame(solution, columns=["ID", "rating"])
output_file = os.path.join(output_dir, "predictions_nmf.csv")
solution_df.to_csv(output_file, index=False)

print(f"Predicciones guardadas en {output_file}")