In [1]:
# Mount Google Drive at /content/drive (Colab only); the dataset loaded later
# in this notebook is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install surprise



In [3]:
# Imports
import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# A dictionary output that does not raise a key error
from collections import defaultdict

# To compute the accuracy of models
from surprise import accuracy

# This class is used to parse a file containing ratings, data should be in structure - user; item; rating
from surprise.reader import Reader

# Class for loading datasets
from surprise.dataset import Dataset

# For tuning model hyperparameters
from surprise.model_selection import GridSearchCV

# For splitting the rating data in train and test datasets
from surprise.model_selection import train_test_split

# For implementing similarity-based recommendation system
from surprise.prediction_algorithms.knns import KNNBasic

# For implementing matrix factorization based recommendation system
from surprise.prediction_algorithms.matrix_factorization import SVD

# For implementing K-Fold cross-validation
from surprise.model_selection import KFold

# For implementing clustering-based recommendation system
from surprise import CoClustering

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [4]:
# Load the dataset
DATA_PATH = "/content/drive/MyDrive/EAFIT/Semestre 3/Mineria de grandes Volumenes de Datos/Proyecto Integrador/data1pct.parquet"

datos = pd.read_parquet(DATA_PATH, engine="pyarrow")

# Preview the first rows
datos.head()

Unnamed: 0,customer_id,product_id,star_rating
0,24912421,B000A3WS84,5.0
1,22668859,B0032QA93M,5.0
2,29511033,B005HB8JN2,1.0
3,35207047,B00BQ5RYI4,5.0
4,20204904,B00AWJEOFG,5.0


# **Modelo 1:** Basado en ranking

In [5]:
# Per-product statistics over star_rating: mean rating and number of ratings.
ratings_by_product = datos.groupby('product_id')['star_rating']
average_ratings = ratings_by_product.mean()
count_rating = ratings_by_product.count()

# One row per product_id with the two statistics side by side.
final_rating = pd.concat(
    [average_ratings.rename('avg'), count_rating.rename('count')],
    axis=1,
)

final_rating.head()

Unnamed: 0_level_0,avg,count
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0000032034,5.0,1
0000032050,5.0,1
0001711474,5.0,1
000200092X,1.0,1
0002000946,5.0,1


In [6]:
def top_n_products(groupby_data, n, min_interactions=100):
    """Return the ids of the ``n`` best-rated products that have enough ratings.

    Parameters
    ----------
    groupby_data : pandas.DataFrame
        Per-product statistics indexed by product id, with an ``avg`` column
        and a ``count`` column.
    n : int
        Maximum number of product ids to return.
    min_interactions : int, default 100
        Products whose ``count`` is below this value are excluded.

    Returns
    -------
    pandas.Index
        Product ids ordered by descending ``avg``, at most ``n`` of them.
    """
    # Keep only products with at least `min_interactions` ratings.
    has_enough_ratings = groupby_data['count'] >= min_interactions

    # Rank the surviving products from highest to lowest average rating.
    ranked = groupby_data.loc[has_enough_ratings].sort_values(by='avg', ascending=False)

    return ranked.index[:n]

Este es un modelo simple y no personalizado: da una idea de los artículos o productos que mayor rotación tienen (aquellos que superan un número mínimo de interacciones de los usuarios) y, entre ellos, los ordena según la calificación promedio que los usuarios les otorgan.

**Funciones para calcular desempeño de modelos**

In [8]:
# Helper 1
def precision_recall_at_k(model, k = 10, threshold = 3.5):
    """Print RMSE, precision@k, recall@k and F1 score of ``model`` on the test set.

    Predictions are obtained with ``model.test(testset)``, where ``testset`` is
    the notebook-level variable of that name; it must exist before the call.
    Precision and recall are computed per user and then averaged over users.

    Parameters
    ----------
    model : object
        Fitted algorithm exposing ``test(testset)`` that returns an iterable of
        ``(uid, iid, true_rating, estimated_rating, details)`` tuples.
    k : int, default 10
        Number of top-estimated items per user considered as "recommended".
    threshold : float, default 3.5
        A rating (true or estimated) at or above this value counts as
        relevant / recommended.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Raises
    ------
    ValueError
        If ``model.test(testset)`` yields no predictions, since every metric
        would be undefined.
    """
    # Making predictions on the test data
    predictions = model.test(testset)
    if not predictions:
        raise ValueError('No predictions available: the test set is empty.')

    # Map each user to the list of (estimated, true) ratings of their test items
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, user_ratings in user_est_true.items():

        # Highest estimated rating first, so the first k entries are the top-k
        user_ratings.sort(key = lambda x: x[0], reverse = True)
        top_k = user_ratings[:k]

        # Number of relevant items (over all of the user's test items)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in top_k)

        # Precision@K is undefined when nothing is recommended; 0 is used instead
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K is undefined when nothing is relevant; 0 is used instead
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    # Average the per-user values
    precision = round(sum(precisions.values()) / len(precisions), 3)
    recall = round(sum(recalls.values()) / len(recalls), 3)

    # Prints the RMSE of the predictions
    accuracy.rmse(predictions)

    print('Precision: ', precision)
    print('Recall: ', recall)

    # F1 is the harmonic mean of precision and recall. It is undefined when
    # both are 0; report 0.0 instead of raising ZeroDivisionError.
    if precision + recall == 0:
        f1_score = 0.0
    else:
        f1_score = round((2 * precision * recall) / (precision + recall), 3)
    print('F1_score: ', f1_score)

**Funciones para generar recomendaciones basadas en los modelos**

In [9]:
# Helper 2
def get_recommendations(data, user_id, top_n, model):
    """Return the ``top_n`` not-yet-rated products with the highest predicted rating.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format ratings with ``customer_id``, ``product_id`` and
        ``star_rating`` columns.
    user_id : hashable
        Customer to build recommendations for.
    top_n : int
        Number of recommendations to return.
    model : object
        Fitted algorithm exposing ``predict(user_id, item_id)`` whose result has
        an ``est`` attribute (the estimated rating).

    Returns
    -------
    list of (product_id, estimated_rating)
        Sorted by descending estimated rating; at most ``top_n`` entries.

    Raises
    ------
    KeyError
        If ``user_id`` has no rated product in ``data``.
    """
    # Work directly on the long-format ratings instead of materialising a dense
    # customer x product pivot table: only the set of products rated by this
    # user and the list of all rated products are needed.
    rated = data.dropna(subset=['customer_id', 'product_id', 'star_rating'])

    # Products this customer has already rated
    seen_products = set(rated.loc[rated['customer_id'] == user_id, 'product_id'])
    if not seen_products:
        raise KeyError(user_id)

    # Every other rated product is a candidate; sorted so that ties in the
    # estimated rating keep a deterministic (ascending product_id) order.
    candidate_products = sorted(
        item_id
        for item_id in rated['product_id'].unique().tolist()
        if item_id not in seen_products
    )

    # Predict the rating of each candidate product for this user
    recommendations = [
        (item_id, model.predict(user_id, item_id).est)
        for item_id in candidate_products
    ]

    # Highest predicted rating first (list.sort is stable)
    recommendations.sort(key = lambda x: x[1], reverse = True)

    return recommendations[:top_n]

In [None]:
# Build the list of users with the top N interaction counts
def top_users_by_interactions(data, min_interactions=50, top_n=20):
    """Return the ``top_n`` customers with the most interactions.

    Parameters
    ----------
    data : pandas.DataFrame
        Interactions with ``customer_id`` and ``product_id`` columns.
    min_interactions : int, default 50
        Customers with fewer interactions than this are excluded.
    top_n : int, default 20
        Maximum number of customers to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``interactions``, ordered by descending
        number of interactions.
    """
    # Count interactions per customer. Uses the `data` argument, not the
    # notebook-level `datos` frame, so the function works on any frame passed in.
    user_interactions = data.groupby('customer_id')['product_id'].count()

    # Keep customers with at least `min_interactions` interactions
    filtered_users = user_interactions[user_interactions >= min_interactions]

    # Most active customers first; a stable sort keeps the order of customers
    # with equal counts deterministic across runs.
    top_users = filtered_users.sort_values(ascending=False, kind='stable').head(top_n)

    # Package the result as a DataFrame
    result_df = pd.DataFrame({'customer_id': top_users.index, 'interactions': top_users.values})

    return result_df

# Top users of the full dataset, using the function defaults
# (min_interactions=50, top_n=20).
top_20_users = top_users_by_interactions(datos)
print(top_20_users)

# **`Factores latentes - SVD`**

In [28]:
# Import the SVD class from the Surprise library
from surprise import SVD
from surprise.model_selection import KFold
from collections import defaultdict

In [31]:
# Columns handed to Surprise, in the order user, item, rating
SVD_RATING_COLUMNS = ['customer_id', 'product_id', 'star_rating']

# Ratings are on a 1 to 5 scale
reader_fl = Reader(rating_scale=(1, 5))
sdata_fl = Dataset.load_from_df(datos[SVD_RATING_COLUMNS], reader_fl)

# Hold out 20% of the ratings for evaluation; seeded so the split is repeatable
trainset, testset = train_test_split(
    sdata_fl,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Baseline SVD matrix factorization: no hyperparameters are overridden,
# only random_state is fixed so the fit is repeatable.
svd = SVD(random_state = 42)

# Training the algorithm on the trainset
svd.fit(trainset)

# Print RMSE, precision@k, recall@k and F1 on the held-out testset
# (helper defaults: k = 10, threshold = 3.5)
precision_recall_at_k(svd)

RMSE: 1.2650
Precision:  0.778
Recall:  0.781
F1_score:  0.779


In [33]:
# Estimate the rating for a single (customer_id, product_id) pair;
# verbose = True also prints the prediction.
svd.predict(43430756, "B00DQDK3SW", verbose = True)

user: 43430756   item: B00DQDK3SW r_ui = None   est = 3.36   {'was_impossible': False}


Prediction(uid=43430756, iid='B00DQDK3SW', r_ui=None, est=3.358009299278069, details={'was_impossible': False})

In [34]:
# Set the parameter space to tune
param_grid = {'n_epochs': [10, 20, 30], 'lr_all': [0.001, 0.005, 0.01],
              'reg_all': [0.2, 0.4, 0.6]}

# Performing 3-fold gridsearch cross validation
gs = GridSearchCV(SVD, param_grid, measures = ['rmse'], cv = 3, n_jobs = -1)

# Fitting the search on the Surprise dataset built for this section
# (`sdata_fl`); the previously used name `sdata` is not defined anywhere in
# this notebook and fails on a fresh kernel.
# NOTE(review): `sdata_fl` is the full dataset, so the search also sees the
# ratings that were split off into `testset`.
gs.fit(sdata_fl)

# Print the best RMSE score
print(gs.best_score['rmse'])

# Print the combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.2808864744612705
{'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.4}


In [35]:
# SVD re-trained with the hyperparameters printed by the grid search
# (n_epochs=20, lr_all=0.01, reg_all=0.4); random_state fixed for repeatability.
# NOTE: this rebinds `svd`, replacing the baseline model fitted earlier.
svd = SVD(n_epochs=20, lr_all=0.01, reg_all=0.4, random_state = 42)

# Training the algorithm on the trainset
svd.fit(trainset)

# Print RMSE, precision@k, recall@k and F1 on the held-out testset
# (helper defaults: k = 10, threshold = 3.5)
precision_recall_at_k(svd)

RMSE: 1.2633
Precision:  0.777
Recall:  0.78
F1_score:  0.778
