<a href="https://colab.research.google.com/github/JCaballerot/Recommender-Systems/blob/main/Autoencoder_CF_Yelp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [1]:
%%capture
# Descargar el dataset de Last.fm desde Kaggle
!pip install kaggle

In [2]:

from google.colab import files
files.upload()  # Sube tu archivo kaggle.json aquí

# Configurar Kaggle API
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


# Descargar otro dataset de Yelp
!kaggle datasets download -d yelp-dataset/yelp-dataset

# Descomprimir el archivo descargado
!unzip yelp-dataset.zip


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset
License(s): other
Downloading yelp-dataset.zip to /content
100% 4.06G/4.07G [00:19<00:00, 218MB/s]
100% 4.07G/4.07G [00:19<00:00, 227MB/s]
Archive:  yelp-dataset.zip
  inflating: Dataset_User_Agreement.pdf  
  inflating: yelp_academic_dataset_business.json  
  inflating: yelp_academic_dataset_checkin.json  
  inflating: yelp_academic_dataset_review.json  
  inflating: yelp_academic_dataset_tip.json  
  inflating: yelp_academic_dataset_user.json  


In [None]:
import json
import pandas as pd

# The reviews file is read as JSON Lines: one JSON object per line.
file_path = "yelp_academic_dataset_review.json"

# Only these fields are used by the rest of the notebook. Dropping the other
# fields while reading keeps the in-memory list of records far smaller than
# holding every complete review object.
rating_fields = ('user_id', 'business_id', 'stars')

ratings_data = []

# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        record = json.loads(line)  # Decode each line as JSON
        ratings_data.append({field: record[field] for field in rating_fields})

# Turn the list of dictionaries into a DataFrame
ratings_df = pd.DataFrame(ratings_data)

# Show the first rows of the DataFrame
ratings_df[['user_id', 'business_id', 'stars']].head()


In [None]:
# The business file is read as JSON Lines: one JSON object per line.
file_path = "yelp_academic_dataset_business.json"

# The name `ratings_data` is reused on purpose: rebinding it drops the reference
# to the list built by the reviews cell. Here it holds business records.
ratings_data = []

# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        ratings_data.append(json.loads(line))  # Decode each line as JSON

# Turn the list of dictionaries into a DataFrame
yelp_academic_dataset_business = pd.DataFrame(ratings_data)

# Show the first rows of the DataFrame
yelp_academic_dataset_business[['business_id', 'name']].head()


In [None]:
# Keep only the three columns needed to build the rating matrix.
rating_columns = ['user_id', 'business_id', 'stars']
ratings_df = ratings_df.loc[:, rating_columns]
ratings_df.head()

In [35]:
# Number of review rows per business, ordered from the most to the least reviewed.
reviews_per_business = ratings_df.groupby('business_id').size()
business_id_counts = reviews_per_business.sort_values(ascending=False)


In [None]:
# Long-tail view: one bar per business, drawn in the order of business_id_counts.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(range(len(business_id_counts)), business_id_counts, color='lightblue')
ax.set_title('Distribución del Número de reviews por negocio (Long Tail)')
ax.set_xlabel('Negocios ordenados por reviews')
ax.set_ylabel('Número de reviews')
# The y-axis is limited to [1, 1000]; taller bars are clipped at the top.
ax.set_ylim(1, 1000)
plt.show()


In [36]:
# Keep only businesses that have at least `min_reviews_per_business` reviews.
min_reviews_per_business = 500
has_enough_reviews = business_id_counts >= min_reviews_per_business
popular_business = business_id_counts.loc[has_enough_reviews].index

# Restrict the ratings to reviews of those businesses.
is_popular_business = ratings_df['business_id'].isin(popular_business)
ratings_df_filtered = ratings_df.loc[is_popular_business]


In [None]:
# Count the remaining reviews per user (no filtering happens in this cell),
# ordered from the most to the least active user.
reviews_per_user = ratings_df_filtered.groupby('user_id').size()
users_review_counts = reviews_per_user.sort_values(ascending=False)
users_review_counts

In [38]:
# Keep only users that have at least `min_reviews_per_user` (50) reviews
# among the businesses retained so far.
min_reviews_per_user = 50
is_active_user = users_review_counts >= min_reviews_per_user
popular_users = users_review_counts.loc[is_active_user].index

# Restrict the already business-filtered ratings to those users.
from_popular_user = ratings_df_filtered['user_id'].isin(popular_users)
ratings_df_filtered = ratings_df_filtered.loc[from_popular_user]


In [None]:
# Collapse repeated (business, user) pairs into a single row holding the mean star rating.
pair_keys = ['business_id', 'user_id']
ratings_df_filtered = ratings_df_filtered.groupby(pair_keys, as_index=False).agg({'stars': 'mean'})

# Pivot to one row per business and one column per user.
stars_by_user = ratings_df_filtered.pivot(index='business_id', columns='user_id', values='stars')

# Move business_id back into a regular column and fill missing (business, user) pairs with 0.
interaction_matrix = stars_by_user.reset_index().fillna(0)
interaction_matrix.head()

In [None]:

# Select the rating columns: every column of the pivoted matrix except the
# 'business_id' identifier column.
# The earlier `set(columns) - set('business_id')` subtracted the individual
# characters of the string, so 'business_id' was never removed. The ID column
# was then coerced to NaN, filled with 0 and overwritten in interaction_matrix,
# adding a spurious all-zero feature. A list comprehension also keeps the
# column order deterministic, which a set does not.
user_columns = [col for col in interaction_matrix.columns if col != 'business_id']
business = user_columns  # original variable name, kept for any cell that still reads it

# Defensive numeric coercion: anything non-numeric becomes 0.
interaction_matrix[business] = interaction_matrix[business].apply(pd.to_numeric, errors='coerce').fillna(0)

# Convert the rating columns into a NumPy array (rows: businesses, columns: users).
interaction_array = interaction_matrix[business].values

# Dimensions
num_items, num_users = interaction_array.shape
input_dim = num_users  # Each row (one business) is a vector with one entry per user


# Autoencoder: input_dim -> 64 -> 32 -> 64 -> input_dim, sigmoid activation on every layer.
input_layer = Input(shape=(input_dim,))

# Encoder: two Dense layers narrowing to the 32-unit bottleneck.
encoded = input_layer
for encoder_units in (64, 32):
    encoded = Dense(encoder_units, activation='sigmoid')(encoded)

# Decoder: expand back to the input width; the sigmoid output keeps the
# reconstruction between 0 and 1.
decoded = Dense(64, activation='sigmoid')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(decoded)

# Autoencoder model mapping the input to its reconstruction
autoencoder = Model(inputs=input_layer, outputs=decoded)

# Compile with the Adam optimizer and mean squared error as the loss
autoencoder.compile(optimizer='adam', loss='mean_squared_error')


# Scale the ratings by the largest value in the matrix before training.
max_rating = np.max(interaction_array)
interaction_array_norm = interaction_array / max_rating



# Early stopping: watch the validation loss, stop after 10 epochs without
# improvement and restore the best weights seen during training.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Training settings; validation_split=0.2 holds out 20% of the rows for validation.
fit_options = dict(
    epochs=500,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stopping],
)

# Train the autoencoder to reconstruct its own input (input and target are the same array).
history = autoencoder.fit(interaction_array_norm, interaction_array_norm, **fit_options)


In [None]:
import matplotlib.pyplot as plt

# Plot the loss curves recorded by the training run.
fig, ax = plt.subplots(figsize=(10, 6))

# Training loss per epoch
ax.plot(history.history['loss'], label='Loss', marker='o')

# Validation loss per epoch, when the run recorded one
if 'val_loss' in history.history:
    ax.plot(history.history['val_loss'], label='Validation Loss', marker='o')

# Title, axis labels, grid and legend
ax.set_title('Model Training and Validation Loss', fontsize=16)
ax.set_xlabel('Epochs', fontsize=14)
ax.set_ylabel('Loss', fontsize=14)
ax.grid(True, linestyle='--', alpha=0.6)
ax.legend(fontsize=12)
fig.tight_layout()

# Render the figure
plt.show()