# Modelo TwoTower y ThreeTower

El modelo Two-Tower (o modelo de dos torres) es una arquitectura de red neuronal que aprende representaciones (embeddings) separadas para diferentes tipos de entrada —en este caso, usuarios y textos de reseñas—, y luego las combina para predecir una puntuación.

🏗️ **Arquitectura**

User Tower:

- Recibe el user_id como entrada.

- Aplica una capa de embedding para convertir el ID del usuario en un vector denso.

- Pasa ese vector por una capa totalmente conectada (Linear) con activación ReLU.

- Su objetivo es aprender una representación del usuario basada en su historial.

Review Tower:

- Recibe la representación vectorial de la reseña (TF-IDF).

- Procesa esta entrada mediante una capa lineal con ReLU.

- Aprende una representación semántica del texto.

Head:

- Concatena los vectores generados por ambas torres.

- Pasa el vector combinado por una red neuronal profunda:

- Capa Linear + Dropout + ReLU.

- Capa final Linear que produce un único valor escalar (la predicción del rating).

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack

In [12]:
# Mount Google Drive so the CSV files stored there are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import os

# Folder inside the mounted Drive that holds the four CSV files read below.
base_path = "/content/drive/MyDrive/"

# "elite" is read as text; the preview shows values such as "2007" and "2009,2010,...",
# presumably to keep single-year rows from being parsed as numbers.
df_users = pd.read_csv(os.path.join(base_path, "usuarios.csv"), sep=",", dtype={"elite": str})
df_businesses = pd.read_csv(os.path.join(base_path, "negocios.csv"), sep=",")
# Reviews used for training/validation and the reviews to predict for the submission.
df_train = pd.read_csv(os.path.join(base_path, "train_reviews.csv"), sep=",")
df_test = pd.read_csv(os.path.join(base_path, "test_reviews.csv"), sep=",")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Show column names, non-null counts and dtypes of the users table.
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699619 entries, 0 to 699618
Data columns (total 22 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             699619 non-null  object 
 1   name                699607 non-null  object 
 2   review_count        699619 non-null  int64  
 3   yelping_since       699619 non-null  object 
 4   useful              699619 non-null  int64  
 5   funny               699619 non-null  int64  
 6   cool                699619 non-null  int64  
 7   elite               55411 non-null   object 
 8   friends             427407 non-null  object 
 9   fans                699619 non-null  int64  
 10  average_stars       699619 non-null  float64
 11  compliment_hot      699619 non-null  int64  
 12  compliment_more     699619 non-null  int64  
 13  compliment_profile  699619 non-null  int64  
 14  compliment_cute     699619 non-null  int64  
 15  compliment_list     699619 non-nul

In [None]:
# Show column names, non-null counts and dtypes of the training reviews.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 967784 entries, 0 to 967783
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   review_id    967784 non-null  object 
 1   user_id      967784 non-null  object 
 2   business_id  967784 non-null  object 
 3   stars        967784 non-null  float64
 4   useful       967784 non-null  int64  
 5   funny        967784 non-null  int64  
 6   cool         967784 non-null  int64  
 7   text         967784 non-null  object 
 8   date         967784 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 66.5+ MB


In [None]:
# Preview the first two training reviews.
df_train.head(2)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,ZZO43qKB-s65zplC8RfJqw,-1BSu2dt_rOAqllw9ZDXtA,smkZq4G1AOm4V6p3id5sww,5.0,0,0,0,Fantastic fresh food. The greek salad is amazi...,2016-09-30 15:49:32
1,vojXOF_VOgvuKD95gCO8_Q,xpe178ng_gj5X6HgqtOing,96_c_7twb7hYRZ9HHrq01g,1.0,2,0,1,Been a patient at Largo Med/Diagnostic Clinic ...,2020-12-09 14:39:51


In [None]:
# Preview the first two user rows.
df_users.head(2)

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946


In [None]:
# review_count, yelping_since, fans, average_stars, compliment_hot, compliment_more, compliment_profile, compliment_cute, compliment_list, compliment_note, compliment_plain, compliment_cool, compliment_funny, compliment_writer, compliment_photos

In [None]:
# useful, funny, cool

---

## Primer Experimento TwoTower
Se tiene en cuenta sólo el ID de usuario y el texto de la review.

- Prueba con `epoch = 1` y `batch_size` de 256. Aumentar el número de epochs no mejora los resultados.

- Se hacen pruebas con TF-IDF de 500, 700, 5000, 20000.

In [4]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [None]:
class YelpReviewDataset(Dataset):
    """Labelled review dataset yielding ``(user_id, review_vector, rating)``.

    Each item is the integer-encoded author id, the dense float32 TF-IDF
    vector of the review text and the star rating as a float32 scalar tensor.

    Args:
        df: Reviews with at least ``user_id``, ``text`` and ``stars`` columns.
            The frame is copied (index reset), so the caller's object is not
            modified.
        df_users: User table; stored indexed by ``user_id`` but not read by
            this class itself.
        text_vectorizer: Already fitted vectorizer exposing ``transform``.
        user_encoder: Optional fitted encoder exposing ``classes_``. When
            given, user ids are mapped through it so that indices agree with
            the dataset the encoder was fitted on; ids it has never seen get
            the extra index ``len(user_encoder.classes_)``, so an embedding
            table fed from this dataset needs ``len(classes_) + 1`` rows.
            When omitted, a fresh ``LabelEncoder`` is fitted on ``df`` alone
            (the original behaviour), which makes the indices local to this
            dataset and NOT comparable with any other dataset's indices.
    """

    def __init__(self, df, df_users, text_vectorizer, user_encoder=None):
        self.df = df.reset_index(drop=True)
        self.df_users = df_users.set_index("user_id")
        self.vectorizer = text_vectorizer

        if user_encoder is None:
            # Backward-compatible default: ids are only meaningful inside this dataset.
            self.user_encoder = LabelEncoder()
            self.df["user_id_encoded"] = self.user_encoder.fit_transform(self.df["user_id"])
        else:
            # Reuse an existing vocabulary so a given index always means the same user.
            self.user_encoder = user_encoder
            index_of = {label: pos for pos, label in enumerate(user_encoder.classes_)}
            self.df["user_id_encoded"] = (
                self.df["user_id"].map(index_of).fillna(len(index_of)).astype("int64")
            )

        # The TF-IDF matrix stays sparse; rows are densified one at a time in
        # __getitem__ instead of materialising the whole dense matrix up front.
        self.review_matrix = self.vectorizer.transform(self.df["text"])

    def __len__(self):
        """Return the number of reviews."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(user_id, review_vec_dense, rating)`` for row ``idx``."""
        row = self.df.iloc[idx]
        user_id = row["user_id_encoded"]
        review_vec_sparse = self.review_matrix[idx]
        # toarray() yields shape (1, n_features); squeeze drops the leading axis.
        review_vec_dense = torch.tensor(review_vec_sparse.toarray(), dtype=torch.float32).squeeze(0)
        rating = torch.tensor(row["stars"], dtype=torch.float32)

        return user_id, review_vec_dense, rating




class YelpReviewTestDataset(Dataset):
    """Unlabelled review dataset yielding ``(user_id, review_vector)``.

    Same as the labelled dataset but without the rating, for inference.

    Args:
        df: Reviews with at least ``user_id`` and ``text`` columns. The frame
            is copied (index reset), so the caller's object is not modified.
        df_users: User table; stored indexed by ``user_id`` but not read by
            this class itself.
        text_vectorizer: Already fitted vectorizer exposing ``transform``.
        user_encoder: Optional fitted encoder exposing ``classes_`` (normally
            the one fitted on the training data). When given, user ids are
            mapped through it and unseen ids get the extra index
            ``len(user_encoder.classes_)``, so the model's embedding table
            needs ``len(classes_) + 1`` rows. When omitted, a fresh
            ``LabelEncoder`` is fitted on ``df`` alone (the original
            behaviour); the resulting indices then do NOT correspond to the
            users a model saw during training.
    """

    def __init__(self, df, df_users, text_vectorizer, user_encoder=None):
        self.df = df.reset_index(drop=True)
        self.df_users = df_users.set_index("user_id")
        self.vectorizer = text_vectorizer

        if user_encoder is None:
            # Backward-compatible default: ids are only meaningful inside this dataset.
            self.user_encoder = LabelEncoder()
            self.df["user_id_encoded"] = self.user_encoder.fit_transform(self.df["user_id"])
        else:
            # Reuse an existing vocabulary so a given index always means the same user.
            self.user_encoder = user_encoder
            index_of = {label: pos for pos, label in enumerate(user_encoder.classes_)}
            self.df["user_id_encoded"] = (
                self.df["user_id"].map(index_of).fillna(len(index_of)).astype("int64")
            )

        # Kept sparse; rows are densified one at a time in __getitem__.
        self.review_matrix = self.vectorizer.transform(self.df["text"])

    def __len__(self):
        """Return the number of reviews."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(user_id, review_vec_dense)`` for row ``idx``."""
        row = self.df.iloc[idx]
        user_id = row["user_id_encoded"]
        review_vec_sparse = self.review_matrix[idx]
        # toarray() yields shape (1, n_features); squeeze drops the leading axis.
        review_vec_dense = torch.tensor(review_vec_sparse.toarray(), dtype=torch.float32).squeeze(0)

        return user_id, review_vec_dense

In [None]:
# Fit the TF-IDF vectorizer (vocabulary capped via max_features=5000) on all training review texts.
# NOTE(review): df_train is split into train/validation only in a later cell, so the
# validation texts also take part in this fit.
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(df_train["text"])

In [None]:
class UserTower(nn.Module):
    """User branch: embedding lookup followed by a Linear + ReLU projection.

    Args:
        num_users: Number of rows in the embedding table.
        embedding_dim: Width of each user embedding.
        hidden_dim: Width of the tower output.
    """

    def __init__(self, num_users, embedding_dim=64, hidden_dim=128):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        projection = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Sequential(projection, nn.ReLU())

    def forward(self, user_id):
        """Map integer user ids to their ``hidden_dim``-wide representation."""
        return self.fc(self.user_embedding(user_id))

class ReviewTower(nn.Module):
    """Review branch: a single Linear + ReLU layer over the text vector.

    Args:
        input_dim: Length of the incoming review vector.
        hidden_dim: Width of the tower output.
    """

    def __init__(self, input_dim=300, hidden_dim=128):
        super().__init__()
        projection = nn.Linear(input_dim, hidden_dim)
        self.fc = nn.Sequential(projection, nn.ReLU())

    def forward(self, review_vec):
        """Project review vectors to ``hidden_dim`` features."""
        hidden = self.fc(review_vec)
        return hidden

class TwoTowerModel(nn.Module):
    """Rating regressor that fuses a user tower and a review tower.

    Both tower outputs (``hidden_dim`` wide each) are concatenated and fed to
    a Linear(…, 256) -> Dropout(0.2) -> ReLU -> Linear(256, 1) head.

    Args:
        num_users: Size of the user embedding table.
        text_input_dim: Length of the review vector fed to the review tower.
        hidden_dim: Output width of each tower.
    """

    def __init__(self, num_users, text_input_dim=300, hidden_dim=128):
        super().__init__()
        self.user_tower = UserTower(num_users, hidden_dim=hidden_dim)
        self.review_tower = ReviewTower(text_input_dim, hidden_dim=hidden_dim)
        head_layers = [
            nn.Linear(hidden_dim * 2, 256),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(256, 1),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, user_id, review_vec):
        """Return one predicted rating per row of the batch."""
        tower_outputs = (self.user_tower(user_id), self.review_tower(review_vec))
        fused = torch.cat(tower_outputs, dim=-1)
        scores = self.head(fused)
        # The head emits a trailing axis of size 1; drop it.
        return scores.squeeze(1)

In [None]:
# Check whether a CUDA device is visible to PyTorch in this runtime.
torch.cuda.is_available()

True

In [None]:
import time

###################################
############## DATA ###############
###################################
# Hold out 10% of the labelled reviews for validation; the seed makes the split repeatable.
train_df, val_df = train_test_split(df_train, test_size=0.1, random_state=42)

# NOTE(review): each YelpReviewDataset fits its own LabelEncoder on the rows it receives,
# so the same encoded index refers to different users in train_dataset and val_dataset.
train_dataset = YelpReviewDataset(train_df, df_users, vectorizer)
val_dataset = YelpReviewDataset(val_df, df_users, vectorizer)

# shuffle is left at the DataLoader default for both loaders.
train_loader = DataLoader(train_dataset, batch_size=256)
val_loader = DataLoader(val_dataset, batch_size=256)

In [None]:
##########################################
############## MODEL #####################
##########################################
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The embedding table is sized by the number of distinct users in the training split;
# text_input_dim matches the max_features=5000 used for the TF-IDF vectorizer.
model = TwoTowerModel(num_users=len(train_dataset.user_encoder.classes_), text_input_dim=5000).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)
# The star rating is treated as a regression target.
criterion = nn.MSELoss()

print("--- 1/2 Entrenamiento ---")
for epoch in range(1):
    # Holds the start timestamp here; overwritten with the elapsed seconds below.
    epoch_time = time.time()

    model.train()
    total_loss = 0
    for i, (user_ids, review_vecs, stars) in enumerate(train_loader):

        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, review_vecs)
        loss = criterion(preds, stars)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_time
    # Reported loss is the mean of the per-batch losses.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_time:.2f}s")

print("")
print("--- Validaci√≥n ---")
# NOTE(review): val_dataset encoded its users with its own LabelEncoder, so the indices
# looked up in the embedding here do not identify the same users as during training.
model.eval()
total_loss = 0
with torch.no_grad():
    for i, (user_ids, review_vecs, stars) in enumerate(val_loader):
        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, review_vecs)
        loss = criterion(preds, stars)
        total_loss += loss.item()

print(f"Validation Loss: {total_loss / len(val_loader):.4f}")

--- 1/2 Entrenamiento ---
Epoch 1, Loss: 0.6022, Time: 258.38s

--- Validaci√≥n ---
Validation Loss: 0.4537


> Continuación del entrenamiento con los datos de validación

In [None]:
# Second training pass: the same model and optimizer are updated for one more epoch
# using the validation batches, so the held-out rows also contribute before predicting.
# NOTE(review): val_loader's user indices come from a LabelEncoder fitted on val_df only,
# so these updates land on embedding rows that belonged to other users in the first pass.
print("--- 2/2 Entrenamiento ---")
for epoch in range(1):
    # Holds the start timestamp here; overwritten with the elapsed seconds below.
    epoch_time = time.time()

    model.train()
    total_loss = 0
    for i, (user_ids, review_vecs, stars) in enumerate(val_loader):

        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, review_vecs)
        loss = criterion(preds, stars)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_time
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(val_loader):.4f}, Time: {epoch_time:.2f}s")

--- 2/2 Entrenamiento ---
Epoch 1, Loss: 0.4819, Time: 31.12s


In [None]:
import gc
import torch

# # Explicitly drop references (disabled)
# del train_df, val_df, train_dataset, val_dataset, train_loader, val_loader

# # Run Python's garbage collector (disabled)
# gc.collect()


# Columns expected in the submission: review_id, stars
# NOTE(review): the test dataset fits its own LabelEncoder on df_test, so its user
# indices are unrelated to the ones the model was trained with; if df_test had more
# distinct users than the training split, the embedding lookup would go out of range.
test_dataset = YelpReviewTestDataset(df_test, df_users, vectorizer)
test_loader = DataLoader(test_dataset, batch_size=256)

# Test set: collect one prediction per review, in df_test row order.
model.eval()
predictions = []
with torch.no_grad():
    for i, (user_ids, review_vecs) in enumerate(test_loader):
        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        preds = model(user_ids, review_vecs)
        # Move the batch back to the CPU before extending the Python list.
        predictions.extend(preds.cpu().numpy())
        # Progress report every 100 batches.
        if i % 100 == 0:
            print(f"Batch {i}/{len(test_loader)}")


Batch 0/1621
Batch 100/1621
Batch 200/1621
Batch 300/1621
Batch 400/1621
Batch 500/1621
Batch 600/1621
Batch 700/1621
Batch 800/1621
Batch 900/1621
Batch 1000/1621
Batch 1100/1621
Batch 1200/1621
Batch 1300/1621
Batch 1400/1621
Batch 1500/1621
Batch 1600/1621


In [None]:
# Pair every test review_id with its predicted rating and write the submission file.
# predictions is a plain list, so it is assigned positionally against df_test's rows.
submission_df = pd.DataFrame({
    'review_id': df_test['review_id'],
    'stars': predictions
})
submission_df.to_csv('prediction_TwoTowers_7.csv', index=False)

> Prueba con TF-IDF de 20000

In [None]:
##############
### TF-IDF ###
##############
# Same experiment as before, with the TF-IDF vocabulary cap raised to 20000.
# NOTE(review): the vectorizer is fitted on all of df_train before the split below,
# so the validation texts also take part in the fit.
vectorizer = TfidfVectorizer(max_features=20000)
vectorizer.fit(df_train["text"])

import time

###################################
############## DATA ###############
###################################
# Hold out 10% of the labelled reviews for validation; the seed makes the split repeatable.
train_df, val_df = train_test_split(df_train, test_size=0.1, random_state=42)

# NOTE(review): each dataset fits its own LabelEncoder, so user indices are not
# comparable between train_dataset, val_dataset and the test dataset built further down.
train_dataset = YelpReviewDataset(train_df, df_users, vectorizer)
val_dataset = YelpReviewDataset(val_df, df_users, vectorizer)

train_loader = DataLoader(train_dataset, batch_size=256)
val_loader = DataLoader(val_dataset, batch_size=256)

#######################################################
############## TRAINING AND VALIDATION ################
#######################################################

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# text_input_dim matches the max_features=20000 used for the vectorizer above.
model = TwoTowerModel(num_users=len(train_dataset.user_encoder.classes_), text_input_dim=20000).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=4e-3)
criterion = nn.MSELoss()

print("--- 1/2 Entrenamiento ---")
for epoch in range(1):
    # Holds the start timestamp here; overwritten with the elapsed seconds below.
    epoch_time = time.time()

    model.train()
    total_loss = 0
    for i, (user_ids, review_vecs, stars) in enumerate(train_loader):

        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, review_vecs)
        loss = criterion(preds, stars)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_time
    # Reported loss is the mean of the per-batch losses.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_time:.2f}s")

print("")
print("--- Validaci√≥n ---")
model.eval()
total_loss = 0
with torch.no_grad():
    for i, (user_ids, review_vecs, stars) in enumerate(val_loader):
        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, review_vecs)
        loss = criterion(preds, stars)
        total_loss += loss.item()

print(f"Validation Loss: {total_loss / len(val_loader):.4f}")


# Second pass: keep training the same model for one epoch on the validation batches.
print("--- 2/2 Entrenamiento ---")
for epoch in range(1):
    epoch_time = time.time()

    model.train()
    total_loss = 0
    for i, (user_ids, review_vecs, stars) in enumerate(val_loader):

        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, review_vecs)
        loss = criterion(preds, stars)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_time
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(val_loader):.4f}, Time: {epoch_time:.2f}s")


##########################################
############## PREDICTIONS ###############
##########################################

test_dataset = YelpReviewTestDataset(df_test, df_users, vectorizer)
test_loader = DataLoader(test_dataset, batch_size=256)

model.eval()
predictions = []
with torch.no_grad():
    for i, (user_ids, review_vecs) in enumerate(test_loader):
        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        preds = model(user_ids, review_vecs)
        # Move the batch back to the CPU before extending the Python list.
        predictions.extend(preds.cpu().numpy())
        # Progress report every 100 batches.
        if i % 100 == 0:
            print(f"Batch {i}/{len(test_loader)}")

# Pair every test review_id with its predicted rating and write the submission file.
submission_df = pd.DataFrame({
    'review_id': df_test['review_id'],
    'stars': predictions
})
submission_df.to_csv('prediction_TwoTowers_8.csv', index=False)

--- 1/2 Entrenamiento ---
Epoch 1, Loss: 0.5877, Time: 340.95s

--- Validaci√≥n ---
Validation Loss: 0.4460
--- 2/2 Entrenamiento ---
Epoch 1, Loss: 0.4711, Time: 33.67s
Batch 0/1621
Batch 100/1621
Batch 200/1621
Batch 300/1621
Batch 400/1621
Batch 500/1621
Batch 600/1621
Batch 700/1621
Batch 800/1621
Batch 900/1621
Batch 1000/1621
Batch 1100/1621
Batch 1200/1621
Batch 1300/1621
Batch 1400/1621
Batch 1500/1621
Batch 1600/1621


**Resultado**: MAE: 0.47

---

## Segundo Experimento. TF-IDF + ThreeTower

In [34]:
class YelpReviewDataset(Dataset):
    """Labelled dataset yielding ``(user_id, business_id, review_vector, rating)``.

    Args:
        df: Reviews with at least ``user_id``, ``business_id``, ``text`` and
            ``stars`` columns. The frame is copied (index reset), so the
            caller's object is not modified.
        df_users: User table; stored indexed by ``user_id`` but not read by
            this class itself.
        df_businesses: Business table; stored indexed by ``business_id`` but
            not read by this class itself.
        text_vectorizer: Already fitted vectorizer exposing ``transform``.
        user_encoder: Optional fitted encoder exposing ``classes_`` for user
            ids. See the note below.
        business_encoder: Optional fitted encoder exposing ``classes_`` for
            business ids. See the note below.

    Note:
        When an encoder is supplied, ids are mapped through it so indices
        agree with the dataset it was fitted on; ids it has never seen get
        the extra index ``len(encoder.classes_)``, so the matching embedding
        table needs ``len(classes_) + 1`` rows. When an encoder is omitted, a
        fresh ``LabelEncoder`` is fitted on ``df`` alone (the original
        behaviour) and the indices are NOT comparable with other datasets.
    """

    def __init__(self, df, df_users, df_businesses, text_vectorizer,
                 user_encoder=None, business_encoder=None):
        self.df = df.reset_index(drop=True)
        self.df_users = df_users.set_index("user_id")
        self.df_businesses = df_businesses.set_index("business_id")
        self.vectorizer = text_vectorizer

        self.user_encoder, self.df["user_id_encoded"] = self._encode_ids(
            self.df["user_id"], user_encoder
        )
        self.business_encoder, self.df["business_id_encoded"] = self._encode_ids(
            self.df["business_id"], business_encoder
        )

        # Kept sparse; rows are densified one at a time in __getitem__.
        self.review_matrix = self.vectorizer.transform(self.df["text"])

    @staticmethod
    def _encode_ids(ids, encoder):
        """Return ``(encoder, codes)``; fit a new LabelEncoder when none is given."""
        if encoder is None:
            encoder = LabelEncoder()
            return encoder, encoder.fit_transform(ids)
        index_of = {label: pos for pos, label in enumerate(encoder.classes_)}
        # Ids missing from the encoder map to NaN and are sent to the extra "unknown" index.
        return encoder, ids.map(index_of).fillna(len(index_of)).astype("int64")

    def __len__(self):
        """Return the number of reviews."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(user_id, business_id, review_vec_dense, rating)`` for row ``idx``."""
        row = self.df.iloc[idx]

        user_id = row["user_id_encoded"]
        business_id = row["business_id_encoded"]

        review_vec_sparse = self.review_matrix[idx]
        # toarray() yields shape (1, n_features); squeeze drops the leading axis.
        review_vec_dense = torch.tensor(review_vec_sparse.toarray(), dtype=torch.float32).squeeze(0)

        rating = torch.tensor(row["stars"], dtype=torch.float32)

        return user_id, business_id, review_vec_dense, rating


class YelpReviewTestDataset(Dataset):
    """Unlabelled dataset yielding ``(user_id, business_id, review_vector)``.

    Args:
        df: Reviews with at least ``user_id``, ``business_id`` and ``text``
            columns. The frame is copied (index reset), so the caller's
            object is not modified.
        df_users: User table; stored indexed by ``user_id`` but not read by
            this class itself.
        df_businesses: Business table; stored indexed by ``business_id`` but
            not read by this class itself.
        text_vectorizer: Already fitted vectorizer exposing ``transform``.
        user_encoder: Optional fitted encoder exposing ``classes_`` for user
            ids (normally the one fitted on the training data).
        business_encoder: Optional fitted encoder exposing ``classes_`` for
            business ids (normally the one fitted on the training data).

    Note:
        When an encoder is supplied, ids are mapped through it and unseen ids
        get the extra index ``len(encoder.classes_)``, so the matching
        embedding table needs ``len(classes_) + 1`` rows. When an encoder is
        omitted, a fresh ``LabelEncoder`` is fitted on ``df`` alone (the
        original behaviour); those indices do NOT correspond to the entities
        a model saw during training.
    """

    def __init__(self, df, df_users, df_businesses, text_vectorizer,
                 user_encoder=None, business_encoder=None):
        self.df = df.reset_index(drop=True)
        self.df_users = df_users.set_index("user_id")
        self.df_businesses = df_businesses.set_index("business_id")
        self.vectorizer = text_vectorizer

        self.user_encoder, self.df["user_id_encoded"] = self._encode_ids(
            self.df["user_id"], user_encoder
        )
        self.business_encoder, self.df["business_id_encoded"] = self._encode_ids(
            self.df["business_id"], business_encoder
        )

        # Kept sparse; rows are densified one at a time in __getitem__.
        self.review_matrix = self.vectorizer.transform(self.df["text"])

    @staticmethod
    def _encode_ids(ids, encoder):
        """Return ``(encoder, codes)``; fit a new LabelEncoder when none is given."""
        if encoder is None:
            encoder = LabelEncoder()
            return encoder, encoder.fit_transform(ids)
        index_of = {label: pos for pos, label in enumerate(encoder.classes_)}
        # Ids missing from the encoder map to NaN and are sent to the extra "unknown" index.
        return encoder, ids.map(index_of).fillna(len(index_of)).astype("int64")

    def __len__(self):
        """Return the number of reviews."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(user_id, business_id, review_vec_dense)`` for row ``idx``."""
        row = self.df.iloc[idx]

        user_id = row["user_id_encoded"]
        business_id = row["business_id_encoded"]

        review_vec_sparse = self.review_matrix[idx]
        # toarray() yields shape (1, n_features); squeeze drops the leading axis.
        review_vec_dense = torch.tensor(review_vec_sparse.toarray(), dtype=torch.float32).squeeze(0)

        # No rating is returned: this dataset is used for inference only.
        return user_id, business_id, review_vec_dense


In [35]:
# Fit the TF-IDF vectorizer (vocabulary capped via max_features=5000) on all training review texts.
# NOTE(review): the train/validation split happens in a later cell, so the validation
# texts also take part in this fit.
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(df_train["text"])

In [36]:
class UserTower(nn.Module):
    """User branch: embedding lookup followed by a Linear + ReLU projection.

    Args:
        num_users: Number of rows in the embedding table.
        embedding_dim: Width of each user embedding.
        hidden_dim: Width of the tower output.
    """

    def __init__(self, num_users, embedding_dim=64, hidden_dim=128):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        projection = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Sequential(projection, nn.ReLU())

    def forward(self, user_id):
        """Map integer user ids to their ``hidden_dim``-wide representation."""
        return self.fc(self.user_embedding(user_id))

class BusinessTower(nn.Module):
    """Business branch: embedding lookup followed by a Linear + ReLU projection.

    Args:
        num_businesses: Number of rows in the embedding table.
        embedding_dim: Width of each business embedding.
        hidden_dim: Width of the tower output.
    """

    def __init__(self, num_businesses, embedding_dim=64, hidden_dim=128):
        super().__init__()
        self.business_embedding = nn.Embedding(num_businesses, embedding_dim)
        projection = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Sequential(projection, nn.ReLU())

    def forward(self, business_id):
        """Map integer business ids to their ``hidden_dim``-wide representation."""
        return self.fc(self.business_embedding(business_id))

class ReviewTower(nn.Module):
    """Review branch: a single Linear + ReLU layer over the text vector.

    Args:
        input_dim: Length of the incoming review vector.
        hidden_dim: Width of the tower output.
    """

    def __init__(self, input_dim=300, hidden_dim=128):
        super().__init__()
        projection = nn.Linear(input_dim, hidden_dim)
        self.fc = nn.Sequential(projection, nn.ReLU())

    def forward(self, review_vec):
        """Project review vectors to ``hidden_dim`` features."""
        hidden = self.fc(review_vec)
        return hidden

class ThreeTowerModel(nn.Module):
    """Rating regressor that fuses user, business and review towers.

    The three tower outputs (``hidden_dim`` wide each) are concatenated and
    fed to a Linear(…, 256) -> Dropout(0.2) -> ReLU -> Linear(256, 1) head.

    Args:
        num_users: Size of the user embedding table.
        num_businesses: Size of the business embedding table.
        text_input_dim: Length of the review vector fed to the review tower.
        hidden_dim: Output width of each tower.
    """

    def __init__(self, num_users, num_businesses, text_input_dim=300, hidden_dim=128):
        super().__init__()
        # Every tower emits hidden_dim features so the head input is 3 * hidden_dim.
        self.user_tower = UserTower(num_users, embedding_dim=64, hidden_dim=hidden_dim)
        self.business_tower = BusinessTower(num_businesses, embedding_dim=64, hidden_dim=hidden_dim)
        self.review_tower = ReviewTower(input_dim=text_input_dim, hidden_dim=hidden_dim)

        head_layers = [
            nn.Linear(hidden_dim * 3, 256),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(256, 1),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, user_id, business_id, review_vec):
        """Return one predicted rating per row of the batch."""
        tower_outputs = (
            self.user_tower(user_id),
            self.business_tower(business_id),
            self.review_tower(review_vec),
        )
        fused = torch.cat(tower_outputs, dim=-1)
        scores = self.head(fused)
        # The head emits a trailing axis of size 1; drop it.
        return scores.squeeze(1)


In [37]:
###################################
############## DATA ###############
###################################

# Train/validation split: 10% held out, fixed seed for a repeatable split.
train_df, val_df = train_test_split(df_train, test_size=0.1, random_state=42)

# NOTE(review): each dataset fits its own user and business LabelEncoders, so encoded
# indices are not comparable between train_dataset and val_dataset.
train_dataset = YelpReviewDataset(train_df, df_users, df_businesses, vectorizer)
val_dataset = YelpReviewDataset(val_df, df_users, df_businesses, vectorizer)

# shuffle is left at the DataLoader default for both loaders.
train_loader = DataLoader(train_dataset, batch_size=256)
val_loader = DataLoader(val_dataset, batch_size=256)

#######################################################
############## TRAINING AND VALIDATION ################
#######################################################

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [38]:
# Build the model so that it takes both the user_id and the business_id.
model = ThreeTowerModel(
    num_users=len(train_dataset.user_encoder.classes_),  # distinct users in the training split
    num_businesses=len(train_dataset.business_encoder.classes_),  # distinct businesses in the training split
    text_input_dim=5000  # matches the vectorizer's max_features
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=4e-3)
# The star rating is treated as a regression target.
criterion = nn.MSELoss()

print("--- 1/2 Entrenamiento ---")
for epoch in range(1):
    # Holds the start timestamp here; overwritten with the elapsed seconds below.
    epoch_time = time.time()

    model.train()
    total_loss = 0
    for i, (user_ids, business_ids, review_vecs, stars) in enumerate(train_loader):

        user_ids = user_ids.to(device)
        business_ids = business_ids.to(device)  # business ids go to the device as well
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, business_ids, review_vecs)
        loss = criterion(preds, stars)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_time
    # Reported loss is the mean of the per-batch losses.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_time:.2f}s")

print("")
print("--- Validaci√≥n ---")
# NOTE(review): val_dataset's user/business indices come from encoders fitted on val_df
# only, so they do not identify the same entities as the training indices.
model.eval()
total_loss = 0
with torch.no_grad():
    for i, (user_ids, business_ids, review_vecs, stars) in enumerate(val_loader):
        user_ids = user_ids.to(device)
        business_ids = business_ids.to(device)  # business ids go to the device as well
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, business_ids, review_vecs)
        loss = criterion(preds, stars)
        total_loss += loss.item()

print(f"Validation Loss: {total_loss / len(val_loader):.4f}")

# Second pass: keep training the same model for one epoch on the validation batches.
print("--- 2/2 Entrenamiento ---")
for epoch in range(1):
    epoch_time = time.time()

    model.train()
    total_loss = 0
    for i, (user_ids, business_ids, review_vecs, stars) in enumerate(val_loader):

        user_ids = user_ids.to(device)
        business_ids = business_ids.to(device)  # business ids go to the device as well
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, business_ids, review_vecs)
        loss = criterion(preds, stars)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_time
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(val_loader):.4f}, Time: {epoch_time:.2f}s")

##########################################
############## PREDICTIONS ###############
##########################################

# Test dataset, which also carries business_id.
# NOTE(review): it fits its own encoders on df_test, so its indices are unrelated to the
# ones used in training; more distinct users/businesses than in the training split would
# push the embedding lookups out of range.
test_dataset = YelpReviewTestDataset(df_test, df_users, df_businesses, vectorizer)
test_loader = DataLoader(test_dataset, batch_size=256)

model.eval()
predictions = []
with torch.no_grad():
    for i, (user_ids, business_ids, review_vecs) in enumerate(test_loader):
        user_ids = user_ids.to(device)
        business_ids = business_ids.to(device)  # business ids go to the device as well
        review_vecs = review_vecs.to(device)
        preds = model(user_ids, business_ids, review_vecs)
        # Move the batch back to the CPU before extending the Python list.
        predictions.extend(preds.cpu().numpy())
        # Progress report every 100 batches.
        if i % 100 == 0:
            print(f"Batch {i}/{len(test_loader)}")

# Save the predictions to the output file.
submission_df = pd.DataFrame({
    'review_id': df_test['review_id'],
    'stars': predictions
})
submission_df.to_csv('prediction_TwoTowers_with_business.csv', index=False)


--- 1/2 Entrenamiento ---
Epoch 1, Loss: 0.5854, Time: 376.74s

--- Validaci√≥n ---
Validation Loss: 0.4644
--- 2/2 Entrenamiento ---
Epoch 1, Loss: 0.4845, Time: 33.50s
Batch 0/1621
Batch 100/1621
Batch 200/1621
Batch 300/1621
Batch 400/1621
Batch 500/1621
Batch 600/1621
Batch 700/1621
Batch 800/1621
Batch 900/1621
Batch 1000/1621
Batch 1100/1621
Batch 1200/1621
Batch 1300/1621
Batch 1400/1621
Batch 1500/1621
Batch 1600/1621


---

## Ampliación de características de usuario

In [15]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import time

###########################################################
#################### USER COLUMNS #########################
###########################################################
# Numeric user attributes fed to the user tower alongside the id embedding.
user_features_cols = [
    "review_count", "fans", "average_stars",
    "compliment_hot", "compliment_more", "compliment_profile", "compliment_cute"
]

# Make sure the selected df_users columns are numeric: unparsable values become NaN,
# NaN becomes 0, and everything is stored as float32.
df_users[user_features_cols] = df_users[user_features_cols].apply(pd.to_numeric, errors="coerce").fillna(0).astype("float32")

###########################################################
############### MERGE INTO TRAIN AND TEST #################
###########################################################
# Left joins keep every review. Columns present in both frames (useful, funny, cool)
# receive pandas' default _x/_y suffixes; the feature columns above are not affected.
df_train = df_train.merge(df_users, on="user_id", how="left")
df_test = df_test.merge(df_users, on="user_id", how="left")

# A review whose user_id is missing from df_users leaves the left join with NaN in every
# feature column, and one NaN input is enough to turn the training loss into NaN.
# Apply the same "missing -> 0" rule that was used on df_users above.
df_train[user_features_cols] = df_train[user_features_cols].fillna(0)
df_test[user_features_cols] = df_test[user_features_cols].fillna(0)

In [None]:
# Fit the TF-IDF vectorizer (vocabulary capped via max_features=700) on all training review texts.
# NOTE(review): the train/validation split happens in a later cell, so the validation
# texts also take part in this fit.
vectorizer = TfidfVectorizer(max_features=700)
vectorizer.fit(df_train["text"])


In [24]:
###########################################################
#################### DATASET GENERAL ######################
###########################################################

class YelpReviewDataset(Dataset):
    """Labelled dataset yielding ``(user_id, review_vector, user_features, rating)``.

    Args:
        df: Reviews already merged with the user table; needs ``user_id``,
            ``text``, ``stars`` and every column in ``user_features_cols``.
            The frame is copied (index reset), so the caller's object is not
            modified.
        text_vectorizer: Already fitted vectorizer exposing ``transform``.
        user_features_cols: Names of the numeric user-feature columns.
        user_encoder: Optional fitted encoder exposing ``classes_``. When
            given, user ids are mapped through it so indices agree with the
            dataset it was fitted on; unseen ids get the extra index
            ``len(user_encoder.classes_)``, so the embedding table needs
            ``len(classes_) + 1`` rows. When omitted, a fresh
            ``LabelEncoder`` is fitted on ``df`` alone (the original
            behaviour) and indices are NOT comparable with other datasets.
    """

    def __init__(self, df, text_vectorizer, user_features_cols, user_encoder=None):
        self.df = df.reset_index(drop=True)
        self.vectorizer = text_vectorizer
        self.user_features_cols = user_features_cols

        if user_encoder is None:
            # Backward-compatible default: ids are only meaningful inside this dataset.
            self.user_encoder = LabelEncoder()
            self.df["user_id_encoded"] = self.user_encoder.fit_transform(self.df["user_id"])
        else:
            self.user_encoder = user_encoder
            index_of = {label: pos for pos, label in enumerate(user_encoder.classes_)}
            self.df["user_id_encoded"] = (
                self.df["user_id"].map(index_of).fillna(len(index_of)).astype("int64")
            )

        # Vectorize the text once; rows are densified one at a time in __getitem__.
        self.review_matrix = self.vectorizer.transform(self.df["text"])

        # Extract the feature columns once. Selecting them inside __getitem__ copied the
        # whole feature frame for every sample, i.e. O(len(df)) work per item.
        self.user_feats_matrix = self.df[self.user_features_cols].to_numpy(dtype=np.float32)

    def __len__(self):
        """Return the number of reviews."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(user_id, review_vec, user_feats, rating)`` for row ``idx``."""
        # Scalar column access avoids building a full row Series per sample.
        user_id = self.df["user_id_encoded"].iat[idx]
        rating = torch.tensor(self.df["stars"].iat[idx], dtype=torch.float32)

        # toarray() yields shape (1, n_features); squeeze drops the leading axis.
        review_vec = torch.tensor(self.review_matrix[idx].toarray(), dtype=torch.float32).squeeze(0)

        # Copy the row so the returned tensor does not alias the dataset's storage.
        user_feats = torch.from_numpy(self.user_feats_matrix[idx].copy())

        return user_id, review_vec, user_feats, rating


class YelpTestReviewDataset(Dataset):
    """Unlabelled dataset yielding ``(user_id, review_vector, user_features)``.

    Args:
        df: Reviews already merged with the user table; needs ``user_id``,
            ``text`` and every column in ``user_features_cols``. The frame is
            copied (index reset), so the caller's object is not modified.
        text_vectorizer: Already fitted vectorizer exposing ``transform``.
        user_features_cols: Names of the numeric user-feature columns.
        user_encoder: Optional fitted encoder exposing ``classes_`` (normally
            the one fitted on the training data). When given, user ids are
            mapped through it and unseen ids get the extra index
            ``len(user_encoder.classes_)``, so the embedding table needs
            ``len(classes_) + 1`` rows. When omitted, a fresh
            ``LabelEncoder`` is fitted on ``df`` alone (the original
            behaviour); those indices do NOT correspond to the users a model
            saw during training.
    """

    def __init__(self, df, text_vectorizer, user_features_cols, user_encoder=None):
        self.df = df.reset_index(drop=True)
        self.vectorizer = text_vectorizer
        self.user_features_cols = user_features_cols

        if user_encoder is None:
            # Backward-compatible default: ids are only meaningful inside this dataset.
            self.user_encoder = LabelEncoder()
            self.df["user_id_encoded"] = self.user_encoder.fit_transform(self.df["user_id"])
        else:
            self.user_encoder = user_encoder
            index_of = {label: pos for pos, label in enumerate(user_encoder.classes_)}
            self.df["user_id_encoded"] = (
                self.df["user_id"].map(index_of).fillna(len(index_of)).astype("int64")
            )

        # Vectorize the text once; rows are densified one at a time in __getitem__.
        self.review_matrix = self.vectorizer.transform(self.df["text"])

        # Extract the feature columns once. Selecting them inside __getitem__ copied the
        # whole feature frame for every sample, i.e. O(len(df)) work per item.
        self.user_feats_matrix = self.df[self.user_features_cols].to_numpy(dtype=np.float32)

    def __len__(self):
        """Return the number of reviews."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(user_id, review_vec, user_feats)`` for row ``idx``."""
        # Scalar column access avoids building a full row Series per sample.
        user_id = self.df["user_id_encoded"].iat[idx]

        # toarray() yields shape (1, n_features); squeeze drops the leading axis.
        review_vec = torch.tensor(self.review_matrix[idx].toarray(), dtype=torch.float32).squeeze(0)

        # Copy the row so the returned tensor does not alias the dataset's storage.
        user_feats = torch.from_numpy(self.user_feats_matrix[idx].copy())

        # "stars" is not returned: this dataset is used for inference only.
        return user_id, review_vec, user_feats

###########################################################
############### SPLIT + DATASETS + LOADERS ################
###########################################################

# Train/validation split: 30% held out, fixed seed for a repeatable split.
train_df, val_df = train_test_split(df_train, test_size=0.3, random_state=42)

train_dataset = YelpReviewDataset(train_df, vectorizer, user_features_cols)
val_dataset = YelpReviewDataset(val_df, vectorizer, user_features_cols)
# The test reviews go through the label-free dataset: YelpReviewDataset reads the
# "stars" column for every item, which the data to be predicted is assumed not to carry
# (TODO confirm). Batches from test_loader are therefore
# (user_ids, review_vecs, user_feats), with no rating element.
test_dataset = YelpTestReviewDataset(df_test, vectorizer, user_features_cols)

# Only the training loader is shuffled; validation and test keep their row order.
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1024)
test_loader = DataLoader(test_dataset, batch_size=1024)

###########################################################
################### MODELO 2-TORRES #######################
###########################################################

class UserTower(nn.Module):
    """User branch: ID embedding concatenated with user features, then Linear + ReLU.

    Args:
        num_users: Number of rows of the embedding table.
        user_feature_dim: Length of the user-feature vector that is
            concatenated to the embedding.
        embedding_dim: Size of each user embedding.
        hidden_dim: Size of the representation the tower outputs.
    """

    def __init__(self, num_users, user_feature_dim, embedding_dim=64, hidden_dim=128):
        super().__init__()
        # Width of the vector that enters the linear layer: embedding + features.
        fused_dim = embedding_dim + user_feature_dim
        self.embedding = nn.Embedding(num_users, embedding_dim)
        self.fc = nn.Sequential(nn.Linear(fused_dim, hidden_dim), nn.ReLU())

    def forward(self, user_id, user_feats):
        """Embed ``user_id``, append ``user_feats`` on the last axis and project."""
        id_embedding = self.embedding(user_id)
        fused = torch.cat((id_embedding, user_feats), dim=-1)
        return self.fc(fused)

class ReviewTower(nn.Module):
    """Review branch: projects the review text vector with Linear + ReLU.

    Args:
        text_input_dim: Length of the incoming review vector.
        hidden_dim: Size of the representation the tower outputs.
    """

    def __init__(self, text_input_dim, hidden_dim=128):
        super().__init__()
        projection = nn.Linear(text_input_dim, hidden_dim)
        self.fc = nn.Sequential(projection, nn.ReLU())

    def forward(self, review_vec):
        """Return the hidden representation of ``review_vec``."""
        hidden = self.fc(review_vec)
        return hidden

class TwoTowerModel(nn.Module):
    """Rating regressor that joins a user tower and a review tower.

    Each tower produces a ``hidden_dim``-wide vector; the two vectors are
    concatenated and passed through the head
    (Linear -> ReLU -> Dropout -> Linear) to obtain one scalar per sample.

    Args:
        num_users: Number of rows of the user embedding table.
        user_feature_dim: Length of the user-feature vector.
        text_input_dim: Length of the review text vector.
        hidden_dim: Output width of both towers and inner width of the head.
            It is passed to the towers explicitly so the head's input width
            (``2 * hidden_dim``) can never drift from what the towers emit.
        dropout: Dropout probability used inside the head.
    """

    def __init__(self, num_users, user_feature_dim, text_input_dim, hidden_dim=128, dropout=0.2):
        super().__init__()
        self.user_tower = UserTower(num_users, user_feature_dim, hidden_dim=hidden_dim)
        self.review_tower = ReviewTower(text_input_dim, hidden_dim=hidden_dim)
        self.head = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, user_id, review_vec, user_feats):
        """Predict one rating per sample.

        Args:
            user_id: Encoded user indices fed to the user embedding.
            review_vec: Review text vectors.
            user_feats: User feature vectors.

        Returns:
            Tensor of predictions with the trailing size-1 axis removed.
        """
        user_repr = self.user_tower(user_id, user_feats)
        review_repr = self.review_tower(review_vec)
        combined = torch.cat([user_repr, review_repr], dim=-1)
        # Squeeze the last axis (the head's single output unit) rather than
        # axis 1: identical for batched 2-D input, and it also works when the
        # inputs carry no batch axis, matching the dim=-1 concatenation.
        return self.head(combined).squeeze(-1)


In [None]:

###########################################################
##################### INSTANTIATION #######################
###########################################################

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pull one batch only to read the input widths off its tensors:
# position 1 carries the review vectors, position 2 the user features.
sample_batch = next(iter(train_loader))
text_input_dim = sample_batch[1].shape[1]
user_feature_dim = sample_batch[2].shape[1]

# The embedding table gets one row per class known to the training encoder.
model = TwoTowerModel(
    text_input_dim=text_input_dim,
    user_feature_dim=user_feature_dim,
    num_users=len(train_dataset.user_encoder.classes_),
)
model = model.to(device)

# Mean-squared-error regression objective, optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

###########################################################
######################## TRAINING #########################
###########################################################

print("--- Entrenamiento ---")
for epoch in range(1):
    model.train()  # training mode, so the head's Dropout is active
    total_loss = 0
    for idx, batch in enumerate(train_loader):
        # Each batch is (user ids, review vectors, user features, targets);
        # move all four tensors to the model's device in that order.
        user_ids, review_vecs, user_feats, stars = (t.to(device) for t in batch)

        # Standard optimisation step: clear old gradients, forward pass,
        # loss, backward pass, parameter update.
        optimizer.zero_grad()
        preds = model(user_ids, review_vecs, user_feats)
        loss = criterion(preds, stars)
        loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss for the epoch average printed below.
        total_loss += loss.item()

        # Progress trace every 50 batches.
        if not idx % 50:
            print(f"Traza Batch {idx}")
    print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader):.4f}")

In [None]:
###########################################################
####################### VALIDATION ########################
###########################################################

print("--- Validaci√≥n ---")
model.eval()  # evaluation mode, so Dropout is disabled
val_loss = 0
# No gradients are needed while scoring the validation split.
with torch.no_grad():
    for batch in val_loader:
        # Same batch layout as in training: ids, review vectors, user
        # features, targets -- moved to the model's device in that order.
        user_ids, review_vecs, user_feats, stars = (t.to(device) for t in batch)

        preds = model(user_ids, review_vecs, user_feats)
        loss = criterion(preds, stars)
        val_loss += loss.item()

# Average of the per-batch losses over the validation loader.
print(f"Validation Loss: {val_loss / len(val_loader):.4f}")


In [None]:
# Goal of this cell: one predicted "stars" value per test review_id.
#
# The model's forward() takes (user_id, review_vec, user_feats), so the test
# loader has to yield all three tensors and all three must be passed on;
# calling the model with only the first two raises a TypeError.
# NOTE(review): assumes YelpReviewTestDataset is the
# (df, text_vectorizer, user_features_cols) dataset that yields
# (user_id, review_vec, user_feats) -- confirm against its definition.
# NOTE(review): if that dataset fits its own LabelEncoder on df_test, the
# user indices will not line up with the embedding rows learned from the
# training encoder -- verify before trusting the predictions.
test_dataset = YelpReviewTestDataset(df_test, vectorizer, user_features_cols)
test_loader = DataLoader(test_dataset, batch_size=256)

# Predictions for the test set, collected in loader (row) order.
model.eval()
predictions = []
with torch.no_grad():
    for i, (user_ids, review_vecs, user_feats) in enumerate(test_loader):
        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        user_feats = user_feats.to(device)
        preds = model(user_ids, review_vecs, user_feats)
        # Bring the batch back to the CPU and append its values one by one.
        predictions.extend(preds.cpu().numpy())
        if i % 100 == 0:
            print(f"Batch {i}/{len(test_loader)}")


In [None]:
# Pair every test review_id with its predicted rating and write the result
# as a CSV without the index column.
output_path = 'prediction_TwoTowers_9.csv'
submission_df = pd.DataFrame(
    data={'review_id': df_test['review_id'], 'stars': predictions},
)
submission_df.to_csv(output_path, index=False)