# **SISTEMAS DE RECOMENDACIÓN**

## **Filtrado Basado en Contenido**


Miembros del Grupo:
- Paula Arias Fernández
- Jorge del Castillo Gómez
- Anny Álvarez Nogales

In [None]:
### BORRAR ESTE COMENTARIO CUANDO LO LEÁIS
#Sigo la estructura de la presentación 
# (dejo los huequitos antes de mi código pero si queréis cambiar 
# el orden de algo o moverlo sin problema que no sabia como ponerlo)


In [None]:
# Importing libraries
import pandas as pd
import re
from joblib import Parallel, delayed

# **Pruebas Datos Textuales** 

## Pruebas iniciales

1. TF-IDF:
    - TFIDF + LogisticRegression -  `MAE: 0.65`
    - TFIDF + RandomForestRegressor - `MAE: 0.82`
    - TFIDF + xgboost - `MAE: 0.65`
2. Doc2Vec + LogisticRegression - `MAE: 1.24`

In [None]:
# TF-IDF over unigrams and bigrams; the vectorizer itself drops English
# stop words and keeps at most 200 terms.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=200,
    stop_words='english',
)

# The vocabulary is learnt from the training text; the test text is only
# projected onto it.
X_train_tfidf = vectorizer.fit_transform(df_train['text'])
X_test_tfidf = vectorizer.transform(df_test['text'])

In [None]:
# 80/20 hold-out split of the labelled TF-IDF matrix (fixed seed).
y = df_train['stars']
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline estimator: logistic regression over the star labels.
# (A RandomForestRegressor with 100 trees was tried here as an alternative.)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Score the hold-out predictions with mean absolute error.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (Train Test Split): {mae}')

# Predictions for the unlabelled test set (not written to disk in this cell).
y_pred_test = model.predict(X_test_tfidf)

**GridSearch** con LogisticRegression

In [None]:
# 80/20 hold-out split with a fixed seed.
y = df_train['stars']
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf,
    y,
    test_size=0.2,
    random_state=42,
)

# Grid explored by the search.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],     # regularisation
    'solver': ['liblinear', 'saga'],  # optimisation method
    'max_iter': [100, 500, 1000],     # maximum number of iterations
}

model = LogisticRegression(random_state=42)

# 5-fold cross-validated search scored with negated MAE, on all cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Evaluate the refitted best estimator on the hold-out split.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
mae_best = mean_absolute_error(y_test, y_pred_best)

print(f"Mejores parámetros: {best_params}")
print(f"Mean Absolute Error (mejor modelo): {mae_best}")

# Reference point: the same estimator without tuning (only max_iter raised).
model_default = LogisticRegression(max_iter=1000, random_state=42)
model_default.fit(X_train, y_train)
y_pred_default = model_default.predict(X_test)
mae_default = mean_absolute_error(y_test, y_pred_default)
print(f"Mean Absolute Error (modelo sin optimizar): {mae_default}")

### Predictions on the unlabelled test set, written as a submission file.
y_pred_test = best_model.predict(X_test_tfidf)
submission_df = pd.DataFrame(
    {
        'review_id': df_test['review_id'],
        'stars': y_pred_test,
    }
)
submission_df.to_csv('prediction_tfidf_logisticReg_gridSearch.csv', index=False)

**Con un regresor RandomForest**

In [None]:
# Smaller TF-IDF representation: at most 100 uni/bi-grams, English stop
# words removed. This rebinds vectorizer / X_train_tfidf / X_test_tfidf.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=100,
    stop_words='english',
)
X_train_tfidf = vectorizer.fit_transform(df_train['text'])
X_test_tfidf = vectorizer.transform(df_test['text'])

### Random forest
y = df_train['stars']
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf,
    y,
    test_size=0.2,
    random_state=42,
)

# Regression forest with 10 trees, trained on all cores.
model = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (Train Test Split): {mae}')

# Test-set predictions are written exactly as the regressor returns them.
y_pred_test = model.predict(X_test_tfidf)
submission_df = pd.DataFrame(
    {
        'review_id': df_test['review_id'],
        'stars': y_pred_test,
    }
)
submission_df.to_csv('prediction_tfidf_randomForest.csv', index=False)

**Con un clasificador xgboost**

In [None]:
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Shift the integer star labels down by one for training; the shift is
# undone (+1) on the predictions below.
y = df_train['stars'].astype(int) - 1

# Trained on the full TF-IDF matrix (no hold-out split in this cell).
# NOTE(review): X_train_tfidf is whatever the last executed TF-IDF cell
# produced -- confirm which feature size is intended here.
model = XGBClassifier(
    n_estimators=100,
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_jobs=-1,
    random_state=42,
)
model.fit(X_train_tfidf, y)

y_pred_test = model.predict(X_test_tfidf) + 1

submission_df = pd.DataFrame(
    {
        'review_id': df_test['review_id'],
        'stars': y_pred_test,
    }
)
submission_df.to_csv('prediction_tfidf_xgboost.csv', index=False)

### **Doc2Vec**
Enlace: https://spotintelligence.com/2023/09/06/doc2vec/

In [None]:
#######################
#### PREPROCESSING ####
#######################
print("--- Inicio Preprocesado...")


# Tokenizer
def preprocess_text_parallel(text):
    """Return the purely alphabetic words of *text*, lower-cased.

    Only runs of ASCII letters delimited by word boundaries are kept, so
    digits, punctuation and tokens that mix letters with digits are dropped.
    """
    lowered = text.lower()
    return re.findall(r'\b[a-zA-Z]+\b', lowered)

import os

# Tokenise every review in parallel (joblib, all cores).
df_train['tokens'] = Parallel(n_jobs=-1)(delayed(preprocess_text_parallel)(text) for text in df_train['text'])
df_test['tokens'] = Parallel(n_jobs=-1)(delayed(preprocess_text_parallel)(text) for text in df_test['text'])

# One TaggedDocument per training review, tagged with its row position.
tagged_train = [TaggedDocument(words=tokens, tags=[str(i)]) for i, tokens in enumerate(df_train['tokens'])]

#######################
#### DOC2VEC MODEL ####
#######################

# gensim starts exactly `workers` training threads. Unlike joblib's
# n_jobs=-1, workers=-1 is NOT an "all cores" alias: no worker thread is
# started and the vectors are left at their random initial values. Use the
# real core count instead.
model = Doc2Vec(vector_size=50,               # Dimensionality of the document vectors
                window=2,                     # Maximum distance between the current and predicted word within a sentence
                min_count=1,                  # Ignores all words with total frequency lower than this
                workers=os.cpu_count() or 1,  # Number of worker threads used for training
                epochs=2)                     # Number of training epochs

model.build_vocab(tagged_train)
model.train(tagged_train, total_examples=len(tagged_train), epochs=model.epochs)

# Infer one vector per review for both the training and the test set.
df_train['vector'] = df_train['tokens'].apply(model.infer_vector)
df_test['vector'] = df_test['tokens'].apply(model.infer_vector)


X_test = list(df_test['vector'])

##########################
#### CLASSIFIER MODEL ####
##########################

X_list_train = list(df_train['vector'])
y_train = df_train['stars']


# Logistic regression on the document vectors, trained on all labelled rows.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_list_train, y_train)

predicted_stars = classifier.predict(X_test)

submission_df = pd.DataFrame({
    'review_id': df_test['review_id'],
    'stars': predicted_stars
})
submission_df.to_csv('prediction_doc2vec_logreg.csv', index=False)

## Word2Vec Embedding Model



In [None]:
# DATA
# Raw CSV inputs: labelled reviews, unlabelled reviews and the business table.
train_reviews = pd.read_csv('train_reviews.csv')
test_reviews = pd.read_csv('test_reviews.csv')
negocios_df = pd.read_csv('negocios.csv')

# Attach the business columns to every training review. With an inner join,
# reviews whose business_id is missing from negocios.csv are dropped.
df = pd.merge(train_reviews, negocios_df, how='inner', on='business_id')
df

In [None]:
# PREPROCESS DATA

# Append the business categories to the review text (space-separated) so
# both end up in the same token stream.
df['text'] = df['text'].astype(str).str.cat(df['categories'].astype(str), sep=" ")

def preprocess_text_parallel(text):
    """Lower-case *text* and return its alphabetic words as a list.

    A word is a run of ASCII letters delimited by word boundaries; anything
    containing digits or punctuation is not returned.
    """
    word_pattern = r'\b[a-zA-Z]+\b'
    return re.findall(word_pattern, text.lower())

# Tokenise in parallel on all cores.
# NOTE(review): the training tokens are computed from the merged `df` but
# stored on `train_reviews`; this only lines up if the merge kept every row
# in its original order -- confirm. The test text has no categories appended.
_tokenize_job = delayed(preprocess_text_parallel)
train_reviews['tokens'] = Parallel(n_jobs=-1)(_tokenize_job(text) for text in df['text'])
test_reviews['tokens'] = Parallel(n_jobs=-1)(_tokenize_job(text) for text in test_reviews['text'])


In [None]:
#TEXT MODEL WORD2VEC
# The model is trained once and cached on disk; later runs reload the cache
# instead of retraining (and then discarding) a fresh model.

from pathlib import Path

import gensim
from gensim.models import Word2Vec

WORD2VEC_MODEL_PATH = "word2vec_model.model"

if Path(WORD2VEC_MODEL_PATH).exists():
    model = Word2Vec.load(WORD2VEC_MODEL_PATH)
else:
    model = Word2Vec(
        sentences=train_reviews['tokens'],
        vector_size=50,
        window=5,
        min_count=5,
        workers=4,
        epochs=7
    )
    # Persist the model so the load branch works on the next run.
    model.save(WORD2VEC_MODEL_PATH)

In [None]:
#APPLY TEXT EMBEDDING MODEL
def get_review_vector(tokens, model):
    """Return the mean word vector of *tokens* under *model*.

    Args:
        tokens: iterable of word strings.
        model: embedding model exposing ``wv`` (supports ``in`` and
            ``[]`` lookups by word) and ``vector_size``.

    Returns:
        A 1-D NumPy array of length ``model.vector_size``. When none of the
        tokens is known to the model the array is all zeros, so callers
        always receive the same type and shape.
    """
    import numpy as np

    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not vectors:
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# One averaged embedding per training review, using the current `model`.
train_reviews['vector'] = train_reviews['tokens'].apply(get_review_vector, args=(model,))

# Feature list and target consumed by the classifier cells.
y_train = train_reviews['stars']
X_train = train_reviews['vector'].tolist()

In [None]:
# CLASSIFIER LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression with fixed hyper-parameters.
classifier = LogisticRegression(
    penalty='l2',
    C=14.867708330182724,
    solver='newton-cg',
    max_iter=2500,
)
classifier.fit(X_train, y_train)

# Embed the test reviews with the same `model` and predict their stars.
test_reviews['vector'] = test_reviews['tokens'].apply(get_review_vector, args=(model,))
X_test = test_reviews['vector'].tolist()
predicted_stars = classifier.predict(X_test)

submission_df = pd.DataFrame(
    {
        'review_id': test_reviews['review_id'],
        'stars': predicted_stars,
    }
)

submission_df

In [None]:
# CLASSIFIER RANDOMFOREST

from sklearn.ensemble import RandomForestClassifier

# 100-tree classification forest with a fixed seed.
classifier = RandomForestClassifier(random_state=42, n_estimators=100)
classifier.fit(X_train, y_train)

# Test embeddings are recomputed here so the cell can run on its own.
test_reviews['vector'] = test_reviews['tokens'].apply(get_review_vector, args=(model,))
X_test = test_reviews['vector'].tolist()
predicted_stars = classifier.predict(X_test)

submission_df = pd.DataFrame(
    {
        'review_id': test_reviews['review_id'],
        'stars': predicted_stars,
    }
)


submission_df

In [None]:
# XGBOOST CLASSIFIER

from xgboost import XGBClassifier

# Labels are shifted down by one for training and shifted back (+1) after
# prediction.
y_train_adj = y_train - 1

classifier = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
classifier.fit(X_train, y_train_adj)

# Reuses the test embeddings already stored on test_reviews.
X_test = test_reviews['vector'].tolist()
predicted_stars = classifier.predict(X_test) + 1


submission_df = pd.DataFrame(
    {
        'review_id': test_reviews['review_id'],
        'stars': predicted_stars,
    }
)

In [None]:
#OPTUNA WITH LOGISTIC REGRESSION
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Optimización para el mejor modelo de regresión logística

def objective(trial):
    """Optuna objective: hold-out accuracy of one logistic-regression trial.

    Samples ``C`` (log scale), ``solver``, ``max_iter`` and ``penalty`` from
    *trial*, fits a LogisticRegression on the module-level ``X_train`` /
    ``y_train`` and returns its accuracy on ``X_test`` / ``y_test``.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    C = trial.suggest_float('C', 1e-5, 100, log=True)
    solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs', 'newton-cg', 'saga'])
    max_iter = trial.suggest_int('max_iter', 1000, 5000, step=500)
    penalty = trial.suggest_categorical('penalty', ['l2'])

    classifier = LogisticRegression(C=C, solver=solver, max_iter=max_iter, penalty=penalty)
    classifier.fit(X_train, y_train)

    predicted_stars = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, predicted_stars)

    return accuracy

# Carve a 20% hold-out set out of the current training data (fixed seed).
# NOTE(review): this rebinds X_train / y_train to the 80% subset, so
# re-running the cell shrinks the data again; the same hold-out set is also
# used both to pick the hyper-parameters and to report the final accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# 50 trials, keeping the configuration with the highest accuracy.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Refit with the winning configuration and evaluate it on the hold-out set.
best_classifier = LogisticRegression(**best_params)
best_classifier.fit(X_train, y_train)

final_predictions = best_classifier.predict(X_test)
final_accuracy = accuracy_score(y_test, final_predictions)
print(f"Final Accuracy: {final_accuracy}")


## Fast Text Embedding Model

In [None]:
# TEXT MODEL FAST TEXT
from gensim.models import FastText


# Train FastText on the tokenised training reviews and persist it to disk.
fasttext_model = FastText(
    sentences=train_reviews['tokens'],
    vector_size=50,
    window=5,
    min_count=5,
    epochs=10,
)
fasttext_model.save("fasttext_model.model")

In [None]:
def get_review_vector(tokens, model):
    """Average the word vectors of *tokens* looked up in *model*.

    Args:
        tokens: iterable of word strings.
        model: embedding model exposing ``wv`` (supports ``in`` and
            ``[]`` lookups by word) and ``vector_size``.

    Returns:
        A 1-D NumPy array of length ``model.vector_size``; all zeros when no
        token is found in the model, so the return type never varies.
    """
    import numpy as np

    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not vectors:
        return np.zeros(model.vector_size, dtype=np.float32)

    return np.mean(vectors, axis=0)

# Replace the stored training embeddings with FastText-based ones.
train_reviews['vector'] = train_reviews['tokens'].apply(get_review_vector, args=(fasttext_model,))

y_train = train_reviews['stars']
X_train = train_reviews['vector'].tolist()

In [None]:
# XGBOOST CLASSIFIER

from xgboost import XGBClassifier

# Labels are shifted down by one for training and shifted back (+1) after
# prediction.
y_train_adj = y_train - 1

classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
classifier.fit(X_train, y_train_adj)

# The training vectors were built with fasttext_model, so the test reviews
# must be embedded with that same model. Without this line the column would
# still hold vectors from an earlier embedding model.
test_reviews['vector'] = test_reviews['tokens'].apply(lambda x: get_review_vector(x, fasttext_model))

X_test = list(test_reviews['vector'])
predicted_stars = classifier.predict(X_test) + 1


submission_df = pd.DataFrame({
    'review_id': test_reviews['review_id'],
    'stars': predicted_stars
})


# **Pruebas Datos No Textuales**

In [None]:
# FEATURES SELECTION
usuarios = pd.read_csv('usuarios.csv')

# Attach the user table to the review/business frame.
# NOTE(review): `df` is rebound in place, so running this cell twice merges
# the user columns a second time.
df = pd.merge(df, usuarios, how='inner', on='user_id')

# Identifier, free-text and metadata columns excluded from the feature set.
dropped_columns = [
    'review_id', 'user_id', 'business_id', 'text', 'date', 'name_x',
    'address', 'city', 'state', 'attributes', 'categories', 'name_y',
    'elite', 'friends', 'hours', 'yelping_since', 'postal_code',
]
df2 = df.drop(columns=dropped_columns)

# Strip the merge suffixes from the review-side columns.
df2 = df2.rename(columns={'stars_x':'stars','name_y':'user_name','useful_x':'useful','funny_x':'funny','cool_x':'cool'})
df2

In [None]:
#NO TEXTUAL DATA NORMALIZATION

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
# Restrict to the 'funny', 'cool' and 'useful' columns plus the target.
df2 = df2[['funny', 'stars', 'cool', 'useful']]
label_encoder = LabelEncoder()  # NOTE(review): not used by any visible code


# Target: stars rounded to integer classes; features: everything else.
X_train = df2.drop(columns=['stars'])
y_train = df2['stars'].round().astype(int)


# Normalisation: the scaler statistics are learnt from the training features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Training
classifier = LogisticRegression(max_iter=3000)
classifier.fit(X_train_scaled, y_train)

In [None]:
# CLASSIFIER

# Select the test features in exactly the column order used for training;
# an order derived from a set is arbitrary and could silently permute the
# features. A missing column raises a KeyError here instead of failing later.
feature_columns = list(X_train.columns)
X_test = test_reviews[feature_columns]

# Reuse the scaler fitted on the training data. Re-fitting it on the test
# set would standardise the test features with different statistics than
# the ones the classifier was trained on.
X_test_scaled = scaler.transform(X_test)

predicted_stars = classifier.predict(X_test_scaled)

submission_df = pd.DataFrame({
    'review_id': test_reviews['review_id'],
    'stars': predicted_stars
})

submission_df.to_csv('submission.csv', index=False)

# **Modelo TwoTower y ThreeTower**

El modelo Two-Tower (o modelo de dos torres) es una arquitectura de red neuronal que aprende representaciones (embeddings) separadas para diferentes tipos de entrada —en este caso, usuarios y textos de reseñas—, y luego las combina para predecir una puntuación.

**Arquitectura**

1. User Tower:

    - Recibe el user_id como entrada.

    - Aplica una capa de embedding para convertir el ID del usuario en un vector denso.

    - Pasa ese vector por una capa totalmente conectada (Linear) con activación ReLU.

    - Su objetivo es aprender una representación del usuario basada en su historial.

2. Review Tower:

    - Recibe la representación vectorial de la reseña (TF-IDF).

    - Procesa esta entrada mediante una capa lineal con ReLU.

    - Aprende una representación semántica del texto.

3. Head:

    - Concatena los vectores generados por ambas torres.

    - Pasa el vector combinado por una red neuronal profunda:

    - Capa Linear + Dropout + ReLU.

    - Capa final Linear que produce un único valor escalar (la predicción del rating).

In [None]:
class YelpReviewDataset(Dataset):
    """Labelled review samples for the two-tower model.

    Each item is ``(encoded user id, dense review vector, star rating)``.

    NOTE(review): the user ids are label-encoded with an encoder fitted on
    this instance's dataframe only, so two instances built from different
    dataframes (e.g. train and validation) do not share one id -> index
    mapping.
    """

    def __init__(self, df, df_users, text_vectorizer):
        """Encode the user ids and vectorise the review text.

        Args:
            df: reviews; the "user_id", "text" and "stars" columns are read.
            df_users: user table; stored indexed by "user_id" but not read
                anywhere else in this class.
            text_vectorizer: object with a ``transform`` method; it must
                already be fitted because only ``transform`` is called.
        """
        self.vectorizer = text_vectorizer
        self.df_users = df_users.set_index("user_id")
        self.df = df.reset_index(drop=True)

        # Label-encode user_id
        self.user_encoder = LabelEncoder()
        encoded_users = self.user_encoder.fit_transform(self.df["user_id"])
        self.df["user_id_encoded"] = encoded_users

        # Keep the reviews as one matrix; rows are densified per item.
        self.review_matrix = self.vectorizer.transform(self.df["text"])

    def __len__(self):
        """Number of reviews."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(user index, float32 review vector, float32 rating)``."""
        record = self.df.iloc[idx]
        dense_row = self.review_matrix[idx].toarray()
        review_vec_dense = torch.tensor(dense_row, dtype=torch.float32).squeeze(0)
        rating = torch.tensor(record["stars"], dtype=torch.float32)
        return record["user_id_encoded"], review_vec_dense, rating




class YelpReviewTestDataset(Dataset):
    """Unlabelled review samples for the two-tower model.

    Each item is ``(encoded user id, dense review vector)``; no rating is
    returned.

    NOTE(review): the user ids are label-encoded with an encoder fitted on
    this instance's dataframe only, so the indices are not the ones a model
    saw for the same users during training.
    """

    def __init__(self, df, df_users, text_vectorizer):
        """Encode the user ids and vectorise the review text.

        Args:
            df: reviews; the "user_id" and "text" columns are read.
            df_users: user table; stored indexed by "user_id" but not read
                anywhere else in this class.
            text_vectorizer: object with a ``transform`` method; it must
                already be fitted because only ``transform`` is called.
        """
        self.vectorizer = text_vectorizer
        self.df_users = df_users.set_index("user_id")
        self.df = df.reset_index(drop=True)

        # Label-encode user_id
        self.user_encoder = LabelEncoder()
        encoded_users = self.user_encoder.fit_transform(self.df["user_id"])
        self.df["user_id_encoded"] = encoded_users

        # Keep the reviews as one matrix; rows are densified per item.
        self.review_matrix = self.vectorizer.transform(self.df["text"])

    def __len__(self):
        """Number of reviews."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(user index, float32 review vector)``."""
        record = self.df.iloc[idx]
        dense_row = self.review_matrix[idx].toarray()
        review_vec_dense = torch.tensor(dense_row, dtype=torch.float32).squeeze(0)
        return record["user_id_encoded"], review_vec_dense

In [None]:
# Fit TF-IDF (at most 5000 terms) on the text of the whole df_train frame.
vectorizer = TfidfVectorizer(max_features=5000).fit(df_train["text"])
vectorizer

In [None]:
class UserTower(nn.Module):
    """Embeds a user index and projects it through Linear + ReLU."""

    def __init__(self, num_users, embedding_dim=64, hidden_dim=128):
        """
        Args:
            num_users: number of rows of the embedding table.
            embedding_dim: size of each user embedding.
            hidden_dim: size of the tower output.
        """
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        projection = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Sequential(projection, nn.ReLU())

    def forward(self, user_id):
        """Return the tower output for a tensor of user indices."""
        return self.fc(self.user_embedding(user_id))

class ReviewTower(nn.Module):
    """Projects a dense review vector through Linear + ReLU."""

    def __init__(self, input_dim=300, hidden_dim=128):
        """
        Args:
            input_dim: length of the incoming review vector.
            hidden_dim: size of the tower output.
        """
        super().__init__()
        projection = nn.Linear(input_dim, hidden_dim)
        self.fc = nn.Sequential(projection, nn.ReLU())

    def forward(self, review_vec):
        """Return the tower output for a batch of review vectors."""
        hidden = self.fc(review_vec)
        return hidden

class TwoTowerModel(nn.Module):
    """Rating regressor that combines a user tower and a review tower.

    The two tower outputs are concatenated and passed through
    Linear -> Dropout -> ReLU -> Linear, producing one value per sample.
    """

    def __init__(self, num_users, text_input_dim=300, hidden_dim=128):
        """
        Args:
            num_users: number of rows of the user embedding table.
            text_input_dim: length of the review vector fed to the review tower.
            hidden_dim: output size of each tower.
        """
        super().__init__()
        self.user_tower = UserTower(num_users, hidden_dim=hidden_dim)
        self.review_tower = ReviewTower(text_input_dim, hidden_dim=hidden_dim)

        # The head input is twice hidden_dim: one slice per tower.
        head_layers = [
            nn.Linear(hidden_dim * 2, 256),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(256, 1),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, user_id, review_vec):
        """Return one predicted rating per sample, shape ``(batch,)``."""
        tower_outputs = [self.user_tower(user_id), self.review_tower(review_vec)]
        combined = torch.cat(tower_outputs, dim=-1)
        scores = self.head(combined)
        return scores.squeeze(1)

In [None]:
import time

##################################
############## DATA ##############
##################################
# 90/10 train/validation split of the labelled reviews (fixed seed).
train_df, val_df = train_test_split(df_train, test_size=0.1, random_state=42)

# NOTE(review): each dataset fits its own LabelEncoder, so the user indices
# of the training and validation datasets are not comparable.
train_dataset, val_dataset = (
    YelpReviewDataset(split, df_users, vectorizer) for split in (train_df, val_df)
)

# Batches of 256 samples, iterated in dataset order (no shuffling requested).
train_loader, val_loader = (
    DataLoader(dataset, batch_size=256) for dataset in (train_dataset, val_dataset)
)

In [None]:
##########################################
############## MODEL #####################
##########################################
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _encode_with_vocabulary(ids, vocabulary):
    """Map raw ids to their position in `vocabulary`.

    Ids that do not appear in `vocabulary` all receive the extra index
    ``len(vocabulary)``, so the result can index an embedding table with
    ``len(vocabulary) + 1`` rows.
    """
    index_of = {value: position for position, value in enumerate(vocabulary)}
    return ids.map(index_of).fillna(len(vocabulary)).astype("int64")


# Each YelpReviewDataset fits its own LabelEncoder, so index k in val_dataset
# does not denote the same user as index k in train_dataset. Re-encode the
# validation users with the training vocabulary; users never seen in
# training share one extra "unknown" index.
known_users = train_dataset.user_encoder.classes_
val_dataset.df["user_id_encoded"] = _encode_with_vocabulary(val_dataset.df["user_id"], known_users)

# One extra embedding row for the "unknown user" index.
model = TwoTowerModel(num_users=len(known_users) + 1, text_input_dim=5000).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)
criterion = nn.MSELoss()

print("--- 1/2 Entrenamiento ---")
for epoch in range(1):
    epoch_start = time.time()

    model.train()
    total_loss = 0
    for user_ids, review_vecs, stars in train_loader:

        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, review_vecs)
        loss = criterion(preds, stars)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    epoch_time = time.time() - epoch_start
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_time:.2f}s")

print("")
print("--- Validación ---")
model.eval()
total_loss = 0
with torch.no_grad():
    for user_ids, review_vecs, stars in val_loader:
        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, review_vecs)
        loss = criterion(preds, stars)
        total_loss += loss.item()

print(f"Validation Loss: {total_loss / len(val_loader):.4f}")

> Continuación del entrenamiento con los datos de validación

In [None]:
# Second training phase: one more epoch, this time over the validation
# batches, continuing with the same model and optimizer.
print("--- 2/2 Entrenamiento ---")
for epoch in range(1):
    started_at = time.time()

    model.train()
    total_loss = 0
    for batch in val_loader:
        user_ids, review_vecs, stars = (tensor.to(device) for tensor in batch)

        loss = criterion(model(user_ids, review_vecs), stars)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    epoch_time = time.time() - started_at
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(val_loader):.4f}, Time: {epoch_time:.2f}s")

In [None]:
import gc
import torch

# Optional memory clean-up, currently disabled:
# # drop the references explicitly
# del train_df, val_df, train_dataset, val_dataset, train_loader, val_loader
# # then run Python's garbage collector
# gc.collect()


# Expected output columns: review_id, stars
# NOTE(review): this dataset fits its own LabelEncoder, so its user indices
# are not the ones the model was trained with.
test_dataset = YelpReviewTestDataset(df_test, df_users, vectorizer)
test_loader = DataLoader(test_dataset, batch_size=256)

# Inference over the test set; progress is reported every 100 batches.
model.eval()
predictions = []
num_batches = len(test_loader)
with torch.no_grad():
    for i, batch in enumerate(test_loader):
        user_ids, review_vecs = (tensor.to(device) for tensor in batch)
        batch_preds = model(user_ids, review_vecs).cpu().numpy()
        predictions.extend(batch_preds)
        if i % 100 == 0:
            print(f"Batch {i}/{num_batches}")


> Prueba con TF-IDF de 20000

In [None]:
##############
### TF-IDF ###
##############
import time

vectorizer = TfidfVectorizer(max_features=20000)
vectorizer.fit(df_train["text"])


def _encode_with_vocabulary(ids, vocabulary):
    """Map raw ids to their position in `vocabulary`.

    Ids that do not appear in `vocabulary` all receive the extra index
    ``len(vocabulary)``, so the result can index an embedding table with
    ``len(vocabulary) + 1`` rows.
    """
    index_of = {value: position for position, value in enumerate(vocabulary)}
    return ids.map(index_of).fillna(len(vocabulary)).astype("int64")


##################################
############## DATA ##############
##################################
train_df, val_df = train_test_split(df_train, test_size=0.1, random_state=42)

train_dataset = YelpReviewDataset(train_df, df_users, vectorizer)
val_dataset = YelpReviewDataset(val_df, df_users, vectorizer)

# Each dataset fits its own LabelEncoder, so index k in the validation or
# test dataset does not denote the same user as index k in the training
# dataset. Re-encode them with the training vocabulary; users never seen in
# training share one extra "unknown" index.
known_users = train_dataset.user_encoder.classes_
val_dataset.df["user_id_encoded"] = _encode_with_vocabulary(val_dataset.df["user_id"], known_users)

train_loader = DataLoader(train_dataset, batch_size=256)
val_loader = DataLoader(val_dataset, batch_size=256)

#####################################################
############## TRAINING AND VALIDATION ##############
#####################################################

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One extra embedding row for the "unknown user" index.
model = TwoTowerModel(num_users=len(known_users) + 1, text_input_dim=20000).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=4e-3)
criterion = nn.MSELoss()

print("--- 1/2 Entrenamiento ---")
for epoch in range(1):
    epoch_start = time.time()

    model.train()
    total_loss = 0
    for user_ids, review_vecs, stars in train_loader:

        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, review_vecs)
        loss = criterion(preds, stars)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    epoch_time = time.time() - epoch_start
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_time:.2f}s")

print("")
print("--- Validación ---")
model.eval()
total_loss = 0
with torch.no_grad():
    for user_ids, review_vecs, stars in val_loader:
        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, review_vecs)
        loss = criterion(preds, stars)
        total_loss += loss.item()

print(f"Validation Loss: {total_loss / len(val_loader):.4f}")


# Second phase: one more epoch over the validation batches.
print("--- 2/2 Entrenamiento ---")
for epoch in range(1):
    epoch_start = time.time()

    model.train()
    total_loss = 0
    for user_ids, review_vecs, stars in val_loader:

        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, review_vecs)
        loss = criterion(preds, stars)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    epoch_time = time.time() - epoch_start
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(val_loader):.4f}, Time: {epoch_time:.2f}s")


#########################################
############## PREDICTIONS ##############
#########################################

test_dataset = YelpReviewTestDataset(df_test, df_users, vectorizer)
# Same user vocabulary as during training (see the note in the data section).
test_dataset.df["user_id_encoded"] = _encode_with_vocabulary(test_dataset.df["user_id"], known_users)
test_loader = DataLoader(test_dataset, batch_size=256)

model.eval()
predictions = []
with torch.no_grad():
    for i, (user_ids, review_vecs) in enumerate(test_loader):
        user_ids = user_ids.to(device)
        review_vecs = review_vecs.to(device)
        preds = model(user_ids, review_vecs)
        predictions.extend(preds.cpu().numpy())
        if i % 100 == 0:
            print(f"Batch {i}/{len(test_loader)}")

submission_df = pd.DataFrame({
    'review_id': df_test['review_id'],
    'stars': predictions
})
submission_df.to_csv('prediction_TwoTowers_8.csv', index=False)

### Segundo Experimento. TF-IDF + ThreeTower

In [None]:
class YelpReviewDataset(Dataset):
    """Labelled review samples for the three-tower model.

    Each item is ``(encoded user id, encoded business id, dense review
    vector, star rating)``.

    NOTE(review): both id encoders are fitted on this instance's dataframe
    only, so instances built from different dataframes do not share their
    id -> index mappings.
    """

    def __init__(self, df, df_users, df_businesses, text_vectorizer):
        """Encode user/business ids and vectorise the review text.

        Args:
            df: reviews; the "user_id", "business_id", "text" and "stars"
                columns are read.
            df_users: user table; stored indexed by "user_id" but not read
                anywhere else in this class.
            df_businesses: business table; stored indexed by "business_id"
                but not read anywhere else in this class.
            text_vectorizer: object with a ``transform`` method; it must
                already be fitted because only ``transform`` is called.
        """
        self.vectorizer = text_vectorizer
        self.df_users = df_users.set_index("user_id")
        self.df_businesses = df_businesses.set_index("business_id")
        self.df = df.reset_index(drop=True)

        # Label-encode user_id and business_id
        self.user_encoder = LabelEncoder()
        self.business_encoder = LabelEncoder()
        encoded_users = self.user_encoder.fit_transform(self.df["user_id"])
        self.df["user_id_encoded"] = encoded_users
        encoded_businesses = self.business_encoder.fit_transform(self.df["business_id"])
        self.df["business_id_encoded"] = encoded_businesses

        # Vectorise the text once; rows are densified per item.
        self.review_matrix = self.vectorizer.transform(self.df["text"])

    def __len__(self):
        """Number of reviews."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(user index, business index, review vector, rating)``."""
        record = self.df.iloc[idx]
        dense_row = self.review_matrix[idx].toarray()
        review_vec_dense = torch.tensor(dense_row, dtype=torch.float32).squeeze(0)
        rating = torch.tensor(record["stars"], dtype=torch.float32)
        return (
            record["user_id_encoded"],
            record["business_id_encoded"],
            review_vec_dense,
            rating,
        )


class YelpReviewTestDataset(Dataset):
    """Unlabelled review samples for the three-tower model.

    Each item is ``(encoded user id, encoded business id, dense review
    vector)``; no rating is returned.

    NOTE(review): both id encoders are fitted on this instance's dataframe
    only, so the indices are not the ones a model saw for the same ids
    during training.
    """

    def __init__(self, df, df_users, df_businesses, text_vectorizer):
        """Encode user/business ids and vectorise the review text.

        Args:
            df: reviews; the "user_id", "business_id" and "text" columns
                are read.
            df_users: user table; stored indexed by "user_id" but not read
                anywhere else in this class.
            df_businesses: business table; stored indexed by "business_id"
                but not read anywhere else in this class.
            text_vectorizer: object with a ``transform`` method; it must
                already be fitted because only ``transform`` is called.
        """
        self.vectorizer = text_vectorizer
        self.df_users = df_users.set_index("user_id")
        self.df_businesses = df_businesses.set_index("business_id")
        self.df = df.reset_index(drop=True)

        # Label-encode user_id and business_id
        self.user_encoder = LabelEncoder()
        self.business_encoder = LabelEncoder()
        encoded_users = self.user_encoder.fit_transform(self.df["user_id"])
        self.df["user_id_encoded"] = encoded_users
        encoded_businesses = self.business_encoder.fit_transform(self.df["business_id"])
        self.df["business_id_encoded"] = encoded_businesses

        # Vectorise the text once; rows are densified per item.
        self.review_matrix = self.vectorizer.transform(self.df["text"])

    def __len__(self):
        """Number of reviews."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(user index, business index, review vector)``."""
        record = self.df.iloc[idx]
        dense_row = self.review_matrix[idx].toarray()
        review_vec_dense = torch.tensor(dense_row, dtype=torch.float32).squeeze(0)

        # The test set carries no rating, so none is returned.
        return record["user_id_encoded"], record["business_id_encoded"], review_vec_dense


In [None]:
# Fit TF-IDF (at most 5000 terms) on the text of the whole df_train frame.
vectorizer = TfidfVectorizer(max_features=5000).fit(df_train["text"])
vectorizer

In [None]:
class UserTower(nn.Module):
    """User branch: embedding lookup followed by Linear + ReLU."""

    def __init__(self, num_users, embedding_dim=64, hidden_dim=128):
        """
        Args:
            num_users: number of rows of the embedding table.
            embedding_dim: size of each user embedding.
            hidden_dim: size of the tower output.
        """
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        layers = [nn.Linear(embedding_dim, hidden_dim), nn.ReLU()]
        self.fc = nn.Sequential(*layers)

    def forward(self, user_id):
        """Return the tower output for a tensor of user indices."""
        embedded = self.user_embedding(user_id)
        return self.fc(embedded)

class BusinessTower(nn.Module):
    """Business branch: embedding lookup followed by Linear + ReLU."""

    def __init__(self, num_businesses, embedding_dim=64, hidden_dim=128):
        """
        Args:
            num_businesses: number of rows of the embedding table.
            embedding_dim: size of each business embedding.
            hidden_dim: size of the tower output.
        """
        super().__init__()
        self.business_embedding = nn.Embedding(num_businesses, embedding_dim)
        layers = [nn.Linear(embedding_dim, hidden_dim), nn.ReLU()]
        self.fc = nn.Sequential(*layers)

    def forward(self, business_id):
        """Return the tower output for a tensor of business indices."""
        embedded = self.business_embedding(business_id)
        return self.fc(embedded)

class ReviewTower(nn.Module):
    """Review branch: Linear + ReLU over a dense review vector."""

    def __init__(self, input_dim=300, hidden_dim=128):
        """
        Args:
            input_dim: length of the incoming review vector.
            hidden_dim: size of the tower output.
        """
        super().__init__()
        layers = [nn.Linear(input_dim, hidden_dim), nn.ReLU()]
        self.fc = nn.Sequential(*layers)

    def forward(self, review_vec):
        """Return the tower output for a batch of review vectors."""
        projected = self.fc(review_vec)
        return projected

class ThreeTowerModel(nn.Module):
    """Rating regressor combining user, business and review towers.

    The three tower outputs are concatenated and passed through
    Linear -> Dropout -> ReLU -> Linear, producing one value per sample.
    """

    def __init__(self, num_users, num_businesses, text_input_dim=300, hidden_dim=128):
        """
        Args:
            num_users: number of rows of the user embedding table.
            num_businesses: number of rows of the business embedding table.
            text_input_dim: length of the review vector fed to the review tower.
            hidden_dim: output size of each tower.
        """
        super().__init__()
        # All three towers emit hidden_dim features so they can be concatenated.
        self.user_tower = UserTower(num_users, embedding_dim=64, hidden_dim=hidden_dim)
        self.business_tower = BusinessTower(num_businesses, embedding_dim=64, hidden_dim=hidden_dim)
        self.review_tower = ReviewTower(input_dim=text_input_dim, hidden_dim=hidden_dim)

        # The head input is three times hidden_dim: one slice per tower.
        head_layers = [
            nn.Linear(hidden_dim * 3, 256),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(256, 1),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, user_id, business_id, review_vec):
        """Return one predicted rating per sample, shape ``(batch,)``."""
        tower_outputs = [
            self.user_tower(user_id),
            self.business_tower(business_id),
            self.review_tower(review_vec),
        ]
        combined = torch.cat(tower_outputs, dim=-1)

        scores = self.head(combined)
        return scores.squeeze(1)


In [None]:
##################################
############## DATA ##############
##################################

# 90/10 train/validation split of the labelled reviews (fixed seed).
train_df, val_df = train_test_split(df_train, test_size=0.1, random_state=42)

# NOTE(review): each dataset fits its own LabelEncoders, so the user and
# business indices of the two datasets are not comparable.
train_dataset, val_dataset = (
    YelpReviewDataset(split, df_users, df_businesses, vectorizer)
    for split in (train_df, val_df)
)

# Batches of 256 samples, iterated in dataset order (no shuffling requested).
train_loader, val_loader = (
    DataLoader(dataset, batch_size=256) for dataset in (train_dataset, val_dataset)
)

#####################################################
############## TRAINING AND VALIDATION ##############
#####################################################

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
def _encode_with_vocabulary(ids, vocabulary):
    """Map raw ids to their position in `vocabulary`.

    Ids that do not appear in `vocabulary` all receive the extra index
    ``len(vocabulary)``, so the result can index an embedding table with
    ``len(vocabulary) + 1`` rows.
    """
    index_of = {value: position for position, value in enumerate(vocabulary)}
    return ids.map(index_of).fillna(len(vocabulary)).astype("int64")


# Each dataset fits its own LabelEncoders, so index k in the validation or
# test dataset does not denote the same user/business as index k in the
# training dataset. Re-encode them with the training vocabularies; ids never
# seen in training share one extra "unknown" index per vocabulary.
known_users = train_dataset.user_encoder.classes_
known_businesses = train_dataset.business_encoder.classes_
val_dataset.df["user_id_encoded"] = _encode_with_vocabulary(val_dataset.df["user_id"], known_users)
val_dataset.df["business_id_encoded"] = _encode_with_vocabulary(val_dataset.df["business_id"], known_businesses)

# The model receives both the user index and the business index; each
# embedding table gets one extra row for the "unknown" index.
model = ThreeTowerModel(
    num_users=len(known_users) + 1,
    num_businesses=len(known_businesses) + 1,
    text_input_dim=5000  # must match the TF-IDF vocabulary size
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=4e-3)
criterion = nn.MSELoss()

print("--- 1/2 Entrenamiento ---")
for epoch in range(1):
    epoch_start = time.time()

    model.train()
    total_loss = 0
    for user_ids, business_ids, review_vecs, stars in train_loader:

        user_ids = user_ids.to(device)
        business_ids = business_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, business_ids, review_vecs)
        loss = criterion(preds, stars)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    epoch_time = time.time() - epoch_start
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_time:.2f}s")

print("")
print("--- Validación ---")
model.eval()
total_loss = 0
with torch.no_grad():
    for user_ids, business_ids, review_vecs, stars in val_loader:
        user_ids = user_ids.to(device)
        business_ids = business_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, business_ids, review_vecs)
        loss = criterion(preds, stars)
        total_loss += loss.item()

print(f"Validation Loss: {total_loss / len(val_loader):.4f}")

# Second phase: one more epoch over the validation batches.
print("--- 2/2 Entrenamiento ---")
for epoch in range(1):
    epoch_start = time.time()

    model.train()
    total_loss = 0
    for user_ids, business_ids, review_vecs, stars in val_loader:

        user_ids = user_ids.to(device)
        business_ids = business_ids.to(device)
        review_vecs = review_vecs.to(device)
        stars = stars.to(device)

        preds = model(user_ids, business_ids, review_vecs)
        loss = criterion(preds, stars)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    epoch_time = time.time() - epoch_start
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(val_loader):.4f}, Time: {epoch_time:.2f}s")

#########################################
############## PREDICTIONS ##############
#########################################

# Test dataset, re-encoded with the training vocabularies as well.
test_dataset = YelpReviewTestDataset(df_test, df_users, df_businesses, vectorizer)
test_dataset.df["user_id_encoded"] = _encode_with_vocabulary(test_dataset.df["user_id"], known_users)
test_dataset.df["business_id_encoded"] = _encode_with_vocabulary(test_dataset.df["business_id"], known_businesses)
test_loader = DataLoader(test_dataset, batch_size=256)

model.eval()
predictions = []
with torch.no_grad():
    for i, (user_ids, business_ids, review_vecs) in enumerate(test_loader):
        user_ids = user_ids.to(device)
        business_ids = business_ids.to(device)
        review_vecs = review_vecs.to(device)
        preds = model(user_ids, business_ids, review_vecs)
        predictions.extend(preds.cpu().numpy())
        if i % 100 == 0:
            print(f"Batch {i}/{len(test_loader)}")

# Write the predictions to the submission file.
submission_df = pd.DataFrame({
    'review_id': df_test['review_id'],
    'stars': predictions
})
submission_df.to_csv('prediction_TwoTowers_with_business.csv', index=False)
