In [None]:
import sys
sys.path.insert(1, '../') 
from utils.loader import Loader
from utils.evaluator import Evaluator

from sklearn.metrics import mean_squared_error
import tensorflow as tf
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import re
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torchmetrics import MeanSquaredError
from torch.utils.data import Dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments, BertModel
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

# Download the NLTK stopword corpus if it is not already available locally
nltk.download('stopwords')

# Load the Spanish stopword list (not referenced again in the visible cells)
spanish_stopwords = set(stopwords.words('spanish'))


# Training below runs on PyTorch (Hugging Face Trainer), so report whether
# PyTorch can see a CUDA device instead of listing TensorFlow's devices.
print("¿GPU disponible?:", torch.cuda.is_available())

In [None]:
# Load the raw dataset through the project's Loader helper (utils.loader);
# the result is used below as a pandas DataFrame.
train = Loader.load_NLP()

In [None]:
def custom_concat(row, cols):
    """Build one free-text description from several columns of a row.

    For each name in ``cols`` that is present in ``row.index`` (in the order
    given) the cell contributes its own text. A cell that is not a string
    (``None``, ``NaN``, ``pd.NA``, numbers, ...) or that equals the literal
    ``"no tiene"`` contributes ``"no tiene <column name>"`` instead.

    Args:
        row: A pandas Series, e.g. one DataFrame row when called through
            ``DataFrame.apply(..., axis=1)``.
        cols: Iterable of column names to concatenate.

    Returns:
        The parts joined with single spaces; ``""`` when no name matches.
    """
    parts = []
    for col_name in cols:
        # Names that are not present in the row are skipped silently.
        if col_name not in row.index:
            continue
        value = row[col_name]
        # Check the type first: comparing a missing marker such as pd.NA with
        # a string yields NA, whose truth value is ambiguous and raises.
        if not isinstance(value, str) or value == "no tiene":
            parts.append(f"no tiene {col_name}")
        else:
            parts.append(value)
    return ' '.join(parts)

# Build the modelling frame: merged description text plus price, km and fuelType
def filter_train_data(train):
    """Return the modelling subset of ``train`` with a merged description column.

    Every column whose name contains ``"description"`` is merged row by row
    with ``custom_concat`` into a ``full_description`` column. Only ``price``,
    ``km``, ``fuelType`` and ``full_description`` are kept, and rows with a
    missing value in any of them are dropped.

    The input frame is not modified: a new DataFrame is returned instead of
    adding a column to the caller's frame and dropping rows in place on a
    column subset.

    Args:
        train: DataFrame containing ``price``, ``km``, ``fuelType`` and the
            description columns.

    Returns:
        A new DataFrame with the columns ``price``, ``km``, ``fuelType`` and
        ``full_description``, in that order.
    """
    descriptions = [col for col in train.columns if "description" in col]
    full_description = train.apply(custom_concat, axis=1, args=(descriptions,))
    kept_columns = ["price", "km", "fuelType"]
    # assign() returns a fresh frame, so neither the caller's data nor an
    # intermediate slice is mutated.
    return train[kept_columns].assign(full_description=full_description).dropna()

train = filter_train_data(train)

# Standardise mileage with its own scaler.
# NOTE(review): both scalers are fitted on the whole frame, before the
# train/validation split done in a later cell, so validation rows contribute
# to the fitted statistics.
km_scaler = StandardScaler()
km_column = train["km"].to_numpy().reshape(-1, 1)
train["km"] = km_scaler.fit_transform(km_column)

# Standardise the target as well; price_scaler is reused later to map
# predictions back to the original price scale.
price_scaler = StandardScaler()
price_column = train["price"].to_numpy().reshape(-1, 1)
train["price"] = price_scaler.fit_transform(price_column)

In [None]:
# Maximum sequence length handed to the tokenizer (max_length) when encoding texts
verb_size = 128
# Hugging Face model identifier used for both the tokenizer and the encoder
model_name = 'dccuchile/bert-base-spanish-wwm-cased'  # BETO


In [None]:
# Hold out 20% of the rows for validation. Texts, scaled prices (labels) and
# scaled km are split in a single call so the three stay row-aligned;
# random_state fixes the shuffle for reproducibility.
train_texts, val_texts, train_labels, val_labels, train_km, val_km = train_test_split(
    train["full_description"],
    train["price"],
    train["km"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the tokenizer that matches the pretrained checkpoint named in model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
# Tokenize both splits with identical settings: truncate to verb_size tokens
# and pad each split to a common length.
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=verb_size)
train_encodings = tokenizer(list(train_texts), **tokenizer_kwargs)
val_encodings = tokenizer(list(val_texts), **tokenizer_kwargs)


In [None]:
class RegressionDataset(Dataset):
    """Torch dataset pairing tokenized texts with a regression target and km.

    Args:
        encodings: Mapping from tokenizer field name (e.g. ``input_ids``) to a
            per-sample sequence, such as the output of the tokenizer call in
            this notebook.
        labels: Regression targets, one per sample. Any 1-D array-like is
            accepted (pandas Series, NumPy array, list); values are stored as
            float32 and addressed by position, not by pandas index label.
        km_values: Mileage feature, one per sample, as a 1-D array-like.

    Each item is a dict holding one tensor per encoding field plus the scalar
    tensors ``labels`` and ``km``.
    """

    def __init__(self, encodings, labels, km_values):
        self.encodings = encodings
        # Convert once up front, the same way km_values is handled, instead of
        # requiring a pandas Series and building a tensor on every lookup.
        self.labels = torch.tensor(np.asarray(labels, dtype=np.float32))
        self.km_values = torch.tensor(km_values, dtype=torch.float32)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        item['km'] = self.km_values[idx]
        return item

    def __len__(self):
        return len(self.labels)


In [None]:
# Wrap each split (encodings, scaled price labels, scaled km) in a torch Dataset
train_dataset = RegressionDataset(train_encodings, train_labels, train_km.values)
val_dataset = RegressionDataset(val_encodings, val_labels, val_km.values)

class CustomRegressionModel(torch.nn.Module):
    """BERT encoder combined with a numeric ``km`` feature for regression.

    The encoder's pooled output is concatenated with a 16-dimensional linear
    projection of ``km`` and mapped to a single value by a linear head.

    Args:
        bert_model_name: Name or path passed to ``BertModel.from_pretrained``.
        km_dim: Number of input features of the ``km`` projection.
    """

    def __init__(self, bert_model_name, km_dim=1):
        super().__init__()
        km_hidden = 16  # width of the projected km feature
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.km_layer = torch.nn.Linear(km_dim, km_hidden)
        # The head consumes the encoder output and the km projection side by side.
        self.regressor = torch.nn.Linear(self.bert.config.hidden_size + km_hidden, 1)

    def forward(self, input_ids, attention_mask, km, labels=None):
        """Run the model and optionally compute the MSE loss.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            km: Mileage feature; a 1-D tensor is expanded to a column.
            labels: Optional regression targets. When given, the mean squared
                error between the flattened logits and labels is returned.

        Returns:
            Dict with ``loss`` (``None`` without labels), ``logits`` and the
            ``labels`` argument passed through unchanged.
        """
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output

        # The linear layer needs a trailing feature dimension.
        km_2d = km.unsqueeze(1) if km.dim() == 1 else km
        features = torch.cat((pooled, self.km_layer(km_2d)), dim=1)
        logits = self.regressor(features)

        if labels is None:
            loss = None
        else:
            loss = torch.nn.functional.mse_loss(logits.view(-1), labels.view(-1))

        # NOTE(review): labels are echoed back in the output; downstream cells
        # index predictions[0], presumably because the Trainer then returns a
        # tuple. Confirm before removing this key.
        return {"loss": loss, "logits": logits, "labels": labels}
    
# Instantiate the regression model on top of the pretrained checkpoint named in model_name
model = CustomRegressionModel(model_name)

In [None]:
# torchmetrics MSE object; not referenced by the visible cells
# (compute_metrics below uses sklearn's mean_squared_error instead).
mse_metric = MeanSquaredError()


def compute_metrics(pred):
    """Compute the mean squared error reported during evaluation.

    Args:
        pred: Evaluation output exposing ``predictions`` (whose first element
            holds the predicted values) and ``label_ids``.

    Returns:
        ``{"mse": <mean squared error between labels and predictions>}``.
    """
    # Only the first element of predictions is used; it is flattened to 1-D
    # before being compared with the labels.
    predicted = np.asarray(pred.predictions[0]).ravel()
    targets = np.asarray(pred.label_ids)
    return {"mse": mean_squared_error(targets, predicted)}


# Training configuration for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=50,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy='epoch',  # Evaluate at the end of every epoch
    save_strategy='epoch',  # Save a checkpoint at the end of every epoch
    logging_strategy='epoch',
    logging_dir='./logs',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_mse',  # Select the best checkpoint by validation MSE
    # MSE is an error: lower is better. Without this, a custom metric is
    # treated as "higher is better" and the worst checkpoint would be reloaded.
    greater_is_better=False,
    disable_tqdm=False
)


In [None]:
# Wire model, training configuration, both datasets and the metric function
# into the Hugging Face Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


In [None]:
# Fine-tune, persist the resulting model to disk, then evaluate it on the
# validation set passed to the Trainer.
trainer.train()
output_model_dir = './final_model'
trainer.save_model(output_model_dir)
metrics = trainer.evaluate()
print("Returned metrics:", metrics)


In [None]:
# Run predictions on the validation set
predictions = trainer.predict(val_dataset)

# Map the standardised predictions and labels back to the original price scale.
# predictions.predictions is indexed with [0], matching compute_metrics above.
predicted_values = price_scaler.inverse_transform(predictions.predictions[0])  
true_values = price_scaler.inverse_transform(val_labels.values.reshape(-1, 1))  

In [None]:
# Sanity-check that predictions and targets have matching shapes
print(predicted_values.shape)
print(true_values.shape)
# Score the de-standardised predictions with the project's Evaluator helper,
# passing both arrays flattened to 1-D.
# NOTE(review): n_features=2 presumably counts the description text and km — confirm against Evaluator.
Evaluator.eval_regression(y_pred=np.array(predicted_values.ravel()), y_true=np.array(true_values.ravel()), plot=False, n_features=2, regressor_name="BETO")
Evaluator.save("BETO")