In [None]:
import sys
sys.path.insert(1, '../') 
from utils.loader import Loader
from utils.evaluator import Evaluator

import tensorflow as tf
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import re
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

# Fetch the NLTK stopword corpus that stopwords.words() reads below
nltk.download('stopwords')

# Spanish stopwords, kept as a set
spanish_stopwords = set(stopwords.words('spanish'))


# The model in this notebook is trained through PyTorch (transformers.Trainer),
# so ask PyTorch, not TensorFlow, whether a CUDA device is usable.
print("¿GPU disponible?:", torch.cuda.is_available())

In [None]:
# Load the raw dataset through the project's Loader helper. The cells below use
# the result as a pandas DataFrame (.columns, .apply, column selection).
train = Loader.load_NLP()

In [None]:
def custom_concat(row, cols):
    """Join the text of selected columns of one row into a single description.

    For every name in ``cols`` that is present in ``row.index``, the cell's
    text is used as-is when it is a string other than the sentinel
    ``"no tiene"``. For anything else (the sentinel itself, NaN, ``None``,
    ``pd.NA``, numbers, ...) the placeholder ``"no tiene <column name>"`` is
    used instead.

    Args:
        row: A pandas Series (one DataFrame row) indexed by column name.
        cols: Iterable of column names to include, in output order. Names
            that are not in ``row.index`` are skipped.

    Returns:
        The fragments joined by single spaces; ``""`` when nothing matched.
    """
    parts = []
    for col_name in cols:
        if col_name not in row.index:
            continue
        value = row[col_name]
        # Check the type before comparing: ``pd.NA == "no tiene"`` evaluates to
        # pd.NA, and taking its truth value raises TypeError.
        if isinstance(value, str) and value != "no tiene":
            parts.append(value)
        else:
            parts.append(f"no tiene {col_name}")
    return ' '.join(parts)

# Build the reduced frame (price, km, fuelType, full_description) from the raw data
def filter_train_data(train):
    """Return a reduced copy of ``train`` with a combined text column.

    Every column whose name contains ``"description"`` is merged row by row
    with ``custom_concat`` into ``full_description``. Only ``price``, ``km``,
    ``fuelType`` and ``full_description`` are kept, and rows with a missing
    value in any of those columns are dropped.

    The input frame is left untouched: the new column is attached with
    ``assign`` and ``dropna`` is not run in place, so the caller's DataFrame is
    not mutated and no in-place operation is performed on a column subset.

    Args:
        train: DataFrame containing ``price``, ``km``, ``fuelType`` and the
            description columns.

    Returns:
        A new DataFrame with the four columns listed above and no missing
        values.
    """
    descriptions = [col for col in train.columns if "description" in col]
    full_description = train.apply(custom_concat, axis=1, args=(descriptions,))
    filtered_columns = ["price", "km", "fuelType", "full_description"]
    with_text = train.assign(full_description=full_description)
    return with_text[filtered_columns].dropna()

# Reduce the raw frame to the modelling columns and drop incomplete rows.
train = filter_train_data(train)

# Standardise mileage and price column by column; the fitted scalers are kept
# in module-level names so they stay available after this cell.
# NOTE(review): both scalers are fitted on the full frame, before the
# train/validation split performed further down, so validation rows contribute
# to the scaling statistics.
km_scaler = StandardScaler()
train["km"] = km_scaler.fit_transform(train[["km"]].to_numpy())

price_scaler = StandardScaler()
train["price"] = price_scaler.fit_transform(train[["price"]].to_numpy())

In [None]:
# Token limit per description; passed as max_length to the tokenizer calls below.
verb_size = 128
# Hugging Face model identifier used for both the tokenizer and the model.
model_name = 'dccuchile/bert-base-spanish-wwm-cased'  # BETO


In [None]:
# Hold out 20% of the rows for validation. Inputs are the combined descriptions,
# targets are the (already standardised) prices; random_state fixes the shuffle.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train["full_description"], train["price"], test_size=0.2, random_state=42
)

In [None]:
# Load the tokenizer that belongs to the checkpoint named in model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
# Both splits are tokenized with the same options: truncate to at most
# verb_size tokens and pad within each call.
encode_options = {"truncation": True, "padding": True, "max_length": verb_size}

train_encodings = tokenizer(list(train_texts), **encode_options)
val_encodings = tokenizer(list(val_texts), **encode_options)


In [None]:
class RegressionDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with float regression targets.

    Args:
        encodings: Mapping from feature name to a sequence holding one entry
            per example (anything exposing ``.items()`` whose values can be
            indexed by position).
        labels: One target per example. Any 1-D array-like works (pandas
            Series, NumPy array, list, tuple); it is stored as a float32
            pandas Series in ``self.labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Wrap in a Series first so plain lists/tuples/ndarrays are accepted
        # too; a Series passed in keeps its index and is only cast to float32.
        self.labels = pd.Series(labels).astype(np.float32)

    def __getitem__(self, idx):
        """Return the features and the label of example ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Positional lookup: the Series index need not be 0..n-1.
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label count."""
        return len(self.labels)

In [None]:
# Wrap each split's encodings and targets in the dataset class defined above.
train_dataset = RegressionDataset(train_encodings, train_labels)
val_dataset = RegressionDataset(val_encodings, val_labels)

# Load the pretrained checkpoint with a sequence-classification head configured
# for a single continuous output.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,                 # one output value per example
    problem_type='regression'     # treat the single output as a regression target
)


In [None]:
# Hyper-parameters and bookkeeping options for the Trainer built in the next cell.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy='epoch',    # evaluate once per epoch
    save_strategy='epoch',          # checkpoint once per epoch, same cadence as evaluation
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=1,
    load_best_model_at_end=True,
)

In [None]:
# Assemble the Trainer from the model, the arguments and both dataset splits.
# NOTE(review): no compute_metrics is passed here; it is only attached to the
# trainer in a later cell, after trainer.train() has already run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [None]:
# Run fine-tuning with the settings in training_args.
trainer.train()


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def compute_metrics(eval_pred):
    """Compute MSE and MAE for a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Two-element iterable ``(predictions, labels)``; both
            elements must support ``.reshape``.

    Returns:
        Dict with keys ``'MSE'`` and ``'MAE'``.
    """
    raw_predictions, raw_labels = eval_pred
    # Flatten both sides to 1-D so they line up element by element.
    y_pred = raw_predictions.reshape(-1)
    y_true = raw_labels.reshape(-1)
    return {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
    }


In [None]:
# Attach the metric function to the existing trainer so that the evaluation
# call below uses it.
trainer.compute_metrics = compute_metrics

In [None]:
# Evaluate on the eval_dataset given to the Trainer (val_dataset). Labels were
# standardised with price_scaler, so the errors are in scaled units.
trainer.evaluate()

In [None]:
# Save the model and the tokenizer into the same directory.
model.save_pretrained('./beto_model')
tokenizer.save_pretrained('./beto_model')