In [7]:
# To visualize plots in the notebook
%matplotlib inline

import numpy as np
import pandas as pd # To read data tables from csv files
import seaborn as sns # To plot statistical graphics
import matplotlib.pyplot as plt # To plot the figures

import os
from termcolor import colored
import tqdm
import scipy
import gc

# For plots and graphical results
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pylab

# That's default image size for this interactive session
pylab.rcParams['figure.figsize'] = 9, 6

In [8]:
from google.colab import drive
drive.mount('/content/drive')

# Cargar los datos desde el archivo JSON
data_path = '/content/drive/My Drive/proyecto/full_format_recipes.json'  # Clara
#data_path = '/content/drive/My Drive/Colab Notebooks/proyecto/full_format_recipes.json'  # Jorge
recipes_df = pd.read_json(data_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Comparación con fine-tuning de modelo preentrenado *Hugging Face***

In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import scipy
import torch
import json
import random
from collections import defaultdict
import nltk
nltk.download("punkt")

# Figures plotted inside the notebook
%matplotlib inline
# High quality figures
%config InlineBackend.figure_format = 'retina'
# Figures style
sns.set_style("darkgrid")
sns.color_palette("deep")
# Figues size
plt.rcParams['figure.figsize'] = [8, 6]

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore',module='gradio')

from transformers import DistilBertModel, DistilBertTokenizer, Trainer, TrainingArguments # Changed to DistilBert
import torch.nn as nn
from sklearn.metrics import mean_squared_error, r2_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**Con 2000 muestras**

In [None]:
# Preparamos los datos
reducidos = recipes_df.sample(n=2000, random_state=42)
reducidos = reducidos.dropna(subset=['directions', 'rating'])
texts = reducidos['directions'].tolist()
labels = reducidos['rating'].tolist()

# Extraemos los embeddings
# Cargamos el tokenizer y el modelo preentrenado de DistilBERT
model_name = 'distilbert-base-uncased'
embedding_tokenizer = DistilBertTokenizer.from_pretrained(model_name)
embedding_model = DistilBertModel.from_pretrained(model_name)

# Fine tuning
# Modelo de regresión basado en los embeddings de DistilBERT
class DistilBertRegressionModel(nn.Module):
    def __init__(self, bert_model):
        super(DistilBertRegressionModel, self).__init__()
        self.bert = bert_model
        self.regressor = nn.Linear(bert_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        cls_embedding = last_hidden_state[:, 0, :]
        regression_output = self.regressor(cls_embedding)
        return regression_output

# Cargar tokenizer y modelo de regresión
regression_tokenizer = DistilBertTokenizer.from_pretrained(model_name)
regression_model = DistilBertRegressionModel(DistilBertModel.from_pretrained(model_name))


# Entrenamiento y evaluación
# Tokenizar los datos de entrada
def tokenize_data(texts, tokenizer, max_length=128):
    lattened_texts = [' '.join([str(element) for element in (sublist if isinstance(sublist, list) else [sublist])]) for sublist in texts]

    return tokenizer(lattened_texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt', is_split_into_words=False)

encodings = tokenize_data(texts, regression_tokenizer)

# Creamos un dataset personalizado para regresión
class RegressionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

# Dividimos los datos en entrenamiento y evaluación
train_size = int(0.8 * len(texts))
test_size = len(texts) - train_size
train_texts, test_texts = texts[:train_size], texts[train_size:]
train_labels, test_labels = labels[:train_size], labels[train_size:]

train_encodings = tokenize_data(train_texts, regression_tokenizer)
test_encodings = tokenize_data(test_texts, regression_tokenizer)

train_dataset = RegressionDataset(train_encodings, train_labels)
test_dataset = RegressionDataset(test_encodings, test_labels)

# Configuración de los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir='./results',            # Directorio de salida
    num_train_epochs=1,                # Número de épocas
    per_device_train_batch_size=16,    # Tamaño del batch de entrenamiento
    per_device_eval_batch_size=64,     # Tamaño del batch de evaluación
    warmup_steps=500,                  # Número de pasos de calentamiento
    weight_decay=0.01,                 # Decaimiento del peso
    logging_dir='./logs',              # Directorio de logs
    logging_steps=10,                  # Frecuencia de los logs
    report_to='none'                   # Desactiva wandb
)


# Métricas MSE y R2
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze()
    mse = mean_squared_error(labels, predictions)

    return {"eval_mse": mse}

# Configuración del trainer
trainer = Trainer(
    model=regression_model,            # Modelo preentrenado
    args=training_args,               # Argumentos de entrenamiento
    train_dataset=train_dataset,      # Dataset de entrenamiento
    eval_dataset=test_dataset,        # Dataset de evaluación
    compute_metrics=compute_metrics
)

# Fine-tuning del modelo
trainer.train()

# Get predictions and calculate MSE
predictions = trainer.predict(test_dataset)
predicted_ratings = predictions.predictions.flatten()

# Get the actual labels from the test dataset
actual_labels = [test_dataset[i]['labels'].item() for i in range(len(test_dataset))]

# Calculate MSE using the actual labels and predicted ratings
mse = mean_squared_error(actual_labels, predicted_ratings)
print(f"MSE: {mse:.4f}")




Step,Training Loss
10,-0.0512
20,-0.177
30,-0.4953
40,-1.0181
50,-2.0586
60,-3.2401
70,-4.5465
80,-5.7107
90,-6.542
100,-7.5903


MSE: 144.7877


**Con 10000 muestras**

In [None]:
from transformers import DistilBertModel, DistilBertTokenizer, Trainer, TrainingArguments
from sklearn.metrics import mean_squared_error, r2_score
from torch.utils.data import Dataset, DataLoader

# Preparamos los datos
reducidos = recipes_df.sample(n=10000, random_state=42)
reducidos = reducidos.dropna(subset=['directions', 'rating'])
texts = reducidos['directions'].tolist()
labels = reducidos['rating'].tolist()


# Extraemos los embeddings
# Cargar el tokenizer y el modelo preentrenado de DistilBERT
model_name = 'distilbert-base-uncased'
embedding_tokenizer = DistilBertTokenizer.from_pretrained(model_name)
embedding_model = DistilBertModel.from_pretrained(model_name)


# Fine-tuning para regresión
# Modelo de regresión basado en los embeddings de DistilBERT
class DistilBertRegressionModel(nn.Module):
    def __init__(self, bert_model):
        super(DistilBertRegressionModel, self).__init__()
        self.bert = bert_model
        self.regressor = nn.Linear(bert_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        cls_embedding = last_hidden_state[:, 0, :]
        regression_output = self.regressor(cls_embedding)
        return regression_output

# Cargamos tokenizer y modelo de regresión
regression_tokenizer = DistilBertTokenizer.from_pretrained(model_name)
regression_model = DistilBertRegressionModel(DistilBertModel.from_pretrained(model_name))


# Entrenamiento y evaluación
# Tokenizar los datos de entrada
def tokenize_data(texts, tokenizer, max_length=128):
    flattened_texts = [' '.join([str(element) for element in (sublist if isinstance(sublist, list) else [sublist])]) for sublist in texts]
    return tokenizer(flattened_texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')

# Creamos un dataset personalizado para regresión
class RegressionDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

# Tokenizamos todos los datos
encodings = tokenize_data(texts, regression_tokenizer)

# Creamos el dataset
dataset = RegressionDataset(encodings, labels)

# Dividimos en conjuntos de entrenamiento y prueba
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

# Creamos dataloaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4)


# Define TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',            # Directorio de salida
    num_train_epochs=1,                # Número de épocas
    per_device_train_batch_size=16,    # Tamaño del batch de entrenamiento
    per_device_eval_batch_size=64,     # Tamaño del batch de evaluación
    warmup_steps=500,                  # Número de pasos de calentamiento
    weight_decay=0.01,                 # Decaimiento del peso
    logging_dir='./logs',              # Directorio de logs
    logging_steps=10,                  # Frecuencia de los logs
    report_to='none'                   # Desactiva wandb
)

# Create Trainer instance
trainer = Trainer(
    model=regression_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tuning
trainer.train()

# Cálculo predicciones
predictions = trainer.predict(test_dataset)
predicted_ratings = predictions.predictions.flatten()

# tomamos las etiquetas del dataset
actual_labels = [test_dataset[i]['labels'].item() for i in range(len(test_dataset))]

# Calculamos el MSE
mse = mean_squared_error(actual_labels, predicted_ratings)
print(f"MSE: {mse:.4f}")


Step,Training Loss
10,-0.2103
20,-0.272
30,-0.6137
40,-1.153
50,-2.042
60,-3.2529
70,-4.2344
80,-5.5705
90,-6.6395
100,-7.3497


MSE: 410.5475
