In [2]:
!pip install kaggle




In [1]:
import pandas as pd
import kagglehub
import numpy as np
import re

from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

file_path = kagglehub.dataset_download("mexwell/amazon-reviews-multi")
csv_path = file_path + "/train.csv"

df = pd.read_csv(csv_path, encoding="latin-1")

print("Shape:", df.shape)
print("Columnas:", df.columns)
print(df.head())

Downloading from https://www.kaggle.com/api/v1/datasets/download/mexwell/amazon-reviews-multi?dataset_version_number=1...


100%|██████████| 131M/131M [00:03<00:00, 36.0MB/s]

Extracting files...





Shape: (1200000, 9)
Columnas: Index(['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars',
       'review_body', 'review_title', 'language', 'product_category'],
      dtype='object')
   Unnamed: 0   review_id          product_id          reviewer_id  stars  \
0           0  de_0203609  product_de_0865382  reviewer_de_0267719      1   
1           1  de_0559494  product_de_0678997  reviewer_de_0783625      1   
2           2  de_0238777  product_de_0372235  reviewer_de_0911426      1   
3           3  de_0477884  product_de_0719501  reviewer_de_0836478      1   
4           4  de_0270868  product_de_0022613  reviewer_de_0736276      1   

                                         review_body  \
0     Armband ist leider nach 1 Jahr kaputt gegangen   
1                 In der Lieferung war nur Ein Akku!   
2  Ein Stern, weil gar keine geht nicht. Es hande...   
3  Dachte, das wÃ¤ren einfach etwas festere Binde...   
4  Meine Kinder haben kaum damit gespielt und nac...   

     

In [2]:
#Preprocesamiento

df["text"] = df["review_title"].fillna("") + " " + df["review_body"].fillna("")
df["labels"] = df["stars"] - 1
df = df[["text", "labels", "language"]]
dataset = Dataset.from_pandas(df)

# Splits
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_val = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = train_val["train"]
validation_dataset = train_val["test"]
test_dataset = dataset["test"]

print("Splits:")
print("Train:", train_dataset.shape)
print("Validation:", validation_dataset.shape)
print("Test:", test_dataset.shape)

Splits:
Train: (972000, 3)
Validation: (108000, 3)
Test: (120000, 3)


In [3]:
#Tokenización
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.unk_token

def preprocess(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=180
    )

train_dataset = train_dataset.map(preprocess, batched=True)
validation_dataset = validation_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

train_dataset = train_dataset.remove_columns(["text", "language"])
validation_dataset = validation_dataset.remove_columns(["text", "language"])
test_dataset = test_dataset.remove_columns(["text", "language"])

train_dataset.set_format(type="torch")
validation_dataset.set_format(type="torch")
test_dataset.set_format(type="torch")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/972000 [00:00<?, ? examples/s]

Map:   0%|          | 0/108000 [00:00<?, ? examples/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

In [None]:
#Fine-tunning de RoBERTa
id2label = {i: f"{i+1}_stars" for i in range(5)}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
    id2label=id2label,
    label2id=label2id
)


#Métricas
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision_macro": precision_score(labels, preds, average="macro", zero_division=0),
        "recall_macro": recall_score(labels, preds, average="macro", zero_division=0),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0)
    }


training_args = TrainingArguments(
    output_dir="./xlm-roberta-amazon",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-5,
    save_strategy="epoch",
    eval_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    warmup_ratio=0.10,
    weight_decay=0.01,
    seed=42,
    report_to="none"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print(" Iniciando entrenamiento...")
train_result = trainer.train()
print("\n Entrenamiento finalizado. Métricas:")
print(train_result.metrics)


trainer.save_model("./xlm_roberta_finetuned_amazon")
print("Modelo guardado correctamente.")

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


🚀 Iniciando entrenamiento...


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
