In [None]:
# Instala las librerías necesarias
!pip install transformers datasets accelerate -q
!pip install scikit-learn pandas -q
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
import pandas as pd

# Load the labelled training data (columns: id, text, label).
df = pd.read_csv("FinancES_train_kaggle.csv", encoding="utf-8")

# Quick sanity check: first rows, overall shape and class distribution.
print(df.head())
print("Tamaño del dataset:", df.shape)
print(df['label'].value_counts())

   id                                               text  label
0   0  Renfe afronta mañana un nuevo día de paros par...      2
1   1       Presupuesto populista con cimientos frágiles      2
2   2  Biden no cree que la OPEP+ vaya a ayudar con l...      2
3   3  La deuda de las familias cae en 25.000 millone...      0
4   4  Bestinver: no hay "momento más inoportuno" par...      2
Tamaño del dataset: (6359, 3)
label
2    2935
0    2818
1     606
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split

# Split the labelled data into training and validation frames.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,  # 80% train, 20% val
    random_state=42,  # fixed seed so the split is reproducible
    stratify=df['label']  # stratify on the label column so both splits keep the class proportions
)

# Report the size of each split.
print("Train size:", train_df.shape)
print("Val size:", val_df.shape)

Train size: (5087, 3)
Val size: (1272, 3)


In [None]:
from datasets import Dataset

# Convert the pandas splits into Hugging Face datasets.
# After the shuffled train/validation split the frames no longer have a
# default index, and `from_pandas` would store it as an extra
# `__index_level_0__` column. `preserve_index=False` keeps only the real
# columns (id, text, label).
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False)

# Inspect the first training example
print(train_dataset[0])

{'id': 5966, 'text': 'La CNMC pone deberes al Gobierno para mejorar el mercado eléctrico', 'label': 0, '__index_level_0__': 5966}


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Pretrained Spanish BERT checkpoint to fine-tune, and the number of target
# classes (the training labels take the values 0, 1 and 2).
model_checkpoint = "dccuchile/bert-base-spanish-wwm-uncased"
num_labels = 3

# The tokenizer and the model are loaded from the same checkpoint; the
# sequence-classification head is created with `num_labels` outputs.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    Sequences are truncated to at most 128 tokens and padded up to that
    same length.

    Args:
        examples: Mapping with a ``"text"`` entry, as supplied by
            ``Dataset.map``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encoding_options)

# Tokenize both splits in batches; the encoding columns are added to each dataset.
train_dataset = train_dataset.map(function=tokenize_function, batched=True)
val_dataset = val_dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/5087 [00:00<?, ? examples/s]

Map:   0%|          | 0/1272 [00:00<?, ? examples/s]

In [None]:
# Drop the columns that are not needed for training
columns_to_remove = ["id", "text"]
train_dataset = train_dataset.remove_columns(column_names=columns_to_remove)
val_dataset = val_dataset.remove_columns(column_names=columns_to_remove)

# Show the remaining features and row counts of both splits
print(train_dataset, val_dataset, sep="\n")

Dataset({
    features: ['label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5087
})
Dataset({
    features: ['label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1272
})


In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration passed to the Trainer.
training_args = TrainingArguments(
    output_dir="beto-sentiment-checkpoints",  # output folder for checkpoints
    # `eval_strategy` is the current name of this option; the former
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="epoch",                    # evaluate at the end of every epoch
    save_strategy="epoch",                    # save a checkpoint at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,                       # adjust as needed
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,              # reload the best checkpoint once training ends
    metric_for_best_model="f1_macro",         # key returned by compute_metrics
    # Explicitly disable external logging integrations. This is the supported
    # replacement for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

import numpy as np
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        dict with the single key ``"f1_macro"``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return {"f1_macro": f1_score(references, predicted_classes, average="macro")}

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
from transformers import Trainer

# Build the Trainer from the model, the training configuration, both
# tokenized splits and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` supersedes the deprecated `tokenizer` argument of
    # `Trainer.__init__`; the tokenizer is still passed through it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.6046,0.638425,0.686439
2,0.4611,0.624257,0.697963
3,0.33,0.721502,0.697844


TrainOutput(global_step=954, training_loss=0.48381819765022965, metrics={'train_runtime': 399.0933, 'train_samples_per_second': 38.239, 'train_steps_per_second': 2.39, 'total_flos': 1003843466986752.0, 'train_loss': 0.48381819765022965, 'epoch': 3.0})

In [None]:
# Evaluate on the validation set (the `eval_dataset` given to the Trainer).
# The returned dict uses `eval_`-prefixed keys, e.g. `eval_f1_macro` for the
# `f1_macro` value produced by `compute_metrics`.
metrics = trainer.evaluate()
print("Resultado en validación:", metrics)
print("F1 Macro (validación):", metrics["eval_f1_macro"])

Resultado en validación: {'eval_loss': 0.6242566704750061, 'eval_f1_macro': 0.6979632557693467, 'eval_runtime': 9.6193, 'eval_samples_per_second': 132.234, 'eval_steps_per_second': 8.317, 'epoch': 3.0}
F1 Macro (validación): 0.6979632557693467


In [None]:
# Load the unlabelled test split (it only has id and text)
test_df = pd.read_csv("FinancES_test_kaggle.csv", encoding="utf-8")

# Tokenize it with the same function as the training data. There is no
# 'label' column here, so only 'id' and 'text' have to be dropped.
test_dataset = (
    Dataset.from_pandas(test_df)
    .map(tokenize_function, batched=True)
    .remove_columns(['id','text'])
)

# Run inference and keep the highest-scoring class for every row
predictions = trainer.predict(test_dataset)
test_preds = np.argmax(predictions.predictions, axis=-1)

# Save the predictions to a CSV with one (id, label) row per test example
submission = test_df[["id"]].assign(label=test_preds)
submission.to_csv("predicciones_test.csv", index=False)

print("Predicciones generadas. El archivo 'predicciones_test.csv' tiene el id y la predicción.")


Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

Predicciones generadas. El archivo 'predicciones_test.csv' tiene el id y la predicción.


In [None]:
# Recursively compress the checkpoint folder into a single zip archive.
!zip -r beto-sentiment-checkpoints.zip beto-sentiment-checkpoints

  adding: beto-sentiment-checkpoints/ (stored 0%)
  adding: beto-sentiment-checkpoints/checkpoint-318/ (stored 0%)
  adding: beto-sentiment-checkpoints/checkpoint-318/trainer_state.json (deflated 63%)
  adding: beto-sentiment-checkpoints/checkpoint-318/scheduler.pt (deflated 56%)
  adding: beto-sentiment-checkpoints/checkpoint-318/tokenizer.json (deflated 71%)
  adding: beto-sentiment-checkpoints/checkpoint-318/training_args.bin (deflated 52%)
  adding: beto-sentiment-checkpoints/checkpoint-318/config.json (deflated 51%)
  adding: beto-sentiment-checkpoints/checkpoint-318/rng_state.pth (deflated 25%)
  adding: beto-sentiment-checkpoints/checkpoint-318/optimizer.pt (deflated 23%)
  adding: beto-sentiment-checkpoints/checkpoint-318/vocab.txt (deflated 56%)
  adding: beto-sentiment-checkpoints/checkpoint-318/tokenizer_config.json (deflated 75%)
  adding: beto-sentiment-checkpoints/checkpoint-318/model.safetensors (deflated 7%)
  adding: beto-sentiment-checkpoints/checkpoint-318/special_to

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>