In [2]:
!pip install scikit-optimize
!pip install scikit-learn
!pip install datasets transformers pandas numpy

from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the dataset
df = pd.read_csv("FinancES_train_kaggle.csv", encoding="utf-8")
print("Dataset shape:", df.shape)

# Split into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load BETO model and tokenizer
model_checkpoint = "UMUTeam/roberta-spanish-sentiment-analysis"
num_labels = len(df["label"].unique())
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["id", "text"])
val_dataset = val_dataset.remove_columns(["id", "text"])

# Compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"f1_macro": f1_score(labels, preds, average="macro")}

# Define the search space
space = [
    Real(1e-6, 1e-4, "log-uniform", name="learning_rate"),
    Categorical([8, 16, 32], name="batch_size"),
    Integer(3, 10, name="num_epochs"),
    Real(1e-4, 1e-2, "log-uniform", name="weight_decay"),
    Categorical([1, 2, 4], name="gradient_accumulation_steps")
]

# Objective function for Bayesian optimization
@use_named_args(space)
def objective(**params):
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

    training_args = TrainingArguments(
        output_dir="beto-checkpoints",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=params["learning_rate"],
        per_device_train_batch_size=params["batch_size"],
        num_train_epochs=params["num_epochs"],
        weight_decay=params["weight_decay"],
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        fp16=True,
        gradient_accumulation_steps=params["gradient_accumulation_steps"],
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate()
    return -metrics["eval_f1_macro"]  # Negative because we want to maximize

# Run Bayesian optimization
result = gp_minimize(objective, space, n_calls=10, random_state=42)

# Print best parameters and score
print("Best parameters:")
for name, value in zip(["learning_rate", "batch_size", "num_epochs", "weight_decay", "gradient_accumulation_steps"], result.x):
    print(f"  {name}: {value}")
print(f"Best F1 Macro: {-result.fun}")

# Train final model with best hyperparameters
best_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
best_training_args = TrainingArguments(
    output_dir="beto-checkpoints-best",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=result.x[0],
    per_device_train_batch_size=result.x[1],
    num_train_epochs=result.x[2],
    weight_decay=result.x[3],
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=True,
    gradient_accumulation_steps=result.x[4],
    report_to="none"
)

best_trainer = Trainer(
    model=best_model,
    args=best_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

best_trainer.train()

# Final evaluation and prediction generation
final_metrics = best_trainer.evaluate()
print("F1 Macro final (validación):", final_metrics["eval_f1_macro"])

test_df = pd.read_csv("FinancES_test_kaggle.csv", encoding="utf-8")
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.remove_columns(["id", "text"])

predictions = best_trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

submission = pd.DataFrame({"id": test_df["id"], "label": preds})
submission.to_csv("submission_roberta_optimized.csv", index=False)
print("Archivo 'submission_roberta_optimized.csv' creado con las predicciones optimizadas.")


Collecting scikit-optimize
  Using cached scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Using cached pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Using cached scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
Using cached pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2
Dataset shape: (6359, 3)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.66M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Map:   0%|          | 0/5087 [00:00<?, ? examples/s]

Map:   0%|          | 0/1272 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.571146,0.715261
2,1.076200,0.578381,0.721001
3,1.076200,0.910749,0.736509
4,0.400800,1.229826,0.721937
5,0.108900,1.431472,0.730633
6,0.108900,1.522819,0.729958
7,0.011500,1.55646,0.736111
8,0.003000,1.576768,0.737981


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.83485,0.494536
2,No log,0.656786,0.679553
3,No log,0.613129,0.698858
4,1.860200,0.59897,0.713136
5,1.860200,0.594763,0.715567


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,1.938259,0.196491
2,No log,1.06664,0.379483
3,No log,0.889237,0.482392
4,No log,0.803904,0.578165
5,No log,0.740639,0.623173
6,No log,0.699281,0.658104
7,No log,0.676401,0.671651
8,No log,0.664242,0.675359
9,No log,0.65669,0.67983


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.520269,0.727571
2,No log,0.508357,0.737
3,No log,0.577493,0.752833


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.7446,0.538563,0.722138
2,0.3971,0.603851,0.734602
3,0.268,0.821864,0.746462
4,0.0565,1.109519,0.72559
5,0.0236,1.279392,0.715335
6,0.0076,1.367469,0.718165
7,0.0024,1.428095,0.716783
8,0.0005,1.480713,0.710453
9,0.0009,1.517652,0.715353
10,0.0006,1.527643,0.711859


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.534165,0.715793
2,No log,0.523019,0.726456
3,No log,0.580741,0.74562
4,No log,0.787035,0.747033
5,No log,1.045979,0.736728
6,No log,1.198138,0.746441
7,0.961500,1.257224,0.735669
8,0.961500,1.297535,0.734674
9,0.961500,1.320539,0.739744


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.524782,0.719771
2,No log,0.518967,0.744304
3,No log,0.616316,0.753566


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.571317,0.715791
2,No log,0.536576,0.723634
3,No log,0.554943,0.730404
4,2.193300,0.600015,0.727734
5,2.193300,0.637256,0.727138


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.522057,0.723966
2,No log,0.514275,0.737977
3,No log,0.62331,0.744503
4,0.382100,0.766434,0.746019


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.547714,0.721063
2,0.519300,0.576619,0.747391
3,0.519300,0.868248,0.749814
4,0.151600,0.991881,0.748226


Best parameters:
  learning_rate: 2.2941144328643983e-05
  batch_size: 16
  num_epochs: 3
  weight_decay: 0.0076630826802558585
  gradient_accumulation_steps: 2
Best F1 Macro: 0.7535662335410075


  best_trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.524782,0.719771
2,No log,0.518967,0.744304
3,No log,0.616316,0.753566


F1 Macro final (validación): 0.7535662335410075


Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

Archivo 'submission_roberta_optimized.csv' creado con las predicciones optimizadas.
