In [2]:
!pip install scikit-optimize
!pip install scikit-learn
!pip install datasets transformers pandas numpy

from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the dataset
df = pd.read_csv("FinancES_train_kaggle.csv", encoding="utf-8")
print("Dataset shape:", df.shape)

# Split into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load BETO model and tokenizer
model_checkpoint = "dccuchile/bert-base-spanish-wwm-cased"
num_labels = len(df["label"].unique())
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["id", "text"])
val_dataset = val_dataset.remove_columns(["id", "text"])

# Compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"f1_macro": f1_score(labels, preds, average="macro")}

# Define the search space
space = [
    Real(1e-6, 1e-4, "log-uniform", name="learning_rate"),
    Categorical([8, 16, 32], name="batch_size"),
    Integer(3, 10, name="num_epochs"),
    Real(1e-4, 1e-2, "log-uniform", name="weight_decay"),
    Categorical([1, 2, 4], name="gradient_accumulation_steps")
]

# Objective function for Bayesian optimization
@use_named_args(space)
def objective(**params):
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

    training_args = TrainingArguments(
        output_dir="beto-checkpoints",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=params["learning_rate"],
        per_device_train_batch_size=params["batch_size"],
        num_train_epochs=params["num_epochs"],
        weight_decay=params["weight_decay"],
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        fp16=True,
        gradient_accumulation_steps=params["gradient_accumulation_steps"],
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate()
    return -metrics["eval_f1_macro"]  # Negative because we want to maximize

# Run Bayesian optimization
result = gp_minimize(objective, space, n_calls=10, random_state=42)

# Print best parameters and score
print("Best parameters:")
for name, value in zip(["learning_rate", "batch_size", "num_epochs", "weight_decay", "gradient_accumulation_steps"], result.x):
    print(f"  {name}: {value}")
print(f"Best F1 Macro: {-result.fun}")

# Train final model with best hyperparameters
best_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
best_training_args = TrainingArguments(
    output_dir="beto-checkpoints-best",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=result.x[0],
    per_device_train_batch_size=result.x[1],
    num_train_epochs=result.x[2],
    weight_decay=result.x[3],
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=True,
    gradient_accumulation_steps=result.x[4],
    report_to="none"
)

best_trainer = Trainer(
    model=best_model,
    args=best_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

best_trainer.train()

# Final evaluation and prediction generation
final_metrics = best_trainer.evaluate()
print("F1 Macro final (validación):", final_metrics["eval_f1_macro"])

test_df = pd.read_csv("FinancES_test_kaggle.csv", encoding="utf-8")
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.remove_columns(["id", "text"])

predictions = best_trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

submission = pd.DataFrame({"id": test_df["id"], "label": preds})
submission.to_csv("submission_beto_optimized.csv", index=False)
print("Archivo 'submission_beto_optimized.csv' creado con las predicciones optimizadas.")


Dataset shape: (6359, 3)


Map:   0%|          | 0/5087 [00:00<?, ? examples/s]

Map:   0%|          | 0/1272 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.595058,0.715468
2,1.079800,0.564214,0.739301
3,1.079800,0.911812,0.721731
4,0.501900,1.399289,0.723379
5,0.199300,1.559654,0.7239
6,0.199300,1.666948,0.727381
7,0.053600,1.745794,0.727678
8,0.021900,1.770672,0.721505


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.841962,0.47955
2,No log,0.702331,0.587333
3,No log,0.651909,0.660689
4,1.575700,0.629397,0.664918
5,1.575700,0.62376,0.675009


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.934538,0.366532
2,No log,0.895627,0.427615
3,No log,0.860276,0.470232
4,No log,0.817867,0.49009
5,No log,0.773652,0.499487
6,No log,0.739407,0.504251
7,No log,0.716918,0.508843
8,No log,0.703947,0.511717
9,No log,0.695644,0.513988


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.563968,0.711555
2,No log,0.553033,0.732817
3,No log,0.582784,0.731925


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.7032,0.568967,0.716928
2,0.4961,0.573225,0.727404
3,0.425,0.72004,0.728131
4,0.3116,0.993572,0.740371
5,0.2439,1.087396,0.733918
6,0.1928,1.195415,0.739706
7,0.1947,1.295409,0.745443
8,0.1029,1.40071,0.73387
9,0.0878,1.482794,0.735077
10,0.0909,1.476644,0.735329


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.577503,0.696199
2,No log,0.551787,0.743608
3,No log,0.585992,0.735222
4,No log,0.655063,0.72599
5,No log,0.742269,0.731285
6,No log,0.781537,0.728893
7,1.395200,0.860307,0.72651
8,1.395200,0.918375,0.722594
9,1.395200,0.968355,0.733316


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.548712,0.721722
2,No log,0.556517,0.736849
3,No log,0.607815,0.731141


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.611608,0.648755
2,No log,0.562608,0.708379
3,No log,0.557848,0.727165
4,2.366500,0.56728,0.724472
5,2.366500,0.572436,0.723163


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.565796,0.706259
2,No log,0.534698,0.730998
3,No log,0.588728,0.726662
4,0.471000,0.626936,0.73363


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.573639,0.7243
2,0.569100,0.570499,0.73882
3,0.569100,0.754205,0.739318
4,0.253100,1.041867,0.732349


Best parameters:
  learning_rate: 6.3055350401992874e-06
  batch_size: 8
  num_epochs: 10
  weight_decay: 0.00029210748185657135
  gradient_accumulation_steps: 1
Best F1 Macro: 0.7454430514757032


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  best_trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.7316,0.585081,0.703868
2,0.5056,0.585921,0.716128
3,0.4324,0.714417,0.714762
4,0.3213,0.968699,0.732378
5,0.2678,1.048533,0.716248
6,0.1955,1.162513,0.728566
7,0.2057,1.322175,0.72992
8,0.1168,1.403411,0.709603
9,0.0941,1.477062,0.711847
10,0.1067,1.481527,0.712586


F1 Macro final (validación): 0.732377794557594


Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

Archivo 'submission_beto_optimized.csv' creado con las predicciones optimizadas.
