In [1]:
!pip install -q transformers datasets accelerate scikit-learn pandas nlpaug

import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score
from datasets import Dataset
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AdamW,
    get_linear_schedule_with_warmup,
    EarlyStoppingCallback
)
from nlpaug.augmenter.word import ContextualWordEmbsAug
import nltk
nltk.download('punkt')

import random

# Carga de datos
df = pd.read_csv("FinancES_train_kaggle.csv", encoding="utf-8")
print("Dataset shape:", df.shape)
print(df.head())

# Aumento de datos usando un modelo contextual en español
aug = ContextualWordEmbsAug(model_path='dccuchile/bert-base-spanish-wwm-cased', action="substitute")

def augment_data(texts, labels, augment_percentage=0.2):
    augmented_texts = []
    augmented_labels = []
    num_to_augment = int(len(texts) * augment_percentage)
    indices_to_augment = random.sample(range(len(texts)), num_to_augment)

    for idx, (text, label) in enumerate(zip(texts, labels)):
        if idx in indices_to_augment:
            augmented_text = aug.augment(text)[0]
            augmented_texts.append(augmented_text)
            augmented_labels.append(label)

    return texts + augmented_texts, labels + augmented_labels

texts, labels = augment_data(df['text'].tolist(), df['label'].tolist(), augment_percentage=0.3)
df_augmented = pd.DataFrame({'text': texts, 'label': labels})

# Configuración del modelo
model_checkpoint = "UMUTeam/roberta-spanish-sentiment-analysis"
num_labels = 3  # positivo(0), neutro(1), negativo(2)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

# Validación cruzada
kfold = KFold(n_splits=4, shuffle=True, random_state=42)

for fold, (train_ids, val_ids) in enumerate(kfold.split(df_augmented)):
    print(f"Fold {fold + 1}")

    train_df = df_augmented.iloc[train_ids]
    val_df = df_augmented.iloc[val_ids]

    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    train_dataset = train_dataset.remove_columns(["text"])
    val_dataset = val_dataset.remove_columns(["text"])

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            "f1_macro": f1_score(labels, preds, average="macro")
        }

    training_args = TrainingArguments(
        output_dir=f"roberta-spanish-checkpoints-fold-{fold+1}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Early stopping
    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

    trainer.train()

    # Evaluación en validación
    metrics = trainer.evaluate()
    print(f"Fold {fold+1} - F1 Macro (validación):", metrics["eval_f1_macro"])

    # Guardar el mejor modelo para este fold
    trainer.save_model(f"best_model_fold_{fold+1}")

# Ensemble de modelos
test_df = pd.read_csv("FinancES_test_kaggle.csv", encoding="utf-8")
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.remove_columns(["id", "text"])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset shape: (6359, 3)
   id                                               text  label
0   0  Renfe afronta mañana un nuevo día de paros par...      2
1   1       Presupuesto populista con cimientos frágiles      2
2   2  Biden no cree que la OPEP+ vaya a ayudar con l...      2
3   3  La deuda de las familias cae en 25.000 millone...      0
4   4  Bestinver: no hay "momento más inoportuno" par...      2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.66M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Fold 1


Map:   0%|          | 0/6199 [00:00<?, ? examples/s]

Map:   0%|          | 0/2067 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.5773,0.555992,0.751353
2,0.3069,0.637019,0.740253
3,0.0937,0.988667,0.747404
4,0.0537,1.240076,0.733086


Fold 1 - F1 Macro (validación): 0.751352537630574
Fold 2


Map:   0%|          | 0/6199 [00:00<?, ? examples/s]

Map:   0%|          | 0/2067 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.6041,0.533922,0.746232
2,0.3057,0.653227,0.734814
3,0.0814,1.038702,0.730828
4,0.0572,1.372506,0.729781


Fold 2 - F1 Macro (validación): 0.7462315357226696
Fold 3


Map:   0%|          | 0/6200 [00:00<?, ? examples/s]

Map:   0%|          | 0/2066 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.5641,0.5888,0.730123
2,0.2798,0.68607,0.733643
3,0.0747,1.202681,0.737614
4,0.0079,1.386184,0.736217
5,0.0008,1.555212,0.726518
6,0.0002,1.639819,0.733726


Fold 3 - F1 Macro (validación): 0.7376142561464266
Fold 4


Map:   0%|          | 0/6200 [00:00<?, ? examples/s]

Map:   0%|          | 0/2066 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.5643,0.596066,0.714613
2,0.3245,0.706654,0.730686
3,0.1481,1.150938,0.710727
4,0.0515,1.377235,0.717522
5,0.0008,1.494444,0.718953


Fold 4 - F1 Macro (validación): 0.7306861602111382


Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

In [4]:
models = [AutoModelForSequenceClassification.from_pretrained(f"best_model_fold_{i+1}", num_labels=num_labels) for i in range(4)]

def ensemble_predict(models, dataset):
    all_predictions = []
    for model in models:
        model.eval()
        trainer = Trainer(model=model)
        predictions = trainer.predict(dataset)
        all_predictions.append(predictions.predictions)

    ensemble_preds = np.mean(all_predictions, axis=0)
    return np.argmax(ensemble_preds, axis=-1)

final_preds = ensemble_predict(models, test_dataset)

submission = pd.DataFrame({
    "id": test_df["id"],
    "label": final_preds
})

submission.to_csv("submission_spacy_roberta.csv", index=False)
print("Archivo 'submission_spacy_roberta.csv' creado con las predicciones del ensemble.")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Archivo 'submission_spacy_roberta.csv' creado con las predicciones del ensemble.
