In [1]:
!pip install -q transformers datasets accelerate scikit-learn pandas nlpaug

import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from nlpaug.augmenter.word import ContextualWordEmbsAug
import random

# Carga de datos con conversión explícita de labels
df = pd.read_csv("FinancES_train_kaggle.csv", encoding="utf-8")
df['label'] = df['label'].astype(int)  # Conversión crítica

print("Distribución de clases original:")
print(df['label'].value_counts())

# Aumento de datos usando modelo base español
aug = ContextualWordEmbsAug(
    model_path='BSC-TeMU/roberta-base-bne',
    action="substitute",
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

def augment_data(texts, labels, augment_percentage=0.3):
    augmented_texts = []
    augmented_labels = []
    num_to_augment = int(len(texts) * augment_percentage)
    indices_to_augment = random.sample(range(len(texts)), num_to_augment)

    for idx, (text, label) in enumerate(zip(texts, labels)):
        if idx in indices_to_augment:
            try:
                augmented_text = aug.augment(text)[0]
                augmented_texts.append(augmented_text)
                augmented_labels.append(label)
            except:
                continue  # Manejo de errores en aumento

    return texts + augmented_texts, labels + augmented_labels

# Balanceo de clases mediante sample weighting
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(df['label']),
    y=df['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Configuración del modelo
model_checkpoint = "UMUTeam/roberta-spanish-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
num_labels = 3

def tokenize_function(examples):
    encoding = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    )

    # Solo agregar "labels" si la columna "label" está en los datos
    if "label" in examples:
        encoding["labels"] = torch.tensor([int(label) for label in examples["label"]])

    return encoding


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/410.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.66M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

In [2]:
# Validación cruzada mejorada
kfold = KFold(n_splits=4, shuffle=True, random_state=42)

for fold, (train_ids, val_ids) in enumerate(kfold.split(df)):
    print(f"\n=== Fold {fold + 1} ===")

    # Aumento solo en datos de entrenamiento
    train_df = df.iloc[train_ids]
    texts, labels = augment_data(
        train_df['text'].tolist(),
        train_df['label'].tolist(),
        augment_percentage=0.2
    )

    df_augmented = pd.DataFrame({'text': texts, 'label': labels})
    val_df = df.iloc[val_ids]

    train_dataset = Dataset.from_pandas(df_augmented)
    val_dataset = Dataset.from_pandas(val_df)

    train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    # Cargar modelo con inicialización adecuada
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            "f1_macro": f1_score(labels, preds, average="macro"),
            "f1_weighted": f1_score(labels, preds, average="weighted")
        }

    training_args = TrainingArguments(
        output_dir=f"checkpoints-fold-{fold+1}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=10,
        weight_decay=0.01,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        fp16=True,
        gradient_accumulation_steps=2,
        warmup_ratio=0.1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Early stopping dinámico
    trainer.add_callback(EarlyStoppingCallback(
        early_stopping_patience=2,
        early_stopping_threshold=0.001
    ))

    # Entrenamiento con manejo de memoria
    torch.cuda.empty_cache()
    train_result = trainer.train()
    metrics = train_result.metrics
    print(f"Resultados entrenamiento: {metrics}")

    # Evaluación detallada
    eval_metrics = trainer.evaluate()
    print(f"Resultados validación: {eval_metrics}")

    # Guardado optimizado
    trainer.save_model(f"best_model_fold_{fold+1}")
    tokenizer.save_pretrained(f"best_model_fold_{fold+1}")


=== Fold 1 ===


Map:   0%|          | 0/5722 [00:00<?, ? examples/s]

Map:   0%|          | 0/1590 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted
1,1.1978,0.628994,0.712084,0.756302
2,0.819,0.52217,0.740878,0.79046
3,0.2936,0.70272,0.745731,0.790246
4,0.054,1.186976,0.736405,0.784256
5,0.0166,1.271272,0.739846,0.794134


Resultados entrenamiento: {'train_runtime': 260.2294, 'train_samples_per_second': 219.883, 'train_steps_per_second': 6.879, 'total_flos': 1881918720299520.0, 'train_loss': 0.7627079586742976, 'epoch': 5.0}


Resultados validación: {'eval_loss': 0.702719509601593, 'eval_f1_macro': 0.7457313834551874, 'eval_f1_weighted': 0.7902459872433125, 'eval_runtime': 2.6638, 'eval_samples_per_second': 596.884, 'eval_steps_per_second': 18.77, 'epoch': 5.0}

=== Fold 2 ===


Map:   0%|          | 0/5722 [00:00<?, ? examples/s]

Map:   0%|          | 0/1590 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted
1,1.2262,0.611006,0.718976,0.752791
2,0.8281,0.619858,0.720583,0.762043
3,0.1926,0.948786,0.694029,0.741329
4,0.0814,1.325111,0.719449,0.766074


Resultados entrenamiento: {'train_runtime': 227.0182, 'train_samples_per_second': 252.05, 'train_steps_per_second': 7.885, 'total_flos': 1505534976239616.0, 'train_loss': 0.9455080896033256, 'epoch': 4.0}


Resultados validación: {'eval_loss': 0.619857907295227, 'eval_f1_macro': 0.7205832813777769, 'eval_f1_weighted': 0.7620427872178716, 'eval_runtime': 2.7886, 'eval_samples_per_second': 570.179, 'eval_steps_per_second': 17.93, 'epoch': 4.0}

=== Fold 3 ===


Map:   0%|          | 0/5722 [00:00<?, ? examples/s]

Map:   0%|          | 0/1590 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted
1,1.2293,0.575551,0.715212,0.768135
2,0.7948,0.591545,0.720873,0.762984
3,0.2415,0.85719,0.733908,0.780781
4,0.0585,1.236136,0.713931,0.76553
5,0.0218,1.463877,0.726223,0.770847


Resultados entrenamiento: {'train_runtime': 258.5404, 'train_samples_per_second': 221.319, 'train_steps_per_second': 6.923, 'total_flos': 1881918720299520.0, 'train_loss': 0.7629552112611312, 'epoch': 5.0}


Resultados validación: {'eval_loss': 0.8571904897689819, 'eval_f1_macro': 0.7339084223115763, 'eval_f1_weighted': 0.7807812345720359, 'eval_runtime': 2.671, 'eval_samples_per_second': 595.286, 'eval_steps_per_second': 18.72, 'epoch': 5.0}

=== Fold 4 ===


Map:   0%|          | 0/5724 [00:00<?, ? examples/s]

Map:   0%|          | 0/1589 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted
1,1.2226,0.545458,0.732807,0.782296
2,0.816,0.54452,0.743028,0.791007
3,0.2576,0.750745,0.729831,0.775314
4,0.1054,1.133111,0.724503,0.783878


Resultados entrenamiento: {'train_runtime': 219.1475, 'train_samples_per_second': 261.194, 'train_steps_per_second': 8.168, 'total_flos': 1506061203075072.0, 'train_loss': 0.9783536860063755, 'epoch': 4.0}


Resultados validación: {'eval_loss': 0.54451984167099, 'eval_f1_macro': 0.743027954061354, 'eval_f1_weighted': 0.7910072578541141, 'eval_runtime': 2.8262, 'eval_samples_per_second': 562.247, 'eval_steps_per_second': 17.692, 'epoch': 4.0}


In [10]:
# Ensemble de modelos

# Cargar y tokenizar el dataset de test
test_df = pd.read_csv("FinancES_test_kaggle.csv", encoding="utf-8")
test_dataset = Dataset.from_pandas(test_df)

# Tokenizar el dataset de test
test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=["id", "text"])

models = [AutoModelForSequenceClassification.from_pretrained(f"best_model_fold_{i+1}", num_labels=num_labels) for i in range(4)]

def ensemble_predict(models, dataset):
    all_predictions = []
    for model in models:
        model.eval()
        trainer = Trainer(model=model)
        predictions = trainer.predict(dataset)
        all_predictions.append(predictions.predictions)

    ensemble_preds = np.mean(all_predictions, axis=0)
    return np.argmax(ensemble_preds, axis=-1)

final_preds = ensemble_predict(models, test_dataset)

submission = pd.DataFrame({
    "id": test_df["id"],
    "label": final_preds
})

submission.to_csv("submission_roberta_2_context.csv", index=False)
print("Archivo 'submission_roberta_2_context.csv' creado con las predicciones del ensemble.")


Map:   0%|          | 0/1621 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Archivo 'submission_roberta_2_context.csv' creado con las predicciones del ensemble.
