In [None]:
# Sentiment Analysis en Pueblos Mágicos

# 1. Instalación de librerías
def install_dependencies():
    # Installs the third-party packages this notebook uses, pinning NumPy
    # below 2.0. The next line is an IPython shell escape (`!`), so this
    # function only works when the cell is executed by an IPython/Jupyter
    # kernel; it is not valid syntax for the plain Python interpreter.
    !pip install pandas "numpy<2.0" scikit-learn openpyxl transformers torch sentencepiece datasets evaluate sacremoses tqdm

# 2. Carga de datos
def load_data(train_path, test_path):
    """Read the training and test spreadsheets.

    Args:
        train_path: Path of the Excel file holding the training split.
        test_path: Path of the Excel file holding the test split.

    Returns:
        A ``(train, test)`` tuple of DataFrames, in that order.
    """
    from pandas import read_excel

    return read_excel(train_path), read_excel(test_path)

# 3. Limpieza básica del texto
def preprocess_text(df):
    """Add a normalised ``clean`` column derived from ``Review``.

    The text is lower-cased, HTML-like tags are replaced by a space, every
    character other than Spanish lowercase letters and spaces is replaced by
    a space, whitespace runs are collapsed and the result is stripped.

    Missing or non-string ``Review`` cells are treated as empty text, so
    ``clean`` always contains ``str`` values (possibly ``''``) and never NaN;
    the tokenizers used later in the pipeline reject NaN inputs.

    Args:
        df: DataFrame with a ``Review`` column. It is modified in place.

    Returns:
        The same DataFrame object, with the ``clean`` column added.
    """
    # Coerce first: the `.str` accessor would propagate NaN for missing or
    # non-string cells instead of producing an empty string.
    reviews = df['Review'].fillna('').astype(str)
    df['clean'] = (
        reviews
        .str.lower()
        .str.replace(r"<[^>]+>", " ", regex=True)
        .str.replace(r"[^a-záéíóúñü ]", " ", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    return df

# 4. Balanceo por aumentación de datos en las clases minoritarias: como hay más datos de 5 estrellas, generamos datos para las otras estrellas traduciendo al inglés y de regreso al español
from tqdm.auto import tqdm
import torch
import os

def balance_with_back_translation(df, target_count=None, batch_size=128, checkpoint_path='train_balanced.csv'):
    """Oversample minority polarity classes with es -> en -> es back-translation.

    If ``checkpoint_path`` exists, the balanced data is loaded from it and no
    translation is performed. Otherwise every class with fewer than the
    target number of rows is topped up with back-translated copies of its
    own ``clean`` texts, the result is written to ``checkpoint_path`` and
    returned. The augmented rows of each class are also appended to
    ``augmented_polarity_<class>.csv``.

    Args:
        df: DataFrame with ``Polarity`` and ``clean`` columns.
        target_count: Rows wanted per class. A falsy value means "size of the
            largest class".
        batch_size: Number of texts translated per model call.
        checkpoint_path: CSV used as the cache of the balanced result.

    Returns:
        DataFrame holding the original rows followed by the augmented rows.
        Augmented rows only carry ``Review``, ``Polarity`` and ``clean``.
    """
    import pandas as pd
    from math import ceil
    from transformers import MarianMTModel, MarianTokenizer

    # Reuse a previous result when one is available.
    if os.path.exists(checkpoint_path):
        print(f"Cargando datos balanceados desde checkpoint: {checkpoint_path}")
        cached = pd.read_csv(checkpoint_path)
        # A CSV round trip turns empty strings into NaN; restore them so the
        # text columns contain only `str`, as they did before saving.
        for text_col in ('Review', 'clean'):
            if text_col in cached.columns:
                cached[text_col] = cached[text_col].fillna('').astype(str)
        return cached

    print("Generando datos balanceados")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load both translation directions and move the models to the device.
    me2en_tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-es-en')
    me2en = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-es-en').to(device)
    en2me_tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-es')
    en2me = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-es').to(device)

    counts = df['Polarity'].value_counts().to_dict()
    max_count = target_count or max(counts.values(), default=0)
    augmented_parts = []  # one DataFrame per augmented class

    for polarity, cnt in counts.items():
        needed = max_count - cnt
        if needed <= 0:
            continue

        class_texts = df.loc[df['Polarity'] == polarity, 'clean'].tolist()
        # The translator tokenizer rejects NaN, and blank texts carry no signal.
        samples = [t for t in class_texts if isinstance(t, str) and t.strip()]
        if not samples:
            continue

        # Cycle through the class texts until `needed` sources are collected.
        reps = ceil(needed / len(samples))
        to_aug = (samples * reps)[:needed]

        aug_texts = []
        for i in tqdm(range(0, len(to_aug), batch_size), desc=f'Aug Polarity {polarity}'):
            aug_texts.extend(
                _back_translate_batch(
                    to_aug[i:i + batch_size], me2en_tok, me2en, en2me_tok, en2me, device
                )
            )

        df_partial = pd.DataFrame({
            'Review': aug_texts,
            'Polarity': [polarity] * len(aug_texts),
        })
        # Normalise the translator output exactly like the original reviews.
        # Otherwise augmented rows keep casing and punctuation that the
        # cleaned originals lack, which is a formatting cue present only in
        # the minority classes.
        df_partial = preprocess_text(df_partial)

        partial_path = f'augmented_polarity_{int(polarity)}.csv'
        df_partial.to_csv(
            partial_path,
            mode='a',
            header=not os.path.exists(partial_path),
            index=False,
            encoding='utf-8',
        )
        augmented_parts.append(df_partial)

    balanced_df = pd.concat([df] + augmented_parts, ignore_index=True)

    # Save a checkpoint so later runs can skip the translation step.
    balanced_df.to_csv(checkpoint_path, index=False, encoding='utf-8')
    print(f"💾 Datos balanceados guardados en: {checkpoint_path}")

    return balanced_df


def _back_translate_batch(texts, src_tok, src_model, tgt_tok, tgt_model, device, max_length=128):
    """Translate one batch through ``src_model`` and back through ``tgt_model``."""
    with torch.no_grad():
        # es -> en
        enc = src_tok(texts, return_tensors='pt', padding=True, truncation=True).to(device)
        en_ids = src_model.generate(**enc, max_length=max_length)
        en_texts = src_tok.batch_decode(en_ids, skip_special_tokens=True)

        # en -> es
        dec = tgt_tok(en_texts, return_tensors='pt', padding=True, truncation=True).to(device)
        es_ids = tgt_model.generate(**dec, max_length=max_length)
        return tgt_tok.batch_decode(es_ids, skip_special_tokens=True)

# 5. Domain-Adaptive Pretraining
def domain_adaptive_pretraining(texts, base_model='PlanTL-GOB-ES/roberta-base-bne', checkpoint_dir='domain_adapted_model'):
    """Continue masked-language-model training of ``base_model`` on ``texts``.

    If both ``checkpoint_dir`` and ``<checkpoint_dir>_tokenizer`` exist, the
    saved model and tokenizer are loaded and no training happens. Otherwise
    the base model is trained for 3 epochs with 15% token masking, then the
    model is saved to ``checkpoint_dir`` and the tokenizer to
    ``<checkpoint_dir>_tokenizer``.

    In both cases the returned model has been loaded from ``checkpoint_dir``.
    Callers in this file rebuild a classifier from ``model.name_or_path``, so
    that attribute must point at the adapted weights, not at ``base_model``.

    Args:
        texts: Iterable of training texts. Entries that are not non-blank
            strings are skipped.
        base_model: Hub id or path of the model to start from.
        checkpoint_dir: Directory where the adapted model is stored.

    Returns:
        ``(model, tokenizer)``.

    Raises:
        ValueError: If training is required but ``texts`` has no usable text.
    """
    from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
    from datasets import Dataset
    import os

    tokenizer_dir = f"{checkpoint_dir}_tokenizer"

    # Reuse a previous adaptation when one is available.
    if os.path.exists(checkpoint_dir) and os.path.exists(tokenizer_dir):
        print(f"Cargando modelo adaptado al dominio desde checkpoint: {checkpoint_dir}")
        model = AutoModelForMaskedLM.from_pretrained(checkpoint_dir)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
        return model, tokenizer

    print("Realizando adaptación al dominio")

    # The tokenizer rejects NaN / non-string entries, and blank texts add
    # nothing to language-model training.
    corpus = [t for t in texts if isinstance(t, str) and t.strip()]
    if not corpus:
        raise ValueError("domain_adaptive_pretraining needs at least one non-empty text")

    tok = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForMaskedLM.from_pretrained(base_model)
    ds = Dataset.from_dict({'text': corpus})

    def tok_fn(batch):
        return tok(batch['text'], truncation=True, padding='max_length', max_length=128)

    tok_ds = ds.map(tok_fn, batched=True)

    data_collator = DataCollatorForLanguageModeling(tok, mlm=True, mlm_probability=0.15)
    args = TrainingArguments(
        output_dir='domain_adapt', num_train_epochs=3,
        per_device_train_batch_size=32, logging_steps=100, save_steps=500,
        learning_rate=2e-5, weight_decay=0.01
    )
    trainer = Trainer(model=model, args=args, train_dataset=tok_ds, data_collator=data_collator)
    trainer.train()

    # Save the checkpoint.
    model.save_pretrained(checkpoint_dir)
    tok.save_pretrained(tokenizer_dir)
    print(f"Modelo adaptado al dominio guardado en: {checkpoint_dir}")

    # Reload from disk so `name_or_path` refers to the adapted checkpoint.
    # The in-memory model still reports `base_model`, which made downstream
    # `from_pretrained(model.name_or_path)` calls discard the adaptation.
    model = AutoModelForMaskedLM.from_pretrained(checkpoint_dir)

    return model, tok

# 6. Fine‑tuning con validación cruzada estratificada
def cross_val_finetune(train_df, model, tokenizer, n_splits=4):
    """Fine-tune a 5-class classifier with stratified K-fold cross-validation.

    For every fold a fresh sequence-classification model is initialised from
    the weights of ``model``, trained for 4 epochs on the training part and
    scored on the held-out part.

    Args:
        train_df: DataFrame with ``clean`` (text) and ``Polarity`` (1-5).
        model: Pretrained model whose weights initialise each fold's
            classifier. If it has no ``save_pretrained`` method, the
            classifier is loaded from ``model.name_or_path`` or, failing
            that, from ``'PlanTL-GOB-ES/roberta-base-bne'``.
        tokenizer: Tokenizer matching ``model``.
        n_splits: Number of folds.

    Returns:
        List with one dict per fold: ``{'fold', 'f1_macro', 'mae'}``.
        F1 is computed on 0-4 class ids, MAE on 1-5 star labels.
    """
    import os
    import tempfile
    import torch
    from sklearn.metrics import f1_score, mean_absolute_error
    from sklearn.model_selection import StratifiedKFold
    from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

    # Kept from the original for compatibility.
    # NOTE(review): NumPy is normally imported long before this point, so
    # this setting probably has no effect - confirm and remove if so.
    os.environ['NUMPY_EXPERIMENTAL_ARRAY_FUNCTION'] = '0'

    fallback_name = 'PlanTL-GOB-ES/roberta-base-bne'
    eval_batch_size = 512
    max_length = 128

    class _EncodedDataset(torch.utils.data.Dataset):
        """Pairs pre-tokenised tensors with integer class labels."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
            return item

        def __len__(self):
            return len(self.labels)

    def _texts_and_labels(rows):
        # The tokenizer rejects NaN, so non-string cells become empty text.
        texts = [t if isinstance(t, str) else '' for t in rows['clean'].tolist()]
        # Polarity is 1-5; the classifier expects class ids 0-4.
        labels = [int(label - 1) for label in rows['Polarity'].tolist()]
        return texts, labels

    def _encode(texts):
        return tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')

    # Reset the index so positional and label-based access agree.
    train_df = train_df.reset_index(drop=True)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    metrics = []

    # NOTE(review): if train_df contains back-translated copies of original
    # reviews, a review and its copy can land on opposite sides of a split,
    # which would inflate these scores - consider a grouped split.
    with tempfile.TemporaryDirectory() as scratch_dir:
        # Initialise every fold from the weights actually passed in.
        # `model.name_or_path` may still name the base checkpoint the model
        # was loaded from, which would silently drop any further training.
        if hasattr(model, 'save_pretrained'):
            model.save_pretrained(scratch_dir)
            init_source = scratch_dir
        else:
            init_source = model.name_or_path if hasattr(model, 'name_or_path') else fallback_name

        for fold, (train_idx, val_idx) in enumerate(skf.split(train_df['clean'], train_df['Polarity']), start=1):
            print(f"Procesando fold {fold}/{n_splits}")

            train_texts, train_labels = _texts_and_labels(train_df.iloc[train_idx])
            val_texts, val_labels = _texts_and_labels(train_df.iloc[val_idx])
            train_encodings = _encode(train_texts)
            val_encodings = _encode(val_texts)

            train_dataset = _EncodedDataset(train_encodings, train_labels)
            val_dataset = _EncodedDataset(val_encodings, val_labels)

            # Build and train this fold's model.
            model_cls = AutoModelForSequenceClassification.from_pretrained(
                init_source,
                num_labels=5,
                problem_type="single_label_classification"
            )

            args = TrainingArguments(
                output_dir=f'cv_fold{fold}',
                num_train_epochs=4,
                per_device_train_batch_size=8,
                logging_steps=200,
                save_strategy="no",
                learning_rate=2e-5,
                weight_decay=0.01
            )

            trainer = Trainer(
                model=model_cls,
                args=args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset
            )
            trainer.train()

            print(f"Evaluando fold {fold}")
            model_cls.eval()
            device = next(model_cls.parameters()).device

            # Score the held-out part in batches, reusing its encodings.
            all_preds = []
            with torch.no_grad():
                for start in range(0, len(val_labels), eval_batch_size):
                    batch = {
                        key: val[start:start + eval_batch_size].to(device)
                        for key, val in val_encodings.items()
                    }
                    logits = model_cls(**batch).logits
                    all_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

            # Convert back to the original 1-5 labels for the MAE.
            true_labels = [label + 1 for label in val_labels]
            pred_labels = [pred + 1 for pred in all_preds]

            f1 = f1_score(val_labels, all_preds, average='macro')
            mae = mean_absolute_error(true_labels, pred_labels)

            metrics.append({'fold': fold, 'f1_macro': f1, 'mae': mae})
            print(f"Fold {fold} - F1 Macro: {f1:.4f}, MAE: {mae:.4f}")

    return metrics


def check_checkpoint_status():
    """Print which checkpoint artifacts exist in the working directory.

    Reports the balanced-data CSV (with its size in MB), the domain-adapted
    model (needs both the model and the tokenizer directory) and, when any
    exist, the number of partial augmentation files.
    """
    import os

    separator = "-" * 50
    balanced_csv = 'train_balanced.csv'

    print("📋 ESTADO DE LOS CHECKPOINTS:")
    print(separator)

    # Balanced data
    if not os.path.exists(balanced_csv):
        print("Datos balanceados: NO encontrados")
    else:
        size_mb = os.path.getsize(balanced_csv) / (1024*1024)
        print(f"Datos balanceados: train_balanced.csv ({size_mb:.1f} MB)")

    # Domain-adapted model
    model_ready = all(
        os.path.exists(path)
        for path in ('domain_adapted_model', 'domain_adapted_model_tokenizer')
    )
    if model_ready:
        print("Modelo adaptado al dominio: domain_adapted_model/")
    else:
        print("Modelo adaptado al dominio: NO encontrado")

    # Partial augmentation files
    partial_count = sum(
        1 for entry in os.listdir('.') if entry.startswith('augmented_polarity_')
    )
    if partial_count:
        print(f"Archivos de aumentación parcial: {partial_count} archivos")

    print(separator)

def clean_checkpoints():
    """Delete every checkpoint artifact so the next run starts from scratch.

    Removes the balanced-data CSV, the partial augmentation files, the
    domain-adaptation directories and every ``cv_fold*`` directory found in
    the working directory, printing one line per removed entry.
    """
    import os
    import shutil

    print("LIMPIANDO CHECKPOINTS...")

    # CSV files
    csv_targets = ['train_balanced.csv']
    csv_targets.extend(
        name for name in os.listdir('.') if name.startswith('augmented_polarity_')
    )
    for path in csv_targets:
        if not os.path.exists(path):
            continue
        os.remove(path)
        print(f"️Eliminado: {path}")

    # Model directories
    for model_dir in ('domain_adapted_model', 'domain_adapted_model_tokenizer', 'domain_adapt'):
        if not os.path.exists(model_dir):
            continue
        shutil.rmtree(model_dir)
        print(f"Eliminado directorio: {model_dir}")

    # Cross-validation directories
    for entry in os.listdir('.'):
        if entry.startswith('cv_fold') and os.path.exists(entry):
            shutil.rmtree(entry)
            print(f"️Eliminado directorio: {entry}")

    print("Limpieza completada")

def run_complete_pipeline(train_path, test_path, force_restart=False):
    """Run the whole sentiment-analysis pipeline, reusing checkpoints.

    Steps: load the spreadsheets, normalise the text, balance the classes,
    adapt the language model to the domain, cross-validate, then train the
    final classifier and write the test predictions.

    Args:
        train_path: Path of the training file.
        test_path: Path of the test file.
        force_restart: When true, existing checkpoints are deleted first so
            every step is recomputed.

    Returns:
        The per-fold metrics produced by the cross-validation step.
    """
    print("INICIANDO PIPELINE DE SENTIMENT ANALYSIS")
    print("=" * 60)

    if force_restart:
        clean_checkpoints()
    check_checkpoint_status()

    # 1. Load data
    print("\n1️⃣  CARGANDO DATOS...")
    raw_train, raw_test = load_data(train_path, test_path)

    # 2. Preprocess
    print("\n2️⃣  PREPROCESANDO DATOS...")
    clean_train = preprocess_text(raw_train)
    clean_test = preprocess_text(raw_test)

    # 3. Balance
    print("\n3️⃣  BALANCEANDO DATOS...")
    balanced_train = balance_with_back_translation(clean_train)

    # 4. Domain adaptation
    print("\n4️⃣  ADAPTACIÓN AL DOMINIO...")
    adapted_model, tokenizer = domain_adaptive_pretraining(balanced_train['clean'])

    # 5. Cross-validation
    print("\n5️⃣  VALIDACIÓN CRUZADA...")
    fold_scores = cross_val_finetune(balanced_train, adapted_model, tokenizer)
    print("\nRESULTADOS DE VALIDACIÓN CRUZADA:")
    for score in fold_scores:
        fold_id = score['fold']
        f1_value = score['f1_macro']
        mae_value = score['mae']
        print(f"Fold {fold_id}: F1={f1_value:.4f}, MAE={mae_value:.4f}")

    # 6. Final training
    print("\n6️⃣  ENTRENAMIENTO FINAL Y PREDICCIONES...")
    final_train_and_predict(balanced_train, clean_test, adapted_model, tokenizer)

    print("\nPIPELINE COMPLETADO")
    print("Archivo de predicciones: predicciones.txt")

    return fold_scores

def final_train_and_predict(train_df, test_df, model, tokenizer, output_path='predicciones.txt'):
    """Train the final 5-class classifier and write the test predictions.

    A sequence-classification model is initialised from the weights of
    ``model``, trained for 3 epochs on all of ``train_df`` and then used to
    label each row of ``test_df``. One line per test row is written to
    ``output_path`` in the form ``MeIA <ID> <predicted polarity 1-5>``.

    Args:
        train_df: DataFrame with ``clean`` (text) and ``Polarity`` (1-5).
        test_df: DataFrame with ``ID`` and ``clean`` columns.
        model: Pretrained model whose weights initialise the classifier. If
            it has no ``save_pretrained`` method, the classifier is loaded
            from ``model.name_or_path`` or, failing that, from
            ``'PlanTL-GOB-ES/roberta-base-bne'``.
        tokenizer: Tokenizer matching ``model``.
        output_path: File that receives the predictions.
    """
    import tempfile
    from datasets import Dataset
    from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
    import torch

    fallback_name = 'PlanTL-GOB-ES/roberta-base-bne'

    def _as_text(value):
        # The tokenizer rejects NaN / non-string cells; treat them as empty.
        return value if isinstance(value, str) else ''

    def _load_classifier(source):
        return AutoModelForSequenceClassification.from_pretrained(
            source,
            num_labels=5,
            problem_type="single_label_classification"
        )

    # Reset indices.
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Prepare the training data (Polarity 1-5 -> class ids 0-4).
    train_texts = [_as_text(t) for t in train_df['clean'].tolist()]
    labels = [int(l - 1) for l in train_df['Polarity'].tolist()]

    ds_train = Dataset.from_dict({'text': train_texts, 'label': labels})

    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

    ds_train = ds_train.map(tokenize_function, batched=True)
    ds_train = ds_train.remove_columns(['text'])
    ds_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # Initialise the classifier from the weights actually passed in.
    # `model.name_or_path` may still name the base checkpoint the model was
    # loaded from, which would silently drop any further training.
    if hasattr(model, 'save_pretrained'):
        with tempfile.TemporaryDirectory() as scratch_dir:
            model.save_pretrained(scratch_dir)
            model_cls = _load_classifier(scratch_dir)
    else:
        model_cls = _load_classifier(
            model.name_or_path if hasattr(model, 'name_or_path') else fallback_name
        )

    args = TrainingArguments(
        output_dir='final_train',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_strategy="no",
        logging_steps=200,
        learning_rate=2e-5,
        weight_decay=0.01
    )

    trainer = Trainer(model=model_cls, args=args, train_dataset=ds_train)
    trainer.train()

    # Predict on the test set.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_cls.to(device)
    model_cls.eval()

    test_ids = test_df['ID'].tolist()
    test_texts = [_as_text(t) for t in test_df['clean'].tolist()]

    with open(output_path, 'w', encoding='utf-8') as f:
        for _id, txt in zip(test_ids, test_texts):
            inputs = tokenizer(txt, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = model_cls(**inputs).logits
                # Class ids are 0-4; shift back to the 1-5 polarity scale.
                pred = torch.argmax(logits, dim=1).item() + 1

            f.write(f"MeIA {_id} {pred}\n")

    print(f"Archivo de salida generado: {output_path}")

In [None]:
# Install the notebook's third-party dependencies before running the pipeline.
install_dependencies()

Collecting numpy<2.0
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014

In [None]:
# Run the full pipeline on the MeIA 2025 files; the call returns the
# per-fold cross-validation metrics.
run_complete_pipeline('MeIA_2025_train.xlsx', 'MeIA_2025_test_wo_labels.xlsx')

INICIANDO PIPELINE DE SENTIMENT ANALYSIS
📋 ESTADO DE LOS CHECKPOINTS:
--------------------------------------------------
Datos balanceados: train_balanced.csv (4.8 MB)
Modelo adaptado al dominio: NO encontrado
--------------------------------------------------

1️⃣  CARGANDO DATOS...

2️⃣  PREPROCESANDO DATOS...

3️⃣  BALANCEANDO DATOS...
Cargando datos balanceados desde checkpoint: train_balanced.csv

4️⃣  ADAPTACIÓN AL DOMINIO...
Realizando adaptación al dominio


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mjuanmario[0m ([33mjuanmario-unam-universidad-nacional-aut-noma-de-m-xico[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,2.5841
200,2.4665
300,2.3767
400,2.3199
500,2.3073


Modelo adaptado al dominio guardado en: domain_adapted_model

5️⃣  VALIDACIÓN CRUZADA...
Procesando fold 1/4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,1.2594
400,1.1054
600,1.0119
800,0.7683
1000,0.7199
1200,0.582
1400,0.3354
1600,0.3275
1800,0.2124
2000,0.1399


Evaluando fold 1
Fold 1 - F1 Macro: 0.6333, MAE: 0.4173
Procesando fold 2/4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,1.2886
400,1.101
600,0.9749
800,0.7688
1000,0.7088
1200,0.6249
1400,0.3249
1600,0.3301
1800,0.238
2000,0.1327


Evaluando fold 2
Fold 2 - F1 Macro: 0.6224, MAE: 0.4207
Procesando fold 3/4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,1.2145
400,1.092
600,0.9884
800,0.7525
1000,0.6864
1200,0.5757
1400,0.3576
1600,0.3499
1800,0.2248
2000,0.1427


Evaluando fold 3
Fold 3 - F1 Macro: 0.6208, MAE: 0.4360
Procesando fold 4/4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,1.2502
400,1.0495
600,1.0209
800,0.7139
1000,0.7604
1200,0.6025
1400,0.335
1600,0.3398
1800,0.1829
2000,0.1074


Evaluando fold 4
Fold 4 - F1 Macro: 0.5988, MAE: 0.4560

RESULTADOS DE VALIDACIÓN CRUZADA:
Fold 1: F1=0.6333, MAE=0.4173
Fold 2: F1=0.6224, MAE=0.4207
Fold 3: F1=0.6208, MAE=0.4360
Fold 4: F1=0.5988, MAE=0.4560

6️⃣  ENTRENAMIENTO FINAL Y PREDICCIONES...


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,1.2747
400,1.0645
600,1.0498
800,0.9492
1000,0.7202
1200,0.6819
1400,0.6496
1600,0.512
1800,0.3238
2000,0.2948


Archivo de salida generado: predicciones.txt

PIPELINE COMPLETADO
Archivo de predicciones: predicciones.txt


[{'fold': 1, 'f1_macro': 0.6333192063943998, 'mae': 0.41733333333333333},
 {'fold': 2, 'f1_macro': 0.622407988807034, 'mae': 0.4206666666666667},
 {'fold': 3, 'f1_macro': 0.6208227596772979, 'mae': 0.436},
 {'fold': 4, 'f1_macro': 0.5988215083060047, 'mae': 0.456}]