In [None]:
# Sentiment Analysis en Pueblos Mágicos (alineado con Rest‑Mex) - VERSIÓN CORREGIDA

# 1. Instalación de librerías y setup inicial
def install_dependencies():
    !pip install pandas "numpy<2.0" scikit-learn openpyxl transformers torch sentencepiece datasets evaluate sacremoses tqdm

# 2. Carga de datos
def load_data(train_path, test_path):
    import pandas as pd
    train = pd.read_excel(train_path)
    test  = pd.read_excel(test_path)
    return train, test

# 3. Exploración y limpieza básica del texto
def preprocess_text(df):
    import re
    df['clean'] = (
        df['Review']
        .str.lower()
        .str.replace(r"<[^>]+>", " ", regex=True)
        .str.replace(r"[^a-záéíóúñü ]", " ", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    return df

# 4. Balanceo por aumentación de datos en clases minoritarias (batches + GPU) CON CHECKPOINTS
from tqdm.auto import tqdm
import torch
import os

def balance_with_back_translation(df, target_count=None, batch_size=128, checkpoint_path='train_balanced.csv'):
    import pandas as pd
    from math import ceil
    from transformers import MarianMTModel, MarianTokenizer

    # Verificar si ya existe el checkpoint
    if os.path.exists(checkpoint_path):
        print(f"📁 Cargando datos balanceados desde checkpoint: {checkpoint_path}")
        return pd.read_csv(checkpoint_path)

    print("🔄 Generando datos balanceados (esto puede tomar tiempo)...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Carga de modelos de traducción y envío a GPU
    me2en_tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-es-en')
    me2en = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-es-en').to(device)
    en2me_tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-es')
    en2me = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-es').to(device)

    counts = df['Polarity'].value_counts().to_dict()
    max_count = target_count or max(counts.values())
    augmented = []  # lista de (texto, polaridad)

    for polarity, cnt in counts.items():
        if cnt < max_count:
            needed = max_count - cnt
            samples = df[df['Polarity'] == polarity]['clean'].tolist()
            reps = ceil(needed / len(samples))
            pool = samples * reps
            to_aug = pool[:needed]

            batch_augmented = []
            for i in tqdm(range(0, len(to_aug), batch_size), desc=f'Aug Polarity {polarity}'):
                batch_texts = to_aug[i:i+batch_size]

                with torch.no_grad():
                    # es -> en
                    enc = me2en_tok(batch_texts, return_tensors='pt', padding=True, truncation=True).to(device)
                    en_ids = me2en.generate(**enc, max_length=128)
                    en_texts = me2en_tok.batch_decode(en_ids, skip_special_tokens=True)

                    # en -> es
                    dec = en2me_tok(en_texts, return_tensors='pt', padding=True, truncation=True).to(device)
                    es_ids = en2me.generate(**dec, max_length=128)
                    aug_texts = en2me_tok.batch_decode(es_ids, skip_special_tokens=True)

                batch_augmented.extend([(t, polarity) for t in aug_texts])

            if batch_augmented:
                augmented.extend(batch_augmented)
                df_partial = pd.DataFrame({
                    'Review': [t for t,_ in batch_augmented],
                    'Polarity': [p for _,p in batch_augmented],
                    'clean': [t for t,_ in batch_augmented]
                })
                df_partial.to_csv(f'augmented_polarity_{int(polarity)}.csv', mode='a', header=not pd.io.common.file_exists(f'augmented_polarity_{int(polarity)}.csv'), index=False, encoding='utf-8')

    texts, labels = zip(*augmented) if augmented else ([], [])
    aug_df = pd.DataFrame({'Review': texts, 'Polarity': labels, 'clean': texts})
    balanced_df = pd.concat([df, aug_df], ignore_index=True)

    # Guardar checkpoint
    balanced_df.to_csv(checkpoint_path, index=False, encoding='utf-8')
    print(f"💾 Datos balanceados guardados en: {checkpoint_path}")

    return balanced_df

# 5. Adaptación de dominio (Domain-Adaptive Pretraining) CON CHECKPOINTS
def domain_adaptive_pretraining(texts, base_model='PlanTL-GOB-ES/roberta-base-bne', checkpoint_dir='domain_adapted_model'):
    from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
    from datasets import Dataset
    import os

    # Verificar si ya existe el checkpoint
    if os.path.exists(checkpoint_dir) and os.path.exists(f"{checkpoint_dir}_tokenizer"):
        print(f"📁 Cargando modelo adaptado al dominio desde checkpoint: {checkpoint_dir}")
        model = AutoModelForMaskedLM.from_pretrained(checkpoint_dir)
        tokenizer = AutoTokenizer.from_pretrained(f"{checkpoint_dir}_tokenizer")
        return model, tokenizer

    print("🔄 Realizando adaptación al dominio (esto puede tomar tiempo)...")
    tok = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForMaskedLM.from_pretrained(base_model)
    ds = Dataset.from_dict({'text': texts})
    def tok_fn(x): return tok(x['text'], truncation=True, padding='max_length', max_length=128)
    tok_ds = ds.map(tok_fn, batched=True)

    data_collator = DataCollatorForLanguageModeling(tok, mlm=True, mlm_probability=0.15)
    args = TrainingArguments(
        output_dir='domain_adapt', num_train_epochs=3,
        per_device_train_batch_size=32, logging_steps=100, save_steps=500,
        learning_rate=2e-5, weight_decay=0.01
    )
    trainer = Trainer(model=model, args=args, train_dataset=tok_ds, data_collator=data_collator)
    trainer.train()

    # Guardar checkpoint
    model.save_pretrained(checkpoint_dir)
    tok.save_pretrained(f"{checkpoint_dir}_tokenizer")
    print(f"💾 Modelo adaptado al dominio guardado en: {checkpoint_dir}")

    return model, tok

# 6. Fine‑tuning con validación cruzada estratificada (5-fold) - VERSIÓN CORREGIDA
def cross_val_finetune(train_df, model, tokenizer, n_splits=4):
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import StratifiedKFold
    from datasets import Dataset
    from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
    from sklearn.metrics import f1_score, mean_absolute_error
    import torch

    # Forzar compatibilidad con NumPy
    import os
    os.environ['NUMPY_EXPERIMENTAL_ARRAY_FUNCTION'] = '0'

    # Resetear índices para evitar problemas con el acceso a datos
    train_df = train_df.reset_index(drop=True)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    metrics = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_df['clean'], train_df['Polarity']), start=1):
        print(f"Procesando fold {fold}/{n_splits}")

        # Preparar datasets de entrenamiento y validación usando .iloc para acceso seguro
        train_texts = train_df.iloc[train_idx]['clean'].tolist()
        train_labels = [int(label - 1) for label in train_df.iloc[train_idx]['Polarity'].tolist()]
        val_texts = train_df.iloc[val_idx]['clean'].tolist()
        val_labels = [int(label - 1) for label in train_df.iloc[val_idx]['Polarity'].tolist()]

        # Crear datasets usando un enfoque más directo
        train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
        val_encodings = tokenizer(val_texts, truncation=True, padding='max_length', max_length=128, return_tensors='pt')

        # CLASE CUSTOMDATASET CORREGIDA - SIN WARNINGS
        class CustomDataset(torch.utils.data.Dataset):
            def __init__(self, encodings, labels):
                self.encodings = encodings
                self.labels = labels

            def __getitem__(self, idx):
                # CORRECCIÓN: Usar .clone().detach() en lugar de torch.tensor()
                item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
                item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
                return item

            def __len__(self):
                return len(self.labels)

        train_dataset = CustomDataset(train_encodings, train_labels)
        val_dataset = CustomDataset(val_encodings, val_labels)

        # Crear y entrenar modelo
        model_cls = AutoModelForSequenceClassification.from_pretrained(
            model.name_or_path if hasattr(model, 'name_or_path') else 'PlanTL-GOB-ES/roberta-base-bne',
            num_labels=5,
            problem_type="single_label_classification"
        )

        args = TrainingArguments(
            output_dir=f'cv_fold{fold}',
            num_train_epochs=4,
            per_device_train_batch_size=8,
            logging_steps=200,
            save_strategy="no" , # No guardar checkpoints para ahorrar espacio
            learning_rate=2e-5,
            weight_decay=0.01
        )

        trainer = Trainer(
            model=model_cls,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset
        )
        trainer.train()

        # Evaluación manual para evitar problemas con datasets
        print(f"Evaluando fold {fold}")
        model_cls.eval()
        device = next(model_cls.parameters()).device

        all_preds = []
        with torch.no_grad():
            for i in range(0, len(val_texts), 512):  # Procesar en lotes de 32
                batch_texts = val_texts[i:i+512]
                batch_encodings = tokenizer(batch_texts, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
                batch_encodings = {k: v.to(device) for k, v in batch_encodings.items()}

                outputs = model_cls(**batch_encodings)
                predictions = torch.argmax(outputs.logits, dim=-1)
                all_preds.extend(predictions.cpu().numpy())

        # Convertir de vuelta a etiquetas originales (1-5)
        true_labels = [label + 1 for label in val_labels]
        pred_labels = [pred + 1 for pred in all_preds]

        f1 = f1_score(val_labels, all_preds, average='macro')
        mae = mean_absolute_error(true_labels, pred_labels)

        metrics.append({'fold': fold, 'f1_macro': f1, 'mae': mae})
        print(f"Fold {fold} - F1 Macro: {f1:.4f}, MAE: {mae:.4f}")

    return metrics

# FUNCIONES ADICIONALES PARA GESTIÓN DE CHECKPOINTS

def check_checkpoint_status():
    """Verifica qué checkpoints existen y su tamaño"""
    import os
    print("📋 ESTADO DE LOS CHECKPOINTS:")
    print("-" * 50)

    # Datos balanceados
    if os.path.exists('train_balanced.csv'):
        size = os.path.getsize('train_balanced.csv') / (1024*1024)  # MB
        print(f"✅ Datos balanceados: train_balanced.csv ({size:.1f} MB)")
    else:
        print("❌ Datos balanceados: NO encontrados")

    # Modelo adaptado al dominio
    if os.path.exists('domain_adapted_model') and os.path.exists('domain_adapted_model_tokenizer'):
        print("✅ Modelo adaptado al dominio: domain_adapted_model/")
    else:
        print("❌ Modelo adaptado al dominio: NO encontrado")

    # Archivos de aumentación parcial
    aug_files = [f for f in os.listdir('.') if f.startswith('augmented_polarity_')]
    if aug_files:
        print(f"📂 Archivos de aumentación parcial: {len(aug_files)} archivos")

    print("-" * 50)

def clean_checkpoints():
    """Limpia todos los checkpoints para empezar desde cero"""
    import os
    import shutil

    print("🧹 LIMPIANDO CHECKPOINTS...")

    # Archivos CSV
    files_to_remove = ['train_balanced.csv'] + [f for f in os.listdir('.') if f.startswith('augmented_polarity_')]
    for file in files_to_remove:
        if os.path.exists(file):
            os.remove(file)
            print(f"🗑️  Eliminado: {file}")

    # Directorios de modelos
    dirs_to_remove = ['domain_adapted_model', 'domain_adapted_model_tokenizer', 'domain_adapt']
    for dir_name in dirs_to_remove:
        if os.path.exists(dir_name):
            shutil.rmtree(dir_name)
            print(f"🗑️  Eliminado directorio: {dir_name}")

    # Directorios de CV
    cv_dirs = [d for d in os.listdir('.') if d.startswith('cv_fold')]
    for cv_dir in cv_dirs:
        if os.path.exists(cv_dir):
            shutil.rmtree(cv_dir)
            print(f"🗑️  Eliminado directorio: {cv_dir}")

    print("✅ Limpieza completada")

def mount_drive_for_persistence():
    """Monta Google Drive para persistencia de checkpoints"""
    try:
        from google.colab import drive
        drive.mount('/content/drive')

        # Crear directorio de trabajo en Drive
        import os
        work_dir = '/content/drive/MyDrive/sentiment_analysis_checkpoints'
        os.makedirs(work_dir, exist_ok=True)
        os.chdir(work_dir)

        print(f"📁 Directorio de trabajo: {work_dir}")
        print("✅ Google Drive montado - Los checkpoints persistirán entre sesiones")

    except ImportError:
        print("❌ No estás en Google Colab - checkpoints solo durarán esta sesión")
    except Exception as e:
        print(f"❌ Error montando Google Drive: {e}")

# Función de ejecución completa con manejo inteligente de checkpoints
def run_complete_pipeline(train_path, test_path, force_restart=False):
    """
    Ejecuta el pipeline completo con manejo inteligente de checkpoints

    Args:
        train_path: Ruta al archivo de entrenamiento
        test_path: Ruta al archivo de test
        force_restart: Si True, ignora checkpoints y empieza desde cero
    """
    print("🚀 INICIANDO PIPELINE DE SENTIMENT ANALYSIS")
    print("=" * 60)

    if force_restart:
        clean_checkpoints()

    check_checkpoint_status()

    # 1. Cargar datos
    print("\n1️⃣  CARGANDO DATOS...")
    train, test = load_data(train_path, test_path)

    # 2. Preprocesar
    print("\n2️⃣  PREPROCESANDO DATOS...")
    train = preprocess_text(train)
    test = preprocess_text(test)

    # 3. Balancear (con checkpoint)
    print("\n3️⃣  BALANCEANDO DATOS...")
    train_bal = balance_with_back_translation(train)

    # 4. Adaptación al dominio (con checkpoint)
    print("\n4️⃣  ADAPTACIÓN AL DOMINIO...")
    model_da, tokenizer = domain_adaptive_pretraining(train_bal['clean'])

    # 5. Validación cruzada
    print("\n5️⃣  VALIDACIÓN CRUZADA...")
    cv_metrics = cross_val_finetune(train_bal, model_da, tokenizer)
    print("\n📊 RESULTADOS DE VALIDACIÓN CRUZADA:")
    for metric in cv_metrics:
        print(f"Fold {metric['fold']}: F1={metric['f1_macro']:.4f}, MAE={metric['mae']:.4f}")

    # 6. Entrenamiento final
    print("\n6️⃣  ENTRENAMIENTO FINAL Y PREDICCIONES...")
    final_train_and_predict(train_bal, test, model_da, tokenizer)

    print("\n✅ PIPELINE COMPLETADO")
    print("📄 Archivo de predicciones: predicciones.txt")

    return cv_metrics
def final_train_and_predict(train_df, test_df, model, tokenizer, output_path='predicciones.txt'):
    from datasets import Dataset
    from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
    import torch

    # Resetear índices
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Preparar datos de entrenamiento
    train_texts = train_df['clean'].tolist()
    labels = [int(l-1) for l in train_df['Polarity'].tolist()]

    ds_train = Dataset.from_dict({'text': train_texts, 'label': labels})

    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

    ds_train = ds_train.map(tokenize_function, batched=True)
    ds_train = ds_train.remove_columns(['text'])
    ds_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # Crear y entrenar modelo final
    model_cls = AutoModelForSequenceClassification.from_pretrained(
        model.name_or_path if hasattr(model, 'name_or_path') else 'PlanTL-GOB-ES/roberta-base-bne',
        num_labels=5,
        problem_type="single_label_classification"
    )

    args = TrainingArguments(
        output_dir='final_train',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_strategy="no"
    )

    trainer = Trainer(model=model_cls, args=args, train_dataset=ds_train)
    trainer.train()

    # Predicciones en test
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_cls.to(device)
    model_cls.eval()

    with open(output_path, 'w', encoding='utf-8') as f:
        for idx in range(len(test_df)):
            _id = test_df.iloc[idx]['ID']
            txt = test_df.iloc[idx]['clean']

            inputs = tokenizer(txt, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = model_cls(**inputs).logits
                pred = torch.argmax(logits, dim=1).item() + 1

            f.write(f"MeIA {_id} {pred}\n")

    print(f"Archivo de salida generado: {output_path}")

# 8. Pauta de ejecución simplificada en Colab CON CHECKPOINTS:
"""
EJECUCIÓN PASO A PASO CON CHECKPOINTS:

1) Instalar dependencias:
   install_dependencies()

2) Cargar datos:
   train, test = load_data('train.xlsx', 'test.xlsx')

3) Preprocesar datos:
   train = preprocess_text(train)
   test = preprocess_text(test)

4) Balancear datos (se guarda automáticamente):
   train_bal = balance_with_back_translation(train)
   # Si ya existe 'train_balanced.csv', se carga automáticamente

5) Adaptación al dominio (se guarda automáticamente):
   model_da, tokenizer = domain_adaptive_pretraining(train_bal['clean'])
   # Si ya existe 'domain_adapted_model/', se carga automáticamente

6) Validación cruzada:
   cv_metrics = cross_val_finetune(train_bal, model_da, tokenizer)
   print(cv_metrics)

7) Entrenamiento final y predicciones:
   final_train_and_predict(train_bal, test, model_da, tokenizer)

COMANDOS ÚTILES PARA GESTIONAR CHECKPOINTS:

- Ver archivos guardados:
  !ls -la *.csv
  !ls -la domain_adapted_model*/

- Forzar regeneración (borrar checkpoints):
  !rm -f train_balanced.csv
  !rm -rf domain_adapted_model*

- Verificar espacio en disco:
  !df -h
"""

"\nEJECUCIÓN PASO A PASO CON CHECKPOINTS:\n\n1) Instalar dependencias:\n   install_dependencies()\n\n2) Cargar datos:\n   train, test = load_data('train.xlsx', 'test.xlsx')\n\n3) Preprocesar datos:\n   train = preprocess_text(train)\n   test = preprocess_text(test)\n\n4) Balancear datos (se guarda automáticamente):\n   train_bal = balance_with_back_translation(train)\n   # Si ya existe 'train_balanced.csv', se carga automáticamente\n\n5) Adaptación al dominio (se guarda automáticamente):\n   model_da, tokenizer = domain_adaptive_pretraining(train_bal['clean'])\n   # Si ya existe 'domain_adapted_model/', se carga automáticamente\n\n6) Validación cruzada:\n   cv_metrics = cross_val_finetune(train_bal, model_da, tokenizer)\n   print(cv_metrics)\n\n7) Entrenamiento final y predicciones:\n   final_train_and_predict(train_bal, test, model_da, tokenizer)\n\nCOMANDOS ÚTILES PARA GESTIONAR CHECKPOINTS:\n\n- Ver archivos guardados:\n  !ls -la *.csv\n  !ls -la domain_adapted_model*/\n\n- Forzar r

In [None]:
install_dependencies()



In [None]:
train, test = load_data('MeIA_2025_train.xlsx', 'MeIA_2025_test_wo_labels.xlsx')

In [None]:
train = preprocess_text(train)
test = preprocess_text(test)

In [None]:
train_bal = balance_with_back_translation(train)

🔄 Generando datos balanceados (esto puede tomar tiempo)...


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

Aug Polarity 4.0:   0%|          | 0/1 [00:00<?, ?it/s]

Aug Polarity 3.0:   0%|          | 0/2 [00:00<?, ?it/s]

Aug Polarity 2.0:   0%|          | 0/3 [00:00<?, ?it/s]

Aug Polarity 1.0:   0%|          | 0/4 [00:00<?, ?it/s]

💾 Datos balanceados guardados en: train_balanced.csv


In [None]:
model_da, tokenizer = domain_adaptive_pretraining(train_bal['clean'])

🔄 Realizando adaptación al dominio (esto puede tomar tiempo)...


tokenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjuanmario[0m ([33mjuanmario-unam-universidad-nacional-aut-noma-de-m-xico[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,2.5804
200,2.4694
300,2.3786
400,2.3287
500,2.2992


💾 Modelo adaptado al dominio guardado en: domain_adapted_model


In [None]:
cv_metrics = cross_val_finetune(train_bal, model_da, tokenizer)
print(cv_metrics)

Procesando fold 1/4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,1.2618
400,1.0988
600,1.0153
800,0.7597
1000,0.7153
1200,0.5708
1400,0.3438
1600,0.3468
1800,0.2212
2000,0.1367


Evaluando fold 1
Fold 1 - F1 Macro: 0.6398, MAE: 0.4127
Procesando fold 2/4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,1.2906
400,1.1038
600,0.9783
800,0.7594
1000,0.6994
1200,0.6123
1400,0.3211
1600,0.3275
1800,0.2301
2000,0.1385


Evaluando fold 2
Fold 2 - F1 Macro: 0.6211, MAE: 0.4187
Procesando fold 3/4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,1.2198
400,1.0842
600,0.9881
800,0.7382
1000,0.6832
1200,0.5616
1400,0.3461
1600,0.3456
1800,0.2175
2000,0.1286


Evaluando fold 3
Fold 3 - F1 Macro: 0.6142, MAE: 0.4480
Procesando fold 4/4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
200,1.2537
400,1.0532
600,1.0201
800,0.7177
1000,0.7661
1200,0.5936
1400,0.3375
1600,0.3719
1800,0.1972
2000,0.1049


Evaluando fold 4
Fold 4 - F1 Macro: 0.6076, MAE: 0.4533
[{'fold': 1, 'f1_macro': 0.6398462263443698, 'mae': 0.4126666666666667}, {'fold': 2, 'f1_macro': 0.621145706728966, 'mae': 0.4186666666666667}, {'fold': 3, 'f1_macro': 0.614223187880991, 'mae': 0.448}, {'fold': 4, 'f1_macro': 0.6076154674685162, 'mae': 0.4533333333333333}]


In [None]:
final_train_and_predict(train_bal, test, model_da, tokenizer)

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,1.212
1000,0.9485
1500,0.7693
2000,0.3798


Archivo de salida generado: predicciones.txt
