## 1. Setup and Imports

In [None]:
# Install required packages (uncomment if needed; %pip installs into the running kernel's environment)
# %pip install torch transformers datasets scikit-learn pandas tqdm matplotlib

In [None]:
import gc
import json
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset as HFDataset
from sklearn.metrics import (
    accuracy_score, 
    precision_recall_fscore_support, 
    classification_report
)
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

## 2. Load and Prepare Data

In [None]:
# Load the full dataset (both English and Spanish)
df = pd.read_csv('../data/aggregated_data.csv')

# Create label mappings.
# Missing labels are excluded before building the mapping: a NaN would
# otherwise become a class of its own (inflating num_labels) or make
# sorted() fail when the real labels are strings.
unique_labels = sorted(df['label_sexist'].dropna().unique().tolist())
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}
num_labels = len(label2id)

# Quick overview of the data: size, label mapping and the distribution of
# languages, splits and labels.
print(f"Total samples: {len(df)}")
print(f"Number of labels: {num_labels}")
print(f"Label mapping: {label2id}")
print(f"\nLanguage distribution:")
print(df['lang'].value_counts())
print(f"\nSplit distribution:")
print(df['split'].value_counts())
print(f"\nLabel distribution:")
print(df['label_sexist'].value_counts())

In [None]:
# Split data: one frame per value of the 'split' column, each with a numeric
# 'label' column derived from 'label_sexist' through label2id.
def _split_with_label_ids(frame, split_name):
    """Return a copy of the rows of `frame` in `split_name`, with a 'label' id column."""
    subset = frame[frame['split'] == split_name].copy()
    subset['label'] = subset['label_sexist'].map(label2id)
    return subset


train_df = _split_with_label_ids(df, 'train')
dev_df = _split_with_label_ids(df, 'dev')
test_df = _split_with_label_ids(df, 'test')

# Report the size of every split together with its per-language breakdown.
for title, part in (('Train', train_df), ('Dev', dev_df), ('Test', test_df)):
    en_count = int((part['lang'] == 'en').sum())
    es_count = int((part['lang'] == 'es').sum())
    print(f"{title} samples: {len(part)} (EN: {en_count}, ES: {es_count})")

## 3. Define Models to Test

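We'll test multiple models with different language configurations. The configuration selects which language subset is used for training **and** for dev/test evaluation:
- **"multi"**: Train and evaluate on both English and Spanish data
- **"en"**: Train and evaluate only on English data
- **"es"**: Train and evaluate only on Spanish data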

In [None]:
# Models to fine-tune, mapped to the language configuration each one is run with.
# Keys: model names, Values: "en" (English only), "es" (Spanish only), "multi" (both languages)
models_config = {
    'bert-base-multilingual-cased': 'multi',
    'bert-base-uncased': 'en',
    'dccuchile/bert-base-spanish-wwm-cased': 'es',
    'roberta-base': 'en',
    'distilbert-base-multilingual-cased': 'multi',
}

# Human-readable description of every supported configuration code.
_config_descriptions = {
    'multi': 'Multilingual (EN + ES)',
    'en': 'English only',
    'es': 'Spanish only'
}
_rule = "=" * 70

print("Models configuration:")
print(_rule)
for model_name in models_config:
    lang_config = models_config[model_name]
    # Indexing (rather than .get) keeps an unknown configuration code a KeyError.
    lang_desc = _config_descriptions[lang_config]
    print(f"{model_name:45s} -> {lang_desc}")
print(_rule)

In [None]:
def tokenize_function(examples, tokenizer, max_length=128):
    """Tokenize the 'text' field of a batch of examples.

    Args:
        examples: Mapping with a 'text' entry holding the texts to tokenize.
        tokenizer: Callable tokenizer; invoked with the texts plus
            `truncation=True` and `max_length`.
        max_length: Maximum sequence length passed to the tokenizer
            (defaults to 128, the value that was previously hard-coded).

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    return tokenizer(examples['text'], truncation=True, max_length=max_length)

def prepare_datasets(train_df, dev_df, test_df, tokenizer, lang_config):
    """
    Prepare tokenized datasets based on language configuration

    Args:
        train_df, dev_df, test_df: DataFrames with 'text', 'label' and 'lang' columns
        tokenizer: Tokenizer to use
        lang_config: "en", "es", or "multi"

    Returns:
        Tokenized train, dev, and test datasets

    Raises:
        ValueError: If lang_config is not one of "en", "es" or "multi".
    """
    # Fail loudly on an unknown configuration instead of silently treating a
    # typo (e.g. "EN") as "multi" and training on all languages.
    if lang_config not in ('en', 'es', 'multi'):
        raise ValueError(
            f"Unknown lang_config {lang_config!r}; expected 'en', 'es' or 'multi'"
        )

    def _build_split(frame):
        """Filter one split by language, convert it to a HF dataset and tokenize it."""
        if lang_config != 'multi':
            frame = frame[frame['lang'] == lang_config]
        # Only the text and the numeric label are handed to the dataset.
        dataset = HFDataset.from_pandas(frame[['text', 'label']].copy())
        return dataset.map(
            lambda batch: tokenize_function(batch, tokenizer), batched=True
        )

    train_dataset = _build_split(train_df)
    dev_dataset = _build_split(dev_df)
    test_dataset = _build_split(test_df)

    print(f"  Train: {len(train_dataset)} samples")
    print(f"  Dev: {len(dev_dataset)} samples")
    print(f"  Test: {len(test_dataset)} samples")

    return train_dataset, dev_dataset, test_dataset

In [None]:
# Metrics callback handed to the Trainer
def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: Pair of (per-class scores, reference labels); the predicted
            class is the argmax over axis 1 of the scores.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    # The fourth element (support) is not reported.
    weighted = precision_recall_fscore_support(gold, predicted, average='weighted')
    report = {'accuracy': accuracy_score(gold, predicted)}
    report.update(zip(('precision', 'recall', 'f1'), weighted[:3]))
    return report

In [None]:
# Dictionary to store all results, keyed by "<model_name> (<lang_config>)"
all_results = {}

# Key of the run with the highest test F1 so far. Only that run keeps its live
# Trainer in all_results; every other entry stores 'trainer': None. Keeping a
# reference to every Trainer would keep every fine-tuned model alive, so the
# `del` + empty_cache() at the end of each iteration could not free anything.
best_run_key = None

# Train and evaluate each model
for model_name, lang_config in models_config.items():
    print("\n" + "=" * 80)
    print(f"Processing: {model_name}")
    print(f"Language configuration: {lang_config}")
    print("=" * 80)

    # Model ids may contain '/', which cannot be used inside a directory name.
    safe_name = model_name.replace("/", "_")

    # Load tokenizer
    print(f"\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Prepare datasets based on language configuration
    print(f"Preparing datasets for '{lang_config}' configuration...")
    train_dataset, dev_dataset, test_dataset = prepare_datasets(
        train_df, dev_df, test_df, tokenizer, lang_config
    )

    # Initialize model
    print(f"\nInitializing model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )
    print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")

    # Setup training arguments
    # NOTE(review): warmup_steps is a fixed 500 regardless of how many training
    # steps the (possibly language-filtered) dataset yields - confirm it is a
    # sensible fraction of the total steps for every configuration.
    output_dir = f'./results_{safe_name}_{lang_config}'
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        warmup_steps=500,
        weight_decay=0.01,
        learning_rate=2e-5,
        logging_dir=f'./logs_{safe_name}_{lang_config}',
        logging_steps=100,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        save_total_limit=2,
    )

    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Initialize trainer (the dev split is used for the per-epoch evaluation)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train
    print(f"\nTraining...")
    trainer.train()

    # Evaluate on test set
    print(f"\nEvaluating on test set...")
    test_results = trainer.evaluate(test_dataset)

    # Store results
    run_key = f"{model_name} ({lang_config})"
    all_results[run_key] = {
        'model_name': model_name,
        'lang_config': lang_config,
        'accuracy': test_results['eval_accuracy'],
        'precision': test_results['eval_precision'],
        'recall': test_results['eval_recall'],
        'f1': test_results['eval_f1'],
        'trainer': trainer,
        'test_dataset': test_dataset
    }

    # Keep exactly one live Trainer: the best run so far. The comparison is
    # strict so that, on ties, the earliest run stays the best one.
    if best_run_key is None or test_results['eval_f1'] > all_results[best_run_key]['f1']:
        if best_run_key is not None:
            all_results[best_run_key]['trainer'] = None
        best_run_key = run_key
    else:
        all_results[run_key]['trainer'] = None

    print(f"\nResults:")
    print(f"  Accuracy:  {test_results['eval_accuracy']:.4f}")
    print(f"  Precision: {test_results['eval_precision']:.4f}")
    print(f"  Recall:    {test_results['eval_recall']:.4f}")
    print(f"  F1-Score:  {test_results['eval_f1']:.4f}")

    # Save model
    save_dir = f'./{safe_name}_{lang_config}'
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"\nModel saved to '{save_dir}'")

    # Clear memory: drop the loop-local references, collect unreachable objects
    # (including reference cycles), then hand cached CUDA blocks back.
    del model, trainer, tokenizer
    gc.collect()
    if device.type == 'cuda':
        torch.cuda.empty_cache()

print("\n" + "=" * 80)
print("All models trained and evaluated!")
print("=" * 80)

## 4. Compare Results by Language Configuration

### 4.1 Overall Comparison Table

In [None]:
# Build one table row per trained run from the metrics collected in all_results.
comparison_data = [
    {
        'Model': run['model_name'].split('/')[-1],
        'Lang Config': run['lang_config'].upper(),
        'Accuracy': run['accuracy'],
        'Precision': run['precision'],
        'Recall': run['recall'],
        'F1-Score': run['f1']
    }
    for run in all_results.values()
]

# Rank the runs by F1, best first.
comparison_df = pd.DataFrame(comparison_data).sort_values('F1-Score', ascending=False)

_banner = "=" * 90
print(_banner)
print("ALL MODELS COMPARISON - TEST SET RESULTS")
print(_banner)
print(comparison_df.to_string(index=False))
print(_banner)

# Identify best overall model (first row after sorting).
# NOTE(review): each run is scored on the test subset matching its language
# configuration, so F1 values of EN/ES/MULTI runs come from different test sets.
best_idx = comparison_df.index[0]
_top_row = comparison_df.iloc[0]
print(f"\nBest Overall Model: {_top_row['Model']} ({_top_row['Lang Config']})")
print(f"Best F1-Score: {_top_row['F1-Score']:.4f}")

In [None]:
# Partition the comparison table by language configuration. The three frames
# keep the row order of comparison_df, so row 0 of each is that group's top row.
_frames_by_config = {
    code: comparison_df[comparison_df['Lang Config'] == code].copy()
    for code in ('EN', 'ES', 'MULTI')
}
en_results = _frames_by_config['EN']
es_results = _frames_by_config['ES']
multi_results = _frames_by_config['MULTI']

# (config code, section heading, message shown when the group is empty)
_sections = (
    ('EN', "ENGLISH-ONLY MODELS", "No English-only models"),
    ('ES', "SPANISH-ONLY MODELS", "No Spanish-only models"),
    ('MULTI', "MULTILINGUAL MODELS", "No multilingual models"),
)
_rule = "=" * 70

for _code, _heading, _empty_message in _sections:
    _frame = _frames_by_config[_code]
    print("\n" + _rule)
    print(_heading)
    print(_rule)
    if len(_frame) == 0:
        print(_empty_message)
        continue
    _top = _frame.iloc[0]
    print(_frame.to_string(index=False))
    print(f"\nBest {_code} model: {_top['Model']} (F1: {_top['F1-Score']:.4f})")

In [None]:
import matplotlib.pyplot as plt

# Create comprehensive visualization: a 3x3 grid with
#   row 0: F1 of every run (one wide panel),
#   row 1: accuracy / precision / recall grouped by language configuration,
#   row 2: all four metrics per model, one panel per language configuration.
fig = plt.figure(figsize=(18, 12))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# Color mapping for language configs
color_map = {'EN': '#1f77b4', 'ES': '#ff7f0e', 'MULTI': '#2ca02c'}

# Attach the colours to a plotting-only copy. Writing the column into
# comparison_df itself would leak it into everything that later uses that
# frame (for example the CSV export).
plot_df = comparison_df.assign(Color=comparison_df['Lang Config'].map(color_map))

# 1. Overall F1-Score comparison (large plot)
ax1 = fig.add_subplot(gs[0, :])
# Ascending order so that the best run ends up at the top of the horizontal bars.
sorted_df = plot_df.sort_values('F1-Score', ascending=True)
bars = ax1.barh(range(len(sorted_df)), sorted_df['F1-Score'], 
                color=sorted_df['Color'], alpha=0.8)
ax1.set_yticks(range(len(sorted_df)))
ax1.set_yticklabels([f"{row['Model']} ({row['Lang Config']})" 
                      for _, row in sorted_df.iterrows()])
ax1.set_xlabel('F1-Score', fontsize=12)
ax1.set_title('Overall F1-Score Comparison', fontsize=14, fontweight='bold')
ax1.set_xlim([0, 1])
ax1.grid(True, alpha=0.3, axis='x')
# Write the numeric value next to each bar.
for i, (_, row) in enumerate(sorted_df.iterrows()):
    ax1.text(row['F1-Score'] + 0.01, i, f"{row['F1-Score']:.4f}", 
             va='center', fontsize=9)

# 2-4. Metrics by language configuration
metrics = ['Accuracy', 'Precision', 'Recall']
for idx, metric in enumerate(metrics):
    ax = fig.add_subplot(gs[1, idx])
    
    # Prepare data for grouped bars: each configuration is shifted by one bar
    # width so that its bars sit next to those of the other configurations.
    configs = ['EN', 'ES', 'MULTI']
    for i, config in enumerate(configs):
        config_data = plot_df[plot_df['Lang Config'] == config]
        if len(config_data) > 0:
            x_pos = np.arange(len(config_data)) + i * 0.25
            ax.bar(x_pos, config_data[metric], 0.25, 
                   label=config, color=color_map[config], alpha=0.8)
    
    ax.set_ylabel(metric, fontsize=11)
    ax.set_title(f'{metric} by Config', fontsize=12, fontweight='bold')
    ax.set_ylim([0, 1])
    # Only draw a legend when at least one group of bars was plotted.
    if ax.get_legend_handles_labels()[0]:
        ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

# 5-7. Individual comparisons for each language config
for idx, (config, config_data) in enumerate([
    ('EN', en_results), ('ES', es_results), ('MULTI', multi_results)
]):
    ax = fig.add_subplot(gs[2, idx])
    
    if len(config_data) > 0:
        x = np.arange(len(config_data))
        width = 0.2
        
        # Four bars per model, centred on the model's tick position.
        ax.bar(x - width*1.5, config_data['Precision'], width, 
               label='Precision', alpha=0.8)
        ax.bar(x - width*0.5, config_data['Recall'], width, 
               label='Recall', alpha=0.8)
        ax.bar(x + width*0.5, config_data['F1-Score'], width, 
               label='F1-Score', alpha=0.8)
        ax.bar(x + width*1.5, config_data['Accuracy'], width, 
               label='Accuracy', alpha=0.8)
        
        ax.set_ylabel('Score', fontsize=11)
        ax.set_title(f'{config} Models Comparison', fontsize=12, fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels(config_data['Model'], rotation=45, ha='right', fontsize=9)
        ax.legend(fontsize=9)
        ax.set_ylim([0, 1])
        ax.grid(True, alpha=0.3, axis='y')
    else:
        # No run for this configuration: show a placeholder instead of empty axes.
        ax.text(0.5, 0.5, f'No {config} models', 
                ha='center', va='center', fontsize=12)
        ax.set_xlim([0, 1])
        ax.set_ylim([0, 1])
        ax.axis('off')

# Save before show(): the figure is written to disk while it is still open.
plt.savefig('bert_models_comprehensive_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nComprehensive comparison plot saved as 'bert_models_comprehensive_comparison.png'")

### 4.2 Analysis: Multilingual vs Monolingual Strategy

In [None]:
# Compare multilingual approach vs best monolingual combination
if len(multi_results) > 0:
    best_multi = multi_results.iloc[0]
    
    # Get best EN and ES models if they exist
    if len(en_results) > 0 and len(es_results) > 0:
        best_en = en_results.iloc[0]
        best_es = es_results.iloc[0]
        
        # Estimate combined monolingual performance (weighted average by test set size)
        # This is an approximation - actual merged performance would require combining predictions
        en_test_size = len(test_df[test_df['lang'] == 'en'])
        es_test_size = len(test_df[test_df['lang'] == 'es'])
        # Normalise by the EN + ES total rather than len(test_df), so the two
        # weights always sum to 1 even if the test split contains rows in
        # another language.
        total_test = en_test_size + es_test_size

        def _size_weighted(metric):
            """Average the best EN and ES value of `metric`, weighted by test-set size."""
            return (best_en[metric] * en_test_size +
                    best_es[metric] * es_test_size) / total_test

        combined_f1 = _size_weighted('F1-Score')
        combined_precision = _size_weighted('Precision')
        combined_recall = _size_weighted('Recall')
        
        print("=" * 80)
        print("STRATEGY COMPARISON: Multilingual vs Monolingual Combination")
        print("=" * 80)
        print(f"\nBest Multilingual Model: {best_multi['Model']}")
        print(f"  F1-Score:  {best_multi['F1-Score']:.4f}")
        print(f"  Precision: {best_multi['Precision']:.4f}")
        print(f"  Recall:    {best_multi['Recall']:.4f}")
        
        print(f"\nBest Monolingual Combination: {best_en['Model']} (EN) + {best_es['Model']} (ES)")
        print(f"  Combined F1-Score (est.):  {combined_f1:.4f}")
        print(f"  Combined Precision (est.): {combined_precision:.4f}")
        print(f"  Combined Recall (est.):    {combined_recall:.4f}")
        
        print(f"\nDifference:")
        print(f"  F1-Score: {best_multi['F1-Score'] - combined_f1:+.4f}")
        
        if best_multi['F1-Score'] > combined_f1:
            print(f"\n✓ Multilingual approach performs better by {best_multi['F1-Score'] - combined_f1:.4f}")
        else:
            print(f"\n✓ Monolingual combination performs better by {combined_f1 - best_multi['F1-Score']:.4f}")
        print("=" * 80)
    else:
        print("Cannot compare - need both EN and ES models for monolingual strategy")
else:
    # Previously this case produced no output at all.
    print("Cannot compare - no multilingual models")

### 4.3 Detailed Results for Best Model

In [None]:
# Get best model's detailed results: the run with the highest test F1
# (on ties, the first such run in all_results).
best_model_key = max(all_results, key=lambda run_key: all_results[run_key]['f1'])

best_model_results = all_results[best_model_key]

print("=" * 80)
print(f"BEST MODEL DETAILS: {best_model_key}")
print("=" * 80)
print(f"Accuracy:  {best_model_results['accuracy']:.4f}")
print(f"Precision: {best_model_results['precision']:.4f}")
print(f"Recall:    {best_model_results['recall']:.4f}")
print(f"F1-Score:  {best_model_results['f1']:.4f}")

# Get predictions for detailed report
print("\nGenerating detailed classification report...")
predictions = best_model_results['trainer'].predict(best_model_results['test_dataset'])
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

# Pass the label ids explicitly: without `labels`, classification_report raises
# when a class is absent from this test subset, because the number of observed
# classes no longer matches the number of target names. Names are converted to
# str because the report measures their length.
report_label_ids = sorted(id2label.keys())

print("\nDetailed Classification Report:")
print("-" * 80)
print(classification_report(
    true_labels, 
    pred_labels, 
    labels=report_label_ids,
    target_names=[str(id2label[i]) for i in report_label_ids]
))

## 5. Save Best Model Predictions

In [None]:
# Save predictions from best model
best_lang = best_model_results['lang_config']
output_file = f'../data/bert_predictions_{best_model_results["model_name"].replace("/", "_")}_{best_lang}.json'

# Select the ids of the test rows the best model was evaluated on: every row
# for 'multi', otherwise only the rows of the model's language.
if best_lang == 'multi':
    test_ids = test_df['id'].tolist()
else:
    test_ids = test_df.loc[test_df['lang'] == best_lang, 'id'].tolist()

# zip() would silently truncate to the shorter sequence and write an
# incomplete (or misaligned) submission, so a size mismatch is an error.
if len(test_ids) != len(pred_labels):
    raise ValueError(
        f"Number of test ids ({len(test_ids)}) does not match the number of "
        f"predictions ({len(pred_labels)}) for lang_config '{best_lang}'"
    )

# Create submission dictionary: test id (as string) -> predicted label name
submission = {
    str(test_id): id2label[int(pred_label)]
    for test_id, pred_label in zip(test_ids, pred_labels)
}

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(submission, f, indent=2, ensure_ascii=False)

print(f"Best model predictions saved to: {output_file}")
print(f"Total predictions: {len(submission)}")

In [None]:
# Export results to CSV for further analysis.
# 'Color' is a plotting-only helper column that may have been added to
# comparison_df by the visualization cell; it is left out of the export
# (errors='ignore' makes this a no-op when the column is absent).
comparison_df.drop(columns=['Color'], errors='ignore').to_csv(
    'bert_models_comparison_results.csv', index=False
)
print("\nResults exported to 'bert_models_comparison_results.csv'")