In [None]:
# Install required packages (run once; %pip installs into this kernel's environment)
# %pip install torch transformers scikit-learn tqdm pandas numpy matplotlib

In [None]:
import json
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

## 1. Load Data

In [None]:
# Read the aggregated dataset CSV into a single frame
df = pd.read_csv('../data/aggregated_data_en.csv')

# Carve out one frame per value of the 'split' column
split_column = df['split']
train_data = df.loc[split_column.eq('train')]
dev_data = df.loc[split_column.eq('dev')]
test_data = df.loc[split_column.eq('test')]

# Report how many rows landed in each partition
print(f"Training samples: {len(train_data)}")
print(f"Dev samples: {len(dev_data)}")
print(f"Test samples: {len(test_data)}")

# Report the class balance of the training partition
train_label_counts = train_data['label_sexist'].value_counts()
print("\nLabel distribution in training:")
print(train_label_counts)

## 2. Prepare Reference Data (Train + Dev)

In [None]:
# Stack the train and dev partitions into one reference pool (better coverage),
# renumbering the index from zero
reference_data = pd.concat((train_data, dev_data), ignore_index=True)

# Pull the columns used downstream out as plain Python lists
reference_texts, reference_labels, reference_ids = (
    reference_data[column].tolist() for column in ('text', 'label_sexist', 'id')
)

reference_label_counts = Counter(reference_labels)
print(f"Total reference samples (train + dev): {len(reference_texts)}")
print(f"Label distribution: {reference_label_counts}")

In [None]:
# Pull the test partition's text, id and label columns out as plain Python lists
test_texts, test_ids, test_labels = (
    test_data[column].tolist() for column in ('text', 'id', 'label_sexist')
)

print(f"Test samples: {len(test_texts)}")

## 3. Define Models to Test

We'll test multiple pre-trained models and compare their performance.

In [None]:
# Checkpoints to evaluate, in the order they will be processed
models_to_test = [
    'bert-base-multilingual-cased',            # Multilingual BERT
    'bert-base-uncased',                       # English BERT
    'roberta-base',                            # RoBERTa (English)
    'distilbert-base-uncased',                 # DistilBERT (smaller, faster)
    'sentence-transformers/all-MiniLM-L6-v2',  # Sentence-BERT (optimized for similarity)
]

# Echo the list as a 1-based numbered menu
print("Models to test:")
for position, model_name in enumerate(models_to_test):
    print(f"  {position + 1}. {model_name}")

# Pick the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"\nUsing device: {device}")

In [None]:
def get_cls_embeddings(texts, model, tokenizer, batch_size=32):
    """Extract CLS embeddings for a list of texts.

    Args:
        texts: List of strings to embed; it is sliced into consecutive
            batches of ``batch_size`` items.
        model: Encoder whose forward output exposes ``last_hidden_state``.
            Inputs are moved to the device of the model's first parameter.
        tokenizer: Callable tokenizer; invoked with padding, truncation to
            128 tokens and ``return_tensors='pt'``.
        batch_size: Number of texts per forward pass. Must be positive.

    Returns:
        A 2-D NumPy array with one row per input text, holding the hidden
        state at position 0 of each sequence. For empty input the result has
        shape ``(0, hidden_size)``, where ``hidden_size`` is read from
        ``model.config.hidden_size`` when available and is 0 otherwise.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # np.vstack([]) raises, so answer empty input explicitly with a
    # correctly-ranked empty array instead of crashing.
    if len(texts) == 0:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Send inputs to wherever the model actually lives rather than relying on
    # a notebook-level global that may be stale or undefined.
    model_device = next(model.parameters()).device

    embeddings = []

    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Extracting embeddings", leave=False):
            batch_texts = texts[i:i+batch_size]

            # Tokenize
            encoded = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors='pt'
            ).to(model_device)

            # Get model output
            outputs = model(**encoded)

            # Extract CLS token embedding (first token)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(cls_embeddings)

    return np.vstack(embeddings)

## 4. Extract Embeddings and Classify with Each Model

In [None]:
# Dictionary to store results for each model
results = {}

# Number of test rows scored per cosine_similarity call. Chunking keeps the
# (chunk x reference) similarity matrix small while avoiding one call per row,
# which would re-normalise the whole reference matrix every time.
SIMILARITY_CHUNK_SIZE = 1024

# Process each model
for model_name in models_to_test:
    print("=" * 80)
    print(f"Processing model: {model_name}")
    print("=" * 80)

    # Load tokenizer and model
    print(f"Loading {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model = model.to(device)
    model.eval()

    # Extract embeddings for reference data
    print("Extracting reference embeddings...")
    reference_embeddings = get_cls_embeddings(reference_texts, model, tokenizer)

    # Extract embeddings for test data
    print("Extracting test embeddings...")
    test_embeddings = get_cls_embeddings(test_texts, model, tokenizer)

    # Perform classification: each test row takes the label of the reference
    # row with the highest cosine similarity (first one wins on ties)
    print("Classifying...")
    predictions = []
    for start in tqdm(
        range(0, len(test_embeddings), SIMILARITY_CHUNK_SIZE),
        desc="Finding nearest neighbors",
        leave=False,
    ):
        test_chunk = test_embeddings[start:start + SIMILARITY_CHUNK_SIZE]
        similarities = cosine_similarity(test_chunk, reference_embeddings)
        nearest_indices = similarities.argmax(axis=1)
        predictions.extend(reference_labels[idx] for idx in nearest_indices)

    # Calculate metrics (averaged over classes, weighted by support)
    precision, recall, f1, _ = precision_recall_fscore_support(
        test_labels, predictions, average='weighted'
    )

    # Store results
    results[model_name] = {
        'predictions': predictions,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'reference_embeddings': reference_embeddings,
        'test_embeddings': test_embeddings
    }

    print(f"✓ Completed - F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
    print()

    # Drop this iteration's names and release cached GPU memory held by the
    # model. The embedding arrays themselves stay alive because `results`
    # still references them.
    del model, tokenizer, reference_embeddings, test_embeddings
    if device.type == 'cuda':
        torch.cuda.empty_cache()

print("\n" + "=" * 80)
print("All models processed!")
print("=" * 80)

## 5. Compare Results Across All Models

In [None]:
# Build one summary row per evaluated model
comparison_data = [
    {
        'Model': full_name.rsplit('/', 1)[-1],  # keep only the part after the last '/'
        'Precision': scores['precision'],
        'Recall': scores['recall'],
        'F1-Score': scores['f1'],
    }
    for full_name, scores in results.items()
]

# Tabulate and rank from highest to lowest F1
comparison_df = pd.DataFrame(comparison_data).sort_values('F1-Score', ascending=False)

banner = "=" * 80
print(banner)
print("MODEL COMPARISON - TEST SET RESULTS")
print(banner)
print(comparison_df.to_string(index=False))
print(banner)

# The top-ranked row is the winner
top_row = comparison_df.iloc[0]
best_model = top_row['Model']
best_f1 = top_row['F1-Score']
print(f"\nBest Model: {best_model}")
print(f"Best F1-Score: {best_f1:.4f}")

In [None]:
# Create visualization comparing all models: three per-metric horizontal bar
# panels plus one grouped-bar panel showing every metric side by side
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Sort by F1-score for better visualization
sorted_df = comparison_df.sort_values('F1-Score', ascending=True)

# Colors for bars
colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(sorted_df)))

# The three single-metric panels are drawn identically, so loop over them
metric_panels = [
    (axes[0, 0], 'F1-Score'),
    (axes[0, 1], 'Precision'),
    (axes[1, 0], 'Recall'),
]
for ax, metric in metric_panels:
    ax.barh(sorted_df['Model'], sorted_df[metric], color=colors)
    ax.set_xlabel(metric, fontsize=12)
    ax.set_title(f'{metric} Comparison', fontsize=13, fontweight='bold')
    ax.set_xlim([0, 1])
    ax.grid(True, alpha=0.3, axis='x')
    # Annotate each bar with its value, just past the end of the bar
    for bar_position, score in enumerate(sorted_df[metric]):
        ax.text(score + 0.01, bar_position, f'{score:.4f}', va='center', fontsize=10)

# Combined metrics comparison
x = np.arange(len(sorted_df))
width = 0.25

axes[1, 1].bar(x - width, sorted_df['Precision'], width, label='Precision', alpha=0.8)
axes[1, 1].bar(x, sorted_df['Recall'], width, label='Recall', alpha=0.8)
axes[1, 1].bar(x + width, sorted_df['F1-Score'], width, label='F1-Score', alpha=0.8)
axes[1, 1].set_ylabel('Score', fontsize=12)
axes[1, 1].set_title('All Metrics Comparison', fontsize=13, fontweight='bold')
axes[1, 1].set_xticks(x)
axes[1, 1].set_xticklabels(sorted_df['Model'], rotation=45, ha='right')
axes[1, 1].legend()
axes[1, 1].set_ylim([0, 1])
axes[1, 1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Comparison plot saved as 'model_comparison.png'")

## 6. Detailed Results for Best Model

In [None]:
# Map the short display name of the winner back to its full key in `results`
best_model_full = next(
    (name for name in results if name.split('/')[-1] == best_model),
    None,
)
if best_model_full is None:
    raise KeyError(f"No entry in results matches best model {best_model!r}")

print(f"Best performing model: {best_model_full}")
print("=" * 70)

# Get predictions from best model
best_predictions = results[best_model_full]['predictions']

# Get unique labels from the data
unique_labels = sorted(set(reference_labels) | set(test_labels))

# Show detailed classification report for best model.
# `labels` is passed explicitly so each row name in `target_names` is tied to
# the right class even when some label is absent from the test labels and
# predictions; names are stringified because target_names must be strings.
print("\nDetailed Classification Report (Best Model):")
print("-" * 70)
print(classification_report(
    test_labels,
    best_predictions,
    labels=unique_labels,
    target_names=[str(label) for label in unique_labels],
    zero_division=0,  # a class with no predictions/support scores 0 without a warning
))

## 7. Save Predictions from Best Model

In [None]:
# Map every test id to the label predicted by the best model
submission = dict(zip(test_ids, best_predictions))

# Persist the mapping as pretty-printed JSON
output_file = f'../data/vanilla_nn_predictions_{best_model}.json'
with open(output_file, 'w', encoding='utf-8') as out_handle:
    json.dump(submission, out_handle, indent=2, ensure_ascii=False)

print(f"Best model predictions saved to: {output_file}")

# Preview up to five test examples alongside their predictions
rule = "-" * 100
preview_count = min(5, len(test_texts))

print("\nSample predictions from best model:")
print("=" * 100)
for idx in range(preview_count):
    gold_label = test_labels[idx]
    predicted_label = best_predictions[idx]
    verdict = '✓' if gold_label == predicted_label else '✗'

    print(f"\nTest ID: {test_ids[idx]}")
    print(f"Text: {test_texts[idx][:100]}...")
    print(f"True Label: {gold_label}")
    print(f"Predicted Label: {predicted_label}")
    print(f"Correct: {verdict}")
    print(rule)