In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, 
    recall_score, classification_report, confusion_matrix
)
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import warnings
import time
import os

# Silence library warnings so the console report below stays readable.
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch (CPU, plus every CUDA device when one is present).
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

print("=" * 80, "🤖 ENCODER-ONLY MODELS FOR TEXT CLASSIFICATION", "=" * 80, sep="\n")

# Pick the compute device and report it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"\n💻 Using device: {device}")
if device == "cuda":
    print(f"   GPU: {torch.cuda.get_device_name(0)}")

# ============================================================================
# 1. LOAD DATA
# ============================================================================
print("\n📂 Loading data...")
df = pd.read_excel("labels_sentences_FINAL_ROUND2.xlsx")
X, y = df['sentences_clean'].values, df['labels'].values

print(f"✅ Total samples: {len(df)}")
print(f"✅ Class distribution: {Counter(y)}")

# Map the raw label values to integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_labels = len(label_encoder.classes_)
print(f"✅ Number of classes: {num_labels}")
print(f"✅ Classes: {label_encoder.classes_}")

# ============================================================================
# 2. TRAIN-VAL-TEST SPLIT (70-10-20)
# ============================================================================
print("\n" + "=" * 80, "📊 Train-Validation-Test Split (70-10-20)", "=" * 80, sep="\n")

# Hold out 20% for testing first ...
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y_encoded, stratify=y_encoded, test_size=0.20, random_state=42
)

# ... then take 12.5% of the remaining 80% (10% of the total) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, stratify=y_temp, test_size=0.125, random_state=42
)

for _split_name, _split in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"✅ {_split_name}: {len(_split)} ({len(_split)/len(df)*100:.1f}%)")

# ============================================================================
# 3. DATASET CLASS
# ============================================================================
class TextDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked with the text and the keyword arguments
            shown in ``__getitem__``; its result must provide ``'input_ids'``
            and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D before being returned.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# ============================================================================
# 4. ENCODER-ONLY MODELS
# ============================================================================
print("\n" + "=" * 80, "🤖 ENCODER-ONLY MODELS TO EVALUATE", "=" * 80, sep="\n")

# Hugging Face checkpoint ids, in the order they will be fine-tuned.
models_to_train = [
    "bert-base-multilingual-cased",  # mBERT
    "xlm-roberta-base",              # XLM-R
    "google/muril-base-cased",       # MURIL
    "ai4bharat/indic-bert",          # IndicBERT
    "distilbert-base-multilingual-cased",  # DistilmBERT (faster)
]

print("\n📋 Selected Models:")
for i, model in enumerate(models_to_train, start=1):
    print("   {}. {}".format(i, model))

# ============================================================================
# 5. OPTIMAL HYPERPARAMETERS (based on NLP best practices)
# ============================================================================
print("\n" + "=" * 80, "⚙️ HYPERPARAMETERS", "=" * 80, sep="\n")

# One shared configuration applied to every model in the list above.
config = dict(
    learning_rate=2e-5,
    batch_size=16,
    num_epochs=5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_length=128,
)

print("\n📊 Configuration:")
for param, value in config.items():
    print("   {}: {}".format(param, value))

# ============================================================================
# 6. METRICS FUNCTION
# ============================================================================
def compute_metrics(pred):
    """Compute accuracy and weighted F1/precision/recall for a Trainer evaluation.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (class scores with the class dimension last, or a tuple whose first
            element holds those scores).

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    scores = pred.predictions
    # Models that emit extra outputs hand back a tuple; the class scores come first.
    if isinstance(scores, tuple):
        scores = scores[0]

    labels = pred.label_ids
    preds = scores.argmax(-1)

    # zero_division=0 on all three metrics: a class that is never predicted
    # contributes 0 instead of triggering an undefined-metric warning.
    weighted = {'average': 'weighted', 'zero_division': 0}

    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, **weighted),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
    }

# ============================================================================
# 7. TRAINING FUNCTION
# ============================================================================
def train_and_evaluate(model_name, config):
    """Fine-tune one checkpoint, then report validation and test performance.

    Uses the module-level splits (``X_train``/``y_train``, ``X_val``/``y_val``,
    ``X_test``/``y_test``) together with ``num_labels`` and ``label_encoder``.

    Side effects: prints progress and metrics, writes checkpoints and logs under
    ``./results/<name>``, and saves ``<name>_confusion_matrix.png`` and
    ``<name>_predictions.xlsx`` in the working directory, where ``<name>`` is
    ``model_name`` with ``/`` replaced by ``_``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        config: Mapping with keys ``'max_length'``, ``'num_epochs'``,
            ``'batch_size'``, ``'learning_rate'``, ``'weight_decay'`` and
            ``'warmup_ratio'``.

    Returns:
        dict of validation/test metrics plus training time in minutes, or
        ``None`` when the tokenizer or model cannot be loaded.
    """
    print(f"\n{'='*80}")
    print(f"🏋️ Training: {model_name}")
    print(f"{'='*80}")

    def _print_metrics(metrics):
        # Shared console layout for the validation and test summaries.
        print(f"   Accuracy: {metrics['eval_accuracy']*100:.2f}%")
        print(f"   F1 Score: {metrics['eval_f1']:.4f}")
        print(f"   Precision: {metrics['eval_precision']:.4f}")
        print(f"   Recall: {metrics['eval_recall']:.4f}")

    # Load tokenizer and model; a checkpoint that fails to load is reported and
    # skipped so the remaining models can still be trained.
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            ignore_mismatched_sizes=True
        )
    except Exception as e:
        print(f"❌ Error loading {model_name}: {str(e)}")
        return None

    # Create datasets
    train_dataset = TextDataset(X_train, y_train, tokenizer, config['max_length'])
    val_dataset = TextDataset(X_val, y_val, tokenizer, config['max_length'])
    test_dataset = TextDataset(X_test, y_test, tokenizer, config['max_length'])

    # Output directory
    safe_name = model_name.replace('/', '_')
    output_dir = f"./results/{safe_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=config['num_epochs'],
        per_device_train_batch_size=config['batch_size'],
        per_device_eval_batch_size=config['batch_size'],
        learning_rate=config['learning_rate'],
        weight_decay=config['weight_decay'],
        warmup_ratio=config['warmup_ratio'],
        logging_dir=f'{output_dir}/logs',
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        seed=42,
        fp16=torch.cuda.is_available(),
        report_to="none",
        save_total_limit=1,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Train
    print("\n⏳ Training started...")
    start_time = time.time()
    trainer.train()
    train_time = time.time() - start_time

    # Validation metrics
    val_metrics = trainer.evaluate()
    print(f"\n✅ Training completed in {train_time/60:.2f} minutes")
    print(f"\n📊 Validation Metrics:")
    _print_metrics(val_metrics)

    # Test set evaluation: a single predict() pass yields both the metrics and
    # the raw scores, so the test set is not run through the model twice.
    # metric_key_prefix="eval" keeps the 'eval_*' key names used below.
    print(f"\n🎯 Evaluating on Test Set...")
    test_output = trainer.predict(test_dataset, metric_key_prefix="eval")
    test_metrics = test_output.metrics
    print(f"\n📊 Test Set Performance:")
    _print_metrics(test_metrics)

    # Predictions (the scores may arrive as the first element of a tuple)
    test_scores = test_output.predictions
    if isinstance(test_scores, tuple):
        test_scores = test_scores[0]
    y_pred = test_scores.argmax(-1)

    # Pin the label set so rows/columns always line up with
    # label_encoder.classes_, even if a class is absent from y_test and y_pred.
    class_ids = list(range(num_labels))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_,
                yticklabels=label_encoder.classes_)
    plt.title(f'Confusion Matrix - {model_name.split("/")[-1]}\n'
              f'Test Accuracy: {test_metrics["eval_accuracy"]*100:.2f}%',
              fontsize=12, fontweight='bold')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    cm_file = f'{safe_name}_confusion_matrix.png'
    plt.savefig(cm_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"✅ Saved: {cm_file}")

    # Classification Report
    print(f"\n📋 Classification Report:")
    print(classification_report(y_test, y_pred,
                                labels=class_ids,
                                target_names=label_encoder.classes_,
                                digits=4,
                                zero_division=0))

    # Save predictions
    pred_file = f'{safe_name}_predictions.xlsx'
    test_pred_df = pd.DataFrame({
        'sentence': X_test,
        'true_label': label_encoder.inverse_transform(y_test),
        'predicted_label': label_encoder.inverse_transform(y_pred),
        'correct': y_test == y_pred
    })
    test_pred_df.to_excel(pred_file, index=False)
    print(f"✅ Saved: {pred_file}")

    return {
        'model_name': model_name,
        'val_accuracy': val_metrics['eval_accuracy'],
        'val_f1': val_metrics['eval_f1'],
        'val_precision': val_metrics['eval_precision'],
        'val_recall': val_metrics['eval_recall'],
        'test_accuracy': test_metrics['eval_accuracy'],
        'test_f1': test_metrics['eval_f1'],
        'test_precision': test_metrics['eval_precision'],
        'test_recall': test_metrics['eval_recall'],
        'train_time_minutes': train_time / 60
    }

# ============================================================================
# 8. TRAIN ALL MODELS
# ============================================================================
print("\n" + "=" * 80, "🚀 STARTING TRAINING PIPELINE", "=" * 80, sep="\n")

# One result dict per model that trained successfully.
all_results = []

for model_name in models_to_train:
    try:
        outcome = train_and_evaluate(model_name, config)
    except Exception as e:
        # Report the failure and move on to the next model.
        print(f"\n❌ Failed to train {model_name}: {str(e)}")
    else:
        # A falsy outcome means the model could not be loaded; leave it out.
        if outcome:
            all_results.append(outcome)

# ============================================================================
# 9. RESULTS SUMMARY
# ============================================================================
print("\n" + "=" * 80)
print("📊 FINAL RESULTS")
print("=" * 80)

if all_results:
    # Best test F1 first. The index is reset so that row positions follow the
    # sorted order rather than the order in which the models were trained.
    results_df = (
        pd.DataFrame(all_results)
        .sort_values('test_f1', ascending=False)
        .reset_index(drop=True)
    )
    results_df.to_excel('encoder_models_results.xlsx', index=False)
    print("\n✅ Saved: encoder_models_results.xlsx")

    print("\n🏆 Model Rankings (by Test F1):")
    print("-" * 80)
    # Number the rows by their position in the ranking, not by DataFrame index.
    for rank, (_, row) in enumerate(results_df.iterrows(), start=1):
        print(f"\n{rank}. {row['model_name'].split('/')[-1]}")
        print(f"   Test Acc: {row['test_accuracy']*100:.2f}% | F1: {row['test_f1']:.4f}")
        print(f"   Val Acc: {row['val_accuracy']*100:.2f}% | F1: {row['val_f1']:.4f}")
        print(f"   Training Time: {row['train_time_minutes']:.2f} min")

    # Comparison plot: validation vs. test F1, one bar pair per model
    plt.figure(figsize=(14, 6))

    x = np.arange(len(results_df))
    width = 0.35

    plt.bar(x - width/2, results_df['val_f1'], width,
            label='Validation F1', alpha=0.8, color='skyblue')
    plt.bar(x + width/2, results_df['test_f1'], width,
            label='Test F1', alpha=0.8, color='coral')

    plt.xlabel('Model', fontsize=12, fontweight='bold')
    plt.ylabel('F1 Score', fontsize=12, fontweight='bold')
    plt.title('Encoder-Only Models Comparison', fontsize=14, fontweight='bold')
    plt.xticks(x, [name.split('/')[-1] for name in results_df['model_name']],
               rotation=45, ha='right')
    plt.axhline(y=0.60, color='red', linestyle='--', linewidth=1,
                alpha=0.5, label='60% Baseline')
    # Build the legend only after every labelled artist exists, so the
    # baseline line gets an entry too.
    plt.legend()
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig('models_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    print("\n✅ Saved: models_comparison.png")

    # Best model = first row of the sorted table
    best_model = results_df.iloc[0]
    print(f"\n{'='*80}")
    print(f"🥇 BEST MODEL: {best_model['model_name']}")
    print(f"{'='*80}")
    print(f"   Test Accuracy: {best_model['test_accuracy']*100:.2f}%")
    print(f"   Test F1: {best_model['test_f1']:.4f}")
    print(f"   Training Time: {best_model['train_time_minutes']:.2f} minutes")

else:
    print("\n❌ No models were successfully trained!")

print("\n" + "=" * 80)
print("✅ PIPELINE COMPLETED!")
print("=" * 80)

print("\n📁 Generated Files:")
print("   • encoder_models_results.xlsx - All results")
print("   • models_comparison.png - Visual comparison")
print("   • [model]_confusion_matrix.png - Per-model confusion matrices")
print("   • [model]_predictions.xlsx - Per-model predictions")

print("\n💡 Tips to Improve Performance:")
print("   1. Increase epochs (try 8-10)")
print("   2. Try learning rate 3e-5 or 5e-5")
print("   3. Use class weights if imbalanced")
print("   4. Add data augmentation (back-translation, paraphrasing)")
print("   5. Try ensemble of top 3 models")

print("\n" + "=" * 80)

  from .autonotebook import tqdm as notebook_tqdm


🤖 ENCODER-ONLY MODELS FOR TEXT CLASSIFICATION

💻 Using device: cuda
   GPU: NVIDIA GeForce RTX 3060 Laptop GPU

📂 Loading data...
✅ Total samples: 494
✅ Class distribution: Counter({'positive': 254, 'negative': 133, 'neutral': 107})
✅ Number of classes: 3
✅ Classes: ['negative' 'neutral' 'positive']

📊 Train-Validation-Test Split (70-10-20)
✅ Training: 345 (69.8%)
✅ Validation: 50 (10.1%)
✅ Test: 99 (20.0%)

🤖 ENCODER-ONLY MODELS TO EVALUATE

📋 Selected Models:
   1. bert-base-multilingual-cased
   2. xlm-roberta-base
   3. google/muril-base-cased
   4. ai4bharat/indic-bert
   5. distilbert-base-multilingual-cased

⚙️ HYPERPARAMETERS

📊 Configuration:
   learning_rate: 2e-05
   batch_size: 16
   num_epochs: 5
   warmup_ratio: 0.1
   weight_decay: 0.01
   max_length: 128

🚀 STARTING TRAINING PIPELINE

🏋️ Training: bert-base-multilingual-cased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



⏳ Training started...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.007129,0.52,0.355789,0.2704,0.52
2,No log,0.911592,0.6,0.497908,0.553913,0.6
3,0.976000,0.796506,0.6,0.570471,0.565758,0.6
4,0.976000,0.762303,0.68,0.66613,0.671037,0.68
5,0.652200,0.740013,0.7,0.686509,0.719826,0.7



✅ Training completed in 1.02 minutes

📊 Validation Metrics:
   Accuracy: 70.00%
   F1 Score: 0.6865
   Precision: 0.7198
   Recall: 0.7000

🎯 Evaluating on Test Set...

📊 Test Set Performance:
   Accuracy: 69.70%
   F1 Score: 0.6976
   Precision: 0.7001
   Recall: 0.6970
✅ Saved: bert-base-multilingual-cased_confusion_matrix.png

📋 Classification Report:
              precision    recall  f1-score   support

    negative     0.7083    0.6296    0.6667        27
     neutral     0.5217    0.5714    0.5455        21
    positive     0.7692    0.7843    0.7767        51

    accuracy                         0.6970        99
   macro avg     0.6664    0.6618    0.6629        99
weighted avg     0.7001    0.6970    0.6976        99

✅ Saved: bert-base-multilingual-cased_predictions.xlsx

🏋️ Training: xlm-roberta-base


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



⏳ Training started...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.017393,0.52,0.355789,0.2704,0.52
2,No log,0.982314,0.52,0.355789,0.2704,0.52
3,1.046200,0.987358,0.52,0.355789,0.2704,0.52



✅ Training completed in 0.85 minutes

📊 Validation Metrics:
   Accuracy: 52.00%
   F1 Score: 0.3558
   Precision: 0.2704
   Recall: 0.5200

🎯 Evaluating on Test Set...

📊 Test Set Performance:
   Accuracy: 51.52%
   F1 Score: 0.3503
   Precision: 0.2654
   Recall: 0.5152
✅ Saved: xlm-roberta-base_confusion_matrix.png

📋 Classification Report:
              precision    recall  f1-score   support

    negative     0.0000    0.0000    0.0000        27
     neutral     0.0000    0.0000    0.0000        21
    positive     0.5152    1.0000    0.6800        51

    accuracy                         0.5152        99
   macro avg     0.1717    0.3333    0.2267        99
weighted avg     0.2654    0.5152    0.3503        99

✅ Saved: xlm-roberta-base_predictions.xlsx

🏋️ Training: google/muril-base-cased


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



⏳ Training started...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.08916,0.52,0.355789,0.2704,0.52
2,No log,1.068779,0.52,0.355789,0.2704,0.52
3,1.086600,1.059355,0.52,0.355789,0.2704,0.52



✅ Training completed in 2.78 minutes

📊 Validation Metrics:
   Accuracy: 52.00%
   F1 Score: 0.3558
   Precision: 0.2704
   Recall: 0.5200

🎯 Evaluating on Test Set...

📊 Test Set Performance:
   Accuracy: 51.52%
   F1 Score: 0.3503
   Precision: 0.2654
   Recall: 0.5152
✅ Saved: google_muril-base-cased_confusion_matrix.png

📋 Classification Report:
              precision    recall  f1-score   support

    negative     0.0000    0.0000    0.0000        27
     neutral     0.0000    0.0000    0.0000        21
    positive     0.5152    1.0000    0.6800        51

    accuracy                         0.5152        99
   macro avg     0.1717    0.3333    0.2267        99
weighted avg     0.2654    0.5152    0.3503        99

✅ Saved: google_muril-base-cased_predictions.xlsx

🏋️ Training: ai4bharat/indic-bert
❌ Error loading ai4bharat/indic-bert: Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tok

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



⏳ Training started...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.022412,0.52,0.355789,0.2704,0.52
2,No log,0.964058,0.52,0.355789,0.2704,0.52
3,1.013100,0.893005,0.62,0.542485,0.677273,0.62
4,1.013100,0.84272,0.66,0.635138,0.642996,0.66
5,0.764500,0.822971,0.68,0.65045,0.696571,0.68



✅ Training completed in 1.10 minutes

📊 Validation Metrics:
   Accuracy: 68.00%
   F1 Score: 0.6505
   Precision: 0.6966
   Recall: 0.6800

🎯 Evaluating on Test Set...

📊 Test Set Performance:
   Accuracy: 67.68%
   F1 Score: 0.6606
   Precision: 0.6761
   Recall: 0.6768
✅ Saved: distilbert-base-multilingual-cased_confusion_matrix.png

📋 Classification Report:
              precision    recall  f1-score   support

    negative     0.6818    0.5556    0.6122        27
     neutral     0.6667    0.3810    0.4848        21
    positive     0.6769    0.8627    0.7586        51

    accuracy                         0.6768        99
   macro avg     0.6751    0.5998    0.6186        99
weighted avg     0.6761    0.6768    0.6606        99

✅ Saved: distilbert-base-multilingual-cased_predictions.xlsx

📊 FINAL RESULTS

✅ Saved: encoder_models_results.xlsx

🏆 Model Rankings (by Test F1):
--------------------------------------------------------------------------------

1. bert-base-multilingual

Try 1

#

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, 
    recall_score, classification_report, confusion_matrix
)
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import warnings
import time
import os
# Silence library warnings so the console report below stays readable.
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch (CPU, plus every CUDA device when one is present).
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

print("=" * 80, "🤖 TRANSFORMER MODELS WITH HYPERPARAMETER TUNING", "=" * 80, sep="\n")

# Pick the compute device and report it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"\n💻 Using device: {device}")
if device == "cuda":
    print(f"   GPU: {torch.cuda.get_device_name(0)}")

# ============================================================================
# 1. LOAD DATA
# ============================================================================
print("\n📂 Loading data...")
df = pd.read_excel("labels_sentences_FINAL_ROUND2.xlsx")
X, y = df['sentences_clean'].values, df['labels'].values

print(f"✅ Total samples: {len(df)}")
print(f"✅ Class distribution: {Counter(y)}")

# Map the raw label values to integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_labels = len(label_encoder.classes_)
print(f"✅ Number of classes: {num_labels}")
print(f"✅ Classes: {label_encoder.classes_}")

# ============================================================================
# 2. TRAIN-TEST SPLIT
# ============================================================================
print("\n" + "=" * 80, "📊 Train-Validation-Test Split (70-10-20)", "=" * 80, sep="\n")

# Step 1: keep 80% for train+val and hold out 20% as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y_encoded, stratify=y_encoded, test_size=0.20, random_state=42
)

# Step 2: 12.5% of that 80% becomes validation (10% overall), the rest is train (70%).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, stratify=y_temp, test_size=0.125, random_state=42
)

for _lead, _split_name, _split in (
    ("\n", "Training", X_train),
    ("", "Validation", X_val),
    ("", "Test", X_test),
):
    print(f"{_lead}✅ {_split_name} set: {len(_split)} samples ({len(_split)/len(df)*100:.1f}%)")

# ============================================================================
# 3. DEFINE DATASET CLASS
# ============================================================================
class TextDataset(Dataset):
    """Pairs texts with labels and tokenizes lazily, one item per access.

    Args:
        texts: Indexable collection of texts (each is converted with ``str()``).
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable used to encode a single text; its result must
            provide ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode item ``idx`` and return its ids, attention mask and label."""
        sentence, target = str(self.texts[idx]), self.labels[idx]

        tokenizer_kwargs = {
            'add_special_tokens': True,
            'max_length': self.max_length,
            'padding': 'max_length',
            'truncation': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer(sentence, **tokenizer_kwargs)

        # Collapse the tokenizer output to 1-D tensors for this single sample.
        input_ids = encoded['input_ids'].reshape(-1)
        attention_mask = encoded['attention_mask'].reshape(-1)
        label_tensor = torch.tensor(target, dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label_tensor,
        }

# ============================================================================
# 4. DEFINE MODELS TO TRAIN
# ============================================================================
print("\n" + "=" * 80, "🤖 MODELS TO TRAIN", "=" * 80, sep="\n")

# Display metadata per checkpoint id; only used for the printed overview below.
# NOTE(review): the parameter counts are hand-entered labels, not computed from
# the checkpoints — verify them (the IndicBERT figure in particular).
models_config = {
    # ENCODER-ONLY MODELS (Best for Classification)
    "bert-base-multilingual-cased": dict(
        type="Encoder-Only",
        description="Multilingual BERT - 104 languages",
        params="179M",
    ),
    "xlm-roberta-base": dict(
        type="Encoder-Only",
        description="XLM-RoBERTa - Strong multilingual model",
        params="279M",
    ),
    "google/muril-base-cased": dict(
        type="Encoder-Only",
        description="MURIL - Indian languages specialized",
        params="237M",
    ),
    "ai4bharat/indic-bert": dict(
        type="Encoder-Only",
        description="IndicBERT - 12 Indian languages",
        params="180M",
    ),

    # ENCODER-DECODER MODEL (Good for seq2seq, can be used for classification)
    "google/mt5-small": dict(
        type="Encoder-Decoder",
        description="Multilingual T5 - Seq2seq architecture",
        params="300M",
    ),
}

print(
    "\nℹ️  Note: Decoder-only models (like GPT) are not ideal for classification tasks.",
    "   They're designed for generation. We'll focus on encoder and encoder-decoder models.\n",
    sep="\n",
)

for model_name, info in models_config.items():
    print(
        f"📌 {model_name}\n"
        f"   Type: {info['type']}\n"
        f"   Description: {info['description']}\n"
        f"   Parameters: {info['params']}\n"
    )

# ============================================================================
# 5. HYPERPARAMETER GRID
# ============================================================================
print("=" * 80, "🔧 HYPERPARAMETER SEARCH SPACE", "=" * 80, sep="\n")

# Each entry is one complete training configuration to try per model.
hyperparam_configs = [
    dict(learning_rate=2e-5, batch_size=16, num_epochs=3,
         warmup_ratio=0.1, weight_decay=0.01),
    dict(learning_rate=3e-5, batch_size=16, num_epochs=4,
         warmup_ratio=0.1, weight_decay=0.01),
    dict(learning_rate=5e-5, batch_size=8, num_epochs=3,
         warmup_ratio=0.0, weight_decay=0.01),
]

print("\n📋 Hyperparameter Configurations to Test:")
for i, config in enumerate(hyperparam_configs, start=1):
    print("\n   Config {}:".format(i))
    for param, value in config.items():
        print("      {}: {}".format(param, value))

print(f"\n🔢 Total configurations per model: {len(hyperparam_configs)}")

# ============================================================================
# 6. COMPUTE METRICS FUNCTION
# ============================================================================
def compute_metrics(pred):
    """Compute accuracy and weighted F1/precision/recall for a Trainer evaluation.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (class scores with the class dimension last, or a tuple whose first
            element holds those scores).

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    scores = pred.predictions
    # Models that emit extra outputs (presumably the encoder-decoder entry in
    # models_config) hand back a tuple; the class scores come first.
    if isinstance(scores, tuple):
        scores = scores[0]

    labels = pred.label_ids
    preds = scores.argmax(-1)

    # zero_division=0 on all three metrics: a class that is never predicted
    # contributes 0 instead of triggering an undefined-metric warning.
    weighted = {'average': 'weighted', 'zero_division': 0}

    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, **weighted),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
    }

# ============================================================================
# 7. TRAINING FUNCTION
# ============================================================================
def train_model(model_name, config, train_dataset, val_dataset, output_dir):
    """Fine-tune one pretrained checkpoint with one hyperparameter configuration.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the sequence-classification model.
        config: Mapping with ``learning_rate``, ``batch_size``, ``num_epochs``,
            ``warmup_ratio`` and ``weight_decay``.
        train_dataset: Dataset used for training.
        val_dataset: Dataset evaluated after every epoch and once more at the end.
        output_dir: Directory for checkpoints; logs go to ``<output_dir>/logs``.

    Returns:
        dict: ``trainer`` (the fitted ``Trainer``), ``tokenizer``,
        ``val_metrics`` (result of the final ``trainer.evaluate()``) and
        ``train_time`` in seconds.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"🏋️ Training: {model_name}")
    print(f"   Config: LR={config['learning_rate']}, BS={config['batch_size']}, Epochs={config['num_epochs']}")
    print(rule)

    # Tokenizer is loaded only so it can be returned alongside the trainer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # The classification head is sized from the module-level ``num_labels``.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        ignore_mismatched_sizes=True,
    )

    per_device_batch = config['batch_size']
    arg_values = dict(
        output_dir=output_dir,
        num_train_epochs=config['num_epochs'],
        per_device_train_batch_size=per_device_batch,
        per_device_eval_batch_size=per_device_batch,
        learning_rate=config['learning_rate'],
        weight_decay=config['weight_decay'],
        warmup_ratio=config['warmup_ratio'],
        logging_dir=f'{output_dir}/logs',
        logging_steps=50,
        # Evaluate and checkpoint once per epoch so the best epoch (by F1)
        # can be restored when training ends.
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        seed=42,
        # Mixed precision only when a CUDA device is present.
        fp16=torch.cuda.is_available(),
        report_to="none",
        save_total_limit=1,
    )
    training_args = TrainingArguments(**arg_values)

    # Stop early after two consecutive evaluations without improvement.
    stopper = EarlyStoppingCallback(early_stopping_patience=2)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[stopper],
    )

    # Wall-clock time of the fit only; the final evaluation is not included.
    began = time.time()
    trainer.train()
    train_time = time.time() - began

    val_metrics = trainer.evaluate()

    print(f"\n✅ Training completed in {train_time/60:.2f} minutes")
    print("📊 Validation Metrics:")
    print(f"   Accuracy: {val_metrics['eval_accuracy']*100:.2f}%")
    for caption, metric_key in (
        ("F1 Score", "eval_f1"),
        ("Precision", "eval_precision"),
        ("Recall", "eval_recall"),
    ):
        print(f"   {caption}: {val_metrics[metric_key]:.4f}")

    return dict(
        trainer=trainer,
        tokenizer=tokenizer,
        val_metrics=val_metrics,
        train_time=train_time,
    )

# ============================================================================
# 8. MAIN TRAINING LOOP
# ============================================================================
print("\n" + "=" * 80)
print("🚀 STARTING TRAINING PIPELINE")
print("=" * 80)

# One dict per successfully trained (model, config) pair.
all_results = []

# Select top 3 models for faster training (you can train all if needed)
selected_models = [
    "bert-base-multilingual-cased",
    "xlm-roberta-base",
    "google/muril-base-cased"
]

print(f"\n📌 Training {len(selected_models)} models x {len(hyperparam_configs)} configs = {len(selected_models) * len(hyperparam_configs)} total runs")
print("\nℹ️  To train all models including IndicBERT and mT5, modify the 'selected_models' list\n")

# Encoded class ids in the same order as label_encoder.classes_. Passing them
# explicitly keeps matrix rows/columns and report rows aligned with the class
# names even if some class is missing from the test labels and predictions.
class_ids = np.arange(num_labels)

for model_name in selected_models:
    model_info = models_config[model_name]
    # Model ids may contain '/', which cannot appear in a file name.
    safe_name = model_name.replace('/', '_')
    print(f"\n{'#'*80}")
    print(f"MODEL: {model_name} ({model_info['type']})")
    print(f"{'#'*80}")

    # Tokenizer for this model
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # The datasets depend only on the tokenizer, not on the hyperparameters,
    # so build them once per model rather than once per config.
    train_dataset = TextDataset(X_train, y_train, tokenizer)
    val_dataset = TextDataset(X_val, y_val, tokenizer)

    # Best run so far for this model. ``best_row`` is the entry of
    # ``all_results`` that belongs to it, so test metrics can be attached to
    # the right row afterwards.
    best_f1 = None
    best_config = None
    best_result = None
    best_row = None

    for config_idx, config in enumerate(hyperparam_configs, 1):
        print(f"\n--- Config {config_idx}/{len(hyperparam_configs)} ---")

        # Output directory
        output_dir = f"./results/{safe_name}_config{config_idx}"

        # Deliberately broad: one failing run (OOM, download error, ...) is
        # reported and skipped so the remaining runs still execute.
        try:
            # Train model
            result = train_model(model_name, config, train_dataset, val_dataset, output_dir)
            val_metrics = result['val_metrics']

            # Save result
            row = {
                'model_name': model_name,
                'model_type': model_info['type'],
                'config_num': config_idx,
                'learning_rate': config['learning_rate'],
                'batch_size': config['batch_size'],
                'num_epochs': config['num_epochs'],
                'warmup_ratio': config['warmup_ratio'],
                'weight_decay': config['weight_decay'],
                'val_accuracy': val_metrics['eval_accuracy'],
                'val_f1': val_metrics['eval_f1'],
                'val_precision': val_metrics['eval_precision'],
                'val_recall': val_metrics['eval_recall'],
                'train_time_minutes': result['train_time'] / 60
            }
            all_results.append(row)

            # Track best config for this model. The first successful run always
            # qualifies, even with an F1 of 0; ties keep the earlier config.
            if best_result is None or val_metrics['eval_f1'] > best_f1:
                best_f1 = val_metrics['eval_f1']
                best_config = config
                best_result = result
                best_row = row

        except Exception as e:
            print(f"❌ Error training {model_name} with config {config_idx}: {str(e)}")

    # Evaluate best model on test set
    if best_result is not None:
        print(f"\n{'='*80}")
        print(f"🏆 BEST CONFIG FOR {model_name}")
        print(f"{'='*80}")
        print(f"\n📊 Best Validation F1: {best_f1:.4f}")
        print(f"\n⚙️ Best Hyperparameters:")
        for param, value in best_config.items():
            print(f"   {param}: {value}")

        # Test set evaluation. A single predict() pass yields both the
        # predictions and the metrics; the "eval" prefix keeps the metric keys
        # identical to those produced by evaluate().
        print(f"\n🎯 Evaluating on Test Set...")
        test_dataset = TextDataset(X_test, y_test, best_result['tokenizer'])
        predictions = best_result['trainer'].predict(test_dataset, metric_key_prefix="eval")
        test_metrics = predictions.metrics
        y_pred = predictions.predictions.argmax(-1)

        print(f"\n📊 Test Set Performance:")
        print(f"   Accuracy: {test_metrics['eval_accuracy']*100:.2f}%")
        print(f"   F1 Score: {test_metrics['eval_f1']:.4f}")
        print(f"   Precision: {test_metrics['eval_precision']:.4f}")
        print(f"   Recall: {test_metrics['eval_recall']:.4f}")

        # Confusion Matrix
        cm = confusion_matrix(y_test, y_pred, labels=class_ids)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=label_encoder.classes_,
                    yticklabels=label_encoder.classes_)
        plt.title(f'Confusion Matrix - {model_name}\nTest Accuracy: {test_metrics["eval_accuracy"]*100:.2f}%',
                  fontsize=12, fontweight='bold')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        cm_path = f'{safe_name}_confusion_matrix.png'
        plt.savefig(cm_path, dpi=300)
        plt.close()
        print(f"\n✅ Saved: {cm_path}")

        # Classification Report
        print(f"\n📋 Detailed Classification Report:")
        print(classification_report(y_test, y_pred,
                                    labels=class_ids,
                                    target_names=label_encoder.classes_,
                                    digits=4))

        # Save predictions
        test_pred_df = pd.DataFrame({
            'sentence': X_test,
            'true_label': label_encoder.inverse_transform(y_test),
            'predicted_label': label_encoder.inverse_transform(y_pred),
            'correct': y_test == y_pred
        })
        pred_path = f'{safe_name}_test_predictions.xlsx'
        test_pred_df.to_excel(pred_path, index=False)
        print(f"✅ Saved: {pred_path}")

        # Attach test metrics to the row of the config that was actually
        # evaluated. The most recently appended row may belong to a different,
        # worse config.
        best_row.update({
            'test_accuracy': test_metrics['eval_accuracy'],
            'test_f1': test_metrics['eval_f1'],
            'test_precision': test_metrics['eval_precision'],
            'test_recall': test_metrics['eval_recall']
        })

# ============================================================================
# 9. SAVE ALL RESULTS
# ============================================================================
print("\n" + "=" * 80)
print("💾 SAVING COMPREHENSIVE RESULTS")
print("=" * 80)

results_df = pd.DataFrame(all_results)
# If every training run failed the frame has no columns, and sorting on
# 'val_f1' would raise KeyError; in that case an empty sheet is written.
if not results_df.empty:
    # Best validation F1 first. The index is renumbered so it reflects rank
    # order rather than the order in which the runs were appended.
    results_df = results_df.sort_values('val_f1', ascending=False).reset_index(drop=True)
results_df.to_excel('all_models_results.xlsx', index=False)
print("\n✅ Saved: all_models_results.xlsx")

# ============================================================================
# 10. FINAL SUMMARY & COMPARISON
# ============================================================================
print("\n" + "=" * 80)
print("🏆 FINAL RESULTS SUMMARY")
print("=" * 80)

print("\n📊 Top 5 Model Configurations by Validation F1:")
print("-" * 80)
# Rank comes from the row's position in the sorted frame. The index label
# returned by iterrows() is not guaranteed to be the rank, so it is ignored.
for rank, (_, row) in enumerate(results_df.head(5).iterrows(), start=1):
    print(f"\nRank {rank}: {row['model_name']}")
    print(f"   Type: {row['model_type']}")
    print(f"   Val F1: {row['val_f1']:.4f} | Val Acc: {row['val_accuracy']*100:.2f}%")
    # Test metrics exist only on rows that were evaluated on the test set.
    if 'test_f1' in row and pd.notna(row['test_f1']):
        print(f"   Test F1: {row['test_f1']:.4f} | Test Acc: {row['test_accuracy']*100:.2f}%")
    print(f"   LR: {row['learning_rate']}, BS: {row['batch_size']}, Epochs: {row['num_epochs']}")

# Visualization: Compare Models
if not results_df.empty:
    # Best result per model. results_df is sorted by validation F1 (descending),
    # so the first row of each group is that model's best config. Note that
    # first() takes the first non-null value per column, so a test metric is
    # picked up from whichever row of the group carries one.
    best_per_model = results_df.groupby('model_name').first().reset_index()

    plt.figure(figsize=(12, 6))
    x = np.arange(len(best_per_model))
    width = 0.35

    plt.bar(x - width/2, best_per_model['val_f1'], width, label='Validation F1', alpha=0.8)
    if 'test_f1' in best_per_model.columns:
        plt.bar(x + width/2, best_per_model['test_f1'], width, label='Test F1', alpha=0.8)

    # Draw the target line before building the legend; artists added after
    # plt.legend() are not included in it.
    plt.axhline(y=0.85, color='red', linestyle='--', linewidth=2, alpha=0.5, label='85% Target')

    plt.xlabel('Model', fontsize=12)
    plt.ylabel('F1 Score', fontsize=12)
    plt.title('Model Comparison - Best Configuration per Model', fontsize=14, fontweight='bold')
    # Drop the organisation prefix (e.g. "google/") to keep tick labels short.
    plt.xticks(x, [name.split('/')[-1] for name in best_per_model['model_name']], rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.savefig('model_comparison.png', dpi=300)
    plt.close()
    print("\n✅ Saved: model_comparison.png")

# Closing banner plus a static recap of the artefacts written by this run and
# the suggested follow-up actions.
closing_rule = "=" * 80

print("\n" + closing_rule)
print("✅ TRANSFORMER TRAINING PIPELINE COMPLETED!")
print(closing_rule)

generated_files = (
    "   • all_models_results.xlsx - Complete results for all models/configs",
    "   • model_comparison.png - Visual comparison of models",
    "   • [model_name]_confusion_matrix.png - Per-model confusion matrices",
    "   • [model_name]_test_predictions.xlsx - Per-model test predictions",
)
print("\n📁 Generated Files:")
print("\n".join(generated_files))

next_steps = (
    "   1. Review all_models_results.xlsx to identify best model",
    "   2. Analyze confusion matrices for error patterns",
    "   3. Fine-tune best model further if needed",
    "   4. Deploy best model for production",
)
print("\n🎯 Next Steps:")
print("\n".join(next_steps))

print("\n" + closing_rule)

🤖 TRANSFORMER MODELS WITH HYPERPARAMETER TUNING

💻 Using device: cpu

📂 Loading data...
✅ Total samples: 494
✅ Class distribution: Counter({'positive': 254, 'negative': 133, 'neutral': 107})
✅ Number of classes: 3
✅ Classes: ['negative' 'neutral' 'positive']

📊 Train-Validation-Test Split (70-10-20)

✅ Training set: 345 samples (69.8%)
✅ Validation set: 50 samples (10.1%)
✅ Test set: 99 samples (20.0%)

🤖 MODELS TO TRAIN

ℹ️  Note: Decoder-only models (like GPT) are not ideal for classification tasks.
   They're designed for generation. We'll focus on encoder and encoder-decoder models.

📌 bert-base-multilingual-cased
   Type: Encoder-Only
   Description: Multilingual BERT - 104 languages
   Parameters: 179M

📌 xlm-roberta-base
   Type: Encoder-Only
   Description: XLM-RoBERTa - Strong multilingual model
   Parameters: 279M

📌 google/muril-base-cased
   Type: Encoder-Only
   Description: MURIL - Indian languages specialized
   Parameters: 237M

📌 ai4bharat/indic-bert
   Type: Encoder-O

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.018467,0.52,0.355789,0.2704,0.52
2,No log,1.009732,0.52,0.355789,0.2704,0.52
3,1.031500,0.990553,0.52,0.355789,0.2704,0.52



✅ Training completed in 6.72 minutes
📊 Validation Metrics:
   Accuracy: 52.00%
   F1 Score: 0.3558
   Precision: 0.2704
   Recall: 0.5200

--- Config 2/3 ---

🏋️ Training: bert-base-multilingual-cased
   Config: LR=3e-05, BS=16, Epochs=4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.000587,0.52,0.355789,0.2704,0.52
2,No log,0.841454,0.58,0.490868,0.459643,0.58
3,0.958400,0.822307,0.62,0.613904,0.627427,0.62
4,0.958400,0.812436,0.66,0.661429,0.683743,0.66



✅ Training completed in 10.12 minutes
📊 Validation Metrics:
   Accuracy: 66.00%
   F1 Score: 0.6614
   Precision: 0.6837
   Recall: 0.6600

--- Config 3/3 ---

🏋️ Training: bert-base-multilingual-cased
   Config: LR=5e-05, BS=8, Epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.057137,0.54,0.490217,0.468,0.54
2,1.024000,0.941716,0.58,0.467512,0.456444,0.58
3,0.840700,0.875461,0.58,0.581317,0.590265,0.58



✅ Training completed in 6.56 minutes
📊 Validation Metrics:
   Accuracy: 58.00%
   F1 Score: 0.5813
   Precision: 0.5903
   Recall: 0.5800

🏆 BEST CONFIG FOR bert-base-multilingual-cased

📊 Best Validation F1: 0.6614

⚙️ Best Hyperparameters:
   learning_rate: 3e-05
   batch_size: 16
   num_epochs: 4
   warmup_ratio: 0.1
   weight_decay: 0.01

🎯 Evaluating on Test Set...

📊 Test Set Performance:
   Accuracy: 66.67%
   F1 Score: 0.6722
   Precision: 0.6956
   Recall: 0.6667

✅ Saved: bert-base-multilingual-cased_confusion_matrix.png

📋 Detailed Classification Report:
              precision    recall  f1-score   support

    negative     0.5312    0.6296    0.5763        27
     neutral     0.5926    0.7619    0.6667        21
    positive     0.8250    0.6471    0.7253        51

    accuracy                         0.6667        99
   macro avg     0.6496    0.6795    0.6561        99
weighted avg     0.6956    0.6667    0.6722        99

✅ Saved: bert-base-multilingual-cased_test_pre

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.014141,0.52,0.355789,0.2704,0.52
2,No log,0.978922,0.52,0.355789,0.2704,0.52
3,1.030000,0.951126,0.52,0.355789,0.2704,0.52



✅ Training completed in 7.22 minutes
📊 Validation Metrics:
   Accuracy: 52.00%
   F1 Score: 0.3558
   Precision: 0.2704
   Recall: 0.5200

--- Config 2/3 ---

🏋️ Training: xlm-roberta-base
   Config: LR=3e-05, BS=16, Epochs=4


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.111409,0.52,0.355789,0.2704,0.52
2,No log,1.016187,0.52,0.355789,0.2704,0.52
3,1.059600,1.002424,0.52,0.355789,0.2704,0.52



✅ Training completed in 7.19 minutes
📊 Validation Metrics:
   Accuracy: 52.00%
   F1 Score: 0.3558
   Precision: 0.2704
   Recall: 0.5200

--- Config 3/3 ---

🏋️ Training: xlm-roberta-base
   Config: LR=5e-05, BS=8, Epochs=3


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.084824,0.26,0.18478,0.221,0.26
2,1.066000,0.980138,0.52,0.355789,0.2704,0.52
3,1.025600,0.953281,0.52,0.355789,0.2704,0.52



✅ Training completed in 8.46 minutes
📊 Validation Metrics:
   Accuracy: 52.00%
   F1 Score: 0.3558
   Precision: 0.2704
   Recall: 0.5200

🏆 BEST CONFIG FOR xlm-roberta-base

📊 Best Validation F1: 0.3558

⚙️ Best Hyperparameters:
   learning_rate: 2e-05
   batch_size: 16
   num_epochs: 3
   warmup_ratio: 0.1
   weight_decay: 0.01

🎯 Evaluating on Test Set...

📊 Test Set Performance:
   Accuracy: 51.52%
   F1 Score: 0.3503
   Precision: 0.2654
   Recall: 0.5152

✅ Saved: xlm-roberta-base_confusion_matrix.png

📋 Detailed Classification Report:
              precision    recall  f1-score   support

    negative     0.0000    0.0000    0.0000        27
     neutral     0.0000    0.0000    0.0000        21
    positive     0.5152    1.0000    0.6800        51

    accuracy                         0.5152        99
   macro avg     0.1717    0.3333    0.2267        99
weighted avg     0.2654    0.5152    0.3503        99

✅ Saved: xlm-roberta-base_test_predictions.xlsx

#####################

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:  59%|#####9    | 566M/953M [00:00<?, ?B/s]