In [None]:
# EACL 2026 Abjad NLP: Medical Text Classification with AraBERT v2

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch. utils.data import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn. metrics import f1_score, classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import json
import os
from datetime import datetime
import random

warnings.filterwarnings('ignore')

In [None]:
# CONFIGURATION


# Seed every random number generator used by the pipeline for reproducibility
import gc

SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)


# Checkpoints that were tried; copy one into MODEL_NAME below to switch.
# MODEL_NAME = "aubmindlab/bert-base-arabertv2"
# MODEL_NAME = "aubmindlab/bert-base-arabertv02"
# MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
# MODEL_NAME = "xlm-roberta-base"
# MODEL_NAME = "UBC-NLP/MARBERTv2"
# MODEL_NAME = "aubmindlab/bert-large-arabertv02"


# Model and optimisation hyper-parameters
MODEL_NAME = "aubmindlab/bert-base-arabertv02"
NUM_LABELS = 82
MAX_LENGTH = 384
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 10
WARMUP_STEPS = 500

# Free cached GPU memory and unreachable Python objects before loading anything
torch.cuda.empty_cache()
gc.collect()


# Everything the notebook writes lives under BASE_DIR
BASE_DIR = '/kaggle/working'
OUTPUT_DIR = f'{BASE_DIR}/results'                    # Training checkpoints
MODEL_SAVE_DIR = f'{BASE_DIR}/arabert_medical_model'  # Final model
LOGS_DIR = f'{BASE_DIR}/logs'                         # Training logs

for _directory in (OUTPUT_DIR, MODEL_SAVE_DIR, LOGS_DIR):
    os.makedirs(_directory, exist_ok=True)

print("="*80)
print("EACL 2026 ABJAD NLP - ARABERT V2 TRAINING PIPELINE")

_cuda_ready = torch.cuda.is_available()
print(f"\nüñ•Ô∏è  Device Information:")
print(f"   PyTorch version: {torch.__version__}")
print(f"   CUDA available: {_cuda_ready}")
if _cuda_ready:
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"   CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"   CUDA memory: {_gpu_props.total_memory / 1e9:.2f} GB")

EACL 2026 ABJAD NLP - ARABERT V2 TRAINING PIPELINE

üñ•Ô∏è  Device Information:
   PyTorch version: 2.6.0+cu124
   CUDA available: True
   CUDA device: Tesla T4
   CUDA memory: 15.83 GB


In [None]:
# 1. LOAD DATA

print("\n" + "="*80)
print("[1/12] LOADING DATA")
print("="*80)

# Update this path to your training data
train_df = pd.read_csv('/kaggle/input/arabic/shared_task_train.csv')

print(f"\n‚úÖ Data loaded successfully!")
print(f"   Shape: {train_df.shape}")
print(f"   Columns: {train_df.columns.tolist()}")
print(f"\nFirst few rows:")
print(train_df.head())

# Quick validation. The label checks are tied to NUM_LABELS (instead of a
# hard-coded 81) so they stay correct if the label set changes, and they also
# require every class id to be present: later cells index per-class arrays
# with range(NUM_LABELS) and would misalign if an id were missing.
assert 'text' in train_df.columns, "Missing 'text' column"
assert 'label' in train_df.columns, "Missing 'label' column"
assert train_df['label'].min() == 0, "Labels should start from 0"
assert train_df['label'].max() == NUM_LABELS - 1, f"Labels should go up to {NUM_LABELS - 1}"
assert train_df['label'].nunique() == NUM_LABELS, (
    f"Expected {NUM_LABELS} distinct labels, found {train_df['label'].nunique()}"
)
print("\n‚úÖ Data validation passed!")



[1/12] LOADING DATA

‚úÖ Data loaded successfully!
   Shape: (27951, 3)
   Columns: ['text', 'category', 'label']

First few rows:
                                                text                category  \
0  ÿßŸÑÿ≥ÿ§ÿßŸÑ\n-------\nÿßŸÑÿ≥ŸÑÿßŸÖ ÿπŸÑŸäŸÉŸÖ ÿßŸÜÿß ŸÖÿµÿßÿ® ÿ®ŸÅŸÇÿ± ÿßŸÑ...  Hematological diseases   
1  ÿßŸÑÿ≥ÿ§ÿßŸÑ\n-------\nÿßŸÜÿß ÿ¥ÿßÿ® ÿπŸÜÿØŸâ 25 ÿ≥ŸÜŸá ŸàÿπŸÜÿØŸâ ÿ™ÿ®Ÿà...     Urogenital diseases   
2  ÿßŸÑÿ≥ÿ§ÿßŸÑ\n-------\nÿµÿ®ÿßÿ≠ ÿßŸÑÿÆŸäÿ± ÿπŸÜÿØŸä ÿßŸÑŸÇÿ∂Ÿäÿ® ÿ∫Ÿäÿ± ŸÜÿ¥...         Medicinal herbs   
3  ÿßŸÑÿ≥ÿ§ÿßŸÑ\n-------\nŸáŸÑ Ÿäÿ∏Ÿáÿ± ÿßŸÑÿ≠ÿ¥Ÿäÿ¥ ŸÅŸä ÿ™ÿ≠ŸÑŸäŸÑ CBC Ÿà...               Addiction   
4  ÿßŸÑÿ≥ÿ§ÿßŸÑ\n-------\nŸàÿ≤ŸÜŸä 58 ŸÉÿ∫ŸÖ Ÿàÿßÿ±ŸäÿØ ÿßŸÜ ÿßŸÅŸÇÿØ 5 ŸÉ...                 Biology   

   label  
0     33  
1     76  
2     45  
3      0  
4      7  

‚úÖ Data validation passed!


In [None]:
# 2. ANALYZE CLASS DISTRIBUTION

print("\n" + "="*80, "[2/12] ANALYZING CLASS DISTRIBUTION", "="*80, sep="\n")

# Rows per class, ordered by class id; reused by the evaluation cells.
label_counts = train_df['label'].value_counts().sort_index()

_fewest, _most = label_counts.min(), label_counts.max()

print(f"\nClass Statistics:")
print(f"   Unique classes: {train_df['label'].nunique()}")
print(f"   Mean samples/class: {label_counts.mean():.2f}")
print(f"   Median samples/class: {label_counts.median():.2f}")
print(f"   Min samples:  {_fewest}")
print(f"   Max samples: {_most}")
print(f"   Imbalance ratio: {_most / _fewest:.2f}x")


[2/12] ANALYZING CLASS DISTRIBUTION

Class Statistics:
   Unique classes: 82
   Mean samples/class: 340.87
   Median samples/class: 350.50
   Min samples:  7
   Max samples: 600
   Imbalance ratio: 85.71x


In [8]:
# Quick class distribution check
class_counts = train_df['label'].value_counts().sort_index()

print("="*80)
print("CLASS DISTRIBUTION (Ascending Order by Class ID)")
print("="*80)

# Show all classes in ascending order.
# The id uses ':2d' (right-aligned, width 2). The earlier ': 2d' was parsed as
# "space sign flag + width 2", which printed two-digit ids one column further
# right than one-digit ids.
for class_id, count in class_counts.items():
    print(f"Class {class_id:2d}: {count:4d} samples")

CLASS DISTRIBUTION (Ascending Order by Class ID)
Class  0:  600 samples
Class  1:  333 samples
Class  2:  232 samples
Class  3:   37 samples
Class  4:   34 samples
Class  5:  600 samples
Class  6:    7 samples
Class  7:   29 samples
Class  8:  345 samples
Class  9:  600 samples
Class  10:   11 samples
Class  11:  600 samples
Class  12:   13 samples
Class  13:  600 samples
Class  14:  600 samples
Class  15:  600 samples
Class  16:  600 samples
Class  17:  600 samples
Class  18:  154 samples
Class  19:  600 samples
Class  20:   40 samples
Class  21:  600 samples
Class  22:  600 samples
Class  23:   41 samples
Class  24:  600 samples
Class  25:  600 samples
Class  26:  600 samples
Class  27:  100 samples
Class  28:   26 samples
Class  29:   10 samples
Class  30:  156 samples
Class  31:  600 samples
Class  32:  600 samples
Class  33:  600 samples
Class  34:   11 samples
Class  35:  144 samples
Class  36:  600 samples
Class  37:   55 samples
Class  38:    7 samples
Class  39:  242 samples
C

In [None]:
# 3. TEXT PREPROCESSING

print("\n" + "="*80)
print("[3/12] PREPROCESSING ARABIC TEXT")
print("="*80)

def preprocess_arabic_text(text):
    """Normalise one Arabic string for the medical classifier.

    Applies, in order: diacritic removal, letter normalisation and whitespace
    collapsing (with leading/trailing whitespace stripped).

    Args:
        text: The raw text. Anything that is not a ``str`` is treated as missing.

    Returns:
        The normalised string, or ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        # Ordered (pattern, replacement) rewrites; order is kept from the
        # original implementation.
        rewrites = (
            (r'[ŸãŸåŸçŸéŸèŸêŸëŸí]', ''),   # remove diacritics
            (r'[ÿ•ÿ£ÿ¢ÿß]', 'ÿß'),        # normalize Arabic letters
            (r'Ÿâ', 'Ÿä'),
            (r'ÿ©', 'Ÿá'),
            (r'\s+', ' '),                # collapse runs of whitespace
        )
        for pattern, replacement in rewrites:
            text = re.sub(pattern, replacement, text)
        return text.strip()

    return ""

print("\nApplying preprocessing...")
# Normalised copy of every text; the raw 'text' column is left untouched.
train_df['text_clean'] = train_df['text'].map(preprocess_arabic_text)

print("‚úÖ Preprocessing complete!")

# Show the first row before and after cleaning (first 150 characters only)
_first_raw = train_df['text'].iloc[0]
_first_clean = train_df['text_clean'].iloc[0]
print(f"\nExample:")
print(f"Original: {_first_raw[:150]}")
print(f"Cleaned:   {_first_clean[:150]}")


[3/12] PREPROCESSING ARABIC TEXT

Applying preprocessing...
‚úÖ Preprocessing complete!

Example:
Original: ÿßŸÑÿ≥ÿ§ÿßŸÑ
-------
ÿßŸÑÿ≥ŸÑÿßŸÖ ÿπŸÑŸäŸÉŸÖ ÿßŸÜÿß ŸÖÿµÿßÿ® ÿ®ŸÅŸÇÿ± ÿßŸÑÿØŸÖ ÿßŸÑŸÖŸÜÿ¨ŸÑŸä (ÿßŸÑÿ≥ŸÉŸÑÿ≥ŸÑ) ÿπŸÑŸÖÿ¢ ÿ®ÿ£ŸÜ ŸÜÿ≥ÿ®ÿ© ÿßŸÑÿ≥ŸÉŸÑÿ≥ŸÑ 72 ŸÅÿπŸÜÿØŸÖÿß ÿ™ÿµÿ®ÿ≠ ŸÜÿ≥ÿ®ÿ© ÿßŸÑÿØŸÖ 7 ŸÅÿ£ŸÜ ÿßŸÑÿßŸÑÿßŸÖ ÿ™ÿ£ÿ™Ÿä ÿ®ŸÉÿ´ÿ±Ÿá ŸÅŸÖÿß ÿßŸÑÿ≠ŸÑ ŸÑÿ≤ŸäÿßÿØ
Cleaned:   ÿßŸÑÿ≥ÿ§ÿßŸÑ ------- ÿßŸÑÿ≥ŸÑÿßŸÖ ÿπŸÑŸäŸÉŸÖ ÿßŸÜÿß ŸÖÿµÿßÿ® ÿ®ŸÅŸÇÿ± ÿßŸÑÿØŸÖ ÿßŸÑŸÖŸÜÿ¨ŸÑŸä (ÿßŸÑÿ≥ŸÉŸÑÿ≥ŸÑ) ÿπŸÑŸÖÿß ÿ®ÿßŸÜ ŸÜÿ≥ÿ®Ÿá ÿßŸÑÿ≥ŸÉŸÑÿ≥ŸÑ 72 ŸÅÿπŸÜÿØŸÖÿß ÿ™ÿµÿ®ÿ≠ ŸÜÿ≥ÿ®Ÿá ÿßŸÑÿØŸÖ 7 ŸÅÿßŸÜ ÿßŸÑÿßŸÑÿßŸÖ ÿ™ÿßÿ™Ÿä ÿ®ŸÉÿ´ÿ±Ÿá ŸÅŸÖÿß ÿßŸÑÿ≠ŸÑ ŸÑÿ≤ŸäÿßÿØ


In [None]:
# VIEW PER-CLASS SAMPLE DISTRIBUTION

import pandas as pd
import matplotlib.pyplot as plt

# Rows per class, ordered by class id
class_counts = train_df['label'].value_counts().sort_index()

print("="*80, "PER-CLASS SAMPLE DISTRIBUTION", "="*80, sep="\n")

# Extremes are looked up once and reused in the summary lines below
_fewest, _rarest_class = class_counts.min(), class_counts.idxmin()
_most, _commonest_class = class_counts.max(), class_counts.idxmax()

print(f"\nüìä Summary:")
print(f"   Total classes: {len(class_counts)}")
print(f"   Total samples: {len(train_df):,}")
print(f"   Mean samples/class: {class_counts.mean():.2f}")
print(f"   Median samples/class: {class_counts.median():.2f}")
print(f"   Min samples:  {_fewest} (Class {_rarest_class})")
print(f"   Max samples: {_most} (Class {_commonest_class})")
print(f"   Imbalance ratio: {_most / _fewest:.2f}x")

PER-CLASS SAMPLE DISTRIBUTION

üìä Summary:
   Total classes: 82
   Total samples: 27,951
   Mean samples/class: 340.87
   Median samples/class: 350.50
   Min samples:  7 (Class 6)
   Max samples: 600 (Class 0)
   Imbalance ratio: 85.71x


In [None]:
# DATA AUGMENTATION - BACK-TRANSLATION METHOD (BEST QUALITY)

def augment_minority_classes(train_df, min_samples=50, target_samples=150, normalizer=None):
    """
    Top up under-represented classes with synthetic variants of their own texts.

    Every class with fewer than ``min_samples`` rows receives new rows until it
    has ``target_samples`` rows. New texts come from a mix of back-translation
    (Arabic -> English -> Arabic with the Helsinki-NLP MarianMT models),
    synonym replacement and random word swaps. If the translation models
    cannot be loaded, only the last two methods are used.

    Args:
        train_df: DataFrame with a 'label' column and a 'text' column. When a
            'text_clean' column exists it is used as the source and target
            column instead of 'text'.
        min_samples: Classes with fewer rows than this are augmented.
        target_samples: Row count each augmented class is raised to.
        normalizer: Optional callable applied to every augmented text and to
            the synonym table. When omitted and the target column is
            'text_clean', the notebook's ``preprocess_arabic_text`` is used (if
            defined) so new rows are normalised like the existing ones.

    Returns:
        A new, shuffled DataFrame with the original rows plus the augmented
        ones. Each augmented row is a copy of the row it was derived from with
        only the text column replaced.
    """
    print("\n" + "="*80)
    print("DATA AUGMENTATION - BACK-TRANSLATION METHOD")
    print("="*80)

    from tqdm.auto import tqdm
    import gc
    import random
    import torch

    # Column that is read from and written to
    text_col = 'text_clean' if 'text_clean' in train_df.columns else 'text'

    # 'text_clean' holds normalised text, so whatever is written into it must
    # go through the same normalisation; otherwise the augmented rows (and the
    # synonym lookup) use spellings the rest of the column never contains.
    if normalizer is None and text_col == 'text_clean':
        normalizer = globals().get('preprocess_arabic_text')
    if normalizer is None:
        normalizer = lambda value: value

    # Load translation models
    print("\nLoading translation models...")
    print("  [1/2] Loading Arabic ‚Üí English...")

    try:
        # Imported inside the try so a missing/broken install also triggers
        # the fast-augmentation fallback instead of aborting.
        from transformers import MarianMTModel, MarianTokenizer

        # Arabic to English
        ar_en_model_name = 'Helsinki-NLP/opus-mt-ar-en'
        ar_en_tokenizer = MarianTokenizer.from_pretrained(ar_en_model_name)
        ar_en_model = MarianMTModel.from_pretrained(ar_en_model_name)

        # English to Arabic
        print("  [2/2] Loading English ‚Üí Arabic...")
        en_ar_model_name = 'Helsinki-NLP/opus-mt-en-ar'
        en_ar_tokenizer = MarianTokenizer.from_pretrained(en_ar_model_name)
        en_ar_model = MarianMTModel.from_pretrained(en_ar_model_name)

        # Move to GPU if available
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        ar_en_model = ar_en_model.to(device)
        en_ar_model = en_ar_model.to(device)

        use_backtranslation = True
        print(f"‚úÖ Translation models loaded on {device}")

    except Exception as e:
        use_backtranslation = False
        print(f"‚ö†Ô∏è  Translation models failed: {e}")
        print("   Falling back to fast augmentation...")

    # generate() is called with fixed beam-search settings and no sampling, so
    # translating the same source again would only repeat the same result.
    # Caching per source text avoids that redundant GPU work.
    translation_cache = {}
    failed_translations = 0

    def back_translate(text):
        """Arabic -> English -> Arabic; returns the input unchanged on failure."""
        nonlocal failed_translations
        if text in translation_cache:
            return translation_cache[text]

        try:
            # Translate to English
            inputs = ar_en_tokenizer(text, return_tensors="pt", padding=True,
                                     truncation=True, max_length=512).to(device)
            translated = ar_en_model.generate(**inputs, max_length=512, num_beams=4,
                                              early_stopping=True)
            english_text = ar_en_tokenizer.decode(translated[0], skip_special_tokens=True)

            # Translate back to Arabic
            inputs = en_ar_tokenizer(english_text, return_tensors="pt", padding=True,
                                     truncation=True, max_length=512).to(device)
            back_translated = en_ar_model.generate(**inputs, max_length=512, num_beams=4,
                                                   early_stopping=True)
            arabic_text = en_ar_tokenizer.decode(back_translated[0], skip_special_tokens=True)

            result = arabic_text if arabic_text.strip() else text
        except Exception:
            # Best effort: keep the original text. Catching Exception (not a
            # bare except) lets KeyboardInterrupt stop this long-running loop.
            failed_translations += 1
            result = text

        translation_cache[text] = result
        return result

    def fast_augment(text):
        """Fast fallback - swap 1-2 random pairs of words."""
        words = text.split()
        if len(words) < 2:
            return text

        n_swaps = random.randint(1, min(2, len(words) // 2))
        for _ in range(n_swaps):
            idx1, idx2 = random.sample(range(len(words)), 2)
            words[idx1], words[idx2] = words[idx2], words[idx1]

        return ' '.join(words)

    # Arabic medical synonyms, passed through the normalizer so the keys can
    # match (and the replacements blend into) the target column.
    raw_synonyms = {
        'ŸÖÿ±Ÿäÿ∂': ['ŸÖÿµÿßÿ®', 'ÿπŸÑŸäŸÑ', 'ÿ≥ŸÇŸäŸÖ'],
        'ÿπŸÑÿßÿ¨': ['ÿØŸàÿßÿ°', 'ŸÖÿπÿßŸÑÿ¨ÿ©', 'ÿ∑ÿ®'],
        'ÿ∑ÿ®Ÿäÿ®': ['ÿØŸÉÿ™Ÿàÿ±', 'ŸÖÿπÿßŸÑÿ¨'],
        'ÿ£ŸÑŸÖ': ['Ÿàÿ¨ÿπ', 'ŸÖÿπÿßŸÜÿßÿ©'],
        'ÿ≠ÿßŸÑÿ©': ['Ÿàÿ∂ÿπ', 'ÿ∏ÿ±ŸÅ'],
        'ÿµÿ≠ÿ©': ['ÿπÿßŸÅŸäÿ©', 'ÿ≥ŸÑÿßŸÖÿ©'],
        'ŸÖÿ±ÿ∂': ['ÿØÿßÿ°', 'ÿπŸÑÿ©'],
        'ŸÅÿ≠ÿµ': ['ÿßÿÆÿ™ÿ®ÿßÿ±', 'ŸÉÿ¥ŸÅ'],
        'ÿ£ÿπÿ±ÿßÿ∂': ['ÿπŸÑÿßŸÖÿßÿ™', 'ŸÖÿ∏ÿßŸáÿ±'],
    }
    synonyms = {
        normalizer(key): [normalizer(option) for option in options]
        for key, options in raw_synonyms.items()
    }

    def synonym_replacement(text, n=2):
        """Replace up to ``n`` words that have an entry in the synonym table."""
        words = text.split()
        if len(words) < 3:
            return text

        # Sample only among positions that can be replaced; sampling among all
        # positions almost never hits a table entry and returns a duplicate.
        replaceable = [idx for idx, word in enumerate(words) if word in synonyms]
        for idx in random.sample(replaceable, min(n, len(replaceable))):
            words[idx] = random.choice(synonyms[words[idx]])

        return ' '.join(words)

    def augment_once(text):
        """Apply one randomly chosen augmentation method to ``text``."""
        if use_backtranslation:
            # 60% back-translation, 30% synonym, 10% swap
            roll = random.random()
            if roll < 0.6:
                return back_translate(text)
            if roll < 0.9:
                return synonym_replacement(text, n=2)
            return fast_augment(text)

        # Fallback:  70% synonym, 30% swap
        if random.random() < 0.7:
            return synonym_replacement(text, n=2)
        return fast_augment(text)

    # Identify minority classes
    class_counts = train_df['label'].value_counts()
    minority_classes = class_counts[class_counts < min_samples].index.tolist()

    print(f"\nüìä Found {len(minority_classes)} minority classes (< {min_samples} samples)")
    print(f"   Minority classes: {sorted(minority_classes)}")
    print(f"   Will augment each to {target_samples} samples")

    if use_backtranslation:
        print(f"\nüîÑ Using back-translation (Arabic ‚Üí English ‚Üí Arabic)")
    else:
        print(f"\n‚ö° Using fast augmentation (word swap + synonyms)")

    # Augment
    augmented_rows = []

    for class_id in tqdm(minority_classes, desc="Augmenting classes"):
        class_df = train_df[train_df['label'] == class_id]
        needed = target_samples - len(class_df)

        if needed <= 0:
            continue

        for _ in range(needed):
            # Copy the sampled source row (not always the first row of the
            # class) so the other columns stay consistent with the new text.
            source_row = class_df.iloc[random.randrange(len(class_df))]
            new_row = source_row.copy()
            new_row[text_col] = normalizer(augment_once(source_row[text_col]))
            augmented_rows.append(new_row)

    # Combine and shuffle
    n_original = len(train_df)
    n_added = len(augmented_rows)
    if augmented_rows:
        augmented_df = pd.DataFrame(augmented_rows)
        # Row copies are object-typed Series; restore the label dtype explicitly
        augmented_df['label'] = augmented_df['label'].astype(train_df['label'].dtype)
        train_df = pd.concat([train_df, augmented_df], ignore_index=True)
    train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

    increase_pct = n_added / n_original * 100 if n_original else 0.0

    print(f"\n‚úÖ Augmentation complete!")
    print(f"   Original samples: {n_original:,}")
    print(f"   Added samples: {n_added:,}")
    print(f"   Total samples: {len(train_df):,}")
    print(f"   Increase: +{increase_pct:.1f}%")
    if failed_translations:
        print(f"   Back-translation failures (original text kept): {failed_translations}")

    # Cleanup: drop the translation models and release GPU memory
    if use_backtranslation:
        del ar_en_model, en_ar_model, ar_en_tokenizer, en_ar_tokenizer

    gc.collect()
    torch.cuda.empty_cache()

    return train_df

# USAGE - ADD THIS AFTER PREPROCESSING
            
# Augment training data with back-translation: classes with fewer than 50
# samples are boosted to 150 samples each. The result replaces train_df.
train_df = augment_minority_classes(train_df, min_samples=50, target_samples=150)

# Show final distribution
print("\n" + "="*80, "FINAL CLASS DISTRIBUTION AFTER AUGMENTATION", "="*80, sep="\n")

final_counts = train_df['label'].value_counts().sort_index()
_fewest, _most = final_counts.min(), final_counts.max()
print(f"Min samples:   {_fewest} (Class {final_counts.idxmin()})")
print(f"Max samples:  {_most} (Class {final_counts.idxmax()})")
print(f"Imbalance ratio: {_most / _fewest:.2f}x")
print(f"Total samples: {len(train_df):,}")


DATA AUGMENTATION - BACK-TRANSLATION METHOD

Loading translation models...
  [1/2] Loading Arabic ‚Üí English...


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

  [2/2] Loading English ‚Üí Arabic...


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

‚úÖ Translation models loaded on cuda

üìä Found 24 minority classes (< 50 samples)
   Minority classes: [3, 4, 6, 7, 10, 12, 20, 23, 28, 29, 34, 38, 44, 48, 53, 56, 58, 60, 64, 69, 71, 75, 78, 79]
   Will augment each to 150 samples

üîÑ Using back-translation (Arabic ‚Üí English ‚Üí Arabic)


Augmenting classes:   0%|          | 0/24 [00:00<?, ?it/s]

In [None]:
# 4. COMPUTE CLASS WEIGHTS (CLIPPED FOR STABILITY)

print("\n" + "="*80)
print("[4/12] COMPUTING CLASS WEIGHTS FOR IMBALANCE")
print("="*80)

# 'balanced' weights: one weight per distinct label, in sorted label order.
raw_class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['label']),
    y=train_df['label']
)

# Clip extreme weights to prevent training instability. The unclipped values
# are kept in raw_class_weights so the report below can show both ranges
# (previously the "Original" line was printed after clipping and always
# repeated the clipped numbers).
class_weights = np.clip(raw_class_weights, 0.5, 10.0)
class_weights_tensor = torch.FloatTensor(class_weights)

print(f"\n‚úÖ Class weights computed and CLIPPED!")
print(f"   Original - Min: {raw_class_weights.min():.2f}, Max: {raw_class_weights.max():.2f}")
print(f"   Clipped  - Min: {class_weights.min():.2f}, Max: {class_weights.max():.2f}")
print(f"   Mean weight: {class_weights.mean():.2f}")
print(f"\nüí° Clipped to max=10.0 to prevent gradient explosion")


[4/12] COMPUTING CLASS WEIGHTS FOR IMBALANCE

‚úÖ Class weights computed!
   Mean weight: 7.1595
   Min weight: 0.5681
   Max weight: 48.6951
   Weight range: 85.71x


In [None]:
# 5. TRAIN/VALIDATION SPLIT

print("\n" + "="*80, "[5/12] CREATING TRAIN/VALIDATION SPLIT", "="*80, sep="\n")

# NOTE(review): augment_minority_classes() has already been applied to train_df
# at this point, so augmented variants of one source text can presumably end up
# on both sides of the split and inflate validation scores - TODO confirm and
# consider splitting before augmenting.
_all_texts = train_df['text_clean'].values
_all_labels = train_df['label'].values

# Stratified 80/20 split, reproducible through SEED
train_texts, val_texts, train_labels, val_labels = train_test_split(
    _all_texts,
    _all_labels,
    test_size=0.2,
    stratify=_all_labels,
    random_state=SEED,
)

print(f"\n‚úÖ Split complete!")
print(f"   Training samples: {len(train_texts)}")
print(f"   Validation samples: {len(val_texts)}")
print(f"   Split ratio: 80/20")


[5/12] CREATING TRAIN/VALIDATION SPLIT

‚úÖ Split complete!
   Training samples: 22360
   Validation samples: 5591
   Split ratio: 80/20


In [None]:
# 6. LOAD TOKENIZER AND MODEL

print("\n" + "="*80, "[6/12] LOADING ARABERT V2 MODEL", "="*80, sep="\n")

print(f"\nLoading tokenizer from {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print(f"Loading model from {MODEL_NAME}...")
_classifier_options = dict(
    num_labels=NUM_LABELS,
    problem_type="single_label_classification",
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **_classifier_options)


# CRITICAL: Enable gradient checkpointing (trades compute for memory)
model.gradient_checkpointing_enable()

print(f"\n‚úÖ Model loaded successfully!")

# One pass over the parameters, recording size and whether it is trainable
_param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in _param_info)
trainable_params = sum(size for size, needs_grad in _param_info if needs_grad)
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable parameters: {trainable_params:,}")


[6/12] LOADING ARABERT V2 MODEL

Loading tokenizer from aubmindlab/bert-large-arabertv02...
Loading model from aubmindlab/bert-large-arabertv02...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-large-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



‚úÖ Model loaded successfully!
   Total parameters: 369,507,410
   Trainable parameters: 369,507,410


In [None]:
# 7. CREATE DATASET

print("\n" + "="*80)
print("[7/12] CREATING PYTORCH DATASETS")
print("="*80)

class ArabicMedicalDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Indexable collection of texts (each is passed through ``str``).
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer that returns 'input_ids' and
            'attention_mask' tensors when called with ``return_tensors='pt'``.
        max_length: Length every item is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened 'input_ids', 'attention_mask' and long 'labels' tensors for row ``idx``."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer output carries a leading batch axis of size 1; flatten drops it
        item = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap both splits; tokenizer and maximum length are shared
train_dataset = ArabicMedicalDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_length=MAX_LENGTH
)
val_dataset = ArabicMedicalDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_length=MAX_LENGTH
)

print(f"\n‚úÖ Datasets created!")
print(f"   Train dataset: {len(train_dataset)} samples")
print(f"   Val dataset: {len(val_dataset)} samples")



[7/12] CREATING PYTORCH DATASETS

‚úÖ Datasets created!
   Train dataset: 22360 samples
   Val dataset: 5591 samples


In [None]:
# 8. DEFINE FOCAL LOSS

print("\n" + "="*80)
print("[8/12] SETTING UP FOCAL LOSS")
print("="*80)

class FocalLoss(nn.Module):
    """Focal loss for multi-class classification with optional class weights.

    Per sample the loss is ``alpha[target] * (1 - p_t) ** gamma * (-log p_t)``,
    where ``p_t`` is the softmax probability the model assigns to the true
    class.

    Args:
        alpha: Optional 1-D tensor with one weight per class, or ``None`` for
            no class weighting.
        gamma: Focusing exponent; 0 reduces the loss to (weighted) cross-entropy.
        reduction: 'mean' or 'sum' to reduce over samples; any other value
            returns the per-sample losses.
    """
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the loss for logits ``inputs`` and integer class ids ``targets``."""
        # p_t must come from the UNWEIGHTED cross-entropy. Deriving it from the
        # alpha-weighted value (as before) yields p_t ** alpha instead of p_t
        # and distorts the (1 - p_t) ** gamma modulating factor.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.alpha is not None:
            # Apply the class weight once, as a plain multiplicative factor;
            # alpha is moved to the logits' device if it lives elsewhere.
            alpha = self.alpha.to(device=inputs.device, dtype=focal_loss.dtype)
            focal_loss = alpha[targets] * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

print("‚úÖ Focal Loss defined with gamma=2.0 and class weights")


[8/12] SETTING UP FOCAL LOSS
‚úÖ Focal Loss defined with gamma=2.0 and class weights


In [None]:
# 9. CUSTOM TRAINER WITH FOCAL LOSS (UPDATED FOR NEW TRANSFORMERS)

print("\n" + "="*80)
print("[9/12] CREATING CUSTOM TRAINER")
print("="*80)

class CustomTrainer(Trainer):
    """Trainer that uses FocalLoss (gamma=2.5) when class weights are given.

    Args:
        class_weights: Optional tensor of per-class weights. When ``None`` the
            loss falls back to plain cross-entropy.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        self.loss_fn = None

        if class_weights is not None:
            alpha = class_weights
            # Put the weights on the training device when the arguments expose one
            if hasattr(self.args, 'device'):
                alpha = class_weights.to(self.args.device)
            self.loss_fn = FocalLoss(alpha=alpha, gamma=2.5)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Run the model and score its logits with the configured loss.

        ``num_items_in_batch`` is accepted for compatibility with the newer
        Trainer signature (transformers 4.46+) and is not used.
        Note that "labels" is removed from ``inputs`` before the forward pass.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)

        criterion = self.loss_fn if self.loss_fn is not None else F.cross_entropy
        loss = criterion(outputs.logits, labels)

        if return_outputs:
            return (loss, outputs)
        return loss

def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into macro/micro/weighted F1 and accuracy.

    Args:
        eval_pred: Pair unpacked as ``(logits, labels)``; the predicted class
            is the argmax of the logits along axis 1.

    Returns:
        Dict with keys 'macro_f1', 'micro_f1', 'weighted_f1' and 'accuracy'.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)

    scores = {
        f'{averaging}_f1': f1_score(labels, predicted, average=averaging, zero_division=0)
        for averaging in ('macro', 'micro', 'weighted')
    }
    scores['accuracy'] = accuracy_score(labels, predicted)
    return scores

print("‚úÖ Custom Trainer with Focal Loss ready!")
print("   ‚úì Compatible with Transformers 4.46+")
print("   ‚úì Supports num_items_in_batch parameter")


[9/12] CREATING CUSTOM TRAINER
‚úÖ Custom Trainer with Focal Loss ready!
   ‚úì Compatible with Transformers 4.46+
   ‚úì Supports num_items_in_batch parameter


In [None]:
# 10. TRAINING CONFIGURATION

print("\n" + "="*80, "[10/12] CONFIGURING TRAINING", "="*80, sep="\n")

training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir=OUTPUT_DIR,
    logging_dir=LOGS_DIR,
    logging_steps=50,
    report_to='none',
    # Optimisation schedule
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=WARMUP_STEPS,
    # Batching
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    gradient_accumulation_steps=2,
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch; keep the best model by macro F1
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
    seed=SEED,
)

# Stop when the monitored metric has not improved for 3 evaluations
_early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    class_weights=class_weights_tensor,
    callbacks=[_early_stopping]
)

print("‚úÖ Training configuration complete!")
print(f"   Epochs: {NUM_EPOCHS}")
print(f"   Batch size: {BATCH_SIZE}")
print(f"   Learning rate:  {LEARNING_RATE}")
print(f"   FP16: {training_args.fp16}")
print(f"   Early stopping patience: 3 epochs")


[10/12] CONFIGURING TRAINING
‚úÖ Training configuration complete!
   Epochs: 5
   Batch size: 12
   Learning rate:  2e-05
   FP16: True
   Early stopping patience: 3 epochs


In [None]:
# 11. TRAIN MODEL

print("\n" + "="*80, "[11/12] STARTING TRAINING", "="*80, sep="\n")
print("\nThis may take 30-60 minutes depending on your hardware.. .\n")

# Wall-clock timestamps around the training run; the difference is reported
# by the results cell.
training_start_time = datetime.now()
train_result = trainer.train()
training_end_time = datetime.now()

training_duration = training_end_time - training_start_time

print("\n" + "="*80, "‚úÖ TRAINING COMPLETE!", sep="\n")


[11/12] STARTING TRAINING

This may take 30-60 minutes depending on your hardware.. .



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
def _print_metrics(metrics):
    """Print one indented line per metric; float values are shown with 4 decimals."""
    for name, value in metrics.items():
        if isinstance(value, float):
            line = f"   {name}:  {value:.4f}"
        else:
            line = f"   {name}: {value}"
        print(line)


print("="*80)
print(f"\nTraining Duration: {training_duration}")

print(f"\nFinal Training Metrics:")
_print_metrics(train_result.metrics)

# Evaluate on validation set
print("\n" + "="*80, "EVALUATING ON VALIDATION SET", "="*80, sep="\n")

eval_results = trainer.evaluate()

print(f"\n‚úÖ Validation Results:")
_print_metrics(eval_results)

print(f"\nüéØ MACRO F1 SCORE (Competition Metric): {eval_results['eval_macro_f1']:.4f}")

In [None]:
# KAGGLE:  SAVE MODEL TO KAGGLE DATASET

print("\n" + "="*80, "SAVING MODEL FOR KAGGLE", "="*80, sep="\n")

# In Kaggle, save to /kaggle/working/ (this gets saved as output)
MODEL_SAVE_DIR = '/kaggle/working/arabert_medical_model'
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

print(f"\nSaving model to {MODEL_SAVE_DIR}...")

# Model weights/config and tokenizer files go into the same directory
trainer.save_model(MODEL_SAVE_DIR)
tokenizer.save_pretrained(MODEL_SAVE_DIR)

# Run configuration followed by the validation scores, written as metadata.json
metadata = dict(
    model_name=MODEL_NAME,
    num_labels=NUM_LABELS,
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    num_epochs=NUM_EPOCHS,
)
for _field in ('macro_f1', 'micro_f1', 'accuracy'):
    metadata[_field] = float(eval_results[f'eval_{_field}'])
metadata['training_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

_metadata_path = f'{MODEL_SAVE_DIR}/metadata.json'
with open(_metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

# Save class weights
np.save(f'{MODEL_SAVE_DIR}/class_weights.npy', class_weights)

print("\n‚úÖ Model saved successfully!")
print(f"\nüìÅ Saved files in {MODEL_SAVE_DIR}:")
for _saved_name in os.listdir(MODEL_SAVE_DIR):
    print(f"   ‚Ä¢ {_saved_name}")

In [None]:
# DETAILED EVALUATION

print("\n" + "="*80)
print("DETAILED EVALUATION")
print("="*80)

predictions = trainer.predict(val_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = val_labels

# Per-class F1 scores.
# `labels=` pins the result to exactly one entry per class id, in id order.
# Without it, f1_score only reports classes that occur in true_labels or
# pred_labels, so a missing class would shift every later id in the tables
# below.
all_class_ids = list(range(NUM_LABELS))
per_class_f1 = f1_score(
    true_labels, pred_labels, labels=all_class_ids, average=None, zero_division=0
)

print(f"\nPer-Class F1 Statistics:")
print(f"   Mean: {per_class_f1.mean():.4f}")
print(f"   Min: {per_class_f1.min():.4f}")
print(f"   Max: {per_class_f1.max():.4f}")
print(f"   Std: {per_class_f1.std():.4f}")

# Best and worst performing classes (sorted by F1, best first)
class_f1_dict = dict(zip(all_class_ids, per_class_f1))
sorted_classes = sorted(class_f1_dict.items(), key=lambda x: x[1], reverse=True)

# Both listings use ':2d' for the class id; the first one used ': 2d'
# (space sign flag), which misaligned two-digit ids.
print(f"\nüèÜ Top 10 Best Performing Classes:")
for class_id, f1 in sorted_classes[:10]:
    count = label_counts[class_id]
    print(f"   Class {class_id:2d}: F1={f1:.4f}, Samples={count:4d}")

print(f"\n‚ö†Ô∏è  Top 10 Worst Performing Classes:")
for class_id, f1 in sorted_classes[-10:]:
    count = label_counts[class_id]
    print(f"   Class {class_id:2d}: F1={f1:.4f}, Samples={count:4d}")

# Save per-class F1 scores
per_class_results = {
    'class_id': all_class_ids,
    'f1_score': per_class_f1.tolist(),
    'sample_count': [int(label_counts[i]) for i in all_class_ids]
}
per_class_df = pd.DataFrame(per_class_results)
per_class_df.to_csv(f'{MODEL_SAVE_DIR}/per_class_f1_scores.csv', index=False)
print(f"\n‚úÖ Per-class F1 scores saved to {MODEL_SAVE_DIR}/per_class_f1_scores.csv")

In [None]:
# VISUALIZATION

print("\n" + "="*80, "CREATING PERFORMANCE VISUALIZATIONS", "="*80, sep="\n")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
ax1, ax2, ax3, ax4 = axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1]

# Summary values reused in several legends
_mean_f1 = per_class_f1.mean()
_median_f1 = np.median(per_class_f1)

# 1. Per-class F1 scores (one bar per class id, mean as dashed line)
ax1.bar(range(NUM_LABELS), per_class_f1, color='green', alpha=0.6)
ax1.axhline(y=_mean_f1, color='red', linestyle='--', linewidth=2, label=f'Mean: {_mean_f1:.4f}')
ax1.set_title('Per-Class F1 Scores', fontsize=14, fontweight='bold')
ax1.set_xlabel('Class Label')
ax1.set_ylabel('F1 Score')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)

# 2. F1 Score vs Sample Count (counts come from label_counts)
sample_counts = [label_counts[i] for i in range(NUM_LABELS)]
ax2.scatter(sample_counts, per_class_f1, alpha=0.6, c=per_class_f1, cmap='RdYlGn')
ax2.set_title('F1 Score vs Sample Count', fontsize=14, fontweight='bold')
ax2.set_xlabel('Number of Training Samples')
ax2.set_ylabel('F1 Score')
ax2.grid(True, alpha=0.3)

# 3. F1 Score Distribution (histogram with mean and median markers)
ax3.hist(per_class_f1, bins=30, color='skyblue', alpha=0.7, edgecolor='black')
ax3.axvline(_mean_f1, color='red', linestyle='--', linewidth=2, label=f'Mean: {_mean_f1:.4f}')
ax3.axvline(_median_f1, color='green', linestyle='--', linewidth=2, label=f'Median: {_median_f1:.4f}')
ax3.set_title('F1 Score Distribution', fontsize=14, fontweight='bold')
ax3.set_xlabel('F1 Score')
ax3.set_ylabel('Frequency')
ax3.legend()
ax3.grid(axis='y', alpha=0.3)

# 4. Top/Bottom Classes (sorted_classes is ordered best-first)
top_5 = sorted_classes[:5]
bottom_5 = sorted_classes[-5:]
combined = top_5 + bottom_5
class_ids = [class_id for class_id, _ in combined]
f1_scores = [score for _, score in combined]
colors = ['green']*5 + ['red']*5
_positions = range(len(combined))
ax4.barh(_positions, f1_scores, color=colors, alpha=0.7)
ax4.set_yticks(_positions)
ax4.set_yticklabels([f'Class {c}' for c in class_ids])
ax4.set_title('Top 5 and Bottom 5 Classes', fontsize=14, fontweight='bold')
ax4.set_xlabel('F1 Score')
ax4.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig(f'{MODEL_SAVE_DIR}/performance_analysis.png', dpi=300, bbox_inches='tight')
print(f"‚úÖ Performance visualizations saved to {MODEL_SAVE_DIR}/performance_analysis.png")
plt.show()


In [None]:
# EACL 2026 Abjad NLP:  Generate Submission File
# Load trained model and create predictions

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm. auto import tqdm
import re
import json
import warnings
from collections import Counter
import matplotlib.pyplot as plt

# Silence library warnings so the inference log below stays readable.
warnings.filterwarnings(action='ignore')

# Banner announcing the submission-generation part of the notebook.
print("=" * 80, "EACL 2026 ABJAD NLP - SUBMISSION GENERATOR", "=" * 80, sep="\n")

In [None]:
# CONFIGURATION (submission / inference stage)

# Directory holding the saved model, tokenizer and metadata.json
MODEL_PATH = '/kaggle/working/arabert_medical_model'

# Unlabelled CSV that predictions are generated for
TEST_DATA_PATH = '/kaggle/input/arabic/shared_task_devtest_no_label.csv'

# Name of the prediction file written at the end (relative to the working dir)
SUBMISSION_FILE = 'submission.csv'

# Number of texts per forward pass during inference
BATCH_SIZE = 32

# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"\nüñ•Ô∏è  Using device: {device}")
if device.type == 'cuda':
    print(f"   GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# KAGGLE:  LOAD MODEL FROM KAGGLE DATASET
#
# Verifies that the saved-model directory exists, lists its files, then loads
# metadata.json, the tokenizer and the sequence-classification model, and puts
# the model in eval mode on the selected device.

# Imports this cell relies on, grouped up front instead of being scattered
# between the statements below (json is imported by the section's first cell).
import os

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

print("="*80)
print("LOADING MODEL FROM KAGGLE DATASET")
print("="*80)

MODEL_PATH = '/kaggle/working/arabert_medical_model'

# Fail fast, with remediation hints, unless MODEL_PATH is a directory.
# os.path.isdir (rather than os.path.exists) also rejects a regular file at
# that path, which os.listdir below could not enumerate.
if not os.path.isdir(MODEL_PATH):
    print(f"\n‚ùå ERROR: Model not found at {MODEL_PATH}")
    print("\nüîß HOW TO FIX:")
    print("1. Make sure you ran the training notebook")
    print("2. Saved the output as a dataset named 'arabert-medical-model'")
    print("3. Added it as input to THIS notebook (Add Data -> Your Datasets)")
    raise FileNotFoundError(f"Model not found at {MODEL_PATH}")

print(f"\n‚úÖ Model directory found:  {MODEL_PATH}")
print(f"\nüìÅ Files in model directory:")
for file in os.listdir(MODEL_PATH):
    print(f"   ‚Ä¢ {file}")

# Load metadata (a later cell reads metadata['max_length'] from it)
print("\nLoading metadata...")
with open(f'{MODEL_PATH}/metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

print("\nüìã Model Metadata:")
for key, value in metadata.items():
    print(f"   {key}: {value}")

# Load tokenizer
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
print("‚úÖ Tokenizer loaded!")

# Load model, move it to the target device and switch to evaluation mode
print("Loading model...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
model.to(device)
model.eval()

print(f"‚úÖ Model loaded and moved to {device}!")
print(f"   Parameters: {sum(p.numel() for p in model.parameters()):,}")

print("\n" + "="*80)
print("‚úÖ MODEL LOADED SUCCESSFULLY - READY FOR INFERENCE!")
print("="*80)

In [None]:
# 2. LOAD TEST DATA

print("\n" + "="*80, "[2/7] LOADING TEST DATA", "="*80, sep="\n")

# Read the unlabelled test CSV configured in TEST_DATA_PATH
test_df = pd.read_csv(TEST_DATA_PATH)

print("\n‚úÖ Test data loaded!")
print(f"   Shape: {test_df.shape}")
print(f"   Columns: {test_df.columns.tolist()}")
print("\nFirst few rows:")
print(test_df.head())

# Check for missing values: per-column null counts, shown only when any exist
missing_per_column = test_df.isnull().sum()
if missing_per_column.sum() == 0:
    print("\n‚úÖ No missing values")
else:
    print("\n‚ö†Ô∏è  Warning: Missing values detected")
    print(missing_per_column)

In [None]:
# 3. PREPROCESS TEST DATA

# Step banner: blank line, rule, step title, rule
print("\n" + "=" * 80, "[3/7] PREPROCESSING TEST DATA", "=" * 80, sep="\n")

def preprocess_arabic_text(text):
    """Normalize one Arabic text before tokenization.

    Intended to mirror the preprocessing used at training time (per the
    original note on this function; the training-side code is not shown here).

    Steps, in order:
      1. strip the diacritics U+064B..U+0652 (tanween, short vowels, shadda, sukun);
      2. fold the alef variants (U+0625, U+0623, U+0622) into bare alef (U+0627);
      3. map alef maksura (U+0649) to yeh (U+064A);
      4. map teh marbuta (U+0629) to heh (U+0647);
      5. collapse whitespace runs to single spaces and trim both ends.

    Args:
        text: The raw text. Anything that is not a ``str`` (e.g. NaN/None) is
            treated as missing.

    Returns:
        str: The normalized text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # All Arabic code points are spelled as \u escapes so the patterns do not
    # depend on how this file's bytes are decoded (raw literals pasted into the
    # source are easily corrupted into unrelated Latin characters).

    # Remove diacritics
    text = re.sub('[\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652]', '', text)

    # Normalize Arabic letters
    text = re.sub('[\u0625\u0623\u0622\u0627]', '\u0627', text)
    text = re.sub('\u0649', '\u064A', text)
    text = re.sub('\u0629', '\u0647', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean every test text; the result is stored next to the raw column so both
# stay available for inspection.
print("Applying preprocessing...")
test_df['text_clean'] = test_df['text'].apply(preprocess_arabic_text)

print(f"‚úÖ Preprocessing complete!")
print(f"   Total samples: {len(test_df)}")

# Show up to two before/after pairs. The preview is bounded by the frame length
# so a test set with fewer than two rows does not raise IndexError, and the raw
# value goes through str() so a missing (NaN) text can still be sliced.
print("\nExample preprocessing:")
for i in range(min(2, len(test_df))):
    original_text = str(test_df['text'].iloc[i])
    print(f"\n{i+1}. Original: {original_text[:120]}...")
    print(f"   Cleaned:   {test_df['text_clean'].iloc[i][:120]}...")

In [None]:
# 4. CREATE TEST DATASET

# Step banner: blank line, rule, step title, rule
print("\n" + "=" * 80, "[4/7] CREATING TEST DATASET", "=" * 80, sep="\n")

class TestDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for inference.

    Args:
        texts: Indexable, sized collection of texts; each item is passed
            through ``str()`` before tokenization.
        tokenizer: Callable invoked with the text plus padding/truncation
            options; its result must expose ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_length):
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its two tensors flattened to 1-D."""
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(str(self.texts[idx]), **tokenizer_options)

        # The tokenizer output carries a leading batch axis; drop it so the
        # DataLoader can stack items into (batch, max_length) tensors.
        return {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }

# Reuse the sequence length recorded in the model's metadata
MAX_LENGTH = metadata['max_length']

# Wrap the cleaned texts for tokenization on access
test_dataset = TestDataset(
    texts=test_df['text_clean'].values,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

# Keep the original row order (no shuffling) so predictions align with test_df
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

print("‚úÖ Test dataset created!")
print(
    f"   Samples: {len(test_dataset)}",
    f"   Batch size: {BATCH_SIZE}",
    f"   Batches: {len(test_loader)}",
    f"   Max length:  {MAX_LENGTH}",
    sep="\n",
)

In [None]:
# 5. GENERATE PREDICTIONS
#
# Runs the model over every test batch with gradient tracking disabled and
# collects, per sample, the arg-max label and the softmax distribution.

print("\n" + "="*80, "[5/7] GENERATING PREDICTIONS", "="*80, sep="\n")

print("\nRunning inference.. .\n")

all_predictions, all_probabilities = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        # Move the batch tensors onto the device the model lives on
        input_ids, attention_mask = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask')
        )

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Class probabilities and the winning class for each sample
        probabilities = logits.softmax(dim=-1)
        predictions = logits.argmax(dim=-1)

        # Accumulate one entry per sample (rows of the batch arrays)
        all_predictions.extend(predictions.cpu().numpy())
        all_probabilities.extend(probabilities.cpu().numpy())

# Turn the per-sample lists into arrays for the analysis that follows
all_predictions = np.asarray(all_predictions)
all_probabilities = np.asarray(all_probabilities)

print("\n‚úÖ Predictions complete!")
print(f"   Total predictions: {len(all_predictions)}")
print(f"   Prediction range: [{all_predictions.min()}, {all_predictions.max()}]")

In [None]:
# 6. ANALYZE PREDICTIONS
#
# Summarises the predicted-label distribution and the per-sample confidence,
# then saves a two-panel figure (label counts, confidence histogram).

print("\n" + "="*80, "[6/7] ANALYZING PREDICTIONS", "="*80, sep="\n")

# Prediction statistics
pred_counts = Counter(all_predictions)
print("\nPrediction Statistics:")
print(f"   Unique classes predicted: {len(pred_counts)}/82")
print(f"   Range:  {all_predictions.min()} to {all_predictions.max()}")

# Validate predictions: anything outside the label ids 0..81 is invalid
invalid_preds = ~((all_predictions >= 0) & (all_predictions <= 81))
if not invalid_preds.any():
    print("\n‚úÖ All predictions in valid range [0, 81]")
else:
    print(f"\n‚ùå ERROR: {invalid_preds.sum()} invalid predictions!")

# Top predicted classes
print("\nüìä Top 10 Most Predicted Classes:")
for label, count in pred_counts.most_common(10):
    print(f"   Class {label: 2d}: {count:5d} predictions ({count/len(all_predictions)*100:5.2f}%)")

# Confidence analysis: the confidence of a sample is its highest class probability
max_probs = all_probabilities.max(axis=1)
print("\nüéØ Prediction Confidence:")
print(f"   Mean:  {max_probs.mean():.4f}")
print(f"   Median: {np.median(max_probs):.4f}")
print(f"   Min: {max_probs.min():.4f}")
print(f"   Max: {max_probs.max():.4f}")

low_conf = np.sum(max_probs < 0.5)
print(f"   Low confidence (<0.5): {low_conf} ({low_conf/len(max_probs)*100:.2f}%)")

# Visualization: one row, two panels
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ax1, ax2 = axes

# Prediction distribution (bars ordered by class id)
pred_counts_sorted = sorted(pred_counts.items())
if pred_counts_sorted:
    labels, counts = zip(*pred_counts_sorted)
else:
    labels, counts = [], []
ax1.bar(labels, counts, color='steelblue', alpha=0.7)
ax1.set_title('Prediction Distribution', fontsize=14, fontweight='bold')
ax1.set(xlabel='Predicted Class', ylabel='Count')
ax1.grid(axis='y', alpha=0.3)

# Confidence distribution with mean (red) and median (green) markers
ax2.hist(max_probs, bins=50, color='coral', alpha=0.7, edgecolor='black')
ax2.axvline(max_probs.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {max_probs.mean():.3f}')
ax2.axvline(np.median(max_probs), color='green', linestyle='--', linewidth=2, label=f'Median: {np.median(max_probs):.3f}')
ax2.set_title('Prediction Confidence Distribution', fontsize=14, fontweight='bold')
ax2.set(xlabel='Confidence Score', ylabel='Frequency')
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('prediction_analysis.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Analysis visualization saved:  prediction_analysis.png")
plt.show()

In [None]:
# 7. CREATE AND VALIDATE SUBMISSION
#
# Builds the (Id, Predicted) frame from all_predictions, runs six format
# checks, writes the CSV, reads it back to verify the round trip, and reports
# the file's details.

print("\n" + "="*80)
print("[7/7] CREATING SUBMISSION FILE")
print("="*80)


def _report_check(passed, ok_message, fail_message):
    """Print the line matching one validation outcome and return it as a bool.

    Args:
        passed: Truthy when the check succeeded (NumPy booleans are accepted).
        ok_message: Line printed when the check passed.
        fail_message: Line printed when the check failed.

    Returns:
        bool: The outcome as a plain Python bool.
    """
    passed = bool(passed)
    print(ok_message if passed else fail_message)
    return passed


# Create submission dataframe: Id is the 0-based position of each prediction
submission_df = pd.DataFrame({
    'Id': range(len(all_predictions)),
    'Predicted': all_predictions
})

print(f"\nüìÑ Submission Preview:")
print(submission_df.head(10))
print("...")
print(submission_df.tail(10))

# Validation
print(f"\nüîç Validating submission format...")

# Each entry prints its own result; list order is the order the checks run in.
checks = [
    # Check 1: Column names
    _report_check(
        list(submission_df.columns) == ['Id', 'Predicted'],
        "   ‚úÖ Column names correct",
        f"   ‚ùå Column names incorrect:  {list(submission_df.columns)}",
    ),
    # Check 2: Id column is exactly 0, 1, ..., n-1
    _report_check(
        np.array_equal(submission_df['Id'].to_numpy(), np.arange(len(submission_df))),
        "   ‚úÖ Id column sequential from 0",
        "   ‚ùå Id column not sequential",
    ),
    # Check 3: Prediction range
    _report_check(
        (submission_df['Predicted'] >= 0).all() and (submission_df['Predicted'] <= 81).all(),
        "   ‚úÖ All predictions in [0, 81]",
        "   ‚ùå Predictions out of range",
    ),
    # Check 4: No missing values
    _report_check(
        submission_df.isnull().sum().sum() == 0,
        "   ‚úÖ No missing values",
        f"   ‚ùå Missing values found",
    ),
    # Check 5: Data types
    _report_check(
        submission_df['Id'].dtype == np.int64
        and submission_df['Predicted'].dtype in [np.int64, np.int32],
        "   ‚úÖ Data types correct",
        f"   ‚ùå Data types incorrect",
    ),
    # Check 6: Row count
    _report_check(
        len(submission_df) == len(test_df),
        f"   ‚úÖ Row count matches ({len(submission_df)})",
        f"   ‚ùå Row count mismatch",
    ),
]

if all(checks):
    print(f"\n‚úÖ All validation checks passed!")
else:
    print(f"\n‚ö†Ô∏è  Some validation checks failed!")

# Save submission file (written even when a check failed, so it can be inspected)
print(f"\nSaving submission to: {SUBMISSION_FILE}")
submission_df.to_csv(SUBMISSION_FILE, index=False)

# Verify saved file by reading it back and comparing values and dtypes
verify_df = pd.read_csv(SUBMISSION_FILE)
if verify_df.equals(submission_df):
    print(f"‚úÖ Submission file saved and verified!")
else:
    print(f"‚ö†Ô∏è  Warning: Verification mismatch")

# Report the real on-disk size of the CSV. The previous rows * 2 * 8 bytes
# figure was the in-memory size of two int64 columns, not the text file's size.
# `os` is imported by the model-loading cell of this section.
file_size = os.path.getsize(SUBMISSION_FILE) / 1024  # KB
print(f"\nüì¶ File Details:")
print(f"   Filename: {SUBMISSION_FILE}")
print(f"   Rows: {len(submission_df)}")
print(f"   Size: ~{file_size:.2f} KB")