# OSACT7 AdabEval - Task A: Arabic Politeness Detection

Ce notebook permet de classifier des textes arabes en trois catégories :
- **Polite** (Poli)
- **Impolite** (Impoli)
- **Neutral** (Neutre)

**Meilleur modèle :** MARBERT avec Focal Loss (F1 macro = 0.84)

---

## Instructions
1. Assurez-vous d'avoir un GPU activé (Runtime > Change runtime type > GPU)
2. Uploadez vos fichiers CSV (`TaskApoliteness_train.csv`, `TaskApoliteness_val.csv`) ou utilisez Google Drive
3. Exécutez les cellules dans l'ordre

## 1. Configuration et Installation

In [None]:
# Installation des dependances
!pip install -q transformers datasets accelerate
!pip install -q arabert
!pip install -q xgboost imbalanced-learn
!pip install -q seaborn

In [None]:
# GPU check: report the PyTorch build and, when CUDA is usable, the first device
import torch

cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA disponible: {cuda_ready}")
if cuda_ready:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 to display it in (decimal) gigabytes
    print(f"Memoire GPU: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Imports
import os
import json
import re
import warnings

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, classification_report
)
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from transformers import (
    AutoTokenizer, AutoModel, AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

try:
    from arabert.preprocess import ArabertPreprocessor
except ImportError:
    ArabertPreprocessor = None

warnings.filterwarnings('ignore')

# Configuration
SEED = 42
NUM_LABELS = 3
MAX_LENGTH = 128
BATCH_SIZE = 16
# Use the GPU when CUDA is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def set_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices) with the same value.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import keeps this cell self-contained; the stdlib generator was
    # previously left unseeded.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)
print(f"Device: {DEVICE}")

## 2. Chargement des Données

**Option A:** Upload direct des fichiers CSV

**Option B:** Montage Google Drive

In [None]:
# Option A: upload the CSV files through the Colab file picker
from google.colab import files

USE_DRIVE = False  # Set to True to read the data from Google Drive instead

if not USE_DRIVE:
    print("Uploadez vos fichiers CSV (TaskApoliteness_train.csv et TaskApoliteness_val.csv)")
    uploaded = files.upload()
    # NOTE(review): assumes the uploaded files keep these exact names and are
    # readable from the current working directory - the result of the upload
    # is not checked here.
    TRAIN_CSV = "TaskApoliteness_train.csv"
    VAL_CSV = "TaskApoliteness_val.csv"
else:
    # Option B: read the CSV files from a mounted Google Drive folder
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_DIR = "/content/drive/MyDrive/abeval"  # Edit this path to match your Drive layout
    TRAIN_CSV = f"{DATA_DIR}/TaskApoliteness_train.csv"
    VAL_CSV = f"{DATA_DIR}/TaskApoliteness_val.csv"

In [None]:
# Load the training and validation splits from the paths chosen above
train_df, val_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, VAL_CSV))

n_train_rows = len(train_df)
n_val_rows = len(val_df)
print(f"Donnees d'entrainement: {n_train_rows} exemples")
print(f"Donnees de validation: {n_val_rows} exemples")

# Per-class example counts of the training split
train_label_counts = train_df['label'].value_counts()
print(f"\nDistribution des classes (train):")
print(train_label_counts)

In [None]:
# Bar charts of the label distribution, one panel per split
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

bar_colors = ['#3498db', '#e74c3c', '#2ecc71']
split_panels = [(train_df, 'Distribution Train'), (val_df, 'Distribution Validation')]

for panel_axis, (split_frame, panel_title) in zip(axes, split_panels):
    split_frame['label'].value_counts().plot(kind='bar', ax=panel_axis, color=bar_colors)
    panel_axis.set_title(panel_title)
    panel_axis.set_ylabel('Nombre')

plt.tight_layout()
plt.show()

## 3. Preprocessing

In [None]:
def preprocess_text(text, arabert_prep=None):
    """Clean and normalize one Arabic text.

    Missing values (as detected by ``pd.isna``) become the empty string. Any
    other input is converted with ``str``. When ``arabert_prep`` is given, its
    ``preprocess`` method is applied first on a best-effort basis: if it
    raises, the text is kept as it was. URLs, ``@mentions`` and ``#hashtags``
    are then removed, alef variants / ta marbuta / alef maqsura are mapped to
    a single form each, and whitespace runs are collapsed to one space.

    Args:
        text: Raw input value (string, NaN/None, or anything ``str`` accepts).
        arabert_prep: Optional object exposing ``preprocess(text)``.

    Returns:
        The cleaned string.
    """
    if pd.isna(text):
        return ""

    cleaned = str(text)
    if arabert_prep is not None:
        try:
            cleaned = arabert_prep.preprocess(cleaned)
        except Exception:
            # Deliberate best-effort: fall back to the un-preprocessed text
            pass

    # (pattern, replacement, flags), applied in this exact order
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # URLs
        (r'@\w+|#\w+', '', 0),                            # mentions and hashtags
        (r'[إأآا]', 'ا', 0),                              # alef variants
        (r'ة', 'ه', 0),                                   # ta marbuta -> ha
        (r'ى', 'ي', 0),                                   # alef maqsura -> ya
    )
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    # split()/join collapses every whitespace run and trims both ends
    return ' '.join(cleaned.split())

# Encodage des labels
le = LabelEncoder()
train_df['label_encoded'] = le.fit_transform(train_df['label'])
val_df['label_encoded'] = le.transform(val_df['label'])

label_map = {int(i): str(l) for i, l in enumerate(le.classes_)}
print(f"Mapping des labels: {label_map}")

y_train = train_df['label_encoded'].values
y_val = val_df['label_encoded'].values

# Class-weight computation
def compute_class_weights(labels, num_classes):
    """Compute "balanced" inverse-frequency class weights.

    Each class that occurs in ``labels`` gets the weight
    ``len(labels) / (num_classes * count)``. A class that never occurs gets
    weight 0.0 instead of the ``inf`` a plain division would produce, so the
    returned tensor is always finite.

    Args:
        labels: 1-D sequence/array of non-negative integer class ids.
        num_classes: Minimum number of classes to report weights for.

    Returns:
        A ``torch.float32`` tensor with one weight per class id.
    """
    counts = np.bincount(labels, minlength=num_classes).astype(float)
    weights = np.zeros_like(counts)
    # Only divide where the class is present to avoid a division by zero
    present = counts > 0
    weights[present] = len(labels) / (num_classes * counts[present])
    return torch.tensor(weights, dtype=torch.float32)

# Per-class loss weights derived from the training label frequencies
class_weights = compute_class_weights(y_train, NUM_LABELS)
print(f"Poids des classes: {class_weights.tolist()}")

## 4. Définition du Modèle

In [None]:
# Focal loss - better suited to imbalanced data
class FocalLoss(nn.Module):
    """Multi-class focal loss with optional per-class weights.

    Down-weights well-classified examples so that training concentrates on
    the hard ones: ``loss_i = alpha[y_i] * (1 - p_i) ** gamma * (-log p_i)``,
    where ``p_i`` is the softmax probability of the target class. The
    per-example losses are averaged over the batch.

    Args:
        alpha: Optional 1-D tensor (or sequence) with one weight per class;
            ``None`` disables class weighting.
        gamma: Focusing exponent; ``0`` reduces to (weighted) cross-entropy.
    """

    def __init__(self, alpha=None, gamma=2.0):
        super().__init__()
        self.gamma = gamma
        if alpha is not None and not torch.is_tensor(alpha):
            alpha = torch.as_tensor(alpha, dtype=torch.float32)
        # Registered as a buffer so ``.to(device)`` also moves the weights;
        # ``self.alpha`` stays accessible as before (and may be None).
        self.register_buffer('alpha', alpha)

    def forward(self, logits, targets):
        """Return the mean focal loss for ``logits`` against ``targets``.

        Args:
            logits: Raw class scores, as accepted by ``F.cross_entropy``.
            targets: Integer class ids, as accepted by ``F.cross_entropy``.
        """
        # The focusing term must use the true target probability, so it is
        # derived from the UNWEIGHTED cross-entropy (pt = exp(-CE) = p).
        # Deriving it from the weighted loss would yield p ** alpha instead.
        plain_ce = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-plain_ce)
        if self.alpha is None:
            weighted_ce = plain_ce
        else:
            weighted_ce = F.cross_entropy(
                logits, targets, weight=self.alpha, reduction='none'
            )
        focal_loss = ((1 - pt) ** self.gamma) * weighted_ce
        return focal_loss.mean()

# PyTorch dataset
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item at access time.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer labels, aligned with ``texts``.
        tokenizer: Callable tokenizer invoked with ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=MAX_LENGTH):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for item ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
        )
        # squeeze(0) drops the leading dimension added by return_tensors='pt'
        item = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

## 5. Entraînement

Utilisation de **MARBERT** (pré-entraîné sur 1 milliard de tweets arabes)

In [None]:
# Model configuration
MODEL_NAME = "UBC-NLP/MARBERT"  # Best model for Arabic tweets

print(f"Chargement du tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Text preprocessing (same cleaning for both splits)
print("Preprocessing des textes...")
train_texts = train_df['Sentence'].apply(preprocess_text).tolist()
val_texts = val_df['Sentence'].apply(preprocess_text).tolist()

# Datasets and loaders: only the training loader shuffles
loader_options = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)

train_dataset = TextDataset(train_texts, y_train, tokenizer)
val_dataset = TextDataset(val_texts, y_val, tokenizer)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

n_train_batches, n_val_batches = len(train_loader), len(val_loader)
print(f"Batches train: {n_train_batches}, val: {n_val_batches}")

In [None]:
# Training helpers
def train_one_epoch(model, loader, optimizer, scheduler, criterion, device):
    """Run one optimization pass over ``loader``.

    For every batch: forward pass, loss, backward pass, gradient-norm clipping
    at 1.0, then one optimizer step and one scheduler step.

    Args:
        model: Model called with ``input_ids`` / ``attention_mask`` whose
            output exposes ``.logits``.
        loader: Iterable of batches with ``input_ids``, ``attention_mask`` and
            ``label`` entries; ``loader.dataset`` must support ``len``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        criterion: Callable ``(logits, labels) -> loss``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(average loss per example, macro F1)`` over the epoch.
    """
    model.train()
    running_loss = 0.0
    predicted, expected = [], []

    for batch in tqdm(loader, desc="Training", leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Weight the batch loss by its size so the epoch average is per example
        running_loss += batch_loss.item() * len(targets)
        predicted.extend(logits.argmax(dim=1).cpu().numpy())
        expected.extend(targets.cpu().numpy())

    scores = precision_recall_fscore_support(
        expected, predicted, average='macro', zero_division=0
    )
    return running_loss / len(loader.dataset), scores[2]

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Model called with ``input_ids`` / ``attention_mask`` whose
            output exposes ``.logits``.
        loader: Iterable of batches with ``input_ids``, ``attention_mask`` and
            ``label`` entries; ``loader.dataset`` must support ``len``.
        criterion: Callable ``(logits, labels) -> loss``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(avg_loss, accuracy, macro precision, macro recall, macro F1,
        predictions array, labels array)``.
    """
    model.eval()
    running_loss = 0.0
    predicted, expected = [], []

    for batch in tqdm(loader, desc="Evaluating", leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        batch_loss = criterion(logits, targets)

        # Weight the batch loss by its size so the average is per example
        running_loss += batch_loss.item() * len(targets)
        predicted.extend(logits.argmax(dim=1).cpu().numpy())
        expected.extend(targets.cpu().numpy())

    mean_loss = running_loss / len(loader.dataset)
    accuracy = accuracy_score(expected, predicted)
    precision, recall, macro_f1, _ = precision_recall_fscore_support(
        expected, predicted, average='macro', zero_division=0
    )
    return (
        mean_loss, accuracy, precision, recall, macro_f1,
        np.array(predicted), np.array(expected),
    )

In [None]:
# Model training: fine-tune MODEL_NAME with focal loss, keep the checkpoint
# with the best validation macro-F1 and stop early when it stops improving.
print(f"\n{'='*60}")
print(f"ENTRAINEMENT: {MODEL_NAME}")
print(f"{'='*60}")

# Load the pretrained model with a NUM_LABELS-way classification head
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_LABELS
).to(DEVICE)

# Loss and optimizer
criterion = FocalLoss(alpha=class_weights.to(DEVICE), gamma=2.0)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

NUM_EPOCHS = 15
# The scheduler is stepped once per batch, so the schedule length is
# batches-per-epoch times epochs; the first 10% of steps are warmup.
total_steps = len(train_loader) * NUM_EPOCHS
warmup_steps = int(total_steps * 0.1)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

# Training loop
# `patience` counts consecutive epochs without a new best validation F1.
best_f1, best_epoch, patience = 0.0, 0, 0
history = {'train_loss': [], 'val_loss': [], 'train_f1': [], 'val_f1': []}
# Validation predictions of the best epoch; stays None if no epoch ever
# scores a validation F1 above 0.0.
best_preds = None

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"\nEpoch {epoch}/{NUM_EPOCHS}")

    t_loss, t_f1 = train_one_epoch(model, train_loader, optimizer, scheduler, criterion, DEVICE)
    v_loss, v_acc, v_prec, v_rec, v_f1, v_preds, v_labels = evaluate(model, val_loader, criterion, DEVICE)

    history['train_loss'].append(t_loss)
    history['val_loss'].append(v_loss)
    history['train_f1'].append(t_f1)
    history['val_f1'].append(v_f1)

    print(f"  Train - Loss: {t_loss:.4f}, F1: {t_f1:.4f}")
    print(f"  Val   - Loss: {v_loss:.4f}, F1: {v_f1:.4f}, Acc: {v_acc:.4f}")

    if v_f1 > best_f1:
        best_f1 = v_f1
        best_epoch = epoch
        best_preds = v_preds.copy()
        # Save the model and tokenizer (overwrites the previous best checkpoint)
        model.save_pretrained('best_model')
        tokenizer.save_pretrained('best_model')
        patience = 0
        print(f"  >> Nouveau meilleur modele (F1={v_f1:.4f})")
    else:
        patience += 1
        # Early stopping after 4 consecutive epochs without improvement
        if patience >= 4:
            print(f"  >> Early stopping a l'epoch {epoch}")
            break

print(f"\n{'='*60}")
print(f"RESULTAT FINAL: F1 = {best_f1:.4f} (epoch {best_epoch})")
print(f"{'='*60}")

## 6. Resultats et Visualisation

In [None]:
# Training curves: loss on the left panel, macro F1 on the right panel
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
loss_axis, f1_axis = axes[0], axes[1]

# Loss
loss_axis.plot(history['train_loss'], label='Train', marker='o')
loss_axis.plot(history['val_loss'], label='Validation', marker='s')
loss_axis.set_xlabel('Epoch')
loss_axis.set_ylabel('Loss')
loss_axis.set_title('Evolution de la Loss')
loss_axis.legend()
loss_axis.grid(True, alpha=0.3)

# F1
f1_axis.plot(history['train_f1'], label='Train', marker='o')
f1_axis.plot(history['val_f1'], label='Validation', marker='s')
# The curves are plotted against 0-based list positions, hence best_epoch-1
best_epoch_position = best_epoch-1
f1_axis.axvline(x=best_epoch_position, color='r', linestyle='--', label=f'Best (epoch {best_epoch})')
f1_axis.set_xlabel('Epoch')
f1_axis.set_ylabel('F1 Score (Macro)')
f1_axis.set_title('Evolution du F1 Score')
f1_axis.legend()
f1_axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('training_curves.png', dpi=150)
plt.show()

In [None]:
# Confusion matrix of the best epoch's validation predictions
class_names = le.classes_
fig, ax = plt.subplots(figsize=(10, 8))
cm = confusion_matrix(y_val, best_preds)
sns.heatmap(
    cm,
    ax=ax,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_title(f'Matrice de Confusion (F1={best_f1:.4f})', fontsize=14, fontweight='bold')
ax.set_xlabel('Predit')
ax.set_ylabel('Reel')
plt.savefig('confusion_matrix.png', dpi=150)
plt.show()

# Per-class classification report
print("\nRapport de Classification:")
report_text = classification_report(
    y_val, best_preds, target_names=class_names, zero_division=0
)
print(report_text)

## 7. Sauvegarde et Export

In [None]:
# Save the validation predictions next to the true labels
submission_columns = {
    'id': range(1, len(val_df) + 1),
    'Sentence': val_df['Sentence'].values,
    'True_Label': le.inverse_transform(y_val),
    'Predicted_Label': le.inverse_transform(best_preds),
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('predictions.csv', index=False, encoding='utf-8-sig')
print(f"Predictions sauvegardees: predictions.csv ({len(submission)} lignes)")

# Save the run configuration
config = dict(
    model_name=MODEL_NAME,
    best_epoch=best_epoch,
    best_f1=float(best_f1),
    label_mapping=label_map,
    class_weights=class_weights.tolist(),
)
with open('config.json', 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2, ensure_ascii=False)
print("Configuration sauvegardee: config.json")

In [None]:
# Download the generated files through the Colab browser session
from google.colab import files

print("Telechargement des fichiers...")
for artifact_name in (
    'predictions.csv',
    'config.json',
    'training_curves.png',
    'confusion_matrix.png',
):
    files.download(artifact_name)

# To download the full model (optional - large file)
# !zip -r best_model.zip best_model/
# files.download('best_model.zip')

## 8. Inférence sur de Nouvelles Données

In [None]:
def predict(texts, model, tokenizer, device):
    """Predict a label name for each text, one text at a time.

    Every text goes through ``preprocess_text``, is tokenized with padding and
    truncation to ``MAX_LENGTH``, and the highest-scoring class id is mapped
    back to its label name with the module-level encoder ``le``.

    Args:
        texts: Iterable of raw input texts.
        model: Model called with ``input_ids`` / ``attention_mask`` whose
            output exposes ``.logits``; it is switched to eval mode.
        tokenizer: Tokenizer matching ``model``.
        device: Device the encoded tensors are moved to.

    Returns:
        List of predicted label names, in input order.
    """
    model.eval()
    label_names = []

    with torch.no_grad():
        for raw_text in texts:
            encoded = tokenizer(
                preprocess_text(raw_text),
                padding='max_length',
                truncation=True,
                max_length=MAX_LENGTH,
                return_tensors='pt',
            )
            logits = model(
                input_ids=encoded['input_ids'].to(device),
                attention_mask=encoded['attention_mask'].to(device),
            ).logits
            class_id = logits.argmax(dim=1).item()
            label_names.append(le.inverse_transform([class_id])[0])

    return label_names

# Usage example
exemples = [
    "شكرا جزيلا على مساعدتك",           # Thank you very much for your help
    "هذا الشخص غبي جدا",                # This person is very stupid
    "الطقس جميل اليوم",                  # The weather is nice today
    "لو سمحت ممكن تساعدني",             # Excuse me, could you help me
]

# Load the best model and its tokenizer from the 'best_model' directory
best_model = AutoModelForSequenceClassification.from_pretrained('best_model').to(DEVICE)
best_tokenizer = AutoTokenizer.from_pretrained('best_model')

# Predict a label for each example text
preds = predict(exemples, best_model, best_tokenizer, DEVICE)

print("\nExemples de predictions:")
print("-" * 60)
for text, pred in zip(exemples, preds):
    print(f"Texte: {text}")
    print(f"Prediction: {pred}")
    print("-" * 60)

---

## Résumé

Ce notebook implémente un système de détection de politesse en arabe utilisant :
- **Modèle :** MARBERT (pré-entraîné sur 1 milliard de tweets arabes)
- **Loss :** Focal Loss (mieux adaptée aux données déséquilibrées)
- **Performance attendue :** F1 macro ~ 0.84

### Fichiers générés :
- `predictions.csv` - Prédictions sur les données de validation
- `config.json` - Configuration du modèle
- `best_model/` - Modèle entraîné (format HuggingFace)
- `training_curves.png` - Courbes d'entraînement
- `confusion_matrix.png` - Matrice de confusion