In [None]:
# Install wandb with the interpreter that backs this kernel (sys.executable),
# so the package lands in the environment the notebook actually imports from.
# NOTE(review): the version is not pinned, so a rerun may install a different release.
import sys
!{sys.executable} -m pip install wandb

In [None]:
import json
import numpy as np
import pandas as pd
from pandas import json_normalize
import re
import os
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    AutoModel,
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback
)
import warnings
warnings.filterwarnings('ignore')

import wandb

In [None]:
# Choose the compute backend in priority order: Apple MPS, then CUDA, then CPU.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
print("Using device:", device)

In [None]:
# Checkpoint name passed to the tokenizer and model loaders below.
MODEL_NAME = "camembert-base"
# Token budget per sample: the dataset pads/truncates every text to this length.
# NOTE(review): this is the truncation length chosen here, not necessarily the
# model's own maximum sequence length — confirm against the checkpoint config.
MAX_LENGTH = 256 # Max token length for CamemBERT
# Per-device batch size used for both training and evaluation.
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
# Fixed number of warmup steps handed to TrainingArguments.
WARMUP_STEPS = 500

# Root directory for wandb files, checkpoints ('results') and logs ('logs').
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
LOG_DIR = "/Data/DL"

In [None]:
def load_jsonl_flat(path):
    """Read a JSON Lines file into a flattened DataFrame.

    Each non-blank line of the UTF-8 file at ``path`` is parsed as one JSON
    document. The parsed records are passed to ``json_normalize``, so nested
    objects become dotted column names (e.g. ``user.description``).

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        A DataFrame with one row per parsed line.
    """
    parsed = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # Whitespace-only lines carry no record and are skipped.
            if raw_line.strip():
                parsed.append(json.loads(raw_line))
    return json_normalize(parsed)

# Load the training tweets and the Kaggle test tweets as flattened DataFrames.
# The paths are relative to the notebook's working directory.
train_data_raw = load_jsonl_flat('../data/raw/train.jsonl')
kaggle_data_raw = load_jsonl_flat('../data/raw/kaggle_test.jsonl')

In [None]:
def extract_full_text(tweet):
    """Return the most complete text available for one tweet record.

    The value under ``'extended_tweet.full_text'`` is preferred whenever it is
    present, non-empty and not NaN; otherwise the value under ``'text'`` is used.

    A ``'text'`` value that is None or NaN (a record lacking the field once the
    data has been flattened into a DataFrame) yields an empty string, so that a
    missing value is not later stringified into the literal "nan"/"None".

    Args:
        tweet: Mapping of flattened tweet fields (e.g. ``row.to_dict()``).

    Returns:
        The chosen text, or ``''`` when no usable text exists.
    """
    extended = tweet.get('extended_tweet.full_text')
    if extended and not pd.isna(extended):
        return extended

    text = tweet.get('text', '')
    # Missing values surface as None or float NaN; normalise both to ''.
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return ''
    return text

# Add a 'full_text' column to both raw frames: each row is converted to a dict
# and passed through extract_full_text (row-wise apply, axis=1).
train_data_raw['full_text'] = train_data_raw.apply(
    lambda row: extract_full_text(row.to_dict()), axis=1
)
kaggle_data_raw['full_text'] = kaggle_data_raw.apply(
    lambda row: extract_full_text(row.to_dict()), axis=1
)

# Extract label
# y_train_raw is an independent copy of the label column. X_train_raw and
# X_kaggle_raw are full copies of the raw frames, so X_train_raw still
# contains the 'label' column at this point.
y_train_raw = train_data_raw['label'].copy()
X_train_raw = train_data_raw.copy()
X_kaggle_raw = kaggle_data_raw.copy()

In [None]:
def aggregate_by_user(df_features):
    """Collapse tweet rows into one row per distinct ``user.description``.

    The grouping key is the description text itself (no separate user-ID
    column is used here). For each distinct description, all ``full_text``
    values are stringified and joined with single spaces, and the description
    is prepended to form ``combined_text``.

    NOTE(review): pandas ``groupby`` drops NaN keys by default, so rows without
    a description are presumably absent from the result, and different users
    sharing the same description would be merged — verify against the data.

    Args:
        df_features: DataFrame with ``user.description`` and ``full_text``
            columns, and optionally a ``label`` column.

    Returns:
        DataFrame with columns ``user.description``, ``full_text_agg``,
        ``user_description``, ``combined_text`` and, when the input had a
        ``label`` column, ``label`` (taken from the first row seen for each
        description).
    """
    key = 'user.description'

    per_user = (
        df_features.groupby(key)
        .agg(
            # All tweets of the group, space-separated.
            full_text_agg=('full_text', lambda texts: " ".join(texts.astype(str))),
            # The description carried along as a regular column.
            user_description=(key, 'first'),
        )
        .reset_index()
    )

    # Description first, then the concatenated tweets.
    per_user['combined_text'] = per_user['user_description'].fillna('') + " " + per_user['full_text_agg']

    # Unlabeled input: nothing more to attach.
    if 'label' not in df_features.columns:
        return per_user

    # One label per description: the label of its first occurrence.
    first_labels = (
        df_features.drop_duplicates(subset=[key])
        .set_index(key)['label']
        .rename('label')
    )
    return per_user.set_index(key).join(first_labels).reset_index()

# Aggregate both splits. The label column is (re)written onto the training
# frame first so that aggregate_by_user can carry it through.
X_train_raw['label'] = y_train_raw.values
aggregated_train = aggregate_by_user(X_train_raw)
aggregated_test = aggregate_by_user(X_kaggle_raw)

# Separate model inputs from targets.
y_train_agg = aggregated_train['label']
X_train_agg = aggregated_train[['user.description', 'combined_text']].copy()
X_test_agg = aggregated_test[['user.description', 'combined_text']].copy()

print(f"Aggregated Train set size (Users): {len(X_train_agg)}")
print(f"Aggregated Test set size (Users): {len(X_test_agg)}")

# 1. Hold out 10% of the grouping keys for validation (fixed seed).
user_ids = X_train_agg['user.description'].tolist()
train_user_ids, val_user_ids = train_test_split(user_ids, test_size=0.1, random_state=42)

# 2. Select the aggregated rows on each side of the split.
X_train = X_train_agg.loc[X_train_agg['user.description'].isin(train_user_ids)]
X_val = X_train_agg.loc[X_train_agg['user.description'].isin(val_user_ids)]

# 3. Take the labels that share an index with the selected rows.
y_train = y_train_agg.loc[X_train.index]
y_val = y_train_agg.loc[X_val.index]

print(f"Final Train samples: {len(X_train)}, Val samples: {len(X_val)}")

In [None]:
class FrenchAggregatedTextDataset(Dataset):
    """Torch dataset that tokenizes one aggregated text per item.

    Args:
        texts: pandas Series (or plain sequence) of texts.
        labels: pandas Series (or plain sequence) of integer class labels
            aligned positionally with ``texts``, or ``None`` for unlabeled
            data (e.g. a test set used only for inference).
        tokenizer: Callable tokenizer; it is invoked with the keyword
            arguments shown in ``__getitem__`` and must return a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sample is padded/truncated to.

    Raises:
        ValueError: If ``labels`` is given but its length differs from
            ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Fail early on misaligned inputs instead of at some later index.
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _value_at(values, idx):
        """Positional lookup that works for pandas objects and plain sequences."""
        # pandas objects must be indexed positionally via .iloc; their index
        # labels are generally not 0..n-1 after filtering.
        return values.iloc[idx] if hasattr(values, 'iloc') else values[idx]

    def __getitem__(self, idx):
        """Return the tokenized sample at position ``idx``.

        The result always holds 1-D ``'input_ids'`` and ``'attention_mask'``
        tensors; a scalar long ``'labels'`` tensor is added only when the
        dataset was built with labels.
        """
        text = str(self._value_at(self.texts, idx))

        # Tokenize to a fixed length so samples can be stacked into a batch.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer returns a batch dimension of 1; flatten drops it.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(
                self._value_at(self.labels, idx), dtype=torch.long
            )
        return item

# Load the tokenizer matching the checkpoint, then wrap the train and
# validation splits in datasets that share it and the same sequence length.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_dataset = FrenchAggregatedTextDataset(
    texts=X_train['combined_text'],
    labels=y_train,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
eval_dataset = FrenchAggregatedTextDataset(
    texts=X_val['combined_text'],
    labels=y_val,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

In [None]:
def compute_metrics(pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class logits, indexed as ``[:, 1]`` for the positive class).

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision``, ``recall`` and ``auc``.
        ``auc`` is NaN when the labels contain fewer than two distinct
        classes, because ROC AUC is undefined in that case and computing it
        would otherwise abort the whole evaluation.
    """
    labels = pred.label_ids
    logits = pred.predictions
    preds = logits.argmax(-1)
    # AUC needs a score for the positive class, so turn logits into probabilities.
    probs = torch.softmax(torch.tensor(logits), dim=-1)[:, 1].numpy()

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)

    # Guard the single-class case explicitly rather than catching a broad
    # ValueError, so unrelated input problems still surface.
    if np.unique(labels).size < 2:
        auc = float('nan')
    else:
        auc = roc_auc_score(labels, probs)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auc': auc
    }

In [None]:
import sys
!{sys.executable} -m pip install transformers[torch]

In [None]:
# Initialize Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    problem_type="single_label_classification"
)

# Freeze layers (Example from 09_CamemBERT_user.ipynb)
# Only encoder layers 5-11 and the classification head stay trainable;
# everything else (embeddings and layers 0-4) is frozen.
# The pattern is anchored to the ".layer.<index>." segment of the parameter
# name. Matching bare digits anywhere in the name only works by accident for a
# 12-layer model (e.g. "layer.15." would match "5" on a deeper checkpoint).
# Assumes parameter names contain ".layer.<index>." for encoder blocks.
TRAINABLE_PARAM_PATTERN = re.compile(r"classifier|\.layer\.(?:[5-9]|1[01])\.")
for name, param in model.named_parameters():
    if not TRAINABLE_PARAM_PATTERN.search(name):
        param.requires_grad = False

# WandB Initialization
# Collect the run's hyperparameters and split sizes first, then start the run.
wandb_run_config = {
    "model": MODEL_NAME,
    "max_length": MAX_LENGTH,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
    "num_epochs": NUM_EPOCHS,
    "num_layers_to_train": "Last 7 layers + Classifier",
    "train_samples": len(X_train),
    "val_samples": len(X_val),
}
wandb.init(
    project="DL-project",
    name="Camembert_Aggregated_Text_Only",
    dir=os.path.join(LOG_DIR, "wandb"),
    config=wandb_run_config,
)

# Setup Training Arguments
# Evaluation and checkpointing both happen once per epoch; at the end the
# checkpoint with the highest validation F1 is reloaded. At most two
# checkpoints are kept on disk. Mixed precision (fp16) is enabled only when
# CUDA is available. Metrics are reported to wandb.
# NOTE(review): WARMUP_STEPS is a fixed step count — confirm it is smaller
# than the total number of optimizer steps for this dataset size.
training_args = TrainingArguments(
    output_dir=os.path.join(LOG_DIR, 'results'),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_dir=os.path.join(LOG_DIR, 'logs'),
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    report_to="wandb",
)

# Stop early when the tracked metric fails to improve for 2 evaluations.
# NOTE(review): with one evaluation per epoch and NUM_EPOCHS epochs, check
# that this patience can actually trigger before training ends anyway.
callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]

# Setup Trainer
# Wires together the partially frozen model, both datasets, the metric
# function and the early-stopping callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=callbacks
)

# Start Training
trainer.train()

In [None]:
# Finish WandB run
# Kept in its own cell so the run opened by wandb.init can be closed
# separately from the training cell.
wandb.finish()