In [None]:
import json
import numpy as np
import pandas as pd
from pandas import json_normalize
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from transformers import AutoTokenizer, AutoModel
from transformers import get_linear_schedule_with_warmup  # scheduler
from torch.optim import AdamW                           # optimizer

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
# Pick the compute backend in priority order: MPS first, then CUDA, and
# plain CPU when neither accelerator is reported as available.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
print("Using device:", device)

In [None]:
def load_jsonl_flat(path):
    """Read a JSON-lines file and return it as a flattened DataFrame.

    Each non-blank line of the UTF-8 file at ``path`` is parsed as one JSON
    document; whitespace-only lines are skipped. The parsed records are
    handed to ``json_normalize``, which flattens nested objects into
    dotted column names.
    """
    parsed = []
    with open(path, mode='r', encoding='utf-8') as handle:
        for raw_line in handle:
            if not raw_line.strip():
                # Blank / whitespace-only line: nothing to parse.
                continue
            parsed.append(json.loads(raw_line))
    return json_normalize(parsed)

# Load the training file first, then the Kaggle test file, each as a
# flattened DataFrame.
train_data, kaggle_data = map(
    load_jsonl_flat,
    ('../data/raw/train.jsonl', '../data/raw/kaggle_test.jsonl'),
)

# Extract tweet text + metadata

In [None]:
# ----------------------------------------------------
# Extract full text
# ----------------------------------------------------
def extract_full_text(tweet):
    """Return the most complete text available for one flattened tweet.

    ``tweet`` is a dict-like record. The value under
    ``'extended_tweet.full_text'`` wins when it is present, truthy and not
    NaN; otherwise the value under ``'text'`` is returned (``''`` when that
    key is absent too).
    """
    long_form = tweet.get('extended_tweet.full_text')
    # Missing key, empty string, None and NaN all fall back to the short text.
    if not long_form or pd.isna(long_form):
        return tweet.get('text', '')
    return long_form

# Add a `full_text` column to both frames: every row is converted to a dict
# and passed through extract_full_text. Both columns are computed first
# (train, then Kaggle) and then assigned in that same order.
train_data['full_text'], kaggle_data['full_text'] = (
    frame.apply(lambda row: extract_full_text(row.to_dict()), axis=1)
    for frame in (train_data, kaggle_data)
)

# ----------------------------------------------------
# Save target column before dropping
# ----------------------------------------------------
# Keep an independent copy of the labels before any column is removed.
y_train = train_data['label'].copy()

# ----------------------------------------------------
# Align train columns with the test set
# ----------------------------------------------------
# First drop columns that are empty in every row, then drop every column
# that the Kaggle frame does not have.
# NOTE(review): presumably `label` is absent from kaggle_data and is removed
# here; if so, re-running this cell fails on the `label` lookup above
# because `train_data` is overwritten in place — confirm.
train_data = (
    train_data
    .dropna(axis="columns", how='all')
    .pipe(
        lambda frame: frame.drop(
            columns=frame.columns.difference(kaggle_data.columns).to_list()
        )
    )
)

# ----------------------------------------------------
# Identify metadata columns
# ----------------------------------------------------
# Names of the text column and of the categorical column. Neither variable
# is referenced again in the cells shown here.
text_column = 'full_text'
categorical_column = 'source'

# Columns that must never be turned into metadata features.
unuseful_columns = [
    "lang", "text", "extended_tweet.full_text", "user.description",
    'retweet_count', 'favorite_count', 'quote_count', 'reply_count',
    'retweeted', 'favorited', 'user.default_profile_image',
    'user.protected', 'user.contributors_enabled'
]

# Numeric-dtype columns of the training frame, minus the excluded ones.
num_columns = [
    col
    for col in train_data.select_dtypes(include=[np.number]).columns.tolist()
    if col not in unuseful_columns
]

# Bool-dtype columns of the training frame, minus the excluded ones.
bool_columns = [
    col
    for col in train_data.select_dtypes(include=[np.bool_]).columns.tolist()
    if col not in unuseful_columns
]

# Columns in which at least one cell holds a Python list, minus the
# excluded ones.
list_columns = [
    col
    for col in train_data.columns
    if col not in unuseful_columns
    and train_data[col].apply(lambda x: isinstance(x, list)).any()
]

# ----------------------------------------------------
# Function to extract metadata features
# ----------------------------------------------------
def extract_features(df, num_columns, bool_columns, list_columns, unuseful_columns):
    """Turn raw tweet columns into numeric metadata features.

    Works on a copy, so the caller's frame is left untouched.

    Args:
        df: input DataFrame.
        num_columns: numeric columns; NaN and +/-inf are replaced by 0.
        bool_columns: boolean columns; True -> 1, False -> 0, and any other
            value (missing values in particular) -> 0.
        list_columns: columns holding lists; each cell becomes the list
            length, or 0 when the cell is not a list.
        unuseful_columns: columns to drop; names absent from ``df`` are
            ignored.

    Returns:
        A new DataFrame with the transformations applied.
    """
    df = df.copy()

    # Numerical columns: fill NAs and replace inf
    df[num_columns] = df[num_columns].fillna(0).replace([np.inf, -np.inf], 0)

    # Boolean columns: convert True/False -> 1/0. `.map` yields NaN for every
    # value that is not a key of the dict (e.g. a missing flag in an
    # object-dtype column); fill those with 0 so no NaN leaks into the
    # numeric feature matrix built from this frame.
    for column in bool_columns:
        df[column] = df[column].map({True: 1, False: 0}).fillna(0).astype(int)

    # List columns: encode as length of the list
    for col in list_columns:
        df[col] = df[col].apply(lambda x: len(x) if isinstance(x, list) else 0)

    # Drop unuseful columns
    df = df.drop(unuseful_columns, axis=1, errors='ignore')

    return df

# ----------------------------------------------------
# Apply feature extraction
# ----------------------------------------------------
X_train_f = extract_features(
    df=train_data,
    num_columns=num_columns,
    bool_columns=bool_columns,
    list_columns=list_columns,
    unuseful_columns=unuseful_columns,
)
X_test_f = extract_features(
    df=kaggle_data,
    num_columns=num_columns,
    bool_columns=bool_columns,
    list_columns=list_columns,
    unuseful_columns=unuseful_columns,
)

# Restrict both frames to the numeric columns they have in common.
# NOTE(review): if `challenge_id` is numeric in both frames it is part of
# this intersection and is fed to the model as a feature — confirm that is
# intended.
common_numeric_cols = X_train_f.select_dtypes(include=[np.number]).columns.intersection(
    X_test_f.select_dtypes(include=[np.number]).columns
)

metadata_train = X_train_f[common_numeric_cols].to_numpy().astype(np.float32)
metadata_test = X_test_f[common_numeric_cols].to_numpy().astype(np.float32)

# Standardize metadata: statistics are fitted on the training rows only and
# then applied to both matrices. StandardScaler is imported in the first cell.
scaler = StandardScaler().fit(metadata_train)
metadata_train = scaler.transform(metadata_train)
metadata_test = scaler.transform(metadata_test)

print("Metadata shape (train):", metadata_train.shape)
print("Metadata shape (test):", metadata_test.shape)
print("y_train shape:", y_train.shape)

# Dataset class

In [None]:
# Pretrained checkpoint shared by the tokenizer and the encoder.
MODEL_NAME = "camembert-base"

# Tokenizer used by TweetDataset when encoding texts.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the encoder, then move it to the selected device.
bert_model = AutoModel.from_pretrained(MODEL_NAME)
bert_model = bert_model.to(device)

In [None]:
class TweetDataset(Dataset):
    """Dataset pairing tweet texts with metadata vectors and optional labels.

    Texts are tokenized lazily, one item at a time, in ``__getitem__``.

    Args:
        texts: indexable sequence of texts.
        metadata: array-like of per-text feature rows; stored as a float32
            tensor.
        labels: optional array-like of integer class ids; stored as a long
            tensor. ``None`` for unlabeled data.
        max_len: length every text is padded / truncated to.
        tokenizer: optional callable used to encode a text. When ``None``
            (the default) the module-level ``tokenizer`` is looked up at
            item-access time, which is the original behaviour.
    """

    def __init__(self, texts, metadata, labels=None, max_len=128, tokenizer=None):
        self.texts = texts
        self.metadata = torch.tensor(metadata, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long) if labels is not None else None
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask``, ``metadata``
        and, for labeled datasets, ``labels`` for item ``idx``."""
        # Prefer the tokenizer injected at construction time; otherwise fall
        # back to the module-level one (resolved here, not in __init__, so a
        # dataset can be built before the global tokenizer exists).
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        tokens = encode(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # the DataLoader can stack items itself.
        input_ids = tokens['input_ids'].squeeze(0)
        attention_mask = tokens['attention_mask'].squeeze(0)

        item = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'metadata': self.metadata[idx]
        }
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

In [None]:
# ----------------------------------------------------
# 1. Create Train/Validation Split
# ----------------------------------------------------
# train_test_split is imported in the first cell. Texts, metadata rows and
# labels are split together so they stay aligned; the split is stratified
# on the labels.
train_texts, val_texts, train_meta, val_meta, train_labels, val_labels = train_test_split(
    train_data['full_text'].tolist(),
    metadata_train,
    y_train.values,
    test_size=0.1,       # 10% for validation
    random_state=42,
    stratify=y_train.values,
)

print(f"Train size: {len(train_texts)}")
print(f"Val size: {len(val_texts)}")

# ----------------------------------------------------
# 2. Instantiate Datasets
# ----------------------------------------------------
train_dataset = TweetDataset(texts=train_texts, metadata=train_meta, labels=train_labels)
val_dataset = TweetDataset(texts=val_texts, metadata=val_meta, labels=val_labels)

# Kaggle test set: same inputs, but no labels.
kaggle_dataset = TweetDataset(
    texts=kaggle_data['full_text'].tolist(),
    metadata=metadata_test,
    labels=None,
)

# CamemBERT 

In [None]:
class CamemBERTClassifier(nn.Module):
    """Classifier over the encoder's first-token embedding plus metadata.

    The first-position hidden state of ``bert_model`` is concatenated with
    the metadata vector, passed through dropout and then through a small
    two-layer head that produces ``num_classes`` logits.
    """

    def __init__(self, bert_model, metadata_dim, num_classes):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.3)

        # Width of the concatenated [encoder embedding | metadata] vector.
        fused_dim = bert_model.config.hidden_size + metadata_dim
        head_layers = [
            nn.Linear(fused_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, metadata):
        """Return the class logits for a batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 (the CLS token).
        first_token = encoded.last_hidden_state[:, 0]
        fused = self.dropout(torch.cat((first_token, metadata), dim=1))
        return self.classifier(fused)

In [None]:
import re

def freeze_bert_layers(model, num_unfrozen_last_layers=3):
    """Freeze all but the last encoder layers of ``model.bert``.

    Parameters whose name contains ``encoder.layer.<idx>`` are frozen when
    ``idx`` is below ``num_hidden_layers - num_unfrozen_last_layers`` and
    made trainable otherwise. Every parameter whose name does not contain
    ``encoder.layer`` (classifier head, embeddings, ...) is made trainable.
    """
    # Index of the first encoder layer that stays trainable.
    first_trainable = model.bert.config.num_hidden_layers - num_unfrozen_last_layers
    layer_pattern = re.compile(r"encoder\.layer\.(\d+)")

    for param_name, weight in model.named_parameters():
        if "encoder.layer" not in param_name:
            # Not an encoder-layer parameter: always trainable.
            weight.requires_grad = True
            continue

        found = layer_pattern.search(param_name)
        if found is None:
            # Name mentions "encoder.layer" without a layer number: leave
            # the parameter exactly as it is.
            continue

        weight.requires_grad = int(found.group(1)) >= first_trainable

    print(f"Freezing complete. Last {num_unfrozen_last_layers} layers are unfrozen.")

In [None]:
def train_model(model, train_dataset, val_dataset, device, epochs=3, batch_size=16, lr=2e-5,
                save_path="best_camembert_meta.pt"):
    """Fine-tune ``model`` and checkpoint the best epoch by validation accuracy.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            metadata=...)`` and returning class logits.
        train_dataset: dataset whose items are dicts with ``input_ids``,
            ``attention_mask``, ``metadata`` and ``labels``.
        val_dataset: dataset with the same item layout, used for validation.
        device: device the model and every batch are moved to.
        epochs: number of passes over ``train_dataset``.
        batch_size: batch size of both loaders.
        lr: AdamW learning rate.
        save_path: file the best ``state_dict`` is written to.

    Returns:
        The best validation accuracy observed (0.0 when ``epochs`` is 0).
    """
    # Create DataLoaders from the Dataset objects
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Only parameters that are still trainable are handed to the optimizer.
    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
    total_steps = len(train_loader) * epochs

    # Warmup for 6% of total steps
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.06 * total_steps),
        num_training_steps=total_steps
    )

    criterion = nn.CrossEntropyLoss()
    model.to(device)

    def _forward_batch(batch):
        """Move one batch to ``device`` and return ``(logits, labels)``."""
        labels = batch['labels'].to(device)
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            metadata=batch['metadata'].to(device),
        )
        return logits, labels

    best_val_acc = 0.0
    # Tracks whether a checkpoint exists yet, so the first epoch is always
    # saved even when its validation accuracy is exactly 0.0; code that
    # later loads `save_path` would otherwise fail on a missing file.
    checkpoint_saved = False

    for epoch in range(epochs):
        # --- Training Phase ---
        model.train()
        correct = 0
        total = 0
        running_loss = 0.0

        for batch in tqdm(train_loader):
            optimizer.zero_grad()

            logits, labels = _forward_batch(batch)

            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            # The schedule is defined per optimizer step, not per epoch.
            scheduler.step()

            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            running_loss += loss.item()

        train_acc = correct / total
        avg_loss = running_loss / len(train_loader)

        # --- Validation Phase ---
        model.eval()
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for batch in tqdm(val_loader):
                logits, labels = _forward_batch(batch)
                preds = torch.argmax(logits, dim=1)
                val_correct += (preds == labels).sum().item()
                val_total += labels.size(0)

        val_acc = val_correct / val_total
        print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

        if val_acc > best_val_acc or not checkpoint_saved:
            best_val_acc = val_acc
            torch.save(model.state_dict(), save_path)
            checkpoint_saved = True

    print("Training complete. Best val acc:", best_val_acc)
    return best_val_acc

In [None]:
# --------------------------
# Initialize and train model
# --------------------------
# Width of the metadata vector and number of distinct label values.
# NOTE(review): the model emits `num_classes` logits, so this presumably
# assumes the labels are the integers 0 .. num_classes-1 — confirm.
meta_dim = metadata_train.shape[1]
num_classes = np.unique(y_train).size

model = CamemBERTClassifier(bert_model, metadata_dim=meta_dim, num_classes=num_classes)

# Freeze layers: only the last 7 encoder layers (plus every non-encoder
# parameter) remain trainable.
freeze_bert_layers(model, num_unfrozen_last_layers=7)

train_model(model, train_dataset, val_dataset, device, epochs=3, batch_size=16)

In [None]:
def predict(model, dataset, device, batch_size=32):
    """Return the predicted class index for every item of ``dataset``.

    The model is moved to ``device`` and switched to eval mode; batches are
    read in dataset order without gradient tracking, and the argmax over
    the logits of each item is collected into a NumPy array.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    model.to(device)
    model.eval()

    collected = []
    with torch.no_grad():
        for batch in tqdm(loader):
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                metadata=batch['metadata'].to(device),
            )
            # Highest-scoring class per row, brought back to host memory.
            collected += logits.argmax(dim=1).cpu().numpy().tolist()

    return np.array(collected)

# Restore the weights saved by train_model for the best validation epoch,
# mapping the stored tensors onto the current device.
model.load_state_dict(torch.load("best_camembert_meta.pt", map_location=device))
# Predicted class index for every row of the Kaggle test set, in dataset order.
kaggle_preds = predict(model, kaggle_dataset, device)

In [None]:
# --------------------------
# Save submission
# --------------------------
# Two-column submission: the integer `challenge_id` of each test row and
# the class predicted for it, written without the index.
submission = pd.DataFrame(
    {"ID": X_test_f["challenge_id"].astype(int)}
).assign(Prediction=kaggle_preds)
submission.to_csv("camembert_with_meta_submission.csv", index=False)
print("Saved camembert_with_meta_submission.csv")