In [None]:
# Data handling libraries
import json
import numpy as np
import pandas as pd
from pandas import json_normalize
import os
import re
from tqdm import tqdm

from sklearn.model_selection import train_test_split

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
import wandb

In [3]:
# Prefer the GPU whenever CUDA is usable; otherwise run everything on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [28]:
# Pretrained checkpoint to fine-tune.
MODEL_NAME = "camembert-base"  # French BERT model
# Alternatives:
# MODEL_NAME = "flaubert/flaubert_base_cased"
# MODEL_NAME = "dbmdz/bert-base-french-europeana-cased"

# Tokenization / optimization hyperparameters.
MAX_LENGTH = 256  # Maximum sequence length (in tokens) after truncation/padding
BATCH_SIZE = 16  # Per-device batch size for both training and evaluation
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
WARMUP_STEPS = 500

# Root directory for checkpoints, logs and wandb files.
# Set the DL_LOG_DIR environment variable to run on another machine without
# editing the notebook; the default keeps the original location.
LOG_DIR = os.environ.get("DL_LOG_DIR", "/Data/iuliia.korotkova/DL")

# Data preprocessing

In [5]:
data_dir = "../data/raw"


def _read_flat_jsonl(file_name):
    """Read a JSON-lines file from ``data_dir`` and flatten nested fields into columns."""
    nested = pd.read_json(os.path.join(data_dir, file_name), lines=True)
    # json_normalize turns nested objects into dotted column names.
    return json_normalize(nested.to_dict(orient='records'))


train_data = _read_flat_jsonl('train.jsonl')
kaggle_data = _read_flat_jsonl('kaggle_test.jsonl')

# Features are every column except the target; the target is the 'label' column.
y_train = train_data['label']
X_train = train_data.drop(columns='label')

# The Kaggle test set has no label to split off, so it is used as-is.
X_kaggle = kaggle_data

In [6]:
def extract_full_text(tweet):
    """Return the most complete text available for a tweet.

    Parameters
    ----------
    tweet : pd.Series or dict-like
        One tweet record. Must contain ``'text'``; may contain
        ``'extended_tweet.full_text'``.

    Returns
    -------
    The value of ``'extended_tweet.full_text'`` when it is present and not
    missing, otherwise the value of ``'text'``.

    Raises
    ------
    KeyError
        If the record has no ``'text'`` entry and the extended text is missing.
    """
    # .get() instead of [] so a dataset without any extended tweets (and hence
    # without that column) falls back to 'text' instead of raising KeyError.
    full_text = tweet.get('extended_tweet.full_text')
    if pd.isna(full_text):
        return tweet['text']
    return full_text

# Row-wise pass (axis=1): every tweet row is handed to extract_full_text and the
# result is stored in a new 'full_text' column of the same frame.
for frame in (X_train, X_kaggle):
    frame['full_text'] = frame.apply(extract_full_text, axis=1)

# Dataset

In [7]:
class FrenchTextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Parameters
    ----------
    texts : pd.Series or sequence
        Input texts. Each item is converted with ``str()`` before tokenization.
    labels : pd.Series or sequence
        Integer class labels, one per text, in the same order as ``texts``.
    tokenizer : callable
        Called as ``tokenizer(text, add_special_tokens=..., max_length=..., ...)``
        and expected to return a mapping with ``'input_ids'`` and
        ``'attention_mask'`` tensors.
    max_length : int
        Length every sequence is truncated / padded to.

    Raises
    ------
    ValueError
        If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Fail early: a length mismatch would otherwise only surface as an
        # IndexError in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Fetch the idx-th element by position from a pandas object or a plain sequence."""
        # pandas objects need .iloc: plain [] would look idx up as an index label.
        if hasattr(values, 'iloc'):
            return values.iloc[idx]
        return values[idx]

    def __getitem__(self, idx):
        """Return a dict with 1-D ``input_ids``, ``attention_mask`` and a scalar ``labels`` tensor."""
        text = str(self._item_at(self.texts, idx))
        label = self._item_at(self.labels, idx)

        # Tokenize a single text, always padded/truncated to max_length.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer output is flattened to one dimension per example.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Metrics

In [8]:
def compute_metrics(pred):
    """Compute binary-classification metrics for evaluation.

    Parameters
    ----------
    pred : object with ``label_ids`` and ``predictions`` attributes
        ``predictions`` holds one row of logits per example with at least two
        columns; column 1 is treated as the positive class.

    Returns
    -------
    dict
        ``accuracy``, ``f1``, ``precision``, ``recall`` and ``auc``.
        ``auc`` is NaN when the labels contain fewer than two distinct classes.
    """
    labels = pred.label_ids
    logits = pred.predictions
    preds = logits.argmax(-1)
    # Positive-class probability; float32 so softmax does not depend on the
    # precision the logits were produced in.
    probs = torch.softmax(
        torch.tensor(logits, dtype=torch.float32), dim=-1
    )[:, 1].numpy()

    # zero_division=0 gives the same value as the default but without the warning
    # when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)

    # ROC AUC is undefined when only one class is present; report NaN instead of
    # letting the evaluation step fail.
    if np.unique(labels).size < 2:
        auc = float('nan')
    else:
        auc = roc_auc_score(labels, probs)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auc': auc
    }

# Train model

In [9]:
# Sequence-classification model on top of the pretrained checkpoint:
# two output labels, problem type "single_label_classification".
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, problem_type="single_label_classification"
)

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune only the classification head and encoder layers with index >= 5;
# every other parameter is frozen.
FIRST_TRAINABLE_LAYER = 5

# The layer index is taken to be the first purely numeric component of the dotted
# parameter name (presumably "...layer.<idx>...." for this checkpoint — verify
# with model.named_parameters()). Parsing it as an integer avoids matching raw
# digits anywhere in the name, where e.g. "15" would match "5" and "12" would
# match nothing.
layer_index_pattern = re.compile(r"\.(\d+)\.")

for name, param in model.named_parameters():
    layer_match = layer_index_pattern.search(name)
    is_top_layer = (
        layer_match is not None
        and int(layer_match.group(1)) >= FIRST_TRAINABLE_LAYER
    )
    if not ("classifier" in name or is_top_layer):
        param.requires_grad = False

In [10]:
# Hold out 10% of the labelled data for validation.
# A fixed random_state makes the split (and so the reported metrics) reproducible;
# stratify keeps the class ratio the same in both parts.
# NOTE: this cell overwrites X_train / y_train, so re-running it without reloading
# the data splits the already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

In [11]:
# Sanity check: shapes of the train / validation features and labels.
tuple(split.shape for split in (X_train, y_train, X_val, y_val))

((139422, 193), (139422,), (15492, 193), (15492,))

In [12]:
def _label_series(labels):
    """Return ``labels`` unchanged, or its first column when it is a DataFrame."""
    if isinstance(labels, pd.DataFrame):
        return labels.iloc[:, 0]
    return labels


# Both datasets share the tokenizer and the fixed sequence length.
train_dataset = FrenchTextDataset(
    X_train['full_text'], _label_series(y_train), tokenizer, MAX_LENGTH
)

eval_dataset = FrenchTextDataset(
    X_val['full_text'], _label_series(y_val), tokenizer, MAX_LENGTH
)

In [None]:
# Evaluation-dependent options are switched together: without an eval set there
# is no per-epoch evaluation, no best-model selection and no early stopping.
has_eval = bool(eval_dataset)

training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir=os.path.join(LOG_DIR, 'results'),
    logging_dir=os.path.join(LOG_DIR, 'logs'),
    logging_steps=100,
    report_to="wandb",
    # Optimization
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP_STEPS,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    # Evaluation, checkpointing and model selection
    eval_strategy="epoch" if has_eval else "no",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=has_eval,
    metric_for_best_model="f1" if has_eval else None,
    greater_is_better=True,
)

if has_eval:
    callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
else:
    callbacks = []

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics if has_eval else None,
    callbacks=callbacks
)

In [None]:
num_layers_to_train = 7

# Hyperparameters and dataset sizes recorded with the run.
run_config = {
    "model": MODEL_NAME,
    "max_length": MAX_LENGTH,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
    "num_epochs": NUM_EPOCHS,
    # A falsy value (None / 0) is logged as "all".
    "num_layers_to_train": num_layers_to_train or "all",
    "train_samples": len(X_train),
    "val_samples": 0 if X_val is None else len(X_val),
}

wandb.init(
    project="DL-project",
    name="Camembert",
    dir=os.path.join(LOG_DIR, "wandb"),
    config=run_config,
)

In [26]:
# Run fine-tuning with the Trainer configured above. The call is the cell's last
# expression, so its return value (a TrainOutput) is displayed as the cell output.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mjulia_kor[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc
1,0.5993,0.567434,0.70133,0.649177,0.72095,0.590401,0.776809
2,0.5511,0.566074,0.709334,0.6929,0.685375,0.700593,0.781016
3,0.52,0.577214,0.709915,0.691304,0.688655,0.693973,0.780713


TrainOutput(global_step=26142, training_loss=0.559172921497741, metrics={'train_runtime': 1557.178, 'train_samples_per_second': 268.605, 'train_steps_per_second': 16.788, 'total_flos': 5.502520434060288e+16, 'train_loss': 0.559172921497741, 'epoch': 3.0})

In [29]:
# Save the trainer's model under LOG_DIR/french_bert_classifier.
trainer.save_model(os.path.join(LOG_DIR, 'french_bert_classifier'))

# Prediction

In [None]:
def predict(texts, tokenizer, model, batch_size=32, max_length=None):
    """
    Make predictions on new texts

    Side effects: the model is switched to eval mode and moved to the
    module-level ``device``.

    Parameters:
    -----------
    texts : list, pd.Series or other iterable of text strings
        Items are converted with ``str()`` before tokenization, as done in
        FrenchTextDataset.
    tokenizer : Trained tokenizer
    model : Trained model
    batch_size : Batch size for inference (must be >= 1)
    max_length : Sequence length used for truncation/padding;
        defaults to the module-level MAX_LENGTH when None

    Returns:
    --------
    predictions : numpy array of predicted labels (0/1)
    probabilities : numpy array of probabilities for class 1

    Raises:
    -------
    ValueError : if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if max_length is None:
        max_length = MAX_LENGTH

    # Materialize once as a list of str so batching is plain positional slicing
    # and non-string entries (e.g. NaN) do not reach the tokenizer.
    raw_texts = texts.tolist() if isinstance(texts, pd.Series) else texts
    text_list = [str(text) for text in raw_texts]

    model.eval()
    model.to(device)

    all_preds = []
    all_probs = []

    # Process in batches. range() has a length, so tqdm derives the number of
    # batches itself; the former floor-division total undercounted by one when
    # the last batch was partial.
    for start in tqdm(range(0, len(text_list), batch_size)):
        batch_texts = text_list[start:start + batch_size]

        # Tokenize
        encoding = tokenizer(
            batch_texts,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Move to device
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Predict without tracking gradients
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=-1)
            preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_probs.extend(probs[:, 1].cpu().numpy())

    return np.array(all_preds), np.array(all_probs)

In [37]:
# Predict labels (y_pred) and class-1 probabilities (y_pred_proba) for the Kaggle
# test tweets, using the tokenizer and model defined above.
y_pred, y_pred_proba = predict(X_kaggle['full_text'], tokenizer, model)

In [41]:
# Build the submission in the required format (columns 'ID' and 'Prediction').
# IDs and predictions are paired by position: pd.concat(axis=1) aligns on index
# and would silently produce NaN rows if X_kaggle ever had a non-default index.
# A length mismatch between the two raises immediately.
output = pd.DataFrame({
    'ID': X_kaggle['challenge_id'].to_numpy(),
    'Prediction': y_pred,
})

# Save the submission file as a CSV, creating the output directory if needed.
submission_path = '../outputs/camembert_base.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
output.to_csv(submission_path, index=False)