In [1]:
# Data handling libraries
import json
import numpy as np
import pandas as pd
from pandas import json_normalize
import os
import re
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

import wandb

In [2]:
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f"Using device: {device}")

Using device: cuda


In [3]:
# --- Model ---------------------------------------------------------------
MODEL_NAME = "camembert-base"  # French BERT model
# Alternatives:
# MODEL_NAME = "flaubert/flaubert_base_cased"
# MODEL_NAME = "dbmdz/bert-base-french-europeana-cased"

# --- Tokenization / optimisation hyper-parameters --------------------------
MAX_LENGTH = 256  # Maximum sequence length (tokens) passed to the tokenizer
BATCH_SIZE = 16  # Per-device batch size for both training and evaluation
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
WARMUP_STEPS = 500

# --- Output location -------------------------------------------------------
# Root directory for checkpoints, logs and wandb files. The original
# machine-specific path remains the default so existing runs are unaffected;
# set the DL_LOG_DIR environment variable to redirect outputs on another
# machine without editing the notebook.
LOG_DIR = os.environ.get("DL_LOG_DIR", "/Data/iuliia.korotkova/DL")

# Data Preprocessing

In [4]:
# Directory holding the raw JSON-lines files, relative to the notebook.
data_dir = "../data/raw"


def _read_flattened_jsonl(filename):
    """Read a JSON-lines file from ``data_dir`` and flatten it with ``json_normalize``."""
    raw_frame = pd.read_json(os.path.join(data_dir, filename), lines=True)
    return json_normalize(raw_frame.to_dict(orient='records'))


train_data = _read_flattened_jsonl('train.jsonl')
kaggle_data = _read_flattened_jsonl('kaggle_test.jsonl')

# Split the training frame into the target column and everything else.
y_train = train_data['label']
X_train = train_data.drop('label', axis=1)

# The Kaggle frame is used as-is.
X_kaggle = kaggle_data

# Dataset

In [5]:
class FrenchTextDataset(Dataset):
    """Tokenize-on-access dataset for French text classification.

    Args:
        texts: pandas Series of raw texts; items are read positionally
            through ``.iloc``.
        labels: pandas Series of integer class labels aligned positionally
            with ``texts``, or ``None`` for unlabeled data (e.g. a test set
            used only for prediction). With ``None`` the returned items
            carry no ``'labels'`` key.
        tokenizer: callable invoked as ``tokenizer(text, ...)`` that returns
            a mapping holding ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: sequence length every item is padded/truncated to.

    Raises:
        ValueError: if ``labels`` is given and its length differs from
            ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Fail early on misaligned inputs instead of silently pairing a text
        # with the wrong label (or raising IndexError mid-epoch).
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx`` as a dict of 1-D tensors."""
        raw_text = self.texts.iloc[idx]
        # A missing description would otherwise be tokenized as the literal
        # word "nan"; feed the tokenizer an empty string instead.
        text = "" if pd.isna(raw_text) else str(raw_text)

        # Tokenize to a fixed length so default batch collation can stack items.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; drop it.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return item

# Metrics

In [6]:
def compute_metrics(pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; column 1 is treated as the positive class).

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'``,
        ``'recall'`` and ``'auc'``. ``'auc'`` is NaN when the labels contain
        fewer than two distinct classes, since ROC AUC is undefined there.
    """
    labels = pred.label_ids
    scores = pred.predictions
    preds = scores.argmax(-1)
    # Softmax over the class axis, then keep the positive-class probability.
    probs = torch.softmax(torch.tensor(scores), dim=-1)[:, 1].numpy()

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary'
    )
    acc = accuracy_score(labels, preds)

    # ROC AUC needs both classes among the true labels; a single-class
    # evaluation set must not abort the whole evaluation step.
    if np.unique(labels).size < 2:
        auc = float('nan')
    else:
        auc = roc_auc_score(labels, probs)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auc': auc
    }

# Train Model

In [7]:
# Tokenizer and sequence-classification model are both loaded from MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Two output classes, trained as a single-label classification problem.
_classification_kwargs = dict(
    num_labels=2,
    problem_type="single_label_classification",
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **_classification_kwargs)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Fine-tune only the classification head and the upper encoder blocks; every
# other parameter is frozen.
#
# The block index is parsed from the parameter name and compared numerically.
# The previous unanchored alternation "11|10|9|8|7|6|5" matched those digits
# anywhere in a name and would select an inconsistent set of blocks on a
# deeper model (e.g. block 15 matches "5" while block 12 matches nothing).
# NOTE(review): assumes the block index appears as a dotted numeric component
# of the parameter name (e.g. "...layer.5....") - confirm by listing
# model.named_parameters().
FIRST_TRAINABLE_LAYER = 5
_LAYER_INDEX_PATTERN = re.compile(r"\.(\d+)\.")

for name, param in model.named_parameters():
    layer_match = _LAYER_INDEX_PATTERN.search(name)
    in_trainable_layer = (
        layer_match is not None
        and int(layer_match.group(1)) >= FIRST_TRAINABLE_LAYER
    )
    if not ("classifier" in name or in_trainable_layer):
        param.requires_grad = False

In [9]:
# Group-aware train/validation split on 'user.description'.
#
# A plain row-level train_test_split is deliberately not used: identical
# descriptions would land on both sides of the split. Whole description groups
# are assigned to one side instead.
#   * training keeps ONE row per description (the first of each group);
#   * validation keeps EVERY row of its groups.
# Rows whose description is missing are dropped by groupby's default
# dropna behaviour.
SPLIT_SEED = 42  # fixed so the split is reproducible across kernel restarts

# Always start from the untouched frame so re-running this cell is idempotent
# (the cell rebinds X_train / y_train below).
_features_all = train_data.drop('label', axis=1)
_labels_all = train_data['label']

# Series: description -> list of index labels of the rows sharing it.
_rows_by_description = (
    _features_all.reset_index()
    .groupby("user.description")["index"]
    .apply(list)
)
X_train_by_users, X_val_by_users = train_test_split(
    _rows_by_description, test_size=0.1, random_state=SPLIT_SEED
)

X_train_idx = [rows[0] for rows in X_train_by_users]
X_val_idx = [row for rows in X_val_by_users for row in rows]

# The collected values are index *labels*, so select with .loc on both the
# features and the labels rather than mixing positional and label lookups.
X_val = _features_all.loc[X_val_idx]
X_train = _features_all.loc[X_train_idx]

y_val = _labels_all.loc[X_val_idx]
y_train = _labels_all.loc[X_train_idx]

# Sanity checks: features and labels stay aligned, and no description leaks
# from training into validation.
assert X_train.index.equals(y_train.index)
assert X_val.index.equals(y_val.index)
assert set(X_train['user.description']).isdisjoint(X_val['user.description'])

In [11]:
# Shapes of the validation and training features/labels, in that order.
tuple(part.shape for part in (X_val, y_val, X_train, y_train))

((13184, 192), (13184,), (37110, 192), (37110,))

In [14]:
def _labels_as_series(labels):
    """Return the first column of a DataFrame; pass anything else through unchanged."""
    if isinstance(labels, pd.DataFrame):
        return labels.iloc[:, 0]
    return labels


# Both datasets use only the 'user.description' column as input text.
train_dataset = FrenchTextDataset(
    X_train['user.description'], _labels_as_series(y_train), tokenizer, MAX_LENGTH
)

eval_dataset = FrenchTextDataset(
    X_val['user.description'], _labels_as_series(y_val), tokenizer, MAX_LENGTH
)

In [15]:
# Evaluate the dataset's truthiness once; every evaluation-dependent option
# below is derived from this single flag.
_has_eval = bool(eval_dataset)

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=os.path.join(LOG_DIR, 'results'),
    logging_dir=os.path.join(LOG_DIR, 'logs'),
    logging_steps=100,
    report_to="wandb",
    # Optimisation schedule.
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP_STEPS,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    # Evaluation and checkpoint selection.
    eval_strategy="epoch" if _has_eval else "no",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=_has_eval,
    metric_for_best_model="f1" if _has_eval else None,
    greater_is_better=True,
)

# Early stopping is only registered when there is an evaluation set.
callbacks = []
if _has_eval:
    callbacks.append(EarlyStoppingCallback(early_stopping_patience=2))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics if _has_eval else None,
    callbacks=callbacks
)

In [16]:
# Recorded in the run config only; this cell does not use it to freeze layers.
num_layers_to_train = 7

if X_val is not None:
    _val_sample_count = len(X_val)
else:
    _val_sample_count = 0

# Hyper-parameters and dataset sizes attached to the wandb run.
_run_config = {
    "model": MODEL_NAME,
    "max_length": MAX_LENGTH,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
    "num_epochs": NUM_EPOCHS,
    "num_layers_to_train": num_layers_to_train if num_layers_to_train else "all",
    "train_samples": len(X_train),
    "val_samples": _val_sample_count,
}

wandb.init(
    project="DL-project",
    name="Camembert_user_only",
    dir=os.path.join(LOG_DIR, "wandb"),
    config=_run_config,
)

[34m[1mwandb[0m: Currently logged in as: [33mjulia_kor[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [17]:
# Run training and keep the returned summary; the bare name on the last line
# displays it as the cell output.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc
1,0.5707,0.564259,0.711696,0.704777,0.736646,0.675551,0.784293
2,0.554,0.542665,0.726942,0.720844,0.752104,0.692079,0.800092
3,0.4951,0.558328,0.726562,0.722457,0.747967,0.69863,0.803381


TrainOutput(global_step=6960, training_loss=0.5429404598543014, metrics={'train_runtime': 443.7876, 'train_samples_per_second': 250.863, 'train_steps_per_second': 15.683, 'total_flos': 1.46460768966144e+16, 'train_loss': 0.5429404598543014, 'epoch': 3.0})