In [1]:
# Data handling libraries
import json
import numpy as np
import pandas as pd
from pandas import json_normalize
import os
import re
from tqdm import tqdm

from sklearn.model_selection import train_test_split

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
import wandb

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [5]:
MODEL_NAME = "camembert-base"  # French BERT model
# Alternatives:
# MODEL_NAME = "flaubert/flaubert_base_cased"
# MODEL_NAME = "dbmdz/bert-base-french-europeana-cased"

MAX_LENGTH = 256  # Maximum sequence length
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
WARMUP_STEPS = 500

LOG_DIR = "/Data/iuliia.korotkova/DL"

# Data preprocessing

In [6]:
data_dir = "../data/raw"

train_data = pd.read_json(os.path.join(data_dir, 'train.jsonl'), lines=True)
train_data = json_normalize(train_data.to_dict(orient='records'))

kaggle_data = pd.read_json(os.path.join(data_dir, 'kaggle_test.jsonl'), lines=True)
kaggle_data = json_normalize(kaggle_data.to_dict(orient='records'))

X_train = train_data.drop('label', axis=1)
y_train = train_data['label']

X_kaggle = kaggle_data

In [9]:
def extract_full_text(tweet):
    text = tweet['text']
    if not pd.isna(tweet['extended_tweet.full_text']):
        text = tweet['extended_tweet.full_text']
    user_description = tweet["user.description"]
    input_text = f"[user description] {user_description}\n\n[tweet content] {text}"
    return input_text

X_train['full_text'] = X_train.apply(lambda tweet: extract_full_text(tweet), axis=1)
X_kaggle['full_text'] = X_kaggle.apply(lambda tweet: extract_full_text(tweet), axis=1)

In [25]:
X_kaggle['full_text'].iloc[0]

'[user description] CrÃ©ateur et producteur de performance durable.\n\n[tweet content] [Actu] - ðŸ”Ž Le coronavirus fait-il avancer les #FMers ? - https://t.co/uCsCCl6zow\n"Afin de respecter les protocoles, il a fallu Ãªtre innovants et adaptables [...]" intervient AurÃ©lie Fort, Experte mÃ©tier FM.\nðŸ‘‰ https://t.co/aQAI6Gxuz6'

# Dataset

In [11]:
class FrenchTextDataset(Dataset):
    """Custom Dataset for French text classification"""
    
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]
        
        # Tokenize
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Metrics

In [12]:
def compute_metrics(pred):
    """Compute metrics for evaluation"""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    probs = torch.softmax(torch.tensor(pred.predictions), dim=-1)[:, 1].numpy()
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary'
    )
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, probs)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auc': auc
    }

# Train model

In [13]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    problem_type="single_label_classification"
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
for name, param in model.named_parameters():
    if not re.search("classifier|11|10|9|8|7|6|5", name):
        param.requires_grad = False

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1)

In [16]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((139422, 193), (139422,), (15492, 193), (15492,))

In [17]:
train_dataset = FrenchTextDataset(
        X_train['full_text'],
        y_train.iloc[:, 0] if isinstance(y_train, pd.DataFrame) else y_train,
        tokenizer,
        MAX_LENGTH
    )

eval_dataset = FrenchTextDataset(
        X_val['full_text'],
        y_val.iloc[:, 0] if isinstance(y_val, pd.DataFrame) else y_val,
        tokenizer,
        MAX_LENGTH
    )

In [18]:
training_args = TrainingArguments(
    output_dir=os.path.join(LOG_DIR, 'results'),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_dir=os.path.join(LOG_DIR, 'logs'),
    logging_steps=100,
    eval_strategy="epoch" if eval_dataset else "no",
    save_strategy="epoch",
    load_best_model_at_end=True if eval_dataset else False,
    metric_for_best_model="f1" if eval_dataset else None,
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    report_to="wandb",
)

callbacks = [EarlyStoppingCallback(early_stopping_patience=2)] if eval_dataset else []

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics if eval_dataset else None,
    callbacks=callbacks
)

In [19]:
num_layers_to_train = 7

wandb.init(
    project="DL-project",
    name="Camembert",
    dir=os.path.join(LOG_DIR, "wandb"),
    config={
        "model": MODEL_NAME,
        "max_length": MAX_LENGTH,
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "num_epochs": NUM_EPOCHS,
        "num_layers_to_train": num_layers_to_train if num_layers_to_train else "all",
        "train_samples": len(X_train),
        "val_samples": len(X_val) if X_val is not None else 0
    }
)

[34m[1mwandb[0m: Currently logged in as: [33mjulia_kor[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc
1,0.4734,0.430378,0.806739,0.785929,0.815672,0.758278,0.879506
2,0.3077,0.322176,0.871353,0.861607,0.867328,0.85596,0.939323
3,0.2613,0.285001,0.893042,0.885368,0.887887,0.882864,0.955785


TrainOutput(global_step=26142, training_loss=0.378094870589429, metrics={'train_runtime': 1577.1341, 'train_samples_per_second': 265.206, 'train_steps_per_second': 16.576, 'total_flos': 5.502520434060288e+16, 'train_loss': 0.378094870589429, 'epoch': 3.0})

In [21]:
trainer.save_model(os.path.join(LOG_DIR, 'french_bert_user_classifier'))

In [28]:
trainer.model

CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias

In [29]:
model

CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias

# Prediction

In [22]:
def predict(texts, tokenizer, model, batch_size=32):
    """
    Make predictions on new texts
    
    Parameters:
    -----------
    texts : list or pd.Series of text strings
    tokenizer : Trained tokenizer
    model : Trained model
    batch_size : Batch size for inference
    
    Returns:
    --------
    predictions : numpy array of predicted labels (0/1)
    probabilities : numpy array of probabilities for class 1
    """
    model.eval()
    model.to(device)
    
    all_preds = []
    all_probs = []
    
    # Process in batches
    for i in tqdm(range(0, len(texts), batch_size), total=len(texts)//batch_size):
        batch_texts = texts[i:i+batch_size]
        
        # Tokenize
        encoding = tokenizer(
            batch_texts.tolist() if isinstance(batch_texts, pd.Series) else batch_texts,
            add_special_tokens=True,
            max_length=MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        # Move to device
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        
        # Predict
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=-1)
            preds = torch.argmax(logits, dim=-1)
        
        all_preds.extend(preds.cpu().numpy())
        all_probs.extend(probs[:, 1].cpu().numpy())
    
    return np.array(all_preds), np.array(all_probs)

In [31]:
y_pred, y_pred_proba = predict(X_kaggle['full_text'], tokenizer, trainer.model)

3231it [02:19, 23.10it/s]                          


In [30]:
y_pred

array([1, 1, 0, ..., 1, 0, 0])

In [32]:
y_pred

array([1, 1, 0, ..., 1, 0, 0])

In [27]:
X_kaggle['full_text']

0         [user description] CrÃ©ateur et producteur de p...
1         [user description] MÃ©dia de #solutions devenu ...
2         [user description] AustralopithÃ¨que, lanceuse....
3         [user description] Un fan de Dragon ball comme...
4         [user description] None\n\n[tweet content] #Pl...
                                ...                        
103375    [user description] Jâ€™fais des tweets parfois d...
103376    [user description] Welcome to the official twi...
103377    [user description] #BBPTeamMusic\nSon dispo ic...
103378    [user description] Ne vit pas dans un ananas s...
103379    [user description] None\n\n[tweet content] @So...
Name: full_text, Length: 103380, dtype: object

In [24]:
output = pd.concat([X_kaggle['challenge_id'], pd.DataFrame(y_pred)], axis=1,ignore_index=True)
# Rename columns to match the required submission format
output.columns = ['ID', "Prediction"]
# Save the submission file as a CSV
output.to_csv('../outputs/camembert_w_user.csv', index=False)

# Example of fine-tuning BERT and metadata

In [None]:
class BERTWithMetadata(nn.Module):
    def __init__(self, bert_model, metadata_dim, hidden_dim=768):
        super().__init__()
        self.bert = bert_model
        # Combine BERT's [CLS] token output with metadata
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim + metadata_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, 2)  # binary classification
        )
    
    def forward(self, input_ids, attention_mask, metadata):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        # Concatenate text representation with metadata
        combined = torch.cat([cls_output, metadata], dim=1)
        return self.classifier(combined)

In [36]:
class FrenchTextDatasetTest(Dataset):
    """Custom Dataset for French text classification"""
    
    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        
        # Tokenize
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

In [37]:
test_dataset = FrenchTextDatasetTest(
        X_kaggle['full_text'],
        tokenizer,
        MAX_LENGTH
    )

In [38]:
predictions = trainer.predict(test_dataset)

In [None]:
torch.argmax(predictions.predictions, dim=-1)

socket.send() raised exception.
socket.send() raised exception.


Error in callback <bound method _WandbInit._pre_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x7f60f8657b50>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7f6099171d30, raw_cell="predictions" store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ssh-remote%2Bpolytechnique/users/eleves-a/2025/iuliia.korotkova/CSC_51054_EP-Deep-Learning_Final-Project/notebooks/06_bert.ipynb#X56sdnNjb2RlLXJlbW90ZQ%3D%3D>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

PredictionOutput(predictions=array([[-3.2304688 ,  2.7871094 ],
       [-3.6289062 ,  3.1015625 ],
       [ 0.6875    , -0.61279297],
       ...,
       [-0.5551758 ,  0.47436523],
       [ 3.1894531 , -2.6875    ],
       [ 0.82128906, -0.7368164 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 144.8555, 'test_samples_per_second': 713.677, 'test_steps_per_second': 44.61})

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x7f60f8657b50>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f60991716a0, execution_count=40 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7f6099171d30, raw_cell="predictions" store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ssh-remote%2Bpolytechnique/users/eleves-a/2025/iuliia.korotkova/CSC_51054_EP-Deep-Learning_Final-Project/notebooks/06_bert.ipynb#X56sdnNjb2RlLXJlbW90ZQ%3D%3D> result=PredictionOutput(predictions=array([[-3.2304688 ,  2.7871094 ],
       [-3.6289062 ,  3.1015625 ],
       [ 0.6875    , -0.61279297],
       ...,
       [-0.5551758 ,  0.47436523],
       [ 3.1894531 , -2.6875    ],
       [ 0.82128906, -0.7368164 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 144.8555, 'test_samples_per_second': 713.677, 'test_steps_per_second': 44.61})>,),kwargs {}:


KeyboardInterrupt: 

In [None]:
1+1