In [1]:
from pprint import pprint
import functools
import numpy as np


import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import pytorch_lightning as pl
from transformers import AutoModelForSequenceClassification, CamembertForMaskedLM, AutoTokenizer, AutoConfig
from sklearn.metrics import confusion_matrix, f1_score

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm.notebook import tqdm
from datasets import load_dataset


In [2]:
def xmlToDf(xmlFile):
    """Load a review corpus from an XML file into a DataFrame.

    Every value of the ``commentaire`` column is passed through
    ``checkIfWordInComment`` so that missing comments become empty strings.
    When a ``note`` column is present, each rating is parsed as a decimal
    number (a decimal comma is accepted) and mapped to an integer with
    ``note * 2 - 1`` (for instance ``"0,5"`` -> 0 and ``"5,0"`` -> 9).

    Args:
        xmlFile: Anything ``pandas.read_xml`` accepts as its first argument.

    Returns:
        pandas.DataFrame: The parsed records; ``note`` (if present) holds
        integers.

    Raises:
        ValueError: If a rating is missing or cannot be parsed as a number.
    """
    df = pd.read_xml(xmlFile)

    # Replace missing comments by empty strings.
    df["commentaire"] = df["commentaire"].apply(checkIfWordInComment)

    # Some files may not carry ratings at all; only convert the column when it exists.
    if "note" in df.columns:
        # pandas may already have inferred a numeric dtype (e.g. when no rating
        # in the file contains a comma). Casting to str first keeps the
        # comma -> point replacement valid for both string and numeric columns.
        notes = (
            df["note"]
            .astype(str)
            .str.replace(",", ".", regex=False)
            .astype(float)
        )
        # Rescale the rating and store it as an integer.
        df["note"] = (notes * 2 - 1).astype(int)
    return df


def checkIfWordInComment(comment):
    """Return ``comment``, substituting an empty string for a missing value.

    Both ``None`` and a float NaN are treated as "missing"; any other value
    is returned unchanged.

    Args:
        comment: The raw comment value taken from the DataFrame.

    Returns:
        ``""`` when ``comment`` is ``None`` or NaN, otherwise ``comment`` itself.
    """
    if comment is None:
        return ""
    # NaN is the only float that is not equal to itself; the isinstance guard
    # keeps the comparison away from types with a non-boolean ``!=``.
    if isinstance(comment, float) and comment != comment:
        return ""
    return comment

# Parse the three corpus splits (train / dev / test), in that order.
df_train, df_dev, df_test = map(
    xmlToDf,
    ("data/train.xml", "data/dev.xml", "data/test.xml"),
)


In [3]:
# Turn each DataFrame into a list of per-row dicts (one dict per record),
# which is the form handed to the DataLoaders.
dict_train, dict_dev, dict_test = (
    frame.to_dict('records') for frame in (df_train, df_dev, df_test)
)

In [4]:
def tokenize_batch(samples, tokenizer):
    """Collate a list of review records into one tokenized batch.

    Args:
        samples: Sequence of mappings, each providing a ``"commentaire"``
            entry (the text) and a ``"note"`` entry (the label).
        tokenizer: Object exposing ``batch_encode_plus`` whose result has
            ``input_ids`` and ``attention_mask`` attributes.

    Returns:
        dict: ``"input_ids"`` and ``"attention_mask"`` as returned by the
        tokenizer, ``"labels"`` as a tensor built from the notes, and
        ``"sentences"`` as the list of raw texts.
    """
    text = [sample["commentaire"] for sample in samples]

    rates = [sample["note"] for sample in samples]
    # Labels stay on the CPU, like the tokenizer output. Calling .cuda() here
    # would crash on CPU-only machines and inside DataLoader worker processes,
    # and would leave the batch split across two devices. The Lightning
    # Trainer moves the whole batch to the model's device by itself.
    labels = torch.tensor(rates)

    # The tokenizer handles
    # - Tokenization
    # - Padding (adding empty tokens so that each example has the same length)
    # - Truncation (cutting samples that are too long)
    # - Special tokens (in CamemBERT, each sentence ends with a special token </s>)
    # - Attention mask (a binary vector which tells the model which tokens to
    #   look at, so that padding tokens are ignored)
    tokens = tokenizer.batch_encode_plus(
        text,
        add_special_tokens=True,
        padding=True,
        truncation=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return {
        "input_ids": tokens.input_ids,
        "attention_mask": tokens.attention_mask,
        "labels": labels,
        "sentences": text,
    }

In [5]:
# Tokenizer used to turn the raw comments into model inputs.
tokenizer = AutoTokenizer.from_pretrained('camembert/camembert-base', do_lower_case=True)

# Both loaders collate their records with the same tokenizer-bound function.
collate_with_tokenizer = functools.partial(tokenize_batch, tokenizer=tokenizer)

# Validation batches are served in dataset order.
val_dataloader = DataLoader(
    dict_dev,
    collate_fn=collate_with_tokenizer,
    shuffle=False,
    batch_size=8,
)

# Training batches are reshuffled on every pass.
train_dataloader = DataLoader(
    dict_train,
    collate_fn=collate_with_tokenizer,
    shuffle=True,
    batch_size=8,
)



In [1]:
class LightningModel(pl.LightningModule):
    """Lightning wrapper around a Hugging Face sequence-classification model.

    Training minimises the cross-entropy between the logits and
    ``batch["labels"]``. Validation logs the accuracy (``valid/acc``) and the
    macro F1 score (``valid/f1``).

    Args:
        model_name: Model identifier or path passed to the ``transformers``
            ``from_pretrained`` loaders.
        num_labels: Number of output classes of the classification head.
        lr: Learning rate given to AdamW.
        weight_decay: Weight decay given to AdamW.
        from_scratch: When true, only the configuration is loaded and the
            model is built from it without the pre-trained weights.
    """

    def __init__(self, model_name, num_labels, lr, weight_decay, from_scratch=False):
        super().__init__()
        # Store the constructor arguments so that load_from_checkpoint can
        # rebuild the module.
        self.save_hyperparameters()
        if from_scratch:
            # Load only the configuration (number of layers, hidden size, ...)
            # and not the model weights.
            config = AutoConfig.from_pretrained(
                model_name, num_labels=num_labels
            )
            self.model = AutoModelForSequenceClassification.from_config(config)
        else:
            # Download the pre-trained model directly from the Hugging Face Hub.
            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_name, num_labels=num_labels
            )
        self.lr = lr
        self.weight_decay = weight_decay
        self.num_labels = self.model.num_labels
        # Per-epoch buffers used to compute the F1 score over the whole
        # validation pass instead of over individual batches.
        # NOTE(review): buffers are local to each process; with several
        # devices the logged F1 would cover one shard only.
        self._val_labels = []
        self._val_preds = []

    def forward(self, batch):
        """Run the wrapped model on ``batch["input_ids"]`` / ``batch["attention_mask"]``."""
        return self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )

    def training_step(self, batch):
        """Compute, log (``train/loss``) and return the cross-entropy loss of one batch."""
        out = self.forward(batch)

        logits = out.logits
        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, self.num_labels), batch["labels"].view(-1)
        )

        self.log("train/loss", loss)

        return loss

    def validation_step(self, batch, batch_index):
        """Log the batch accuracy and buffer labels/predictions for the epoch-level F1."""
        labels = batch["labels"]
        out = self.forward(batch)

        # Predicted class = index of the largest logit.
        preds = out.logits.argmax(dim=-1)
        acc = (labels == preds).float().mean()
        self.log("valid/acc", acc)

        # A macro F1 computed on a single small batch and then averaged over
        # batches is not the F1 of the validation set, so only accumulate here.
        self._val_labels.extend(labels.cpu().tolist())
        self._val_preds.extend(preds.cpu().tolist())

    def on_validation_epoch_start(self):
        """Empty the buffers so that each validation pass is scored on its own."""
        self._val_labels.clear()
        self._val_preds.clear()

    def on_validation_epoch_end(self):
        """Log the macro F1 (``valid/f1``) over everything seen during this validation pass."""
        if not self._val_labels:
            return
        f1 = f1_score(self._val_labels, self._val_preds, average="macro")
        self.log("valid/f1", f1)

    def predict_step(self, batch, batch_idx):
        """Return the predicted class index for every sample of the batch.

        Similar to ``validation_step`` but without computing any metric.
        """
        out = self.forward(batch)

        return out.logits.argmax(dim=-1)

    def configure_optimizers(self):
        """Build the AdamW optimizer over the wrapped model's parameters."""
        return torch.optim.AdamW(
            self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )

NameError: name 'pl' is not defined

In [None]:
# Seed Python, NumPy and PyTorch so that the classification-head
# initialisation and the batch shuffling are repeatable from run to run.
pl.seed_everything(42, workers=True)

# 10 output classes — presumably one per value of the rescaled "note" column;
# confirm against the data.
lightning_model = LightningModel("camembert-base", 10, lr=3e-5, weight_decay=0.)

# Keep the checkpoint with the highest validation accuracy.
model_checkpoint = pl.callbacks.ModelCheckpoint(monitor="valid/acc", mode="max")
camembert_trainer = pl.Trainer(
    max_epochs=20,
    accelerator='gpu',
    devices=1,
    callbacks=[
        # Stop once the validation accuracy has not improved for 4 checks.
        pl.callbacks.EarlyStopping(monitor="valid/acc", patience=4, mode="max"),
        model_checkpoint,
    ]
)
camembert_trainer.fit(lightning_model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

# Reload the best checkpoint. best_model_path is an empty string when nothing
# was saved (e.g. training was interrupted before the first validation pass),
# so fail with an explicit message rather than an obscure loading error.
best_checkpoint_path = model_checkpoint.best_model_path
if not best_checkpoint_path:
    raise RuntimeError("No checkpoint was saved during training; cannot reload the best model.")
lightning_model = LightningModel.load_from_checkpoint(checkpoint_path=best_checkpoint_path)

# Save the weights of the underlying transformers model.
torch.save(lightning_model.model.state_dict(), "model_lightning.pt")

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weig

Sanity Checking: 0it [00:00, ?it/s]