### Import des modules

In [None]:
from pprint import pprint
import functools

import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import pytorch_lightning as pl
from transformers import AutoModelForSequenceClassification, CamembertForMaskedLM, AutoTokenizer, AutoConfig
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, f1_score

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm.notebook import tqdm
import pandas as pd
import torchmetrics

import re

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from tqdm import tqdm

## Importation des données et du tokenizer

In [None]:
# Training set: features and labels live in two CSV files sharing the 'ID'
# column, so both are indexed on it and joined into a single frame.
X = pd.read_csv('DATA/X_train.csv', index_col='ID').join(
    pd.read_csv('DATA/y_train.csv', index_col='ID'))
# NOTE: an optional text normalisation of X.question (strip everything except
# ASCII letters and spaces, lowercase, collapse double spaces) was
# experimented with here and is intentionally left disabled.

# Unlabelled questions to predict at the end of the notebook.
test = pd.read_csv('DATA/test.csv', index_col='ID')

# Number of distinct intention values found in the training labels.
# NOTE(review): the classifier head presumably expects labels encoded as
# 0..num_labels-1 - confirm against y_train.csv.
num_labels = X.intention.unique().shape[0]

# `model_max_length` is the keyword the tokenizer actually understands; the
# previous misspelt `max_lenght` was silently ignored.
tokenizer = AutoTokenizer.from_pretrained('camembert-base', model_max_length=512)

## Préparation des données

In [None]:
def tokenize_batch(samples, tokenizer):
    """Collate a list of samples into a padded, tokenized batch.

    Args:
        samples: Sequence of mappings, each providing a "question" (text)
            and an "intention" (label) entry.
        tokenizer: Callable invoked as ``tokenizer(texts, ...)`` whose result
            exposes ``input_ids`` and ``attention_mask`` attributes.

    Returns:
        A dict with the keys "input_ids", "attention_mask", "labels"
        (a tensor built from the "intention" values) and "sentences"
        (the raw question texts, in input order).
    """
    text = [sample["question"] for sample in samples]
    labels = torch.tensor([sample["intention"] for sample in samples])
    # Truncate to the tokenizer's maximum length so that an unusually long
    # question cannot produce a sequence longer than the model accepts.
    tokens = tokenizer(
        text, padding="longest", truncation=True, return_tensors="pt"
    )

    return {
        "input_ids": tokens.input_ids,
        "attention_mask": tokens.attention_mask,
        "labels": labels,
        "sentences": text,
    }

# Helper column: number of whitespace-separated words in each question.
X['len'] = [len(words) for words in X.question.str.split()]

# Keep only the questions shorter than 250 words, then shuffle all remaining
# rows (no seed is fixed, so the order differs from run to run).
X = X[X.len < 250].sample(frac=1)

#### Loaders

In [None]:
# Back-translation augmenter configured with the fr->en and en->fr models;
# `aug` is a second name bound to the very same object.
back_translation_aug = naw.BackTranslationAug(
    to_model_name='Helsinki-NLP/opus-mt-en-fr',
    from_model_name='Helsinki-NLP/opus-mt-fr-en',
)
aug = back_translation_aug

# Second back-translation augmenter, configured with the fr->ru and ru->fr models.
aug2 = naw.BackTranslationAug(
    to_model_name='Helsinki-NLP/opus-mt-ru-fr',
    from_model_name='Helsinki-NLP/opus-mt-fr-ru',
)

In [None]:
# Hold-out split on the already shuffled frame: the first 6000 rows are used
# for training, every remaining row for validation. Each row is turned into a
# plain dict so that `tokenize_batch` can read its "question" and "intention".
#
# Fixes compared with the previous version:
#   * `orient='records'` is the full spelling; the abbreviation 'record' is
#     rejected by recent pandas releases.
#   * `Transforms=None` was dropped: DataLoader has no such parameter and
#     raised a TypeError.
train_dataloader = DataLoader(
    X[:6000].to_dict(orient='records'),
    batch_size=8,
    shuffle=True,
    collate_fn=functools.partial(tokenize_batch, tokenizer=tokenizer),
    num_workers=4,
)
val_dataloader = DataLoader(
    X[6000:].to_dict(orient='records'),
    batch_size=8,
    shuffle=False,
    collate_fn=functools.partial(tokenize_batch, tokenizer=tokenizer),
    num_workers=4,
)

In [None]:
class LightningModel(pl.LightningModule):
    """Lightning wrapper around a Hugging Face sequence-classification model.

    Args:
        model_name: Hub identifier (or local path) of the model to load.
        num_labels: Number of classes of the classification head.
        lr: Learning rate passed to AdamW.
        weight_decay: Weight decay passed to AdamW.
        from_scratch: If true, only the configuration (number of layers,
            hidden size, ...) is loaded and the weights are freshly
            initialised; otherwise the pretrained weights are loaded.
    """

    def __init__(self, model_name, num_labels, lr, weight_decay, from_scratch=False):
        super().__init__()
        self.save_hyperparameters()
        if from_scratch:
            # Load only the architecture description, not the trained weights.
            config = AutoConfig.from_pretrained(
                model_name, num_labels=num_labels
            )
            self.model = AutoModelForSequenceClassification.from_config(config)
        else:
            # Load the pretrained weights for `model_name`.
            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_name, num_labels=num_labels
            )
        self.lr = lr
        self.weight_decay = weight_decay
        self.num_labels = self.model.num_labels
        # Predictions / labels gathered over one validation epoch so that the
        # macro-F1 is computed once on the whole set instead of per batch.
        self._val_preds = []
        self._val_labels = []

    def forward(self, batch):
        """Run the wrapped model on the batch's "input_ids" and "attention_mask"."""
        return self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )

    def training_step(self, batch, batch_idx=None):
        """Compute, log ("train/loss") and return the cross-entropy loss."""
        logits = self.forward(batch).logits
        loss = F.cross_entropy(
            logits.view(-1, self.num_labels), batch["labels"].view(-1)
        )

        self.log("train/loss", loss)
        return loss

    def validation_step(self, batch, batch_index):
        """Log the batch accuracy ("valid/acc") and store predictions for F1."""
        labels = batch["labels"]
        preds = self.forward(batch).logits.argmax(dim=-1)

        acc = (labels == preds).float().mean()
        self.log("valid/acc", acc)

        # Macro-F1 averaged over small batches is not the macro-F1 of the
        # validation set (most classes are absent from any given batch), so
        # the values are only accumulated here.
        self._val_preds.extend(preds.cpu().tolist())
        self._val_labels.extend(labels.cpu().tolist())

    def on_validation_epoch_end(self):
        """Log the macro-F1 ("valid/f1") over everything seen this epoch."""
        if self._val_labels:
            f1 = f1_score(self._val_labels, self._val_preds, average="macro")
            self.log("valid/f1", f1)
        # Start the next validation run from empty buffers.
        self._val_preds.clear()
        self._val_labels.clear()

    def predict_step(self, batch, batch_idx):
        """Return the predicted class index of every sample in the batch.

        Same forward pass as `validation_step`, without any metric.
        """
        return self.forward(batch).logits.argmax(dim=-1)

    def configure_optimizers(self):
        """Return an AdamW optimizer over the wrapped model's parameters."""
        return torch.optim.AdamW(
            self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )

In [None]:
# Fine-tuning setup: pretrained 'camembert-base' weights, one output per
# intention found in the training labels, no weight decay.
lightning_model = LightningModel(
    model_name="camembert-base",
    num_labels=num_labels,
    lr=3e-5,
    weight_decay=0.0,
    from_scratch=False,
)

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir lightning_logs --host localhost --port 6006

In [None]:
# Checkpoint callback tracking the highest "valid/acc"; kept in its own
# variable so it can be inspected after training.
model_checkpoint = pl.callbacks.ModelCheckpoint(mode="max", monitor="valid/acc")

# Single-GPU trainer, capped at 40 epochs, with early stopping on
# "valid/acc" (patience of 4).
camembert_trainer = pl.Trainer(
    callbacks=[
        pl.callbacks.EarlyStopping(mode="max", patience=4, monitor="valid/acc"),
        model_checkpoint,
    ],
    devices=1,
    accelerator='gpu',
    max_epochs=40,
)

In [None]:
# Train on the training loader, validating on the held-out loader.
camembert_trainer.fit(
    model=lightning_model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

In [None]:
def get_preds(model, tokenizer, sentence):
    """Predict the class index of a single sentence.

    Args:
        model: Module called as ``model(batch)`` with a dict holding
            "input_ids" and "attention_mask"; its output must expose
            ``logits``.
        tokenizer: Callable returning an object with ``input_ids`` and
            ``attention_mask`` tensors for ``sentence``.
        sentence: The text to classify.

    Returns:
        A 0-d CPU tensor holding the index of the highest-scoring class.
    """
    tokenized_sentence = tokenizer(sentence, truncation=True, return_tensors="pt")

    # Inputs must live on the same device as the model's weights.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    batch = {
        "input_ids": tokenized_sentence.input_ids.to(device),
        "attention_mask": tokenized_sentence.attention_mask.to(device),
    }

    # Inference: disable dropout and gradient tracking, then restore the
    # mode the caller left the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(batch).logits
    finally:
        model.train(was_training)

    # argmax over the logits equals argmax over their softmax.
    return torch.argmax(logits.squeeze()).cpu()

In [None]:
# Predict one intention per test question. `int(...)` extracts the scalar
# from the returned 0-d tensor whatever device it lives on (the previous
# `.to().numpy()` did no device transfer and fails for CUDA tensors).
test['intention'] = [
    int(get_preds(lightning_model, tokenizer, question))
    for question in test.question
]
# Write the predictions (indexed by ID) to the submission file.
test['intention'].to_csv('resultat.csv')