In [None]:
# installations (à exécuter uniquement dans Google Colab)
#!pip install torchmetrics
#!pip install pytorch_lightning
#!pip install transformers
#!pip install datasets

In [1]:
import os
import torch
import torchmetrics
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import EarlyStopping
from torch.nn import functional as F
from torch.utils.data import random_split, DataLoader, Dataset
from pprint import pprint
from transformers import AutoModel, AutoTokenizer, AutoConfig
from datasets import load_dataset
import pytorch_lightning as pl
from sklearn.preprocessing import LabelBinarizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set the cache dir for HuggingFace transformers library
def get_cache_dir():
    import sys
    if "linux" in sys.platform:
        # return "/gfs/team/nlp/users/ait/.cache/"
        return "/gfs-ssd/user/ait/.cache/"
    else:
        return "c:/Users/ait/.cache/"

HF_MODEL_CACHE_DIR = f"{get_cache_dir()}/huggingface/transformers"
os.environ['TRANSFORMERS_CACHE'] = HF_MODEL_CACHE_DIR


In [3]:


class ReviewDataset(Dataset):
    def __init__(self, hfdataset_split, n: int, lmtokenizer, lb: LabelBinarizer):
        # n is the number of (random) samples we want to use from this hf dataset
        # shuffle and take the first n examples
        data = hfdataset_split.shuffle(seed=123).select(range(n))
        texts = [d['review_body'] for d in data]
        encoded_texts = lmtokenizer(texts,
                                    padding=False,
                                    add_special_tokens=True,
                                    return_tensors=None,
                                    return_offsets_mapping=False,
                                    )
        self.input_ids = [torch.tensor(id_list) for id_list in encoded_texts['input_ids']]
        self.attention_mask = [torch.tensor(mask_list) for mask_list in encoded_texts['attention_mask']]
        # labels
        labels = ['positive' if d['stars'] > 3 else 'negative' if d['stars'] < 3 else 'neutral' for d in data]
        self.label_vects = torch.from_numpy(lb.transform(labels)).long()

    def __getitem__(self, index):
        return (self.input_ids[index], self.attention_mask[index], self.label_vects[index])

    def __len__(self):
        return len(self.input_ids)

    def collate_fn(self, batch_list):
        # batch_list is a list of tuples, each returned by the __get_item__() function above
        # create 3 separate lists for each element type in the tuples
        input_ids, attention_masks, label_ids = tuple(zip(*batch_list))
        # the batch will be a dictionary of tensors: a tensor for the input_ids, another for the attention_masks and another for the label_ids if any
        batch = dict({})
        batch['input_ids'] = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=1)
        batch['attention_mask'] = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0)
        batch['label_vects'] = torch.nn.utils.rnn.pad_sequence(label_ids, batch_first=True, padding_value=0)
        # return the batch as a dictionary of tensors
        return batch



In [4]:

class TransformerClassifier(pl.LightningModule):

    def __init__(self, hf_plm_name: str, output_size: int, dropout: float = 0.3):
        # hf_plm_name = HuggingFace Pretrained Language Model name
        super().__init__()
        # text encoder/vectorizer: a pretrained language model
        self.config = AutoConfig.from_pretrained(hf_plm_name)
        lm_hidden_size = self.config.hidden_size
        self.lm = AutoModel.from_pretrained(hf_plm_name, output_attentions=False)
        # Linear layer(s) for the classifier component
        self.fcn = torch.nn.Sequential(
            torch.nn.Dropout(dropout),
            torch.nn.Linear(lm_hidden_size, output_size),
        )
        # Loss function
        self.loss_fn = torch.nn.CrossEntropyLoss()
        # Learning rate
        self.lr = 1e-4
        #plus le model est grand plus le lR est petit 

    def forward(self, batch):
        out = self.lm(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])[0]
        out = self.fcn(out.mean(dim=1))
        return out

    def training_step(self, batch, batch_idx):
        # training_step is called in PyTorch Lightning train loop
        y_hat = self.forward(batch)
        loss = self.loss_fn(y_hat, batch['label_vects'])
        self.log("loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

    def validation_step(self, batch, batch_ix):
        # validation_step is called in PyTorch Lightning train loop
        y_hat = self.forward(batch)
        loss = self.loss_fn(y_hat, batch['label_vects'])
        acc = torchmetrics.functional.accuracy(y_hat, batch['label_vects'])
        self.log_dict({'val_loss': loss.item(), 'val_acc': acc.item()}, on_step=False, on_epoch=True, reduce_fx='mean', prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        # this is the test loop
        y_hat = self.forward(batch)
        y_hat = F.softmax(y_hat)
        test_acc = torchmetrics.functional.accuracy(y_hat, batch['label_vects'])
        # test_loss = F.mse_loss(x_hat, x)
        self.log("test_acc", test_acc)

    def predict_step(self, batch, batch_idx, dataloader_idx: int = 0):
        y_hat = self.forward(batch)
        y_hat = F.softmax(y_hat)
        return torch.round(y_hat).item()


In [5]:
seed_everything(42)
hf_plm_name = "camembert-base"
# Define the tokenizer (for the pretrained language model)
lmtokenizer = AutoTokenizer.from_pretrained(hf_plm_name)
# Label binarizer in order to vectorize and devectorize labels
lb = LabelBinarizer()
lb.fit(['positive', 'negative', 'neutral'])
# Load the dataset
dataset = load_dataset("amazon_reviews_multi", "fr")
train_dataset = ReviewDataset(dataset['train'], 50, lmtokenizer, lb)
train_dataloader = DataLoader(train_dataset, batch_size=15, collate_fn=train_dataset.collate_fn, shuffle=False)
val_dataset = ReviewDataset(dataset['train'], 100, lmtokenizer, lb)
val_dataloader = DataLoader(train_dataset, batch_size=15, collate_fn=train_dataset.collate_fn, shuffle=False)
# Create the model
model = TransformerClassifier(hf_plm_name, output_size=len(lb.classes_))
# Training the model
device = 'cpu'
early_stop_callback = EarlyStopping(monitor='val_loss', min_delta=0.00, patience=5, verbose=True, mode='min')
trainer = pl.Trainer(max_epochs=2,  callbacks=[early_stop_callback], log_every_n_steps=10, accelerator=device)
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

Global seed set to 42
Downloading builder script: 100%|██████████| 7.16k/7.16k [00:00<00:00, 2.39MB/s]
Downloading metadata: 100%|██████████| 37.4k/37.4k [00:00<00:00, 360kB/s]
Downloading readme: 100%|██████████| 13.4k/13.4k [00:00<00:00, 1.34MB/s]


Downloading and preparing dataset amazon_reviews_multi/fr to C:/Users/mosta/.cache/huggingface/datasets/amazon_reviews_multi/fr/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609...


Downloading data: 100%|██████████| 81.9M/81.9M [01:00<00:00, 1.36MB/s]
Downloading data files: 100%|██████████| 1/1 [01:02<00:00, 62.06s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 501.05it/s]
Downloading data: 100%|██████████| 2.02M/2.02M [00:02<00:00, 904kB/s] 
Downloading data files: 100%|██████████| 1/1 [00:03<00:00,  3.88s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 502.25it/s]
Downloading data: 100%|██████████| 2.04M/2.04M [00:03<00:00, 671kB/s]
Downloading data files: 100%|██████████| 1/1 [00:04<00:00,  4.98s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 501.65it/s]
                                                                                         

Dataset amazon_reviews_multi downloaded and prepared to C:/Users/mosta/.cache/huggingface/datasets/amazon_reviews_multi/fr/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 16.09it/s]
Loading cached shuffled indices for dataset at C:/Users/mosta/.cache/huggingface/datasets/amazon_reviews_multi/fr/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609\cache-1b41743e50f8d888.arrow
Some weights of the model checkpoint at camembert-base were not used when initializing CamembertModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), u

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(
  rank_zero_warn(


Epoch 0:  12%|█▎        | 1/8 [01:12<08:26, 72.41s/it, loss=5, v_num=0]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
