In [1]:
import numpy as np
import torch
import seaborn
import pickle
import pandas as pd
from sklearn import metrics
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import CamembertForSequenceClassification, CamembertTokenizer, AdamW

In [2]:
def xmlToDf(xmlFile):
    # Read XML file
    df = pd.read_xml(xmlFile)
    # replace comma to point in note column
    df["note"] = df["note"].apply(lambda x: x.replace(",", "."))
    #replace None to empty string in commentaire column
    df["commentaire"] = df["commentaire"].apply(checkIfWordInComment)
    # string to double conversion column
    df['note'] = df['note'].astype(float)
    return df


def checkIfWordInComment(comment):
    if comment is None:
        return ""
    return comment

df_dev = xmlToDf("data/train.xml")

In [2]:
def deserializeDf(path):
    with open(path, 'rb') as f:
        return pickle.load(f)
    
# df_dev = deserializeDf('data/df_train_new.pkl')

In [3]:
comments = df_dev["commentaire"].values.tolist()
rates = df_dev["note"].values.tolist()

TOKENIZER = CamembertTokenizer.from_pretrained(
    'camembert/camembert-base',
    do_lower_case=True)



In [4]:

# df_dev["note"] = df_dev["note"].apply(lambda x: x.replace(",", "."))
# df_dev['note'] = df_dev['note'].astype(float)

rates = df_dev["note"].values.tolist()

# rates to list of len(rates) arrays with 10 elements with valye 0 or 1 and 1 if rates*2 is equal to index
list_rates = []
for rate in rates:
    # array = [0]*10
    # array[int(rate*2)-1] = 1
    # list_rates.append(array)
    list_rates.append(np.int64(rate*2-1)) # ! perso j ai pas eu besoin d y transformer en one hot (cad array avec que des 0 et un 1 a la bonne classe)

rates = torch.tensor(list_rates).cuda()

In [5]:
df_dev = None

# La fonction batch_encode_plus encode un batch de donnees
encoded_batch = TOKENIZER.batch_encode_plus(comments,
                                            add_special_tokens=True,
                                            padding=True,
                                            truncation=True,
                                            max_length=512,     # ! au lieu de 10 -> 512 car c'est le nb max de token que peut prendre le camembert
                                            return_attention_mask = True,
                                            return_tensors = 'pt')

In [12]:
# save encoded_batch
with open('data/encoded_batch_train_untouch_comment.pkl', 'wb') as f:
    pickle.dump(encoded_batch, f)
    

In [6]:
# On calcule l'indice qui va delimiter nos datasets d'entrainement et de validation
# On utilise 80% du jeu de donnée pour l'entrainement et les 20% restant pour la validation
# split_border = int(len(rates)*0.8)

# ! Ton dataset de dev est deja le validation dataset donc dans notre cas on a pas besoin de split
 
 
# train_dataset = TensorDataset(
#     encoded_batch['input_ids'][:split_border],
#     encoded_batch['attention_mask'][:split_border],
#     rates[:split_border])
# validation_dataset = TensorDataset(
#     encoded_batch['input_ids'][split_border:],
#     encoded_batch['attention_mask'][split_border:],
#     rates[split_border:])

train_dataset = TensorDataset(  # ! sera ton train.xml
    encoded_batch['input_ids'],
    encoded_batch['attention_mask'],
    rates)
# validation_dataset = TensorDataset(   # ! sera ton dev.xml
#     encoded_batch['input_ids'],
#     encoded_batch['attention_mask'],
#     rates)
 
# On definit la taille des batchs
batch_size = 8
 
# On cree les DataLoaders d'entrainement et de validation
# Le dataloader est juste un objet iterable
# On le configure pour iterer le jeu d'entrainement de façon aleatoire et creer les batchs.
train_dataloader = DataLoader(
            train_dataset,
            # sampler = RandomSampler(train_dataset),
            shuffle=True, # ! ca fait pareil que le truc d au dessus mais c'est plus explicite je trouve
            batch_size = batch_size)
 
# validation_dataloader = DataLoader(
#             validation_dataset,
#             sampler = SequentialSampler(validation_dataset),
#             batch_size = batch_size)

In [7]:
# On la version pre-entrainee de camemBERT 'base'
model = CamembertForSequenceClassification.from_pretrained('camembert/camembert-base', num_labels = 10).cuda()

df_dev = None

#add layer to model to predict 10 classes instead of 1
# model.classifier = torch.nn.Linear(768, 10).cuda()    # ! perso j ai pas ca

Some weights of the model checkpoint at camembert/camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert/camembert-base and are n

In [8]:
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5, # Learning Rate
                  eps = 1e-8) # Epsilon
epochs = 1

In [9]:
# On met le modele sur le GPU
device = torch.device('cuda',0)
 
# Pour enregistrer les stats a chaque epoque
training_stats = []
 
# Boucle d'entrainement
for epoch in range(0, epochs):
     
    print("")
    print(f'########## Epoch {epoch+1} / {epochs} ##########')
    print('Training...')
 
 
    # On initialise la loss pour cette epoque
    total_train_loss = 0
    total_elem = 0
    # On met le modele en mode 'training'
    # Dans ce mode certaines couches du modele agissent differement
    model.train()    
 
    # Pour chaque batch
    for step, (input_id, mask, rate) in enumerate(tqdm(train_dataloader)):
 
        # On fait un print chaque 10 batchs
        # if step % 10 == 0 and not step == 0:
        #     print(f'  Batch {step} of {len(train_dataloader)}.')
         
        # On recupere les donnees du batch
        input_id,attention_mask, rate  = input_id.to(device), mask.to(device), rate.to(device)
 
        # On met le gradient a 0
        model.zero_grad()     
        # On passe la donnee au model et on recupere la loss et le logits (sortie avant fonction d'activation)
        loss, logits = model(input_id, 
                            token_type_ids=None, 
                            attention_mask=attention_mask, 
                            labels=rate,
                            return_dict=False)
        
        # On incremente la loss totale
        # .item() donne la valeur numerique de la loss
        total_train_loss += loss.item()
        total_elem += len(rate)
        # Backpropagtion
        loss.backward()
        # loss.backward(retain_graph=True)
        # On actualise les parametrer grace a l'optimizer
        optimizer.step()
        if step>=100:
            break
 
    # On calcule la  loss moyenne sur toute l'epoque
    # avg_train_loss = total_train_loss / len(train_dataloader)  # ! len train loader ca donne le nb de batch normalement
    # avg_train_loss = total_train_loss / len(train_dataloader.dataset)   # ! .dataset ca donne le nb de donnees au total
    avg_train_loss = total_train_loss / total_elem   # ! juste pour debug vu que je fais pas passer tt le dataset
 
    print("")
    print("  Average training loss: {0:.3f}".format(avg_train_loss))
     
    # Enregistrement des stats de l'epoque
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
        }
    )
 
print("Model saved!")
# perso je save le model entier
# torch.save(model.state_dict(), "./rates.pt")
# =>
torch.save(model, "model_train_untouched_comments.pt")


########## Epoch 1 / 1 ##########
Training...


  0%|          | 100/83246 [00:27<6:22:49,  3.62it/s]



  Average training loss: 0.281
Model saved!


In [6]:
# model = CamembertForSequenceClassification.from_pretrained('camembert/camembert-base-oscar-4gb', num_labels = 10).cuda()
model = torch.load("model.pt").cuda()   # ! perso je load les modeles comme ca
# model.load_state_dict(torch.load("./rates.pt"))
model.eval()
def preprocess(raw_reviews, rates=None):
    encoded_batch = TOKENIZER.batch_encode_plus(raw_reviews,
                                                add_special_tokens=True,
                                                padding=True,
                                                truncation=True,
                                                max_length=512, # ! meme raison qu en haut
                                                return_attention_mask = True,
                                                return_tensors = 'pt')
    if rates:
        rates = torch.tensor(rates)
        return encoded_batch['input_ids'], encoded_batch['attention_mask'], rates
    return encoded_batch['input_ids'], encoded_batch['attention_mask']
 
def predict(reviews, model=model):
    with torch.no_grad():
        input_ids, attention_mask = preprocess(reviews)
        input_ids,attention_mask = input_ids.cuda(), attention_mask.cuda()  # ! pour tt mettre sur cuda pour eviter les pb de devices different
        retour = model(input_ids, attention_mask=attention_mask)
        return torch.argmax(retour[0], dim=1).cuda() # ! dim 1 plutot non?
 
 
def evaluate(model, reviews, rates):
    predictions = predict(reviews, model).cpu()
    print(predictions)
    print(metrics.f1_score(rates.cpu(), predictions.cpu(), average='weighted', zero_division=0))
    seaborn.heatmap(metrics.confusion_matrix(rates.cpu(), predictions.cpu()))

# predictions = predict(comments, model)
# print(predictions)
evaluate(model, comments[0:20], rates[0:20])

OutOfMemoryError: CUDA out of memory. Tried to allocate 11.72 GiB (GPU 0; 10.00 GiB total capacity; 9.21 GiB already allocated; 0 bytes free; 9.25 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF