In [80]:
"""
https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb
"""

import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
from transformers import CamembertModel, CamembertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
#from transformers import BertTokenizer, BertModel, BertConfig

import os

tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
camembert = CamembertModel.from_pretrained("camembert-base")


In [81]:
os.listdir('./train')

['Train.csv']

In [82]:
df=pd.read_csv('./translated.csv')
print(len(df))
display(df.head())
df=df.drop(['Unnamed: 0'],axis=1)
display(df.head())

2042


Unnamed: 0.1,Unnamed: 0,text,list
0,0,Je m'en fiche si certaines personnes ont voté ...,1
1,1,J'ai vu ce film par hasard sur la petite boîte...,1
2,2,Le pire film que j'ai vu depuis un moment. Oua...,0
3,3,Je ne sais pas ce que c'est à propos de ce fil...,1
4,4,J'adore ce film! Je pense que je l'ai déjà vu ...,1


Unnamed: 0,text,list
0,Je m'en fiche si certaines personnes ont voté ...,1
1,J'ai vu ce film par hasard sur la petite boîte...,1
2,Le pire film que j'ai vu depuis un moment. Oua...,0
3,Je ne sais pas ce que c'est à propos de ce fil...,1
4,J'adore ce film! Je pense que je l'ai déjà vu ...,1


In [83]:


new_df=df

df_target=pd.DataFrame(new_df['list'])
df_target['toto']=0
for i in range(len(df_target)):
    if df_target['list'].loc[i]==0:
        df_target['toto'].loc[i]=1
new_df1=new_df.drop(['list'],axis=1)
new_df2=pd.DataFrame(df_target)

new_df3=pd.concat([new_df1,new_df2],axis = 1)
df=new_df3

df['list'] = df[df.columns[1:]].values.tolist()
new_df = df[['text', 'list']].copy()
display(new_df.head())

Unnamed: 0,text,list
0,Je m'en fiche si certaines personnes ont voté ...,"[1, 0]"
1,J'ai vu ce film par hasard sur la petite boîte...,"[1, 0]"
2,Le pire film que j'ai vu depuis un moment. Oua...,"[0, 1]"
3,Je ne sais pas ce que c'est à propos de ce fil...,"[1, 0]"
4,J'adore ce film! Je pense que je l'ai déjà vu ...,"[1, 0]"


In [84]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 350
TRAIN_BATCH_SIZE = 1
VALID_BATCH_SIZE = 1
EPOCHS = 2
LEARNING_RATE = 1e-05


Now, let’s turn our labels and encodings into a Dataset object. In PyTorch, this is done by subclassing a torch.utils.data.Dataset object and implementing __len__ and __getitem__. In TensorFlow, we pass our input encodings and labels to the from_tensor_slices constructor method. We put the data in this format so that the data can be easily batched such that each key in the batch encoding corresponds to a named parameter of the forward() method of the model we will train.


In [85]:
"""
SOURCE : https://pytorch.org/docs/stable/_modules/torch/utils/data/dataset.html#Dataset
On subclass la classe Dataset de Pytorch.

SOURCE :https://huggingface.co/transformers/_modules/transformers/tokenization_utils_base.html#PreTrainedTokenizerBase.batch_encode_plus
encoded_inputs = self.encode_plus(
            text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            return_tensors=return_tensors,
            **kwargs,
               )

Retourne un dictionnaire de tensors attendu par le modele
        
"""
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [86]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (2042, 2)
TRAIN Dataset: (1634, 2)
TEST Dataset: (408, 2)


In [87]:
"""
On fait des batchs, et on shuffle : https://pytorch.org/docs/stable/data.html

DataLoader(dataset, batch_size=1, shuffle=False, sampler=None,
           batch_sampler=None, num_workers=0, collate_fn=None,
           pin_memory=False, drop_last=False, timeout=0,
           worker_init_fn=None, *, prefetch_factor=2,
           persistent_workers=False)
Dataloader

    Dataloader is used to for creating training and validation dataloader that load data to the neural network in a defined manner. This is needed because all the data from the dataset cannot be loaded to the memory at once, hence the amount of dataloaded to the memory and then passed to the neural network needs to be controlled.
    This control is achieved using the parameters such as batch_size and max_len.
    Training and Validation dataloaders are used in the training and validation part of the flow respectively


"""

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [88]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 
"""
We will be creating a neural network with the BERTClass.
This network will have the Bert model. Follwed by a Droput and Linear Layer. They are added for the purpose of Regulariaztion and Classification respectively.
In the forward loop, there are 2 output from the BertModel layer.
The second output output_1 or called the pooled output is passed to the Drop Out layer and the subsequent output is given to the Linear layer.
Keep note the number of dimensions for Linear Layer is 6 because that is the total number of categories in which we are looking to classify our model.
The data will be fed to the BertClass as defined in the dataset.
Final layer outputs is what will be used to calcuate the loss and to determine the accuracy of models prediction.
We will initiate an instance of the network called model. This instance will be used for training and then to save the final trained model for future inference.
the batch encoding corresponds to a named parameter of the forward() method of the model we will train.

"""


class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = CamembertModel.from_pretrained("camembert-base")
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 2) #2 = binary classification
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()




In [89]:
"""
On définit la loss function et l'optimizer

It is to be noted that the overall mechanisms for a multiclass and multilabel problems are similar, except for few differences namely:
Loss function is designed to evaluate all the probability of categories individually rather than as compared to other categories. 
Hence the use of BCE rather than Cross Entropy when defining loss.
Sigmoid of the outputs calcuated to rather than Softmax. Again for the reasons defined in the previous point
The accuracy metrics and F1 scores used from sklearn package as compared to direct comparison of expected vs predicted


"""

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)


In [90]:
"""
The dataloader passes data to the model based on the batch size.
Subsequent output from the model and the actual category are compared to calculate the loss.
Loss value is used to optimize the weights of the neurons in the network.
After every 100 steps the loss value is printed in the console.

"""
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids']
        mask = data['mask']
        token_type_ids = data['token_type_ids']
        targets = data['targets']
        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%100==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


In [91]:
"""
On lance l'entrainement !

"""

for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.7279871702194214
Epoch: 0, Loss:  0.60594642162323
Epoch: 0, Loss:  0.6041982173919678
Epoch: 0, Loss:  0.3362402319908142
Epoch: 0, Loss:  0.310707151889801
Epoch: 0, Loss:  0.2229984700679779
Epoch: 0, Loss:  1.53865385055542
Epoch: 0, Loss:  0.17237332463264465
Epoch: 0, Loss:  0.13859540224075317
Epoch: 0, Loss:  0.7772263288497925
Epoch: 0, Loss:  0.4765016734600067
Epoch: 0, Loss:  0.14793190360069275
Epoch: 0, Loss:  0.3320745825767517
Epoch: 0, Loss:  0.08441752940416336
Epoch: 0, Loss:  0.08091908693313599
Epoch: 0, Loss:  0.07366960495710373
Epoch: 0, Loss:  0.0666135847568512
Epoch: 1, Loss:  0.11256740987300873
Epoch: 1, Loss:  0.45494163036346436
Epoch: 1, Loss:  0.06847839057445526
Epoch: 1, Loss:  0.05006334185600281
Epoch: 1, Loss:  0.05995655804872513
Epoch: 1, Loss:  0.05453237146139145
Epoch: 1, Loss:  0.044343702495098114
Epoch: 1, Loss:  0.06448893994092941
Epoch: 1, Loss:  0.040607936680316925
Epoch: 1, Loss:  0.033368296921253204
Epoch: 1, Loss

In [92]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids']
            mask = data['mask']
            token_type_ids = data['token_type_ids']
            targets = data['targets']
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [93]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.8970588235294118
F1 Score (Micro) = 0.8970588235294118
F1 Score (Macro) = 0.8956140350877193
Accuracy Score = 0.8970588235294118
F1 Score (Micro) = 0.8970588235294118
F1 Score (Macro) = 0.8956140350877193


In [102]:
torch.save(model.state_dict())



TypeError: save() missing 1 required positional argument: 'f'

In [105]:
"""
torch.save(model.state_dict(), f'BERT_ft_epoch{epoch}.model')

model.load_state_dict(torch.load(f'BERT_ft_epoch1.model'))
model.eval()
output=model(...)

"""


"\ntorch.save(model.state_dict(), f'BERT_ft_epoch{epoch}.model')\n\nmodel.load_state_dict(torch.load(f'BERT_ft_epoch1.model'))\nmodel.eval()\noutput=model(...)\n\n"