# 1. Setup

## 1.1 Imports

In [1]:
#imports
import torch
from torch.utils.data import Dataset,DataLoader,Subset
import pandas as pd
from collections import namedtuple
from transformers import BertTokenizer
import numpy as np
from transformers import BertForSequenceClassification,BertConfig
from torch.optim import Adam
from torch.nn import CrossEntropyLoss    
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix

import matplotlib.pyplot as plt



## 1.2 Cuda

use cuda if available

In [2]:
#define device
device= torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

## 1.3 Parameters

This section holds all tuneable parameters of this notebook seperated into different categories according to target area

### 1.3.1 Model Parameters

### 1.3.2 Training Parameters

In [3]:
#learning rate and number of epochs
lr=2e-5
number_of_epochs= 2
batchsize=6

# 2. Datasets and Transforms

## 2.1 Original Dataset

Create a class that will hold the initial dataset, directly derived from the translated original data

In [4]:
class Blogpost_dataset(Dataset):
    """
    this class serves as a custom dataset for the given blog posts
    """

    def __init__(self,src_path,transform=None):
        """
        task: inits the dataset and sets optional transforms \n
        parameters: src_path(str(path to the underlying source data)), transform(optional transformation that may be applied to each sample ) \n
        return value:
        """

        self.src_df = pd.read_csv(src_path,sep="\t")
        self.transform= transform
        self.Dataset_item= namedtuple("Dataset_item",["text","label"]) 

    def __len__(self):
        """
        task: return the length of the underlying source DataFrame \n
        parameters:\n
        return value:
        """

        return len(self.src_df)

    def __getitem__(self, index):
        """
        task: return the item at the given index \n
        parameters:\n
        return value:
        """

        #transform the index to a list in case it is a tensor
        if torch.is_tensor(index):
            index= index.tolist()

        #fetch item from source df
        item=self.Dataset_item(self.src_df.iloc[index]["text"],self.src_df.iloc[index]["label"])

        #apply transform if available
        if self.transform:
            item= self.transform(item)

        return item


## 2.2 BERT compatible Dataset

Create a class that will serve as a dataset for the data in BERT compatible, already tokenized form

In [5]:
class Bert_compatible_dataset(Dataset):
    """
    this class holds a dataset that was transformed using the BertTransform
    """

    def __init__(self,input_ids_list,attention_mask_list,token_type_ids_list,label_list,text_list=None,transform=None):
        """
        task: create a dataset from the given lists of tokens. if text_list is given that column will hold the original text \n
        parameters:input_ids_list(list(token id)), attention_mask_list(list(attention mask)),token_type_ids_list(list(token_type_id)) ,label_list(list(label)), text_list(list(optional: orignal text))\n
        return value:
        """

        #create a class that will hold one element of data
        self.Dataset_item= namedtuple("Bert_dataset_item",["input_ids","attention_mask","token_type_ids","label","text"])

        #this list will store all data 
        self.data= []
        
        if text_list:
            #assert that all of those lists are of same length
            assert len(input_ids_list)==len(attention_mask_list)==len(token_type_ids_list)==len(label_list)==len(text_list),"length of lists has to match"
        
            #zip the lists together
            for input_ids,attention_mask,token_type_ids,label,text in zip(input_ids_list,attention_mask_list,token_type_ids_list,label_list,text_list):

                #create a namedtuple storing that data and append it to self.data
                item= self.Dataset_item(input_ids,attention_mask,token_type_ids,label,text)
                self.data.append(item)

        else:
            assert len(input_ids_list)==len(attention_mask_list)==len(token_type_ids_list)==len(label_list),"length of lists has to match"

            #zip the lists together
            for input_ids,attention_mask,token_type_ids,label in zip(input_ids_list,attention_mask_list,token_type_ids_list,label_list):

                #create a namedtuple storing that data and append it to self.data
                item= self.Dataset_item(input_ids,attention_mask,token_type_ids,label,0)
                self.data.append(item)

        self.transform = transform


    def __len__(self):
        """
        task: return the length of self.data field \n
        parameters:\n
        return value:
        """

        return len(self.data)

    def __getitem__(self, index):
        """
        task: return the item at index \n
        parameters:\n
        return value:
        """

        #transform the index to a list in case it is a tensor
        if torch.is_tensor(index):
            index= index.tolist()

        data=self.data[index]

        if self.transform:
            data= self.transform(data)

        return data


## 2.3 Transforms

Create a Transform that will transform bert compatible data into tensors on the given device

In [6]:
class BertToTensor(object):
    """
    This class serves as a transform to transfer the elements of namedtuple into tensors on the given device
    """

    def __init__(self,device="cpu"):
        """
        task:  \n
        parameters:\n
        return value:
        """

        self.device=device

    def __call__(self,named_tuple):
        """
        task: transform the elements of the named tuple into tensors and ship them over to self.device \n
        parameters: named_tuple("Bert_dataset_item",["input_ids","attention_mask","token_type_ids","label","text"]) \n
        return value: transformed elements
        """

        #unpack the named tuple
        input_ids,attention_mask,token_type_ids,label,text = named_tuple

        #transform to tensor
        input_ids= torch.IntTensor(input_ids).to(device=self.device)
        attention_mask= torch.IntTensor(attention_mask).to(device=self.device)
        token_type_ids= torch.IntTensor(token_type_ids).to(device=self.device)
        label= torch.IntTensor(label).to(device=self.device)
        text= torch.IntTensor(text).to(device=self.device)

        return input_ids,attention_mask,token_type_ids,label,text

create a transform that takes the original data as imput and tokenizes it using the BERTTokenizer

In [7]:
class BertTransform(object):
    """
    this class will serve as a transform that tokenizes a given text using the bert tokenizer
    """

    def __init__(self,max_length):
        """
        task: init the transform and creates a bert tokenizer\n
        parameters:\n
        return value:
        """

        self.tokenizer= BertTokenizer.from_pretrained('bert-base-chinese',do_lower_case=True)
        self.max_length= max_length

    def __call__(self,item):
        """
        task: when called transform the given items' text field by applying bertTokenization  \n
        parameters:\n
        return value:
        """

        #transform the 
        transformed_text= self.tokenizer.encode_plus(
            item.text,
            add_special_tokens=True, #adds beginning(CLS)) and end(SEP) tokens of sequence)
            max_length= self.max_length,
            pad_to_max_length=True, # makes the tokenizer fill the token vectors with padding tokens if the sequence is smaller than max_length
            return_attention_mask = True
        )

        Dataset_item=namedtuple("Dataset_item",["text","label"]) 
        return Dataset_item(transformed_text,item.label)

a method that creates an original dataset and transforms it into a bert compatibel one

In [8]:
def transform_original_dataset_2_bert_compatible(src_path,max_length=512,limit=-1,device="cpu"):
    """
    task: use the BertTokenize transform to tokenize the given original Dataset and thus create a dataset that is bert compatible \n
    parameters: src_path(path to original data), max_length(int(max length allowed for transformer, 512 for bert)),limit(number of entries to use) \n
    return value: torch.utils.data.Dataset
    """

    #create BertTransform
    transform=BertTransform(max_length)

    #create the original Blogpost_dataset
    blogpost_ds=Blogpost_dataset(src_path,transform=transform)

    #pull a subset of the dataset if a limit was given
    if limit>0:
        indices= np.random.choice(range(len(blogpost_ds)),size=limit,replace=False) #choose random indices
        blogpost_ds= Subset(blogpost_ds,indices)

    #lists that will stored the transformed/tokenized text
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    #iterate over the dataset with transform and catch the thus transformed texts by putting them into a new dataset
    for transformed_text,label in blogpost_ds:

        #apppend the contents to the corresponding list
        input_ids_list.append(transformed_text['input_ids'])
        token_type_ids_list.append(transformed_text['token_type_ids'])
        attention_mask_list.append(transformed_text['attention_mask'])
        label_list.append([label])

    #return the Bert_compatible_dataset derived from those lists
    return Bert_compatible_dataset(input_ids_list,attention_mask_list,token_type_ids_list,label_list,transform=BertToTensor(device))


## 2.4 Dataset and Dataloader Creation

train dataset/dataloader

In [9]:
#create a bert compatible dataset
bert_train_ds= transform_original_dataset_2_bert_compatible(r"C:\Users\nick\Code\MachineLearning_Projects\Bewerbung_NLP\data\v1\v1\train.tsv",device=device)

#create a corresponding dataloader
bert_train_dl= DataLoader(bert_train_ds,batch_size=batchsize,shuffle=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
for sample in bert_train_ds:
    print(sample)
    break

(tensor([ 101, 2945, 6432, 1305, 1849, 2209, 6206, 6842, 1139, 4767, 3779, 6783,
        1139, 1744, 5299, 5302, 8024, 6821, 3221, 2335, 3249, 6842, 5408, 4060,
        4638, 6825, 7219, 1353, 2418, 8024,  738, 3221,  686, 4518, 4914, 2415,
        1469, 3419, 2229, 7028, 3354, 4638, 3322, 5357,  511, 2190, 4767, 3779,
        3867, 6589, 1744, 2418, 6421, 3221, 1962,  752,  102,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,  

validation dataset/dataloader

In [11]:
#create a bert compatible dataset
bert_valid_ds= transform_original_dataset_2_bert_compatible(r"C:\Users\nick\Code\MachineLearning_Projects\Bewerbung_NLP\data\v1\v1\dev.tsv",device=device)

#create a corresponding dataloader
bert_valid_dl= DataLoader(bert_valid_ds,batch_size=batchsize,shuffle=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# 3. The Model

The model used will be BERT of google

# 3.1 Training

### 3.1.1 Model and optimizer init

use the Bert Config class to configure the Bert model used

In [12]:
bert_config= BertConfig(num_hidden_layers=24)

initialize optimizer and model

In [13]:
#init the model
model= BertForSequenceClassification(bert_config).from_pretrained('bert-base-chinese').to(device)

#use adam optimizer
optimizer= Adam(model.parameters(),lr=lr)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### 3.1.2 Loss Function

Given the binary classification problem, the cross entropy loss function comes in quite handy

In [14]:
#use Crossentropy loss
criterion = CrossEntropyLoss()


### 3.1.3 Training loop

In [20]:
def train_model(model,dataloader,loss_func,optimizer,number_of_epochs,calc_ewma=False,**ewma_kwargs):
    """
    task: train the given model using the given dataloader for number_of_epochs \n
    parameters: model(torch.nn.Module subclass),loss_func(torch.nn loss function), optimizer(torch.optim optimizer), number_of_epochs(int(number of epochs to train)),calc_ewma(bool(true if ewma shall be calculated)),ewma_kwargs(kwargs that will be passed to ewma calculation) \n
    return value: None, but it is a generator yielding the loss of each batch iteration together with the number of epochs and batch id
    """

    if calc_ewma:
        loss_history=pd.DataFrame(columns=["batch_number","batch_loss"])
    current_ewma=None
    
    for epoch_id in range(number_of_epochs):

        for batch_id,batch in enumerate(dataloader,start=1):

            #unpack the batch
            input_ids_list,attention_mask_list,token_type_ids_list,label_list,*_=batch

            #zero gradients
            optimizer.zero_grad()

            #forward_pass
            output=model(input_ids_list,attention_mask_list,token_type_ids_list)
            #something is wrong with the model inputs here, prob cuz they are no tensors

            #compute loss
            batch_loss=criterion(output.logits,label_list.flatten().to(dtype=torch.long))
            batch_loss.backward()

            #optimize
            optimizer.step()
            
            #calc ewma if desired
            if calc_ewma:
                loss_history=loss_history.append({"batch_number": epoch_id*len(dataloader)+batch_id,
                                                  "batch_loss": batch_loss}
                                                  ,ignore_index=True)
                current_ewma=loss_history.ewm(**ewma_kwargs).mean().iloc[-1]["batch_loss"]

            #yield training info about current batch
            yield epoch_id,batch_id, batch_loss,current_ewma



In [40]:
%matplotlib notebook

#create a plot for training info
x=[0]
y=[0]
ewma_list=[None]
plt.ion()
info_fig,axes= plt.subplots(figsize=(10,8))
batch_plot,= axes.plot(x,y)
ewma_plot, =axes.plot(x,ewma_list, "red")
plt.title("training performance")
plt.xlabel("batch iteration")
plt.ylabel("cross entropy batch loss")
plt.legend(["batch_loss","EWMA of batch_loss"])
plt.gcf().text(0.5,0,"0.00% of training done",fontsize=14)


#execute the training loop and print the training info
for epoch_id,batch_id,batch_loss,ewma in train_model(model,bert_train_dl,criterion,optimizer,number_of_epochs,calc_ewma=True,halflife=10):
    
    #print training info
    print(f"[epoch {epoch_id} _ batch {batch_id}]: loss= {batch_loss}")
    
    #calculate the percentage of training done and add it as text
    batch_number=epoch_id*len(bert_train_dl)+batch_id
    percentage_done= round(batch_number/(number_of_epochs*len(bert_train_dl))*100,2)
    info_fig.texts[0].set_text(f"{percentage_done:.2f}% of training done")
    
    #update plot with current training info
    x.append(batch_number)
    y.append(batch_loss)
    ewma_list.append(ewma)
    
    batch_plot.set_data(x,y)
    ewma_plot.set_data(x,ewma_list)
    axes.relim()
    axes.autoscale_view(True,True,True)
    
    info_fig.canvas.draw()
    info_fig.canvas.flush_events()
    

<IPython.core.display.Javascript object>

[epoch 0 _ batch 1]: loss= 0.005583286751061678
[epoch 0 _ batch 2]: loss= 0.04118039831519127
[epoch 0 _ batch 3]: loss= 0.10945171862840652
[epoch 0 _ batch 4]: loss= 1.3324095010757446
[epoch 0 _ batch 5]: loss= 0.0021021098364144564
[epoch 0 _ batch 6]: loss= 0.008531599305570126
[epoch 0 _ batch 7]: loss= 0.007853418588638306
[epoch 0 _ batch 8]: loss= 0.006751642096787691
[epoch 0 _ batch 9]: loss= 0.022362271323800087
[epoch 0 _ batch 10]: loss= 0.3725571632385254
[epoch 0 _ batch 11]: loss= 0.0017289860406890512
[epoch 0 _ batch 12]: loss= 0.0034068378154188395
[epoch 0 _ batch 13]: loss= 0.0162308681756258
[epoch 0 _ batch 14]: loss= 0.0025815845001488924
[epoch 0 _ batch 15]: loss= 0.005829046946018934
[epoch 0 _ batch 16]: loss= 0.01995115913450718
[epoch 0 _ batch 17]: loss= 0.006783560384064913
[epoch 0 _ batch 18]: loss= 0.003411830635741353
[epoch 0 _ batch 19]: loss= 0.026374712586402893
[epoch 0 _ batch 20]: loss= 0.003608535975217819
[epoch 0 _ batch 21]: loss= 0.0082

[epoch 0 _ batch 168]: loss= 0.10800296068191528
[epoch 0 _ batch 169]: loss= 0.0045559704303741455
[epoch 0 _ batch 170]: loss= 0.0652562603354454
[epoch 0 _ batch 171]: loss= 0.004849420860409737
[epoch 0 _ batch 172]: loss= 0.02703392691910267
[epoch 0 _ batch 173]: loss= 0.003003455698490143
[epoch 0 _ batch 174]: loss= 0.19636698067188263
[epoch 0 _ batch 175]: loss= 0.025662237778306007
[epoch 0 _ batch 176]: loss= 0.011823594570159912
[epoch 0 _ batch 177]: loss= 0.004989064298570156
[epoch 0 _ batch 178]: loss= 0.004908756818622351
[epoch 0 _ batch 179]: loss= 0.04018441215157509
[epoch 0 _ batch 180]: loss= 0.0020175063982605934
[epoch 0 _ batch 181]: loss= 0.02964060567319393
[epoch 0 _ batch 182]: loss= 0.0037343036383390427
[epoch 0 _ batch 183]: loss= 0.0023673090618103743
[epoch 0 _ batch 184]: loss= 0.007401364389806986
[epoch 0 _ batch 185]: loss= 0.018075866624712944
[epoch 0 _ batch 186]: loss= 0.1969555765390396
[epoch 0 _ batch 187]: loss= 0.3283331096172333
[epoch 

[epoch 1 _ batch 82]: loss= 0.003186748595908284
[epoch 1 _ batch 83]: loss= 0.0007781866588629782
[epoch 1 _ batch 84]: loss= 0.001460746512748301
[epoch 1 _ batch 85]: loss= 0.0008868161239661276
[epoch 1 _ batch 86]: loss= 0.0016824029153212905
[epoch 1 _ batch 87]: loss= 0.0018507641507312655
[epoch 1 _ batch 88]: loss= 0.0008363753440789878
[epoch 1 _ batch 89]: loss= 0.002048071939498186
[epoch 1 _ batch 90]: loss= 0.0009308832813985646
[epoch 1 _ batch 91]: loss= 0.0015348136657848954
[epoch 1 _ batch 92]: loss= 0.0005531318602152169
[epoch 1 _ batch 93]: loss= 0.0020506465807557106
[epoch 1 _ batch 94]: loss= 0.003216954180970788
[epoch 1 _ batch 95]: loss= 0.0007370900711975992
[epoch 1 _ batch 96]: loss= 0.0022358763962984085
[epoch 1 _ batch 97]: loss= 0.0009804737055674195
[epoch 1 _ batch 98]: loss= 0.000901926017832011
[epoch 1 _ batch 99]: loss= 0.0008759435149841011
[epoch 1 _ batch 100]: loss= 0.0014943735441192985
[epoch 1 _ batch 101]: loss= 0.0012621487258002162
[ep

[epoch 1 _ batch 246]: loss= 0.015332572162151337
[epoch 1 _ batch 247]: loss= 0.011562380008399487
[epoch 1 _ batch 248]: loss= 0.000997320399619639
[epoch 1 _ batch 249]: loss= 0.014135405421257019
[epoch 1 _ batch 250]: loss= 0.019712308421730995
[epoch 1 _ batch 251]: loss= 0.0006394482334144413
[epoch 1 _ batch 252]: loss= 0.8873839974403381


## 3.2 Model evaluation

In [42]:
class Metric_composer():
    """
    this class is used to evaluate model predictions on multiple metrics simultaneously
    """
    
    def __init__(self,metrics_dict):
        """
        task: init composer and set its metrics field \n
        parameters:metrics_dict(sting(metric name): callable(metric func))\n
        return value:
        """
        
        self.metrics= metrics_dict
        
    def evaluate(self,y_truth,y_pred):
        """
        task: use the metrics in the self.metrics field to evaluate the given y values \n
        parameters: y_truth(ground truth labels), y_pred(predicted labels) \n
        return value: dict(string(name):evaluation of corresponding function)
        """
    
        evaluation={}
        for name,metric_func in self.metrics.items():
            
            evaluation[name]= metric_func(y_truth,y_pred)
            
        return evaluation
    

In [43]:
def eval_model(model,dataloader,metric_composer_obj):
    """
    task: evaluate the model on the given dataloader using the loss_func \n
    parameters: model(torch.nn.Module subclass), dataloader(torch.utils.data.DataLoader instance), metric_composer_obj(instance of Metric_composer) \n
    return value:
    """
    
    y_pred=[]
    y_truth=[]
    with torch.no_grad():
        
        for batch in dataloader:
            
            #unpack the batch
            input_ids_list,attention_mask_list,token_type_ids_list,label_list,*_=batch
            
            #evalaute batch with model
            output=model(input_ids_list,attention_mask_list,token_type_ids_list)
            
            #model outputs a prob distribution over the 2 classes, so get argmax to retreive predicted label
            predicted_labels=torch.argmax(output.logits, axis=1).flatten()
    
            #append both the predicted labels and the true labels 
            y_pred+=predicted_labels.tolist()
            y_truth+=label_list.flatten().tolist()
            
            
    return metric_composer_obj.evaluate(y_truth,y_pred)

compose metrics

In [44]:
#compose different performance metrics
metric_composer_obj= Metric_composer({
    "accuracy": accuracy_score,
    "precision": precision_score,
    "confusion_matrix": confusion_matrix
})

----accuracy----
0.656084656084656
---------------------
----precision----
0.7058823529411765
---------------------
----confusion_matrix----
[[76 20]
 [45 48]]
---------------------


evaluate performance on train data

In [45]:
#iterate over all evaluations and print them
for metric_name, metric_evaluation in eval_model(model,bert_train_dl,metric_composer_obj).items():
    print(f"----{metric_name}----")
    print(metric_evaluation)
    print("---------------------")

----accuracy----
0.9953703703703703
---------------------
----precision----
0.9934640522875817
---------------------
----confusion_matrix----
[[745   5]
 [  2 760]]
---------------------


evaluate performance on validation data

In [46]:
#iterate over all evaluations and print them
for metric_name, metric_evaluation in eval_model(model,bert_valid_dl,metric_composer_obj).items():
    print(f"----{metric_name}----")
    print(metric_evaluation)
    print("---------------------")

----accuracy----
0.656084656084656
---------------------
----precision----
0.7058823529411765
---------------------
----confusion_matrix----
[[76 20]
 [45 48]]
---------------------
