# 1. Setup

## 1.1 Imports

In [1]:
#imports
import torch
from torch.utils.data import Dataset,DataLoader,Subset
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR
from torch.nn import CrossEntropyLoss

from transformers import DistilBertForSequenceClassification,DistilBertConfig,DistilBertTokenizer

from sklearn.metrics import accuracy_score,precision_score,confusion_matrix

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import namedtuple

import os
import json
import simplejson
import warnings
from pathlib import Path
import pathlib



## 1.2 Cuda

use cuda if available

In [2]:
#define device
device= torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

## 1.3 Parameters

This section holds all tuneable parameters of this notebook seperated into different categories according to target area

In [3]:
notebook_parameters={"model_parameters":None,
                     "training_parameters":None,
                     "data_parameters":None,
                     "storage_parameters":None}

### 1.3.1 Model Parameters

dict that will be passed to BertConfig to configure the model used

In [4]:
notebook_parameters["model_parameters"]={"n_layers":16,
                                         "n_heads":12,
                                         "seq_classif_dropout": 0.1,
                                         "dropout":0.2,
                                         "attention_dropout":0.2
                                        }

### 1.3.2 Training Parameters

In [5]:
notebook_parameters["training_parameters"]={"optimizer": Adam,
                                            "optimizer_kwargs": {"lr": 1e-4, "betas":(0.9,0.999)},
                                            "learningrate_scheduler": ExponentialLR,
                                            "update_lr_every_n_batches": 100,
                                            "learningrate_scheduler_kwargs": {"gamma": 0.9},
                                            "number_of_epochs": 6,
                                            "loss_function": CrossEntropyLoss(),
                                            "validate_every_n_batches": 252,
                                            "calc_ewma":True
                                            }

### 1.3.3 Data parameters

In [6]:
notebook_parameters["data_parameters"]={"batchsize":8,
                                        "path_to_train_data":r"C:\path\to\train.tsv",
                                        "train_limit": -1,
                                        "path_to_valid_data":r"C:\path\to\dev.tsv",
                                        "valid_limit": -1,
                                        }

### 1.3.4 Storage parameters

In [7]:
notebook_parameters["storage_parameters"]= {"path_to_model_storage": Path(r"C:\path\to\trainings\trained_models")
                                           }

# 2. Datasets and Transforms

## 2.1 Original Dataset

Create a class that will hold the initial dataset

In [8]:
class Blogpost_dataset(Dataset):
    """
    this class serves as a custom dataset for the given blog posts
    """

    def __init__(self,src_path,transform=None):
        """
        task: inits the dataset and sets optional transforms \n
        parameters: src_path(str(path to the underlying source data)), transform(optional transformation that may be applied to each sample ) \n
        return value:
        """

        self.src_df = pd.read_csv(src_path,sep="\t")
        self.transform= transform
        self.Dataset_item= namedtuple("Dataset_item",["text","label"]) 

    def __len__(self):
        """
        task: return the length of the underlying source DataFrame \n
        parameters:\n
        return value:
        """

        return len(self.src_df)

    def __getitem__(self, index):
        """
        task: return the item at the given index \n
        parameters:\n
        return value:
        """

        #transform the index to a list in case it is a tensor
        if torch.is_tensor(index):
            index= index.tolist()

        #fetch item from source df
        item=self.Dataset_item(self.src_df.iloc[index]["text"],self.src_df.iloc[index]["label"])

        #apply transform if available
        if self.transform:
            item= self.transform(item)

        return item


## 2.2 BERT compatible Dataset

Create a class that will serve as a dataset for the data in BERT compatible, already tokenized form

In [9]:
class Bert_compatible_dataset(Dataset):
    """
    this class holds a dataset that was transformed using the BertTransform
    """

    def __init__(self,input_ids_list,attention_mask_list,label_list,text_list=None,transform=None):
        """
        task: create a dataset from the given lists of tokens. if text_list is given that column will hold the original text \n
        parameters:input_ids_list(list(token id)), attention_mask_list(list(attention mask)) ,label_list(list(label)), text_list(list(optional: orignal text))\n
        return value:
        """

        #create a class that will hold one element of data
        self.Dataset_item= namedtuple("Bert_dataset_item",["input_ids","attention_mask","label","text"])

        #this list will store all data 
        self.data= []
        
        if text_list:
            #assert that all of those lists are of same length
            assert len(input_ids_list)==len(attention_mask_list)==len(label_list)==len(text_list),"length of lists has to match"
        
            #zip the lists together
            for input_ids,attention_mask,label,text in zip(input_ids_list,attention_mask_list,label_list,text_list):

                #create a namedtuple storing that data and append it to self.data
                item= self.Dataset_item(input_ids,attention_mask,label,text)
                self.data.append(item)

        else:
            assert len(input_ids_list)==len(attention_mask_list)==len(label_list),"length of lists has to match"

            #zip the lists together
            for input_ids,attention_mask,label in zip(input_ids_list,attention_mask_list,label_list):

                #create a namedtuple storing that data and append it to self.data
                item= self.Dataset_item(input_ids,attention_mask,label,0)
                self.data.append(item)

        self.transform = transform


    def __len__(self):
        """
        task: return the length of self.data field \n
        parameters:\n
        return value:
        """

        return len(self.data)

    def __getitem__(self, index):
        """
        task: return the item at index \n
        parameters:\n
        return value:
        """

        #transform the index to a list in case it is a tensor
        if torch.is_tensor(index):
            index= index.tolist()

        data=self.data[index]

        if self.transform:
            data= self.transform(data)

        return data


## 2.3 Transforms

Create a Transform that will transform bert compatible data into tensors on the given device

In [10]:
class BertToTensor(object):
    """
    This class serves as a transform to transfer the elements of namedtuple into tensors on the given device
    """

    def __init__(self,device="cpu"):
        """
        task:  \n
        parameters:\n
        return value:
        """

        self.device=device

    def __call__(self,named_tuple):
        """
        task: transform the elements of the named tuple into tensors and ship them over to self.device \n
        parameters: named_tuple("Bert_dataset_item",["input_ids","attention_mask","label","text"]) \n
        return value: transformed elements
        """

        #unpack the named tuple
        input_ids,attention_mask,label,text = named_tuple

        #transform to tensor
        input_ids= torch.IntTensor(input_ids).to(device=self.device)
        attention_mask= torch.IntTensor(attention_mask).to(device=self.device)
        label= torch.IntTensor(label).to(device=self.device)
        text= torch.IntTensor(text).to(device=self.device)

        return input_ids,attention_mask,label,text

create a transform that takes the original data as imput and tokenizes it using the BERTTokenizer

In [11]:
class BertTransform(object):
    """
    this class will serve as a transform that tokenizes a given text using the bert tokenizer
    """

    def __init__(self,max_length):
        """
        task: init the transform and creates a bert tokenizer\n
        parameters:\n
        return value:
        """

        self.tokenizer= DistilBertTokenizer.from_pretrained('distilbert-base-uncased',do_lower_case=True)
        self.max_length= max_length

    def __call__(self,item):
        """
        task: when called transform the given items' text field by applying bertTokenization  \n
        parameters:\n
        return value:
        """

        #transform the items text 
        transformed_text= self.tokenizer.encode_plus(
            item.text,
            add_special_tokens=True, #adds beginning(CLS)) and end(SEP) tokens of sequence)
            max_length= self.max_length,
            pad_to_max_length=True, # makes the tokenizer fill the token vectors with padding tokens if the sequence is smaller than max_length
            return_attention_mask = True
        )

        Dataset_item=namedtuple("Dataset_item",["text","label"]) 
        return Dataset_item(transformed_text,item.label)

a method that creates an original dataset and transforms it into a bert compatibel one

In [12]:
def transform_original_dataset_2_bert_compatible(src_path,max_length=512,limit=-1,device="cpu"):
    """
    task: use the BertTokenize transform to tokenize the given original Dataset and thus create a dataset that is bert compatible \n
    parameters: src_path(path to original data), max_length(int(max length allowed for transformer, 512 for bert)),limit(number of entries to use) \n
    return value: torch.utils.data.Dataset
    """

    #create BertTransform
    transform=BertTransform(max_length)

    #create the original Blogpost_dataset
    blogpost_ds=Blogpost_dataset(src_path,transform=transform)

    #pull a subset of the dataset if a limit was given
    if limit>0:
        indices= np.random.choice(range(len(blogpost_ds)),size=limit,replace=False) #choose random indices
        blogpost_ds= Subset(blogpost_ds,indices)

    #lists that will stored the transformed/tokenized text
    input_ids_list = []
    attention_mask_list = []
    label_list = []

    #iterate over the dataset with transform and catch the thus transformed texts by putting them into a new dataset
    for transformed_text,label in blogpost_ds:

        #apppend the contents to the corresponding list
        input_ids_list.append(transformed_text['input_ids'])
        attention_mask_list.append(transformed_text['attention_mask'])
        label_list.append([label])

    #return the Bert_compatible_dataset derived from those lists
    return Bert_compatible_dataset(input_ids_list,attention_mask_list,label_list,transform=BertToTensor(device))


## 2.4 Dataset and Dataloader Creation

train dataset/dataloader

In [13]:
#create a bert compatible dataset
bert_train_ds= transform_original_dataset_2_bert_compatible(notebook_parameters["data_parameters"]["path_to_train_data"],limit=notebook_parameters["data_parameters"]["train_limit"],device=device)

#create a corresponding dataloader
bert_train_dl= DataLoader(bert_train_ds,batch_size=notebook_parameters["data_parameters"]["batchsize"],shuffle=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


validation dataset/dataloader

In [14]:
#create a bert compatible dataset
bert_valid_ds= transform_original_dataset_2_bert_compatible(notebook_parameters["data_parameters"]["path_to_valid_data"],limit=notebook_parameters["data_parameters"]["valid_limit"],device=device)

#create a corresponding dataloader
bert_valid_dl= DataLoader(bert_valid_ds,batch_size=notebook_parameters["data_parameters"]["batchsize"],shuffle=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# 3. The Model

The model used will be DistilBERT by google. further documentation can be found here: https://huggingface.co/transformers/model_doc/distilbert.html

# 3.1 Training

### 3.1.1 Model and optimizer init

use the DistilBERT Config class to configure the DistilBert model used

In [15]:
bert_config= DistilBertConfig(**notebook_parameters["model_parameters"]) 

initialize optimizer,learningrate scheduler and model

In [16]:
#init the model
model= DistilBertForSequenceClassification(bert_config).from_pretrained('distilbert-base-uncased').to(device)

#init optimizer and learning rate scheduler
optimizer= notebook_parameters["training_parameters"]["optimizer"](model.parameters(),**notebook_parameters["training_parameters"]["optimizer_kwargs"]) 
lr_scheduler= notebook_parameters["training_parameters"]["learningrate_scheduler"](optimizer,**notebook_parameters["training_parameters"]["learningrate_scheduler_kwargs"])

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

### 3.1.2 Loss Function

Given the binary classification problem, the cross entropy loss function comes in quite handy and will be used for this notebook

In [17]:
criterion = notebook_parameters["training_parameters"]["loss_function"]


### 3.1.3 Training loop

In [18]:
def train_model(model,train_dataloader,valid_dataloader,loss_function,optimizer,number_of_epochs,learningrate_scheduler=None,update_lr_every_n_batches=-1, validate_every_n_batches=None,calc_ewma=False,**ewma_kwargs):
    """
    task: train the given model and validate it every n epochs on the validation dataset \n
    parameters:model(torch.nn.Module subclass),loss_func(torch.nn loss function), optimizer(torch.optim optimizer), number_of_epochs(int(number of epochs to train)),learningrate_scheduler(instance of torch.optim.lr_scheduler subclass),validate_every_n_batches(int),calc_ewma(bool(true if ewma shall be calculated)),ewma_kwargs(kwargs that will be passed to ewma calculation) \n
    return value:
    """
    
    #put model into training mode
    model.train()
    
    batch_number=1
    
    if calc_ewma:
        loss_history=pd.DataFrame(columns=["batch_number","train_batch_loss"])
    current_ewma=None
    
    for epoch_id in range(number_of_epochs):
        
        for train_batch_id,train_batch in enumerate(train_dataloader,start=1):
        
            #unpack the batch
            input_ids_list,attention_mask_list,label_list,*_=train_batch

            #zero gradients
            optimizer.zero_grad()

            #forward_pass
            output=model(input_ids_list,attention_mask_list)

            #compute loss
            train_batch_loss=criterion(output.logits,label_list.flatten().to(dtype=torch.long))
            train_batch_loss.backward()

            #optimize
            optimizer.step()

            #check if validation necessary, if so do it
            average_validation_loss=None
            if validate_every_n_batches and batch_number%validate_every_n_batches==0:
                
                #put model into evaluation mode
                model.eval()
                
                validation_losses=[]
                with torch.no_grad():
                    for valid_batch in valid_dataloader:
                        
                        #unpack batch
                        input_ids_list,attention_mask_list,label_list,*_=valid_batch
                        
                        #evaluate the inputs with model
                        output=model(input_ids_list,attention_mask_list)
                        
                        #calc validation batch loss
                        valid_batch_loss=criterion(output.logits,label_list.flatten().to(dtype=torch.long))
                        
                        #append the loss to all validation_losses
                        validation_losses.append(valid_batch_loss)
            
                #calculate the average loss
                average_validation_loss=sum(validation_losses)/len(validation_losses)
            
                #reset model to train mode
                model.train()
            
            #calc ewma if desired
            if calc_ewma:
                loss_history=loss_history.append({"batch_number": batch_number,
                                                  "train_batch_loss": train_batch_loss}
                                                  ,ignore_index=True)
                current_ewma=loss_history.ewm(**ewma_kwargs).mean().iloc[-1]["train_batch_loss"]

            #get last lr if available
            last_lr= None if not learningrate_scheduler else learningrate_scheduler.get_last_lr()[-1]
            
            #execute step of lr scheduler if available
            if learningrate_scheduler and update_lr_every_n_batches>0 and batch_number%update_lr_every_n_batches==0:
                learningrate_scheduler.step()
            
            #increase batch number
            batch_number+=1
            
            #yield training info about current batch
            yield epoch_id,train_batch_id, train_batch_loss,average_validation_loss,current_ewma,last_lr
            

use that train_model method and create a plot about the generated training info

In [19]:
%matplotlib notebook

#lists for batch_loss
batch_number_list=[None]
batch_train_loss_list=[None]

#lists for average validation loss
batch_number_avg_valid_loss_list=[None]
avg_valid_loss_list=[None]

#list for ewma
ewma_list=[None]

#list for lr
lr_list=[None]

#create figure and seperate plots for batch_loss,average validation_loss, ewma and lr 
plt.ion()
info_fig,axes= plt.subplots(figsize=(10,8))
batch_train_plot, = axes.plot(batch_number_list,batch_train_loss_list,"blue",label="train batch loss")
average_valid_plot, =axes.plot(batch_number_list,avg_valid_loss_list, "red",label="average validation loss")
ewma_plot, =axes.plot(batch_number_list,ewma_list,"orange",label="ewma of train batch loss")

#create twin axes to allow second y axis
twin_axes= axes.twinx()
lr_plot, = twin_axes.plot(batch_number_list,lr_list,"green",label="learning rate")

#set figure attributes like title,labels and legend
plt.title("training performance")
axes.set_xlabel("batch iteration")
axes.set_ylabel("cross entropy batch loss")
twin_axes.set_ylabel("learning rate")

#create legend
plots= [batch_train_plot,average_valid_plot,ewma_plot,lr_plot]
plt.legend(plots,[plot.get_label() for plot in plots])

#position text to display training progress
plt.gcf().text(0.4,0.02,"0.00% of training done",fontsize=14)

batch_number=1

#execute the training loop and print the training info
for epoch_id,train_batch_id,train_batch_loss,average_valid_loss,ewma,last_lr in train_model(model,bert_train_dl,bert_valid_dl,criterion,optimizer,notebook_parameters["training_parameters"]["number_of_epochs"],lr_scheduler,update_lr_every_n_batches=notebook_parameters["training_parameters"]["update_lr_every_n_batches"] ,validate_every_n_batches=notebook_parameters["training_parameters"]["validate_every_n_batches"],calc_ewma=notebook_parameters["training_parameters"]["calc_ewma"],halflife=10):
    
    #print training info
    print(f"[epoch {epoch_id} _ batch {train_batch_id}]: loss= {train_batch_loss}")

    #calculate the percentage of training done and add it as text
    percentage_done= round(batch_number/(notebook_parameters["training_parameters"]["number_of_epochs"]*len(bert_train_dl))*100,2)
    info_fig.texts[0].set_text(f"{percentage_done:.2f}% of training done")
    
    #add the new data to corresponding lists
    batch_number_list.append(batch_number)
    batch_train_loss_list.append(train_batch_loss)
    ewma_list.append(ewma)
    lr_list.append(last_lr)
    if average_valid_loss:
        batch_number_avg_valid_loss_list.append(batch_number)
        avg_valid_loss_list.append(average_valid_loss)
    
    #update plot with current training info
    batch_train_plot.set_data(batch_number_list,batch_train_loss_list)
    average_valid_plot.set_data(batch_number_avg_valid_loss_list,avg_valid_loss_list)
    ewma_plot.set_data(batch_number_list,ewma_list)
    lr_plot.set_data(batch_number_list,lr_list)
    
    #relimit axes to show newly created data and autoscale
    axes.relim()
    axes.autoscale_view(True,True,True)
    twin_axes.relim()
    twin_axes.autoscale_view(True,True,True)
    
    #show updates
    info_fig.canvas.draw()
    info_fig.canvas.flush_events()
    
    #increase batch number
    batch_number+=1
    

<IPython.core.display.Javascript object>

[epoch 0 _ batch 1]: loss= 0.7704576849937439
[epoch 0 _ batch 2]: loss= 0.5535341501235962
[epoch 0 _ batch 3]: loss= 1.0476596355438232
[epoch 0 _ batch 4]: loss= 0.6701900362968445
[epoch 0 _ batch 5]: loss= 0.7204310894012451
[epoch 0 _ batch 6]: loss= 0.6597241163253784
[epoch 0 _ batch 7]: loss= 0.679343581199646
[epoch 0 _ batch 8]: loss= 0.6320213079452515
[epoch 0 _ batch 9]: loss= 0.6803225874900818
[epoch 0 _ batch 10]: loss= 0.6616032123565674
[epoch 0 _ batch 11]: loss= 0.6279165744781494
[epoch 0 _ batch 12]: loss= 0.6991792321205139
[epoch 0 _ batch 13]: loss= 0.6728343963623047
[epoch 0 _ batch 14]: loss= 0.731966495513916
[epoch 0 _ batch 15]: loss= 0.7950336933135986
[epoch 0 _ batch 16]: loss= 0.7688605785369873
[epoch 0 _ batch 17]: loss= 0.616518497467041
[epoch 0 _ batch 18]: loss= 0.7200767993927002
[epoch 0 _ batch 19]: loss= 0.7475401759147644
[epoch 0 _ batch 20]: loss= 0.8148616552352905
[epoch 0 _ batch 21]: loss= 0.6886494755744934
[epoch 0 _ batch 22]: los

[epoch 0 _ batch 175]: loss= 0.6875945329666138
[epoch 0 _ batch 176]: loss= 0.6818335056304932
[epoch 0 _ batch 177]: loss= 0.707780122756958
[epoch 0 _ batch 178]: loss= 0.7448828220367432
[epoch 0 _ batch 179]: loss= 0.6851094961166382
[epoch 0 _ batch 180]: loss= 0.6862260699272156
[epoch 0 _ batch 181]: loss= 0.6704264879226685
[epoch 0 _ batch 182]: loss= 0.7022249698638916
[epoch 0 _ batch 183]: loss= 0.6733042597770691
[epoch 0 _ batch 184]: loss= 0.6596448421478271
[epoch 0 _ batch 185]: loss= 0.739134669303894
[epoch 0 _ batch 186]: loss= 0.6903379559516907
[epoch 0 _ batch 187]: loss= 0.6790461540222168
[epoch 0 _ batch 188]: loss= 0.682850182056427
[epoch 0 _ batch 189]: loss= 0.6853080987930298
[epoch 1 _ batch 1]: loss= 0.7115072011947632
[epoch 1 _ batch 2]: loss= 0.6850336790084839
[epoch 1 _ batch 3]: loss= 0.676085889339447
[epoch 1 _ batch 4]: loss= 0.7139964699745178
[epoch 1 _ batch 5]: loss= 0.7276415228843689
[epoch 1 _ batch 6]: loss= 0.6857527494430542
[epoch 1

[epoch 1 _ batch 160]: loss= 0.6905805468559265
[epoch 1 _ batch 161]: loss= 0.6974032521247864
[epoch 1 _ batch 162]: loss= 0.7322090268135071
[epoch 1 _ batch 163]: loss= 0.7239029407501221
[epoch 1 _ batch 164]: loss= 0.7105013132095337
[epoch 1 _ batch 165]: loss= 0.6883683204650879
[epoch 1 _ batch 166]: loss= 0.6830912828445435
[epoch 1 _ batch 167]: loss= 0.740585207939148
[epoch 1 _ batch 168]: loss= 0.708153486251831
[epoch 1 _ batch 169]: loss= 0.6885608434677124
[epoch 1 _ batch 170]: loss= 0.7215657234191895
[epoch 1 _ batch 171]: loss= 0.6976748704910278
[epoch 1 _ batch 172]: loss= 0.7050127983093262
[epoch 1 _ batch 173]: loss= 0.6956432461738586
[epoch 1 _ batch 174]: loss= 0.6906205415725708
[epoch 1 _ batch 175]: loss= 0.6888955235481262
[epoch 1 _ batch 176]: loss= 0.6665457487106323
[epoch 1 _ batch 177]: loss= 0.7000901699066162
[epoch 1 _ batch 178]: loss= 0.7085171341896057
[epoch 1 _ batch 179]: loss= 0.684313178062439
[epoch 1 _ batch 180]: loss= 0.709350824356

[epoch 2 _ batch 145]: loss= 0.6899195909500122
[epoch 2 _ batch 146]: loss= 0.7089897990226746
[epoch 2 _ batch 147]: loss= 0.6830776333808899
[epoch 2 _ batch 148]: loss= 0.6908414959907532
[epoch 2 _ batch 149]: loss= 0.6903756260871887
[epoch 2 _ batch 150]: loss= 0.6898638010025024
[epoch 2 _ batch 151]: loss= 0.6804551482200623
[epoch 2 _ batch 152]: loss= 0.7174654603004456
[epoch 2 _ batch 153]: loss= 0.6799403429031372
[epoch 2 _ batch 154]: loss= 0.7047484517097473
[epoch 2 _ batch 155]: loss= 0.7029819488525391
[epoch 2 _ batch 156]: loss= 0.6887094378471375
[epoch 2 _ batch 157]: loss= 0.6530935168266296
[epoch 2 _ batch 158]: loss= 0.706717312335968
[epoch 2 _ batch 159]: loss= 0.7516817450523376
[epoch 2 _ batch 160]: loss= 0.6827332377433777
[epoch 2 _ batch 161]: loss= 0.6884306073188782
[epoch 2 _ batch 162]: loss= 0.704444944858551
[epoch 2 _ batch 163]: loss= 0.6905173063278198
[epoch 2 _ batch 164]: loss= 0.6850360035896301
[epoch 2 _ batch 165]: loss= 0.77328550815

[epoch 3 _ batch 130]: loss= 0.6669239401817322
[epoch 3 _ batch 131]: loss= 0.6653944253921509
[epoch 3 _ batch 132]: loss= 0.6151757836341858
[epoch 3 _ batch 133]: loss= 0.6963585615158081
[epoch 3 _ batch 134]: loss= 0.6692968606948853
[epoch 3 _ batch 135]: loss= 0.5376583337783813
[epoch 3 _ batch 136]: loss= 0.7535790801048279
[epoch 3 _ batch 137]: loss= 0.5768742561340332
[epoch 3 _ batch 138]: loss= 0.7092769742012024
[epoch 3 _ batch 139]: loss= 0.6740527153015137
[epoch 3 _ batch 140]: loss= 0.48090869188308716
[epoch 3 _ batch 141]: loss= 0.6509228944778442
[epoch 3 _ batch 142]: loss= 0.6144420504570007
[epoch 3 _ batch 143]: loss= 0.6973087191581726
[epoch 3 _ batch 144]: loss= 0.7568680644035339
[epoch 3 _ batch 145]: loss= 0.7429965138435364
[epoch 3 _ batch 146]: loss= 0.642296314239502
[epoch 3 _ batch 147]: loss= 0.6584265232086182
[epoch 3 _ batch 148]: loss= 0.6889978051185608
[epoch 3 _ batch 149]: loss= 0.740896999835968
[epoch 3 _ batch 150]: loss= 0.7048884630

[epoch 4 _ batch 115]: loss= 0.611167311668396
[epoch 4 _ batch 116]: loss= 0.9289395809173584
[epoch 4 _ batch 117]: loss= 0.6235329508781433
[epoch 4 _ batch 118]: loss= 0.6663756966590881
[epoch 4 _ batch 119]: loss= 0.6617355942726135
[epoch 4 _ batch 120]: loss= 0.6689006686210632
[epoch 4 _ batch 121]: loss= 0.4644392132759094
[epoch 4 _ batch 122]: loss= 0.7080901861190796
[epoch 4 _ batch 123]: loss= 0.6614152789115906
[epoch 4 _ batch 124]: loss= 0.5165808796882629
[epoch 4 _ batch 125]: loss= 0.6538732647895813
[epoch 4 _ batch 126]: loss= 0.6003764867782593
[epoch 4 _ batch 127]: loss= 0.6704471707344055
[epoch 4 _ batch 128]: loss= 0.7607908844947815
[epoch 4 _ batch 129]: loss= 0.5383098125457764
[epoch 4 _ batch 130]: loss= 0.7494255900382996
[epoch 4 _ batch 131]: loss= 0.5992199182510376
[epoch 4 _ batch 132]: loss= 0.7094290852546692
[epoch 4 _ batch 133]: loss= 0.7453319430351257
[epoch 4 _ batch 134]: loss= 0.531620442867279
[epoch 4 _ batch 135]: loss= 0.58747625350

[epoch 5 _ batch 100]: loss= 0.6907097101211548
[epoch 5 _ batch 101]: loss= 0.7118611335754395
[epoch 5 _ batch 102]: loss= 0.7061024904251099
[epoch 5 _ batch 103]: loss= 0.6629911661148071
[epoch 5 _ batch 104]: loss= 0.6692782640457153
[epoch 5 _ batch 105]: loss= 0.6663192510604858
[epoch 5 _ batch 106]: loss= 0.6933130025863647
[epoch 5 _ batch 107]: loss= 0.677866518497467
[epoch 5 _ batch 108]: loss= 0.6661024689674377
[epoch 5 _ batch 109]: loss= 0.7268788814544678
[epoch 5 _ batch 110]: loss= 0.6953145265579224
[epoch 5 _ batch 111]: loss= 0.6878113150596619
[epoch 5 _ batch 112]: loss= 0.7085310816764832
[epoch 5 _ batch 113]: loss= 0.6830755472183228
[epoch 5 _ batch 114]: loss= 0.7584603428840637
[epoch 5 _ batch 115]: loss= 0.6790626645088196
[epoch 5 _ batch 116]: loss= 0.6678658127784729
[epoch 5 _ batch 117]: loss= 0.6711701154708862
[epoch 5 _ batch 118]: loss= 0.6526941657066345
[epoch 5 _ batch 119]: loss= 0.6870679259300232
[epoch 5 _ batch 120]: loss= 0.7242599725

## 3.2 Model evaluation

write a class that composes several metrics to allow composed metric evalution

In [20]:
class Metric_composer():
    """
    this class is used to evaluate model predictions on multiple metrics simultaneously
    """
    
    def __init__(self,metrics_dict):
        """
        task: init composer and set its metrics field \n
        parameters:metrics_dict(sting(metric name): callable(metric func))\n
        return value:
        """
        
        self.metrics= metrics_dict
        
    def evaluate(self,y_truth,y_pred):
        """
        task: use the metrics in the self.metrics field to evaluate the given y values \n
        parameters: y_truth(ground truth labels), y_pred(predicted labels) \n
        return value: dict(string(name):evaluation of corresponding function)
        """
    
        evaluation={}
        for name,metric_func in self.metrics.items():
            
            evaluation[name]= metric_func(y_truth,y_pred)
            
        return evaluation
    

a method to evaluate a given model using the data from given dataloader and the metrics composed within the metric_composer_obj 

In [21]:
def eval_model(model,dataloader,metric_composer_obj):
    """
    task: evaluate the model on the given dataloader using the loss_func \n
    parameters: model(torch.nn.Module subclass), dataloader(torch.utils.data.DataLoader instance), metric_composer_obj(instance of Metric_composer) \n
    return value:
    """
    
    #put model into evaluation mode
    model.eval()
    
    y_pred=[]
    y_truth=[]
    with torch.no_grad():
        
        for batch in dataloader:
            
            #unpack the batch
            input_ids_list,attention_mask_list,label_list,*_=batch
            
            #evalaute batch with model
            output=model(input_ids_list,attention_mask_list)
            
            #model outputs a prob distribution over the 2 classes, so get argmax to retreive predicted label
            predicted_labels=torch.argmax(output.logits, axis=1).flatten()
    
            #append both the predicted labels and the true labels 
            y_pred+=predicted_labels.tolist()
            y_truth+=label_list.flatten().tolist()
            
            
    return metric_composer_obj.evaluate(y_truth,y_pred)

compose metrics

In [22]:
#compose different performance metrics
metric_composer_obj= Metric_composer({
    "accuracy": accuracy_score,
    "precision": precision_score,
    "confusion_matrix": confusion_matrix
})

evaluate performance on train data

In [23]:
#iterate over all evaluations and print them
train_evaluation=eval_model(model,bert_train_dl,metric_composer_obj)
for metric_name, metric_evaluation in train_evaluation.items():
    print(f"----{metric_name}----")
    print(metric_evaluation)
    print("---------------------")

----accuracy----
0.503968253968254
---------------------
----precision----
0.503968253968254
---------------------
----confusion_matrix----
[[  0 750]
 [  0 762]]
---------------------


evaluate performance on validation data

In [24]:
#iterate over all evaluations and print them
valid_evaluation=eval_model(model,bert_valid_dl,metric_composer_obj)
for metric_name, metric_evaluation in valid_evaluation.items():
    print(f"----{metric_name}----")
    print(metric_evaluation)
    print("---------------------")

----accuracy----
0.49206349206349204
---------------------
----precision----
0.49206349206349204
---------------------
----confusion_matrix----
[[ 0 96]
 [ 0 93]]
---------------------


## 3.3 Store/load the model 

### 3.3.1 store model

In [25]:
def store_model(model,description,dest_path):
    """
    task: store the given model together with a txt file that stores a short description \n
    parameters:model(torch.nn.Module), description(dict), dest_path(each stored model will be stored in a folder containing state_dict and description.txt, this defines the name of the folder)\n
    return value:
    """
    
    #create the folder at dest_path if not available
    os.mkdir(str(dest_path))
    
    #store the description in a description.json file
    with open(dest_path.joinpath("description.json"),"w") as f:
        f.write(simplejson.dumps(description, indent=4, sort_keys=True))
    
    #store the models state dict
    torch.save(model.state_dict(), str(dest_path.joinpath("state_dict.pth")))

method to force serializability of any dictionary. Used to serialize the notebook_paramters dict when storing a model

In [26]:
def recursive_force_serializability(dict_to_transform,max_depth=100,caller_depth=0,replace_unstringable=False):
    """
    task: convert any non-serializable object within that dict to string and thus force serialiazability  \n
    parameters:dict_to_transform(dict), max_depth(int(max depth for recursive calls)),caller_depth(int(depth of the caller)), replace_unstringable=\n
    return value:
    """
    
    #create a copy of the original dict in case the caller_depth==0
    if caller_depth==0:
        dict_to_transform= dict_to_transform.copy()
    
    current_depth=caller_depth+1
    
    #recursion anker
    if current_depth==max_depth:
        warnings.warn("exceeded given recursion depth. Unable to fully transform given dictionary")
        return dict_to_transform
    
    #call ,method recursively on the items of the dict if they are dicts as well, in case of unserializable items convert them to strings if possible
    for key,item in dict_to_transform.items():
        if type(item)==dict:
            dict_to_transform[key]= recursive_force_serializability(item,max_depth=max_depth,caller_depth=current_depth,replace_unstringable=replace_unstringable)
        else:
            #try to dump the item to check ifg its serializable
            try:
                json.dumps(item)
            except TypeError:
                #if not serializable try to transform it to string
                try:
                    dict_to_transform[key]=str(item)
                except TypeError:
                    #if not stringable replace by "unserializable and unstringable object" string if replace_unstringable==True, else raise error
                    if replace_unstringable:
                        dict_to_transform[key]="unserializable and unstringable object"
                        warnings.warn(f"replaced object at key {key} with replacement string")
                    else:
                        raise AssertionError("unserializable and unstringable object was found and replace_unstringable is set to False, set to True if replacement is desired")
                    
    return dict_to_transform

derive a desription dictionary from notebook_parameters and optional train/validation evaluations

In [27]:
def create_model_description(notebook_parameters=None,train_evaluation=None,valid_evaluation=None):
    """
    task: create a description of the model by storing the notebook_parameters used and evaluation scores   \n
    parameters: notebook_parameters(dict at beginning of notebook), train_evaluation(output of Model_composer.evaluate on train data),valid_evaluation(output of Model_composer.evaluate on validation data) \n
    return value: dict
    """
    
    description={
        "notebook_parameters": notebook_parameters,
        "train_evaluation": train_evaluation if train_evaluation else "no train evaluation available",
        "valid_evaluation": valid_evaluation if valid_evaluation else "no validation evaluation available"
    }

    return description
    

In [28]:
#store the current run
folder_name="chinese_v2_1er-precision"
if False: #change to true if desired
    desc= create_model_description(notebook_parameters,train_evaluation,valid_evaluation)
    dest_folder=notebook_parameters["storage_parameters"]["path_to_model_storage"].joinpath(folder_name)
    store_model(model,recursive_force_serializability(desc,max_depth=100),dest_folder)

### 3.3.2 load model

In [29]:
#load the model in given folder name
folder_name="chinese_v1"
if False: #change to true if desired
    model.load_state_dict(torch.load(notebook_parameters["storage_parameters"]["path_to_model_storage"].joinpath(folder_name,"state_dict.pth")))

# 4. Result 

Apparently this model struggles with a phenomenon similar to the mode collapse of GAN's: Most of my training runs(~80%) made the model default into predicting either all samples as "being censored" or all as "not censored". I was unable to identify the reason for that behavior but I think it might be caused by the uncased tokenizer producing very similar, nearly identical tokens for all sequences. Unfortunately there is neither a chinese tokenizer nor pretrained model for the DestilBERT available.
In the remaining cases the model managed to outperform random guessing. Nonetheless the results are defenitly not state-of-the-art having an average accuracy of ~65% and a precision of ~70% on the validation set. One model had an astonishing precision of 100% on the validation set, unfortunately with a relatively low amount of predictions in the positive category thus making this less remarkable. However that models' description and state_dict can be found under ... 