# 1. Setup

## 1.1 Imports

In [1]:
#imports
import torch
from torch.utils.data import Dataset,DataLoader,Subset
import pandas as pd
from collections import namedtuple
import numpy as np
from transformers import BertForSequenceClassification,BertConfig,BertTokenizer
from torch.optim import Adam
from torch.nn import CrossEntropyLoss    
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix
from pathlib import Path
import pathlib
import matplotlib.pyplot as plt
import os
import json
import simplejson
import warnings



## 1.2 Cuda

use cuda if available

In [2]:
#define device
device= torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

## 1.3 Parameters

This section holds all tuneable parameters of this notebook seperated into different categories according to target area

In [3]:
notebook_parameters={"model_parameters":None,
                     "training_parameters":None,
                     "data_parameters":None,
                     "storage_parameters":None}

### 1.3.1 Model Parameters

dict that will be passed to BertConfig to configure the model used

In [4]:
notebook_parameters["model_parameters"]={"num_hidden_layers":12,
                                         "hidden_dropout_prob":0.1,
                                         "attention_probs_dropout_prob":0.1
                                        }

### 1.3.2 Training Parameters

In [5]:
notebook_parameters["training_parameters"]={"learningrate": 2e-5,
                                            "number_of_epochs": 2,
                                            "loss_function": CrossEntropyLoss(),
                                            "validate_every_n_batches": 100,
                                            "calc_ewma":True
                                            }

### 1.3.3 Data parameters

In [6]:
notebook_parameters["data_parameters"]={"batchsize":6,
                                        "path_to_train_data":r"C:\Users\nick\Code\MachineLearning_Projects\Bewerbung_NLP\data\english_datasets\en_train.csv",
                                        "train_limit": -1,
                                        "path_to_valid_data":r"C:\Users\nick\Code\MachineLearning_Projects\Bewerbung_NLP\data\english_datasets\en_dev.csv",
                                        "valid_limit": -1,
                                        }

### 1.3.4 Storage parameters

In [7]:
notebook_parameters["storage_parameters"]= {"path_to_model_storage": Path(r"C:\Users\nick\Code\MachineLearning_Projects\Bewerbung_NLP\trainings\trained_models")
                                           }

# 2. Datasets and Transforms

## 2.1 Original Dataset

Create a class that will hold the initial dataset directly derived from the translated data

In [8]:
class Blogpost_dataset(Dataset):
    """
    this class serves as a custom dataset for the given blog posts
    """

    def __init__(self,src_path,transform=None):
        """
        task: inits the dataset and sets optional transforms \n
        parameters: src_path(str(path to the underlying source data)), transform(optional transformation that may be applied to each sample ) \n
        return value:
        """

        self.src_df = pd.read_csv(src_path)
        self.transform= transform
        self.Dataset_item= namedtuple("Dataset_item",["text","label"]) 

    def __len__(self):
        """
        task: return the length of the underlying source DataFrame \n
        parameters:\n
        return value:
        """

        return len(self.src_df)

    def __getitem__(self, index):
        """
        task: return the item at the given index \n
        parameters:\n
        return value:
        """

        #transform the index to a list in case it is a tensor
        if torch.is_tensor(index):
            index= index.tolist()

        #fetch item from source df
        item=self.Dataset_item(self.src_df.iloc[index]["text"],self.src_df.iloc[index]["label"])

        #apply transform if available
        if self.transform:
            item= self.transform(item)

        return item


## 2.2 BERT compatible Dataset

Create a class that will serve as a dataset for the data in BERT compatible, already tokenized form

In [9]:
class Bert_compatible_dataset(Dataset):
    """
    this class holds a dataset that was transformed using the BertTransform
    """

    def __init__(self,input_ids_list,attention_mask_list,token_type_ids_list,label_list,text_list=None,transform=None):
        """
        task: create a dataset from the given lists of tokens. if text_list is given that column will hold the original text \n
        parameters:input_ids_list(list(token id)), attention_mask_list(list(attention mask)),token_type_ids_list(list(token_type_id)) ,label_list(list(label)), text_list(list(optional: orignal text))\n
        return value:
        """

        #create a class that will hold one element of data
        self.Dataset_item= namedtuple("Bert_dataset_item",["input_ids","attention_mask","token_type_ids","label","text"])

        #this list will store all data 
        self.data= []
        
        if text_list:
            #assert that all of those lists are of same length
            assert len(input_ids_list)==len(attention_mask_list)==len(token_type_ids_list)==len(label_list)==len(text_list),"length of lists has to match"
        
            #zip the lists together
            for input_ids,attention_mask,token_type_ids,label,text in zip(input_ids_list,attention_mask_list,token_type_ids_list,label_list,text_list):

                #create a namedtuple storing that data and append it to self.data
                item= self.Dataset_item(input_ids,attention_mask,token_type_ids,label,text)
                self.data.append(item)

        else:
            assert len(input_ids_list)==len(attention_mask_list)==len(token_type_ids_list)==len(label_list),"length of lists has to match"

            #zip the lists together
            for input_ids,attention_mask,token_type_ids,label in zip(input_ids_list,attention_mask_list,token_type_ids_list,label_list):

                #create a namedtuple storing that data and append it to self.data
                item= self.Dataset_item(input_ids,attention_mask,token_type_ids,label,0)
                self.data.append(item)

        self.transform = transform


    def __len__(self):
        """
        task: return the length of self.data field \n
        parameters:\n
        return value:
        """

        return len(self.data)

    def __getitem__(self, index):
        """
        task: return the item at index \n
        parameters:\n
        return value:
        """

        #transform the index to a list in case it is a tensor
        if torch.is_tensor(index):
            index= index.tolist()

        data=self.data[index]

        if self.transform:
            data= self.transform(data)

        return data


## 2.3 Transforms

Create a Transform that will transform bert compatible data into tensors on the given device

In [10]:
class BertToTensor(object):
    """
    This class serves as a transform to transfer the elements of namedtuple into tensors on the given device
    """

    def __init__(self,device="cpu"):
        """
        task:  \n
        parameters:\n
        return value:
        """

        self.device=device

    def __call__(self,named_tuple):
        """
        task: transform the elements of the named tuple into tensors and ship them over to self.device \n
        parameters: named_tuple("Bert_dataset_item",["input_ids","attention_mask","token_type_ids","label","text"]) \n
        return value: transformed elements
        """

        #unpack the named tuple
        input_ids,attention_mask,token_type_ids,label,text = named_tuple

        #transform to tensor
        input_ids= torch.IntTensor(input_ids).to(device=self.device)
        attention_mask= torch.IntTensor(attention_mask).to(device=self.device)
        token_type_ids= torch.IntTensor(token_type_ids).to(device=self.device)
        label= torch.IntTensor(label).to(device=self.device)
        text= torch.IntTensor(text).to(device=self.device)

        return input_ids,attention_mask,token_type_ids,label,text

create a transform that takes the original data as imput and tokenizes it using the BERTTokenizer

In [11]:
class BertTransform(object):
    """
    this class will serve as a transform that tokenizes a given text using the bert tokenizer
    """

    def __init__(self,max_length):
        """
        task: init the transform and creates a bert tokenizer\n
        parameters:\n
        return value:
        """

        self.tokenizer= BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
        self.max_length= max_length

    def __call__(self,item):
        """
        task: when called transform the given items' text field by applying bertTokenization  \n
        parameters:\n
        return value:
        """

        #transform the 
        transformed_text= self.tokenizer.encode_plus(
            item.text,
            add_special_tokens=True, #adds beginning(CLS)) and end(SEP) tokens of sequence)
            max_length= self.max_length,
            pad_to_max_length=True, # makes the tokenizer fill the token vectors with padding tokens if the sequence is smaller than max_length
            return_attention_mask = True
        )

        Dataset_item=namedtuple("Dataset_item",["text","label"]) 
        return Dataset_item(transformed_text,item.label)

a method that creates an original dataset and transforms it into a bert compatibel one

In [12]:
def transform_original_dataset_2_bert_compatible(src_path,max_length=512,limit=-1,device="cpu"):
    """
    task: use the BertTokenize transform to tokenize the given original Dataset and thus create a dataset that is bert compatible \n
    parameters: src_path(path to original data), max_length(int(max length allowed for transformer, 512 for bert)),limit(number of entries to use) \n
    return value: torch.utils.data.Dataset
    """

    #create BertTransform
    transform=BertTransform(max_length)

    #create the original Blogpost_dataset
    blogpost_ds=Blogpost_dataset(src_path,transform=transform)

    #pull a subset of the dataset if a limit was given
    if limit>0:
        indices= np.random.choice(range(len(blogpost_ds)),size=limit,replace=False) #choose random indices
        blogpost_ds= Subset(blogpost_ds,indices)

    #lists that will stored the transformed/tokenized text
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    #iterate over the dataset with transform and catch the thus transformed texts by putting them into a new dataset
    for transformed_text,label in blogpost_ds:

        #apppend the contents to the corresponding list
        input_ids_list.append(transformed_text['input_ids'])
        token_type_ids_list.append(transformed_text['token_type_ids'])
        attention_mask_list.append(transformed_text['attention_mask'])
        label_list.append([label])

    #return the Bert_compatible_dataset derived from those lists
    return Bert_compatible_dataset(input_ids_list,attention_mask_list,token_type_ids_list,label_list,transform=BertToTensor(device))


## 2.4 Dataset and Dataloader Creation

train dataset/dataloader

In [13]:
#create a bert compatible dataset
bert_train_ds= transform_original_dataset_2_bert_compatible(notebook_parameters["data_parameters"]["path_to_train_data"],limit=notebook_parameters["data_parameters"]["train_limit"],device=device)

#create a corresponding dataloader
bert_train_dl= DataLoader(bert_train_ds,batch_size=notebook_parameters["data_parameters"]["batchsize"],shuffle=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


validation dataset/dataloader

In [14]:
#create a bert compatible dataset
bert_valid_ds= transform_original_dataset_2_bert_compatible(notebook_parameters["data_parameters"]["path_to_valid_data"],limit=notebook_parameters["data_parameters"]["valid_limit"],device=device)

#create a corresponding dataloader
bert_valid_dl= DataLoader(bert_valid_ds,batch_size=notebook_parameters["data_parameters"]["batchsize"],shuffle=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# 3. The Model

The model used will be BERT by google

# 3.1 Training

### 3.1.1 Model and optimizer init

use the Bert Config class to configure the Bert model used

In [15]:
bert_config= BertConfig(**notebook_parameters["model_parameters"]) #num_hidden_layers=notebook_parameters["model_parameters"]["num_hidden_layers"],hidden_dropout_prob=notebook_parameters["model_parameters"]["hidden_dropout_prob"],attention_probs_dropout_prob=notebook_parameters["model_parameters"]["attention_probs_dropout_prob"])

initialize optimizer and model

In [16]:
#init the model
model= BertForSequenceClassification(bert_config).from_pretrained('bert-base-uncased').to(device)

#use adam optimizer
optimizer= Adam(model.parameters(),lr=notebook_parameters["training_parameters"]["learningrate"])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### 3.1.2 Loss Function

Given the binary classification problem, the cross entropy loss function comes in quite handy and will be used for this notebook

In [17]:
criterion = notebook_parameters["training_parameters"]["loss_function"]


### 3.1.3 Training loop

In [18]:
def train_model(model,train_dataloader,valid_dataloader,loss_function,optimizer,number_of_epochs, validate_every_n_batches=None,calc_ewma=False,**ewma_kwargs):
    """
    task: train the given model and validate it every n epochs on the validation dataset \n
    parameters:model(torch.nn.Module subclass),loss_func(torch.nn loss function), optimizer(torch.optim optimizer), number_of_epochs(int(number of epochs to train)),validate_every_n_batches(int),calc_ewma(bool(true if ewma shall be calculated)),ewma_kwargs(kwargs that will be passed to ewma calculation) \n
    return value:
    """
    
    #put model into training mode
    model.train()
    
    batch_number=1
    
    if calc_ewma:
        loss_history=pd.DataFrame(columns=["batch_number","train_batch_loss"])
    current_ewma=None
    
    for epoch_id in range(number_of_epochs):
        
        for train_batch_id,train_batch in enumerate(train_dataloader,start=1):
        
            #unpack the batch
            input_ids_list,attention_mask_list,token_type_ids_list,label_list,*_=train_batch

            #zero gradients
            optimizer.zero_grad()

            #forward_pass
            output=model(input_ids_list,attention_mask_list,token_type_ids_list)

            #compute loss
            train_batch_loss=criterion(output.logits,label_list.flatten().to(dtype=torch.long))
            train_batch_loss.backward()

            #optimize
            optimizer.step()

            #check if validation necessary, if so do it
            average_validation_loss=None
            if validate_every_n_batches and batch_number%validate_every_n_batches==0:
                
                #put model into evaluation mode
                model.eval()
                
                validation_losses=[]
                with torch.no_grad():
                    for valid_batch in valid_dataloader:
                        
                        #unpack batch
                        input_ids_list,attention_mask_list,token_type_ids_list,label_list,*_=valid_batch
                        
                        #evaluate the inputs with model
                        output=model(input_ids_list,attention_mask_list,token_type_ids_list)
                        
                        #calc validation batch loss
                        valid_batch_loss=criterion(output.logits,label_list.flatten().to(dtype=torch.long))
                        
                        #append the loss to all validation_losses
                        validation_losses.append(valid_batch_loss)
            
                #calculate the average loss
                average_validation_loss=sum(validation_losses)/len(validation_losses)
            
                #reset model to train mode
                model.train()
            
            #calc ewma if desired
            if calc_ewma:
                loss_history=loss_history.append({"batch_number": batch_number,
                                                  "train_batch_loss": train_batch_loss}
                                                  ,ignore_index=True)
                current_ewma=loss_history.ewm(**ewma_kwargs).mean().iloc[-1]["train_batch_loss"]

            
            #increase batch number
            batch_number+=1
            
            #yield training info about current batch
            yield epoch_id,train_batch_id, train_batch_loss,average_validation_loss,current_ewma

use that train_model method and create a plot about the generated training info

In [19]:
%matplotlib notebook

#lists for batch_loss
batch_number_list=[None]
batch_train_loss_list=[None]

#lists for average validation loss
batch_number_avg_valid_loss_list=[None]
avg_valid_loss_list=[None]

#list for ewma
ewma_list=[None]

#create figure and seperate plots for batch_loss,average validation_loss and ewma
plt.ion()
info_fig,axes= plt.subplots(figsize=(10,8))
batch_train_plot, = axes.plot(batch_number_list,batch_train_loss_list,"blue")
average_valid_plot, =axes.plot(batch_number_list,avg_valid_loss_list, "red")
ewma_plot, =axes.plot(batch_number_list,ewma_list,"orange")

#set figure attributes like title,labels and legend
plt.title("training performance")
plt.xlabel("batch iteration")
plt.ylabel("cross entropy batch loss")
plt.legend(["train_batch_loss","average_valid_loss","ewma of train_batch_loss"])

#position text to display training progress
plt.gcf().text(0.4,0.02,"0.00% of training done",fontsize=14)

batch_number=1

#execute the training loop and print the training info
for epoch_id,train_batch_id,train_batch_loss,average_valid_loss,ewma in train_model(model,bert_train_dl,bert_valid_dl,criterion,optimizer,notebook_parameters["training_parameters"]["number_of_epochs"],validate_every_n_batches=notebook_parameters["training_parameters"]["validate_every_n_batches"],calc_ewma=notebook_parameters["training_parameters"]["calc_ewma"],halflife=10):
    
    #print training info
    print(f"[epoch {epoch_id} _ batch {train_batch_id}]: loss= {train_batch_loss}")
    
    #calculate the percentage of training done and add it as text
    percentage_done= round(batch_number/(notebook_parameters["training_parameters"]["number_of_epochs"]*len(bert_train_dl))*100,2)
    info_fig.texts[0].set_text(f"{percentage_done:.2f}% of training done")
    
    #add the new data to corresponding lists
    batch_number_list.append(batch_number)
    batch_train_loss_list.append(train_batch_loss)
    ewma_list.append(ewma)
    if average_valid_loss:
        batch_number_avg_valid_loss_list.append(batch_number)
        avg_valid_loss_list.append(average_valid_loss)
    
    #update plot with current training info
    batch_train_plot.set_data(batch_number_list,batch_train_loss_list)
    average_valid_plot.set_data(batch_number_avg_valid_loss_list,avg_valid_loss_list)
    ewma_plot.set_data(batch_number_list,ewma_list)
    
    #relimit axes to show newly created data and autoscale
    axes.relim()
    axes.autoscale_view(True,True,True)
    
    #show updates
    info_fig.canvas.draw()
    info_fig.canvas.flush_events()
    
    #increase batch number
    batch_number+=1
    

<IPython.core.display.Javascript object>

[epoch 0 _ batch 1]: loss= 0.598677933216095
[epoch 0 _ batch 2]: loss= 0.5832054018974304
[epoch 0 _ batch 3]: loss= 0.6248192191123962
[epoch 0 _ batch 4]: loss= 0.8396489024162292
[epoch 0 _ batch 5]: loss= 0.6435042023658752
[epoch 0 _ batch 6]: loss= 0.5029110312461853
[epoch 0 _ batch 7]: loss= 0.9222753643989563
[epoch 0 _ batch 8]: loss= 0.7684416174888611
[epoch 0 _ batch 9]: loss= 0.6169042587280273
[epoch 0 _ batch 10]: loss= 0.824752151966095
[epoch 0 _ batch 11]: loss= 0.6399288773536682
[epoch 0 _ batch 12]: loss= 0.7457090020179749
[epoch 0 _ batch 13]: loss= 0.8022792935371399
[epoch 0 _ batch 14]: loss= 0.8840999007225037
[epoch 0 _ batch 15]: loss= 0.659615695476532
[epoch 0 _ batch 16]: loss= 0.6352439522743225
[epoch 0 _ batch 17]: loss= 0.6766827702522278
[epoch 0 _ batch 18]: loss= 0.6355236768722534
[epoch 0 _ batch 19]: loss= 0.6721792221069336
[epoch 0 _ batch 20]: loss= 0.714000403881073
[epoch 0 _ batch 21]: loss= 0.7013679146766663
[epoch 0 _ batch 22]: loss

[epoch 0 _ batch 175]: loss= 0.6220345497131348
[epoch 0 _ batch 176]: loss= 0.7433138489723206
[epoch 0 _ batch 177]: loss= 0.6987435221672058
[epoch 0 _ batch 178]: loss= 0.7428273558616638
[epoch 0 _ batch 179]: loss= 0.6762046813964844
[epoch 0 _ batch 180]: loss= 0.7061715126037598
[epoch 0 _ batch 181]: loss= 0.724075973033905
[epoch 0 _ batch 182]: loss= 0.7026107907295227
[epoch 0 _ batch 183]: loss= 0.6845738291740417
[epoch 0 _ batch 184]: loss= 0.6324706077575684
[epoch 0 _ batch 185]: loss= 0.6938605904579163
[epoch 0 _ batch 186]: loss= 0.7400298714637756
[epoch 0 _ batch 187]: loss= 0.6307364106178284
[epoch 0 _ batch 188]: loss= 0.764350414276123
[epoch 0 _ batch 189]: loss= 0.6801712512969971
[epoch 0 _ batch 190]: loss= 0.6392302513122559
[epoch 0 _ batch 191]: loss= 0.7208893299102783
[epoch 0 _ batch 192]: loss= 0.651369571685791
[epoch 0 _ batch 193]: loss= 0.6519051194190979
[epoch 0 _ batch 194]: loss= 0.6786903738975525
[epoch 0 _ batch 195]: loss= 0.625157833099

[epoch 1 _ batch 97]: loss= 0.6145582795143127
[epoch 1 _ batch 98]: loss= 0.6064264178276062
[epoch 1 _ batch 99]: loss= 0.2746359705924988
[epoch 1 _ batch 100]: loss= 0.4574730098247528
[epoch 1 _ batch 101]: loss= 0.5712409019470215
[epoch 1 _ batch 102]: loss= 0.7307722568511963
[epoch 1 _ batch 103]: loss= 0.4072357416152954
[epoch 1 _ batch 104]: loss= 0.9008608460426331
[epoch 1 _ batch 105]: loss= 0.41527390480041504
[epoch 1 _ batch 106]: loss= 0.8629131317138672
[epoch 1 _ batch 107]: loss= 0.6141741275787354
[epoch 1 _ batch 108]: loss= 0.6866934895515442
[epoch 1 _ batch 109]: loss= 0.649021565914154
[epoch 1 _ batch 110]: loss= 0.48120298981666565
[epoch 1 _ batch 111]: loss= 0.40897008776664734
[epoch 1 _ batch 112]: loss= 0.4218437969684601
[epoch 1 _ batch 113]: loss= 0.52497798204422
[epoch 1 _ batch 114]: loss= 0.72139972448349
[epoch 1 _ batch 115]: loss= 0.630675733089447
[epoch 1 _ batch 116]: loss= 1.2182142734527588
[epoch 1 _ batch 117]: loss= 0.413807243108749

## 3.2 Model evaluation

write a class that composes several metrics to allow composed metric evalution

In [20]:
class Metric_composer():
    """
    this class is used to evaluate model predictions on multiple metrics simultaneously
    """
    
    def __init__(self,metrics_dict):
        """
        task: init composer and set its metrics field \n
        parameters:metrics_dict(sting(metric name): callable(metric func))\n
        return value:
        """
        
        self.metrics= metrics_dict
        
    def evaluate(self,y_truth,y_pred):
        """
        task: use the metrics in the self.metrics field to evaluate the given y values \n
        parameters: y_truth(ground truth labels), y_pred(predicted labels) \n
        return value: dict(string(name):evaluation of corresponding function)
        """
    
        evaluation={}
        for name,metric_func in self.metrics.items():
            
            evaluation[name]= metric_func(y_truth,y_pred)
            
        return evaluation
    

In [21]:
def eval_model(model,dataloader,metric_composer_obj):
    """
    task: evaluate the model on the given dataloader using the loss_func \n
    parameters: model(torch.nn.Module subclass), dataloader(torch.utils.data.DataLoader instance), metric_composer_obj(instance of Metric_composer) \n
    return value:
    """
    
    #put model into evaluation mode
    model.eval()
    
    y_pred=[]
    y_truth=[]
    with torch.no_grad():
        
        for batch in dataloader:
            
            #unpack the batch
            input_ids_list,attention_mask_list,token_type_ids_list,label_list,*_=batch
            
            #evalaute batch with model
            output=model(input_ids_list,attention_mask_list,token_type_ids_list)
            
            #model outputs a prob distribution over the 2 classes, so get argmax to retreive predicted label
            predicted_labels=torch.argmax(output.logits, axis=1).flatten()
    
            #append both the predicted labels and the true labels 
            y_pred+=predicted_labels.tolist()
            y_truth+=label_list.flatten().tolist()
            
            
    return metric_composer_obj.evaluate(y_truth,y_pred)

compose metrics

In [22]:
#compose different performance metrics
metric_composer_obj= Metric_composer({
    "accuracy": accuracy_score,
    "precision": precision_score,
    "confusion_matrix": confusion_matrix
})

evaluate performance on train data

In [23]:
#iterate over all evaluations and print them
train_evaluation=eval_model(model,bert_train_dl,metric_composer_obj)
for metric_name, metric_evaluation in train_evaluation.items():
    print(f"----{metric_name}----")
    print(metric_evaluation)
    print("---------------------")

----accuracy----
0.7777777777777778
---------------------
----precision----
0.9819004524886877
---------------------
----confusion_matrix----
[[742   8]
 [328 434]]
---------------------


evaluate performance on validation data

In [24]:
#iterate over all evaluations and print them
valid_evaluation=eval_model(model,bert_valid_dl,metric_composer_obj)
for metric_name, metric_evaluation in valid_evaluation.items():
    print(f"----{metric_name}----")
    print(metric_evaluation)
    print("---------------------")

----accuracy----
0.6296296296296297
---------------------
----precision----
0.8484848484848485
---------------------
----confusion_matrix----
[[91  5]
 [65 28]]
---------------------


## 3.3 Store/load the model 

### 3.3.1 store model

In [25]:
def store_model(model,description,dest_path):
    """
    task: store the given model together with a txt file that stores a short description \n
    parameters:model(torch.nn.Module), description(dict), dest_path(each stored model will be stored in a folder containing state_dict and description.txt, this defines the name of the folder)\n
    return value:
    """
    
    #create the folder at dest_path if not available
    os.mkdir(str(dest_path))
    
    #store the description in a description.json file
    with open(dest_path.joinpath("description.json"),"w") as f:
        f.write(simplejson.dumps(description, indent=4, sort_keys=True))
    
    #store the models state dict
    torch.save(model.state_dict(), str(dest_path.joinpath("state_dict.pth")))

method to force serializability of any dictionary. Used to serialize the notebook_paramters dict when storing a model

In [26]:
def recursive_force_serializability(dict_to_transform,max_depth=100,caller_depth=0,replace_unstringable=False):
    """
    task: convert any non-serializable object within that dict to string and thus force serialiazability  \n
    parameters:dict_to_transform(dict), max_depth(int(max depth for recursive calls)),caller_depth(int(depth of the caller)), replace_unstringable=\n
    return value:
    """
    
    #create a copy of the original dict in case the caller_depth==0
    if caller_depth==0:
        dict_to_transform= dict_to_transform.copy()
    
    current_depth=caller_depth+1
    
    #recursion anker
    if current_depth==max_depth:
        warnings.warn("exceeded given recursion depth. Unable to fully transform given dictionary")
        return dict_to_transform
    
    #call ,method recursively on the items of the dict if they are dicts as well, in case of unserializable items convert them to strings if possible
    for key,item in dict_to_transform.items():
        if type(item)==dict:
            dict_to_transform[key]= recursive_force_serializability(item,max_depth=max_depth,caller_depth=current_depth,replace_unstringable=replace_unstringable)
        else:
            #try to dump the item to check ifg its serializable
            try:
                json.dumps(item)
            except TypeError:
                #if not serializable try to transform it to string
                try:
                    dict_to_transform[key]=str(item)
                except TypeError:
                    #if not stringable replace by "unserializable and unstringable object" string if replace_unstringable==True, else raise error
                    if replace_unstringable:
                        dict_to_transform[key]="unserializable and unstringable object"
                        warnings.warn(f"replaced object at key {key} with replacement string")
                    else:
                        raise AssertionError("unserializable and unstringable object was found and replace_unstringable is set to False, set to True if replacement is desired")
                    
    return dict_to_transform

In [27]:
def create_model_description(notebook_parameters=None,train_evaluation=None,valid_evaluation=None):
    """
    task: create a description of the model by storing the notebook_parameters used and evaluation scores   \n
    parameters: notebook_parameters(dict at beginning of notebook), train_evaluation(output of Model_composer.evaluate on train data),valid_evaluation(output of Model_composer.evaluate on validation data) \n
    return value: dict
    """
    
    description={
        "notebook_parameters": notebook_parameters,
        "train_evaluation": train_evaluation if train_evaluation else "no train evaluation available",
        "valid_evaluation": valid_evaluation if valid_evaluation else "no validation evaluation available"
    }

    return description
    

In [28]:
#store the current run
folder_name="english_v1"
if False: #change to true if desired
    desc= create_model_description(notebook_parameters,train_evaluation,valid_evaluation)
    dest_folder=notebook_parameters["storage_parameters"]["path_to_model_storage"].joinpath(folder_name)
    store_model(model,recursive_force_serializability(desc,max_depth=100),dest_folder)

### 3.3.2 load model

In [None]:
#load the model in given folder name
folder_name="chinese_v1"
if False:
    model.load_state_dict(torch.load(notebook_parameters["storage_parameters"]["path_to_model_storage"].joinpath(folder_name,"state_dict.pth")))