# Prepare
Install required libraries and import

In [None]:
!pip install pytorch-pretrained-bert

In [None]:
import numpy as np
import pandas as pd
from typing import *
import torch
import torch.nn as nn
import random

In [None]:
from fastai import *
from fastai.text import *

In [None]:
from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification
from pytorch_pretrained_bert import BertTokenizer, BertAdam
from sklearn.model_selection import train_test_split

In [None]:
from datetime import datetime

Check whether a GPU is available and, if so, which one is being used

In [None]:
# Report whether CUDA is usable; when it is, also print the name of the current device.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # Index of the currently selected CUDA device, used to look up its name.
    curr_device = torch.cuda.current_device()
    print(torch.cuda.get_device_name(curr_device))

Create a config

In [None]:
class Config(dict):
    """Dictionary whose entries are also readable and writable as attributes.

    Every write, whether `cfg[key] = val`, `cfg.key = val` or `cfg.set(key, val)`,
    updates both the dict entry and the instance attribute, so the two views
    cannot drift apart.

    Note: `dict.update` and item deletion are not mirrored; use one of the
    three write paths above.
    """

    def __init__(self, **kwargs):
        super().__init__()
        # Route every initial value through __setitem__ so both views are filled.
        for k, v in kwargs.items():
            self[k] = v

    def __setitem__(self, key, val):
        """Store `val` under `key` and mirror it as an attribute when possible."""
        super().__setitem__(key, val)
        # Only string keys can be attribute names; other keys stay dict-only.
        if isinstance(key, str):
            super().__setattr__(key, val)

    def __setattr__(self, key, val):
        """Set attribute `key` and keep the dict entry of the same name in sync."""
        super().__setattr__(key, val)
        super().__setitem__(key, val)

    def set(self, key, val):
        """Set `key` to `val` in both the dict view and the attribute view."""
        self[key] = val

# Training configuration consumed by CrossValidator (model, optimiser, schedule, data split).
config = Config(
    bert_model_name="bert-base-uncased", # default: "bert-base-uncased", alt: "bert-large-uncased"
    max_lr=2e-5, # default: 2e-5; passed to fit_one_cycle as max_lr
    moms=(0.8, 0.7), # default: (0.95, 0.85) or (0.8, 0.7); passed to fit_one_cycle as moms
    epochs=10, # default: 5, 6, 10 or 20
    use_fp16=False, # default: False; True calls learner.to_fp16()
    bs=2, # default: 2, 4 or 8; batch size of the databunch
    max_seq_len=512, # default: 128; token budget per text, including [CLS] and [SEP]
    train_size=0.9, # default: 0.9; share of each fold's training data kept for training (rest is validation)
    use_bertAdam=True, # default: True; False falls back to AdamW
    loss_func=nn.CrossEntropyLoss(), # default: None or nn.CrossEntropyLoss()
    threshold=0.9, # default: 0.9; predictions at or below this probability are reported as 'unrelated'
    seed=904727489, # default: 31337, 424242 (reproducibility) or None (draw random seeds)
)

# Evaluation / logging configuration for the cross-validation run.
config_eval = Config(
    num_repeats=1,  # how many times the whole cross-validation is repeated
    num_folds=10,  # k of the k-fold cross-validation
    log_to_file = True,  # False disables logLine/logResult output
    log_file='./log/log_shrunk.txt',  # replaced further down when `shrunk` is False
)
# True selects the '*_Shrunk.tsv' corpus; False selects the '*_BegOnly_512.tsv' corpus.
shrunk = False
# True mounts Google Drive (Colab) and redirects the data and log paths there.
load_from_gdrive = False


In [None]:
# One seed per repetition: draw fresh random seeds when none is configured,
# otherwise reuse the configured seed for every repetition.
if config.seed is None:
    seeds = [random.randint(0, 2**31) for _ in range(config_eval.num_repeats)]
else:
    seeds = [config.seed] * config_eval.num_repeats

In [None]:
# easily control what data is loaded and where to log to
# Runs on the unshortened corpus get their own log file instead of the default one.
if not shrunk:
    config_eval.log_file = './log/log_beg.txt'

Set up where the data comes from

In [None]:
# Folder that holds the corpus files (relative to the notebook).
data_folder = './data/'

# Corpus variant follows the `shrunk` switch.
data_filenames = ['1_classCorpus_Shrunk.tsv'] if shrunk else ['1_classCorpus_BegOnly_512.tsv']

# One file shortened: ['1_classCorpus_Shrunk.tsv']
# One file unshortened: ['1_classCorpus_BegOnly_512.tsv']
# Second (smaller) dataset:
# One file shortened: ['2_classCorpus_Shrunk.tsv']
# One file unshortened: ['2_classCorpus_BegOnly_512.tsv']


In [None]:
if load_from_gdrive:
    from google.colab import drive

    # Connect to drive to load the corpus from there
    gdrive_root = '/content/drive/My Drive'
    # Rewrite only a leading './'. A blanket replace of '.' would also hit dots
    # inside file names, and re-running this cell would mangle the '.txt'
    # extension of an already redirected log path.
    if data_folder.startswith('./'):
        data_folder = gdrive_root + data_folder[1:]
    if config_eval.log_file.startswith('./'):
        # Logs live below the 'data' folder on the drive.
        config_eval.log_file = gdrive_root + '/data' + config_eval.log_file[1:]
    drive.mount('/content/drive', force_remount=True)

# Data


To import the dataset, we first have to connect to our Google Drive. For this, we have to authenticate the access and mount the drive.

In [None]:
# Logging
def get_info():
    """Return a one-line summary of the training configuration and the data files in use."""
    template = 'model: {}, max_lr: {}, epochs: {}, bs: {}, msl: {}, train_size: {}, BERT-Adam: {}, FP16: {}, Loss: {}, Threshold: {}, Data: {}'
    # Values in the same order as the placeholders of the template.
    values = (
        config.bert_model_name,
        config.max_lr,
        config.epochs,
        config.bs,
        config.max_seq_len,
        config.train_size,
        config.use_bertAdam,
        config.use_fp16,
        config.loss_func,
        config.threshold,
        data_filenames,
    )
    return template.format(*values)
    
def logResult(precisions, recalls, accuracy, confusionMatrix):
    """Append the evaluation results to the configured log file.

    Does nothing when `config_eval.log_to_file` is falsy.

    Args:
        precisions: per-label precision values; their average is logged as well.
        recalls: per-label recall values; their average is logged as well.
        accuracy: overall accuracy, written as-is.
        confusionMatrix: confusion matrix, written using its string form.
    """
    if not config_eval.log_to_file:
        return
    import os  # local import so this cell does not depend on an extra top-level import

    # Build the full entry before touching the file.
    avg_precision = np.average(precisions)
    avg_recall = np.average(recalls)
    log_txt = 'Precision: Average {0:.2%} -> {1}\n Recall: Average {2:.2%} -> {3}\n'.format(avg_precision, precisions, avg_recall, recalls)
    log_txt += 'Accuracy: {}\n'.format(accuracy)
    log_txt += '{}'.format(confusionMatrix)

    # Create the log directory on first use so a fresh checkout does not fail.
    log_dir = os.path.dirname(config_eval.log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(config_eval.log_file, 'a') as log:
        log.write("{}\n".format(log_txt))

def logLine(line):
    """Append `line` plus a newline to the configured log file.

    Does nothing when `config_eval.log_to_file` is falsy.
    """
    if not config_eval.log_to_file:
        return
    import os  # local import so this cell does not depend on an extra top-level import

    # Create the log directory on first use so a fresh checkout does not fail.
    log_dir = os.path.dirname(config_eval.log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(config_eval.log_file, 'a') as log:
        log.write(line + '\n')
    
# Start this run's log section with a timestamp followed by the configuration summary.
logLine(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
logLine(get_info())

Create proper tokenizer for our data

In [None]:

class FastAiBertTokenizer(BaseTokenizer):
    """Adapter that exposes a pretrained `BertTokenizer` through fast.ai's tokenizer interface."""
    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=512, **kwargs):
        self.max_seq_len = max_seq_len
        self._pretrained_tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        # An instance is handed to fast.ai as `tok_func`; calling it returns the
        # instance itself, whatever the arguments.
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Tokenize `t`, truncate to the length budget, and wrap the result in [CLS] ... [SEP]."""
        # Two positions are reserved for the [CLS] and [SEP] markers.
        budget = self.max_seq_len - 2
        word_pieces = self._pretrained_tokenizer.tokenize(t)
        return ["[CLS]"] + word_pieces[:budget] + ["[SEP]"]



Now, we can create our own databunch using the tokenizer above. Notice that we pass the include_bos=False and include_eos=False options. This prevents fastai from adding its own BOS/EOS tokens, which would interfere with the [CLS]/[SEP] markers that BERT expects and that our tokenizer already adds.

We can pass our own list of Preprocessors to the databunch.

In [None]:
class BertTokenizeProcessor(TokenizeProcessor):
    """Tokenize processor with fast.ai's BOS/EOS markers switched off; the tokenizer adds its own."""
    def __init__(self, tokenizer):
        marker_flags = dict(include_bos=False, include_eos=False)
        super().__init__(tokenizer=tokenizer, **marker_flags)

class BertNumericalizeProcessor(NumericalizeProcessor):
    """Numericalize processor that uses a BERT vocabulary.

    Args:
        vocab: vocabulary to numericalize with. When omitted, the vocabulary is
            built from a module-level `bert_tok` tokenizer, which must then
            exist (otherwise a NameError is raised).
    """
    def __init__(self, *args, vocab=None, **kwargs):
        if vocab is None:
            # Legacy fallback: relies on a global `bert_tok` being defined by the caller.
            vocab = Vocab(list(bert_tok.vocab.keys()))
        super().__init__(*args, vocab=vocab, **kwargs)

def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """Build the processor list (tokenize, then numericalize) for BERT input."""
    tokenize_step = BertTokenizeProcessor(tokenizer=tokenizer)
    numericalize_step = NumericalizeProcessor(vocab=vocab)
    return [tokenize_step, numericalize_step]

class BertDataBunch(TextDataBunch):
    """`TextDataBunch` variant that builds its data with the BERT processors."""
    @classmethod
    def from_df(cls, path:PathOrStr, train_df:DataFrame, valid_df:DataFrame, test_df:Optional[DataFrame]=None,
              tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, text_cols:IntsOrStrs=1,
              label_cols:IntsOrStrs=0, **kwargs) -> DataBunch:
        "Create a `TextDataBunch` from DataFrames."
        # Keyword arguments understood by get_bert_processor go to the processors,
        # everything else is forwarded to `databunch`.
        processor_kwargs, databunch_kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
        bert_processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **processor_kwargs)

        # Several label columns without explicit classes: the column names become the classes.
        if classes is None and is_listy(label_cols) and len(label_cols) > 1:
            classes = label_cols

        train_list = TextList.from_df(train_df, path, cols=text_cols, processor=bert_processor)
        valid_list = TextList.from_df(valid_df, path, cols=text_cols, processor=bert_processor)
        item_lists = ItemLists(path, train_list, valid_list)

        if cls==TextLMDataBunch:
            labelled = item_lists.label_for_lm()
        else:
            labelled = item_lists.label_from_df(cols=label_cols, classes=classes)

        if test_df is not None:
            labelled.add_test(TextList.from_df(test_df, path, cols=text_cols))
        return labelled.databunch(**databunch_kwargs)

Load/get the different data sets for training and evaluation

In [None]:
def load_data(filename):
    """Read the 'label' and 'text' columns of a tab-separated file in `data_folder`, dropping rows with missing values."""
    frame = pd.read_csv(data_folder + filename, sep='\t', usecols=['label', 'text'])
    return frame.dropna()

def load_all_data(filenames):
    """Load every file in `filenames` and stack the rows in the given order.

    Args:
        filenames: file names relative to `data_folder`.

    Returns:
        One DataFrame with the rows of all files; each file keeps its own row
        index (no re-indexing).

    Raises:
        ValueError: raised by pandas if `filenames` is empty.
    """
    # `DataFrame.append` was removed in pandas 2.0; a single `pd.concat` also
    # avoids copying the growing frame once per file.
    frames = [load_data(name) for name in filenames]
    return pd.concat(frames)

# load the datasets from files
# Full corpus used by every cell below.
df = load_all_data(data_filenames)

# Sanity check: overall size and number of rows per label.
print(df.shape)
print(df['label'].value_counts())

In [None]:
def create_label_indices(df):
    """Map each label in `df['label']` to an index.

    All labels except 'unrelated' are numbered in sorted order starting at 0;
    'unrelated' always gets the last index, whether or not it occurs in `df`.
    """
    regular_labels = sorted(label for label in df['label'].unique() if label != 'unrelated')
    label_to_index = {label: position for position, label in enumerate(regular_labels)}
    label_to_index['unrelated'] = len(regular_labels)
    return label_to_index

# Label -> row/column position in the confusion matrix built by Evaluator.
label_indices = create_label_indices(df)
print(label_indices)

# Create Predictor and Evaluator


Create a predictor class. It uses the prediction of the classifier/learner, but predictions whose confidence does not exceed a threshold are labeled as 'unrelated'.

In [None]:
class Predictor:
    """Wrap a classifier and replace low-confidence predictions with a default label."""

    def __init__(self, classifier, threshold=0.85, default_value =  'unrelated'):
        self.classifier = classifier
        self.classes = self.classifier.data.classes
        self.threshold = threshold
        self.default_value = default_value

    def predict(self, text):
        """Return the predicted class name, or `default_value` if its probability does not exceed the threshold."""
        outcome = self.classifier.predict(text)
        # outcome[1] is the predicted class index, outcome[2] the per-class probabilities.
        class_idx = outcome[1]
        confidence = outcome[2][class_idx].item()
        return self.classes[class_idx] if confidence > self.threshold else self.default_value

Create an evaluator class along with some useful functions

In [None]:
# Eval builds and uses a confusion matrix M like this one:
#        | Declare L1  |  Declare L2 | Declare L3 | 
# |Is L1 |             |             |            | 
# |Is L2 |             |             |            | 
# |Is L3 |             |             |            | 
# Precision for, e.g., L1 is then M_ii/Sum_j(M_ji) with i=0, so M_ii divided by the column for L1
# Recall for, e.g., L1 is then M_ii/Sum_j(M_ij) with i=0, so M_ii divided by the row for L1

def calculate_precisions(confusion_matrix):
    """Return the per-label precision: diagonal entry divided by its column sum.

    Args:
        confusion_matrix: square matrix with gold labels as rows and predicted
            labels as columns. Any numeric dtype (or nested list) is accepted.

    Returns:
        1-D float array with one precision per label. Labels that were never
        predicted (column sum 0) get a precision of 0.
    """
    # Work on a float copy: writing ratios into an integer array would truncate
    # them to 0 or 1.
    matrix = np.asarray(confusion_matrix, dtype=float)
    column_totals = matrix.sum(axis=0)
    precisions = np.zeros_like(column_totals)
    # Divide only where the column is non-empty; other entries keep their 0.
    np.divide(matrix.diagonal(), column_totals, out=precisions, where=column_totals != 0)
    return precisions

def calculate_recalls(confusion_matrix):
    """Return the per-label recall: diagonal entry divided by its row sum.

    Args:
        confusion_matrix: square matrix with gold labels as rows and predicted
            labels as columns. Any numeric dtype (or nested list) is accepted.

    Returns:
        1-D float array with one recall per label. Labels without any gold
        example (row sum 0) get a recall of 0.
    """
    # Work on a float copy: writing ratios into an integer array would truncate
    # them to 0 or 1.
    matrix = np.asarray(confusion_matrix, dtype=float)
    row_totals = matrix.sum(axis=1)
    recalls = np.zeros_like(row_totals)
    # Divide only where the row is non-empty; other entries keep their 0.
    np.divide(matrix.diagonal(), row_totals, out=recalls, where=row_totals != 0)
    return recalls

def calculate_accuracy(confusion_matrix):
    """Return the share of correct predictions: sum of the diagonal over the sum of all entries."""
    correct = confusion_matrix.diagonal().sum()
    total = confusion_matrix.sum()
    return correct / total

class Evaluator:
    """Build a confusion matrix by running a predictor over labelled texts."""

    def __init__(self, predictor):
        self.predictor = predictor

    def evaluate(self, df_eval, num_labels=None):
        """Return the confusion matrix of the predictor on `df_eval`.

        Args:
            df_eval: DataFrame with a 'label' (gold) and a 'text' column.
            num_labels: side length of the matrix; defaults to the number of
                entries in the module-level `label_indices` mapping.

        Returns:
            Float array of shape (num_labels, num_labels); rows are gold
            labels, columns are predicted labels, positions come from
            `label_indices`.

        Raises:
            KeyError: if a gold or predicted label is missing from `label_indices`.
        """
        # Honour an explicit size instead of silently overwriting the argument.
        if num_labels is None:
            num_labels = len(label_indices)
        confusion_matrix = np.zeros(shape=(num_labels, num_labels))
        for row in df_eval.itertuples():
            gold_idx = label_indices[row.label]
            pred_idx = label_indices[self.predictor.predict(row.text)]
            confusion_matrix[gold_idx, pred_idx] += 1
        return confusion_matrix
   

# Cross-Validation

Combine all the stuff above and do a cross-validation, where everything is executed and evaluated multiple times with different shuffling of the data. This way, we know better how stable our approach is regarding different training and test data. 

Create the class CrossValidator that will do the training and validation.
Properly mix the train and eval set!



In [None]:
def split_dataframe(df, train_size = 0.9, random_state = None):
    """Split `df` into a training and a validation part, stratified by the 'label' column."""
    split_options = dict(stratify=df['label'], train_size=train_size, random_state=random_state)
    train_part, valid_part = train_test_split(df, **split_options)
    return train_part, valid_part
  
class CrossValidator:
    '''Cross Validation done as k-fold cross validation.

    The data is shuffled once on construction. For every fold, one slice is
    held out for evaluation, the rest is split into training and validation
    data, a fresh BERT classifier is trained, and precision, recall, accuracy
    and the confusion matrix are computed on the held-out slice.
    '''

    def __init__(self, config, df):
        self.config = config
        self.df = df.sample(frac=1, axis=0, random_state = config.seed) # shuffle data
        self.num_labels = df['label'].nunique()

    def __create_databunch(self, df_trn, df_valid):
        '''Build the databunch for one fold using the BERT tokenizer and vocabulary.'''
        bert_tok = BertTokenizer.from_pretrained(self.config.bert_model_name,)
        fastai_tokenizer = Tokenizer(tok_func=FastAiBertTokenizer(bert_tok, max_seq_len=self.config.max_seq_len), pre_rules=[], post_rules=[])
        fastai_bert_vocab = Vocab(list(bert_tok.vocab.keys()))
        return BertDataBunch.from_df(".",
                         train_df=df_trn,
                         valid_df=df_valid,
                         tokenizer=fastai_tokenizer,
                         vocab=fastai_bert_vocab,
                         bs=self.config.bs,
                         text_cols='text',
                         label_cols='label',
                         collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),
                    )

    def __create_learner(self, databunch):
        '''Create a fresh pretrained BERT classifier wrapped in a fastai Learner.'''
        bert_model = BertForSequenceClassification.from_pretrained(self.config.bert_model_name, num_labels=self.num_labels)

        optimizer = AdamW # AdamW is the default optimizer of fastai.Learner
        if self.config.use_bertAdam:
            # BertAdam optimizer
            optimizer = partial(BertAdam)

        learner = Learner(
            databunch, bert_model,
            optimizer,
            metrics=accuracy,
            loss_func=self.config.loss_func
        )
        if self.config.use_fp16:
            learner.to_fp16()
        return learner

    def split_for_fold(self, fold, num_folds = 10):
        ''' Splits the data into two parts. the first part is the i'th fold of a k-fold and the second part is the rest of the data

        `fold` is 1-based (1 .. num_folds). Returns `(df_train, df_eval)`.
        Raises ValueError for a fold outside that range, which would otherwise
        silently produce an empty or wrong evaluation slice.
        '''
        if not 1 <= fold <= num_folds:
            raise ValueError('fold must be between 1 and num_folds ({}), got {}'.format(num_folds, fold))
        n, i, k = len(self.df), fold, num_folds
        start, stop = n*(i-1)//k, n*i//k
        df_eval = self.df[start:stop]
        # `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same rows and index.
        df_train = pd.concat([self.df[:start], self.df[stop:]])
        return df_train, df_eval

    def validate_one_fold(self, fold, num_folds, info = None):
        '''Train on all data except fold `fold` and evaluate on that fold.

        `info` is an optional progress message printed before training starts.
        Returns `(precisions, recalls, accuracy, confusion_matrix)`.
        '''
        import gc  # local import: only needed for the per-fold clean-up below

        print('Shuffle data and create fold.')
        df_trn, df_eval = self.split_for_fold(fold, num_folds)
        df_trn, df_valid = split_dataframe(df_trn, train_size = self.config.train_size, random_state = self.config.seed)
        databunch = self.__create_databunch(df_trn, df_valid)

        if info: print(info)
        print('Create classifier and start training it. Currently in fold {}.'.format(fold))
        learner = self.__create_learner(databunch)
        learner.fit_one_cycle(self.config.epochs, max_lr=self.config.max_lr, moms=self.config.moms)

        print('Start evaluating the trained classifier on the evaluation data.')
        evaluator = Evaluator(Predictor(learner, threshold=self.config.threshold))
        confusion_matrix = evaluator.evaluate(df_eval)
        precisions = calculate_precisions(confusion_matrix)
        recalls = calculate_recalls(confusion_matrix)
        accuracy = calculate_accuracy(confusion_matrix)

        # Release the model before the next fold allocates a new one. The
        # evaluator still references the learner, so it has to go as well;
        # gc.collect() breaks any remaining reference cycles and empty_cache()
        # hands cached GPU blocks back to the driver.
        del evaluator
        del learner
        del databunch
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return precisions, recalls, accuracy, confusion_matrix

    def validate(self, num_folds):
        '''Run the full k-fold cross validation.

        Returns `(precision, recall, avg_acc, confusion_matrix)`: per-label
        precision and recall and the accuracy are averaged over the folds,
        while the confusion matrix is the plain sum over all folds.
        '''
        accuracy_sum = 0
        # NOTE(review): the per-fold arrays are sized by the module-level
        # `label_indices` (which always contains 'unrelated'), these accumulators
        # by the number of distinct labels in the data. They only match when
        # 'unrelated' occurs in the data; confirm for new corpora.
        recall_sum = np.zeros(self.num_labels)
        precision_sum = np.zeros(self.num_labels)
        confusion_matrix_sum = np.zeros((self.num_labels, self.num_labels))
        info = None

        for current_fold in range(1, num_folds + 1):
            precisions, recalls, accuracy, confusion_matrix = self.validate_one_fold(current_fold, num_folds, info)

            precision_sum += precisions
            recall_sum += recalls
            accuracy_sum += accuracy
            confusion_matrix_sum += confusion_matrix

            # Running averages over the folds finished so far; shown before the next fold trains.
            recall = np.average(recall_sum / current_fold)
            precision = np.average(precision_sum / current_fold)
            avg_acc = accuracy_sum / current_fold
            info = 'Finished fold {0} with average recall of {1:.2%}, average precision of {2:.2%}, and avg. accuracy of {3:.2%}'.format(current_fold, recall, precision, avg_acc)

        recall = recall_sum / num_folds
        precision = precision_sum / num_folds
        avg_acc = accuracy_sum / num_folds
        confusion_matrix = confusion_matrix_sum # CARE! Not divided by num_folds, just 'raw' confusion matrix (for analysing/debugging etc)
        info = 'Finished validation with recalls of {0} and precisions of {1} and accuracy of {2}'.format(recall, precision, avg_acc)
        print(info)
        return precision, recall, avg_acc, confusion_matrix

Method to set seeds to allow reproducibility

In [None]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic algorithms. Without the cuDNN settings,
    GPU runs can differ even when the seeds are identical.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # Trade some speed for repeatable results: no auto-tuned, non-deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

Start the cross-validation

In [None]:
# Repeat the whole cross-validation `num_repeats` times, each with its own seed.
# The loop variable is deliberately not called `round`: that would shadow the
# builtin for every cell executed afterwards.
for repeat_idx in range(config_eval.num_repeats):
    curr_seed = seeds[repeat_idx]
    set_seed(curr_seed)
    logLine('Seed: {}'.format(curr_seed))

    cross_validator = CrossValidator(config, df)
    # result = (precisions, recalls, average accuracy, summed confusion matrix)
    result = cross_validator.validate(config_eval.num_folds)
    logResult(*result)

Dict: {'audit': 0, 'authenticate': 1, 'heartbeat': 2, 'pooling': 3, 'scheduler': 4, 'unrelated': 5}