# Prepare
Install the required libraries and import them.

In [0]:
!pip install pytorch-transformers fastprogress

In [0]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [0]:
from fastai import *
from fastai.text import *
from sklearn.model_selection import train_test_split

In [0]:
from pytorch_transformers import BertTokenizer, BertPreTrainedModel, BertModel, BertConfig
from pytorch_transformers import AdamW

In [0]:
from fastprogress import master_bar, progress_bar
from datetime import datetime

Check whether a GPU is available and, if so, which kind is used

In [0]:
# Detect CUDA once and derive the module-level `device` from the result.
cuda_available = torch.cuda.is_available()
if not cuda_available:
    device = torch.device("cpu")
else:
    # Print the name of the GPU that is currently selected.
    curr_device = torch.cuda.current_device()
    print(torch.cuda.get_device_name(curr_device))
    device = torch.device("cuda")
device

In [0]:
def get_memory_usage():
    """Return the GPU memory currently allocated on `device`, in decimal megabytes.

    The byte count reported by `torch.cuda.memory_allocated` is divided by
    1,000,000. When CUDA is not available there is no GPU allocator to query
    (and `device` is a CPU device), so 0.0 is returned instead of calling
    into `torch.cuda`.
    """
    if not torch.cuda.is_available():
        return 0.0
    return torch.cuda.memory_allocated(device)/1000000

def get_memory_usage_str():
    """Return the result of `get_memory_usage()` as a readable string with two decimals."""
    used_mb = get_memory_usage()
    return 'Memory usage: {:.2f} MB'.format(used_mb)

Create a config

In [0]:
class Config(dict):
    """Dictionary whose entries are also available as attributes.

    Every keyword passed to the constructor is stored both as a dict item and
    as an instance attribute. Attribute assignment (`cfg.key = value`), item
    assignment (`cfg['key'] = value`) and `set()` all update both views, so
    code reading `cfg.key` and code reading `cfg['key']` always see the same
    value.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __setattr__(self, key, val):
        # Mirror attribute writes into the dict; without this `cfg.key = v`
        # would leave `cfg['key']` at its old value.
        super().__setattr__(key, val)
        dict.__setitem__(self, key, val)

    def __setitem__(self, key, val):
        dict.__setitem__(self, key, val)
        # Only string keys can be attribute names.
        if isinstance(key, str):
            object.__setattr__(self, key, val)

    def set(self, key, val):
        """Store `val` under `key` as dict item and as attribute."""
        self[key] = val
        setattr(self, key, val)

# Training hyper-parameters
config = Config(
    num_labels = 6, # will be set automatically
    model_name="bert-base-uncased", 
    max_lr=2e-5, # default: 2e-5
    moms=(0.8, 0.7), # default: (0.8, 0.7); alt.(0.95, 0.85)
    epochs=10,
    bs=2, # default: 2 or 4
    weight_decay = 0.01,
    max_seq_len=512,
    train_size=0.9375,
    loss_func=nn.CrossEntropyLoss(), #default: None or nn.CrossEntropyLoss()
    seed=904727489, #default: 904727489, 424242 (reproducibility) or None
    threshold = 0.75,
    undersample = False,
    oversample = False,
)

# Take the timestamp once so that the prediction log and the result file of
# one run always carry the same suffix (two separate now() calls can straddle
# a minute boundary).
run_timestamp = datetime.now().strftime('%Y%m%d-%H%M')

# Locations of data, logs, scripts and models; every entry except root_folder
# is appended to root_folder by plain string concatenation.
config_data = Config(
    root_folder = '.',
    data_folder = '/data/',
    train_data = ['1_classCorpus_Shrunk.tsv'], # 1_classCorpus_BegOnly_512, 1_classCorpus_Shrunk
    eval_data = ['Hadoop_BegOnly_512.tsv'],
    log_file = '/log/classifierPredictions_' + run_timestamp + '.txt',
    answer_set = '/AnswerSetHadoop.md',
    eval_script = '/scripts/eval.py',
    result_file = '/log/classifierResults_' + run_timestamp + '.txt',
    model_path = '/model/',
    model_name = 'BERT4DAT.pkl',
)

load_from_gdrive = True
save_model = False


To import the dataset, we first have to connect to our Google Drive (if the data should be loaded from there). For this, we have to authenticate the access and mount the drive.

In [0]:
if load_from_gdrive:
    # Colab-only module, imported here so it is only required when the drive is actually used
    from google.colab import drive
    # Connect to drive to load the corpus from there
    drive.mount('/content/drive', force_remount=True)
    # set() updates the attribute and the dict entry, keeping both views of the config identical
    config_data.set('root_folder', '/content/drive/My Drive/BERT4DAT')

In [0]:
def _write_log_lines(path, lines, mode='a'):
    """Write every entry of `lines`, each followed by a newline, to `path`.

    The parent folder is created when it does not exist yet. The file is
    written as UTF-8 so that non-ASCII file names or labels do not depend on
    the platform's default encoding.
    """
    import os  # local import: `os` is not imported explicitly by the notebook
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    with open(path, mode, encoding='utf-8') as log:
        for line in lines:
            log.write(line + '\n')

def initLog():
    """Start a new prediction log (overwriting an existing one) with a timestamped settings line."""
    logfile = config_data.root_folder + config_data.log_file
    log_txt = datetime.now().strftime('%Y-%m-%d %H:%M') + ' ' + get_info()
    _write_log_lines(logfile, [log_txt], mode='w')

def logLine(line):
    """Append `line` to the prediction log."""
    logfile = config_data.root_folder + config_data.log_file
    _write_log_lines(logfile, [line])

def logResult(result):
    """Append the settings line followed by every line of `result` to the result file."""
    logfile = config_data.root_folder + config_data.result_file
    _write_log_lines(logfile, [get_info()] + list(result))

In [0]:
def get_info():
    """Return a one-line summary of the current training settings and training files."""
    template = 'model: {}, max_lr: {}, epochs: {}, bs: {}, train_size: {}, weight decay: {}, Threshold: {}, Seed: {}, Data: {}, Undersampling: {}'
    values = (
        config.model_name, config.max_lr, config.epochs, config.bs,
        config.train_size, config.weight_decay, config.threshold,
        config.seed, config_data.train_data, config.undersample,
    )
    return template.format(*values)

In [0]:
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Integer seed, or None to draw a random seed first.

    Returns:
        The seed that was actually used, so a randomly drawn seed can be
        recorded by the caller.
    """
    # Import explicitly instead of relying on a star import to provide `random`.
    import random
    if seed is None:
        seed = random.randint(0, 2**31)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seed the generators of all GPUs explicitly as well.
        torch.cuda.manual_seed_all(seed)
    return seed

# Keep the seed that was actually used: with `seed=None` a random one is drawn,
# and get_info() as well as every `random_state=config.seed` should see that value.
config.set('seed', set_seed(config.seed))
config.seed

# Data


Create proper tokenizer for our data

In [0]:
class FastAiBertTokenizer(BaseTokenizer):
    """Adapter exposing a `BertTokenizer` through fast.ai's `BaseTokenizer` interface."""
    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=512, **kwargs):
        # Additional keyword arguments are accepted but not used.
        self.max_seq_len = max_seq_len
        self._pretrained_tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        # Calling the instance returns the instance itself, whatever the arguments.
        return self

    def tokenizer(self, t:str):
        """Tokenize `t`, keep at most `max_seq_len - 2` tokens and wrap them in [CLS] ... [SEP]."""
        budget = self.max_seq_len - 2
        pieces = self._pretrained_tokenizer.tokenize(t)[:budget]
        return ["[CLS]"] + pieces + ["[SEP]"]



Now, we can create our own databunch using the tokenizer above. Notice we're passing the include_bos=False and include_eos=False options. This is to prevent fastai from adding its own SOS/EOS tokens that will interfere with BERT's SOS/EOS tokens.

We can pass our own list of Preprocessors to the databunch.

In [0]:
class BertTokenizeProcessor(TokenizeProcessor):
    """`TokenizeProcessor` with fastai's own BOS/EOS tokens switched off, because the tokenizer adds its markers itself."""
    def __init__(self, tokenizer):
        fixed_options = dict(include_bos=False, include_eos=False)
        super().__init__(tokenizer=tokenizer, **fixed_options)

class BertNumericalizeProcessor(NumericalizeProcessor):
    """Use a custom vocabulary to match the original BERT model.

    Args:
        vocab: Vocabulary to numericalize with. When omitted, the vocabulary is
            built from a module-level `bert_tok`, as before.
            NOTE(review): no module-level `bert_tok` is defined in this notebook
            (only a local one inside `create_databunch`), so pass `vocab`
            explicitly when using this class.
    """
    def __init__(self, *args, vocab=None, **kwargs):
        # Accepting `vocab` avoids the former "multiple values for keyword
        # argument 'vocab'" error and the hard dependency on the global.
        if vocab is None:
            vocab = Vocab(list(bert_tok.vocab.keys()))
        super().__init__(*args, vocab=vocab, **kwargs)

def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """Return the two-step processor list: BERT tokenization, then numericalization with `vocab`."""
    tokenize_step = BertTokenizeProcessor(tokenizer=tokenizer)
    numericalize_step = NumericalizeProcessor(vocab=vocab)
    return [tokenize_step, numericalize_step]

class BertDataBunch(TextDataBunch):
    # `TextDataBunch` variant whose `from_df` uses the processors returned by `get_bert_processor`.
    @classmethod
    def from_df(cls, path:PathOrStr, train_df:DataFrame, valid_df:DataFrame, test_df:Optional[DataFrame]=None,
              tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, text_cols:IntsOrStrs=1,
              label_cols:IntsOrStrs=0, **kwargs) -> DataBunch:
        """Create a data bunch from DataFrames using the BERT tokenizer and vocabulary.

        `train_df` and `valid_df` are turned into `TextList`s that share the
        processors from `get_bert_processor(tokenizer, vocab)`. Labels are read
        from `label_cols`, texts from `text_cols`. Keyword arguments accepted by
        `get_bert_processor` are routed to it; the remaining ones are passed to
        `databunch()`.
        """
        # separate the kwargs meant for get_bert_processor from those meant for databunch()
        p_kwargs, kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
        # use our custom processors while taking tokenizer and vocab as kwargs
        processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **p_kwargs)
        # several label columns and no explicit classes: the column names become the classes
        if classes is None and is_listy(label_cols) and len(label_cols) > 1: classes = label_cols
        src = ItemLists(path, TextList.from_df(train_df, path, cols=text_cols, processor=processor),
                      TextList.from_df(valid_df, path, cols=text_cols, processor=processor))
        src = src.label_for_lm() if cls==TextLMDataBunch else src.label_from_df(cols=label_cols, classes=classes)
        # NOTE(review): the test TextList is created without `processor=`; confirm that add_test applies the training processors
        if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
        return src.databunch(**kwargs)

Create the BertTextClassifier-Class

In [0]:
class BertTextClassifier(BertPreTrainedModel):
    """Pretrained BERT encoder followed by dropout and a linear layer with one output per label.

    `forward` returns raw logits (no softmax), which is what
    `nn.CrossEntropyLoss` expects.
    """
    # Index used for padding; matches `pad_idx=0` of the collate function in create_databunch.
    pad_idx = 0

    def __init__(self, model_name, num_labels):
        """Load config and weights of `model_name` and add a classification head for `num_labels` labels."""
        config = BertConfig.from_pretrained(model_name)
        super(BertTextClassifier, self).__init__(config)
        self.num_labels = num_labels

        self.bert = BertModel.from_pretrained(model_name, config=config)

        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)

        #self.apply(self.init_weights)

    def forward(self, tokens, labels=None, position_ids=None, token_type_ids=None, attention_mask=None, head_mask=None):
        """Return the classification logits for a batch of token ids.

        `labels` is accepted for interface compatibility but not used. When no
        `attention_mask` is given, one is derived from `tokens` so that padded
        positions are not attended to.
        """
        if attention_mask is None:
            # Without a mask BERT attends to the padding of shorter sequences in the batch.
            attention_mask = (tokens != self.pad_idx).long()
        outputs = self.bert(tokens, position_ids=position_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, head_mask=head_mask)

        pooled_output = outputs[1]
        # According to documentation of pytorch-transformers, pooled output might not be the best
        # and you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence
        #hidden_states = outputs[0]
        #pooled_output = torch.mean(hidden_states, 1)

        dropout_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_output)

        return logits

Load the different data sets

In [0]:
def load_data(filename):
    """Read one tab-separated file from the configured data folder.

    Only the columns 'file', 'label' and 'text' are loaded; rows with a
    missing value are dropped.
    """
    wanted_columns = ['file', 'label', 'text']
    location = config_data.root_folder + config_data.data_folder + filename
    return pd.read_csv(location, sep='\t', usecols=wanted_columns).dropna()

def load_all_data(filenames, limit=-1):
    """Load all files in `filenames`, concatenate them and shuffle the rows.

    Args:
        filenames: Non-empty sequence of file names, each read with `load_data`.
        limit: Kept for backward compatibility; it is not used, all rows are
            always loaded.

    Returns:
        One DataFrame with the rows of all files in shuffled order (seeded with
        `config.seed`). The original per-file index labels are kept, so they
        can repeat when several files are combined.
    """
    frames = [load_data(filenames[0])]
    frames.extend(load_data(name) for name in filenames[1:])
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
    df = pd.concat(frames)
    df = df.sample(frac=1, axis=0, random_state = config.seed)
    return df



Add function for undersampling, which takes randomly chosen elements out of the bigger (label) groups to equalize the size of the groups

In [0]:
def undersample(df):
    """Shrink every label group to the size of the smallest one.

    For each label, as many rows as the smallest group has are drawn at random
    without replacement (using NumPy's global random state). The combined rows
    are shuffled with `config.seed`.

    Args:
        df: DataFrame with a 'label' column.

    Returns:
        The balanced, shuffled DataFrame; an empty `df` is returned unchanged.
    """
    labels = df['label'].unique()
    if len(labels) == 0:
        # nothing to balance
        return df

    # Work with row positions instead of index labels: the index can contain
    # repeated labels (e.g. after combining several files), and selecting by
    # label would then return rows of other groups as well.
    positions_by_label = [np.flatnonzero((df['label'] == label).values) for label in labels]

    # find smallest group size
    sample_size = min(len(positions) for positions in positions_by_label)

    # pick elements out of groups
    samples = []
    for positions in positions_by_label:
        chosen = np.random.choice(positions, sample_size, replace=False)
        samples.append(df.iloc[chosen])

    df_undersampled = pd.concat(samples)
    df_undersampled = df_undersampled.sample(frac=1, axis=0, random_state = config.seed)
    return df_undersampled

def oversample(df):
    """Grow every label group to the size of the largest one by repeating rows.

    A group of size s is brought to the target size m by adding `m // s - 1`
    full copies of the group plus `m - (m // s) * s` rows drawn at random
    without replacement (using NumPy's global random state). The combined rows
    are shuffled with `config.seed`.

    Args:
        df: DataFrame with a 'label' column.

    Returns:
        The balanced, shuffled DataFrame. Index labels of repeated rows are
        repeated as well.
    """
    labels = df['label'].unique()
    # Row positions per label; positions stay unambiguous even if index labels repeat.
    positions_by_label = [np.flatnonzero((df['label'] == label).values) for label in labels]

    # find largest group size
    sample_size = max((len(positions) for positions in positions_by_label), default=0)

    pieces = [df]
    for positions in positions_by_label:
        label_size = len(positions)
        multiplier = sample_size//label_size
        # rows still missing after adding the full copies
        diff = sample_size - (multiplier * label_size)
        chosen = np.random.choice(positions, diff, replace=False)
        pieces.append(df.iloc[chosen])
        # the group is already contained once in `df`, hence multiplier - 1 extra copies
        pieces.extend([df.iloc[positions]] * (multiplier - 1))

    df_sampled = pd.concat(pieces)
    df_sampled = df_sampled.sample(frac=1, axis=0, random_state = config.seed)
    return df_sampled


In [0]:
# load the train datasets
# (no `limit` argument: load_all_data ignores it, so passing one only suggested a truncation that never happened)
df = load_all_data(config_data.train_data)

# shuffle the dataset a bit and get the amount of labels
df = df.sample(frac=1, axis=0, random_state = config.seed)
# set() updates attribute and dict entry, so save_config writes the real number of labels
config.set('num_labels', df['label'].nunique())

print(df.shape)
print(df['label'].value_counts())

# "undersampling"
if config.undersample:
    df = undersample(df)

    print('\nUsing Undersampling. Updated data:')
    print(df.shape)
    print(df['label'].value_counts())

# "oversampling"
if config.oversample:
    df = oversample(df)

    print('\nUsing Oversampling. Updated data:')
    print(df.shape)
    print(df['label'].value_counts())



In [0]:
# load the eval dataset(s)
# (load_all_data also shuffles the rows, seeded with config.seed)
df_eval = load_all_data(config_data.eval_data)

# quick sanity check: overall shape and number of rows per label
print(df_eval.shape)
print(df_eval['label'].value_counts())

Create the dictionary that contains the labels along with their indices. This is useful for evaluation and similar.

Usual dict: {'audit': 0, 'authenticate': 1, 'heartbeat': 2, 'pooling': 3, 'scheduler': 4, 'unrelated': 5}

In [0]:
def create_label_indices(df):
    """Map each label in `df['label']` to an index.

    All labels except 'unrelated' are numbered in sorted order starting at 0;
    'unrelated' always gets the last index, whether or not it occurs in `df`.
    """
    regular_labels = sorted(label for label in df['label'].unique() if label != 'unrelated')
    labelDict = {label: position for position, label in enumerate(regular_labels)}
    labelDict['unrelated'] = len(regular_labels)
    return labelDict

# Build the label -> index mapping from the training data and show it
label_indices = create_label_indices(df)
print(label_indices)

# Create and train the learner/classifier


Create the needed functions to create and train a classifier



In [0]:
def split_dataframe(df, train_size = 0.9, random_state = None):
    """Split `df` into a training and a validation part, stratified by the 'label' column.

    Returns the tuple `(df_trn, df_valid)`.
    """
    split_options = dict(stratify = df['label'], train_size = train_size, random_state = random_state)
    df_trn, df_valid = train_test_split(df, **split_options)
    return df_trn, df_valid
  
def create_databunch(config, df_trn, df_valid):
    """Create the `BertDataBunch` for the given training and validation frames.

    Tokenizer and vocabulary are taken from the pretrained BERT tokenizer named
    by `config.model_name`; texts come from the 'text' column and labels from
    the 'label' column. Batches are padded at the end with index 0.
    """
    bert_tok = BertTokenizer.from_pretrained(config.model_name)
    tok_func = FastAiBertTokenizer(bert_tok, max_seq_len=config.max_seq_len)
    # no fastai pre/post rules: the text goes to the BERT tokenizer unmodified
    fastai_tokenizer = Tokenizer(tok_func=tok_func, pre_rules=[], post_rules=[])
    fastai_bert_vocab = Vocab(list(bert_tok.vocab.keys()))
    collate = partial(pad_collate, pad_first=False, pad_idx=0)
    databunch_options = dict(
        train_df=df_trn,
        valid_df=df_valid,
        tokenizer=fastai_tokenizer,
        vocab=fastai_bert_vocab,
        bs=config.bs,
        text_cols='text',
        label_cols='label',
        collate_fn=collate,
    )
    return BertDataBunch.from_df(".", **databunch_options)


def create_learner(config, databunch):
    """Build a fastai `Learner` around a fresh `BertTextClassifier`.

    Uses AdamW as optimizer, `config.weight_decay` as weight decay,
    `config.loss_func` as loss and macro-averaged F1 as metric.
    """
    model = BertTextClassifier(config.model_name, config.num_labels)
    macro_f1 = FBeta(beta=1, average='macro')
    return Learner(
        databunch, model,
        partial(AdamW),
        loss_func=config.loss_func,
        metrics=macro_f1,
        wd = config.weight_decay,
    )

In [0]:
# To find LR:
# Optional learning-rate search, switched off by default. When enabled it builds
# its own split (70% training), databunch and learner, runs the LR finder and plots the result.
do_find_lr = False
if do_find_lr:
    df_trn, df_valid = split_dataframe(df, train_size = 0.7, random_state = config.seed)
    databunch = create_databunch(config, df_trn, df_valid)
    learner = create_learner(config, databunch)

    learner.lr_find()
    learner.recorder.plot()

Actually create the trained classifier

In [0]:
# Create the classifier
# Stratified split with the configured training share, seeded for reproducibility
df_trn, df_valid = split_dataframe(df, train_size = config.train_size, random_state = config.seed)
databunch = create_databunch(config, df_trn, df_valid)

classifier = create_learner(config, databunch)
# Show the allocated GPU memory after creating the learner
get_memory_usage_str()

In [0]:
# Train the classifier
# One-cycle training with the epochs, maximum learning rate, momentums and weight decay from `config`
classifier.fit_one_cycle(config.epochs, max_lr=config.max_lr, moms=config.moms, wd=config.weight_decay)

In [0]:
# Show the allocated GPU memory again, now after training
get_memory_usage_str()

Save the model along with its config

In [0]:
def create_model_name():
    """Return the model name 'BERT4DAT_e<epochs>_<first training file without extension>'.

    The training file name is taken from `config_data.train_data`; the
    previously referenced `data_filenames` is not defined anywhere in the
    notebook and raised a NameError.
    """
    import os  # local import: `os` is not imported explicitly by the notebook
    # splitext removes the extension whatever its length (the old [:-4] assumed three characters)
    data_filename = os.path.splitext(config_data.train_data[0])[0]
    name = 'BERT4DAT_e{epochs}_{data_filename}'.format(epochs=str(config.epochs),data_filename=data_filename)
    return name

def save_config(model_save_path, model_name):
    """Write all settings of `config` as 'key=value,' lines to '<model_save_path><model_name>.config'.

    Args:
        model_save_path: Folder prefix; it is concatenated as-is, so it has to
            end with a path separator.
        model_name: File name without extension.
    """
    # Merge dict items and attributes. Attribute values win because the rest of
    # the notebook reads the settings as attributes (e.g. `config.num_labels`),
    # and a key present in only one of the two views no longer raises KeyError.
    merged = dict(config)
    merged.update(vars(config))
    settings = ''.join(
        '{item}={value},\n'.format(item=item, value=value)
        for item, value in merged.items()
    )
    save_path = model_save_path + model_name + '.config'
    with open(save_path, 'w', encoding='utf-8') as out:
        out.write(settings)



In [0]:
# Export the trained learner together with a text dump of its settings
if save_model:
    model_save_path = config_data.root_folder + config_data.model_path
    model_name = create_model_name()
    save_config(model_save_path, model_name)
    classifier.export(file = model_save_path + model_name + '.pkl')

# Predictor


Create a predictor class. It uses the prediction of the classifier/learner, but any prediction whose confidence does not exceed a threshold is labeled as 'unrelated'.

In [0]:
class Predictor:
    """Wraps a trained classifier and falls back to a default label for low-confidence predictions."""
    def __init__(self, classifier, threshold=0.75, default_value =  'unrelated'):
        self.classifier = classifier
        self.classes = classifier.data.classes
        self.threshold = threshold
        self.default_value = default_value

    def predict(self, text):
        """Return the predicted class name, or `default_value` unless its probability exceeds `threshold`."""
        prediction = self.classifier.predict(text)
        # element 1 is the index of the predicted class, element 2 holds the per-class probabilities
        class_idx, class_probs = prediction[1], prediction[2]
        confidence = class_probs[class_idx].item()
        return self.classes[class_idx] if confidence > self.threshold else self.default_value

# Use trained classifier

Load the saved model and create the predictor

In [0]:
# Uncomment to use a previously exported model instead of the classifier trained above:
#classifier = load_learner(config_data.root_folder + config_data.model_path, config_data.model_name)
# Predictions whose probability does not exceed config.threshold are reported as 'unrelated'
predictor = Predictor(classifier, threshold=config.threshold)

Predict/classify

In [0]:
# Start a fresh prediction log, then write one "<file> -> <label>" line per evaluated text
initLog()
for row in progress_bar(df_eval.itertuples(), total=len(df_eval)):
    filename, class_text = row.file, row.text
    prediction = predictor.predict(class_text)
    logLine('{} -> {}'.format(filename, prediction))

Analyse outputs

In [0]:
# Absolute locations of the evaluation script, the answer set and the prediction log
EVAL_SCRIPT, ANSWERS, LOG_FILE = (
    config_data.root_folder + relative_path
    for relative_path in (config_data.eval_script, config_data.answer_set, config_data.log_file)
)

# Paths are quoted because the root folder may contain spaces
eval_command = 'python "{}" "{}" "{}"'.format(EVAL_SCRIPT, ANSWERS, LOG_FILE)

In [0]:
# Run the evaluation script through the shell (IPython `!` syntax) and capture its output lines
result = !{eval_command}
# Append the settings line and the evaluation output to the result file
logResult(result)

# Show the evaluation output in the notebook as well
for line in result:
    print(line)