# Prepare
Install the required libraries and import them.

In [0]:
!pip install pytorch-pretrained-bert

In [0]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [0]:
from fastai import *
from fastai.text import *

In [0]:
from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification
from pytorch_pretrained_bert import BertTokenizer, BertAdam
from sklearn.model_selection import train_test_split

In [0]:
from datetime import datetime

Check whether a GPU is available and, if so, which kind it is.

In [0]:
# Print whether CUDA is available and, if it is, the name of the current device.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # Index of the currently selected CUDA device, used to look up its name.
    curr_device = torch.cuda.current_device()
    print(torch.cuda.get_device_name(curr_device))

Create a config

In [0]:
class Config(dict):
    """Dictionary whose entries can also be read and written as attributes.

    Every way of writing a setting (`cfg.key = v`, `cfg['key'] = v` and
    `cfg.set('key', v)`) updates both the mapping and the attribute, so the
    two views never drift apart.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __setattr__(self, key, val):
        # Mirror attribute writes into the mapping; without this,
        # `cfg.key = v` would leave `cfg['key']` at its old value.
        super().__setattr__(key, val)
        super().__setitem__(key, val)

    def __setitem__(self, key, val):
        super().__setitem__(key, val)
        # Only string keys can be attribute names.
        if isinstance(key, str):
            super().__setattr__(key, val)

    def set(self, key, val):
        """Store `val` under `key`, as both a mapping entry and an attribute."""
        self[key] = val
        setattr(self, key, val)

# Training settings: read when building the databunch and the learner, when
# running the training cycle, and when writing the run summary to the log.
config = Config(
    bert_model_name="bert-base-uncased", # default: "bert-base-uncased", alt: "bert-large-uncased"
    max_lr=2e-5, # default: 3e-5
    moms=(0.8, 0.7), # default: (0.8, 0.7); alt.(0.95, 0.85)
    epochs=10,
    use_fp16=False, # default: False
    bs=8, # default: 2 or 4
    max_seq_len=512,
    train_size=0.9,
    use_bertAdam=True,
    loss_func=nn.CrossEntropyLoss(), #default: None or nn.CrossEntropyLoss()
    seed=904727489, #default: 904727489, 424242 (reproducibility) or None
)

# Settings for applying the trained classifier: the probability threshold used
# by the predictor, the evaluation data, and where the model and log file live.
# The log file name carries the notebook start time (minute resolution).
config_use = Config(
    threshold = 0.75,
    data_folder = './data/',
    data_filenames = ['Hadoop_BegOnly_512.tsv'],
    model_path = './model/',
    model_name = 'BERT4DAT.pkl',
    log_file = './log/classifierResults' + datetime.now().strftime('%Y%m%d-%H%M') + '.txt',
)

# When True, Google Drive is mounted and the paths are redirected into it.
load_from_gdrive = True
# When True, the trained learner and its config are written to disk after training.
save_model = False


Set up where the data comes from

In [0]:
# Folder and file names of the training corpus; load_data joins them by plain
# string concatenation, so data_folder must end with a path separator.
data_folder = './data/'
data_filenames = ['1_classCorpus_BegOnly_512.tsv']
# Corpus names noted by the author (presumably the available variants; verify):
# 1_classCorpus_BegOnly_512
# 1_classCorpus_Shrunk
# 2_classCorpus_BegOnly_512
# 2_classCorpus_Shrunk

To import the dataset, we first have to connect to our Google Drive (if the data should be loaded from there). For this, we have to authenticate the access and mount the drive.

In [0]:
# Redirect all paths into the mounted Google Drive when requested.
if load_from_gdrive:
    from google.colab import drive
    # Connect to drive to load the corpus from there
    drive.mount('/content/drive', force_remount=True)
    # Swap the leading '.' of the relative path for the Drive root.
    # Note: without a count argument, replace() would rewrite every '.' in the string.
    data_folder = data_folder.replace('.', '/content/drive/My Drive')
    # Models are stored next to the data on Drive (overrides './model/').
    config_use.model_path = data_folder
    config_use.data_folder = config_use.data_folder.replace('.', '/content/drive/My Drive')
    # NOTE(review): the leading '.' of the log path is replaced by the data folder,
    # which already ends in '/', so the log ends up at '<data folder>//log/...'.
    config_use.log_file = config_use.log_file.replace('.', config_use.data_folder, 1)

In [0]:
def logLine(line):
    """Append `line`, followed by a newline, to the file at `config_use.log_file`.

    The log file's parent directory is created when it does not exist yet;
    opening a file in append mode creates the file but not its directory.
    The file is written as UTF-8, matching `save_config`.
    """
    import os  # local import keeps this cell independent of star imports

    log_dir = os.path.dirname(config_use.log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(config_use.log_file, 'a', encoding='utf-8') as log:
        log.write(line + '\n')

In [0]:
def get_info():
    """Return a one-line summary of the current training and usage settings.

    The values come from the module-level `config`, `config_use` and
    `data_filenames`.
    """
    template = 'model: {}, max_lr: {}, epochs: {}, bs: {}, msl: {}, train_size: {}, BERT-Adam: {}, FP16: {}, Loss: {}, Threshold: {}, Seed: {}, Data: {}'
    # The order must match the placeholders in the template above.
    values = (
        config.bert_model_name,
        config.max_lr,
        config.epochs,
        config.bs,
        config.max_seq_len,
        config.train_size,
        config.use_bertAdam,
        config.use_fp16,
        config.loss_func,
        config_use.threshold,
        config.seed,
        data_filenames,
    )
    return template.format(*values)

# Data


Create proper tokenizer for our data

In [0]:

class FastAiBertTokenizer(BaseTokenizer):
    """Adapter that exposes a pretrained BertTokenizer through fast.ai's tokenizer interface."""
    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=512, **kwargs):
        # Additional keyword arguments are accepted but not used.
        self.max_seq_len = max_seq_len
        self._pretrained_tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        # Calling the instance returns the instance itself, whatever the arguments.
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Tokenize `t`, truncate it, and wrap the result in [CLS] ... [SEP].

        Two positions are reserved for the markers, so at most
        `max_seq_len - 2` word pieces are kept.
        """
        piece_budget = self.max_seq_len - 2
        word_pieces = self._pretrained_tokenizer.tokenize(t)[:piece_budget]
        return ["[CLS]"] + word_pieces + ["[SEP]"]



Now, we can create our own databunch using the tokenizer above. Notice we're passing the include_bos=False and include_eos=False options. This is to prevent fastai from adding its own SOS/EOS tokens that will interfere with BERT's SOS/EOS tokens.

We can pass our own list of Preprocessors to the databunch.

In [0]:
class BertTokenizeProcessor(TokenizeProcessor):
    """Tokenize processor that leaves out fastai's own BOS/EOS tokens.

    The begin/end markers are added by the tokenizer itself.
    """
    def __init__(self, tokenizer):
        base_options = dict(tokenizer=tokenizer, include_bos=False, include_eos=False)
        super().__init__(**base_options)

class BertNumericalizeProcessor(NumericalizeProcessor):
    """Numericalize processor that uses BERT's vocabulary instead of a learned one.

    Pass `vocab` explicitly to control which vocabulary is used. When it is
    omitted, the vocabulary is built from a module-level `bert_tok`, as before.
    NOTE(review): no visible cell defines `bert_tok` at module level (it is a
    local variable of `create_databunch`), so the fallback raises NameError
    unless it is defined elsewhere. Confirm, or always pass `vocab`.
    """
    def __init__(self, *args, vocab=None, **kwargs):
        if vocab is None:
            # Legacy fallback: relies on a global BERT tokenizer.
            vocab = Vocab(list(bert_tok.vocab.keys()))
        super().__init__(*args, vocab=vocab, **kwargs)

def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """Return the two-step processor list: BERT tokenization, then numericalization with `vocab`."""
    tokenize_step = BertTokenizeProcessor(tokenizer=tokenizer)
    numericalize_step = NumericalizeProcessor(vocab=vocab)
    return [tokenize_step, numericalize_step]

class BertDataBunch(TextDataBunch):
    @classmethod
    def from_df(cls, path:PathOrStr, train_df:DataFrame, valid_df:DataFrame, test_df:Optional[DataFrame]=None,
              tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, text_cols:IntsOrStrs=1,
              label_cols:IntsOrStrs=0, **kwargs) -> DataBunch:
        """Create a `TextDataBunch` from DataFrames using the BERT processors.

        The training and validation text lists are built with the processor
        list from `get_bert_processor`, labelled from `label_cols`, and then
        turned into a databunch. Remaining keyword arguments are forwarded to
        `databunch()`.
        """
        # Separate the keyword arguments that get_bert_processor accepts from the rest.
        p_kwargs, kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
        # use our custom processors while taking tokenizer and vocab as kwargs
        processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **p_kwargs)
        # With several label columns and no explicit classes, the column names become the classes.
        if classes is None and is_listy(label_cols) and len(label_cols) > 1: classes = label_cols
        src = ItemLists(path, TextList.from_df(train_df, path, cols=text_cols, processor=processor),
                      TextList.from_df(valid_df, path, cols=text_cols, processor=processor))
        # Language-model databunches get LM labels; everything else is labelled from the DataFrame.
        src = src.label_for_lm() if cls==TextLMDataBunch else src.label_from_df(cols=label_cols, classes=classes)
        # NOTE(review): the test list is created without the explicit processor argument;
        # confirm that add_test applies the training processors to it.
        if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
        return src.databunch(**kwargs)

Load the different data sets

In [0]:
def load_data(filename):
    """Read one tab-separated corpus file from the module-level `data_folder`.

    Only the columns 'file', 'label' and 'text' are loaded, and rows with
    missing values are dropped.
    """
    wanted_columns = ['file', 'label', 'text']
    return pd.read_csv(data_folder + filename, sep='\t', usecols=wanted_columns).dropna()

def load_all_data(filenames):
    """Load every file in `filenames` with `load_data` and stack the rows.

    All frames are combined with a single `pd.concat` call. The original row
    indices are kept, so they may repeat across files. An empty `filenames`
    raises IndexError.
    """
    # `DataFrame.append` was removed in pandas 2.0 and copied the growing
    # frame on every iteration; collecting the frames first avoids both.
    frames = [load_data(filenames[0])]
    frames.extend(load_data(name) for name in filenames[1:])
    return pd.concat(frames)

# load the datasets from files
df = load_all_data(data_filenames)

# shuffle the dataset a bit and get the amount of labels
df = df.sample(frac=1, axis=0, random_state = config.seed)
num_labels = df['label'].nunique()

print(df.shape)
print(df['label'].value_counts())

In [0]:
# load the eval dataset(s)
# Load the dataset(s) the trained classifier is applied to.
# Note: load_data reads from the module-level data_folder, not config_use.data_folder.
df_use = load_all_data(config_use.data_filenames)

# Show the size and the number of rows per label.
print(df_use.shape)
print(df_use['label'].value_counts())

Create the dictionary that contains the labels along with their indices. This is useful for evaluation and similar.

Usual dict: {'audit': 0, 'authenticate': 1, 'heartbeat': 2, 'pooling': 3, 'scheduler': 4, 'unrelated': 5}

In [0]:
def create_label_indices(df):
    """Map every label in `df['label']` to an index.

    Labels other than 'unrelated' are numbered in sorted order starting at 0.
    'unrelated' always gets the last index, whether or not it occurs in the
    data.
    """
    unique_labels = df['label'].unique()
    # Set 'unrelated' aside so that it can be appended at the end.
    regular_labels = np.sort(unique_labels[unique_labels != 'unrelated'])

    label_to_index = {label: position for position, label in enumerate(regular_labels)}
    label_to_index['unrelated'] = len(regular_labels)
    return label_to_index

# Build the label -> index mapping for the training corpus and display it.
label_indices = create_label_indices(df)
print(label_indices)

# Create and train the learner/classifier


Create the needed functions to create and train a classifier



In [0]:
def split_dataframe(df, train_size = 0.9, random_state = None):
    """Split `df` into a training and a validation part, stratified by the 'label' column.

    Returns the pair `(training part, validation part)`.
    """
    parts = train_test_split(
        df,
        train_size=train_size,
        random_state=random_state,
        stratify=df['label'],
    )
    training_part, validation_part = parts
    return training_part, validation_part
  
def create_databunch(config, df_trn, df_valid):
    """Build a `BertDataBunch` from the training and validation DataFrames.

    The pretrained BERT tokenizer named in `config.bert_model_name` provides
    both the tokenization (wrapped for fast.ai, with no pre/post rules) and
    the vocabulary. Text is read from the 'text' column and labels from the
    'label' column.
    """
    pretrained_tok = BertTokenizer.from_pretrained(config.bert_model_name,)
    wrapped_tok = FastAiBertTokenizer(pretrained_tok, max_seq_len=config.max_seq_len)
    tokenizer = Tokenizer(tok_func=wrapped_tok, pre_rules=[], post_rules=[])
    vocab = Vocab(list(pretrained_tok.vocab.keys()))
    # Batches are padded at the end with index 0.
    collate = partial(pad_collate, pad_first=False, pad_idx=0)
    return BertDataBunch.from_df(
        ".",
        train_df=df_trn,
        valid_df=df_valid,
        tokenizer=tokenizer,
        vocab=vocab,
        bs=config.bs,
        text_cols='text',
        label_cols='label',
        collate_fn=collate,
    )


def create_learner(config, databunch):
    """Create a fast.ai `Learner` around a pretrained BERT sequence classifier.

    The number of output labels is read from the module-level `num_labels`.
    The optimizer is BertAdam when `config.use_bertAdam` is set and AdamW
    otherwise. The learner is switched to fp16 when `config.use_fp16` is set.
    """
    model = BertForSequenceClassification.from_pretrained(config.bert_model_name, num_labels=num_labels)

    default_opt = AdamW # AdamW is the default optimizer of fastai.Learner
    opt_func = partial(BertAdam) if config.use_bertAdam else default_opt

    learner = Learner(databunch, model, opt_func, metrics=accuracy, loss_func=config.loss_func)
    if config.use_fp16:
        learner.to_fp16()
    return learner


def train(config, df):
    """Split `df`, build the databunch and learner, run one-cycle training, and return the learner."""
    training_part, validation_part = split_dataframe(
        df, train_size=config.train_size, random_state=config.seed
    )
    learner = create_learner(config, create_databunch(config, training_part, validation_part))
    learner.fit_one_cycle(config.epochs, max_lr=config.max_lr, moms=config.moms)
    return learner



Actually create the trained classifier

In [0]:
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: integer seed, or None to draw a random one.

    Returns:
        The seed that was actually applied. When `seed` is None this is the
        randomly drawn value, so the caller can log it and repeat the run.
    """
    import random  # explicit import; do not rely on a star import to provide it

    if seed is None:
        seed = random.randint(0, 2**31)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    return seed

In [0]:
# Seed the random number generators first, then train on the full training corpus.
set_seed(config.seed)
classifier = train(config, df)

Save the model along with its config

In [0]:
def create_model_name():
    """Build the model's base name from the epoch count and the first data file name.

    The last four characters of the file name (e.g. '.tsv') are cut off.
    """
    epochs_text = str(config.epochs)
    corpus_stem = data_filenames[0][:-4]
    return 'BERT4DAT_e{epochs}_{data_filename}'.format(epochs=epochs_text, data_filename=corpus_stem)

def save_config(model_save_path, model_name):
    """Write the module-level `config` to `<model_save_path><model_name>.config`.

    Each setting is written on its own line as `key=value,`.
    """
    entries = [
        '{item}={value},\n'.format(item=key, value=config[key])
        for key in config.__dict__
    ]
    target_path = model_save_path + model_name + '.config'
    with open(target_path, 'w', encoding='utf-8') as out:
        out.write(''.join(entries))



In [0]:
# Export only when requested; the config file and the exported learner share
# the same base name inside config_use.model_path.
if save_model:
    model_name = create_model_name()
    save_config(config_use.model_path, model_name)
    model_save_path = config_use.model_path + model_name + '.pkl'
    classifier.export(file = model_save_path)
    #classifier.save('model_name', return_path=True)

# Predictor


Create a predictor class. It simply uses the prediction of the classifier/learner, but predictions whose confidence is not above a threshold are labeled as 'unrelated'.

In [0]:
class Predictor:
    """Wrap a trained classifier and fall back to a default label for low-probability predictions."""
    def __init__(self, classifier, threshold=0.90, default_value =  'unrelated'):
        self.classifier = classifier
        self.threshold = threshold
        self.default_value = default_value
        # Class names are taken from the classifier's data.
        self.classes = self.classifier.data.classes

    def predict(self, text):
        """Return the predicted class name for `text`.

        The default value is returned unless the probability of the predicted
        class is strictly greater than the threshold.
        """
        outcome = self.classifier.predict(text)
        class_index = outcome[1]
        confidence = outcome[2][class_index].item()
        return self.classes[class_index] if confidence > self.threshold else self.default_value

# Use trained classifier

Load the saved model and create the predictor

In [0]:
# Uncomment the next line to use a previously exported learner instead of the
# one trained above.
#classifier = load_learner(config_use.model_path, config_use.model_name)
predictor = Predictor(classifier, threshold=config_use.threshold)

Predict/classify

In [0]:
# Start the log with a timestamped summary of the run settings.
log_txt = datetime.now().strftime('%Y-%m-%d %H:%M') + ' ' + get_info()
logLine(log_txt)
# Classify every row of the evaluation data and log one '<file> -> <label>' line per row.
for row in df_use.itertuples():
    filename = row.file
    class_text = row.text
    prediction = predictor.predict(class_text)
    log_text = '{} -> {}'.format(filename, prediction)
    logLine(log_text)