# Prepare
Install the required libraries and import the modules used in this notebook.

In [None]:
!pip install pytorch-pretrained-bert

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [3]:
from fastai import *
from fastai.text import *

In [4]:
from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification
from pytorch_pretrained_bert import BertTokenizer, BertAdam
from sklearn.model_selection import train_test_split

In [5]:
from datetime import datetime

Check whether a GPU is available and, if so, which model it is.

In [6]:
# Report whether CUDA can be used at all.
cuda_available = torch.cuda.is_available()
print(cuda_available)

# The device is only queried when CUDA is present.
if cuda_available:
    curr_device = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(curr_device)
    print(gpu_name)

True
Tesla V100-SXM2-16GB


Set up where the data comes from

In [7]:
# Folder that holds the corpus files; file names are appended to this string when loading.
data_folder = './data/'
# Tab-separated corpus files to load; every file listed here ends up in one DataFrame.
data_filenames = ['1_classCorpus_Shrunk.tsv']

To import the dataset from Google Drive (only needed if the data should be loaded from there), we first have to connect to the drive. For this, we have to authenticate the access and mount the drive.

In [8]:
load_from_gdrive = False


def gdrive_path_for(local_folder, drive_root='/content/drive/My Drive'):
    """Map a folder given relative to '.' onto the mounted Google Drive.

    Only a leading '.' (the current-directory marker) is swapped for
    `drive_root`. Dots elsewhere in the path, e.g. in './data.v2/', are kept,
    and paths that do not start with './' are returned unchanged, so applying
    the function to an already converted path is harmless.

    Args:
        local_folder: Folder path as a string, e.g. './data/'.
        drive_root: Root folder of the mounted drive.

    Returns:
        The folder path below `drive_root`, or `local_folder` itself when it
        is not relative to '.'.
    """
    if local_folder == '.' or local_folder.startswith('./'):
        return drive_root + local_folder[1:]
    return local_folder


if load_from_gdrive:
    from google.colab import drive
    # Connect to drive to load the corpus from there
    drive.mount('/content/drive', force_remount=True)
    # Replace only the leading '.', not every dot that occurs in the path
    data_folder = gdrive_path_for(data_folder)

Create a config

In [9]:
class Config(dict):
    """Dictionary whose entries can also be read as attributes.

    Every value passed to the constructor or to `set` is stored twice: as a
    dict item and as an instance attribute of the same name.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Mirror each keyword argument as an attribute (enables config.key access).
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def set(self, key, val):
        """Store `val` under `key`, both as a dict item and as an attribute."""
        self[key] = val
        setattr(self, key, val)


# Where the exported model lives and where classification results are logged.
config = Config(**{
    'model_path': './model/',
    'model_name': 'BERT4DAT_e20_1_classCorpus_BegOnly_512.pkl', # TODO adapt to your needs!
    'log_file': './log/classifierResults.txt',
})


In [10]:
def logLine(line, log_file=None):
    """Append `line` followed by a newline to the log file.

    Args:
        line: Text to write (a str).
        log_file: Optional path of the file to append to. When omitted,
            `config.log_file` is used.

    The parent directory of the log file is created if it is missing, because
    opening a file in append mode creates the file but not its directories.
    """
    import os  # local import so the function does not rely on the star imports

    path = config.log_file if log_file is None else log_file
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'a') as log:
        log.write(line + '\n')

# Data


In [11]:
def load_data(filename, folder=None):
    """Read one tab-separated corpus file and drop incomplete rows.

    Args:
        filename: Name of the TSV file inside the data folder.
        folder: Optional folder to read from. When omitted, the
            notebook-level `data_folder` is used.

    Returns:
        DataFrame with the columns 'file', 'label' and 'text'. Rows with a
        missing value in any of these columns are removed.
    """
    import os  # local import so the function does not rely on the star imports

    base = data_folder if folder is None else folder
    # os.path.join adds a separator only when `base` does not already end with
    # one, so both './data' and './data/' work. An absolute `filename` is used as is.
    fpath = os.path.join(base, filename)
    df = pd.read_csv(fpath, sep='\t', usecols=['file', 'label', 'text'])
    return df.dropna()

def load_all_data(filenames):
    """Load several corpus files and stack them into one DataFrame.

    Each file is read with `load_data`; the resulting frames are concatenated
    in the given order and keep their original row indices.

    Args:
        filenames: Non-empty sequence of file names.

    Returns:
        A single DataFrame with the rows of all files.

    Raises:
        ValueError: If `filenames` is empty.
    """
    filenames = list(filenames)
    if not filenames:
        raise ValueError('load_all_data needs at least one filename')
    frames = [load_data(name) for name in filenames]
    # A single pd.concat replaces repeated DataFrame.append calls: append was
    # removed in pandas 2.0 and copies all accumulated rows on every iteration.
    return pd.concat(frames, sort=False)

# Load every file listed in data_filenames into one DataFrame
df = load_all_data(data_filenames)

# Quick sanity check: (number of rows, number of columns)
print(df.shape)

(510, 3)


# BERT-related classes

In [12]:
class FastAiBertTokenizer(BaseTokenizer):
    """Adapter that exposes a pretrained BERT tokenizer through the fast.ai tokenizer interface."""

    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=512, **kwargs):
        # Additional keyword arguments are accepted but not used.
        self.max_seq_len = max_seq_len
        self._pretrained_tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        # Calling the instance hands back the instance itself, whatever the arguments.
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Tokenize `t`, truncate it, and frame it with [CLS] and [SEP].

        Two positions are reserved for the markers, so at most
        `max_seq_len - 2` tokens of the text itself are kept.
        """
        word_pieces = self._pretrained_tokenizer.tokenize(t)
        room_for_text = self.max_seq_len - 2
        return ["[CLS]"] + word_pieces[:room_for_text] + ["[SEP]"]

class BertTokenizeProcessor(TokenizeProcessor):
    """Tokenize processor with begin/end-of-stream tokens switched off.

    The tokenizer itself adds the sequence markers, so the processor must not
    add its own.
    """
    def __init__(self, tokenizer):
        no_stream_markers = dict(include_bos=False, include_eos=False)
        super().__init__(tokenizer=tokenizer, **no_stream_markers)

class BertNumericalizeProcessor(NumericalizeProcessor):
    """Numericalize processor that always uses the vocabulary of the BERT tokenizer."""
    def __init__(self, *args, **kwargs):
        # NOTE(review): `bert_tok` is read as a global, but it is not defined in the
        # visible part of this notebook. Creating an instance here would raise
        # NameError unless it is defined elsewhere; confirm.
        bert_vocab = Vocab(list(bert_tok.vocab.keys()))
        super().__init__(*args, vocab=bert_vocab, **kwargs)

def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """Return the two-step processor list: BERT tokenization, then numericalization with `vocab`."""
    tokenize_step = BertTokenizeProcessor(tokenizer=tokenizer)
    numericalize_step = NumericalizeProcessor(vocab=vocab)
    return [tokenize_step, numericalize_step]

class BertDataBunch(TextDataBunch):
    """`TextDataBunch` variant whose `from_df` uses the processors from `get_bert_processor`."""
    @classmethod
    def from_df(cls, path:PathOrStr, train_df:DataFrame, valid_df:DataFrame, test_df:Optional[DataFrame]=None,
              tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, text_cols:IntsOrStrs=1,
              label_cols:IntsOrStrs=0, **kwargs) -> DataBunch:
        """Create a data bunch from train/validation (and optionally test) DataFrames.

        `tokenizer` and `vocab` are passed to `get_bert_processor`; the resulting
        processors are attached to the train and validation text lists.
        `text_cols` / `label_cols` select the text and label columns. Keyword
        arguments not consumed by `get_bert_processor` are forwarded to
        `src.databunch`.
        """
        # Separate the kwargs meant for get_bert_processor from the remaining ones
        # (presumably matched by parameter name; split_kwargs_by_func is not defined here).
        p_kwargs, kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
        # use our custom processors while taking tokenizer and vocab as kwargs
        processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **p_kwargs)
        # With several label columns and no explicit classes, the column names serve as classes.
        if classes is None and is_listy(label_cols) and len(label_cols) > 1: classes = label_cols
        src = ItemLists(path, TextList.from_df(train_df, path, cols=text_cols, processor=processor),
                      TextList.from_df(valid_df, path, cols=text_cols, processor=processor))
        # Language-model bunches get LM labels; every other class is labelled from the DataFrame.
        src = src.label_for_lm() if cls==TextLMDataBunch else src.label_from_df(cols=label_cols, classes=classes)
        # NOTE(review): the test list is created without `processor`; presumably add_test
        # applies the training processors. Confirm against the fastai version in use.
        if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
        return src.databunch(**kwargs)

# Predictor


Create a predictor class. It uses the prediction of the classifier/learner, but any prediction whose confidence does not exceed a threshold is labeled as 'unrelated'.

In [13]:
class Predictor:
    """Thresholded wrapper around a trained classifier.

    `predict` returns the class chosen by the classifier only when its
    probability is strictly greater than `threshold`; otherwise it returns
    `default_value`.
    """

    def __init__(self, classifier, threshold=0.85, default_value='unrelated'):
        self.classifier = classifier
        # Class names are looked up once, from the data attached to the classifier.
        self.classes = classifier.data.classes
        self.threshold = threshold
        self.default_value = default_value

    def predict(self, text):
        """Classify `text` and return a class name or `default_value`."""
        outcome = self.classifier.predict(text)
        class_idx = outcome[1]                      # index of the predicted class
        confidence = outcome[2][class_idx].item()   # probability assigned to that class
        if not confidence > self.threshold:
            return self.default_value
        return self.classes[class_idx]

# Load and use
Load the saved model

In [14]:
# Restore the exported learner from config.model_path / config.model_name.
# NOTE(review): load_learner is not defined in this notebook (presumably fastai's, via the
# star imports). Such exports are typically pickle-based, so only load files from a
# trusted source; confirm.
classifier = load_learner(config.model_path, config.model_name)
# Wrap the learner so that predictions not exceeding the threshold get the default label.
predictor = Predictor(classifier)

Predict/classify

In [16]:
# Mark the start of this classification run in the log.
logLine(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Classify every corpus entry and log "<file> -> <predicted label> (<expected label>)".
for _, row in df.iterrows():
    prediction = predictor.predict(row['text'])
    logLine('{} -> {} ({})'.format(row['file'], prediction, row['label']))