In [0]:
!pip install pytorch-transformers fastprogress pytorch-nlp

In [0]:
import os
import random as rn
import sys
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from fastprogress import master_bar, progress_bar
from keras.preprocessing.sequence import pad_sequences
from pytorch_transformers import AdamW, WarmupLinearSchedule, ConstantLRSchedule
from pytorch_transformers import BertTokenizer, BertPreTrainedModel, BertModel, BertConfig
from sklearn.metrics import classification_report, f1_score
from torch import nn
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [0]:
# Download the lr_finder-library only if it does not exist yet
!(if [ ! -f "lr_finder.py" ]; then wget https://raw.githubusercontent.com/davidtvs/pytorch-lr-finder/master/lr_finder.py; fi)

In [0]:
from lr_finder import LRFinder

Configuration and basic settings

In [0]:
# Decide once which device the notebook runs on; later cells read
# `cuda_available` and `device`.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
if cuda_available:
    # Show the name of the GPU that is currently selected.
    curr_device = torch.cuda.current_device()
    print(torch.cuda.get_device_name(curr_device))
device

In [0]:
class Config(dict):
    """Dictionary whose entries can also be read and written as attributes.

    Every keyword argument becomes both a dict entry and an instance
    attribute. Attribute assignment (``cfg.key = value``) and ``set`` keep
    the two views in sync, so ``cfg['key']`` never goes stale after a plain
    attribute assignment.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __setattr__(self, key, val):
        """Store ``val`` as an attribute and mirror it into the dict entry."""
        super().__setattr__(key, val)
        self[key] = val

    def set(self, key, val):
        """Set ``key`` to ``val`` in both the dict and the attribute view."""
        self[key] = val
        setattr(self, key, val)

# Model and optimizer hyper-parameters.
# num_labels is only an initial value; it is overwritten from the data once
# the training set has been loaded.
config = Config(
    model_name = 'bert-base-uncased',
    bs = 2,
    epochs = 10,
    lr = 2e-3, #2e-5,
    adam_betas = (0.8, 0.7), # (0.8, 0.7), (0.9, 0.999)
    adam_epsilon = 1e-8,
    #training_test_share = 0.1,
    num_labels = 6,
    threshold = 0.9,
    seed = 31337,
)

# Taken once so that the prediction log and the result file of a run always
# share the same suffix (two separate datetime.now() calls can fall into
# different minutes).
run_timestamp = datetime.now().strftime('%Y%m%d-%H%M')

# Input and output locations; every path is appended to root_folder.
config_data = Config(
    root_folder = '.',
    data_folder = '/data/',
    train_data = ['1_classCorpus_BegOnly_512.tsv'],
    eval_data = ['Hadoop_BegOnly_512.tsv'],
    log_file = '/log/classifierPredictions_' + run_timestamp + '.txt',
    answer_set = '/AnswerSetHadoop.md',
    eval_script = '/scripts/eval.py',
    result_file = '/log/classifierResults_' + run_timestamp + '.txt',
)

# When True, Google Drive is mounted and used as root_folder.
load_from_gdrive = True

In [0]:
def set_seed(seed=31337):
    """Seed every random number generator the notebook relies on.

    Parameters:
        seed(int): Seed used for Python's ``random``, NumPy and PyTorch (CPU and all GPUs)
    """
    rn.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every GPU, not only the current one, and is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)

set_seed(config.seed)

To import the dataset, we first have to connect to our Google Drive (if the data should be loaded from there). For this, we have to authenticate the access and mount the drive.

In [0]:
# Only executed when the data lives on Google Drive; the import is kept inside
# the branch because google.colab is only needed in that case.
if load_from_gdrive:
    from google.colab import drive
    # Connect to drive to load the corpus from there
    drive.mount('/content/drive', force_remount=True)
    # All data, log and script paths in config_data are appended to this folder.
    config_data.root_folder = '/content/drive/My Drive/BERT4DAT'

## Util functions

In [0]:
def get_memory_usage():
    """Return the GPU memory currently allocated on ``device`` in MB (10^6 bytes).

    Returns 0.0 when CUDA is not available: in that case ``device`` is the
    CPU, which the CUDA allocator cannot be queried for.
    """
    if not torch.cuda.is_available():
        return 0.0
    return torch.cuda.memory_allocated(device) / 1000000

def get_memory_usage_str():
    """Format the value of get_memory_usage() as a short text with two decimals."""
    usage_mb = get_memory_usage()
    return 'Memory usage: {:.2f} MB'.format(usage_mb)

def get_info():
    """Return a one-line summary of the run configuration (model, lr, epochs, batch size, threshold, seed, training files)."""
    template = 'model: {}, lr: {}, epochs: {}, bs: {}, Threshold: {}, Seed: {}, Data: {}'
    values = (
        config.model_name,
        config.lr,
        config.epochs,
        config.bs,
        config.threshold,
        config.seed,
        config_data.train_data,
    )
    return template.format(*values)

def initLog():
    """Create (or overwrite) the prediction log and write its header line.

    The header consists of the current date/time followed by the run
    configuration returned by get_info().
    """
    logfile = config_data.root_folder + config_data.log_file
    # open(..., 'w') does not create missing folders, so make sure the log
    # folder exists before writing.
    log_dir = os.path.dirname(logfile)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    log_txt = datetime.now().strftime('%Y-%m-%d %H:%M') + ' ' + get_info()
    with open(logfile, 'w') as log:
        log.write(log_txt + '\n')

def logLine(line):
    """Append ``line`` followed by a newline to the prediction log file."""
    target_path = config_data.root_folder + config_data.log_file
    with open(target_path, 'a') as handle:
        handle.write(line + '\n')

def logResult(result):
    """Append the run configuration and then every line of ``result`` to the result file.

    Parameters:
        result(iterable of str): Lines to write, one per output line
    """
    result_path = config_data.root_folder + config_data.result_file
    with open(result_path, 'a') as out:
        out.write(get_info() + '\n')
        out.writelines(entry + '\n' for entry in result)


## Prepare the Data

Load the data

In [0]:
def load_data(filename):
    """Read one tab-separated file from the data folder.

    Only the columns 'file', 'label' and 'text' are loaded; rows with a
    missing value in any of them are dropped.

    Parameters:
        filename(str): File name inside the configured data folder
    """
    source_path = config_data.root_folder+ config_data.data_folder + filename
    wanted_columns = ['file', 'label', 'text']
    return pd.read_csv(source_path, sep='\t', usecols=wanted_columns).dropna()

def load_all_data(filenames):
    """Load all given files with load_data and stack them into one DataFrame.

    Uses pd.concat, because DataFrame.append is deprecated and no longer
    exists in pandas 2.x. The original row indices of the individual files
    are kept, as before.

    Parameters:
        filenames(list of str): File names inside the configured data folder

    Raises:
        ValueError: If ``filenames`` is empty
    """
    if len(filenames) == 0:
        raise ValueError('load_all_data needs at least one file name')
    return pd.concat([load_data(filename) for filename in filenames])



Load the train datasets

In [0]:
# Read every configured training file and shuffle the rows reproducibly
# (frac=1 keeps all rows, the seed fixes the order).
df_train = load_all_data(config_data.train_data).sample(frac=1, axis=0, random_state = config.seed)

# The number of classes is taken from the data, not from the initial config value.
config.num_labels = df_train['label'].nunique()

print(df_train.shape)
print(df_train['label'].value_counts())

Load the eval dataset

In [0]:
# The evaluation files are loaded in file order; unlike the training data
# they are not shuffled.
df_eval = load_all_data(config_data.eval_data)

# Number of rows/columns and how many examples each label has.
print(df_eval.shape)
print(df_eval['label'].value_counts())

Divide texts and labels properly

In [0]:
# Select the columns by name instead of by position: read_csv keeps the
# column order of the file, so positional access silently swaps texts and
# labels when the TSV orders its columns differently. Also works for an
# empty frame.
train_texts = tuple(df_train['text'].tolist())
train_labels = tuple(df_train['label'].tolist())

len(train_texts), len(train_labels)

In [0]:
# Sorted list of the distinct labels; the position of a label in this list
# is used as its class index.
labels = sorted(set(train_labels))
config.num_labels = len(labels)
labels

Initialize tokenizer and tokenize the text

In [0]:
# Tokenizer belonging to the pretrained model named in config.model_name;
# lower-casing is enabled, matching the uncased model configured above.
tokenizer = BertTokenizer.from_pretrained(config.model_name, do_lower_case=True)

In [0]:
# Keep at most 510 tokens per text so that, together with [CLS] and [SEP],
# a sequence never exceeds 512 tokens.
train_tokens = [['[CLS]'] + tokenizer.tokenize(text)[:510] + ['[SEP]'] for text in train_texts]

len(train_tokens)

Pad the tokens and convert them to their IDs

In [0]:
# Map the tokens to their vocabulary ids first, then pad (or cut) every
# sequence at the end to exactly 512 ids.
train_token_id_lists = [tokenizer.convert_tokens_to_ids(tokens) for tokens in train_tokens]
train_tokens_ids = pad_sequences(train_token_id_lists, maxlen=512, truncating="post", padding="post", dtype="int")

train_tokens_ids.shape

In [0]:
# Class index of a label = its position in the sorted `labels` list.
label_to_index = {label: position for position, label in enumerate(labels)}
train_y = np.array([label_to_index[label] for label in train_labels])
# Share of every class in the training data.
class_shares = np.unique(train_y, return_counts=True)[1] / len(train_y)
print(train_y.shape, class_shares)

Masks for tokens, so we can tell the model where the unimportant (padded) part is

In [0]:
# 1.0 for every position with an id greater than 0, 0.0 for the rest
# (the padded part).
train_masks = (train_tokens_ids > 0).astype(float).tolist()

# Bert Model

In [0]:
class WrappedCrossEntropyLoss(nn.CrossEntropyLoss):
    """Cross-entropy loss that reshapes its inputs before the loss is calculated.

    The model output is viewed as (-1, num_labels) and the targets as a flat
    vector, so targets of shape (batch, 1) can be passed directly.

    Parameters:
        num_labels(int): Number of classes
        **kwargs: Optional arguments passed on to nn.CrossEntropyLoss (e.g. ``weight``)
    """
    def __init__(self, num_labels, **kwargs):
        super(WrappedCrossEntropyLoss, self).__init__(**kwargs)

        self.num_labels = num_labels

    def forward(self, input, target):
        # The class already is a CrossEntropyLoss, so the inherited forward
        # is used instead of a second, nested loss module.
        return super(WrappedCrossEntropyLoss, self).forward(input.view(-1, self.num_labels), target.view(-1))

In [0]:
# TODO: is it possible to do a multi-level classification? Like n binary classifications where the one with the highest confidence is taken
class BertTextClassifier(BertPreTrainedModel):
    """Pretrained BERT encoder followed by dropout and one linear classification layer.

    ``forward`` returns raw logits and calculates the loss on them;
    ``predict`` turns the logits into probabilities with a single softmax.
    (nn.CrossEntropyLoss applies log-softmax itself, so it must not be fed
    probabilities, and a second softmax on top of probabilities flattens
    them so much that a high confidence threshold can never be reached.)

    Parameters:
        model_name(str): Name of the pretrained BERT model to load
        num_labels(int): Number of classes (1 switches the loss to MSE)
        avg_pool(bool, optional): Use the mean of all hidden states instead of the pooled output
    """
    def __init__(self, model_name, num_labels, avg_pool = False):
        config = BertConfig.from_pretrained(model_name)
        super(BertTextClassifier, self).__init__(config)
        self.num_labels = num_labels
        self.avg_pool = avg_pool
        # When True, forward returns only the logits tensor (used by Learner.find_lr)
        self.simple = False

        self.bert = BertModel.from_pretrained(model_name, config=config)

        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)

        #self.apply(self.init_weights)

    def forward(self, tokens, labels=None, position_ids=None, token_type_ids=None, attention_mask=None, head_mask=None):
        """Runs the encoder and the classification layer

        Returns:
            The logits tensor if ``self.simple`` is set, otherwise a tuple
            ``(loss, logits, *extra encoder outputs)`` where ``loss`` is only
            present if labels are given.
        """
        outputs = self.bert(tokens, position_ids=position_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, head_mask=head_mask)

        pooled_output = outputs[1]
        # According to documentation of pytorch-transformers, pooled output might not be the best
        # and you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence
        if self.avg_pool:
            hidden_states = outputs[0]
            # NOTE(review): the mean also includes the padded positions -- confirm this is intended
            pooled_output = torch.mean(hidden_states, 1)

        dropout_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_output)

        if self.simple:
            return logits

        outputs = (logits,) + outputs[2:]

        if labels is not None:
            if self.num_labels == 1:
                # Regression: compare the single output value with the label
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1).to(logits.dtype))
            else:
                # CrossEntropyLoss expects unnormalized logits
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs

    def predict(self, tokens, labels=None, position_ids=None, token_type_ids=None, attention_mask=None, head_mask=None):
        """Returns ``(loss, probs)``; ``loss`` is None if no labels are given, ``probs`` are the softmax probabilities per class"""
        predictions = self(tokens, labels=labels, position_ids=position_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, head_mask=head_mask)

        # Index instead of unpacking: the tuple may contain further encoder outputs
        if labels is not None:
            loss, logits = predictions[0], predictions[1]
        else:
            loss, logits = None, predictions[0]

        probs = torch.softmax(logits, dim=1)

        return loss, probs
        

# Learner

In [0]:
class Learner:
    """Bundles a model with the training, evaluation and learning-rate-search loops of this notebook"""
    def __init__(self, model):
        self.device = torch.device("cuda" if cuda_available else "cpu")
        # Use the learner's own device so model and batches always end up on the same one
        self.model = model.to(self.device)

        self.max_grad_norm = 1.0

    def train(self, epochs, train_dataloader, optimizer, scheduler = None, test_dataloader = None):
        """Trains the model for the given amount of epochs using the given dataloader and optimizer

        Parameters:
            epochs(int): Number of Epochs
            train_dataloader(torch.utils.data.DataLoader): Dataloader consisting of a TensorDataset with tokens, masks, and labels
            optimizer(torch.optim.Optimizer): Optimizer to use, e.g., AdamW
            scheduler(Scheduler, optional): Scheduler to use. If none is provided, uses the default ConstantLRSchedule
            test_dataloader(torch.utils.data.DataLoader, optional): Dataloader for the test-data. If provided, does an evaluation step after each epoch. Needs the same shape as the train_dataloader.

        Returns:
            Average train loss of the last epoch (None if epochs is 0)
        """
        master_bar_iterator = master_bar(range(epochs))
        master_bar_iterator.first_bar.comment = get_memory_usage_str()
        master_bar_iterator.update(0)

        if scheduler is None:
            scheduler = ConstantLRSchedule(optimizer)

        epoch_train_loss = None
        for epoch_num in master_bar_iterator:
            # Train for the epoch
            train_loss = 0
            self.model.zero_grad()
            epoch_bar_iterator = progress_bar(train_dataloader, parent=master_bar_iterator)
            for step_num, batch_data in enumerate(epoch_bar_iterator):
                if step_num < 2:
                    # update the memory usage; in the first steps only
                    master_bar_iterator.first_bar.comment = get_memory_usage_str()
                    master_bar_iterator.update(epoch_num)

                # Set in every step because the evaluation below switches to eval mode
                self.model.train()
                token_ids, masks, labels = tuple(t.to(self.device) for t in batch_data)
                outputs = self.model(token_ids, labels=labels, attention_mask = masks)

                loss = outputs[0]
                loss.backward()
                clip_grad_norm_(parameters=self.model.parameters(), max_norm=self.max_grad_norm)

                # The optimizer has to step before the scheduler, otherwise
                # the first value of the learning rate schedule is skipped
                optimizer.step()
                scheduler.step()
                self.model.zero_grad()

                train_loss += loss.item()
                curr_loss = train_loss / (step_num + 1)
                master_bar_iterator.child.comment = f'loss: {curr_loss}'
                master_bar_iterator.first_bar.comment = get_memory_usage_str()
            # Calculate train stats
            epoch_train_loss = train_loss / len(train_dataloader)

            if test_dataloader is None:
                master_bar_iterator.write(f'Finished epoch {epoch_num + 1}: train loss = {epoch_train_loss:.5}')
                continue

            # Evaluate the current classifier
            self.model.eval()
            test_loss = 0
            batch_predictions = []
            batch_labels = []
            with torch.no_grad():
                for batch_data in progress_bar(test_dataloader, parent=master_bar_iterator):
                    token_ids, masks, labels = tuple(batch.to(self.device) for batch in batch_data)

                    outputs = self.model.predict(token_ids, labels=labels, attention_mask = masks)
                    batch_loss, probs = outputs[:2]

                    test_loss += batch_loss.item()

                    batch_predictions.append(np.argmax(probs.cpu().numpy(), axis = 1))
                    # Labels come as (batch, 1); flatten them to one label per example
                    batch_labels.append(labels.cpu().numpy().reshape(-1))
            predictions = np.concatenate(batch_predictions)
            corr_labels = np.concatenate(batch_labels)

            # Calculate test stats
            epoch_test_loss = test_loss / len(test_dataloader)
            accuracy = (predictions == corr_labels).mean()
            f1 = f1_score(y_true=corr_labels, y_pred=predictions, average='macro')
            master_bar_iterator.write(f'Finished epoch {epoch_num + 1}: train loss = {epoch_train_loss:.5}, test loss = {epoch_test_loss:.5}, accuracy = {accuracy:.2%}, f1 = {f1:.2%}')

        return epoch_train_loss


    def evaluate(self, test_dataloader):
        """ Evaluates the model using the given dataloader for the test-data
        Parameters:
            test_dataloader(torch.utils.data.DataLoader): Dataloader for the test-data consisting of a TensorDataset with tokens, masks, and labels

        Returns:
            Tuple of (classification report, list of predicted class indices, list of correct class indices)
        """

        self.model.eval()
        predicted = []
        gold_answers = []
        with torch.no_grad():
            for batch_data in master_bar(test_dataloader):
                token_ids, masks, labels = tuple(batch.to(self.device) for batch in batch_data)

                # Pass the attention mask like in training, so the padding is ignored here as well
                scores = self.model(token_ids, attention_mask = masks)[0]
                numpy_scores = scores.cpu().numpy()
                numpy_labels = labels.cpu().numpy()

                predictions = np.argmax(numpy_scores, axis = 1)
                predicted += list(predictions)
                gold_answers += numpy_labels.reshape(-1).tolist()

        report = classification_report(gold_answers, predicted)
        return report, predicted, gold_answers


    def find_lr(self, dataloader, optimizer, step_mode="exp"):
        """ Tries different learning rates and plots a curve describing the loss for the learning rates
        Parameters:
            dataloader: dataloader with a 2-dim-dataset consisting of train- and label-tensors
            optimizer: the optimizer that should be used
            step_mode(str): "linear" for within one exponent or "exp" for between exp.
        """
        # In simple mode the model returns only one tensor, which is what the LRFinder hands to the loss
        self.model.simple = True
        try:
            loss_fct = WrappedCrossEntropyLoss(self.model.num_labels)
            # Use the learner's device instead of a fixed "cuda", so this also works without a GPU
            lr_finder = LRFinder(self.model, optimizer, loss_fct, device=self.device)

            lr_finder.range_test(dataloader, end_lr=1, num_iter=100, step_mode=step_mode)
        finally:
            # Always switch back, even if the range test fails
            self.model.simple = False
        lr_finder.plot()

# Fine-tune BERT

Create tensors

In [0]:
# Token ids and masks keep their 2-dim layout; the labels become a column
# with one row per example.
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_masks_tensor = torch.tensor(train_masks)
train_y_tensor = torch.tensor(train_y).reshape(-1, 1)

get_memory_usage_str()

Create the datasets

In [0]:
# Every item is a (token ids, mask, label) triple -- the order in which
# Learner.train unpacks a batch.
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
# Draw the training examples in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.bs)

Initialize the model and create the optimizer and scheduler

In [0]:
# Classifier on top of the pretrained model; the number of classes was
# derived from the training data above.
model = BertTextClassifier(config.model_name, config.num_labels)
# All optimizer hyper-parameters come from `config`.
optimizer = AdamW(model.parameters(), lr=config.lr, betas=config.adam_betas, eps=config.adam_epsilon, correct_bias=False)
# NOTE(review): presumably keeps the learning rate constant at config.lr, as the name suggests -- confirm
scheduler = ConstantLRSchedule(optimizer)

In [0]:
torch.cuda.empty_cache()

Create the learner

In [0]:
learner = Learner(model)

Optional: Find LR

In [0]:
# Set to True to run the learning-rate search before the training.
do_find_lr = False

if do_find_lr:
    # The search uses its own optimizer that starts at a very small learning rate.
    start_lr = 1e-10
    lr_optimizer = AdamW(model.parameters(), lr=start_lr, betas=config.adam_betas, eps=config.adam_epsilon, correct_bias=False)
    # Learner.find_lr expects a 2-dim dataset of (tokens, labels), so no masks are included here.
    lr_train_dataset = TensorDataset(train_tokens_tensor, train_y_tensor)
    lr_train_dataloader = DataLoader(lr_train_dataset, shuffle=True, batch_size=config.bs)
    learner.find_lr(dataloader=lr_train_dataloader, optimizer=lr_optimizer)

In [0]:
torch.cuda.empty_cache()
get_memory_usage_str()

Start the training

In [0]:
# No test_dataloader is passed, so only the train loss is reported after each epoch.
learner.train(epochs = config.epochs, train_dataloader=train_dataloader, optimizer=optimizer, scheduler = scheduler)

# Predictor

In [0]:
class Predictor:
    """Classifies single texts with a trained classifier and falls back to a default label for uncertain predictions

    Parameters:
        classifier: Model with a ``predict`` method that returns ``(loss, probs)``
        tokenizer: Tokenizer used to turn a text into token ids
        class_labels(list): Labels; the position in the list is the class index
        threshold(float, optional): Probability that has to be exceeded to accept a prediction
        default_value(str, optional): Label that is returned if the threshold is not exceeded
    """
    def __init__(self, classifier, tokenizer, class_labels, threshold=0.90, default_value =  'unrelated'):
        self.classifier = classifier
        self.tokenizer = tokenizer
        self.threshold = threshold
        self.class_labels = class_labels
        self.default_value = default_value

        self.device = torch.device("cuda" if cuda_available else "cpu")

    def predict(self, text):
        """Classifies the given text

        Switches the classifier to eval mode (it stays in eval mode afterwards).

        Parameters:
            text(str): The text to classify

        Returns:
            Tuple of (label, probability of the most likely class); the label is
            ``default_value`` if the probability does not exceed the threshold
        """
        # Use the tokenizer this predictor was created with, not the global one
        tokens = [['[CLS]'] + self.tokenizer.tokenize(text)[:510] + ['[SEP]']]
        tokens_ids = pad_sequences(list(map(self.tokenizer.convert_tokens_to_ids, tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
        masks = [[float(i > 0) for i in ii] for ii in tokens_ids]
        token_ids = torch.tensor(tokens_ids).to(self.device)
        masks = torch.tensor(masks).to(self.device)

        # After the training the model is still in train mode: switch off the
        # dropout and do not track gradients for a prediction
        self.classifier.eval()
        with torch.no_grad():
            probs = self.classifier.predict(token_ids, attention_mask=masks)[1]
        numpy_probs = probs.cpu().numpy()
        # Only one text is classified, so the index in the flattened array is the class index
        prediction = int(np.argmax(numpy_probs))
        prob = numpy_probs[0][prediction].item()
        if prob > self.threshold:
            return self.class_labels[prediction], prob
        return self.default_value, prob

In [0]:
# Pass the configured threshold explicitly: get_info() writes config.threshold
# into the logs, so the predictor has to use the same value.
predictor = Predictor(learner.model, tokenizer, labels, threshold=config.threshold)

Predict/classify

In [0]:
# Start a fresh log, then write one "<file> -> <predicted label>" line per
# evaluation example.
initLog()
for row in progress_bar(df_eval.itertuples(), total=len(df_eval)):
    # The probability is not written to the log.
    prediction, prob = predictor.predict(row.text)
    logLine('{} -> {}'.format(row.file, prediction))

In [0]:
# Full paths of the evaluation script, the answer set and the prediction log
# written above.
EVAL_SCRIPT = config_data.root_folder + config_data.eval_script
ANSWERS = config_data.root_folder + config_data.answer_set
LOG_FILE = config_data.root_folder + config_data.log_file

# The paths are put in double quotes because the Drive folder name contains a space.
eval_command = 'python "{}" "{}" "{}"'.format(EVAL_SCRIPT, ANSWERS, LOG_FILE)

In [0]:
# Run the evaluation script in a shell; its output lines are captured in `result`
# and appended to the result file together with the run configuration.
result = !{eval_command}
logResult(result)

In [0]:
# Show the output of the evaluation script in the notebook as well.
for line in result:
    print(line)