<a href="https://colab.research.google.com/github/Jontpan/master-thesis/blob/master/QCBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
pip install transformers



In [56]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [57]:
def read_qaqc_split(filename):
    texts = []
    labels = []

    with open('drive/MyDrive/master-thesis/data/' + filename + '_texts.txt', 'r') as f:
        texts = f.readlines()

    with open('drive/MyDrive/master-thesis/data/' + filename + '_labels.txt', 'r') as f:
        labels = f.readlines()
    
    t = [x.strip() for x in texts]
    l = [x.strip() for x in labels]
    l2 = label_to_index(l)

    return pd.DataFrame(list(zip(t, l, l2)), columns=['text', 'verbose label', 'label'])

def label_to_index(labels):
    converter = []
    converted_labels = []

    with open('drive/MyDrive/master-thesis/data/labels.txt', 'r') as f:
        converter = f.readlines()
    
    converter = [x.strip() for x in converter]
    
    for idx, label in enumerate(labels):
        for i, e in enumerate(converter):
            if label == e:
                converted_labels.append(i)
                break
        
        else:
            print(label, idx)
    
    assert(len(converted_labels) == len(labels))

    return converted_labels


In [58]:
train_data = read_qaqc_split('train')
test_data = read_qaqc_split('test')

train_data.sample(5)


Unnamed: 0,text,verbose label,label
4040,Vad heter Tom Sawyers moster som han bor med?,HUM:ind,30
4248,Vilken religion har flest medlemmar?,ENTY:religion,20
1966,Vad tillverkades papper av i slutet av 1500-ta...,ENTY:substance,22
4766,Hur fixar man gnissliga golv?,DESC:manner,4
1374,Vad piskade Mr. Magoo pÃ¥ TV fÃ¶r General Electric?,ENTY:other,17


In [59]:
from sklearn.model_selection import train_test_split

X = train_data.text.values
y = train_data.label.values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2)

In [60]:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')

# Concatenate train data and test data
all_texts = np.concatenate([train_data.text.values, test_data.text.values])

# Encode our concatenated data
encoded_texts = [tok.encode(sent, add_special_tokens=True) for sent in all_texts]

# Find the maximum length
max_len = max([len(sent) for sent in encoded_texts])
print('Max length: ', max_len)

Max length:  43


In [61]:
MAX_LEN = 43

train_encodings = tok(list(X_train), truncation=True, padding=True, max_length=MAX_LEN)
val_encodings = tok(list(X_val), truncation=True, padding=True, max_length=MAX_LEN)
test_encodings = tok(list(test_data.text.values), truncation=True, padding=True, max_length=MAX_LEN)

In [62]:
class QAQCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = QAQCDataset(train_encodings, y_train)
val_dataset = QAQCDataset(val_encodings, y_val)
test_dataset = QAQCDataset(test_encodings, test_data.label.values)

In [63]:
%%time
from transformers import AutoModel
import torch
import torch.nn as nn
from transformers.modeling_outputs import SequenceClassifierOutput

class QCBERT(nn.Module):

    def __init__(self):
        super(QCBERT, self).__init__()
        self.num_labels = 50
        self.bert = AutoModel.from_pretrained('KB/bert-base-swedish-cased')
        self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.bert.config.hidden_size, self.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        ):

        return_dict = return_dict if return_dict is not None else self.bert.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

CPU times: user 27 Âµs, sys: 14 Âµs, total: 41 Âµs
Wall time: 43.6 Âµs


In [68]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [69]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=4,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=0,                  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy='epoch',
    learning_rate=5e-5
)

model = QCBERT()

trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics      # evaluation metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
1,0.8496,0.740014,0.834593,0.820909,0.823698,0.834593,2.7902,426.85
2,0.6271,0.545854,0.86314,0.85763,0.862546,0.86314,2.8575,416.803
3,0.2504,0.521983,0.88665,0.882241,0.885655,0.88665,2.724,437.233
4,0.0754,0.52505,0.889169,0.886848,0.891366,0.889169,2.8741,414.391


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1192, training_loss=0.6169425434114149, metrics={'train_runtime': 220.9514, 'train_samples_per_second': 5.395, 'total_flos': 0, 'epoch': 4.0})

In [70]:
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))


{'epoch': 4.0,
 'eval_accuracy': 0.889168765743073,
 'eval_f1': 0.8868477475148121,
 'eval_loss': 0.5250504612922668,
 'eval_precision': 0.8913659180049227,
 'eval_recall': 0.889168765743073,
 'eval_runtime': 2.4148,
 'eval_samples_per_second': 493.201}

In [None]:
%reload_ext tensorboard
%tensorboard --logdir logs