<a href="https://colab.research.google.com/github/Jontpan/master-thesis/blob/master/QCBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets

In [2]:
import os
import re
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
%load_ext tensorboard

In [4]:
def read_qaqc_split(filename):
    texts = []
    labels = []

    with open('drive/MyDrive/master-thesis/data/' + filename + '_texts.txt', 'r') as f:
        texts = f.readlines()

    with open('drive/MyDrive/master-thesis/data/' + filename + '_labels.txt', 'r') as f:
        labels = f.readlines()
    
    t = [x.strip() for x in texts]
    l = [x.strip() for x in labels]
    fine, course = label_to_index(l)

    return pd.DataFrame(list(zip(t, l, course, fine)), columns=['text', 'verbose label', 'course label', 'fine label'])

def label_to_index(labels):
    converter = []
    fine_labels = []
    course_labels = []

    with open('drive/MyDrive/master-thesis/data/labels.txt', 'r') as f:
        converter = f.readlines()
    
    converter = [x.strip() for x in converter]
    fine_converter = {k: v for v, k in enumerate(converter)}

    course_converter = [re.sub(r'([A-Z]+)(.*)', r'\1', label) for label in converter]
    course_converter = {k: v for v, k in enumerate(list(set(course_converter)))}
    
    for label in labels:
        fine_labels.append(fine_converter[label])
        course_labels.append(course_converter[re.sub(r'([A-Z]+)(.*)', r'\1', label)])
    
    assert(len(fine_labels) == len(labels))

    return fine_labels, course_labels

In [9]:
train_data = read_qaqc_split('train')
test_data = read_qaqc_split('test')

train_data.sample(5)


Unnamed: 0,text,verbose label,course label,fine label
3142,Vem befallde de franska styrkorna i slaget vid...,HUM:ind,5,30
5008,N√§r var den f√∂rsta framg√•ngsrika hj√§rttranspla...,NUM:date,3,39
646,Vilken typ av vin √§r Spumante?,ENTY:food,0,13
2999,Vem √§r sk√•despelerskan Bette Davis en g√•ng sa ...,HUM:ind,5,30
4282,Vilken l√•t satte James Taylor i rampljuset?,ENTY:cremat,0,9


In [10]:
from sklearn.model_selection import train_test_split

X = train_data['text'].tolist()
y = train_data['fine label'].tolist()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2)


In [11]:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')

# Concatenate train data and test data
all_texts = np.concatenate([train_data.text.values, test_data.text.values])

# Encode our concatenated data
encoded_texts = [tok.encode(sent, add_special_tokens=True) for sent in all_texts]

# Find the maximum length
max_len = max([len(sent) for sent in encoded_texts])
print('Max length: ', max_len)

Max length:  43


In [12]:
MAX_LEN = 43

train_encodings = tok(list(X_train), truncation=True, padding=True, max_length=MAX_LEN)
val_encodings = tok(list(X_val), truncation=True, padding=True, max_length=MAX_LEN)

In [14]:
class QAQCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = QAQCDataset(train_encodings, y_train)
val_dataset = QAQCDataset(val_encodings, y_val)

In [15]:
%%time
from transformers import AutoModel
import torch.nn as nn
from transformers.modeling_outputs import MultipleChoiceModelOutput

class QCBERT(nn.Module):

    def __init__(self, num_labels):
        super(QCBERT, self).__init__()
        self.num_labels = num_labels
        self.bert = AutoModel.from_pretrained('KB/bert-base-swedish-cased')
        self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.bert.config.hidden_size, self.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        ):

        return_dict = return_dict if return_dict is not None else self.bert.config.use_return_dict
        num_choices = self.num_labels

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

CPU times: user 83.8 ms, sys: 6.76 ms, total: 90.6 ms
Wall time: 103 ms


In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [17]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=4,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=0,                  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy='epoch',
    learning_rate=5e-5
)

model = QCBERT(50)

trainer = Trainer(
    model=model,                         # the instantiated ü§ó Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics      # evaluation metrics
)

trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501379977.0, style=ProgressStyle(descri‚Ä¶




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
1,0.7478,0.74395,0.822838,0.805957,0.812492,0.822838,2.903,410.264
2,0.4076,0.618649,0.860621,0.85691,0.86509,0.860621,3.2912,361.878
3,0.1382,0.610031,0.877414,0.876592,0.880875,0.877414,3.1241,381.229
4,0.0748,0.602394,0.879933,0.879208,0.884185,0.879933,3.199,372.305


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1192, training_loss=0.5739878607326306, metrics={'train_runtime': 212.8889, 'train_samples_per_second': 5.599, 'total_flos': 0, 'epoch': 4.0})

In [102]:
trainer.evaluate()

{'epoch': 4.0,
 'eval_accuracy': 0.9403862300587741,
 'eval_f1': 0.9405224041785631,
 'eval_loss': 0.32615673542022705,
 'eval_precision': 0.9409240124445233,
 'eval_recall': 0.9403862300587741,
 'eval_runtime': 5.8274,
 'eval_samples_per_second': 204.38}

In [None]:
%reload_ext tensorboard
%tensorboard --logdir logs