<a href="https://colab.research.google.com/github/Jontpan/master-thesis/blob/master/QCBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers
!pip install wandb

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 16.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 34.3MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 50.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=0b83

In [3]:
import os
import re
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import wandb

wandb.login()

%matplotlib inline
%load_ext tensorboard

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
%env WANDB_LOG_MODEL=true

env: WANDB_LOG_MODEL=true


In [9]:
train_data = pd.read_csv('drive/MyDrive/master-thesis/data/swe_qaqc_train.csv')
test_data = pd.read_csv('drive/MyDrive/master-thesis/data/swe_qaqc_test.csv')


train_data.sample(5)

Unnamed: 0.1,Unnamed: 0,text,verbose label,course label,fine label
5928,5928,Men hjälp av vad sker bakningen på industribag...,ENTY:other,2,17
3250,3250,Vilken Lewis Carroll-bok introducerade Humpty ...,ENTY:cremat,2,9
770,770,Varför gick kycklingen över vägen?,DESC:reason,0,5
3437,3437,Vilket hockeylag spelade Wayne Gretzky för?,HUM:gr,5,29
1122,1122,Vem tillhandahåller telefontjänster i Orange C...,HUM:gr,5,29


In [17]:
from sklearn.model_selection import train_test_split

X_train = train_data['text'].tolist()
y_train = train_data['fine label'].tolist()
X_test = test_data['text'].to_list()
y_test = test_data['fine label'].to_list()


In [19]:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')

# Concatenate train data and test data
all_texts = np.concatenate([X_train, X_test])

# Encode our concatenated data
encoded_texts = [tok.encode(sent, add_special_tokens=True) for sent in all_texts]

# Find the maximum length
max_len = max([len(sent) for sent in encoded_texts])
print('Max length: ', max_len)

Max length:  43


In [20]:
train_encodings = tok(list(X_train), truncation=True, padding=True, max_length=max_len)
test_encodings = tok(list(X_test), truncation=True, padding=True, max_length=max_len)

In [21]:
class QAQCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = QAQCDataset(train_encodings, y_train)
test_dataset = QAQCDataset(test_encodings, y_test)

In [25]:
%%time
from transformers import AutoModel
import torch.nn as nn
from transformers.modeling_outputs import MultipleChoiceModelOutput

class SwEAT_BERT(nn.Module):

    def __init__(self, num_labels):
        super(SwEAT_BERT, self).__init__()
        self.num_labels = num_labels
        self.bert = AutoModel.from_pretrained('KB/bert-base-swedish-cased')
        self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.bert.config.hidden_size, self.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        ):

        return_dict = return_dict if return_dict is not None else self.bert.config.use_return_dict
        num_choices = self.num_labels

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

CPU times: user 44 µs, sys: 0 ns, total: 44 µs
Wall time: 47.9 µs


In [23]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    prec, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'recall': recall,
        'precision': prec
    }

In [26]:
def model_init(num_labels):
    return SwEAT_BERT(num_labels=num_labels)

In [27]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs=4,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=0,              # number of warmup steps for learning rate scheduler
    learning_rate=5e-5,
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    run_name='SwEAT-BERT'
)

model = model_init(50)

trainer = Trainer(
    model=model,
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,             # evaluation dataset
    compute_metrics=compute_metrics      # evaluation metrics
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501379977.0, style=ProgressStyle(descri…




In [28]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mjontpan[0m (use `wandb login --relogin` to force relogin)




Step,Training Loss
100,2.2461
200,1.149
300,0.8368
400,0.7144
500,0.5076
600,0.4401
700,0.4352
800,0.3064
900,0.2117
1000,0.2134


TrainOutput(global_step=1492, training_loss=0.5126462986258336, metrics={'train_runtime': 234.2579, 'train_samples_per_second': 6.369, 'total_flos': 0, 'epoch': 4.0})

In [29]:
trainer.evaluate()



{'epoch': 4.0,
 'eval_accuracy': 0.922,
 'eval_f1': 0.9210996824649069,
 'eval_loss': 0.49302026629447937,
 'eval_precision': 0.9281413919413919,
 'eval_recall': 0.922,
 'eval_runtime': 0.6919,
 'eval_samples_per_second': 722.658}