# Data processing

In [57]:
import torch
device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")


In [58]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@File    : data_processing.py
@IDE     : PyCharm
@Author  : Yaokun Li
@Date    : 2022/10/18 20:30
@Description :
'''

import gensim
from nltk.util import bigrams
from nltk.util import trigrams


def getLanguageDataSet(data, language):
    return data.filter(lambda x: x['language'] == language)


def getJapaneseDataSet(data):
    return getLanguageDataSet(data, "japanese")


def getEnglishDataSet(data):
    return getLanguageDataSet(data, "english")


def getFinnishDataSet(data):
    return getLanguageDataSet(data, "finnish")


MAX_VOCAB_SIZE = 20000
UNK, PAD = '<UNK>', '<PAD>'


def build_vocab(sent_list, max_size, min_freq, tokenizer):
    vocab_dic = {}
    for sent in sent_list:
        for word in tokenizer(sent):
            vocab_dic[word] = vocab_dic.get(word, 0) + 1
    print(len(vocab_dic))
    vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq], key=lambda x: x[1], reverse=True)[
                 :max_size]
    vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
    print("final voc length:", len(vocab_dic))
    return vocab_dic


In [59]:
class QADataSet():
    English_vocab = {}
    Japanese_vocab = {}
    Finnish_vocab = {}
    def __init__(self, tokenizer, dataset,language = "english", vocab_size = 10000):
        self.vocabulary = None
        self.tokenizer = tokenizer
        self.question = []
        self.answer_text = []
        self.answer_start = []
        self.document = []
        self.tokenized_question = []
        self.tokenized_answer_text = []
        self.tokenized_document = []
        self.answer_label = []
        self.vocab_size = vocab_size
        self.language=language

        for element in dataset:
            self.question.append(element["question_text"].lower())
            self.answer_text.append(element["annotations"]["answer_text"][0])
            self.answer_start.append(element["annotations"]["answer_start"])
            self.document.append(element["document_plaintext"].lower())
            if (element["annotations"]["answer_start"] == [-1]):
                self.answer_label.append(torch.tensor([0], dtype=torch.int64).cuda())
            else:
                self.answer_label.append(torch.tensor([1], dtype=torch.int64).cuda())


        for s in self.answer_text:
            self.tokenized_answer_text.append(self.__tokenize(s))

        for s in self.question:
            self.tokenized_question.append(self.__tokenize(s))

        for s in self.document:
            self.tokenized_document.append(self.__tokenize(s))

        self.get_vocab(language)
        self.document_num = []
        self.question_num = []
        for sent in self.tokenized_document:
            self.document_num.append([self.vocabulary.get(word, self.vocab_size) for word in sent])
        for sent in self.tokenized_question:
            self.question_num.append([self.vocabulary.get(word, self.vocab_size) for word in sent])

    def get_vocab(self, language):
        if language == "english":
            if len(QADataSet.English_vocab) != 0:
                self.vocabulary = QADataSet.English_vocab
            else:
                QADataSet.English_vocab = build_vocab(self.question + self.document, self.vocab_size, 2, self.tokenizer)
                self.vocabulary = QADataSet.English_vocab
        elif language == "japanese":
            if len(QADataSet.Japanese_vocab) != 0:
                self.vocabulary = QADataSet.Japanese_vocab
            else:
                QADataSet.Japanese_vocab = build_vocab(self.question + self.document, self.vocab_size, 2, self.tokenizer)
                self.vocabulary = QADataSet.Japanese_vocab
        elif language == "finnish":
            if len(QADataSet.Finnish_vocab) != 0:
                self.vocabulary = QADataSet.Finnish_vocab
            else:
                QADataSet.Finnish_vocab = build_vocab(self.question + self.document, self.vocab_size, 2, self.tokenizer)
                self.vocabulary = QADataSet.Finnish_vocab
        return self.vocabulary

    def __tokenize(self, l, with_stop_word=True):
        return self.tokenizer(l)

    def get_overlaps_words_num(self):
        overlaps_words_num = []
        for question, document in zip(self.question_num, self.document_num):
            count = 0
            for word in question:
                if word in document:
                    count = count + 1
            overlaps_words_num.append(count)
        return overlaps_words_num

    def get_document_length(self):
        return [len(document) for document in self.document_num]

    def get_question_length(self):
        return [len(question) for question in self.question_num]

    def get_overlaps_2_gram(self):
        overlaps_words_num = []
        for question, document in zip(self.tokenized_question, self.tokenized_document):
            count = 0
            doc_bigrams = list(bigrams(document))
            for word in bigrams(question):
                if word in doc_bigrams:
                    count = count + 1
            overlaps_words_num.append(count)
        return overlaps_words_num

    def get_overlaps_3_gram(self):
        overlaps_words_num = []
        for question, document in zip(self.tokenized_question, self.tokenized_document):
            count = 0
            doc_bigrams = list(trigrams(document))
            for word in trigrams(question):
                if word in doc_bigrams:
                    count = count + 1
            overlaps_words_num.append(count)
        return overlaps_words_num

    def get_label(self):
        return torch.cat(self.answer_label, dim=0)

    def get_question_bow(self, vocab_size):
        data = []
        for ques in self.question_num:
            bow = [0]*vocab_size
            for word in ques:
                bow[word] += 1
            data.append(bow)
        return data

    def get_doc_bow(self, vocab_size):
        data = []
        for ques in self.document_num:
            bow = [0] * vocab_size
            for word in ques:
                bow[word] += 1
            data.append(bow)
        return data

    def get_features(self):
        feature1 = self.get_overlaps_words_num()
        feature2 = self.get_overlaps_2_gram()
        feature5 = self.get_overlaps_3_gram()
        feature3 = self.get_document_length()
        feature4 = self.get_question_length()
        feature_ques_bow = torch.Tensor(self.get_question_bow(self.vocab_size + 1)).cuda()
        feature_doc_bow = torch.Tensor(self.get_doc_bow(self.vocab_size + 1)).cuda()
        X = torch.Tensor([feature1,feature2, feature3, feature4,feature5]).t().cuda()
        return torch.cat([feature_ques_bow,feature_doc_bow, X], dim = 1)


In [60]:
import tokenizer
import torch.utils.data as Data
from datasets import load_dataset
import torch
from torch import nn
import spacy

In [61]:
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

Using custom data configuration copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6
Reusing dataset parquet (/root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

# Model

In [62]:
epochs = 5
batch_size = 64
lr = 0.0001

In [63]:
from torch import nn

class AnswerableClassifier(nn.Module):
    def __init__(self, vocab_size, num_labels = 2, num_hidden = 20):
        super(AnswerableClassifier, self).__init__()
        self.linear = nn.Linear(vocab_size, num_hidden)
        self.dropout = nn.Dropout(0.5)
        self.nonlinear = nn.ReLU()
        self.final = nn.Linear(num_hidden, num_labels)

    def forward(self, bow_vec):
        return self.final(self.nonlinear(self.dropout(self.linear(bow_vec))))


In [64]:
def train_features_model( model, train_loader, criterion, optimizer, model_file_name, epochs):
    max_acc = 0
    for epoch in range(epochs):
        model.train()
        batch_num = 0

        for features, label in train_loader:
            predict_label = model(features)
            loss = criterion(predict_label, label)

            pred = predict_label.max(-1, keepdim=True)[1]
            acc = pred.eq(label.view_as(pred)).sum().item() / predict_label.shape[0]
            optimizer.zero_grad()
            if (acc > max_acc):
                max_acc = acc
                torch.save(model.state_dict(), model_file_name)
            loss.backward()
            optimizer.step()
            batch_num += 1
            print("epoch:", epoch + 1, "batch_num:", batch_num, "loss:", round(loss.item(), 4), "acc:", acc)
    return max_acc


# English Dataset

In [65]:
from torchtext.data import get_tokenizer
torch_tokenizer = get_tokenizer('basic_english', language="en")
English_vocab_size = 30000

english_tokenizer = spacy.load("en_core_web_sm")
def new_english_tokenizer(sent):
    return [token.text for token in english_tokenizer(sent)]

In [66]:
train_english_qa_dataset = QADataSet(new_english_tokenizer, getEnglishDataSet(train_set), language = "english", vocab_size = English_vocab_size)
train_features = train_english_qa_dataset.get_features()
train_label = train_english_qa_dataset.get_label()
train_features_model_dataset = Data.TensorDataset(train_features, train_label)
train_features_model_loader = Data.DataLoader(dataset=train_features_model_dataset,
                                              batch_size=batch_size,
                                              shuffle=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-1103e6c04ff44af3.arrow


61385
final voc length: 27211


In [67]:
val_english_qa_dataset = QADataSet(new_english_tokenizer, getEnglishDataSet(validation_set),  vocab_size = English_vocab_size)
val_features = val_english_qa_dataset.get_features()
val_label = val_english_qa_dataset.get_label()
val_features_model_dataset = Data.TensorDataset(val_features, val_label)
val_features_model_loader = Data.DataLoader(dataset=val_features_model_dataset,
                                            batch_size= batch_size,
                                            shuffle=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-dabfcfd450c9224c.arrow


In [130]:
criterion = nn.CrossEntropyLoss(reduction="sum")  # loss function
english_model = AnswerableClassifier(vocab_size= 2*English_vocab_size + 7, num_labels=2, num_hidden=100).to('cuda')
optimizer = torch.optim.Adam(english_model.parameters(), lr = 0.0005, amsgrad=True)

max_acc = train_features_model(model = english_model, train_loader=train_features_model_loader,
                               criterion= criterion, optimizer=optimizer, model_file_name="english_model.pth",
                               epochs = 3)
print("max_acc:", max_acc)
english_model.load_state_dict(torch.load("english_model.pth"))
english_model.eval()
predict_label = english_model(val_features)
pred = predict_label.max(-1, keepdim=True)[1]
label = val_label
test_acc = pred.eq(label.view_as(pred)).sum().item() / predict_label.shape[0]

print("test acc:", test_acc)

epoch: 1 batch_num: 1 loss: 44.8093 acc: 0.453125
epoch: 1 batch_num: 2 loss: 44.7177 acc: 0.515625
epoch: 1 batch_num: 3 loss: 42.5308 acc: 0.625
epoch: 1 batch_num: 4 loss: 44.5481 acc: 0.484375
epoch: 1 batch_num: 5 loss: 43.1645 acc: 0.546875
epoch: 1 batch_num: 6 loss: 42.9111 acc: 0.609375
epoch: 1 batch_num: 7 loss: 45.1337 acc: 0.484375
epoch: 1 batch_num: 8 loss: 46.6303 acc: 0.46875
epoch: 1 batch_num: 9 loss: 44.0609 acc: 0.484375
epoch: 1 batch_num: 10 loss: 44.0736 acc: 0.453125
epoch: 1 batch_num: 11 loss: 40.3666 acc: 0.703125
epoch: 1 batch_num: 12 loss: 42.5462 acc: 0.625
epoch: 1 batch_num: 13 loss: 42.5296 acc: 0.5625
epoch: 1 batch_num: 14 loss: 42.3705 acc: 0.671875
epoch: 1 batch_num: 15 loss: 43.3426 acc: 0.625
epoch: 1 batch_num: 16 loss: 42.3368 acc: 0.671875
epoch: 1 batch_num: 17 loss: 40.8403 acc: 0.65625
epoch: 1 batch_num: 18 loss: 41.3647 acc: 0.6875
epoch: 1 batch_num: 19 loss: 41.5203 acc: 0.609375
epoch: 1 batch_num: 20 loss: 42.884 acc: 0.65625
epoch:

epoch: 1 batch_num: 197 loss: 31.4266 acc: 0.734375
epoch: 1 batch_num: 198 loss: 32.1646 acc: 0.765625
epoch: 1 batch_num: 199 loss: 25.0558 acc: 0.828125
epoch: 1 batch_num: 200 loss: 34.6397 acc: 0.78125
epoch: 1 batch_num: 201 loss: 27.5995 acc: 0.875
epoch: 1 batch_num: 202 loss: 22.9048 acc: 0.859375
epoch: 1 batch_num: 203 loss: 30.2297 acc: 0.84375
epoch: 1 batch_num: 204 loss: 26.7127 acc: 0.8125
epoch: 1 batch_num: 205 loss: 32.5531 acc: 0.765625
epoch: 1 batch_num: 206 loss: 27.6195 acc: 0.859375
epoch: 1 batch_num: 207 loss: 31.2814 acc: 0.78125
epoch: 1 batch_num: 208 loss: 29.2752 acc: 0.84375
epoch: 1 batch_num: 209 loss: 26.5395 acc: 0.8125
epoch: 1 batch_num: 210 loss: 27.9996 acc: 0.84375
epoch: 1 batch_num: 211 loss: 36.965 acc: 0.734375
epoch: 1 batch_num: 212 loss: 30.2268 acc: 0.828125
epoch: 1 batch_num: 213 loss: 27.08 acc: 0.828125
epoch: 1 batch_num: 214 loss: 26.8855 acc: 0.84375
epoch: 1 batch_num: 215 loss: 2.891 acc: 0.4
epoch: 2 batch_num: 1 loss: 21.3574

epoch: 2 batch_num: 179 loss: 28.1441 acc: 0.796875
epoch: 2 batch_num: 180 loss: 26.1352 acc: 0.875
epoch: 2 batch_num: 181 loss: 22.1133 acc: 0.875
epoch: 2 batch_num: 182 loss: 27.6349 acc: 0.765625
epoch: 2 batch_num: 183 loss: 23.6873 acc: 0.84375
epoch: 2 batch_num: 184 loss: 22.9334 acc: 0.875
epoch: 2 batch_num: 185 loss: 21.4117 acc: 0.859375
epoch: 2 batch_num: 186 loss: 18.7234 acc: 0.890625
epoch: 2 batch_num: 187 loss: 23.0184 acc: 0.859375
epoch: 2 batch_num: 188 loss: 15.7587 acc: 0.9375
epoch: 2 batch_num: 189 loss: 21.0098 acc: 0.875
epoch: 2 batch_num: 190 loss: 19.7606 acc: 0.90625
epoch: 2 batch_num: 191 loss: 22.6585 acc: 0.859375
epoch: 2 batch_num: 192 loss: 23.1133 acc: 0.859375
epoch: 2 batch_num: 193 loss: 19.1601 acc: 0.890625
epoch: 2 batch_num: 194 loss: 22.3784 acc: 0.8125
epoch: 2 batch_num: 195 loss: 24.541 acc: 0.875
epoch: 2 batch_num: 196 loss: 26.8072 acc: 0.875
epoch: 2 batch_num: 197 loss: 19.8098 acc: 0.890625
epoch: 2 batch_num: 198 loss: 26.2972

epoch: 3 batch_num: 205 loss: 26.9121 acc: 0.796875
epoch: 3 batch_num: 206 loss: 17.5091 acc: 0.90625
epoch: 3 batch_num: 207 loss: 15.856 acc: 0.890625
epoch: 3 batch_num: 208 loss: 16.4125 acc: 0.875
epoch: 3 batch_num: 209 loss: 20.1004 acc: 0.859375
epoch: 3 batch_num: 210 loss: 20.4378 acc: 0.875
epoch: 3 batch_num: 211 loss: 11.8911 acc: 0.953125
epoch: 3 batch_num: 212 loss: 20.3502 acc: 0.859375
epoch: 3 batch_num: 213 loss: 19.9837 acc: 0.84375
epoch: 3 batch_num: 214 loss: 17.1052 acc: 0.921875
epoch: 3 batch_num: 215 loss: 0.2911 acc: 1.0
max_acc: 1.0
test acc: 0.7882562277580071


In [131]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
print("confusion matrix : \n", confusion_matrix(label.cpu(),  pred.cpu()))
precision_recall_fscore_support(label.cpu(), pred.cpu(), average=None)

confusion matrix : 
 [[670 173]
 [184 659]]


(array([0.78454333, 0.79206731]),
 array([0.79478055, 0.78173191]),
 array([0.78962876, 0.78686567]),
 array([843, 843]))

In [132]:
precision_recall_fscore_support(label.cpu(), pred.cpu(), average="macro")

(0.7883053166096199, 0.7882562277580072, 0.7882472141355685, None)

# Japanese Datasets

In [70]:
import spacy


In [71]:
sentences[0]

'アップルがイギリスの新興企業を１０億ドルで購入を検討'

In [72]:
japanese_tokenizer = spacy.load("ja_core_news_sm")
japanese_vocab_size = 30000

In [73]:
def new_japanese_tokenizer(sent):
    return [token.text for token in japanese_tokenizer(sent)]

In [74]:
new_japanese_tokenizer(sentences[0])

['アップル', 'が', 'イギリス', 'の', '新興', '企業', 'を', '１０億', 'ドル', 'で', '購入', 'を', '検討']

In [75]:
train_japanese_qa_dataset = QADataSet(new_japanese_tokenizer, getJapaneseDataSet(train_set), language="japanese", vocab_size = japanese_vocab_size)
train_features = train_japanese_qa_dataset.get_features()
train_label = train_japanese_qa_dataset.get_label()
train_features_model_dataset = Data.TensorDataset(train_features, train_label)
train_features_model_loader = Data.DataLoader(dataset=train_features_model_dataset,
                                              batch_size=batch_size,
                                              shuffle=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-dc9fb4fd79187984.arrow


64303
final voc length: 30000


In [76]:
len(QADataSet.Japanese_vocab)

30000

In [77]:
val_japanese_qa_dataset = QADataSet(new_japanese_tokenizer, getJapaneseDataSet(validation_set), language="japanese", vocab_size = japanese_vocab_size)
val_features = val_japanese_qa_dataset.get_features()
val_label = val_japanese_qa_dataset.get_label()
val_features_model_dataset = Data.TensorDataset(val_features, val_label)
val_features_model_loader = Data.DataLoader(dataset=val_features_model_dataset,
                                            batch_size= batch_size,
                                            shuffle=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-cee1771b5f371cc3.arrow


In [133]:
criterion = nn.CrossEntropyLoss(reduction="sum",weight=torch.FloatTensor([1, 1]).to(device))  # loss function
japanese_model = AnswerableClassifier(vocab_size=japanese_vocab_size * 2 + 7, num_labels=2, num_hidden=100).to('cuda')
optimizer = torch.optim.Adam(japanese_model.parameters(), lr = 0.001, amsgrad=True)

max_acc = train_features_model(model = japanese_model, train_loader=train_features_model_loader,
                               criterion= criterion, optimizer=optimizer, model_file_name="japanese_model.pth",
                               epochs = 2)
print("max_acc:", max_acc)
japanese_model.load_state_dict(torch.load("japanese_model.pth"))
japanese_model.eval()
predict_label = japanese_model(val_features)
pred = predict_label.max(-1, keepdim=True)[1]
label = val_label
test_acc = pred.eq(label.view_as(pred)).sum().item() / predict_label.shape[0]

print("test acc:", test_acc)

epoch: 1 batch_num: 1 loss: 44.9062 acc: 0.515625
epoch: 1 batch_num: 2 loss: 44.2825 acc: 0.5
epoch: 1 batch_num: 3 loss: 43.7293 acc: 0.59375
epoch: 1 batch_num: 4 loss: 43.9112 acc: 0.5
epoch: 1 batch_num: 5 loss: 43.9055 acc: 0.640625
epoch: 1 batch_num: 6 loss: 43.0701 acc: 0.59375
epoch: 1 batch_num: 7 loss: 42.6467 acc: 0.625
epoch: 1 batch_num: 8 loss: 42.8872 acc: 0.546875
epoch: 1 batch_num: 9 loss: 41.3268 acc: 0.671875
epoch: 1 batch_num: 10 loss: 41.6627 acc: 0.625
epoch: 1 batch_num: 11 loss: 41.6158 acc: 0.546875
epoch: 1 batch_num: 12 loss: 40.2094 acc: 0.5625
epoch: 1 batch_num: 13 loss: 40.8105 acc: 0.625
epoch: 1 batch_num: 14 loss: 41.2021 acc: 0.5625
epoch: 1 batch_num: 15 loss: 39.1377 acc: 0.6875
epoch: 1 batch_num: 16 loss: 37.8692 acc: 0.703125
epoch: 1 batch_num: 17 loss: 40.9677 acc: 0.71875
epoch: 1 batch_num: 18 loss: 41.3198 acc: 0.625
epoch: 1 batch_num: 19 loss: 39.8226 acc: 0.6875
epoch: 1 batch_num: 20 loss: 40.0426 acc: 0.703125
epoch: 1 batch_num: 21

epoch: 1 batch_num: 183 loss: 25.4994 acc: 0.828125
epoch: 1 batch_num: 184 loss: 29.5384 acc: 0.78125
epoch: 1 batch_num: 185 loss: 29.889 acc: 0.71875
epoch: 1 batch_num: 186 loss: 30.5355 acc: 0.84375
epoch: 1 batch_num: 187 loss: 31.37 acc: 0.75
epoch: 1 batch_num: 188 loss: 26.9696 acc: 0.875
epoch: 1 batch_num: 189 loss: 30.2847 acc: 0.75
epoch: 1 batch_num: 190 loss: 29.0013 acc: 0.8125
epoch: 1 batch_num: 191 loss: 26.6842 acc: 0.8125
epoch: 1 batch_num: 192 loss: 28.92 acc: 0.8125
epoch: 1 batch_num: 193 loss: 27.7705 acc: 0.734375
epoch: 1 batch_num: 194 loss: 33.6121 acc: 0.78125
epoch: 1 batch_num: 195 loss: 30.6447 acc: 0.78125
epoch: 1 batch_num: 196 loss: 27.7857 acc: 0.78125
epoch: 1 batch_num: 197 loss: 22.0863 acc: 0.875
epoch: 1 batch_num: 198 loss: 31.6455 acc: 0.78125
epoch: 1 batch_num: 199 loss: 23.6372 acc: 0.90625
epoch: 1 batch_num: 200 loss: 26.1611 acc: 0.796875
epoch: 1 batch_num: 201 loss: 34.7989 acc: 0.796875
epoch: 1 batch_num: 202 loss: 28.4703 acc: 0.

epoch: 2 batch_num: 177 loss: 20.6794 acc: 0.890625
epoch: 2 batch_num: 178 loss: 23.4471 acc: 0.828125
epoch: 2 batch_num: 179 loss: 25.192 acc: 0.84375
epoch: 2 batch_num: 180 loss: 24.4108 acc: 0.859375
epoch: 2 batch_num: 181 loss: 19.4404 acc: 0.90625
epoch: 2 batch_num: 182 loss: 21.7478 acc: 0.875
epoch: 2 batch_num: 183 loss: 26.465 acc: 0.828125
epoch: 2 batch_num: 184 loss: 28.7263 acc: 0.859375
epoch: 2 batch_num: 185 loss: 17.7278 acc: 0.90625
epoch: 2 batch_num: 186 loss: 20.2447 acc: 0.875
epoch: 2 batch_num: 187 loss: 17.7489 acc: 0.84375
epoch: 2 batch_num: 188 loss: 20.5503 acc: 0.859375
epoch: 2 batch_num: 189 loss: 19.0373 acc: 0.90625
epoch: 2 batch_num: 190 loss: 23.497 acc: 0.8125
epoch: 2 batch_num: 191 loss: 21.2469 acc: 0.859375
epoch: 2 batch_num: 192 loss: 20.5419 acc: 0.890625
epoch: 2 batch_num: 193 loss: 18.5771 acc: 0.890625
epoch: 2 batch_num: 194 loss: 18.7909 acc: 0.890625
epoch: 2 batch_num: 195 loss: 29.8004 acc: 0.796875
epoch: 2 batch_num: 196 loss

In [134]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
print("confusion matrix : \n", confusion_matrix(label.cpu(),  pred.cpu()))
precision_recall_fscore_support(label.cpu(), pred.cpu(), average=None)

confusion matrix : 
 [[727 116]
 [256 587]]


(array([0.73957274, 0.83499289]),
 array([0.8623962 , 0.69632266]),
 array([0.79627601, 0.75937904]),
 array([843, 843]))

In [135]:
precision_recall_fscore_support(label.cpu(), pred.cpu(), average="macro")

(0.7872828120726605, 0.7793594306049823, 0.7778275279171492, None)

# Finnish Datasets

In [80]:
finnish_tokenizer = spacy.load("fi_core_news_sm")
finnish_vocab_size = 30000
def new_finnish_tokenizer(sent):
    return [token.text for token in finnish_tokenizer(sent)]

In [81]:
train_finnish_qa_dataset = QADataSet(new_finnish_tokenizer, getFinnishDataSet(train_set), language="finnish",vocab_size = finnish_vocab_size)
train_features = train_finnish_qa_dataset.get_features()
train_label = train_finnish_qa_dataset.get_label()
train_features_model_dataset = Data.TensorDataset(train_features, train_label)
train_features_model_loader = Data.DataLoader(dataset=train_features_model_dataset,
                                              batch_size=batch_size,
                                              shuffle=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-23b0a46089b23e95.arrow


160505
final voc length: 30000


In [82]:
val_finnish_qa_dataset = QADataSet(new_finnish_tokenizer, getFinnishDataSet(validation_set) , language="finnish",vocab_size = finnish_vocab_size)
val_features = val_finnish_qa_dataset.get_features()
val_label = val_finnish_qa_dataset.get_label()
val_features_model_dataset = Data.TensorDataset(val_features, val_label)
val_features_model_loader = Data.DataLoader(dataset=val_features_model_dataset,
                                            batch_size= batch_size,
                                            shuffle=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-235768edd2fc5f73.arrow


In [143]:
criterion = nn.CrossEntropyLoss(reduction="sum", weight=torch.FloatTensor([1, 1]).to(device))  # loss function
finnish_model = AnswerableClassifier(vocab_size=2*finnish_vocab_size + 7, num_labels=2, num_hidden=100).to('cuda')
optimizer = torch.optim.Adam(finnish_model.parameters(), lr = 0.0005, amsgrad=True)

max_acc = train_features_model(model = finnish_model, train_loader=train_features_model_loader,
                               criterion= criterion, optimizer=optimizer, model_file_name="finnish_model.pth",
                               epochs = 4)
print("max_acc:", max_acc)
finnish_model.load_state_dict(torch.load("finnish_model.pth"))
finnish_model.eval()
predict_label = finnish_model(val_features)
pred = predict_label.max(-1, keepdim=True)[1]
label = val_label
test_acc = pred.eq(label.view_as(pred)).sum().item() / predict_label.shape[0]

print("test acc:", test_acc)

epoch: 1 batch_num: 1 loss: 44.8371 acc: 0.546875
epoch: 1 batch_num: 2 loss: 44.7184 acc: 0.4375
epoch: 1 batch_num: 3 loss: 43.3586 acc: 0.734375
epoch: 1 batch_num: 4 loss: 45.0104 acc: 0.65625
epoch: 1 batch_num: 5 loss: 44.2091 acc: 0.578125
epoch: 1 batch_num: 6 loss: 43.5008 acc: 0.515625
epoch: 1 batch_num: 7 loss: 43.703 acc: 0.515625
epoch: 1 batch_num: 8 loss: 44.0892 acc: 0.625
epoch: 1 batch_num: 9 loss: 43.4298 acc: 0.578125
epoch: 1 batch_num: 10 loss: 44.3167 acc: 0.609375
epoch: 1 batch_num: 11 loss: 42.723 acc: 0.671875
epoch: 1 batch_num: 12 loss: 43.0372 acc: 0.625
epoch: 1 batch_num: 13 loss: 43.0186 acc: 0.640625
epoch: 1 batch_num: 14 loss: 42.361 acc: 0.71875
epoch: 1 batch_num: 15 loss: 43.4806 acc: 0.671875
epoch: 1 batch_num: 16 loss: 42.2327 acc: 0.734375
epoch: 1 batch_num: 17 loss: 42.7647 acc: 0.6875
epoch: 1 batch_num: 18 loss: 41.3204 acc: 0.703125
epoch: 1 batch_num: 19 loss: 42.0268 acc: 0.640625
epoch: 1 batch_num: 20 loss: 41.8923 acc: 0.734375
epoc

epoch: 1 batch_num: 200 loss: 26.8201 acc: 0.8125
epoch: 1 batch_num: 201 loss: 26.1493 acc: 0.84375
epoch: 1 batch_num: 202 loss: 30.1103 acc: 0.8125
epoch: 1 batch_num: 203 loss: 28.5292 acc: 0.75
epoch: 1 batch_num: 204 loss: 25.324 acc: 0.90625
epoch: 1 batch_num: 205 loss: 33.3368 acc: 0.734375
epoch: 1 batch_num: 206 loss: 30.3979 acc: 0.796875
epoch: 1 batch_num: 207 loss: 25.4685 acc: 0.84375
epoch: 1 batch_num: 208 loss: 25.9504 acc: 0.84375
epoch: 1 batch_num: 209 loss: 30.8551 acc: 0.765625
epoch: 1 batch_num: 210 loss: 28.9676 acc: 0.796875
epoch: 1 batch_num: 211 loss: 24.3642 acc: 0.828125
epoch: 1 batch_num: 212 loss: 29.1698 acc: 0.75
epoch: 1 batch_num: 213 loss: 31.3878 acc: 0.796875
epoch: 1 batch_num: 214 loss: 28.9165 acc: 0.78125
epoch: 1 batch_num: 215 loss: 1.9836 acc: 1.0
epoch: 2 batch_num: 1 loss: 20.8061 acc: 0.921875
epoch: 2 batch_num: 2 loss: 28.6255 acc: 0.78125
epoch: 2 batch_num: 3 loss: 25.1824 acc: 0.828125
epoch: 2 batch_num: 4 loss: 31.8697 acc: 0.

epoch: 2 batch_num: 207 loss: 34.4702 acc: 0.8125
epoch: 2 batch_num: 208 loss: 20.2433 acc: 0.84375
epoch: 2 batch_num: 209 loss: 24.1955 acc: 0.859375
epoch: 2 batch_num: 210 loss: 28.5616 acc: 0.84375
epoch: 2 batch_num: 211 loss: 25.7277 acc: 0.8125
epoch: 2 batch_num: 212 loss: 26.835 acc: 0.8125
epoch: 2 batch_num: 213 loss: 34.0574 acc: 0.765625
epoch: 2 batch_num: 214 loss: 26.4137 acc: 0.859375
epoch: 2 batch_num: 215 loss: 2.3314 acc: 0.8
epoch: 3 batch_num: 1 loss: 22.6765 acc: 0.84375
epoch: 3 batch_num: 2 loss: 22.1579 acc: 0.859375
epoch: 3 batch_num: 3 loss: 14.4896 acc: 0.953125
epoch: 3 batch_num: 4 loss: 18.2508 acc: 0.90625
epoch: 3 batch_num: 5 loss: 20.1426 acc: 0.890625
epoch: 3 batch_num: 6 loss: 24.1968 acc: 0.796875
epoch: 3 batch_num: 7 loss: 15.5047 acc: 0.9375
epoch: 3 batch_num: 8 loss: 14.0047 acc: 0.953125
epoch: 3 batch_num: 9 loss: 15.8012 acc: 0.90625
epoch: 3 batch_num: 10 loss: 15.0613 acc: 0.953125
epoch: 3 batch_num: 11 loss: 13.9982 acc: 0.90625
e

epoch: 3 batch_num: 163 loss: 15.0984 acc: 0.890625
epoch: 3 batch_num: 164 loss: 16.3784 acc: 0.859375
epoch: 3 batch_num: 165 loss: 21.2042 acc: 0.859375
epoch: 3 batch_num: 166 loss: 17.6731 acc: 0.9375
epoch: 3 batch_num: 167 loss: 19.0933 acc: 0.875
epoch: 3 batch_num: 168 loss: 9.4246 acc: 1.0
epoch: 3 batch_num: 169 loss: 19.5195 acc: 0.875
epoch: 3 batch_num: 170 loss: 18.0781 acc: 0.859375
epoch: 3 batch_num: 171 loss: 17.5875 acc: 0.921875
epoch: 3 batch_num: 172 loss: 15.045 acc: 0.921875
epoch: 3 batch_num: 173 loss: 23.5974 acc: 0.84375
epoch: 3 batch_num: 174 loss: 21.2827 acc: 0.890625
epoch: 3 batch_num: 175 loss: 13.9855 acc: 0.921875
epoch: 3 batch_num: 176 loss: 22.1685 acc: 0.84375
epoch: 3 batch_num: 177 loss: 9.2617 acc: 0.984375
epoch: 3 batch_num: 178 loss: 19.8459 acc: 0.875
epoch: 3 batch_num: 179 loss: 19.8476 acc: 0.859375
epoch: 3 batch_num: 180 loss: 16.0412 acc: 0.90625
epoch: 3 batch_num: 181 loss: 19.4383 acc: 0.859375
epoch: 3 batch_num: 182 loss: 13.5

epoch: 4 batch_num: 119 loss: 14.2129 acc: 0.9375
epoch: 4 batch_num: 120 loss: 12.0775 acc: 0.953125
epoch: 4 batch_num: 121 loss: 14.6275 acc: 0.890625
epoch: 4 batch_num: 122 loss: 15.0041 acc: 0.953125
epoch: 4 batch_num: 123 loss: 14.0199 acc: 0.890625
epoch: 4 batch_num: 124 loss: 14.3763 acc: 0.921875
epoch: 4 batch_num: 125 loss: 21.7991 acc: 0.859375
epoch: 4 batch_num: 126 loss: 17.8419 acc: 0.90625
epoch: 4 batch_num: 127 loss: 10.993 acc: 0.96875
epoch: 4 batch_num: 128 loss: 7.6408 acc: 1.0
epoch: 4 batch_num: 129 loss: 13.7259 acc: 0.921875
epoch: 4 batch_num: 130 loss: 12.0638 acc: 0.953125
epoch: 4 batch_num: 131 loss: 13.5236 acc: 0.890625
epoch: 4 batch_num: 132 loss: 12.4681 acc: 0.9375
epoch: 4 batch_num: 133 loss: 12.3894 acc: 0.9375
epoch: 4 batch_num: 134 loss: 9.3955 acc: 0.96875
epoch: 4 batch_num: 135 loss: 8.6478 acc: 0.96875
epoch: 4 batch_num: 136 loss: 11.3799 acc: 0.96875
epoch: 4 batch_num: 137 loss: 11.0641 acc: 0.921875
epoch: 4 batch_num: 138 loss: 10

In [144]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
print("confusion matrix : \n", confusion_matrix(label.cpu(),  pred.cpu()))
precision_recall_fscore_support(label.cpu(), pred.cpu(), average=None)

confusion matrix : 
 [[712 131]
 [241 602]]


(array([0.74711438, 0.8212824 ]),
 array([0.84460261, 0.71411625]),
 array([0.79287305, 0.76395939]),
 array([843, 843]))

In [145]:
precision_recall_fscore_support(label.cpu(), pred.cpu(), average="macro")

(0.7841983883736144, 0.7793594306049823, 0.7784162210439443, None)