In [33]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@File    : data_processing.py
@IDE     : PyCharm
@Author  : Yaokun Li
@Date    : 2022/10/18 20:30
@Description :
'''

import gensim
from nltk.util import bigrams
from nltk.util import trigrams


def getLanguageDataSet(data, language):
    """Return the records of ``data`` whose ``'language'`` field equals ``language``.

    ``data`` must provide a ``filter(predicate)`` method.  The predicate is
    called with one record at a time and indexes it with ``'language'``.
    Whatever ``data.filter`` returns is handed back unchanged.
    """

    def _is_requested_language(record):
        return record['language'] == language

    return data.filter(_is_requested_language)


def getJapaneseDataSet(data):
    """Return only the records of ``data`` whose language is ``"japanese"``."""
    return getLanguageDataSet(data, language="japanese")


def getEnglishDataSet(data):
    """Return only the records of ``data`` whose language is ``"english"``."""
    return getLanguageDataSet(data, language="english")


def getFinnishDataSet(data):
    """Return only the records of ``data`` whose language is ``"finnish"``."""
    return getLanguageDataSet(data, language="finnish")


# Maximum number of distinct tokens kept in a vocabulary.  QADataSet also uses
# this value as the index assigned to tokens that are missing from the
# vocabulary.
MAX_VOCAB_SIZE = 5000
# Special token strings.
UNK = '<UNK>'
PAD = '<PAD>'


def build_vocab(sent_list, max_size, min_freq, tokenizer):
    """Build a token -> index mapping from the most frequent tokens.

    Every sentence in ``sent_list`` is split with ``tokenizer`` and token
    occurrences are counted.  Tokens seen fewer than ``min_freq`` times are
    dropped, the rest are ordered by descending count (ties keep first-seen
    order) and only the first ``max_size`` are kept.  Index 0 goes to the most
    frequent token.
    """
    counts = {}
    for sentence in sent_list:
        for token in tokenizer(sentence):
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1

    frequent = [(token, seen) for token, seen in counts.items() if seen >= min_freq]
    # list.sort is stable, so equally frequent tokens stay in first-seen order.
    frequent.sort(key=lambda pair: pair[1], reverse=True)

    return {token: rank for rank, (token, _) in enumerate(frequent[:max_size])}



In [43]:

class QADataSet():
    """Question/document pairs with hand-crafted answerability features.

    Each element of ``dataset`` is indexed with ``"question_text"``,
    ``"document_plaintext"`` and ``"annotations"`` (which in turn provides
    ``"answer_text"`` and ``"answer_start"``).  Questions and documents are
    lower-cased, tokenised with ``tokenizer`` and mapped to integer ids.
    Tokens missing from the vocabulary get the id ``MAX_VOCAB_SIZE``.

    Tensors are placed on the GPU when CUDA is available and on the CPU
    otherwise, so the class can also be used on CPU-only machines.

    Args:
        tokenizer: Callable turning a string into a sequence of tokens.
        dataset: Iterable of records with the fields described above.
        vocabulary: Optional token -> id mapping to reuse (for example the
            vocabulary of the training split, so that validation features use
            the same column indices).  When ``None`` a vocabulary is built
            from this dataset's questions and documents.
    """

    def __init__(self, tokenizer, dataset, vocabulary=None):
        self.vocabulary = vocabulary
        self.tokenizer = tokenizer
        # Fall back to the CPU instead of crashing when no GPU is present.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Word2vec vectors are loaded lazily and shared by the *_vec methods.
        self._w2v_model = None
        self.question = []
        self.answer_text = []
        self.answer_start = []
        self.document = []
        self.answer_label = []

        for element in dataset:
            annotations = element["annotations"]
            self.question.append(element["question_text"].lower())
            self.answer_text.append(annotations["answer_text"][0])
            self.answer_start.append(annotations["answer_start"])
            self.document.append(element["document_plaintext"].lower())
            # An answer_start of [-1] marks the pair as unanswerable (label 0).
            answerable = 0 if annotations["answer_start"] == [-1] else 1
            self.answer_label.append(
                torch.tensor([answerable], dtype=torch.int64, device=self.device))

        self.tokenized_answer_text = [self.__tokenize(s) for s in self.answer_text]
        self.tokenized_question = [self.__tokenize(s) for s in self.question]
        self.tokenized_document = [self.__tokenize(s) for s in self.document]

        if self.vocabulary is None:
            self.get_vocab()
        self.document_num = [self.__numericalize(sent) for sent in self.tokenized_document]
        self.question_num = [self.__numericalize(sent) for sent in self.tokenized_question]

    def get_vocab(self):
        """Build, store and return the vocabulary of questions and documents."""
        self.vocabulary = build_vocab(self.question + self.document, MAX_VOCAB_SIZE, 2, self.tokenizer)

        return self.vocabulary

    def __tokenize(self, l, with_stop_word=True):
        """Tokenise ``l`` with the configured tokenizer.

        ``with_stop_word`` is accepted but not used.
        """
        return self.tokenizer(l)

    def __numericalize(self, tokens):
        """Map tokens to vocabulary ids; unknown tokens get ``MAX_VOCAB_SIZE``."""
        return [self.vocabulary.get(word, MAX_VOCAB_SIZE) for word in tokens]

    def get_overlaps_words_num(self):
        """For each pair, count question token ids that also occur in the document.

        Repeated question tokens are counted once per occurrence.
        """
        overlaps_words_num = []
        for question, document in zip(self.question_num, self.document_num):
            document_ids = set(document)  # O(1) membership instead of a list scan
            overlaps_words_num.append(sum(1 for word in question if word in document_ids))
        return overlaps_words_num

    def get_document_length(self):
        """Return the number of tokens of every document."""
        return [len(document) for document in self.document_num]

    def get_question_length(self):
        """Return the number of tokens of every question."""
        return [len(question) for question in self.question_num]

    @staticmethod
    def _ngram_overlap(question_tokens, document_tokens, n):
        """Count the question's consecutive n-token tuples that occur in the document."""
        question_tokens = list(question_tokens)
        document_tokens = list(document_tokens)
        # zip over n shifted copies yields every window of n consecutive tokens.
        document_ngrams = set(zip(*(document_tokens[i:] for i in range(n))))
        question_ngrams = zip(*(question_tokens[i:] for i in range(n)))
        return sum(1 for gram in question_ngrams if gram in document_ngrams)

    def get_overlaps_2_gram(self):
        """For each pair, count question bigrams that also occur in the document."""
        return [self._ngram_overlap(question, document, 2)
                for question, document in zip(self.tokenized_question, self.tokenized_document)]

    def get_overlaps_3_gram(self):
        """For each pair, count question trigrams that also occur in the document."""
        return [self._ngram_overlap(question, document, 3)
                for question, document in zip(self.tokenized_question, self.tokenized_document)]

    def get_label(self):
        """Return all answerability labels concatenated into one 1-D int64 tensor."""
        return torch.cat(self.answer_label, dim=0)

    @staticmethod
    def _bag_of_words(id_sequences, vocab_size):
        """Return one count vector of length ``vocab_size`` per id sequence."""
        data = []
        for ids in id_sequences:
            bow = [0] * vocab_size
            for word in ids:
                bow[word] += 1
            data.append(bow)
        return data

    def get_question_bow(self, vocab_size):
        """Return bag-of-words count lists (length ``vocab_size``) for every question.

        ``vocab_size`` must be larger than every token id, otherwise an
        ``IndexError`` is raised.
        """
        return self._bag_of_words(self.question_num, vocab_size)

    def get_doc_bow(self, vocab_size):
        """Return bag-of-words count lists (length ``vocab_size``) for every document.

        ``vocab_size`` must be larger than every token id, otherwise an
        ``IndexError`` is raised.
        """
        return self._bag_of_words(self.document_num, vocab_size)

    def get_features(self):
        """Return the float32 feature matrix, one row per question/document pair.

        Columns are: question bag-of-words (``MAX_VOCAB_SIZE + 1`` columns, the
        last one counting unknown tokens), document bag-of-words (same width),
        bigram overlap count and trigram overlap count.
        """
        bigram_overlap = self.get_overlaps_2_gram()
        trigram_overlap = self.get_overlaps_3_gram()
        # feature3 = self.get_document_length()
        # feature4 = self.get_question_length()
        feature_ques_bow = torch.tensor(self.get_question_bow(MAX_VOCAB_SIZE + 1), dtype=torch.float32)
        feature_doc_bow = torch.tensor(self.get_doc_bow(MAX_VOCAB_SIZE + 1), dtype=torch.float32)
        X = torch.tensor([bigram_overlap, trigram_overlap], dtype=torch.float32).t()
        return torch.cat([feature_ques_bow, feature_doc_bow, X], dim=1).to(self.device)

    def _mean_word_vectors(self, tokenized_sentences):
        """Return one averaged word2vec vector per tokenised sentence.

        The vectors are read from ``week1/vector.txt`` on first use and cached
        on the instance.  Tokens without a vector are skipped; a sentence with
        no known token yields an all-zero row.  The result has shape
        ``(len(tokenized_sentences), vector_size)``.
        """
        if getattr(self, "_w2v_model", None) is None:
            self._w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
                "week1/vector.txt", binary=False)
        w2v = self._w2v_model

        rows = []
        for tokens in tokenized_sentences:
            # get_vector expects a single token, so look tokens up one by one.
            vectors = [torch.tensor(w2v.get_vector(token), dtype=torch.float32)
                       for token in tokens if token in w2v]
            if vectors:
                rows.append(torch.stack(vectors).mean(dim=0, keepdim=True))
            else:
                rows.append(torch.zeros(1, w2v.vector_size, dtype=torch.float32))

        if not rows:
            return torch.zeros(0, w2v.vector_size, dtype=torch.float32).to(self.device)
        return torch.cat(rows, dim=0).to(self.device)

    def get_answer_text_vec(self):
        """Return the mean word2vec vector of every answer text."""
        return self._mean_word_vectors(self.tokenized_answer_text)

    def get_document_vec(self):
        """Return the mean word2vec vector of every document."""
        return self._mean_word_vectors(self.tokenized_document)

    def get_question_vec(self):
        """Return the mean word2vec vector of every question."""
        return self._mean_word_vectors(self.tokenized_question)


In [44]:
from torch import nn


class AnswerableClassifier(nn.Module):
    """Two-layer feed-forward classifier over a fixed-size feature vector.

    The forward pass is Linear -> Dropout(0.2) -> ReLU -> Linear and returns
    unnormalised scores with ``num_labels`` entries in the last dimension.

    Args:
        vocab_size: Size of the last dimension of the input features.
        num_labels: Number of output classes.
        num_hidden: Width of the hidden layer.
    """

    def __init__(self, vocab_size, num_labels=2, num_hidden=100):
        super().__init__()
        self.linear = nn.Linear(in_features=vocab_size, out_features=num_hidden)
        self.dropout = nn.Dropout(p=0.2)
        self.nonlinear = nn.ReLU()
        self.final = nn.Linear(in_features=num_hidden, out_features=num_labels)

    def forward(self, bow_vec):
        """Return class scores for ``bow_vec``."""
        hidden = self.linear(bow_vec)
        hidden = self.dropout(hidden)  # dropout is applied before the ReLU
        hidden = self.nonlinear(hidden)
        return self.final(hidden)


In [45]:
import tokenizer
import torch.utils.data as Data
from datasets import load_dataset
import torch
from torch import nn
import spacy

In [46]:
# Load the answerable TyDi QA dataset and keep handles to the two splits used
# in the cells below.
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set, validation_set = dataset["train"], dataset["validation"]

Using custom data configuration copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6
Reusing dataset parquet (/home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [47]:
# Training hyper-parameters.  batch_size feeds the DataLoaders and lr the
# optimizer.  NOTE(review): `epochs` is defined here, but the training call
# further down passes a literal 20 -- confirm which value is intended.
epochs, batch_size, lr = 30, 64, 0.0005

In [48]:
from torchtext.data import get_tokenizer
# Tokenizer handed to QADataSet, which calls it on every question, document
# and answer string.
torch_tokenizer = get_tokenizer('basic_english', language="en")

In [49]:
# Build the English training split, turn it into a feature matrix plus labels
# and wrap both in a shuffled mini-batch loader.
train_english_qa_dataset = QADataSet(
    tokenizer=torch_tokenizer,
    dataset=tokenizer.getEnglishDataSet(train_set),
)
train_features = train_english_qa_dataset.get_features()
train_label = train_english_qa_dataset.get_label()
train_features_model_dataset = Data.TensorDataset(train_features, train_label)
train_features_model_loader = Data.DataLoader(
    train_features_model_dataset,
    batch_size=batch_size,
    shuffle=True,
)

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e961ad4e6a80dccd.arrow


In [50]:
val_english_qa_dataset = QADataSet(torch_tokenizer,
                                   tokenizer.getEnglishDataSet(validation_set))
# QADataSet builds its vocabulary from the data it receives, so the ids it just
# assigned come from a validation-only vocabulary.  The classifier is trained
# on columns indexed by the TRAINING vocabulary, so re-index the validation
# tokens with it; otherwise the bag-of-words columns of the two splits refer
# to different words.
_train_vocab = train_english_qa_dataset.vocabulary
val_english_qa_dataset.vocabulary = _train_vocab
val_english_qa_dataset.question_num = [
    [_train_vocab.get(token, MAX_VOCAB_SIZE) for token in sentence]
    for sentence in val_english_qa_dataset.tokenized_question
]
val_english_qa_dataset.document_num = [
    [_train_vocab.get(token, MAX_VOCAB_SIZE) for token in sentence]
    for sentence in val_english_qa_dataset.tokenized_document
]
val_features = val_english_qa_dataset.get_features()
val_label = val_english_qa_dataset.get_label()
val_features_model_dataset = Data.TensorDataset(val_features, val_label)
val_features_model_loader = Data.DataLoader(dataset=val_features_model_dataset,
                                            batch_size= batch_size,
                                            shuffle=True)

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-5432f3a6e01e68fc.arrow


In [51]:
def train_features_model( model, train_loader, criterion, optimizer, model_file_name, epochs):
    """Train ``model`` and checkpoint the weights of the best epoch.

    For every batch the loss and batch accuracy are printed.  At the end of
    each epoch the accuracy over all samples seen in that epoch is computed,
    and the model's ``state_dict`` is written to ``model_file_name`` whenever
    this epoch accuracy improves on the best one so far (the first completed
    epoch is always saved).  Selecting on whole-epoch accuracy avoids picking
    a checkpoint because of one lucky, possibly very small, mini-batch.

    Args:
        model: Module called as ``model(features)``.
        train_loader: Iterable yielding ``(features, label)`` batches.
        criterion: Loss called as ``criterion(predictions, label)``.
        optimizer: Optimizer updating the parameters of ``model``.
        model_file_name: Path the best ``state_dict`` is saved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The best epoch-level training accuracy, or 0 if no sample was seen.
    """
    max_acc = 0
    checkpoint_written = False
    for epoch in range(epochs):
        model.train()
        correct_in_epoch = 0
        seen_in_epoch = 0

        for batch_num, (features, label) in enumerate(train_loader, start=1):
            predict_label = model(features)
            loss = criterion(predict_label, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Index of the highest score is the predicted class.
            pred = predict_label.max(-1, keepdim=True)[1]
            batch_correct = pred.eq(label.view_as(pred)).sum().item()
            batch_total = predict_label.shape[0]
            acc = batch_correct / batch_total
            correct_in_epoch += batch_correct
            seen_in_epoch += batch_total
            print("epoch:", epoch + 1, "batch_num:", batch_num, "loss:", round(loss.item(), 4), "acc:", acc)

        if seen_in_epoch == 0:
            # Empty loader: nothing to score and nothing worth saving.
            continue

        epoch_acc = correct_in_epoch / seen_in_epoch
        if epoch_acc > max_acc or not checkpoint_written:
            torch.save(model.state_dict(), model_file_name)
            checkpoint_written = True
        max_acc = max(max_acc, epoch_acc)
    return max_acc


In [52]:
criterion = nn.CrossEntropyLoss(reduction="sum")  # loss function
# Size the input layer from the feature matrix and place the model on the same
# device as the features, instead of hard-coding 10004 and 'cuda'.
english_model = AnswerableClassifier(vocab_size=train_features.shape[1], num_labels=2,
                                     num_hidden=100).to(train_features.device)
optimizer = torch.optim.Adam(english_model.parameters(), lr = lr, amsgrad=True)

# NOTE(review): 20 epochs are requested here although the module-level `epochs`
# is 30 -- confirm which value is intended.
max_acc = train_features_model(model = english_model, train_loader=train_features_model_loader,
                               criterion= criterion, optimizer=optimizer, model_file_name="english_model.pth",
                               epochs = 20)
print("max_acc:", max_acc)
# Evaluate the saved checkpoint on the validation features.
english_model.load_state_dict(torch.load("english_model.pth"))
english_model.eval()
# No gradients are needed for evaluation; skipping them avoids building an
# autograd graph over the whole validation set.
with torch.no_grad():
    predict_label = english_model(val_features)
pred = predict_label.max(-1, keepdim=True)[1]
label = val_label
test_acc = pred.eq(label.view_as(pred)).sum().item() / predict_label.shape[0]

print("test acc:", test_acc)

epoch: 1 batch_num: 1 loss: 44.2427 acc: 0.515625
epoch: 1 batch_num: 2 loss: 44.0469 acc: 0.484375
epoch: 1 batch_num: 3 loss: 43.2315 acc: 0.671875
epoch: 1 batch_num: 4 loss: 43.2498 acc: 0.609375
epoch: 1 batch_num: 5 loss: 42.0908 acc: 0.59375
epoch: 1 batch_num: 6 loss: 44.7493 acc: 0.578125
epoch: 1 batch_num: 7 loss: 45.2881 acc: 0.546875
epoch: 1 batch_num: 8 loss: 42.9856 acc: 0.53125
epoch: 1 batch_num: 9 loss: 45.4687 acc: 0.484375
epoch: 1 batch_num: 10 loss: 45.7899 acc: 0.546875
epoch: 1 batch_num: 11 loss: 42.0714 acc: 0.640625
epoch: 1 batch_num: 12 loss: 43.8922 acc: 0.59375
epoch: 1 batch_num: 13 loss: 43.0143 acc: 0.703125
epoch: 1 batch_num: 14 loss: 44.1418 acc: 0.65625
epoch: 1 batch_num: 15 loss: 42.6057 acc: 0.765625
epoch: 1 batch_num: 16 loss: 42.3941 acc: 0.640625
epoch: 1 batch_num: 17 loss: 42.6035 acc: 0.609375
epoch: 1 batch_num: 18 loss: 44.1874 acc: 0.546875
epoch: 1 batch_num: 19 loss: 43.3476 acc: 0.578125
epoch: 1 batch_num: 20 loss: 40.5957 acc: 0.