In [1]:
!python -m spacy download en_core_web_sm
!python -m spacy download ja_core_news_sm
!python -m spacy download fi_core_news_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting ja-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ja_core_news_sm-3.4.0/ja_core_news_sm-3.4.0-py3-none-any.whl (12.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting sudachipy!=0.6.1,>=0.5.2
  Downloading SudachiPy-0.6.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.w

In [2]:
!pip install spacy



In [2]:
import gensim
import torchtext
import spacy
import torch.utils.data as Data
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm
from datasets import load_dataset
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from matplotlib import pyplot as plt
from collections import OrderedDict
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


torch.manual_seed(123)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7fd20fc5fe10>

In [3]:
# epochs
epochs = 150
# batch size
batch_size = 128
# learning rate
lr = 0.0001


# model parameters
input_dim = 384
hidden_dim = 256
output_dim = 2

vocab = False
language_train = ["english"]
language_val = ["english"]
model_type = "regression"

In [4]:
if vocab:
    if language_train == ["english"]:
        tokenizer = spacy.load("en_core_web_sm")
    elif language_train == ["japanese"]:
        tokenizer = spacy.load("ja_core_news_sm")
    elif language_train == ["finnish"]:
        tokenizer = spacy.load("fi_core_news_sm")
    else:
        print("bow model cant be multilingual")

In [5]:
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

Using custom data configuration copenlu--nlp_course_tydiqa-cceecfb5416d988a
Reusing dataset parquet (/home/wangqiongyan/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00, 1238.35it/s]


In [6]:
def getLanguageDataSet(data, language):
    def printAndL(x):
        return x["language"] in language
    return data.filter(printAndL)

In [7]:
def build_vocab(dataSet, tokenizer):
    counter = Counter()
    for data in dataSet:
        counter.update([token.text for token in tokenizer(data['document_plaintext'])])
    return Vocab(counter)

In [8]:
def getWord2VecModel(train_dataSet, test_dataSet, tokenizer):
    sentences = []
    keys = ["document_plaintext", "question_text"]
    print("train data to vec:")
    for element in tqdm(train_dataSet):
        for key in keys:
            sentences.append([token.text for token in tokenizer(element[key])])
    print("val data to vec:")
    for element in tqdm(test_dataSet):
        for key in keys:
            sentences.append([token.text for token in tokenizer(element[key])])
    
    w2v_model = gensim.models.Word2Vec(sentences, vector_size=input_dim, min_count=1, window=5, epochs=3)

    return w2v_model

In [9]:
def data_process_vocab(dataSet, w2vModel, tokenizer, tokenPart="document"):
    data = []
    for element in tqdm(dataSet):
        if tokenPart == "document":
            en_tensor_ = torch.tensor([w2vModel.wv.get_vector(token) for token in [token_.text for token_ in tokenizer(element["document_plaintext"])]], dtype=torch.float32)
            en_tensor_ = torch.mean(en_tensor_, dim=0, keepdim=True).cuda()
            data.append(en_tensor_)
        elif tokenPart == "question":
            en_tensor_ = torch.tensor([w2vModel.wv.get_vector(token) for token in [token_.text for token_ in tokenizer(element["question_text"])]], dtype=torch.float32)
            en_tensor_ = torch.mean(en_tensor_, dim=0, keepdim=True).cuda()
            data.append(en_tensor_)
        elif tokenPart == "answer":
            if (element["annotations"]["answer_start"] == [-1]):
                data.append(torch.tensor([0], dtype=torch.int64).cuda())
            else:
                data.append(torch.tensor([1], dtype=torch.int64).cuda())
    return torch.cat(data, dim=0)

def data_process_sentence(dataSet, SbertModel, tokenPart="document"):
    data = []
    for element in tqdm(dataSet):
        if tokenPart == "document":
            en_tensor_ = torch.tensor([SbertModel.encode(element["document_plaintext"])], dtype=torch.float32).cuda()
            data.append(en_tensor_)
        elif tokenPart == "question":
            en_tensor_ = torch.tensor([SbertModel.encode((element["question_text"]))], dtype=torch.float32).cuda()
            data.append(en_tensor_)
        elif tokenPart == "answer":
            if (element["annotations"]["answer_start"] == [-1]):
                data.append(torch.tensor([0], dtype=torch.int64).cuda())
            else:
                data.append(torch.tensor([1], dtype=torch.int64).cuda())
    return torch.cat(data, dim=0)

In [10]:
def getData(data, language):
    print("get data:")
    dataset = getLanguageDataSet(data, language)
    if vocab:
        print("bow model:")
        # englishVocab = build_vocab(dataset, tokenizer)
        model = getWord2VecModel(train_set, validation_set, tokenizer)
        answer_set = data_process_vocab(dataset, model, tokenizer, "answer")
        question_set = data_process_vocab(dataset, model, tokenizer, "question")
        document_set = data_process_vocab(dataset, model, tokenizer, "document")
    else:
        print("sentence model:")
        model = SentenceTransformer('all-MiniLM-L6-v2')
        answer_set = data_process_sentence(dataset, model, "answer")
        question_set = data_process_sentence(dataset, model, "question")
        document_set = data_process_sentence(dataset, model, "document")
    
    return answer_set, question_set, document_set

In [11]:
# train_set
answer_train_set, question_train_set, document_train_set = getData(train_set, language_train)
# val_set
answer_validation_set, question_validation_set, document_validation_set = getData(validation_set, language_val)

Loading cached processed dataset at /home/wangqiongyan/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-4e64dec92aeba576.arrow


get data:
sentence model:


100%|██████████| 8778/8778 [00:01<00:00, 5164.47it/s]
  en_tensor_ = torch.tensor([SbertModel.encode((element["question_text"]))], dtype=torch.float32).cuda()
100%|██████████| 8778/8778 [00:26<00:00, 334.70it/s]
100%|██████████| 8778/8778 [00:30<00:00, 291.60it/s]
Loading cached processed dataset at /home/wangqiongyan/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-b3d1fc6829af69d9.arrow


get data:
sentence model:


100%|██████████| 1036/1036 [00:00<00:00, 15852.28it/s]
100%|██████████| 1036/1036 [00:03<00:00, 340.25it/s]
100%|██████████| 1036/1036 [00:03<00:00, 284.11it/s]


In [12]:
class attention(nn.Module):
    def __init__(self, hidden_dim, attn_drop):
        super(attention, self).__init__()
        self.fc = nn.Linear(hidden_dim, hidden_dim, bias=True)
        nn.init.xavier_normal_(self.fc.weight, gain=1.414)

        self.tanh = nn.Tanh()
        self.att = nn.Parameter(torch.empty(size=(1, hidden_dim)), requires_grad=True)
        nn.init.xavier_normal_(self.att.data, gain=1.414)

        self.softmax = nn.Softmax()
        if attn_drop:
            self.attn_drop = nn.Dropout(attn_drop)
        else:
            self.attn_drop = lambda x: x

    def forward(self, embeds):
        beta = []
        attn_curr = self.attn_drop(self.att)
        for embed in embeds:
            sp = self.tanh(self.fc(embed)).mean(dim=0)
            beta.append(attn_curr.matmul(sp.t()))
        beta = torch.cat(beta, dim=-1).view(-1)
        beta = self.softmax(beta)
        #print(ntype+" mp ", beta.data.cpu().numpy())  # semantic attention
        z_mp = 0
        for i in range(len(embeds)):
            z_mp = z_mp+ embeds[i]*beta[i]
        return z_mp

In [13]:
class QA_model(nn.Module):
    def __init__(self,input_dim,hidden_dim,output_dim):
        super(QA_model, self).__init__()
        self.que_in_mlp = nn.Linear(input_dim,hidden_dim)
        self.context_in_mlp = nn.Linear(input_dim,hidden_dim)
        self.output_layer = nn.Linear(hidden_dim,output_dim)
        self.attention_layer = attention(hidden_dim,0.5)

    def forward(self,question,context):
        q_vec = F.leaky_relu(self.que_in_mlp(question))
        c_vec = F.leaky_relu(self.context_in_mlp(context))
        attention_out = self.attention_layer([q_vec,c_vec])
        predict_label = self.output_layer(attention_out)

        return predict_label

In [14]:
torch_dataset = Data.TensorDataset(question_train_set, document_train_set, answer_train_set)
train_loader = Data.DataLoader(dataset=torch_dataset, batch_size=batch_size, shuffle=True)

In [17]:
class QA_LR_model(nn.Module):
    def __init__(self,input_dim,hidden_dim,output_dim):
        super(QA_LR_model, self).__init__()
        self.input_layer = nn.Linear(input_dim*2,hidden_dim)
        self.relu_0 = nn.ReLU()
        self.hidden_layer_0 = nn.Linear(hidden_dim,int(hidden_dim/2))
        self.relu_1 = nn.ReLU()
        # self.hidden_layer_1 = nn.Linear(hidden_dim*2,hidden_dim)
        # self.relu_2 = nn.ReLU()
        self.output_layer = nn.Linear(int(hidden_dim/2),output_dim)

    def forward(self,data):
        input = self.input_layer(data)
        relu_0 = self.relu_0(input)
        hidden_0 = self.hidden_layer_0(relu_0)
        relu_1 = self.relu_1(hidden_0)
        # hidden_1 = self.hidden_layer_1(relu_1)
        # relu_2 = self.relu_2(hidden_1)
        output = self.output_layer(relu_1)

        return output
# class QA_LR_model(nn.Module):
#     def __init__(self,input_dim,hidden_dim,output_dim):
#         super(QA_LR_model, self).__init__()
#         self.input_layer = nn.Linear(input_dim*2,hidden_dim)
#         self.relu_0 = nn.ReLU()
#         self.hidden_layer_0 = nn.Linear(hidden_dim,hidden_dim*2)
#         self.relu_1 = nn.ReLU()
#         self.hidden_layer_1 = nn.Linear(hidden_dim*2,hidden_dim)
#         self.relu_2 = nn.ReLU()
#         self.output_layer = nn.Linear(hidden_dim,output_dim)

#     def forward(self,data):
#         input = self.input_layer(data)
#         relu_0 = self.relu_0(input)
#         hidden_0 = self.hidden_layer_0(relu_0)
#         relu_1 = self.relu_1(hidden_0)
#         hidden_1 = self.hidden_layer_1(relu_1)
#         relu_2 = self.relu_2(hidden_1)
#         output = F.sigmoid(self.output_layer(relu_0))

#         return output

In [18]:
if model_type == "regression":
    # LR model define
    model = QA_LR_model(input_dim, hidden_dim, output_dim).to('cuda')
else:
    # attention model define
    model = QA_model(input_dim, hidden_dim, output_dim).to('cuda')

In [19]:
# loss function
criterion = nn.CrossEntropyLoss()

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, amsgrad=True)

In [None]:
# train
max_acc = 0
loss_list = []
patience = 30
count=0
min_val_loss=100000
for epoch in tqdm(range(250)):
    model.train()
    batch_num = 0
    for question_vec, document_vec, label in train_loader:
        if model_type == "regression":
            data = torch.cat((question_vec, document_vec), 1)
            predict_label = model(data)
        else:
            predict_label = model(question_vec, document_vec)
        
        loss = criterion(predict_label, label)

        pred = predict_label.max(-1, keepdim=True)[1]
        acc = pred.eq(label.view_as(pred)).sum().item() / predict_label.shape[0]
        optimizer.zero_grad()


        if (acc > max_acc):
            max_acc = acc
            torch.save(model.state_dict(), 'model.pth')
        loss.backward()
        optimizer.step()
        batch_num += 1
        loss_list.append(round(loss.item(), 4))
    

    if epoch % 5 == 0:
        print("epoch:", epoch, "loss:", round(loss.item(), 4), "acc:", acc)
print("max acc:", max_acc)

plt.cla()
plt.plot(loss_list)

In [None]:
model.load_state_dict(torch.load("model_eng.pth"))

if model_type == "regression":
    val_input = torch.cat((question_validation_set, document_validation_set), 1)
    predict_label = model(val_input)
else:
    predict_label = model(question_validation_set, document_validation_set)
pred = predict_label.max(-1, keepdim=True)[1]
label = answer_validation_set
test_acc = pred.eq(label.view_as(pred)).sum().item() / predict_label.shape[0]


In [None]:
print("Languge:",language_train[0])
report = classification_report(label.cpu(), pred.cpu(), output_dict=True)
pd.DataFrame(report).transpose()