In [1]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import tokenize
from nltk.corpus import stopwords
import re
from sudachipy import tokenizer
from sudachipy import dictionary
import string

## Load data set into pandas DataFrame

In [2]:
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

df_val = pd.DataFrame(validation_set)
df_val = df_val[df_val.language.isin(['finnish', 'english', 'japanese'])]

df_train = pd.DataFrame(train_set)
df_train = df_train[df_train.language.isin(['finnish', 'english', 'japanese'])]
len(df_train)

Using custom data configuration copenlu--nlp_course_tydiqa-cceecfb5416d988a
Found cached dataset parquet (/Users/dpr577/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

29868

## 1.1 Preprocessing and Dataset Analysis

In [3]:
def get_lang_df(df, language):
    return df[df['language'] == language]

df_train_FI = get_lang_df(df_train, 'finnish').copy()
df_train_JAP = get_lang_df(df_train, 'japanese').copy()
df_train_EN = get_lang_df(df_train, 'english').copy()

In [4]:
def clean_text(text):
    return "".join([char.lower() for char in text if char not in string.punctuation]) 

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [w for w in tokens if not w in stop_words]

def tokenize_EN(df, col: str):
    df[col+'tokens'] = df[col].apply(word_tokenize)
    df[col+'tokens_cleaned'] = df[col].apply(clean_text)
    df[col+'tokens_cleaned'] = df[col+'tokens_cleaned'].apply(word_tokenize)
    df[col+'tokens_cleaned'] = df[col+'tokens_cleaned'].apply(remove_stopwords)

def tokenize_FI(df):
    df['tokens'] = df['question_text'].apply(word_tokenize, language='finnish')

def helper_func_JAP(question):
    tokenizer_obj = dictionary.Dictionary().create()
    res_list = tokenizer_obj.tokenize(question)
    return [x.surface() for x in res_list]

def tokenize_JAP(df):
    df['tokens'] = df['question_text'].apply(helper_func_JAP)


tokenize_EN(df_train_EN, 'question_text')
#tokenize_FI(df_train_FI)
#tokenize_JAP(df_train_JAP)


In [18]:
test = "“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ"
helper_func_JAP(test)

['“', 'ダン', '”', ' ', 'ダニエル', '・', 'ジャドソン', '・', 'キャラハン', 'の', '出身', 'は', 'どこ']

In [5]:
def get_most_common_first_n_tokens(df, n):
    df['first_token'] = df['tokens'].apply(lambda x: x[0])
    return df.first_token.value_counts()[:n]

def get_most_common_last_n_tokens(df, n):
    df['last_token'] = df['tokens'].apply(lambda x: x[-1] if x[-1].isalpha() else x[-2])
    return df.last_token.value_counts()[:n]

#For English
df_res = get_most_common_first_n_tokens(df_train_EN, 10)
get_most_common_last_n_tokens(df_train_EN, 10)

KeyError: 'tokens'

In [35]:
#Finnish
get_most_common_first_n_tokens(df_train_FI, 10)
get_most_common_last_n_tokens(df_train_FI, 10)

syntyi          1072
on               723
kuoli            720
tarkoittaa       488
perustettu       476
syntynyt         398
oli              382
perustettiin     351
sijaitsee        258
pinta-ala        214
Name: last_token, dtype: int64

In [36]:
#Japanese
res_JAP_first = get_most_common_first_n_tokens(df_train_JAP, 10).reset_index()
res_JAP_last = get_most_common_last_n_tokens(df_train_JAP, 10).reset_index()
res_JAP = pd.concat([res_JAP_first, res_JAP_last], ignore_index=True, axis=1)
res_JAP.to_latex()



  res_JAP.to_latex()


'\\begin{tabular}{llrlr}\n\\toprule\n{} &        0 &    1 &   2 &     3 \\\\\n\\midrule\n0 &       日本 &  354 &   た &  2115 \\\\\n1 &        『 &  306 &   か &  1305 \\\\\n2 &       世界 &   94 &   何 &  1192 \\\\\n3 &      ジョン &   58 &  いつ &   996 \\\\\n4 &        第 &   56 &   は &   932 \\\\\n5 &  アメリカ合衆国 &   54 &  どこ &   884 \\\\\n6 &        「 &   50 &   誰 &   746 \\\\\n7 &     アメリカ &   50 &  ある &   174 \\\\\n8 &    ウィリアム &   44 &  だれ &    64 \\\\\n9 &     ジョージ &   44 &  いる &    42 \\\\\n\\bottomrule\n\\end{tabular}\n'

## 1.2 Binary Question Classification

In [117]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import json
from sklearn.feature_extraction.text import CountVectorizer

### Data Cleaning

In [10]:
def make_col_answer(text):
    return text['answer_text'][0]

def make_col_answer_start(text):
    return text['answer_start'][0]

df_train_EN['answer_text'] = df_train_EN['annotations'].apply(make_col_answer)
df_train_EN['answer_start'] = df_train_EN['annotations'].apply(make_col_answer_start)
df_train_EN['answerable'] = df_train_EN['answer_start'].apply(lambda x : 0 if x == -1 else 1)

df_train_EN.head()

tokenize_EN(df_train_EN, 'document_plaintext')

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url,question_texttokens,question_texttokens_cleaned,doc_tokens,word_count_doc,answer_text,answer_start,answerable
26,When was quantum field theory developed?,Quantum field theory,english,"{'answer_start': [159], 'answer_text': ['1920s']}",Quantum field theory naturally began with the ...,https://en.wikipedia.org/wiki/Quantum%20field%...,"[When, was, quantum, field, theory, developed, ?]","[quantum, field, theory, developed]","[Quantum, field, theory, naturally, began, wit...",31,1920s,159,1
43,Who was the first Nobel prize winner for Liter...,List of Nobel laureates in Literature,english,"{'answer_start': [610], 'answer_text': ['Sully...",The Nobel Prize in Literature (Swedish: Nobelp...,https://en.wikipedia.org/wiki/List%20of%20Nobe...,"[Who, was, the, first, Nobel, prize, winner, f...","[first, nobel, prize, winner, literature]","[The, Nobel, Prize, in, Literature, (, Swedish...",188,Sully Prudhomme,610,1
112,When is the dialectical method used?,Dialectic,english,"{'answer_start': [129], 'answer_text': ['disco...","Dialectic or dialectics (Greek: διαλεκτική, di...",https://en.wikipedia.org/wiki/Dialectic,"[When, is, the, dialectical, method, used, ?]","[dialectical, method, used]","[Dialectic, or, dialectics, (, Greek, :, διαλε...",113,discourse between two or more people holding d...,129,1
123,Who invented Hangul?,Origin of Hangul,english,"{'answer_start': [88], 'answer_text': ['Sejong...",Hangul was personally created and promulgated ...,https://en.wikipedia.org/wiki/Origin%20of%20Ha...,"[Who, invented, Hangul, ?]","[invented, hangul]","[Hangul, was, personally, created, and, promul...",69,Sejong the Great,88,1
125,What do Grasshoppers eat?,Grasshopper,english,"{'answer_start': [0], 'answer_text': ['Grassho...","Grasshoppers are plant-eaters, with a few spec...",https://en.wikipedia.org/wiki/Grasshopper,"[What, do, Grasshoppers, eat, ?]","[grasshoppers, eat]","[Grasshoppers, are, plant-eaters, ,, with, a, ...",125,"Grasshoppers are plant-eaters, with a few spec...",0,1


In [64]:
type(df_train_EN[df_train_EN['answerable'] == 0].iloc[82]['document_plaintexttokens_cleaned'][0])

str

### Feature Engineering

In [162]:
def count_words_in_doc(df):
    df['doc_tokens'] = df['document_plaintext'].apply(word_tokenize)
    df["word_count_doc"] = df['doc_tokens'].str.len()

def get_question_vocab():
        token_list_temp = df_train_EN.question_texttokens_cleaned.to_list()
        return  [item for sublist in token_list_temp for item in sublist]


def get_bow(df, vocab):
    
    vectorizer = CountVectorizer()
    vectorizer.fit(vocab)

    def transform_bow(cell):
        text = [" ".join(cell)]
        res = vectorizer.transform(text)
        return res.toarray()
    
    df['bow_question'] = df['question_texttokens_cleaned'].apply(transform_bow)

def get_overlap(df):
    def calculate_overlap(row):
        return len(list(set(row['question_texttokens_cleaned']) & set(row['document_plaintexttokens_cleaned'])))
    #df['overlap_doc_question'] = df['question_texttokens_cleaned'].apply(set) & df['document_plaintexttokens_cleaned'].apply(set)
    df['overlap_doc_question'] = df.apply(calculate_overlap, axis=1)



#count_words_in_doc(df_train_EN)
vocab = get_question_vocab()
get_bow(df_train_EN, vocab)
#get_overlap(df_train_EN)
len(df_train_EN.iloc[0]['bow_question'][0])

4575

### Building the Binary Classifier

In [247]:
class BoWClassifier(nn.Module):

    def __init__(self, num_labels, vocab_size, num_hidden=8):
        super(BoWClassifier, self).__init__()

        self.linear = nn.Linear(vocab_size, num_hidden)
        self.nonlinear = nn.ReLU()
        self.final = nn.Linear(num_hidden, num_labels)
    
    def forward(self, bow_vector):
        return self.final(self.nonlinear(self.linear(bow_vector)))


In [248]:
#Test data
x_test = torch.randn(10, 7)
y_test = torch.tensor([1, 0, 0, 1, 1, 1, 0, 0, 1, 1])
print("Sixe X test: ", x_test.size())

test_model = BoWClassifier(num_labels=2, vocab_size=7, num_hidden=8)
train_loop(test_model, loss_function, optimizer, x_test, y_test)

Sixe X test:  torch.Size([10, 7])
epoch:  0  loss:  0.6307103633880615


In [254]:

temp_list = list(df_train_EN['bow_question'].values)
temp_list_ii = [x[0] for x in temp_list]

X = torch.FloatTensor(temp_list_ii)
y_list = list(df_train_EN['answerable'].values)
#y = torch.tensor([[x] for x in y_list])
y = torch.tensor(y_list)

X

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [250]:
x_test

tensor([[-0.5056,  0.9153,  0.4793, -0.6444,  0.3628, -0.8074,  2.7315],
        [-1.8338,  0.7448,  0.3746,  0.3131, -0.0850,  0.7846, -0.8020],
        [ 0.1749, -0.0328, -1.6972, -0.6483,  0.6256, -0.7786,  0.1086],
        [ 0.7864,  0.4207,  0.6837, -0.1010, -1.1849, -0.6070,  0.1704],
        [-0.7880,  0.2376, -0.9414, -1.3934, -2.3270, -0.7605,  0.4261],
        [-0.0236, -1.6823, -0.7846, -1.0659,  0.6222,  0.6719, -1.3679],
        [ 1.9006, -1.6853,  0.8929, -0.5236,  0.7401, -0.8913,  0.0544],
        [-0.7086,  0.7980,  2.2888, -0.6826,  0.1287, -1.5862, -0.4234],
        [ 0.8111,  0.5778, -0.0111, -0.4818, -0.7811, -1.4629,  0.3349],
        [ 0.7053, -0.8898,  1.1208, -0.4188, -1.1646,  0.0765, -0.1233]])

In [257]:

model = BoWClassifier(num_labels=2, vocab_size=4575, num_hidden=7)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

def train_loop(model, loss_func, optimizer, x, y):   
    #compute prediction and loss
    y_pred = model(x)
    loss = loss_func(y_pred,y)
    #backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
        
    print('epoch: ', epoch,' loss: ', loss.item())


for epoch in range(100):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(model, loss_function, optimizer, X, y)


Epoch 1
-------------------------------
epoch:  0  loss:  0.7183316349983215
Epoch 2
-------------------------------
epoch:  1  loss:  0.7158913016319275
Epoch 3
-------------------------------
epoch:  2  loss:  0.7136825919151306
Epoch 4
-------------------------------
epoch:  3  loss:  0.7116838693618774
Epoch 5
-------------------------------
epoch:  4  loss:  0.7098755836486816
Epoch 6
-------------------------------
epoch:  5  loss:  0.7082398533821106
Epoch 7
-------------------------------
epoch:  6  loss:  0.706760585308075
Epoch 8
-------------------------------
epoch:  7  loss:  0.7054229974746704
Epoch 9
-------------------------------
epoch:  8  loss:  0.7042140364646912
Epoch 10
-------------------------------
epoch:  9  loss:  0.7031218409538269
Epoch 11
-------------------------------
epoch:  10  loss:  0.7021350860595703
Epoch 12
-------------------------------
epoch:  11  loss:  0.7012439966201782
Epoch 13
-------------------------------
epoch:  12  loss:  0.7004395723