In [1]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import tokenize
from nltk.corpus import stopwords
import re
from sudachipy import tokenizer
from sudachipy import dictionary
import string

## Load data set into pandas DataFrame

In [2]:
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

df_val = pd.DataFrame(validation_set)
df_val = df_val[df_val.language.isin(['finnish', 'english', 'japanese'])]

df_train = pd.DataFrame(train_set)
df_train = df_train[df_train.language.isin(['finnish', 'english', 'japanese'])]

Using custom data configuration copenlu--nlp_course_tydiqa-cceecfb5416d988a
Found cached dataset parquet (/Users/dpr577/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

## 1.1 Preprocessing and Dataset Analysis

In [3]:
def split_language_data(df):
    def get_lang_df(df, language):
        return df[df['language'] == language]
    return get_lang_df(df_train, 'english').copy(), get_lang_df(df_train, 'finnish').copy(), get_lang_df(df_train, 'japanese').copy()

df_train_EN, df_train_FI, df_train_JAP = split_language_data(df_train)
df_val_EN, df_val_FI, df_val_JAP = split_language_data(df_val)

###  (a) Tokenize question and document text

In [4]:
def prepare_df(df):
    def make_col_answer(text):
        return text['answer_text'][0]

    def make_col_answer_start(text):
        return text['answer_start'][0]

    df['answer_text'] = df['annotations'].apply(make_col_answer)
    df['answer_start'] = df['annotations'].apply(make_col_answer_start)
    df['answerable'] = df['answer_start'].apply(lambda x : 0 if x == -1 else 1)


def clean_text(text):
    return "".join([char.lower() for char in text if char not in string.punctuation]) 

def remove_stopwords(tokens, language):
    stop_words = set(stopwords.words(language))
    return [w for w in tokens if not w in stop_words]

def tokenize(df, col: str, language):
    df[col+'_tokens'] = df[col].apply(word_tokenize, language=language)
    df[col+'_tokens_cleaned'] = df[col].apply(clean_text)
    df[col+'_tokens_cleaned'] = df[col+'_tokens_cleaned'].apply(word_tokenize, language=language)
    df[col+'_tokens_cleaned'] = df[col+'_tokens_cleaned'].apply(remove_stopwords, language=language)

def helper_func_JAP(text):
    tokenizer_obj = dictionary.Dictionary().create()
    res_list = tokenizer_obj.tokenize(text)
    return [x.surface() for x in res_list]

def tokenize_JAP(df, col):
    df[col+'_tokens'] = df[col].apply(helper_func_JAP)
    df[col+'_tokens_cleaned'] = df[col].apply(clean_text)
    df[col+'_tokens_cleaned'] = df[col+'_tokens_cleaned'].apply(helper_func_JAP)


In [5]:
#Tokenize EN
prepare_df(df_train_EN)
tokenize(df_train_EN, 'question_text', 'english')
tokenize(df_train_EN, 'document_plaintext', 'english')

In [6]:
#Tokenize FI
prepare_df(df_train_FI)
tokenize(df_train_FI, 'question_text', 'finnish')
tokenize(df_train_FI, 'document_plaintext', 'finnish')

In [7]:
#Tokenize JAP
prepare_df(df_train_JAP)
tokenize_JAP(df_train_JAP, 'question_text')
tokenize_JAP(df_train_JAP, 'document_plaintext')

In [18]:
test = "“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ"
helper_func_JAP(test)

['“', 'ダン', '”', ' ', 'ダニエル', '・', 'ジャドソン', '・', 'キャラハン', 'の', '出身', 'は', 'どこ']

### (b) Compute most common first and last tokens in question

In [281]:
def get_most_common_first_n_tokens(df, n):
    df['first_token'] = df['question_text_tokens'].apply(lambda x: x[0])
    return df.first_token.value_counts()[:n]

def get_most_common_last_n_tokens(df, n):
    df['last_token'] = df['question_text_tokens'].apply(lambda x: x[-1] if x[-1].isalpha() else x[-2])
    return df.last_token.value_counts()[:n]

In [282]:
#English
df_res = get_most_common_first_n_tokens(df_train_EN, 10)
get_most_common_last_n_tokens(df_train_EN, 10)

born           342
founded        204
die            122
have           104
formed         100
established     96
air             82
released        80
live            76
introduced      72
Name: last_token, dtype: int64

In [283]:
#Finnish
get_most_common_first_n_tokens(df_train_FI, 10)
get_most_common_last_n_tokens(df_train_FI, 10)

syntyi          1072
on               723
kuoli            720
tarkoittaa       488
perustettu       476
syntynyt         398
oli              382
perustettiin     351
sijaitsee        258
pinta-ala        214
Name: last_token, dtype: int64

In [284]:
#Japanese
res_JAP_first = get_most_common_first_n_tokens(df_train_JAP, 10).reset_index()
res_JAP_last = get_most_common_last_n_tokens(df_train_JAP, 10).reset_index()
res_JAP = pd.concat([res_JAP_first, res_JAP_last], ignore_index=True, axis=1)

## 1.2 Binary Question Classification

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### Feature Engineering

In [142]:
df_train_EN.columns

Index(['question_text', 'document_title', 'language', 'annotations',
       'document_plaintext', 'document_url', 'answer_text', 'answer_start',
       'answerable', 'question_text_tokens', 'question_text_tokens_cleaned',
       'document_plaintext_tokens', 'document_plaintext_tokens_cleaned',
       'bow_question'],
      dtype='object')

In [9]:
def count_words_in_doc(df):
    df['doc_tokens'] = df['document_plaintext'].apply(word_tokenize)
    df["word_count_doc"] = df['doc_tokens'].str.len()


def make_bow_get_vocab_size(df):  

    def get_question_vocab(df):
        token_list_temp = df.joined_tokens_cleaned.to_list()
        return  [item for sublist in token_list_temp for item in sublist]

    df['joined_tokens_cleaned'] = df['question_text_tokens_cleaned'] + df['document_plaintext_tokens_cleaned']

    vectorizer = CountVectorizer()
    vocab = get_question_vocab(df)
    vectorizer.fit(vocab)

    def transform_bow(cell):
        text = [" ".join(cell)]
        res = vectorizer.transform(text)
        return res.toarray()
    
    df['bow_joined'] = df['joined_tokens_cleaned'].apply(transform_bow)
    return len(df.iloc[0]['bow_joined'][0])

def get_overlap(df):
    def calculate_overlap(row):
        return len(list(set(row['question_text_tokens_cleaned']) & set(row['document_plaintext_tokens_cleaned'])))
    df['overlap_doc_question'] = df.apply(calculate_overlap, axis=1)

### Building the Binary Classifier

In [10]:
class BoWClassifier(nn.Module):

    def __init__(self, num_labels, vocab_size, num_hidden):
        super(BoWClassifier, self).__init__()

        self.linear = nn.Linear(vocab_size, vocab_size)
        self.nonlinear = nn.ReLU()
        self.final = nn.Linear(vocab_size, num_labels)
        #self.output = nn.LogSoftmax(dim=1)
    
    def forward(self, bow_vector):
        return self.final(self.nonlinear(self.linear(bow_vector)))


### Testing the model with test data

In [248]:
#Test data
x_test = torch.randn(10, 7)
y_test = torch.tensor([1, 0, 0, 1, 1, 1, 0, 0, 1, 1])
print("Sixe X test: ", x_test.size())

test_model = BoWClassifier(num_labels=2, vocab_size=7, num_hidden=8)
train_loop(test_model, loss_function, optimizer, x_test, y_test)

Sixe X test:  torch.Size([10, 7])
epoch:  0  loss:  0.6307103633880615


### Preparing the data

In [16]:
def prep_data(df):
    temp_list = list(df['bow_joined'].values)
    temp_list_ii = [x[0] for x in temp_list]
    y_list = list(df['answerable'].values)

    X = torch.FloatTensor(temp_list_ii)
    y = torch.tensor(y_list)

    return train_test_split(X,y, test_size=0.2)

### Training the model

In [12]:
loss_function = nn.CrossEntropyLoss()

def train_model(model, X, y):

    def train_loop(model, X, y):   
        optimizer = optim.SGD(model.parameters(), lr=0.1)
        #compute prediction and loss
        y_pred = model(X)
        loss = loss_function(y_pred, y)
        #backpropagation
        optimizer.zero_grad() #cleans the gradients
        loss.backward() #computes the gradients
        optimizer.step() #update the parameters
            
        print('epoch: ', epoch,' loss: ', loss.item())

    for epoch in range(100):
        train_loop(model, X, y)


### Testing the trained model

In [13]:
def test_model(model, X, y):
    with torch.no_grad():
        model.eval()
        total_loss = 0.0
        y_pred_test = model(X)
        y_pred_test = y_pred_test.detach().cpu().numpy()
        y_pred = np.argmax(y_pred_test, axis=1)
        y_pred_tensor = torch.FloatTensor(y_pred)

        y_true = y.type(torch.FloatTensor)
        
        loss_test = loss_function(y_pred_tensor, y_true)
        total_loss += float(loss_test)
    print("Total loss on test data: ", total_loss)
    print(classification_report(y_true=y_true, y_pred=y_pred_tensor))

### Calling the methods for each language

### EN

In [17]:
vocab_size_EN = make_bow_get_vocab_size(df_train_EN)
X_train_EN, X_test_EN, y_train_EN, y_test_EN = prep_data(df_train_EN)
model_EN = BoWClassifier(num_labels=2, vocab_size=vocab_size_EN, num_hidden=vocab_size_EN)
train_model(model_EN, X_train_EN, y_train_EN)
test_model(model_EN, X_test_EN, y_test_EN)


  X = torch.FloatTensor(temp_list_ii)


epoch:  0  loss:  0.6932293772697449
epoch:  1  loss:  0.6921654343605042
epoch:  2  loss:  0.6913775205612183
epoch:  3  loss:  0.6907072067260742


### FI

In [131]:
vocab_size_FI = make_bow_get_vocab_size(df_train_FI)
X_train_FI, X_test_FI, y_train_FI, y_test_FI = prep_data(df_train_FI)
model_FI = BoWClassifier(num_labels=2, vocab_size=vocab_size_FI, num_hidden=vocab_size_FI)
train_model(model_FI, X_train_FI, y_train_FI)
test_model(model_FI, X_test_FI, y_test_FI)

epoch:  0  loss:  0.6931490302085876
epoch:  1  loss:  0.6931475400924683
epoch:  2  loss:  0.6931461691856384
epoch:  3  loss:  0.6931449174880981
epoch:  4  loss:  0.6931435465812683
epoch:  5  loss:  0.693142294883728
epoch:  6  loss:  0.6931410431861877
epoch:  7  loss:  0.6931398510932922
epoch:  8  loss:  0.6931387186050415
epoch:  9  loss:  0.693137526512146
epoch:  10  loss:  0.6931365132331848
epoch:  11  loss:  0.6931353211402893
epoch:  12  loss:  0.6931342482566833
epoch:  13  loss:  0.6931332349777222
epoch:  14  loss:  0.6931321024894714
epoch:  15  loss:  0.693131148815155
epoch:  16  loss:  0.6931300759315491
epoch:  17  loss:  0.6931290626525879
epoch:  18  loss:  0.6931279897689819
epoch:  19  loss:  0.6931269764900208
epoch:  20  loss:  0.6931260824203491
epoch:  21  loss:  0.6931250095367432
epoch:  22  loss:  0.693123996257782
epoch:  23  loss:  0.6931230425834656
epoch:  24  loss:  0.6931220293045044
epoch:  25  loss:  0.6931210160255432
epoch:  26  loss:  0.69312

### JAP

In [141]:
vocab_size_JAP = make_bow_get_vocab_size(df_train_JAP)
X_train_JAP, X_test_JAP, y_train_JAP, y_test_JAP = prep_data(df_train_JAP)
model_JAP = BoWClassifier(num_labels=2, vocab_size=vocab_size_JAP, num_hidden=vocab_size_JAP)
train_model(model_JAP, X_train_JAP, y_train_JAP)
test_model(model_JAP, X_test_JAP, y_test_JAP)

epoch:  0  loss:  0.6932111382484436
epoch:  1  loss:  0.6932008266448975
epoch:  2  loss:  0.6931919455528259
epoch:  3  loss:  0.693183958530426
epoch:  4  loss:  0.6931769251823425
epoch:  5  loss:  0.6931706666946411
epoch:  6  loss:  0.6931650638580322
epoch:  7  loss:  0.6931599974632263
epoch:  8  loss:  0.6931554675102234
epoch:  9  loss:  0.6931514143943787
epoch:  10  loss:  0.6931475400924683
epoch:  11  loss:  0.6931442022323608
epoch:  12  loss:  0.6931410431861877
epoch:  13  loss:  0.693138062953949
epoch:  14  loss:  0.6931353211402893
epoch:  15  loss:  0.6931326985359192
epoch:  16  loss:  0.693130373954773
epoch:  17  loss:  0.6931279301643372
epoch:  18  loss:  0.6931257247924805
epoch:  19  loss:  0.6931235790252686
epoch:  20  loss:  0.6931215524673462
epoch:  21  loss:  0.6931195855140686
epoch:  22  loss:  0.6931174993515015
epoch:  23  loss:  0.6931157112121582
epoch:  24  loss:  0.6931138634681702
epoch:  25  loss:  0.6931120157241821
epoch:  26  loss:  0.6931