In [3]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import tokenize
from nltk.corpus import stopwords
import re
from sudachipy import tokenizer
from sudachipy import dictionary
import string

## Load data set into pandas DataFrame

In [4]:
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

df_val = pd.DataFrame(validation_set)
df_val = df_val[df_val.language.isin(['finnish', 'english', 'japanese'])]

df_train = pd.DataFrame(train_set)
df_train = df_train[df_train.language.isin(['finnish', 'english', 'japanese'])]

Using custom data configuration copenlu--nlp_course_tydiqa-cceecfb5416d988a
Found cached dataset parquet (/Users/dpr577/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

## 1.1 Preprocessing and Dataset Analysis

In [5]:
def split_language_data(df):
    def get_lang_df(df, language):
        return df[df['language'] == language]
    return get_lang_df(df_train, 'english').copy(), get_lang_df(df_train, 'finnish').copy(), get_lang_df(df_train, 'japanese').copy()

df_train_EN, df_train_FI, df_train_JAP = split_language_data(df_train)
df_val_EN, df_val_FI, df_val_JAP = split_language_data(df_val)

###  (a) Tokenize question and document text

In [6]:
def prepare_df(df):
    def make_col_answer(text):
        return text['answer_text'][0]

    def make_col_answer_start(text):
        return text['answer_start'][0]

    df['answer_text'] = df['annotations'].apply(make_col_answer)
    df['answer_start'] = df['annotations'].apply(make_col_answer_start)
    df['answerable'] = df['answer_start'].apply(lambda x : 0 if x == -1 else 1)


def clean_text(text):
    return "".join([char.lower() for char in text if char not in string.punctuation]) 

def remove_stopwords(tokens, language):
    stop_words = set(stopwords.words(language))
    return [w for w in tokens if not w in stop_words]

def tokenize(df, col: str, language):
    df[col+'_tokens'] = df[col].apply(word_tokenize, language=language)
    df[col+'_tokens_cleaned'] = df[col].apply(clean_text)
    df[col+'_tokens_cleaned'] = df[col+'_tokens_cleaned'].apply(word_tokenize, language=language)
    df[col+'_tokens_cleaned'] = df[col+'_tokens_cleaned'].apply(remove_stopwords, language=language)

def helper_func_JAP(text):
    tokenizer_obj = dictionary.Dictionary().create()
    res_list = tokenizer_obj.tokenize(text)
    return [x.surface() for x in res_list]

def tokenize_JAP(df, col):
    df[col+'_tokens'] = df[col].apply(helper_func_JAP)


In [7]:
#Tokenize EN
prepare_df(df_train_EN)
tokenize(df_train_EN, 'question_text', 'english')
tokenize(df_train_EN, 'document_plaintext', 'english')

In [8]:
#Tokenize FI
prepare_df(df_train_FI)
tokenize(df_train_FI, 'question_text', 'finnish')
tokenize(df_train_FI, 'document_plaintext', 'finnish')

In [None]:
#Tokenize JAP
prepare_df(df_train_JAP)
tokenize_JAP(df_train_JAP, 'question_text')
tokenize_JAP(df_train_JAP, 'document_plaintext')

In [18]:
test = "“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ"
helper_func_JAP(test)

['“', 'ダン', '”', ' ', 'ダニエル', '・', 'ジャドソン', '・', 'キャラハン', 'の', '出身', 'は', 'どこ']

### (b) Compute most common first and last tokens in question

In [281]:
def get_most_common_first_n_tokens(df, n):
    df['first_token'] = df['question_text_tokens'].apply(lambda x: x[0])
    return df.first_token.value_counts()[:n]

def get_most_common_last_n_tokens(df, n):
    df['last_token'] = df['question_text_tokens'].apply(lambda x: x[-1] if x[-1].isalpha() else x[-2])
    return df.last_token.value_counts()[:n]

In [282]:
#English
df_res = get_most_common_first_n_tokens(df_train_EN, 10)
get_most_common_last_n_tokens(df_train_EN, 10)

born           342
founded        204
die            122
have           104
formed         100
established     96
air             82
released        80
live            76
introduced      72
Name: last_token, dtype: int64

In [283]:
#Finnish
get_most_common_first_n_tokens(df_train_FI, 10)
get_most_common_last_n_tokens(df_train_FI, 10)

syntyi          1072
on               723
kuoli            720
tarkoittaa       488
perustettu       476
syntynyt         398
oli              382
perustettiin     351
sijaitsee        258
pinta-ala        214
Name: last_token, dtype: int64

In [284]:
#Japanese
res_JAP_first = get_most_common_first_n_tokens(df_train_JAP, 10).reset_index()
res_JAP_last = get_most_common_last_n_tokens(df_train_JAP, 10).reset_index()
res_JAP = pd.concat([res_JAP_first, res_JAP_last], ignore_index=True, axis=1)

## 1.2 Binary Question Classification

In [44]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### Feature Engineering

In [36]:
def count_words_in_doc(df):
    df['doc_tokens'] = df['document_plaintext'].apply(word_tokenize)
    df["word_count_doc"] = df['doc_tokens'].str.len()


def make_bow_get_vocab_size(df):  

    def get_question_vocab(df):
        token_list_temp = df.question_text_tokens_cleaned.to_list()
        return  [item for sublist in token_list_temp for item in sublist]

    vectorizer = CountVectorizer()
    vocab = get_question_vocab(df)
    vectorizer.fit(vocab)

    def transform_bow(cell):
        text = [" ".join(cell)]
        res = vectorizer.transform(text)
        return res.toarray()
    
    df['bow_question'] = df['question_text_tokens_cleaned'].apply(transform_bow)
    return len(df.iloc[0]['bow_question'][0])

def get_overlap(df):
    def calculate_overlap(row):
        return len(list(set(row['question_text_tokens_cleaned']) & set(row['document_plaintext_tokens_cleaned'])))
    df['overlap_doc_question'] = df.apply(calculate_overlap, axis=1)

### Building the Binary Classifier

In [68]:
class BoWClassifier(nn.Module):

    def __init__(self, num_labels, vocab_size, num_hidden):
        super(BoWClassifier, self).__init__()

        self.linear = nn.Linear(vocab_size, vocab_size)
        self.nonlinear = nn.ReLU()
        self.final = nn.Linear(vocab_size, num_labels)
        #self.output = nn.LogSoftmax(dim=1)
    
    def forward(self, bow_vector):
        return self.final(self.nonlinear(self.linear(bow_vector)))


### Testing the model with test data

In [248]:
#Test data
x_test = torch.randn(10, 7)
y_test = torch.tensor([1, 0, 0, 1, 1, 1, 0, 0, 1, 1])
print("Sixe X test: ", x_test.size())

test_model = BoWClassifier(num_labels=2, vocab_size=7, num_hidden=8)
train_loop(test_model, loss_function, optimizer, x_test, y_test)

Sixe X test:  torch.Size([10, 7])
epoch:  0  loss:  0.6307103633880615


### Preparing the data

In [12]:
def prep_data(df):
    temp_list = list(df['bow_question'].values)
    temp_list_ii = [x[0] for x in temp_list]
    y_list = list(df['answerable'].values)

    X = torch.FloatTensor(temp_list_ii)
    y = torch.tensor(y_list)

    return train_test_split(X,y, test_size=0.2)

### Training the model

In [59]:
loss_function = nn.CrossEntropyLoss()

def train_model(model, X, y):

    def train_loop(model, X, y):   
        optimizer = optim.SGD(model.parameters(), lr=0.1)
        #compute prediction and loss
        y_pred = model(X)
        loss = loss_function(y_pred, y)
        #backpropagation
        optimizer.zero_grad() #cleans the gradients
        loss.backward() #computes the gradients
        optimizer.step() #update the parameters
            
        print('epoch: ', epoch,' loss: ', loss.item())

    for epoch in range(10):
        train_loop(model, X, y)


### Testing the trained model

In [83]:
def test_model(model, X, y):
    with torch.no_grad():
        model.eval()
        total_loss = 0.0
        y_pred_test = model(X)
        y_pred = np.argmax(y_pred_test, axis=1)
        y_pred_tensor = y_pred.type(torch.FloatTensor)
        print(y_pred_tensor)
        print(type(y_pred_tensor))
        loss_test = loss_function(y_pred_tensor, y)
        total_loss += float(loss_test)
    print("Total loss on test data: ", total_loss)
    print(classification_report(y_true=y, y_pred=y_pred_test))

### Calling the methods for each language

### EN

In [47]:
y_test_EN.size()

torch.Size([1478])

In [84]:
#vocab_size_EN = make_bow_get_vocab_size(df_train_EN)
#X_train_EN, X_test_EN, y_train_EN, y_test_EN = prep_data(df_train_EN)
#model_EN = BoWClassifier(num_labels=2, vocab_size=vocab_size_EN, num_hidden=vocab_size_EN)
#train_model(model_EN, X_train_EN, y_train_EN)
test_model(model_EN, X_test_EN, y_test_EN)


tensor([0., 0., 1.,  ..., 0., 0., 1.])
<class 'torch.Tensor'>


RuntimeError: Expected floating point type for target with class probabilities, got Long

### FI

In [64]:
vocab_size = make_bow_get_vocab_size(df_train_FI)
X_train_FI, X_test_FI, y_train_FI, y_test_FI = prep_data(df_train_FI)
model_FI = BoWClassifier(num_labels=2, vocab_size=vocab_size, num_hidden=7)
train_model(model_FI, X_train_FI, y_train_FI)
test_model(model_FI, X_test_FI, y_test_FI)

TypeError: __init__() got an unexpected keyword argument 'dim'