In [1]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import tokenize
from nltk.corpus import stopwords
import re
from sudachipy import tokenizer
from sudachipy import dictionary
import string

## Load data set into pandas DataFrame

In [2]:
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

df_val = pd.DataFrame(validation_set)
df_val = df_val[df_val.language.isin(['finnish', 'english', 'japanese'])]

df_train = pd.DataFrame(train_set)
df_train = df_train[df_train.language.isin(['finnish', 'english', 'japanese'])]
len(df_train)

Using custom data configuration copenlu--nlp_course_tydiqa-cceecfb5416d988a
Found cached dataset parquet (/Users/dpr577/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

29868

## 1.1 Preprocessing and Dataset Analysis

In [3]:
def get_lang_df(df, language):
    return df[df['language'] == language]

df_train_FI = get_lang_df(df_train, 'finnish').copy()
df_train_JAP = get_lang_df(df_train, 'japanese').copy()
df_train_EN = get_lang_df(df_train, 'english').copy()

In [4]:
def clean_text(text):
    return "".join([char.lower() for char in text if char not in string.punctuation]) 

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [w for w in tokens if not w in stop_words]

def tokenize_EN(df, col: str):
    df[col+'tokens'] = df[col].apply(word_tokenize)
    df[col+'tokens_cleaned'] = df[col].apply(clean_text)
    df[col+'tokens_cleaned'] = df[col+'tokens_cleaned'].apply(word_tokenize)
    df[col+'tokens_cleaned'] = df[col+'tokens_cleaned'].apply(remove_stopwords)

def tokenize_FI(df):
    df['tokens'] = df['question_text'].apply(word_tokenize, language='finnish')

def helper_func_JAP(question):
    tokenizer_obj = dictionary.Dictionary().create()
    res_list = tokenizer_obj.tokenize(question)
    return [x.surface() for x in res_list]

def tokenize_JAP(df):
    df['tokens'] = df['question_text'].apply(helper_func_JAP)


tokenize_EN(df_train_EN, 'question_text')
#tokenize_FI(df_train_FI)
#tokenize_JAP(df_train_JAP)


In [18]:
test = "“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ"
helper_func_JAP(test)

['“', 'ダン', '”', ' ', 'ダニエル', '・', 'ジャドソン', '・', 'キャラハン', 'の', '出身', 'は', 'どこ']

In [5]:
def get_most_common_first_n_tokens(df, n):
    df['first_token'] = df['tokens'].apply(lambda x: x[0])
    return df.first_token.value_counts()[:n]

def get_most_common_last_n_tokens(df, n):
    df['last_token'] = df['tokens'].apply(lambda x: x[-1] if x[-1].isalpha() else x[-2])
    return df.last_token.value_counts()[:n]

#For English
df_res = get_most_common_first_n_tokens(df_train_EN, 10)
get_most_common_last_n_tokens(df_train_EN, 10)

KeyError: 'tokens'

In [35]:
#Finnish
get_most_common_first_n_tokens(df_train_FI, 10)
get_most_common_last_n_tokens(df_train_FI, 10)

syntyi          1072
on               723
kuoli            720
tarkoittaa       488
perustettu       476
syntynyt         398
oli              382
perustettiin     351
sijaitsee        258
pinta-ala        214
Name: last_token, dtype: int64

In [36]:
#Japanese
res_JAP_first = get_most_common_first_n_tokens(df_train_JAP, 10).reset_index()
res_JAP_last = get_most_common_last_n_tokens(df_train_JAP, 10).reset_index()
res_JAP = pd.concat([res_JAP_first, res_JAP_last], ignore_index=True, axis=1)
res_JAP.to_latex()



  res_JAP.to_latex()


'\\begin{tabular}{llrlr}\n\\toprule\n{} &        0 &    1 &   2 &     3 \\\\\n\\midrule\n0 &       日本 &  354 &   た &  2115 \\\\\n1 &        『 &  306 &   か &  1305 \\\\\n2 &       世界 &   94 &   何 &  1192 \\\\\n3 &      ジョン &   58 &  いつ &   996 \\\\\n4 &        第 &   56 &   は &   932 \\\\\n5 &  アメリカ合衆国 &   54 &  どこ &   884 \\\\\n6 &        「 &   50 &   誰 &   746 \\\\\n7 &     アメリカ &   50 &  ある &   174 \\\\\n8 &    ウィリアム &   44 &  だれ &    64 \\\\\n9 &     ジョージ &   44 &  いる &    42 \\\\\n\\bottomrule\n\\end{tabular}\n'

## 1.2 Binary Question Classification

In [117]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import json
from sklearn.feature_extraction.text import CountVectorizer

### Data Cleaning

In [10]:
def make_col_answer(text):
    return text['answer_text'][0]

def make_col_answer_start(text):
    return text['answer_start'][0]

df_train_EN['answer_text'] = df_train_EN['annotations'].apply(make_col_answer)
df_train_EN['answer_start'] = df_train_EN['annotations'].apply(make_col_answer_start)
df_train_EN['answerable'] = df_train_EN['answer_start'].apply(lambda x : 0 if x == -1 else 1)

df_train_EN.head()

tokenize_EN(df_train_EN, 'document_plaintext')

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url,question_texttokens,question_texttokens_cleaned,doc_tokens,word_count_doc,answer_text,answer_start,answerable
26,When was quantum field theory developed?,Quantum field theory,english,"{'answer_start': [159], 'answer_text': ['1920s']}",Quantum field theory naturally began with the ...,https://en.wikipedia.org/wiki/Quantum%20field%...,"[When, was, quantum, field, theory, developed, ?]","[quantum, field, theory, developed]","[Quantum, field, theory, naturally, began, wit...",31,1920s,159,1
43,Who was the first Nobel prize winner for Liter...,List of Nobel laureates in Literature,english,"{'answer_start': [610], 'answer_text': ['Sully...",The Nobel Prize in Literature (Swedish: Nobelp...,https://en.wikipedia.org/wiki/List%20of%20Nobe...,"[Who, was, the, first, Nobel, prize, winner, f...","[first, nobel, prize, winner, literature]","[The, Nobel, Prize, in, Literature, (, Swedish...",188,Sully Prudhomme,610,1
112,When is the dialectical method used?,Dialectic,english,"{'answer_start': [129], 'answer_text': ['disco...","Dialectic or dialectics (Greek: διαλεκτική, di...",https://en.wikipedia.org/wiki/Dialectic,"[When, is, the, dialectical, method, used, ?]","[dialectical, method, used]","[Dialectic, or, dialectics, (, Greek, :, διαλε...",113,discourse between two or more people holding d...,129,1
123,Who invented Hangul?,Origin of Hangul,english,"{'answer_start': [88], 'answer_text': ['Sejong...",Hangul was personally created and promulgated ...,https://en.wikipedia.org/wiki/Origin%20of%20Ha...,"[Who, invented, Hangul, ?]","[invented, hangul]","[Hangul, was, personally, created, and, promul...",69,Sejong the Great,88,1
125,What do Grasshoppers eat?,Grasshopper,english,"{'answer_start': [0], 'answer_text': ['Grassho...","Grasshoppers are plant-eaters, with a few spec...",https://en.wikipedia.org/wiki/Grasshopper,"[What, do, Grasshoppers, eat, ?]","[grasshoppers, eat]","[Grasshoppers, are, plant-eaters, ,, with, a, ...",125,"Grasshoppers are plant-eaters, with a few spec...",0,1


In [64]:
type(df_train_EN[df_train_EN['answerable'] == 0].iloc[82]['document_plaintexttokens_cleaned'][0])

str

### Feature Engineering

In [147]:
def count_words_in_doc(df):
    df['doc_tokens'] = df['document_plaintext'].apply(word_tokenize)
    df["word_count_doc"] = df['doc_tokens'].str.len()

def get_bow(df):
    def get_question_vocab():
        token_list_temp = df_train_EN.question_texttokens_cleaned.to_list()
        return  [item for sublist in token_list_temp for item in sublist]

    vocab = get_question_vocab()
    vectorizer = CountVectorizer()
    vectorizer.fit(vocab)

    def transform_bow(cell):
        text = [" ".join(cell)]
        res = vectorizer.transform(text)
        return res.toarray()
    
    df['bow_question'] = df['question_texttokens_cleaned'].apply(transform_bow)

def get_overlap(df):
    def calculate_overlap(row):
        return len(list(set(row['question_texttokens_cleaned']) & set(row['document_plaintexttokens_cleaned'])))
    #df['overlap_doc_question'] = df['question_texttokens_cleaned'].apply(set) & df['document_plaintexttokens_cleaned'].apply(set)
    df['overlap_doc_question'] = df.apply(calculate_overlap, axis=1)



#count_words_in_doc(df_train_EN)
get_bow(df_train_EN)
#get_overlap(df_train_EN)
df_train_EN.iloc[0]['bow_question']

array([[0, 0, 0, ..., 0, 0, 0]])

In [142]:
doc_1 = ["this", "is", "test"]
doc_2 = ["this", "test"]
doc_3 = ["this", "diff"]

vocab = doc_1 + doc_2 + doc_3

vectorizer = CountVectorizer()
vectorizer.fit(vocab)

vectorizer.transform(doc_1).toarray()
vectorizer.get_feature_names_out()

array(['diff', 'is', 'test', 'this'], dtype=object)

In [143]:
vectorizer.transform(doc_1).toarray()

array([[0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 1, 0]])

### Building the Binary Classifier

In [118]:
class BoWClassifier(nn.Module):

    def __init__(self, num_labels, vocab_size, num_hidden=2):
        super(BoWClassifier, self).__init__()

        self.linear = nn.Linear(vocab_size, num_hidden)
        self.nonlinear = nn.ReLU()
        self.final = nn.Linear(num_hidden, num_labels)
    
    def forward(self, bow_vector):
        return self.final(self.nonlinear(self.linear(bow_vector)))
        

In [112]:
len(df_train_EN)

7389

In [127]:

batch_size = 64
train_data = df_train_EN[['bow_question', 'answerable']]
train_dataloader = DataLoader(train_data, batch_size=batch_size)

temp_list = list(df_train_EN['bow_question'].values)

X = torch.tensor(temp_list)
y = torch.tensor(df_train_EN['answerable'].values)

len(train_dataloader.dataset)

  X = torch.tensor(temp_list)


ValueError: expected sequence of length 4 at dim 1 (got 5)

In [134]:
temp_list[9]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [120]:
model = BoWClassifier(num_labels=2, vocab_size=len(vocab))
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[-0.3393, -0.3436, -0.2345,  0.2599, -0.1353,  0.2573, -0.2947, -0.0647],
        [ 0.1685, -0.2212, -0.2073,  0.1736, -0.1699,  0.0307,  0.0142,  0.2243]],
       requires_grad=True)
Parameter containing:
tensor([-0.0426, -0.1082], requires_grad=True)
Parameter containing:
tensor([[-0.2744,  0.5941],
        [ 0.3179,  0.4292]], requires_grad=True)
Parameter containing:
tensor([ 0.3868, -0.1938], requires_grad=True)


In [122]:

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

def train_loop(dataloader, model, loss_func, optimizer):
    for batch, (X,y) in enumerate(dataloader):
        #compute prediction and loss
        pred = model(X)
        loss = loss_func(pred,y)
        #backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        loss, current = loss.item(), batch * len(X)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


for epoch in range(10):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_function, optimizer)


Epoch 1
-------------------------------


KeyError: 0