In [85]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import tokenize
from nltk.corpus import stopwords
import re
from sudachipy import tokenizer
from sudachipy import dictionary
import string

## Load the dataset into pandas DataFrames

In [31]:
# Fetch the answerable TyDiQA dataset and keep its two splits.
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set, validation_set = dataset["train"], dataset["validation"]

# Validation split, restricted to the three languages studied here.
df_val = pd.DataFrame(validation_set)
df_val = df_val.loc[df_val["language"].isin(['finnish', 'english', 'japanese'])]

# Training split, restricted the same way.
df_train = pd.DataFrame(train_set)
df_train = df_train.loc[df_train["language"].isin(['finnish', 'english', 'japanese'])]
len(df_train)

Using custom data configuration copenlu--nlp_course_tydiqa-cceecfb5416d988a
Found cached dataset parquet (/Users/dpr577/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

29868

## 1.1 Preprocessing and Dataset Analysis

In [32]:
def get_lang_df(df, language):
    """Return the rows of ``df`` whose ``language`` column equals ``language``.

    The result is a row selection of ``df`` with the original index labels;
    no columns are added or removed.
    """
    is_requested_language = df['language'].eq(language)
    return df.loc[is_requested_language]

# One independent copy of the training data per language, so that columns
# added later to one frame never touch df_train or the other frames.
df_train_FI, df_train_JAP, df_train_EN = (
    get_lang_df(df_train, lang).copy()
    for lang in ('finnish', 'japanese', 'english')
)

In [89]:
def clean_text(text):
    """Lower-case ``text`` and drop every character found in ``string.punctuation``.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    all other characters (including whitespace) are kept, lower-cased one
    character at a time.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch.lower())
    return "".join(kept)

# Stop-word sets already loaded from NLTK, keyed by language name, so the
# corpus is read once instead of once per row when used with Series.apply.
_STOPWORDS_BY_LANGUAGE = {}


def remove_stopwords(tokens, stop_words=None, language='english'):
    """Return ``tokens`` without stop words, preserving order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter. Matching is exact, so tokens should already be
        lower-cased to match NLTK's lower-case stop-word lists.
    stop_words : collection of str, optional
        Explicit stop-word collection. When omitted, the NLTK list for
        ``language`` is used (loaded once and cached).
    language : str, default 'english'
        NLTK stop-word corpus to use when ``stop_words`` is not given.

    Returns
    -------
    list of str
    """
    if stop_words is None:
        if language not in _STOPWORDS_BY_LANGUAGE:
            _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
        stop_words = _STOPWORDS_BY_LANGUAGE[language]
    return [w for w in tokens if w not in stop_words]

def tokenize_EN(df, col: str):
    """Add English token columns for ``df[col]``, modifying ``df`` in place.

    Two columns are written (note there is no separator between ``col`` and
    the suffix):

    * ``col + 'tokens'``: ``word_tokenize`` applied to the raw text.
    * ``col + 'tokens_cleaned'``: the text passed through ``clean_text``,
      then ``word_tokenize``, then ``remove_stopwords``.

    Returns None.
    """
    raw_key = col + 'tokens'
    cleaned_key = col + 'tokens_cleaned'

    df[raw_key] = df[col].apply(word_tokenize)
    df[cleaned_key] = (
        df[col]
        .apply(clean_text)
        .apply(word_tokenize)
        .apply(remove_stopwords)
    )

def tokenize_FI(df):
    """Tokenize ``df['question_text']`` with NLTK's Finnish setting.

    Writes the token lists to ``df['tokens']`` in place and returns None.
    """
    questions = df['question_text']
    df['tokens'] = questions.apply(lambda text: word_tokenize(text, language='finnish'))

# Shared Sudachi tokenizer, created on first use. Building the dictionary is
# expensive, so it must not be repeated for every question.
_SUDACHI_TOKENIZER = None


def helper_func_JAP(question):
    """Split a Japanese string into surface-form tokens with SudachiPy.

    Parameters
    ----------
    question : str
        Text to tokenize.

    Returns
    -------
    list of str
        The ``surface()`` of every morpheme returned by the tokenizer, in
        order. Punctuation and whitespace morphemes are included.
    """
    global _SUDACHI_TOKENIZER
    if _SUDACHI_TOKENIZER is None:
        _SUDACHI_TOKENIZER = dictionary.Dictionary().create()
    return [morpheme.surface() for morpheme in _SUDACHI_TOKENIZER.tokenize(question)]

def tokenize_JAP(df):
    """Tokenize ``df['question_text']`` with ``helper_func_JAP``.

    Writes the token lists to ``df['tokens']`` in place and returns None.
    """
    questions = df['question_text']
    token_lists = questions.apply(helper_func_JAP)
    df['tokens'] = token_lists


# Tokenize the questions of every language subset. The first/last-token
# analysis below reads a `tokens` column from all three frames, so all three
# must be tokenized for the notebook to run top to bottom on a fresh kernel.
tokenize_EN(df_train_EN, 'question_text')
# tokenize_EN writes to 'question_texttokens'; expose the raw tokens under
# the column name the analysis helpers expect.
df_train_EN['tokens'] = df_train_EN['question_texttokens']
tokenize_FI(df_train_FI)
tokenize_JAP(df_train_JAP)


In [18]:
# Sanity check: run the Japanese tokenizer on a single sample question and
# inspect the resulting surface forms.
test = "“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ"
helper_func_JAP(test)

['“', 'ダン', '”', ' ', 'ダニエル', '・', 'ジャドソン', '・', 'キャラハン', 'の', '出身', 'は', 'どこ']

In [34]:
def get_most_common_first_n_tokens(df, n):
    """Return the ``n`` most frequent first tokens in ``df['tokens']``.

    Side effect: writes each row's first token to ``df['first_token']``.
    Rows whose token list is empty get ``None`` there and are therefore
    left out of the counts instead of raising ``IndexError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tokens`` column holding one token list per row.
    n : int
        Number of entries to return.

    Returns
    -------
    pandas.Series
        Counts indexed by token, most frequent first.
    """
    df['first_token'] = df['tokens'].apply(
        lambda tokens: tokens[0] if len(tokens) > 0 else None
    )
    return df['first_token'].value_counts().head(n)

def get_most_common_last_n_tokens(df, n):
    """Return the ``n`` most frequent final words in ``df['tokens']``.

    The final word of a row is the last token that contains at least one
    alphanumeric character, so any amount of trailing punctuation is skipped
    while a final number or hyphenated word is kept. Rows with no such token
    (including empty token lists) get ``None`` and are left out of the counts.

    Side effect: writes each row's final word to ``df['last_token']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tokens`` column holding one token list per row.
    n : int
        Number of entries to return.

    Returns
    -------
    pandas.Series
        Counts indexed by token, most frequent first.
    """
    def _last_word(tokens):
        # Walk backwards past pure-punctuation tokens such as '?' or quotes.
        for token in reversed(tokens):
            if any(ch.isalnum() for ch in token):
                return token
        return None

    df['last_token'] = df['tokens'].apply(_last_word)
    return df['last_token'].value_counts().head(n)

# English: the top-10 first tokens are stored in df_res (not displayed);
# only the last expression, the top-10 last tokens, is rendered as output.
df_res = get_most_common_first_n_tokens(df_train_EN, 10)
get_most_common_last_n_tokens(df_train_EN, 10)

born           342
founded        204
die            122
have           104
formed         100
established     96
air             82
released        80
live            76
introduced      72
Name: last_token, dtype: int64

In [35]:
# Finnish: both calls add their helper column to df_train_FI, but the result
# of the first call is not assigned, so only the last expression (top-10
# last tokens) is rendered as the cell output.
get_most_common_first_n_tokens(df_train_FI, 10)
get_most_common_last_n_tokens(df_train_FI, 10)

syntyi          1072
on               723
kuoli            720
tarkoittaa       488
perustettu       476
syntynyt         398
oli              382
perustettiin     351
sijaitsee        258
pinta-ala        214
Name: last_token, dtype: int64

In [36]:
# Japanese: build one side-by-side table of the top-10 first and last tokens.
# reset_index() turns each value_counts Series into a two-column frame
# (token, count).
res_JAP_first = get_most_common_first_n_tokens(df_train_JAP, 10).reset_index()
res_JAP_last = get_most_common_last_n_tokens(df_train_JAP, 10).reset_index()
# axis=1 places the two frames next to each other; ignore_index=True relabels
# the resulting columns 0..3.
res_JAP = pd.concat([res_JAP_first, res_JAP_last], ignore_index=True, axis=1)
# The LaTeX source string returned here is the cell output.
res_JAP.to_latex()



  res_JAP.to_latex()


'\\begin{tabular}{llrlr}\n\\toprule\n{} &        0 &    1 &   2 &     3 \\\\\n\\midrule\n0 &       日本 &  354 &   た &  2115 \\\\\n1 &        『 &  306 &   か &  1305 \\\\\n2 &       世界 &   94 &   何 &  1192 \\\\\n3 &      ジョン &   58 &  いつ &   996 \\\\\n4 &        第 &   56 &   は &   932 \\\\\n5 &  アメリカ合衆国 &   54 &  どこ &   884 \\\\\n6 &        「 &   50 &   誰 &   746 \\\\\n7 &     アメリカ &   50 &  ある &   174 \\\\\n8 &    ウィリアム &   44 &  だれ &    64 \\\\\n9 &     ジョージ &   44 &  いる &    42 \\\\\n\\bottomrule\n\\end{tabular}\n'

## 1.2 Binary Question Classification

In [77]:
import torch
import json
from sklearn.feature_extraction.text import CountVectorizer

In [84]:
def make_col_answer(text):
    """Return the first entry of ``text['answer_text']``.

    ``text`` is one value of the ``annotations`` column: a mapping whose
    ``'answer_text'`` entry is a sequence of answer strings.
    """
    answers = text['answer_text']
    return answers[0]

def make_col_answer_start(text):
    """Return the first entry of ``text['answer_start']``.

    ``text`` is one value of the ``annotations`` column: a mapping whose
    ``'answer_start'`` entry is a sequence of answer offsets.
    """
    starts = text['answer_start']
    return starts[0]

# Flatten the annotation dicts into plain columns.
df_train_EN['answer_text'] = df_train_EN['annotations'].map(make_col_answer)
df_train_EN['answer_start'] = df_train_EN['annotations'].map(make_col_answer_start)
# Binary label: an answer_start of -1 means "not answerable" (0), anything else 1.
df_train_EN['answerable'] = df_train_EN['answer_start'].ne(-1).astype('int64')

df_train_EN.head()

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url,tokens,first_token,last_token,answer_text,answer start,answer_start,answerable,doc_tokens,word_count,word_count_doc,tokens_cleaned
26,When was quantum field theory developed?,Quantum field theory,english,"{'answer_start': [159], 'answer_text': ['1920s']}",Quantum field theory naturally began with the ...,https://en.wikipedia.org/wiki/Quantum%20field%...,"[When, was, quantum, field, theory, developed, ?]",When,developed,1920s,159,159,1,"[Quantum, field, theory, naturally, began, wit...",31,31,"[when, was, quantum, field, theory, developed]"
43,Who was the first Nobel prize winner for Liter...,List of Nobel laureates in Literature,english,"{'answer_start': [610], 'answer_text': ['Sully...",The Nobel Prize in Literature (Swedish: Nobelp...,https://en.wikipedia.org/wiki/List%20of%20Nobe...,"[Who, was, the, first, Nobel, prize, winner, f...",Who,Literature,Sully Prudhomme,610,610,1,"[The, Nobel, Prize, in, Literature, (, Swedish...",188,188,"[who, was, the, first, nobel, prize, winner, f..."
112,When is the dialectical method used?,Dialectic,english,"{'answer_start': [129], 'answer_text': ['disco...","Dialectic or dialectics (Greek: διαλεκτική, di...",https://en.wikipedia.org/wiki/Dialectic,"[When, is, the, dialectical, method, used, ?]",When,used,discourse between two or more people holding d...,129,129,1,"[Dialectic, or, dialectics, (, Greek, :, διαλε...",113,113,"[when, is, the, dialectical, method, used]"
123,Who invented Hangul?,Origin of Hangul,english,"{'answer_start': [88], 'answer_text': ['Sejong...",Hangul was personally created and promulgated ...,https://en.wikipedia.org/wiki/Origin%20of%20Ha...,"[Who, invented, Hangul, ?]",Who,Hangul,Sejong the Great,88,88,1,"[Hangul, was, personally, created, and, promul...",69,69,"[who, invented, hangul]"
125,What do Grasshoppers eat?,Grasshopper,english,"{'answer_start': [0], 'answer_text': ['Grassho...","Grasshoppers are plant-eaters, with a few spec...",https://en.wikipedia.org/wiki/Grasshopper,"[What, do, Grasshoppers, eat, ?]",What,eat,"Grasshoppers are plant-eaters, with a few spec...",0,0,1,"[Grasshoppers, are, plant-eaters, ,, with, a, ...",125,125,"[what, do, grasshoppers, eat]"


In [76]:
# Inspect the document text of one unanswerable example (position 82 among them).
df_train_EN.loc[df_train_EN['answerable'] == 0, 'document_plaintext'].iloc[82]

'Georges Touchard-Lafosse at the age of 17, paid a visit to Madame Dupin in 1797. He later evokes it:[40]'

### Data Cleaning

In [90]:
# Tokenize the document text as well; this adds the columns
# 'document_plaintexttokens' and 'document_plaintexttokens_cleaned'.
tokenize_EN(df_train_EN, 'document_plaintext')

### Feature Engineering

In [70]:
def count_words_in_doc(df):
    """Add document token lists and their lengths to ``df`` in place.

    Writes ``df['doc_tokens']`` (``word_tokenize`` of ``document_plaintext``)
    and ``df['word_count_doc']`` (number of tokens per document, punctuation
    tokens included). Returns None.
    """
    tokenized_docs = df['document_plaintext'].apply(word_tokenize)
    df['doc_tokens'] = tokenized_docs
    df["word_count_doc"] = df['doc_tokens'].str.len()

def get_bow(df):
    """Add a bag-of-words representation of each question to ``df``.

    The vocabulary is fitted on ``df['question_texttokens_cleaned']`` (the
    frame passed in, not a global), treating every row's token list as one
    document. Each row's count vector is stored in ``df['bow_question']`` as
    a 1 x vocabulary-size sparse row to keep memory use low.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``question_texttokens_cleaned`` column produced by
        ``tokenize_EN(df, 'question_text')``.

    Returns
    -------
    CountVectorizer
        The fitted vectorizer; ``get_feature_names_out()`` on it gives the
        token that corresponds to each column of the count vectors.
    """
    token_lists = df['question_texttokens_cleaned'].to_list()
    # The questions are already tokenized, so the analyzer just passes the
    # token list through instead of re-tokenizing a string.
    vectorizer = CountVectorizer(analyzer=lambda tokens: tokens)
    counts = vectorizer.fit_transform(token_lists)
    df['bow_question'] = [counts[i] for i in range(counts.shape[0])]
    return vectorizer

def get_overlap(df):
    """Placeholder, not implemented yet: does nothing and returns None."""
    pass

# Add the document-length feature and preview the result.
count_words_in_doc(df_train_EN)
df_train_EN.head()

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url,tokens,first_token,last_token,answer_text,answer start,answer_start,answerable,doc_tokens,word_count,word_count_doc
26,When was quantum field theory developed?,Quantum field theory,english,"{'answer_start': [159], 'answer_text': ['1920s']}",Quantum field theory naturally began with the ...,https://en.wikipedia.org/wiki/Quantum%20field%...,"[When, was, quantum, field, theory, developed, ?]",When,developed,1920s,159,159,1,"[Quantum, field, theory, naturally, began, wit...",31,31
43,Who was the first Nobel prize winner for Liter...,List of Nobel laureates in Literature,english,"{'answer_start': [610], 'answer_text': ['Sully...",The Nobel Prize in Literature (Swedish: Nobelp...,https://en.wikipedia.org/wiki/List%20of%20Nobe...,"[Who, was, the, first, Nobel, prize, winner, f...",Who,Literature,Sully Prudhomme,610,610,1,"[The, Nobel, Prize, in, Literature, (, Swedish...",188,188
112,When is the dialectical method used?,Dialectic,english,"{'answer_start': [129], 'answer_text': ['disco...","Dialectic or dialectics (Greek: διαλεκτική, di...",https://en.wikipedia.org/wiki/Dialectic,"[When, is, the, dialectical, method, used, ?]",When,used,discourse between two or more people holding d...,129,129,1,"[Dialectic, or, dialectics, (, Greek, :, διαλε...",113,113
123,Who invented Hangul?,Origin of Hangul,english,"{'answer_start': [88], 'answer_text': ['Sejong...",Hangul was personally created and promulgated ...,https://en.wikipedia.org/wiki/Origin%20of%20Ha...,"[Who, invented, Hangul, ?]",Who,Hangul,Sejong the Great,88,88,1,"[Hangul, was, personally, created, and, promul...",69,69
125,What do Grasshoppers eat?,Grasshopper,english,"{'answer_start': [0], 'answer_text': ['Grassho...","Grasshoppers are plant-eaters, with a few spec...",https://en.wikipedia.org/wiki/Grasshopper,"[What, do, Grasshoppers, eat, ?]",What,eat,"Grasshoppers are plant-eaters, with a few spec...",0,0,1,"[Grasshoppers, are, plant-eaters, ,, with, a, ...",125,125
