## Week 36. Exploring dataset

In [1]:
import os
from collections import Counter

import polars as pl
import tiktoken
from datasets import load_dataset
from tokenizers import Tokenizer
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)

In [2]:
# Fetch the TyDi XOR RC dataset from the Hugging Face hub and convert the
# train / validation splits to polars frames (via pandas)
dataset = load_dataset("coastalcph/tydi_xor_rc")
df_train, df_val = (
    pl.from_pandas(dataset[split].to_pandas()) for split in ("train", "validation")
)

In [3]:
# Peek at the first five training rows
df_train.head(5)

question,context,lang,answerable,answer_start,answer,answer_inlang
str,str,str,bool,i64,str,str
"""উইকিলিকস কত সালে সর্বপ্রথম ইন্…","""WikiLeaks () is an internation…","""bn""",True,182,"""2006""",
"""দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ প…","""The war in Europe concluded wi…","""bn""",True,48,"""Germany""",
"""মার্কিন যুক্তরাষ্ট্রের সংবিধান…","""Same-sex marriage in the Unite…","""bn""",False,-1,"""no""",
"""আরব-ইসরায়েলি যুদ্ধে আরবের মোট…","""The exact number of Arab casua…","""bn""",True,39,"""unknown""",
"""বিশ্বে প্রথম পুঁজিবাদী সমাজ কব…","""As Thomas Hall (2000) notes, ""…","""bn""",True,1219,"""17th century""",


In [4]:
# Training split: one frame per language of interest (Arabic, Korean, Telugu)
# plus the combined frame
df_ar, df_ko, df_te = (
    df_train.filter(pl.col("lang") == code) for code in ("ar", "ko", "te")
)
df_arkote = df_train.filter(pl.col("lang").is_in(["ar", "ko", "te"]))
# Sanity check: the per-language frames partition the combined frame
assert sum(part.height for part in (df_ar, df_ko, df_te)) == df_arkote.height

# Validation split: same partitioning
df_ar_val, df_ko_val, df_te_val = (
    df_val.filter(pl.col("lang") == code) for code in ("ar", "ko", "te")
)
df_arkote_val = df_val.filter(pl.col("lang").is_in(["ar", "ko", "te"]))
# Sanity check: the per-language frames partition the combined frame
assert sum(part.height for part in (df_ar_val, df_ko_val, df_te_val)) == df_arkote_val.height

In [5]:
# Summary statistics for the combined Arabic / Korean / Telugu training rows
df_arkote.describe()

statistic,question,context,lang,answerable,answer_start,answer,answer_inlang
str,str,str,str,f64,f64,str,str
"""count""","""6335""","""6335""","""6335""",6335.0,6335.0,"""6335""","""50"""
"""null_count""","""0""","""0""","""0""",0.0,0.0,"""0""","""6285"""
"""mean""",,,,0.942699,155.155012,,
"""std""",,,,,225.380919,,
"""min""","""'과학혁명의 구조'내용은 무엇인가?""",""" Gerschenkron did not define e…","""ar""",0.0,-1.0,"""""A Portuguesa""""","""(100 °సెం.)"""
"""25%""",,,,,14.0,,
"""50%""",,,,,78.0,,
"""75%""",,,,,208.0,,
"""max""","""힌두교의 정통 철학은 총 몇개인가?""","""Ṭāriq ibn Ziyād () was a Musli…","""te""",1.0,3964.0,"""“marry-your-rapist” laws""","""సి.ఎన్.అన్నాదురై"""


### Get tokenizers and look at sample (Telugu) sentence

In [6]:
# Load multilingual BERT tokenizer (uncased checkpoint) through the `tokenizers` library
mbert_tokenizer = Tokenizer.from_pretrained("bert-base-multilingual-uncased")
# Load GPT-4 tokenizer (tiktoken's "cl100k_base" encoding)
gpt4_tokenizer = tiktoken.get_encoding("cl100k_base")
# Load NLLB-200 tokenizer; it is reused further down for tokenizing questions
# and as the tokenizer of the translation pipelines
nllb_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

In [7]:
# BERT tokenization example: tokens of the first Telugu training question,
# joined with " | " so the token boundaries are visible
" | ".join(mbert_tokenizer.encode(df_te["question"][0]).tokens)

'[CLS] | పరపంచ | ##ంల | మ | ##టట | ##మ | ##ద | ##ట | ద | ##ూర | వదయ | వదయ | ##లయం | ఏ | ద | ##శం | ##ల | స | ##థ | ##ప | ##ంచ | ##బడంద | ? | [SEP]'

In [8]:
# GPT-4 tokenization example: encode the first Telugu training question, then
# decode every token id on its own so the individual pieces can be inspected
" | ".join(
    gpt4_tokenizer.decode([token_id])
    for token_id in gpt4_tokenizer.encode(df_te["question"][0])
)

'� | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � |   |  � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � |  � | � | � | � | � | � |  � | � | � | � | � | � | � | � | � | � |  � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � |  � | � |  � | � | � | � | � | � | � | � | � | � | � | � |  � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � | � |  ?'

In [9]:
# NLLB-200 tokenization example: tokens of the first Telugu training question,
# joined with " | " so the token boundaries are visible
" | ".join(nllb_tokenizer.tokenize(df_te["question"][0]))

'▁ప్రపంచంలో | ▁మొట్టమొదటి | ▁దూ | ర | ▁విద్య | ▁విద్య | ాల | యం | ▁ఏ | ▁దేశంలో | ▁స్థా | పించ | బడింది | ▁?'

## Get the top 10 most frequent tokens

In [10]:
# Maps the dataset's `lang` codes to the language codes passed to the
# NLLB-200 translation pipeline as `src_lang`
lang_dict = dict(ar="arb_Arab", ko="kor_Hang", te="tel_Telu")

In [11]:
# Load the NLLB-200 (distilled, 600M) seq2seq model; the translation pipelines
# in the following cells are built on top of this single model instance
translator_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [12]:
# Example translation: first Korean training question translated to English
random_txt = df_ko["question"][0]
translator = pipeline(
    "translation",
    model=translator_model,
    tokenizer=nllb_tokenizer,
    src_lang=lang_dict["ko"],  # "kor_Hang"
    tgt_lang="eng_Latn",
)
translator(random_txt)[0]["translation_text"]

Device set to use cuda:0


"Who is the winner of the Thirty Years' War?"

In [13]:
def _make_translator(src_lang:str):
    """Build a translation pipeline from ``src_lang`` to English (``eng_Latn``).

    Reuses the module-level ``translator_model`` and ``nllb_tokenizer`` so the
    model is not reloaded for every language.

    Args:
        src_lang: Source language code handed to the pipeline, e.g. one of the
            values of ``lang_dict``.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    return pipeline(
        "translation",
        model=translator_model,
        tokenizer=nllb_tokenizer,
        src_lang=src_lang,
        tgt_lang="eng_Latn",
    )

def tokenize_question(df: pl.DataFrame, use_cache: bool = True, translation: bool = True, tokenization: bool = True) -> pl.DataFrame:
    """Add NLLB-200 tokens and English translations for the questions in ``df``.

    The source language is taken from the ``lang`` value of the first row and
    mapped through ``lang_dict``, so ``df`` must be non-empty and is expected
    to hold a single language.

    Args:
        df: Frame with at least the ``question`` and ``lang`` columns.
        use_cache: When True, return ``data/tydi_xor_rc_<src_lang>.parquet`` if
            it exists and contains the requested columns; otherwise compute the
            result and write it to that parquet file plus an ``.xlsx`` copy.
        translation: Add a ``translation`` column holding the question
            translated to English.
        tokenization: Add a ``tokens`` column holding the NLLB-200 tokens of
            the question.

    Returns:
        The cached frame, or ``df`` with the requested columns appended.

    Note:
        The cache file name depends only on the language, not on the rows of
        ``df``. Different frames of the same language (e.g. train and
        validation) therefore share one cache entry; pass ``use_cache=False``
        for any frame other than the one the cache was built from.
    """
    src_lang = lang_dict[df["lang"][0]]
    cache_dir = "data"
    cache_path = os.path.join(cache_dir, f"tydi_xor_rc_{src_lang}")
    parquet_path = cache_path + ".parquet"
    print(f"Tokenizing and translating {src_lang}...")

    # Check for the file directly: listing the directory would raise when
    # "data" does not exist yet (e.g. first run on a fresh checkout).
    if use_cache and os.path.isfile(parquet_path):
        print(f"Loading from cached file tydi_xor_rc_{src_lang}.parquet")
        cached = pl.read_parquet(parquet_path)
        requested = [
            column
            for column, wanted in (("tokens", tokenization), ("translation", translation))
            if wanted
        ]
        # A cache written by an earlier call with translation/tokenization
        # switched off lacks columns; only reuse it when it is complete.
        if all(column in cached.columns for column in requested):
            return cached
        print("Cached file lacks the requested columns; recomputing")

    if tokenization:
        # Tokenize questions using the NLLB-200 tokenizer
        print("Tokenizing")
        df = df.with_columns(
            pl.col("question")
            .map_elements(nllb_tokenizer.tokenize, return_dtype=pl.List(pl.Utf8))
            .alias("tokens")
        )

    if translation:
        print("Translating")
        # Translate questions to English using NLLB-200, one question per call
        translator = _make_translator(src_lang)
        df = df.with_columns(
            pl.col("question")
            # Explicit dtype: avoids polars having to infer it from the output
            .map_elements(lambda x: translator(x)[0]["translation_text"], return_dtype=pl.Utf8)
            .alias("translation")
        )

    if use_cache:
        # Make sure the target directory exists before writing into it
        os.makedirs(cache_dir, exist_ok=True)
        print(f"Caching to {cache_path}.parquet and {cache_path}.xlsx")
        df.write_parquet(parquet_path)
        df.write_excel(cache_path + ".xlsx")

    return df

In [14]:
# Build (or load from the parquet cache) the tokenized + translated frame for
# each language; only the last call's result (Telugu) is shown as cell output
tokenize_question(df_ar)
tokenize_question(df_ko)
tokenize_question(df_te)

Tokenizing and translating arb_Arab...
Loading from cached file tydi_xor_rc_arb_Arab.parquet
Tokenizing and translating kor_Hang...
Loading from cached file tydi_xor_rc_kor_Hang.parquet
Tokenizing and translating tel_Telu...
Loading from cached file tydi_xor_rc_tel_Telu.parquet


question,context,lang,answerable,answer_start,answer,answer_inlang,tokens,translation
str,str,str,bool,i64,str,str,list[str],str
"""ప్రపంచంలో మొట్టమొదటి దూర విద్…","""Referred to as ""People's Unive…","""te""",true,236,"""London""",,"[""▁ప్రపంచంలో"", ""▁మొట్టమొదటి"", … ""▁?""]","""The world's first distance lea…"
"""1959వ సంవత్సరంలో భారతదేశ ప్రధా…","""Since 1947, there have been 14…","""te""",true,220,"""Jawaharlal Nehru""",,"[""▁1959"", ""వ"", … ""?""]","""Who was the Prime Minister of …"
"""ఏ కాకతీయ రాజు కర్నూలు జిల్లాను…","""Rani Rudrama Devi (died 1289 o…","""te""",true,194,"""Prataparudra""",,"[""▁ఏ"", ""▁కా"", … ""?""]","""Which Kakatiya king was the la…"
"""మానవ హక్కులు ఎన్ని?""","""The Declaration consists of 30…","""te""",true,28,"""30""",,"[""▁మానవ"", ""▁హక్కు"", … ""?""]","""How many human rights do you h…"
"""భారదేశంలో అత్యధిక జనాభా కలిగిన…","""Uttar Pradesh (; IAST: ""Uttar …","""te""",true,0,"""Uttar Pradesh""",,"[""▁భార"", ""దేశ"", … ""?""]","""Which is the most populous sta…"
…,…,…,…,…,…,…,…,…
"""కోళ్లు ఎక్కువగా ఏ దేశంలో కనిపి…","""Since time immemorial, man has…","""te""",false,-1,"""United States of America""","""అమెరికా సంయుక్త రాష్ట్రాలు""","[""▁కో"", ""ళ్లు"", … ""?""]","""In what country are chickens m…"
"""క్షయ వ్యాధికి విరుగుడు ఏ దేశంల…","""Vaccines against anthrax for u…","""te""",false,-1,"""France""","""ఫ్రాన్స్""","[""▁క్ష"", ""య"", … ""?""]","""In what country was the antido…"
"""ఖురాన్ ఏ అరబ్బీ భాషలో ఎవరు రాస…","""are broken Other Names of the …","""te""",false,-1,"""Prophet Muhammad""","""ముహమ్మద్ ప్రవక్త""","[""▁ఖ"", ""ు"", … ""?""]","""Who wrote the Qur'an in which …"
"""టెక్సస్ రాష్ట్రంలోని అతిపెద్ద …","""Austin is the capital of the U…","""te""",false,-1,"""JP Morgan Chase Tower""","""జేపీ మోర్గాన్ ఛేజ్ టవర్""","[""▁టెక్"", ""స"", … ""▁?""]","""What is the largest man-made s…"


In [15]:
import numpy as np
# Report the number of questions and the total number of question tokens in a frame
def df_metadata(df: pl.DataFrame) -> None:
    """Print how many questions ``df`` holds and how many tokens they contain.

    Tokens are computed fresh with ``tokenize_question`` (cache bypassed,
    translation skipped), so the numbers always describe the frame passed in.
    """
    tokenized = tokenize_question(df, use_cache = False, translation=False)
    n_tokens = sum(len(tokens) for tokens in tokenized["tokens"].to_list())
    print(f"Number of questions: {df.height} Number of tokens: {n_tokens}")

# Same report for every language in the training and validation splits
for label, frame in [
    ("Arabic dataset", df_ar),
    ("Korean dataset", df_ko),
    ("Telugu dataset", df_te),
    ("Arabic validation dataset", df_ar_val),
    ("Korean validation dataset", df_ko_val),
    ("Telugu validation dataset", df_te_val),
]:
    print(label)
    df_metadata(frame)

Arabic dataset
Tokenizing and translating arb_Arab...
Tokenizing
Number of questions: 2558 Number of tokens: 33733
Korean dataset
Tokenizing and translating kor_Hang...
Tokenizing
Number of questions: 2422 Number of tokens: 25829
Telugu dataset
Tokenizing and translating tel_Telu...
Tokenizing
Number of questions: 1355 Number of tokens: 18365
Arabic validation dataset
Tokenizing and translating arb_Arab...
Tokenizing
Number of questions: 415 Number of tokens: 5604
Korean validation dataset
Tokenizing and translating kor_Hang...
Tokenizing
Number of questions: 356 Number of tokens: 3775
Telugu validation dataset
Tokenizing and translating tel_Telu...
Tokenizing
Number of questions: 384 Number of tokens: 5020


In [16]:
# Most frequent NLLB-200 tokens in the original-language questions, per language.
# tokenize_question() returns the cached frame when one exists, otherwise it
# tokenizes and translates first.
for df_lang in [df_ar, df_ko, df_te]:
    df = tokenize_question(df_lang)
    # Counter replaces the manual "if token in dict" bookkeeping
    count_dict = Counter(token for tokens in df["tokens"] for token in tokens)
    # most_common() sorts by count, descending; ties keep first-seen order
    sorted_frequency_list = count_dict.most_common()
    print(sorted_frequency_list[:10])


Tokenizing and translating arb_Arab...
Loading from cached file tydi_xor_rc_arb_Arab.parquet
[('؟', 1483), ('▁؟', 1057), ('ية', 656), ('▁في', 609), ('▁من', 593), ('▁متى', 535), ('ة', 477), ('▁ما', 450), ('▁هو', 355), ('▁ال', 334)]
Tokenizing and translating kor_Hang...
Loading from cached file tydi_xor_rc_kor_Hang.parquet
[('?', 2420), ('인가', 610), ('▁무엇인가', 592), ('은', 586), ('▁가장', 529), ('▁언제', 432), ('의', 388), ('는가', 354), ('는', 323), ('▁몇', 320)]
Tokenizing and translating tel_Telu...
Loading from cached file tydi_xor_rc_tel_Telu.parquet
[('?', 1093), ('▁ఎవరు', 274), ('▁?', 260), ('▁ఏ', 223), ('▁ఏది', 192), ('ంలో', 169), ('▁ఎన్ని', 165), ('▁జి', 163), ('మ', 157), ('▁ఎప్పుడు', 154)]


In [17]:
# Most frequent NLLB-200 tokens in the English translations, per language.
# tokenize_question() returns the cached frame when one exists, otherwise it
# tokenizes and translates first.
for df_lang in [df_ar, df_ko, df_te]:
    df = tokenize_question(df_lang)
    # Counter replaces the manual "if token in dict" bookkeeping
    count_dict = Counter(
        token
        for translation in df["translation"]
        for token in nllb_tokenizer.tokenize(translation)
    )
    # most_common() sorts by count, descending; ties keep first-seen order
    sorted_frequency_list = count_dict.most_common()
    print(sorted_frequency_list[:10])

Tokenizing and translating arb_Arab...
Loading from cached file tydi_xor_rc_arb_Arab.parquet
[('▁the', 2252), ('?', 1600), ('▁?', 984), ('▁of', 867), ('▁What', 687), ('s', 576), ('▁When', 567), ('▁in', 545), ('▁is', 539), ('▁was', 538)]
Tokenizing and translating kor_Hang...
Loading from cached file tydi_xor_rc_kor_Hang.parquet
[('?', 2716), ('▁the', 2350), ('▁What', 929), ('▁is', 857), ('▁in', 844), ('▁of', 759), ("'", 646), ('s', 639), ('▁was', 446), ('▁When', 431)]
Tokenizing and translating tel_Telu...
Loading from cached file tydi_xor_rc_tel_Telu.parquet
[('?', 1392), ('▁the', 1254), ('▁is', 682), ('▁of', 681), ('▁What', 459), ('▁in', 439), ('▁Who', 324), ('▁was', 283), ('▁How', 206), ('▁many', 181)]


In [18]:
# Top tokens of the English translations, split by answerable = True / False
for df_lang in [df_ar, df_ko, df_te]:
    df = tokenize_question(df_lang)
    for answerable in [True, False]:
        subset = df.filter(pl.col("answerable") == answerable)
        # Counter replaces the manual "if token in dict" bookkeeping
        count_dict = Counter(
            token
            for translation in subset["translation"]
            for token in nllb_tokenizer.tokenize(translation)
        )
        # most_common() sorts by count, descending; ties keep first-seen order
        sorted_frequency_list = count_dict.most_common()
        print(f"Top tokens in {lang_dict[df['lang'][0]]} answerable={answerable}: {sorted_frequency_list[:10]}")

Tokenizing and translating arb_Arab...
Loading from cached file tydi_xor_rc_arb_Arab.parquet
Top tokens in arb_Arab answerable=True: [('▁the', 2090), ('?', 1446), ('▁?', 884), ('▁of', 806), ('▁What', 683), ('▁When', 567), ('s', 555), ('▁was', 537), ('▁is', 533), ('▁in', 477)]
Top tokens in arb_Arab answerable=False: [('▁the', 162), ('?', 154), ('▁Is', 122), ('▁?', 100), ('▁a', 73), ('▁in', 68), ('▁of', 61), ('▁to', 44), ('▁Does', 29), ('▁Can', 27)]
Tokenizing and translating kor_Hang...
Loading from cached file tydi_xor_rc_kor_Hang.parquet
Top tokens in kor_Hang answerable=True: [('?', 2654), ('▁the', 2299), ('▁What', 925), ('▁is', 849), ('▁in', 825), ('▁of', 733), ("'", 643), ('s', 635), ('▁was', 446), ('▁When', 430)]
Top tokens in kor_Hang answerable=False: [('?', 62), ('▁the', 51), ('▁of', 26), ('▁in', 19), ('▁a', 18), ('▁Is', 16), ('▁Can', 9), ('▁Does', 8), ('▁is', 8), ('▁have', 8)]
Tokenizing and translating tel_Telu...
Loading from cached file tydi_xor_rc_tel_Telu.parquet
Top tok