## Weak 36

In [1]:
import tiktoken
from tokenizers import Tokenizer
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline,
)
from datasets import load_dataset
import polars as pl
import os

In [2]:
# Get huggingface dataset
dataset = load_dataset("coastalcph/tydi_xor_rc")
df_train = pl.from_pandas(dataset["train"].to_pandas())
df_test = pl.from_pandas(dataset["test"].to_pandas())

In [3]:
df_train.columns

['question',
 'context',
 'lang',
 'answerable',
 'answer_start',
 'answer',
 'answer_inlang']

In [4]:
df_train.head()

question,context,lang,answerable,answer_start,answer,answer_inlang
str,str,str,bool,i64,str,str
"""উইকিলিকস কত সালে সর্বপ্রথম ইন্…","""WikiLeaks () is an internation…","""bn""",True,182,"""2006""",
"""দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ প…","""The war in Europe concluded wi…","""bn""",True,48,"""Germany""",
"""মার্কিন যুক্তরাষ্ট্রের সংবিধান…","""Same-sex marriage in the Unite…","""bn""",False,-1,"""no""",
"""আরব-ইসরায়েলি যুদ্ধে আরবের মোট…","""The exact number of Arab casua…","""bn""",True,39,"""unknown""",
"""বিশ্বে প্রথম পুঁজিবাদী সমাজ কব…","""As Thomas Hall (2000) notes, ""…","""bn""",True,1219,"""17th century""",


In [5]:
df_ar = df_train.filter(pl.col("lang") == "ar")
df_ko = df_train.filter(pl.col("lang") == "ko")
df_te = df_train.filter(pl.col("lang") == "te")

In [6]:
df_te.describe()

statistic,question,context,lang,answerable,answer_start,answer,answer_inlang
str,str,str,str,f64,f64,str,str
"""count""","""1355""","""1355""","""1355""",1355.0,1355.0,"""1355""","""50"""
"""null_count""","""0""","""0""","""0""",0.0,0.0,"""0""","""1305"""
"""mean""",,,,0.96679,142.467159,,
"""std""",,,,,191.558834,,
"""min""","""1950 నాటికి విశాఖపట్నం జిల్లాల…",""" Gerschenkron did not define e…","""te""",0.0,-1.0,"""""Ai'n-e Akbari""""","""(100 °సెం.)"""
"""25%""",,,,,17.0,,
"""50%""",,,,,79.0,,
"""75%""",,,,,198.0,,
"""max""","""హ్యారీ పోట్టర్ చిత్ర కథానాయకుడ…","""will be the place. The nuclear…","""te""",1.0,2400.0,"""złoty""","""సి.ఎన్.అన్నాదురై"""


### Get tokenizers and look at sample (Arabic) sentence

In [7]:
# Load multilingual BERT tokenizer
mbert_tokenizer = Tokenizer.from_pretrained("bert-base-multilingual-uncased")
# Load GPT-4 tokenizer
gpt4_tokenizer = tiktoken.get_encoding("cl100k_base")
# Load NLLB-200 tokenizer
nllb_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

In [8]:
" | ".join(mbert_tokenizer.encode(df_ar["question"][0]).tokens)

'[CLS] | م | ##تى | ت | ##دخل | ##ت | روسيا | في | الحرب | الا | ##هل | ##ية | السورية | ؟ | [SEP]'

In [9]:
# Decode each token from GPT-4 tokenizer
" | ".join([gpt4_tokenizer.decode([token]) for token in gpt4_tokenizer.encode(df_ar["question"][0])])


'م | ت | ى |  ت | د | خ | ل | ت |  ر | و | س | ي | ا |  في |   |  ال | ح | ر | ب |  ال | أ | ه | ل | ية |  ال | س | ور | ية | � | �'

In [10]:
" | ".join(nllb_tokenizer.tokenize(df_ar["question"][0]))

'▁متى | ▁تد | خ | لت | ▁روس | يا | ▁في | ▁الحرب | ▁الأ | ه | لية | ▁الس | ور | ية | ؟'

## Get the top 5 most frequent words

In [11]:
# Language dict
lang_dict = {
    "ar": "arb_Arab",
    "ko": "kor_Hang",
    "te": "tel_Telu",
}

In [12]:
# Set up translation pipeline for NLLB-200
translator_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [13]:
random_txt = df_ko["question"][0]
translator = translator = pipeline(
        "translation",
        model=translator_model,
        tokenizer=nllb_tokenizer,
        src_lang="kor_Hang",
        tgt_lang="eng_Latn",
    )
translator(random_txt)[0]["translation_text"] 

Device set to use mps:0


"Who is the winner of the Thirty Years' War?"

In [16]:
# Tokenize questions in Arabic using multilingual BERT tokenizer
def _make_translator(src_lang:str):
    translator = pipeline(
        "translation",
        model=translator_model,
        tokenizer=nllb_tokenizer,
        src_lang=src_lang,
        tgt_lang="eng_Latn",
    )
    return translator

def tokenize_question(df: pl.DataFrame) -> pl.DataFrame:
    
    src_lang = lang_dict[df["lang"][0]]
    print(f"Tokenizing and translating {src_lang}...")
    print(f"tydi_xor_rc_{src_lang}.parquet" in os.listdir())
    if f"tydi_xor_rc_{src_lang}.parquet" in os.listdir():
        return pl.read_parquet(f"tydi_xor_rc_{src_lang}.parquet")

    df = df.with_columns(
        pl.col("question")
        .map_elements(lambda x: nllb_tokenizer.tokenize(x), return_dtype=pl.List(pl.Utf8))
        .alias("tokens")
    )

    translator = _make_translator(src_lang)
    df = df.with_columns(
        pl.col("question")
        .map_elements(lambda x: translator(x)[0]["translation_text"])
        .alias("translation")
    )

    df.write_parquet(f"tydi_xor_rc_{src_lang}.parquet")
    df.write_excel(f"tydi_xor_rc_{src_lang}.xlsx")

    return df

In [None]:
for df in [df_ar, df_ko, df_te]:
    df = tokenize_question(df)
    print(df.head())

#df_ar = tokenize_question(df_ar)
#df_ar.head()

Device set to use mps:0


Tokenizing and translating arb_Arab...
False


In [None]:
count_dict_ar = {}
for tokens in df_ar["tokens"]:
    for token in tokens:
        if token in count_dict_ar:
            count_dict_ar[token] += 1
        else:
            count_dict_ar[token] = 1
# Get as tuple and sort by frequency
sorted_frequency_list_ar = sorted(count_dict_ar.items(), key=lambda x: x[1], reverse=True)

In [None]:
sorted_frequency_list_ar