## Week 36

In [None]:
import tiktoken
from tokenizers import Tokenizer
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline,
)
from datasets import load_dataset
import polars as pl

In [None]:
# Fetch the "coastalcph/tydi_xor_rc" dataset through `load_dataset`
dataset = load_dataset("coastalcph/tydi_xor_rc")
# Convert both splits to polars frames (going through pandas): train first, then test
df_train, df_test = (
    pl.from_pandas(dataset[split].to_pandas()) for split in ("train", "test")
)

In [None]:
# Show the column names of the training split
df_train.columns

In [None]:
# Preview the first rows of the training split
df_train.head()

In [None]:
# One sub-frame of the training split per value of the "lang" column,
# unpacked in the order ar, ko, te
df_ar, df_ko, df_te = (
    df_train.filter(pl.col("lang") == lang_code) for lang_code in ("ar", "ko", "te")
)

In [None]:
# Summary statistics for the rows whose "lang" is "te"
df_te.describe()

### Get tokenizers and look at sample (Arabic) sentence

In [None]:
# Load multilingual BERT tokenizer (a `tokenizers.Tokenizer`, not a transformers AutoTokenizer)
mbert_tokenizer = Tokenizer.from_pretrained("bert-base-multilingual-uncased")
# Load GPT-4 tokenizer (tiktoken's "cl100k_base" encoding)
gpt4_tokenizer = tiktoken.get_encoding("cl100k_base")
# Load NLLB-200 tokenizer; the same checkpoint name is used for the translation model
nllb_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

In [None]:
" | ".join(mbert_tokenizer.encode(df_ar["question"][0]).tokens)

In [None]:
# Encode the first Arabic question with the GPT-4 tokenizer, then decode every
# token id on its own so the individual pieces can be shown separated by " | ".
# NOTE(review): ids that cover only part of a multi-byte character may not
# decode to readable text on their own - confirm against the tiktoken docs.
" | ".join(
    gpt4_tokenizer.decode([token_id])
    for token_id in gpt4_tokenizer.encode(df_ar["question"][0])
)


In [None]:
" | ".join(nllb_tokenizer.tokenize(df_ar["question"][0]))

## Get the top 5 most frequent words

In [None]:
# Maps the two-letter codes found in the "lang" column to the language tags
# that are handed to the translation pipeline as `src_lang`
lang_dict = dict(ar="arb_Arab", ko="kor_Hang", te="tel_Telu")

In [None]:
# Load the NLLB-200 seq2seq model once; the translation pipeline itself is
# built from this model in `_make_translator`
translator_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [None]:
# Helpers: build an NLLB-200 translation pipeline, and tokenize (optionally translate) questions
def _make_translator(src_lang: str):
    """Build a translation pipeline from ``src_lang`` to ``eng_Latn``.

    Args:
        src_lang: Source-language tag forwarded to the pipeline as
            ``src_lang`` (for example one of the values of ``lang_dict``).

    Returns:
        The object returned by ``pipeline("translation", ...)``, wired to the
        module-level ``translator_model`` and ``nllb_tokenizer``.
    """
    pipeline_kwargs = {
        "model": translator_model,
        "tokenizer": nllb_tokenizer,
        "src_lang": src_lang,
        "tgt_lang": "eng_Latn",
    }
    return pipeline("translation", **pipeline_kwargs)

def tokenize_question(df: pl.DataFrame, with_translation: bool = True) -> pl.DataFrame:
    """Add NLLB token ids, and optionally a translation, for every question.

    Args:
        df: Frame with a ``question`` column. A ``lang`` column is also
            required when ``with_translation`` is true; the source language is
            read from the first row only, so all rows are assumed to share one
            language.
        with_translation: When true, add a ``translation`` column holding the
            ``translation_text`` produced by the pipeline from
            ``_make_translator`` for each question.

    Returns:
        A new frame with a ``tokens`` column (the output of
        ``nllb_tokenizer.encode`` per question) and, if requested, a
        ``translation`` column.

    Raises:
        KeyError: If the first row's ``lang`` value is not a key of
            ``lang_dict``.
    """
    # `with_columns` returns a new frame rather than changing `df` in place, so
    # the result must be rebound; otherwise the "tokens" column is lost.
    df = df.with_columns(
        pl.col("question")
        .map_elements(nllb_tokenizer.encode, return_dtype=pl.List(pl.Int64))
        .alias("tokens")
    )
    if not with_translation:
        return df

    if df.is_empty():
        # There is no first row to read the language from. Still add an
        # (empty) string column so the output schema does not depend on the
        # number of rows.
        return df.with_columns(pl.col("question").cast(pl.Utf8).alias("translation"))

    src_lang = lang_dict[df["lang"][0]]
    translator = _make_translator(src_lang)
    return df.with_columns(
        pl.col("question")
        .map_elements(
            lambda text: translator(text)[0]["translation_text"],
            return_dtype=pl.Utf8,
        )
        .alias("translation")
    )

In [None]:
# Tokenize and translate only the first 10 Arabic rows as a small sample
df_ar_mini = tokenize_question(df_ar.head(10), with_translation=True)

In [None]:
# Save the small Arabic sample next to the notebook (relative paths) in two formats
df_ar_mini.write_excel("df_ar_mini.xlsx")
df_ar_mini.write_parquet("df_ar_mini.parquet")

In [None]:
from collections import Counter

# Token lists for the Korean questions. In the cells visible here df_ko is a
# plain filter of df_train and is never given a "tokens" column, so fall back
# to encoding the questions directly instead of failing on the missing column.
if "tokens" in df_ko.columns:
    token_lists_ko = df_ko["tokens"]
else:
    token_lists_ko = (nllb_tokenizer.encode(question) for question in df_ko["question"])

# Counter replaces the manual "if key in dict" bookkeeping; it is a dict
# subclass, so count_dict_ko can still be used like the plain dict it was.
count_dict_ko = Counter(token for tokens in token_lists_ko for token in tokens)
# (token, count) pairs, most frequent first; ties keep first-seen order
sorted_frequency_list_ko = count_dict_ko.most_common()

In [None]:
# Display every (token, count) pair for Korean, most frequent first
sorted_frequency_list_ko