## Week 36

In [None]:
import tiktoken
from tokenizers import Tokenizer
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline,
)
from datasets import load_dataset
import polars as pl
import os

In [None]:
# Load the TyDi XOR RC dataset and convert the train/validation splits to polars frames
dataset = load_dataset("coastalcph/tydi_xor_rc")
df_train, df_val = (
    pl.from_pandas(dataset[split_name].to_pandas())
    for split_name in ("train", "validation")
)

In [None]:
# Preview the first rows of the training split to inspect its columns
df_train.head()

In [None]:
# Language codes (values of the "lang" column) studied in this notebook
target_langs = ["ar", "ko", "te"]

# Per-language views of the training split, plus the combined subset
df_ar = df_train.filter(pl.col("lang") == "ar")
df_ko = df_train.filter(pl.col("lang") == "ko")
df_te = df_train.filter(pl.col("lang") == "te")
df_arkote = df_train.filter(pl.col("lang").is_in(target_langs))
# Sanity check: the three per-language subsets must add up to the combined one
assert (
    df_ar.height + df_ko.height + df_te.height == df_arkote.height
), "per-language training subsets do not add up to the combined subset"

# Same split for the validation data
df_ar_val = df_val.filter(pl.col("lang") == "ar")
df_ko_val = df_val.filter(pl.col("lang") == "ko")
df_te_val = df_val.filter(pl.col("lang") == "te")
df_arkote_val = df_val.filter(pl.col("lang").is_in(target_langs))
# Sanity check: the three per-language subsets must add up to the combined one
assert (
    df_ar_val.height + df_ko_val.height + df_te_val.height == df_arkote_val.height
), "per-language validation subsets do not add up to the combined subset"

In [None]:
# Summary statistics for the combined Arabic/Korean/Telugu training subset
df_arkote.describe()

### Get tokenizers and look at sample (Telugu) sentence

In [None]:
# Multilingual BERT (uncased) tokenizer, loaded through the `tokenizers` library
mbert_tokenizer = Tokenizer.from_pretrained("bert-base-multilingual-uncased")
# tiktoken's "cl100k_base" encoding, used here as the GPT-4 tokenizer
gpt4_tokenizer = tiktoken.get_encoding("cl100k_base")
# NLLB-200 (distilled, 600M) tokenizer; also used further down for token counts and translation
nllb_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

In [None]:
# Multilingual BERT: show the tokens of the first Telugu training question
mbert_encoding = mbert_tokenizer.encode(df_te["question"][0])
" | ".join(mbert_encoding.tokens)

In [None]:
# GPT-4 tokenizer: decode every token id on its own so the individual pieces are visible
gpt4_token_ids = gpt4_tokenizer.encode(df_te["question"][0])
" | ".join(gpt4_tokenizer.decode([token_id]) for token_id in gpt4_token_ids)

In [None]:
# NLLB-200: show the tokens of the first Telugu training question
nllb_pieces = nllb_tokenizer.tokenize(df_te["question"][0])
" | ".join(nllb_pieces)

## Get the most frequent tokens (top 10 per language)

In [None]:
# Maps the dataset's "lang" codes to the language codes that are passed to the
# NLLB-200 translation pipeline as `src_lang` (and used in the cache file names)
lang_dict = {
    "ar": "arb_Arab",
    "ko": "kor_Hang",
    "te": "tel_Telu",
}

In [None]:
# Load the NLLB-200 (distilled, 600M) seq2seq model; the translation pipelines
# below are built around this single model instance
translator_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [None]:
# Example translation: first Korean training question -> English
random_txt = df_ko["question"][0]
translator = pipeline(
    "translation",
    model=translator_model,
    tokenizer=nllb_tokenizer,
    src_lang="kor_Hang",
    tgt_lang="eng_Latn",
)
# The pipeline returns a list with one dict per input; show the translated text
translator(random_txt)[0]["translation_text"]

In [None]:
def _make_translator(src_lang: str):
    """Build a translation pipeline from ``src_lang`` into English.

    The pipeline reuses the module-level ``translator_model`` and
    ``nllb_tokenizer``; ``src_lang`` is the source-language code handed to the
    pipeline (the values of ``lang_dict``), the target is always ``eng_Latn``.
    """
    pipeline_options = {
        "model": translator_model,
        "tokenizer": nllb_tokenizer,
        "src_lang": src_lang,
        "tgt_lang": "eng_Latn",
    }
    return pipeline("translation", **pipeline_options)

def tokenize_question(
    df: pl.DataFrame,
    use_cache: bool = True,
    translation: bool = True,
    tokenization: bool = True,
) -> pl.DataFrame:
    """Add NLLB-200 tokens and/or English translations for the questions in ``df``.

    Args:
        df: Non-empty frame with ``lang`` and ``question`` columns. All rows are
            assumed to share the language of the first row, because that
            language selects both the translation source language and the
            cache file name.
        use_cache: When true, reuse ``data/tydi_xor_rc_<lang>.parquet`` if it
            matches this request, and otherwise write the freshly computed
            result to ``data/`` as parquet and xlsx.
        translation: Add a ``translation`` column holding the English
            translation of each question.
        tokenization: Add a ``tokens`` column holding the NLLB-200
            tokenization of each question.

    Returns:
        The cached frame when a usable cache exists, otherwise ``df`` with the
        requested columns appended.

    Raises:
        KeyError: If the language of ``df`` has no entry in ``lang_dict``.
    """
    src_lang = lang_dict[df["lang"][0]]
    cache_dir = "data"
    cache_path = os.path.join(cache_dir, f"tydi_xor_rc_{src_lang}")
    parquet_path = cache_path + ".parquet"
    print(f"Tokenizing and translating {src_lang}...")

    # Columns the caller expects to find in the result
    required_columns = set()
    if tokenization:
        required_columns.add("tokens")
    if translation:
        required_columns.add("translation")

    # os.path.isfile also covers a missing "data" directory, where listing the
    # directory would raise FileNotFoundError.
    if use_cache and os.path.isfile(parquet_path):
        cached = pl.read_parquet(parquet_path)
        # The cache file is keyed by language only. Reject it when it was
        # written for a different frame (e.g. another split, detected through
        # the row count) or without the columns requested now.
        if cached.height == df.height and required_columns.issubset(cached.columns):
            print(f"Loading from cached file tydi_xor_rc_{src_lang}.parquet")
            return cached
        print(f"Cached file tydi_xor_rc_{src_lang}.parquet does not match this request; recomputing")

    if tokenization:
        # Tokenize questions using the NLLB-200 tokenizer
        print("Tokenizing")
        df = df.with_columns(
            pl.col("question")
            .map_elements(nllb_tokenizer.tokenize, return_dtype=pl.List(pl.Utf8))
            .alias("tokens")
        )

    if translation:
        print("Translating")
        # Translate questions to English using NLLB-200
        translator = _make_translator(src_lang)
        df = df.with_columns(
            pl.col("question")
            # Explicit dtype: the translated text is a plain string
            .map_elements(lambda x: translator(x)[0]["translation_text"], return_dtype=pl.Utf8)
            .alias("translation")
        )

    if use_cache:
        # Make sure the cache directory exists before the first write
        os.makedirs(cache_dir, exist_ok=True)
        print(f"Caching to {cache_path}.parquet and {cache_path}.xlsx")
        df.write_parquet(parquet_path)
        df.write_excel(cache_path + ".xlsx")

    return df

In [None]:
# Tokenize and translate each training subset; with the default use_cache=True
# this either loads the cached result or computes it and writes the cache.
# Kept as three separate expression statements: only the last call's return
# value (Telugu) is shown as the cell output.
tokenize_question(df_ar)
tokenize_question(df_ko)
tokenize_question(df_te)

In [None]:
import numpy as np
# Report the number of questions and the total number of tokens in a dataframe
def df_metadata(df: pl.DataFrame) -> None:
    """Print how many questions ``df`` holds and how many tokens they yield.

    Tokens are obtained from ``tokenize_question`` with caching and
    translation switched off, so only the ``tokens`` column is computed.
    """
    tokenized = tokenize_question(df, use_cache=False, translation=False)
    n_tokens = sum(map(len, tokenized["tokens"].to_list()))
    n_questions = df.height
    print(f"Number of questions: {n_questions} Number of tokens: {n_tokens}")

# Print the question/token counts for every language in both splits
metadata_report = (
    ("Arabic dataset", df_ar),
    ("Korean dataset", df_ko),
    ("Telugu dataset", df_te),
    ("Arabic validation dataset", df_ar_val),
    ("Korean validation dataset", df_ko_val),
    ("Telugu validation dataset", df_te_val),
)
for report_label, report_frame in metadata_report:
    print(report_label)
    df_metadata(report_frame)

In [None]:
# Tokenize and translate each language dataframe (or load it from the cache),
# then print the 10 most frequent question tokens per language
from collections import Counter

for df in [df_ar, df_ko, df_te]:
    df = tokenize_question(df)
    # Count every token over all questions of this language
    count_dict = Counter(token for tokens in df["tokens"] for token in tokens)
    # most_common() sorts by descending count; ties keep first-seen order
    sorted_frequency_list = count_dict.most_common()
    print(sorted_frequency_list[:10])


In [None]:
# Tokenize and translate each language dataframe (or load it from the cache),
# then print the 10 most frequent tokens of the English translations
from collections import Counter

for df in [df_ar, df_ko, df_te]:
    df = tokenize_question(df)
    # Re-tokenize each translation with the NLLB-200 tokenizer and count the tokens
    count_dict = Counter(
        token
        for translation in df["translation"]
        for token in nllb_tokenizer.tokenize(translation)
    )
    # most_common() sorts by descending count; ties keep first-seen order
    sorted_frequency_list = count_dict.most_common()
    print(sorted_frequency_list[:10])

In [None]:
# Top 10 translation tokens per language, split by the "answerable" flag
from collections import Counter

for df in [df_ar, df_ko, df_te]:
    df = tokenize_question(df)
    # The language label is the same for both answerable values, so look it up once
    lang_code = lang_dict[df['lang'][0]]
    for answerable in [True, False]:
        translations = df.filter(pl.col("answerable") == answerable)["translation"]
        # Re-tokenize each translation with the NLLB-200 tokenizer and count the tokens
        count_dict = Counter(
            token
            for translation in translations
            for token in nllb_tokenizer.tokenize(translation)
        )
        # most_common() sorts by descending count; ties keep first-seen order
        sorted_frequency_list = count_dict.most_common()
        print(f"Top tokens in {lang_code} answerable={answerable}: {sorted_frequency_list[:10]}")