# LM for QA Tidy_XOR dataset

In [None]:
import polars as pl
import nltk
from transformers import AutoTokenizer
from data.const import ARB_CACHE, KOR_CACHE, TELU_CACHE

In [None]:
# Stack the three cached parquet files into one frame.
df_arkote = pl.concat(
    [pl.read_parquet(cache) for cache in (ARB_CACHE, KOR_CACHE, TELU_CACHE)]
)
# Small sample (first 100 rows of the KOR_CACHE file) for quick inspection.
df_ko_mini = pl.read_parquet(KOR_CACHE).head(100)
# Summary statistics of the sample, rendered as the cell output.
df_ko_mini.describe()

In [None]:
# Preview the first rows of the sample frame (rendered as the cell output).
df_ko_mini.head()

## Process the data

In [None]:
# Download the NLTK "stopwords" resource.
# NOTE(review): no stopwords are used in the cells visible here — presumably
# needed by a later cell; confirm or drop.
nltk.download("stopwords")

### Get corpus as one long string

In [None]:
# Mean length, in characters, of the entries in the "context" column.
context = df_arkote["context"]
n_contexts = len(context)
avg_len = sum(map(len, context)) / n_contexts
print(f"Average context length: {avg_len:.2f} characters")
# Number of contexts, rendered as the cell output.
n_contexts

In [None]:
# Get English corpus: join every context into one newline-separated string.
# `Series.str.join` returns a one-element Series, hence the `[0]`.
context_corpus = df_arkote["context"].str.join("\n")[0]
# Character-level vocabulary: the set of distinct characters in the corpus
# (this includes the "\n" used as the separator above).
context_vocab = set(context_corpus)
print(f"Number of characters in corpus: {len(context_corpus):,}")
print(f"Vocabulary size: {len(context_vocab):,}")

### Tokenize corpus
Here we use the Multilingual BERT (mBERT) tokenizer. We use the same tokenizer for every model so that perplexities are comparable.
Each string entry may contain several sentences, but for simplicity we treat each one as a single sequence and delimit it with explicit start- and end-of-sequence markers (`<s>` and `</s>`) that we register on the mBERT tokenizer.

In [None]:
# Get multilingual bert tokenizer (uncased checkpoint)
mbert = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")
# Register "<s>" and "</s>" as extra tokens; they are prepended/appended to
# each context before tokenization. The call's return value is the cell output.
mbert.add_tokens(["<s>", "</s>"])  # Add start and end tokens


In [None]:
# Wrap every context in the "<s>" / "</s>" markers, then tokenize each wrapped
# context as one sequence. Note: `context` is rebound here from a polars
# Series to a plain list of strings.
context = ("<s>" + df_arkote["context"] + "</s>").to_list()
context_tokenized = [mbert.tokenize(c) for c in context]
# Example tokenization: show the tokens of the first context.
# (Previously referenced the undefined name `mbert_tokens`.)
" | ".join(context_tokenized[0])

In [None]:
# Positional 60/20/20 split into train, validation and test.
# NOTE(review): nothing is shuffled here, so membership of each split depends
# entirely on the row order of `context_tokenized`.
n_sequences = len(context_tokenized)
train_index = int(n_sequences * 0.6)
# Despite its name, `val_size` is the END index of the validation slice.
val_size = int(n_sequences * 0.2 + train_index)
context_train, context_val, context_test = (
    context_tokenized[:train_index],
    context_tokenized[train_index:val_size],
    context_tokenized[val_size:],
)
print(f"Train size: {len(context_train):,}")
print(f"Validation size: {len(context_val):,}")
print(f"Test size: {len(context_test):,}")
# The three parts must add up to the number of tokenized contexts.
print(f"Total size: {sum(map(len, (context_train, context_val, context_test))):,} (sanity check)")