# LM for QA Tidy_XOR dataset

In [None]:
import polars as pl
import nltk
from transformers import AutoTokenizer
from data.const import ARB_CACHE, KOR_CACHE, TELU_CACHE
from typing import TypeAlias
import numpy as np

In [None]:
# Stack the ARB, KOR and TELU parquet caches into a single frame
df_arkote = pl.concat(
    [pl.read_parquet(cache_path) for cache_path in (ARB_CACHE, KOR_CACHE, TELU_CACHE)]
)
# First 100 rows of the KOR cache, kept as a small sample for quick inspection
df_ko_mini = pl.read_parquet(KOR_CACHE)[:100]
df_ko_mini.describe()

In [None]:
# Preview the first rows of the 100-row KOR_CACHE sample
df_ko_mini.head()

## Process the data

In [None]:
#nltk.download("stopwords")

### Get corpus as one long string

In [None]:
# Mean number of characters per context passage
context = df_arkote["context"]
avg_len = sum(map(len, context)) / len(context)
print(f"Average context length: {avg_len:.2f} characters")
# Number of context passages (shown as the cell output)
len(context)

In [None]:
# Get english corpus: join every context passage into one newline-separated string
context_corpus = df_arkote["context"].str.join("\n")[0]
# Character-level vocabulary: the set of distinct characters in the joined corpus
context_vocab = set(context_corpus)
print(f"Number of characters in corpus: {len(context_corpus):,}")
print(f"Vocabulary size: {len(context_vocab):,}")

### Tokenize corpus
Here we use the Multilingual BERT (mBERT) tokenizer. We use the same tokenizer throughout so that perplexities are comparable.
Each string entry may contain several sentences, but for simplicity we treat each one as a single sequence and use the inherent start- and end-of-sentence markers from mBERT

In [None]:
# Get multilingual bert tokenizer
mbert = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")
# Register "<s>" and "</s>" as added tokens; every context is wrapped in them before tokenizing
mbert.add_tokens(["<s>", "</s>"])  # Add start and end tokens


In [None]:
# Wrap each context passage in the <s> ... </s> markers, then tokenize passage by passage
context = ("<s>" + df_arkote["context"] + "</s>").to_list()
context_tokenized = list(map(mbert.tokenize, context))
# Example tokenization: tokens of the first passage, separated by " | "
" | ".join(context_tokenized[0])

In [None]:
def flatten_tokens(token_lists: list[list[str]]) -> list[str]:
    """Concatenate the inner token lists, in order, into one new flat list."""
    return [token for token_list in token_lists for token in token_list]

In [None]:
# Split into train, val, test
train_index = int(len(context_tokenized) * 0.7)
raw_context_train = context_tokenized[:train_index]
raw_context_test = context_tokenized[train_index:]
context_train = flatten_tokens(raw_context_train)
context_test = flatten_tokens(raw_context_test)
print(f"Train size: {len(context_train):>15,} tokens.")
print(f"Test size: {len(context_test):>16,} tokens.")
print(f"Total corpus size: {len(context_train) + len(context_test):>8,} tokens.")

## N-Gram LM
First we explore some statistics of $n$, to pick the size we want

In [None]:
NGramsDict: TypeAlias = dict[tuple[str, ...], int]
def get_ngrams_dict(
    tokens: list[str],
    n: int,
    verbose: bool = False,
) -> NGramsDict:
    """Count how often each n-gram occurs in ``tokens``.

    Args:
        tokens: Token sequence to slide the n-gram window over.
        n: Size of the n-grams.
        verbose: When true, print the first five repeated n-grams (followed by
            "..." if there are more) and summary statistics.

    Returns:
        Mapping from each n-gram tuple to its number of occurrences.
    """
    counts: NGramsDict = {}
    repeats = 0
    for window in nltk.ngrams(tokens, n):
        seen_before = counts.get(window, 0)
        counts[window] = seen_before + 1
        if not seen_before:
            continue
        # Every occurrence after the first counts as one duplicate
        repeats += 1
        if not verbose:
            continue
        if repeats <= 5:
            print("Duplicate gram found: ", window)
        elif repeats == 6:
            print("...")  # Indicate more duplicates exist
    if verbose:
        total = sum(counts.values())
        print(f"Number of unique {n}-grams:  {len(counts):,}")
        print(f"Total number of {n}-grams:  {total:,}")
        print(f"Number of duplicate {n}-grams encountered:  {repeats:,}")
        # Sanity check: duplicates are exactly the occurrences beyond each first sighting
        assert repeats == total - len(counts), "Duplicate count mismatch!"
    return counts

In [None]:
# Examine unigrams for train: count them over the flattened training tokens
# (verbose=True prints the first repeated grams and the summary statistics)
unigrams = get_ngrams_dict(context_train, 1, verbose=True)

In [None]:
# Examine bigrams for train: count them over the flattened training tokens
# (verbose=True prints the first repeated grams and the summary statistics)
bigrams = get_ngrams_dict(context_train, 2, verbose=True)

In [None]:
# Examine trigrams for train: count them over the flattened training tokens
# (verbose=True prints the first repeated grams and the summary statistics)
trigrams = get_ngrams_dict(context_train, 3, verbose=True)

In [None]:
# Examine 4-grams for train: count them over the flattened training tokens
# (verbose=True prints the first repeated grams and the summary statistics)
fourgrams = get_ngrams_dict(context_train, 4, verbose=True)

In [None]:
SeqProbDict: TypeAlias = dict[tuple[str, ...], float]
class DataInconsistencyError(Exception):
    """Custom error for data inconsistency issues."""

class NGramModel:
    """Class to represent an N-gram language model.
    The model takes as input the n-grams and (n-1)-grams count dictionaries,
    and computes the conditional probabilities of the n-grams given the (n-1)-grams. 
    """
    def __init__(self, nm1grams: NGramsDict, ngrams: NGramsDict, vocabulary: set[str], alpha: float = 1.0):
        self.ngrams = ngrams
        self.nm1grams = nm1grams
        self.n = len(list(ngrams.keys())[0])
        self.alpha = alpha
        self.vocabulary = vocabulary
        self.vocab_size = len(vocabulary)
        self.probabilities = self._compute_probabilities()

    def _compute_probabilities(self) -> SeqProbDict:
        """Compute the conditional probabilities of the n-grams given the (n-1)-grams."""
        probabilities_dict = {}
        for gram in self.ngrams:
            nm1gram = gram[:-1]
            probabilities_dict[gram] = (self.ngrams[gram] + self.alpha) / (self.nm1grams[nm1gram] + self.alpha * self.vocab_size)

        return probabilities_dict

    def get_text_log_prob(self, text: list[str]) -> float:
            """Get the probability of a given text sequence (input as list of tokens)."""
            log_prob = 0.0
            for ngram in nltk.ngrams(text, self.n):
                if ngram in self.probabilities:
                    prob = self.probabilities[ngram]
                else:
                    prob = 1e-10  # Very small probability for completely unseen context
                if prob <= 0:
                    prob = 1e-10  # Avoid log(0)
                log_prob += np.log(prob)
                
            return log_prob
    def get_text_prob(self, text: list[str]) -> float:
        return np.exp(self.get_text_log_prob(text))

In [None]:
# Fit a bigram model: unigram counts are the context table, bigram counts the n-gram table
vocab = set(context_train)
model = NGramModel(unigrams, bigrams, vocab, alpha=10.0)
# Sum the per-passage probabilities on each split.
# NOTE(review): for long passages exp(log_prob) may underflow to 0.0 - confirm these
# totals are informative; log-probabilities or perplexity may be a better summary.
total_train_prob = sum(model.get_text_prob(text) for text in raw_context_train)
total_test_prob = sum(model.get_text_prob(text) for text in raw_context_test)
print(f"Total train probability: {total_train_prob}")
print(f"Total test probability: {total_test_prob}")
# Score the first held-out passage on its own
single_prob = model.get_text_log_prob(raw_context_test[0])
print(f"Single test sequence log probability: {single_prob}")
print(f"Single test sequence probability: {np.exp(single_prob)}")