# LM for QA Tidy_XOR dataset

In [None]:
import polars as pl
from transformers import AutoTokenizer
from data.const import ARB_CACHE, KOR_CACHE, TELU_CACHE
from typing import TypeAlias
from ngrams.utils import (
    TokenizedSentences,
    NGramsDict,
    DataInconsistencyError,
    train_test_split_and_tokenize,
    get_ngrams_dict,
    get_ngrams_dict_from_sentences,
    tokenize,
)
from ngrams.models import NGramLM


In [None]:
# Stack the three cached parquet files (read in the order ARB, KOR, TELU)
# into a single frame.
cache_paths = (ARB_CACHE, KOR_CACHE, TELU_CACHE)
df_arkote = pl.concat([pl.read_parquet(path) for path in cache_paths])

# Small 100-row slice of the KOR_CACHE data for quick experiments.
df_ko_mini = pl.read_parquet(KOR_CACHE)[:100]
df_ko_mini.describe()

In [None]:
# Preview the first rows of the 100-row sample read from KOR_CACHE
df_ko_mini.head()

## Process the data

### Examine the corpus stats

In [None]:
# Mean number of characters per entry in the "context" column
context = df_arkote["context"]
total_chars = sum(map(len, context))
avg_len = total_chars / len(context)
print(f"Average context length: {avg_len:.2f} characters")
print(f"Number of sequences (rows) in context: {len(context)}")

In [None]:
# Pull the context column out as a plain Python list (the English corpus)
context_corpus = df_arkote["context"].to_list()
# Vocabulary of unique whitespace-separated words (not tokenizer tokens):
# join every context into one string, split on whitespace, deduplicate.
context_vocab = {*" ".join(context_corpus).split()}
number_of_unique_words = len(context_vocab)
print(f"Number of unique (space seperated) words in context: {number_of_unique_words}")

### Tokenize corpus
Here we use the Multilingual BERT (mBERT) tokenizer. We use the same tokenizer throughout so that perplexities are comparable.
Each string entry may contain several sentences, but for simplicity we treat each entry as a single sequence and use the inherent start- and end-of-sentence markers from mBERT.

In [None]:
# Load the uncased multilingual BERT tokenizer and register the
# start/end-of-sequence markers as additional tokens.
mbert = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")
mbert.add_tokens(["<s>", "</s>"])

# Tokenize one context entry as a quick sanity check.
# NOTE(review): `mbert` is not passed to `tokenize` here, so the helper's
# default tokenizer is used — confirm that this is the intended one.
sample_context = df_arkote["context"][10]
sample_content_tokens = tokenize(sample_context, n=5)
print("Sample content tokens (n=5):")
" | ".join(sample_content_tokens)

## N-Gram LM
First we explore some statistics of $n$, to pick the size we want

### Examine NGramDicts for context series for N={1, 2, 3, 4}

In [None]:
# Unigram counts over the training part of the context series.
# NOTE(review): no `n=` is passed to the split helper here, unlike the
# bigram/trigram/4-gram cells; presumably its default is n=1 — confirm.
n1_split = train_test_split_and_tokenize(context, verbose=True)
n1_context_train, n1_context_test = n1_split
print("Getting unigrams...")
unigrams = get_ngrams_dict_from_sentences(n1_context_train, 1, verbose=True)

In [None]:
# Bigram counts over the training part of the context series
# (split and tokenized with n=2).
n2_split = train_test_split_and_tokenize(context, n=2, verbose=True)
n2_context_train, n2_context_test = n2_split
print("Getting bigrams...")
bigrams = get_ngrams_dict_from_sentences(n2_context_train, 2, verbose=True)

In [None]:
# Trigram counts over the training part of the context series
# (split and tokenized with n=3).
n3_split = train_test_split_and_tokenize(context, n=3, verbose=True)
n3_context_train, n3_context_test = n3_split
print("Getting trigrams...")
trigrams = get_ngrams_dict_from_sentences(n3_context_train, 3, verbose=True)

In [None]:
# 4-gram counts over the training part of the context series
# (split and tokenized with n=4).
n4_split = train_test_split_and_tokenize(context, n=4, verbose=True)
n4_context_train, n4_context_test = n4_split
print("Getting fourgrams...")
fourgrams = get_ngrams_dict_from_sentences(n4_context_train, 4, verbose=True)

## Verifying correctness of NGramModel
First we reproduce the probabilities from the example in the SLP book (Jurafsky & Martin, *Speech and Language Processing*), then we verify our model against them.

In [None]:
# Three-sentence toy corpus used to check the model against the book example
mock_series = pl.Series(["I am Sam", "Sam I am", "I do not like green eggs and ham"])

# Cased multilingual BERT tokenizer, with the start/end markers registered
cased_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
cased_tokenizer.add_tokens(["<s>", "</s>"])

# Tokenize every sentence with n=2
tokenize_fn = cased_tokenizer.tokenize
ngram_ready_tokens = []
for mock_sentence_text in mock_series:
    ngram_ready_tokens.append(tokenize(mock_sentence_text, tokenizer=tokenize_fn, n=2))

# The same tokenized data feeds both the unigram and the bigram counts
mock_data = ngram_ready_tokens
mock_nm1grams = get_ngrams_dict_from_sentences(mock_data, 1, verbose=True)
mock_ngrams = get_ngrams_dict_from_sentences(mock_data, 2, verbose=True)
for token_row in mock_data:
    print(" | ".join(token_row))

In [None]:
import math
NGram = tuple[str, ...]
Tokens: TypeAlias = list[str]
TokenizedSentences: TypeAlias = list[Tokens]

class NGramLM:
    """Class to represent an N-gram language model.
    The model takes as input the n-grams and (n-1)-grams count dictionaries,
    and computes the conditional probabilities of the n-grams given the (n-1)-grams. 
    """
    def __init__(self, nm1grams: NGramsDict, ngrams: NGramsDict):
        self.ngrams = ngrams
        self.nm1grams = nm1grams
        self.n = len(list(ngrams.keys())[0])
        self.probabilities = {key: 0.0 for key in self.ngrams.keys()}
        self._calc_word_probabilities()

    def _calc_word_probabilities(self) -> None:
        """For word (token) in vocabulary, estimate probabilities by counts"""
        for ngram in self.ngrams.keys():
            word = ngram[-1]
            prefix = ngram[:-1]
            self.probabilities[ngram] = self.ngrams[ngram] / self.nm1grams[prefix]
    
    def get_sentence_probability(self, sentence: list[NGram]) -> float:
        """Get the probability of a sentence (list of tokens) under this model."""
        log_prob = 0.0
        for ngram in sentence:
            pass
        
        return math.exp(log_prob)

In [None]:
# Build the bigram model from the mock counts and list every estimated
# conditional probability, one "<n-gram>: <probability>" line each.
bigram_model = NGramLM(mock_nm1grams, mock_ngrams)
for bigram, probability in bigram_model.probabilities.items():
    print(f"{bigram}: {probability:.4f}")

In [None]:
mock_sentence = "I am Sam"
# Show how the test sentence is tokenized for the bigram model (n=2)
tokenize(mock_sentence, tokenizer=cased_tokenizer.tokenize, n=2)
# Expected bigram probability for this sentence: 2/3 * 2/3 * 1/2 * 1/2 = 1/9 = 0.1111
# Any OOV token (e.g. an extra "b" appended to the sentence) should be ignored.
# NOTE(review): get_sentence_probability is annotated to take a list of n-grams,
# not a raw string — convert the sentence before enabling the call below.
#print(bigram_model.get_sentence_probability(mock_sentence))