# LM for QA Tidy_XOR dataset

In [None]:
import polars as pl
from transformers import AutoTokenizer
from data.const import ARB_CACHE, KOR_CACHE, TELU_CACHE
from typing import TypeAlias
from ngrams.utils import (
    TokenizedSentences,
    NGramsDict,
    DataInconsistencyError,
    train_test_split_and_tokenize,
    get_ngrams_dict,
    get_ngrams_dict_from_sentences,
    tokenize,
)
from ngrams.models import NGramLM

In [None]:
# Stack the three cached parquet files (ARB, KOR, TELU) into a single frame.
df_arkote = pl.concat(
    [pl.read_parquet(cache_path) for cache_path in (ARB_CACHE, KOR_CACHE, TELU_CACHE)]
)
# Keep only the first 100 rows of the KOR cache as a small sample.
df_ko_mini = pl.read_parquet(KOR_CACHE).head(100)
df_ko_mini.describe()

In [None]:
# Preview the first rows of the 100-row KOR_CACHE sample.
df_ko_mini.head()

## Process the data

### Examine the corpus stats

In [None]:
# Mean number of characters per row of the "context" column
context = df_arkote["context"]
avg_len = sum(map(len, context)) / len(context)
print(f"Average context length: {avg_len:.2f} characters")
print(f"Number of sequences (rows) in context: {len(context)}")

In [None]:
# Pull the context column out as a plain Python list of strings
context_corpus = df_arkote["context"].to_list()
# Collect the distinct whitespace-separated words (words, not tokenizer tokens)
context_vocab = {word for document in context_corpus for word in document.split()}
number_of_unique_words = len(context_vocab)
print(f"Number of unique (space seperated) words in context: {number_of_unique_words}")

### Tokenize corpus
Here we use the Multilingual BERT (mBERT) tokenizer. We use the same tokenizer for every model so that the perplexities are comparable.
Each string entry may contain several sentences, but for simplicity we treat each one as a single sequence and use the inherent start- and end-of-sentence markers from mBERT.

In [None]:
# Load the uncased multilingual BERT tokenizer and register explicit
# sentence-boundary markers with it.
mbert = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")
mbert.add_tokens(["<s>", "</s>"])  # Add start and end tokens

# Example tokenization of one context row, called with n=5.
# NOTE(review): `mbert` is not passed to `tokenize` here, so this call relies
# on whatever default tokenizer `tokenize` uses -- confirm it matches `mbert`.
sample_content_tokens = tokenize(df_arkote["context"][10], n=5)
# Plain string literal: there are no placeholders, so no f-prefix is needed.
print("Sample content tokens (n=5):")
" | ".join(sample_content_tokens)

## N-Gram LM
First we explore some statistics of $n$, to pick the size we want

### Examine NGramDicts for context series for N={1, 2, 3, 4}

In [None]:
# Examine unigrams for train
# `context` is the df_arkote["context"] series bound in the corpus-stats cell,
# so that cell must have been run first.
# NOTE(review): no `n` is passed to the split here, unlike the bigram, trigram
# and 4-gram cells -- confirm the default `n` of train_test_split_and_tokenize
# is the one intended for unigram counts.
n1_context_train, n1_context_test = train_test_split_and_tokenize(context, verbose=True)
print("Getting unigrams...")
# Build the 1-gram dictionary from the training split only.
unigrams = get_ngrams_dict_from_sentences(n1_context_train, 1, verbose=True)

In [None]:
# Examine bigrams for train
# `context` is the df_arkote["context"] series bound in the corpus-stats cell,
# so that cell must have been run first.
n2_context_train, n2_context_test = train_test_split_and_tokenize(context, n=2, verbose=True)
print("Getting bigrams...")
# Build the 2-gram dictionary from the training split only;
# n2_context_test is not used in this cell.
bigrams = get_ngrams_dict_from_sentences(n2_context_train, 2, verbose=True)

In [None]:
# Examine trigrams for train
# `context` is the df_arkote["context"] series bound in the corpus-stats cell,
# so that cell must have been run first.
n3_context_train, n3_context_test = train_test_split_and_tokenize(context, n=3, verbose=True)
print("Getting trigrams...")
# Build the 3-gram dictionary from the training split only;
# n3_context_test is not used in this cell.
trigrams = get_ngrams_dict_from_sentences(n3_context_train, 3, verbose=True)

In [None]:
# Examine 4-grams for train
# `context` is the df_arkote["context"] series bound in the corpus-stats cell,
# so that cell must have been run first.
n4_context_train, n4_context_test = train_test_split_and_tokenize(context, n=4, verbose=True)
print("Getting fourgrams...")
# Build the 4-gram dictionary from the training split only;
# n4_context_test is not used in this cell.
fourgrams = get_ngrams_dict_from_sentences(n4_context_train, 4, verbose=True)

### NGramModel from Scratch

## Verifying correctness of NGramModel
First we reproduce the probabilities from the example in the SLP book, then we verify our model against them.

In [None]:
# Three-sentence toy corpus used to check the model by hand
mock_series = pl.Series(["I am Sam", "Sam I am", "I do not like green eggs and ham"])

# Cased multilingual BERT tokenizer with explicit start and end tokens added
cased_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
cased_tokenizer.add_tokens(["<s>", "</s>"])

# Tokenize once with n=2; the same token rows feed both the uni- and bigram dicts
ngram_ready_tokens = [
    tokenize(sentence, tokenizer=cased_tokenizer.tokenize, n=2)
    for sentence in mock_series
]
mock_data = ngram_ready_tokens
mock_nm1grams, mock_ngrams = (
    get_ngrams_dict_from_sentences(mock_data, order, verbose=True)
    for order in (1, 2)
)
# Show each tokenized sentence on its own line
for row in mock_data:
    print(" | ".join(row))

In [None]:
# Bigram LM over the toy corpus: 1-gram dict, 2-gram dict, smoothing disabled.
bigram_model = NGramLM(mock_nm1grams, mock_ngrams, smoothing=None)

In [None]:
import nltk
# Score one sentence of the toy corpus with the unsmoothed bigram model.
mock_sentence = "I am Sam"
# Rebinds `mock_sentence` from the raw string to its tokens.
mock_sentence = tokenize(mock_sentence, tokenizer=cased_tokenizer.tokenize, n=2)
bigram_mock_sentence = list(nltk.ngrams(mock_sentence, 2))
# Expected value, assuming each word is a single token and the sentence is
# padded as <s> I am Sam </s>:
# P(I|<s>) * P(am|I) * P(Sam|am) * P(</s>|Sam) = 2/3 * 2/3 * 1/2 * 1/2 = 1/9 = 0.1111
print(bigram_model.get_sentence_probability(bigram_mock_sentence, verbose=True))

Count num words via ngrams (sanity check)

In [None]:
# Get N for bi, tri and fourgrams
#for n in [2, 3, 4]:
#    ngram_ready_tokens = [tokenize(seq, tokenizer=cased_tokenizer.tokenize, n=n) for seq in mock_series]
#    ngram_mock_sentences = [list(nltk.ngrams(seq, n)) for seq in ngram_ready_tokens]
#    print(f"Sample ngram: {ngram_mock_sentences[0] if ngram_mock_sentences else None}")
#    # Should match num tokens excluding <s>
#    print(f"Len of sample ngram (sanity check): {len(ngram_mock_sentences[0])}")
#    N = sum([len(sentence) for sentence in ngram_mock_sentences]) - len(ngram_mock_sentences)
#    print(f"N for {n}-grams: {N}")

In [None]:
# Perplexity of mock example
# NOTE(review): `ngram_ready_tokens` and `bigram_mock_sentences` (plural, the
# whole toy corpus) are built here but not used in this cell; the perplexity
# below is computed on the single `bigram_mock_sentence` bound in the
# sentence-probability cell.
ngram_ready_tokens = [tokenize(seq, tokenizer=cased_tokenizer.tokenize, n=2) for seq in mock_series]
bigram_mock_sentences = [list(nltk.ngrams(seq, 2)) for seq in ngram_ready_tokens]
# Mock sentence ["<s>", "I", "am", "Sam", "</s>"], seq prob = 1/9  (not counting <s>)
# thus perplexity = (1/9)^(-1/4) = sqrt(3) approx 1.732
print(bigram_model.get_perplexity([bigram_mock_sentence]))

## Get Perplexity

In [None]:
import polars as pl
import nltk
from transformers import AutoTokenizer
from data.const import ARB_CACHE, KOR_CACHE, TELU_CACHE
from typing import TypeAlias, cast

NGram = tuple[str, ...]
Tokens: TypeAlias = list[str]
TokenizedSentences: TypeAlias = list[Tokens]
NGramsDict: TypeAlias = dict[NGram, int]
ModelReadyData: TypeAlias = tuple[NGramsDict, NGramsDict, list[list[NGram]]]

def get_model_ready_data(corpus: pl.Series, n: int) -> ModelReadyData:
    """Build the dictionaries and test n-grams needed for an order-``n`` model.

    The corpus is split and tokenized with ``train_test_split_and_tokenize``.
    The (n-1)-gram and n-gram dictionaries are built from the training split
    only, and every test sentence is converted into its list of n-grams.

    Args:
        corpus: Series of text to split and tokenize.
        n: Order of the n-gram model.

    Returns:
        ``(nm1grams_dict, ngrams_dict, test_ngrams)``: the (n-1)-gram
        dictionary, the n-gram dictionary, and one list of n-grams per test
        sentence.
    """
    x_train, x_test = train_test_split_and_tokenize(corpus, n=n)
    nm1grams_dict = get_ngrams_dict_from_sentences(x_train, n - 1, verbose=False)
    ngrams_dict = get_ngrams_dict_from_sentences(x_train, n, verbose=False)
    # nltk.ngrams is lazy, so materialise each sentence into a list. A separate
    # name avoids rebinding x_test to a different type.
    test_ngrams = cast(
        list[list[NGram]],
        [list(nltk.ngrams(sentence, n)) for sentence in x_test],
    )
    return nm1grams_dict, ngrams_dict, test_ngrams

In [None]:
# Bigram LM on the context column: build the 1-gram and 2-gram dicts plus the
# test bigrams, fit with Laplace smoothing, and report perplexity on `test`.
# NOTE(review): every get_model_ready_data call re-runs the train/test split;
# if that split is not seeded, the bigram, trigram and 4-gram perplexities are
# measured on different test sets -- confirm.
unigram, bigram, test = get_model_ready_data(df_arkote["context"], 2)
content_bigram_model = NGramLM(unigram, bigram, smoothing="laplace")
content_model_perplexity = content_bigram_model.get_perplexity(test)
print(f"Content bigram LM perplexity: {content_model_perplexity:.2f}")

In [None]:
# Trigram LM on the context column: build the 2-gram and 3-gram dicts plus the
# test trigrams, fit with Laplace smoothing, and report perplexity on `test`.
# This rebinds `bigram` and `test`, which the bigram cell also binds, so the
# values of those names depend on which cell ran last.
# NOTE(review): the train/test split is re-run by this call; if it is not
# seeded, this test set differs from the one used by the other models -- confirm.
bigram, trigram, test = get_model_ready_data(df_arkote["context"], 3)
content_trigram_model = NGramLM(bigram, trigram, smoothing="laplace")
content_trigram_model_perplexity = content_trigram_model.get_perplexity(test)
print(f"Content trigram LM perplexity: {content_trigram_model_perplexity:.2f}")

In [None]:
# 4-gram LM on the context column: build the 3-gram and 4-gram dicts plus the
# test 4-grams, fit with Laplace smoothing, and report perplexity on `test`.
# This rebinds `trigram` and `test`, which the trigram cell also binds, so the
# values of those names depend on which cell ran last.
# NOTE(review): the train/test split is re-run by this call; if it is not
# seeded, this test set differs from the one used by the other models -- confirm.
trigram, fourgram, test = get_model_ready_data(df_arkote["context"], 4)
content_fourgram_model = NGramLM(trigram, fourgram, smoothing="laplace")
content_fourgram_model_perplexity = content_fourgram_model.get_perplexity(test)
print(f"Content fourgram LM perplexity: {content_fourgram_model_perplexity:.2f}")