# Week 37: LM for the TyDi XOR QA dataset

In [None]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. the local `ngrams` package) are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [None]:
import polars as pl
from transformers import AutoTokenizer
from datasets import load_dataset, DatasetDict
from typing import TypeAlias, cast
from ngrams.utils import (
    get_model_ready_data,
    get_ngrams_dict_from_sentences,
    _pad_and_tokenize,
    my_tokenize,
    NGram,
    NGramsDict,
    ModelReadyData,
)
import nltk
from ngrams.models import NGramLM

In [None]:
# Fetch the dataset from the Hugging Face hub and convert both splits to Polars.
dataset = load_dataset("coastalcph/tydi_xor_rc")
assert isinstance(dataset, DatasetDict), "Loaded dataset is not a DatasetDict"
df_train = dataset["train"].to_polars()
df_val = dataset["validation"].to_polars()

assert isinstance(df_train, pl.DataFrame), "Training set is not a Polars DataFrame"
assert isinstance(df_val, pl.DataFrame), "Validation set is not a Polars DataFrame"

# Per-language (train, validation) subsets: Arabic, Telugu and Korean.
df_ar_train, df_ar_val = (
    split.filter(pl.col("lang") == "ar") for split in (df_train, df_val)
)
df_te_train, df_te_val = (
    split.filter(pl.col("lang") == "te") for split in (df_train, df_val)
)
df_ko_train, df_ko_val = (
    split.filter(pl.col("lang") == "ko") for split in (df_train, df_val)
)

# Rows belonging to any of the three languages, per split and then stacked
# (train rows first, validation rows after).
df_arkote_train, df_arkote_val = (
    split.filter(pl.col("lang").is_in(["ar", "ko", "te"]))
    for split in (df_train, df_val)
)
df_arkote = pl.concat([df_arkote_train, df_arkote_val])

# Lookup table: corpus name -> split name -> DataFrame.
# "full" holds the unfiltered splits (every language in the dataset).
data = {
    "arabic": {
        "train": df_ar_train,
        "val": df_ar_val,
    },
    "telegu": {
        "train": df_te_train,
        "val": df_te_val,
    },
    "korean": {
        "train": df_ko_train,
        "val": df_ko_val,
    },
    "full": {
        "train": df_train,
        "val": df_val,
    },
}
# Last expression of the cell: preview the Arabic training rows.
df_ar_train.head()

## Process the data

### Examine the corpus stats

In [None]:
# Average character length of the "context" column over the combined
# Arabic/Korean/Telugu subset (df_arkote stacks train and validation rows).
context = df_arkote["context"]
avg_len = sum(map(len, context)) / len(context)
print(f"Average context length: {avg_len:.2f} characters")
print(f"Number of sequences (rows) in context: {len(context)}")

In [None]:
# Materialise the context column (presumably English text) as a list of strings.
context_corpus = df_arkote["context"].to_list()
# Vocabulary of whitespace-separated words (not tokenizer tokens).
context_vocab = {word for passage in context_corpus for word in passage.split()}
number_of_unique_words = len(context_vocab)
print(f"Number of unique (space seperated) words in context: {number_of_unique_words}")

### Tokenize corpus
Here we use the Multilingual BERT (mBERT) tokenizer. We use the same tokenizer throughout so that perplexities are comparable.
Each string entry may contain several sentences, but for simplicity we are going to treat each one as a single sequence, and use the inherent start- and end-of-sentence markers from mBERT.

In [None]:
# Load the uncased multilingual BERT tokenizer and register explicit
# sentence-boundary markers in its vocabulary.
mbert = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")
mbert.add_tokens(["<s>", "</s>"])

# Tokenize a single context entry (row 10) as a quick visual check.
# NOTE(review): `mbert` is not passed here, so _pad_and_tokenize falls back to
# its own default tokenizer — confirm that default matches the tokenizer above.
sample_content_tokens = _pad_and_tokenize(df_arkote["context"][10], n=5)
print("Sample content tokens (n=5):")
# Last expression of the cell: the tokens joined for display.
" | ".join(sample_content_tokens)

## N-Gram LM
First we explore some statistics of $n$, to pick the size we want

### Examine NGramDicts for context series for N={1, 2, 3, 4}

In [None]:
# Examine unigrams of the `context` series.
# NOTE: despite the `_train` suffix, `context` comes from df_arkote, which
# stacks the train and validation rows of the ar/ko/te subset.
n1_context_train = my_tokenize(context, n=1)
print("Getting unigrams...")
# verbose=True presumably makes the helper print summary statistics — confirm in ngrams.utils.
unigrams = get_ngrams_dict_from_sentences(n1_context_train, 1, verbose=True)

In [None]:
# Examine bigrams of the `context` series.
# NOTE: despite the `_train` suffix, `context` comes from df_arkote, which
# stacks the train and validation rows of the ar/ko/te subset.
n2_context_train = my_tokenize(context, n=2)
print("Getting bigrams...")
# verbose=True presumably makes the helper print summary statistics — confirm in ngrams.utils.
bigrams = get_ngrams_dict_from_sentences(n2_context_train, 2, verbose=True)

In [None]:
# Examine trigrams of the `context` series.
# NOTE: despite the `_train` suffix, `context` comes from df_arkote, which
# stacks the train and validation rows of the ar/ko/te subset.
n3_context_train = my_tokenize(context, n=3)
print("Getting trigrams...")
# verbose=True presumably makes the helper print summary statistics — confirm in ngrams.utils.
trigrams = get_ngrams_dict_from_sentences(n3_context_train, 3, verbose=True)

In [None]:
# Examine 4-grams of the `context` series.
# NOTE: despite the `_train` suffix, `context` comes from df_arkote, which
# stacks the train and validation rows of the ar/ko/te subset.
n4_context_train = my_tokenize(context, n=4)
print("Getting fourgrams...")
# verbose=True presumably makes the helper print summary statistics — confirm in ngrams.utils.
fourgrams = get_ngrams_dict_from_sentences(n4_context_train, 4, verbose=True)

### NGramModel from Scratch

## Verifying correctness of NGramModel
First we reproduce the probabilities from the example in the SLP book, then we verify our model against them.

In [None]:
# Toy three-sentence corpus used to check the model against the SLP book example.
mock_series = pl.Series(["I am Sam", "Sam I am", "I do not like green eggs and ham"])

# Cased tokenizer so that "I" and "Sam" keep their capitalisation;
# explicit sentence-boundary markers are added to its vocabulary.
cased_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
cased_tokenizer.add_tokens(["<s>", "</s>"])

# Pad and tokenize every sentence with n=2; the same token sequences are
# reused for both the unigram and the bigram counts.
ngram_ready_tokens = [
    _pad_and_tokenize(sentence, tokenizer=cased_tokenizer.tokenize, n=2)
    for sentence in mock_series
]
mock_data = ngram_ready_tokens
mock_nm1grams = get_ngrams_dict_from_sentences(mock_data, 1, verbose=True)
mock_ngrams = get_ngrams_dict_from_sentences(mock_data, 2, verbose=True)

# Show each padded token sequence on its own line.
for row in mock_data:
    print(" | ".join(row))

In [None]:
# Build the bigram LM from the mock unigram ((n-1)-gram) and bigram dicts.
# smoothing=None so the output can be compared with hand-computed values
# (presumably plain relative-frequency estimates — confirm in ngrams.models).
bigram_model = NGramLM(mock_nm1grams, mock_ngrams, smoothing=None)

In [None]:
# Score the sentence "I am Sam" with the mock bigram model.
mock_sentence = _pad_and_tokenize("I am Sam", tokenizer=cased_tokenizer.tokenize, n=2)
bigram_mock_sentence = list(nltk.ngrams(mock_sentence, 2))
# Expected value, assuming every word stays a single token and the sentence is
# padded with one <s> and one </s>:
#   P(I|<s>) * P(am|I) * P(Sam|am) * P(</s>|Sam) = 2/3 * 2/3 * 1/2 * 1/2 = 1/9 = 0.1111
print(bigram_model.get_sentence_probability(bigram_mock_sentence, verbose=True))

Count the number of words via n-grams (sanity check)

In [None]:
# Get N for bi, tri and fourgrams
#for n in [2, 3, 4]:
#    ngram_ready_tokens = [tokenize(seq, tokenizer=cased_tokenizer.tokenize, n=n) for seq in mock_series]
#    ngram_mock_sentences = [list(nltk.ngrams(seq, n)) for seq in ngram_ready_tokens]
#    print(f"Sample ngram: {ngram_mock_sentences[0] if ngram_mock_sentences else None}")
#    # Should match num tokens excluding <s>
#    print(f"Len of sample ngram (sanity check): {len(ngram_mock_sentences[0])}")
#    N = sum([len(sentence) for sentence in ngram_mock_sentences]) - len(ngram_mock_sentences)
#    print(f"N for {n}-grams: {N}")

In [None]:
# Perplexity of the mock example.
# Rebuild the padded token sequences and their bigram lists for every mock sentence.
ngram_ready_tokens = [
    _pad_and_tokenize(sentence, tokenizer=cased_tokenizer.tokenize, n=2)
    for sentence in mock_series
]
bigram_mock_sentences = [list(nltk.ngrams(tokens, 2)) for tokens in ngram_ready_tokens]
# NOTE(review): the call below scores only `bigram_mock_sentence` (singular,
# built in an earlier cell from "I am Sam"), not the `bigram_mock_sentences`
# list above. That matches the expected value stated here — confirm it is intended.
# Sequence ["<s>", "I", "am", "Sam", "</s>"] has probability 1/9 over 4
# predicted tokens (<s> is not counted), so the expected perplexity is
# (1/9)^(-1/4) = sqrt(3), approximately 1.732.
print(bigram_model.get_perplexity([bigram_mock_sentence]))

## Get Perplexity

In [None]:
# Train a Laplace-smoothed n-gram LM on each corpus' training split and report
# its perplexity on the matching validation split.
for n in [2]:
    print("=" * 70)
    print(f"Calculating {n}-gram LM perplexity for each of 4 corpora...")
    print("=" * 70 + "\n")
    for corpus in ["arabic", "telegu", "korean", "full"]:
        # Language-specific corpora are modelled on the "question" column;
        # the "full" corpus uses the "context" column.
        # Bug fix: the previous condition `not "full"` tested the truthiness of
        # a non-empty string literal (always False), so "context" was selected
        # for every corpus.
        col_name = "question" if corpus != "full" else "context"
        train, val = data[corpus]["train"][col_name], data[corpus]["val"][col_name]
        # Returns the (n-1)-gram dict, the n-gram dict and the prepared
        # validation sequences (for n=2: unigrams and bigrams).
        unigram, bigram, test = get_model_ready_data(train, val, n)
        n_gram_lm = NGramLM(unigram, bigram, smoothing="laplace")
        model_perplexity = n_gram_lm.get_perplexity(test)
        print(f"{corpus.capitalize()} {n}-gram LM perplexity: {model_perplexity:.2f}")
    print()