In [1]:
import os
from tqdm.auto import tqdm
from tnqeet.data import train_dataset, test_dataset

In [2]:
def to_chars(text):
    """Render ``text`` as a string of space-separated character tokens.

    Each character for which ``str.isspace()`` is true is replaced by the
    literal token ``<SPACE>``; every other character is kept unchanged. The
    resulting tokens are joined with single spaces, so an empty input yields
    an empty string.
    """
    return ' '.join('<SPACE>' if ch.isspace() else ch for ch in text)

In [3]:
def write_dataset_to_file(dataset, filename):
    """Write one character-tokenised line per dataset item to ``filename``.

    Each item's ``'text'`` value is stripped, converted with ``to_chars`` and
    written as a single UTF-8 line.

    The data is first written to ``<filename>.partial`` and only renamed to
    ``filename`` once every item has been written. This matters because the
    cells that call this function skip the dump whenever ``filename`` already
    exists: writing in place would let an interrupted run leave a truncated
    file that later runs silently reuse.

    Args:
        dataset: iterable of mappings that each have a ``'text'`` string.
        filename: destination path of the finished dump.
    """
    partial_path = f"{filename}.partial"
    try:
        with open(partial_path, "w", encoding="utf-8") as f:
            for item in tqdm(dataset, desc=f"Writing {filename}"):
                f.write(to_chars(item['text'].strip()) + '\n')
        # Publish the finished file in one step (atomic on the same filesystem).
        os.replace(partial_path, filename)
    except BaseException:
        # Also covers KeyboardInterrupt from a stopped cell: never leave a
        # half-written file behind, then let the original error propagate.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

In [4]:
# Create the scratch directory for the dataset dumps without shelling out;
# exist_ok=True makes re-running this cell a no-op, like `mkdir -p`.
os.makedirs("tmp", exist_ok=True)

In [5]:
# Dump the training split only when no dump file exists yet; an existing
# file is reused as-is.
if not os.path.exists("tmp/ngrams_train_dataset.txt"):
    write_dataset_to_file(
        dataset=train_dataset,
        filename="tmp/ngrams_train_dataset.txt",
    )

In [6]:
# Dump the test split only when no dump file exists yet; an existing file
# is reused as-is.
if not os.path.exists("tmp/ngrams_test_dataset.txt"):
    write_dataset_to_file(
        dataset=test_dataset,
        filename="tmp/ngrams_test_dataset.txt",
    )

In [7]:
def train_lm(ngram, overwrite=False):
    """Train a KenLM model of order ``ngram`` and store it in binary form.

    Runs ``kenlm/build/bin/lmplz`` on ``tmp/ngrams_train_dataset.txt`` to
    produce an ARPA file, converts it with ``kenlm/build/bin/build_binary``,
    and finally deletes the ARPA file to save space. All paths are relative to
    the current working directory.

    Unlike a chain of ``!`` shell lines, every tool's exit status is checked:
    a failing ``lmplz`` no longer falls through to ``build_binary``, and the
    ARPA file is only removed after the binary was built successfully.

    Args:
        ngram: order of the n-gram model (passed to ``lmplz -o``).
        overwrite: when False (default) and the binary model already exists,
            training is skipped.

    Returns:
        None.

    Raises:
        subprocess.CalledProcessError: if a KenLM tool exits with a non-zero
            status.
        OSError: if a KenLM executable cannot be started.
    """
    import subprocess  # local import keeps this cell runnable on its own

    models_dir = "tnqeet/dotting_models/ngrams/trained_models"
    arpa_path = f"{models_dir}/ngrams_{ngram}.arpa"
    binary_path = f"{models_dir}/ngrams_{ngram}.binary"

    if not overwrite and os.path.exists(binary_path):
        print(f"Language model for ngram {ngram} already exists. Skipping training.")
        return

    def run_tool(command):
        # Stream the tool's stdout and stderr into the cell output while it
        # runs, then fail loudly on a non-zero exit status.
        with subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            errors="replace",
        ) as proc:
            for line in proc.stdout:
                print(line, end="")
        if proc.returncode != 0:
            raise subprocess.CalledProcessError(proc.returncode, command)

    os.makedirs(models_dir, exist_ok=True)

    lmplz_command = [
        "kenlm/build/bin/lmplz",
        "-o", str(ngram),
        "--discount_fallback",
        "--text", "tmp/ngrams_train_dataset.txt",
        "--arpa", arpa_path,
    ]
    if ngram > 2:
        # Pruning thresholds are only passed for orders above 2.
        # NOTE(review): presumably because three thresholds need at least a
        # trigram model - confirm against the lmplz documentation.
        lmplz_command += ["--prune", "0", "0", "1"]

    run_tool(lmplz_command)
    run_tool(["kenlm/build/bin/build_binary", arpa_path, binary_path])

    # drop the arpa file to save space (reached only if both tools succeeded)
    os.remove(arpa_path)

In [8]:
def get_perplexity_and_OOVs(model_path):
    """Score the test dump with a KenLM model and return its summary figures.

    Runs ``kenlm/build/bin/query`` with ``tmp/ngrams_test_dataset.txt`` on
    stdin, stores the tool's stdout in
    ``tnqeet/dotting_models/ngrams/test_results_kenlm_output/<model_name>.txt``
    and reads the summary values back from that file. ``<model_name>`` is the
    last ``/``-separated component of ``model_path`` without a trailing
    ``.binary`` or ``.arpa``.

    The summary values are located by their labels rather than by line
    position, and the results file is scanned line by line instead of being
    loaded into memory as a whole.

    Args:
        model_path: path of the model file handed to ``query``.

    Returns:
        Tuple ``(perplexity_with_OOVs, perplexity_without_OOVs,
        counts_of_OOVs)`` as ``(float, float, int)``.

    Raises:
        RuntimeError: if ``query`` exits with a non-zero status.
        ValueError: if a summary line is missing from the query output or its
            value cannot be parsed as a number.
        OSError: if the test dump or the ``query`` executable is missing.
    """
    import subprocess  # local import keeps this cell runnable on its own

    # calculate and dump to a file
    model_name = model_path.split('/')[-1].removesuffix('.binary').removesuffix('.arpa')
    results_dir = "tnqeet/dotting_models/ngrams/test_results_kenlm_output"
    results_path = f"{results_dir}/{model_name}.txt"
    os.makedirs(results_dir, exist_ok=True)

    with open("tmp/ngrams_test_dataset.txt", "rb") as test_file, \
            open(results_path, "wb") as results_file:
        query = subprocess.run(
            ["kenlm/build/bin/query", model_path],
            stdin=test_file,
            stdout=results_file,
            stderr=subprocess.PIPE,
        )
    # Show the tool's diagnostics (written to stderr) in the cell output.
    print(query.stderr.decode("utf-8", errors="replace"), end="")
    if query.returncode != 0:
        raise RuntimeError(
            f"kenlm query failed with exit status {query.returncode} for model {model_path}"
        )

    print('model name is:', model_name)

    # collect: remember the last line that starts with each summary label
    labels = (
        "Perplexity including OOVs:",
        "Perplexity excluding OOVs:",
        "OOVs:",
    )
    summary = {}
    with open(results_path, "r", encoding="utf-8") as f:
        for line in f:
            for label in labels:
                if line.startswith(label):
                    summary[label] = line[len(label):].strip()

    missing = [label for label in labels if label not in summary]
    if missing:
        raise ValueError(
            f"kenlm query output {results_path} has no summary line(s): {missing}"
        )

    perplexity_with_OOVs = float(summary["Perplexity including OOVs:"])
    perplexity_without_OOVs = float(summary["Perplexity excluding OOVs:"])
    counts_of_OOVs = int(summary["OOVs:"])

    return perplexity_with_OOVs, perplexity_without_OOVs, counts_of_OOVs

In [9]:
# Train (or reuse) one model per n-gram order from 2 through 15 and print
# the perplexity figures measured on the test dump for each of them.
for gram in range(2, 16):
    print(f'Training ngram model for {gram=}')
    train_lm(gram)
    perplexity_with_OOVs, perplexity_without_OOVs, counts_of_OOVs = get_perplexity_and_OOVs(
        model_path=f"tnqeet/dotting_models/ngrams/trained_models/ngrams_{gram}.binary",
    )
    # A single newline-separated print emits the same six report lines.
    print(
        '*'*120,
        f"calculated perplexity for {gram=}",
        '='*120,
        f'{perplexity_with_OOVs=}, {perplexity_without_OOVs=}, {counts_of_OOVs=}',
        '='*120,
        '*'*120,
        sep='\n',
    )

Training ngram model for gram=2
Language model for ngram 2 already exists. Skipping training.


This binary file contains probing hash tables.
Name:query	VmPeak:8688 kB	VmRSS:4512 kB	RSSMax:5248 kB	user:0.550673	sys:0.143653	CPU:0.694378	real:0.677878
model name is: ngrams_2
************************************************************************************************************************
calculated perplexity for gram=2
perplexity_with_OOVs=19.298962904664844, perplexity_without_OOVs=19.297093650302678, counts_of_OOVs=24
************************************************************************************************************************
Training ngram model for gram=3
Language model for ngram 3 already exists. Skipping training.
This binary file contains probing hash tables.
Name:query	VmPeak:13208 kB	VmRSS:4752 kB	RSSMax:10008 kB	user:0.609438	sys:0.14835	CPU:0.757852	real:0.743567
model name is: ngrams_3
************************************************************************************************************************
calculated perplexity for gram=3
perplexity_wi