In [1]:
from pathlib import Path
import sys
import json
from nltk import sent_tokenize
import re

# Put the parent of the current working directory on the import path so the
# project-local imports below can be resolved (presumably that is where
# `config` and `utils` live).
# sys.path holds plain strings, so the membership test must use a string too:
# a Path object never compares equal to a str, which made the original check
# always true and re-inserted the entry every time this cell was run.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from config import settings
from utils.mex3 import get_vocabulary, to_token_sents, count_ngrams, add_bounds, split_train_dev, tune_lambdas, perplexity, generate, read_file_to_docs
from utils.trigram_lm import InterpTrigramLM

### Pre-processing of Corpus

In [2]:
# Read the training documents from the path configured as settings.OUTPUT_JSONL.
train_docs = read_file_to_docs(settings.OUTPUT_JSONL)
# Read the test documents the same way, from settings.OUTPUT_JSONL_TEST.
test_docs = read_file_to_docs(settings.OUTPUT_JSONL_TEST)


### Vocabulary Creation

In [3]:
# Vocabulary is derived from the training documents only.
# The pattern keeps word-character runs plus parentheses, full stops and commas.
# NOTE(review): to_token_sents is called later without this pattern; confirm that
# it tokenizes the same way, otherwise vocabulary and sentences can disagree.
token_pattern = r"\w+|\(|\)|\.|\,"
vocab = get_vocabulary(train_docs, pattern=token_pattern, min_freq=2)

vocab_length = len(vocab)
print("Vocabulary length:", vocab_length)

first_words = vocab[:30]
print("Sample words:", first_words)

Vocabulary length: 44925
Sample words: ['visceral', 'fat', 'more', 'than', 'subcutaneous', 'has', 'been', 'associated', 'with', 'numerous', 'age', 'related', 'problems', ',', 'such', 'as', 'insulin', 'resistance', 'chronic', 'inflammation', 'and', 'cardiac', 'diastolic', 'dysfunction', '.', 'computed', 'tomography', '(', 'ct', ')']


### Replacing out-of-vocabulary tokens with "<unk>" in the tokenized sentences

In [4]:
def apply_unk_sents(token_sents, vocab, unk_token="<unk>"):
    """Replace every token that is not in the vocabulary with an unknown marker.

    Args:
        token_sents: Iterable of sentences, each an iterable of tokens.
        vocab: Container of known tokens, tested with ``in``. A list or tuple
            is converted to a frozenset once up front so each lookup is
            constant-time instead of a linear scan per token.
        unk_token: Marker substituted for out-of-vocabulary tokens.
            Defaults to ``"<unk>"``, the value that was previously hard-coded.

    Returns:
        A new list of lists with the same sentence and token order as the
        input; the input sentences are not modified.
    """
    if isinstance(vocab, (list, tuple)):
        vocab = frozenset(vocab)
    return [
        [tok if tok in vocab else unk_token for tok in sent]
        for sent in token_sents
    ]

# Build the lookup set once; both corpora are mapped against the same
# (training-derived) vocabulary.
known_tokens = set(vocab)

train_token_sents = to_token_sents(train_docs)
train_sents = apply_unk_sents(train_token_sents, known_tokens)

test_token_sents = to_token_sents(test_docs)
test_sents = apply_unk_sents(test_token_sents, known_tokens)

### N-Gram Counts

In [5]:
# Carve a dev portion (dev_ratio=0.1, fixed seed) out of the <unk>-mapped
# training sentences; the remainder is what the model is estimated from.
tr_sents, dv_sents = split_train_dev(train_sents, dev_ratio=0.1, seed=42)

# N-gram counts come from the boundary-augmented training portion only.
bounded_tr_sents = add_bounds(tr_sents)
uni, bi, tri = count_ngrams(bounded_tr_sents)

# NOTE(review): the "+ 3" presumably covers <unk> plus the two sentence-boundary
# symbols added by add_bounds; confirm against utils.mex3 / InterpTrigramLM.
lm_vocab_size = len(vocab) + 3
lm = InterpTrigramLM(uni, bi, tri, vocab_size=lm_vocab_size)

### Tune lambdas

In [6]:
# Choose the interpolation weights using the dev sentences. The helper returns
# the selected weights and a second value that is reported below as the dev
# perplexity.
# NOTE(review): step=0.1 is presumably the search-grid spacing; confirm in
# utils.mex3.tune_lambdas.
lambdas, dev_pp = tune_lambdas(lm, dv_sents, step=0.1)

print("Best lambdas:", lambdas)

Best lambdas: (0.1, 0.2, 0.7)


### Perplexity and Text Generation

In [7]:
# Score the test sentences with the weights that were tuned on the dev split.
test_pp = perplexity(lm, test_sents, lambdas)
# Produce one sample of at most 30 tokens with the same weights.
# NOTE(review): seed=42 presumably makes the sample repeatable; confirm in
# utils.mex3.generate.
sample = generate(lm, lambdas, max_tokens=30, seed=42)

In [8]:
# Assemble the run summary first, then emit it one entry per line.
report_lines = (
    f"[INFO] Train sentences: {len(tr_sents)}, Dev: {len(dv_sents)}, Test: {len(test_sents)}",
    f"[INFO] Vocabulary size: {len(vocab)}",
    f"[INFO] Tuned lambdas (uni, bi, tri): {lambdas}",
    f"[INFO] Dev perplexity:  {dev_pp:.6f}",
    f"[INFO] Test perplexity: {test_pp:.6f}",
    f"[GEN ] {sample}",
)
for report_line in report_lines:
    print(report_line)

[INFO] Train sentences: 340373, Dev: 37820, Test: 2103
[INFO] Vocabulary size: 44925
[INFO] Tuned lambdas (uni, bi, tri): (0.1, 0.2, 0.7)
[INFO] Dev perplexity:  20.963202
[INFO] Test perplexity: 84.531898
[GEN ] however <unk> num <unk> that support calculating the annual <unk> num <unk> <unk> <unk> <unk> <unk> <unk> <unk> a <unk> g <unk> <unk> <unk> allow for the


### Summary

- ***λ (unigram, bigram, trigram): (0.1, 0.2, 0.7)*** - The model relies mostly on trigrams, with smaller contributions from bigrams and unigrams. This is typical when enough training data is available to support higher-order n-grams.
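- The LM fits held-out in-domain data well (dev PP ≈ 21). Note that the dev set is split from the training corpus and was also used to tune the λ values, so this figure is optimistic.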
- Test PP is substantially higher (≈ 84), which indicates domain mismatch or higher OOV rate in the test set compared to train/dev.
- Gap suggests test data contains rarer entities, more `<unk>` tokens, or contexts not well covered during training.
- Generation:
    - Model produces coherent sentence starts ("however … that support calculating the annual …").
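    - Heavy `<unk>` presence shows how frequent rare tokens are in the training corpus: every token below the `min_freq=2` vocabulary cutoff was mapped to `<unk>`, so the model assigns it a large share of probability mass. (Generation samples only from the training counts, so it does not by itself say anything about test/train mismatch.)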
    - `<num>` substitution works (numbers normalized; the placeholder shows up as `num` in the sample), and its frequency suggests numeric expressions are common in the training contexts the model learned from.

### Documents Used

In [12]:
import pandas as pd

# Read the CSV at settings.OUTPUT_CSV and print its `file_name` column as a
# plain Python list.
df = pd.read_csv(settings.OUTPUT_CSV)
file_names = df['file_name'].to_list()
print(file_names)

['5e13fbe9-d8c1-4403-a2f5-201dc10acfe3.json', '0fde8052-aad8-40ef-b332-c7f87ced0fe1.json', '02f05659-fdd4-493d-b597-0944dcbb55e0.json', '6cf15c74-e3fb-4261-8647-63d9f4c9c504.json', '28d5da6a-efd6-4f4a-9e71-2675be085c45.json', '244f2e7e-1b10-4bd3-8f96-3947457c1b36.json', '21084fb9-ec0a-4da5-81dc-8863c9920f98.json', '13fad9e0-27e0-4b64-a7de-8c502ca2e94b.json', '6c5bcceb-44c6-453d-98aa-d2686d23a0a1.json', '0e6e8c48-3398-4a4c-a2ac-433c05eaaf90.json', '639426b8-6fcd-4817-b30b-25b825ba0da1.json', '6cdf7468-0611-4e0d-af2a-66d99db49b86.json', '83301ff3-174f-4a6f-84ba-1ae95f0f3864.json', '50ca00a7-233b-48a3-bb1b-6e79575f2a00.json', '0bb165c5-eef0-48b3-8a32-958a7137947c.json', '5763f38f-da79-4cc8-9ada-b88d7485ebb0.json', '3e89c855-92b3-42f8-887f-4fd7a47d9017.json', '03cfdb79-43e0-4010-bfde-b3ddf9a2dfd1.json', '038b6f1a-48a3-4a82-bb4a-4eed469805b7.json', '0cb12a38-48e2-4fcc-8415-12d874f84e1d.json', '205cd1a2-0b21-4ebd-b781-f5c2e410cf40.json', '226b3064-b988-48af-aa16-442e9846fc25.json', '4afe6c2e

### Sources

- https://www.geeksforgeeks.org/nlp/n-gram-language-modelling-with-nltk/
- https://aabidkarim.hashnode.dev/training-a-basic-language-model-trigram-language-model
- https://medium.com/mti-technology/n-gram-language-models-70af02e742ad