# Homework 3: Language models

CS/Ling 581  
Spring 2024

In [139]:
!pip install -q pytest ipytest

In [140]:
%precision 4

import numpy as np
import gzip
from cytoolz import concat, sliding_window
from collections import Counter

In [141]:
import pytest

try:
    get_ipython()

    import ipytest

    ipytest.autoconfig()

    def init_test():
        ipytest.clean()

    def run_test():
        ipytest.run()

except NameError:

    def init_test():
        pass

    def run_test():
        pass

## Trigram language model

We'll load some data and use it to train a simple trigram language model:

In [142]:
def read_corpus(filename):
    return [line.lower().split() for line in gzip.open(filename)]


sentences = read_corpus("bnc_train.txt.gz")
sentences_train = sentences[:175000]
sentences_test = sentences[175000:]

In [143]:
class TrigramLM:
    def __init__(self, alpha):
        self.alpha = alpha

    def preprocess(self, sentence):
        """Normalize sentence and add filler tokens <s> and </s>"""
        return ["<s>", "<s>"] + [w.lower() for w in sentence] + ["</s>","</s>"]

    def get_unigram_counts(self, train_corpus):
        self.unigrams = Counter(concat(train_corpus))

    def get_bigram_counts(self, train_corpus):
        self.bigrams = Counter(sliding_window(2, concat(train_corpus)))

    def get_trigram_counts(self, train_corpus):
        self.trigrams = Counter(sliding_window(3, concat(train_corpus)))

    def train(self, train_corpus):
        """Count bigram and unigram frequencies in the training corpus."""
        train_corpus = [self.preprocess(sentence) for sentence in train_corpus]
        self.get_unigram_counts(train_corpus)
        self.get_bigram_counts(train_corpus)
        self.get_trigram_counts(train_corpus)
        self.V = len(self.unigrams)

    def log_prob(self, sentence):
        """Calculate the log_2 probability of a sentence given the model."""
        p = 0.0
        try:
            for (w1, w2, w3) in sliding_window(3, self.preprocess(sentence)):
                p = (
                        p
                        + np.log2(self.trigrams[w1, w2, w3] + self.alpha)
                        - np.log2(self.bigrams[w1, w2] + self.alpha * self.V)
                )
            return p
        except ZeroDivisionError:
            return 0.0

In [144]:
lm = TrigramLM(alpha=1000)
lm.train(sentences_train)
lm.log_prob('Testing'.split())

-50.0718

---

## Problem 1: Perplexity

Define a function that calculates the perplexity of a model on a corpus (= a list of sentences). Use the definition of perplexity given in section 3.2.1 of Jurafsky and Martin.

In [145]:
def perplexity(model, sentenceGroup):
    sumLogProbability, wordCount = 0, 0

    for sentence in sentenceGroup:
        tokens = model.preprocess(sentence)
        wordCount += len(tokens) - 3

        sentenceLogProbability = model.log_prob(sentence)
        sumLogProbability += sentenceLogProbability

    averageNegativeLogProbability = -sumLogProbability / wordCount

    return np.exp2(averageNegativeLogProbability)

In [146]:
lm = TrigramLM(alpha=0.1)
lm.train(sentences_train)
perplexity(lm, sentences_test[:100])

15418.3348

In [147]:
%%ipytest

@pytest.fixture(scope="module")
def my_trigram_lm():
    lm = TrigramLM(alpha=0.1)
    lm.train(sentences_train)
    return lm


@pytest.mark.parametrize(
    "sentence,logprob", [(sentences_train[0], -211.8753), (sentences_test[0], -86.1634)]
)
def test_trigram_logprob(my_trigram_lm, sentence, logprob):
    assert my_trigram_lm.log_prob(sentence) == pytest.approx(logprob, rel=1e-3)


@pytest.mark.parametrize(
    "sentences,perplex",
    [(sentences_train[:100], 3773.9770), (sentences_test[:100], 15418.3348)],
)
def test_trigram_perplexity(my_trigram_lm, sentences, perplex):
    assert perplexity(my_trigram_lm, sentences) == pytest.approx(perplex, rel=1e-3)

[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                                                         [100%][0m
[32m[32m[1m4 passed[0m[32m in 4.04s[0m[0m


---

## Problem 2: Smoothing

In the definition for `TrigramLM`, `alpha` is the smoothing parameter. What is the best value to use? Try building models with different values for `alpha` and compute their perplexity on both `sentences_train[:500]` and `sentences_test[:500]`. For `alpha` values, try different powers of 10 (e.g., `[1e-5, 1e-4, 1e-3, 1e-2, 1e-1]`). What patterns do you see and what might account for them? What's the best value of `alpha`?

For the values of the alpha, I tried different values such as 1e-5, 1e-2 and 1e-1 and noticed as alpha increases, then the train perplexity increases while the test perplexity increases and decreases. This pattern might be accounted for by the value of alpha as alpha increases, or the smoothing value increases, then the less predictability the model becomes or the train perplexity increases. For the test perplexity, the value increases and decreases according to the value of alpha and this could be due to specific alpha value being a more accurate fit for the model. An example that could be made would be increasing alpha or increasing smoothing, creating more generalizations with the data and less predicatability with the text. The best value for alpha is 0.01 as it has the train perplexity of 35.692 and a gap of test perplexity of 2620.16. The gap showcases no overfitting created from the data and an accurate fit for smoothing that isn't undersmoothing/oversmoothing.

In [148]:
smoothing_params = [100**-exp for exp in range(5, 0, -1)] 
results = {}

In [149]:
for alpha in smoothing_params:
    model = TrigramLM(alpha)
    model.train(sentences_train[:500])
    train_perplexity = perplexity(model, sentences_train[:500])
    test_perplexity = perplexity(model, sentences_test[:500])
    results[alpha] = (train_perplexity, test_perplexity)

for alpha, (train_perplexity, test_perplexity) in results.items():
    print(f"Alpha: {alpha}, Train Perplexity: {train_perplexity}, Test Perplexity: {test_perplexity}")

Alpha: 1e-10, Train Perplexity: 1.7681421648241777, Test Perplexity: 80604.74032165066
Alpha: 1e-08, Train Perplexity: 1.7681859970423177, Test Perplexity: 29523.74979813698
Alpha: 1e-06, Train Perplexity: 1.7725681034526346, Test Perplexity: 10817.847259898635
Alpha: 0.0001, Train Perplexity: 2.201090919491455, Test Perplexity: 4095.120954266784
Alpha: 0.01, Train Perplexity: 35.692763570531405, Test Perplexity: 2620.167290134109


## Problem 3: Random sampling

Write a function that generates a random sentence by sampling from a trigram language model (see sections 3.3 and 3.4 in Jurafsky & Martin).

Here's the basic approach you should take: Every sentence will start with the start symbols `<s> <s>`. The language model gives us the conditional probability of each possible word given that context:
$$P(w_1|\texttt{<s> <s>})=\frac{C(\texttt{<s> <s> } w_1)+\alpha}{C(\texttt{<s> <s>})+\alpha V}$$
Pick word $w_1$ at random by drawing from this distribution. Let's say the word you pick is `disgruntled`. Now the probability of any word being the next word in the sentence is:
$$P(w_2|\texttt{<s> disgruntled})=\frac{C(\texttt{<s> disgruntled } w_2)+\alpha}{C(\texttt{<s> disgruntled})+\alpha V}$$
Pick word $w_2$ at random by drawing from this new distribution. Keep going like this until you've picked 50 words or the next word is `</s>` (and the sentence is finished), whichever comes first.

To do the random picking, use the function `multinomial` defined below. It takes a dictionary mapping words to probabilities and chooses one word at random using the method shown in Figure 3.3 in Jurafsky & Martin. 




In [159]:
import random

def generate(lm):
    sentence = ['<s>', '<s>']
    while len(sentence) < 52:
        context = (sentence[-2], sentence[-1])

        probs = {w: lm.log_prob(context + (w,)) for w in lm.unigrams}
        totalProb = sum(probs.values())
        normalizedProbs = {w: p / totalProb for w, p in probs.items()}

        nextWord = random.choices(list(normalizedProbs.keys()), weights=normalizedProbs.values(), k=1)[0]
        if nextWord == '</s>':
            break
        sentence.append(nextWord)

    return ' '.join([w.decode('utf-8') if isinstance(w, bytes) else w for w in sentence[2:]])

In [None]:
smoothing_params = [100**-exp for exp in range(5, 0, -1)] 
results = {}
for alpha in smoothing_params:
    print(f"alpha={alpha}")
    lm = TrigramLM(alpha=alpha)
    lm.train(sentences_train[:500])
    print(generate(lm))

alpha=1e-10
narrow veto sofa vertical sports galleries classes levels styles owners 1988 controversy half holding normal 's room cotman aquarium across pp. mount blessed edwardian suffolk dampers lost ! solid talks broad bristol strengthening bacteria areas periodicals burmans manufacturer gallons killinghall speak automatic exhibit power jumped road hard infant activators cheltenham
alpha=1e-08


Try sampling from language models trained using the same data but with different values for `alpha`. What effect does `alpha` have on the sentences you get?

The bigger value that alpha is, the more smoothing the testing words will recieve. However, the greater the value of alpha will cause cases of over-smoothing where some data will be incoherent. From the example, I used 0.01 and 10000 and found that the higher the value of alpha, the more smoothing while vice versa for the smaller the value of alpha gets. From the text, I found that alpha creates more coherent sentences as alpha reaches a certain value or level of smoothing from the model and test data.