In [2]:
import pandas as pd
import numpy as np
import string

# Language model (2nd order)

We will build a 2nd order language model based on Robert Frost's poems.

Since we are departing from the first-order Markov assumption, so that now $p(x_t \mid x_{t-1}, x_{t-2}) \ne p(x_t \mid x_{t-1})$ (the model is assumed to be second order), we will need 2 *initial* distributions: one for the first word and another one for the second word given the first.

We will also include a tag <END\> to identify (and predict) the end of a sentence.


In [3]:
def remove_puntuation(s: str) -> str:
    """Return `s` with every character listed in `string.punctuation` deleted."""
    # the third argument of maketrans lists the characters to drop
    deletion_table = str.maketrans("", "", string.punctuation)
    return s.translate(deletion_table)

def add2dict(d, k, v):
    """Append `v` to the list stored under `d[k]`, starting a new list when `k` is unseen."""
    d.setdefault(k, []).append(v)

In [62]:
# store the initial distribution (first word -> count)
initial = {}
# store the second word distribution (first word -> list of observed second words)
second_word = {}
# second order transitions ((word i-2, word i-1) -> list of observed next words / 'END')
transitions = {}


# loop through the file of poems; the `with` block guarantees the file handle
# is closed once the loop finishes (or fails)
with open("robert_frost.txt", 'r') as poems_file:
    for line in poems_file:
        tokens = remove_puntuation(line.rstrip().lower()).split()

        # length of sequence
        T = len(tokens)
        if T == 1:
            # NOTE: a one-word line only reaches the `i == 0` branch below, so it
            # updates `initial` but records nothing in `second_word`/`transitions`
            print(tokens)
        for i in range(T):
            t = tokens[i]
            if i==0:
                # keep counts of initial word
                initial[t] = initial.get(t, 0.) + 1
            else:
                t_1 = tokens[i-1]
                if i == T-1:
                    # add end of line
                    add2dict(transitions, (t_1, t), 'END')
                if i==1:
                    # add second word
                    add2dict(second_word, t_1, t)
                else:
                    # add regular words
                    t_2 = tokens[i-2]
                    add2dict(transitions, (t_2, t_1), t)

['today']
['crosslegged']
['no']
['done']
['never']
['it']
['speculation']


The above cell has an issue! There is a line that reads "done?", which, after removing punctuation, is tokenized as ["done"]. In the first iteration i == 0, but in this case i == T-1 also holds. However, because of the if/else structure inside the loop above, only the first branch (if i == 0) is executed, so no end-of-line transition is recorded for this word.

Now, the word "done" appears in the first position only once (in that one-word line), meaning that <second_word> will NOT contain 'done' as a key.

Therefore, if we happen to pick "done" as the first word when sampling (see below), we will get an error, because no transition out of "done" as a first word was ever recorded!

This happens potentially with these other cases also:

['today']

['crosslegged']

['no']

['done']

['never']

['it']

['speculation']

but two conditions are needed for this corner case to raise an error: the line must contain only one word, and that one-word line must be the only place where the word appears as the first word of a line.

Get initial probability distribution

In [64]:
# total number of first-word observations
initial_total = sum(initial.values())

# turn every first-word count into a probability, in place
for t in list(initial):
    c = initial[t]
    initial[t] = c / initial_total

# sanity check: the probabilities should add up to (approximately) 1
print('check that total probability = 1: ', sum(initial.values()))

check that total probability = 1:  0.9999999999999969


In [42]:
def list2pdict(ts):
    """
    Convert a list of observed tokens into a probability distribution.

    ts holds every token that was seen after a given context:
        - the previous word, for <second_word>
        - the previous two words, for <transitions>

    Returns a dict mapping each distinct token to its relative frequency
    in ts (keys keep first-occurrence order). An empty ts yields {}.
    """
    total = len(ts)
    counts = {}
    for token in ts:
        # tally how often each token follows the context
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    # relative frequencies = counts divided by the number of observations
    return {token: count / total for token, count in counts.items()}

def sample_word(d):
    """
    Draw one word from d, a dict mapping each word to its probability.

    A uniform number p0 in [0, 1) is drawn and the first word whose
    cumulative probability exceeds p0 is returned.

    Floating-point normalization can leave the probabilities summing to
    slightly less than 1 (e.g. 0.9999999999999969); if p0 falls in that tiny
    gap, the last word with positive probability is returned instead of
    failing.

    Raises:
        ValueError: if d has no word with positive probability, or if its
            probabilities fall short of 1 by more than rounding error.
    """
    p0 = np.random.random()
    cumulative = 0
    last_word = None
    has_candidate = False
    for t, p in d.items():
        # p could be very small so it could occur that p0 > p for every word;
        # comparing against the running total picks the first word that
        # crosses the threshold
        cumulative += p
        if p0 < cumulative:
            return t
        if p > 0:
            last_word = t
            has_candidate = True
    # only reachable when the probabilities add up to less than p0
    if has_candidate and 1.0 - cumulative < 1e-9:
        # shortfall is just rounding error: assign the leftover mass to the
        # last word that can actually occur
        return last_word
    raise ValueError(
        "sample_word needs a non-empty distribution whose probabilities sum to 1 "
        "(got total %r)" % (cumulative,)
    )

def generate_poem(n_lines=4):
    """
    Print n_lines sampled sentences (4 by default), one per line.

    Each sentence starts with a word drawn from <initial>, continues with a
    word drawn from <second_word> given the first, and then keeps drawing
    from <transitions> given the previous two words until 'END' is sampled.

    A first word with no entry in <second_word> (it was only ever seen as a
    one-word line) is printed as a complete sentence on its own instead of
    raising KeyError.
    """
    for _ in range(n_lines):
        # sample first word
        w0 = sample_word(initial)
        sentence = [w0]

        second_dist = second_word.get(w0)
        if second_dist is None:
            # w0 only occurred as a whole line: the sentence ends here
            print(" ".join(sentence))
            continue

        # sample & append second word
        w1 = sample_word(second_dist)
        sentence.append(w1)

        # keep sampling until the end-of-line tag is drawn
        while True:
            w2 = sample_word(transitions[(w0, w1)])
            if w2 == 'END':
                break
            sentence.append(w2)
            # slide the two-word context forward
            w0 = w1
            w1 = w2
        print(" ".join(sentence))

In [54]:
# Replace each list of observed next-words with its probability distribution.
# Entries that are no longer lists are skipped so that re-running this cell is
# harmless: passing an already-converted dict to list2pdict would iterate its
# keys and silently turn the distribution into a uniform one.
for t_1, ts in second_word.items():
    if isinstance(ts, list):
        second_word[t_1] = list2pdict(ts)

for k, ts in transitions.items():
    if isinstance(ts, list):
        transitions[k] = list2pdict(ts)

At this point <second_word> and <transitions> are dictionaries of dictionaries

In [55]:
# sample and print a four-line poem; the words are drawn at random
# (no seed is set in the cells shown here, so output presumably differs per run)
generate_poem()

two at a farm not far away
two roads diverged in a glass case as you may have shifted since i saw no window but that was not enough
two roads diverged in a byroad
two roads diverged in a glass case then


In [65]:
# display the first-word distribution (word -> probability after normalization)
initial

{'two': 0.005571030640668524,
 'and': 0.08983286908077995,
 'to': 0.034818941504178275,
 'then': 0.008356545961002786,
 'because': 0.0006963788300835655,
 'though': 0.004874651810584958,
 'had': 0.002785515320334262,
 'in': 0.0201949860724234,
 'oh': 0.002785515320334262,
 'yet': 0.0020891364902506965,
 'i': 0.08217270194986072,
 'somewhere': 0.0006963788300835655,
 'whose': 0.001392757660167131,
 'his': 0.004874651810584958,
 'he': 0.023676880222841225,
 'my': 0.004874651810584958,
 'between': 0.0020891364902506965,
 'the': 0.057103064066852366,
 'of': 0.0201949860724234,
 'but': 0.035515320334261836,
 'some': 0.003481894150417827,
 'from': 0.006963788300835654,
 'is': 0.003481894150417827,
 'natures': 0.0006963788300835655,
 'her': 0.001392757660167131,
 'so': 0.009052924791086351,
 'nothing': 0.001392757660167131,
 'when': 0.006267409470752089,
 'came': 0.0006963788300835655,
 'one': 0.00766016713091922,
 'proclaimed': 0.0006963788300835655,
 'smoothlaid': 0.0006963788300835655,
 'h