# Creation of a trigram model with a k-smoothing of 0.01 #

In [35]:
import nltk
nltk.download('treebank')
from nltk.corpus import treebank
from collections import defaultdict, Counter
import math
import random

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


### Print file length ###


In [36]:
files=treebank.fileids()
len(files)

199

### Print first sentence ###

In [37]:
treebank.sents(files[0])

[['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'], ['Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group', '.']]

### Split into training and test dataset ###

In [38]:
train_files = treebank.fileids()[:170]
test_files = treebank.fileids()[170:]

### Making sure that they are the correct length ###

In [39]:
print(len(train_files))
print(len(test_files))

170
29


### Create a vocab for words >3 ###
 

In [40]:
token_counter = Counter()

for file in train_files:
    for sent in treebank.sents(file):
        token_counter.update([token.lower() for token in sent])
        

unk_token = "<UNK>"
vocab = {token.lower() for token, count in token_counter.items() if count >= 3}


### Creates a list of trigrams with boundary markers from sentences in training files ###

In [41]:
train_trigrams = []
for file in train_files:
    for sent in treebank.sents(file):
        sent = ['<BOS>'] + [token.lower() if token.lower() in vocab else unk_token for token in sent] + ['<EOS>']
        train_trigrams.extend(nltk.trigrams(sent))


### Computing smoothed trigram probabilities  ###

In [42]:
k=1
# Count trigrams in the training data
trigram_counts = defaultdict(Counter)
for trigram in train_trigrams:
    trigram_counts[tuple(t.lower() for t in trigram[:-1])][trigram[-1].lower()] += 1

# Calculate smoothed probabilities for trigrams
trigram_smoothed_probs = defaultdict(Counter)
for w1_w2 in trigram_counts:
    total_count = sum(trigram_counts[w1_w2].values()) + k * len(vocab)
    for w3 in trigram_counts[w1_w2]:
        trigram_smoothed_probs[w1_w2][w3] = (trigram_counts[w1_w2][w3] + k) / total_count

test_trigrams = []
test_trigram_count = 0
for file in test_files:
    for sent in treebank.sents(file):
        sent = ['<BOS>'] + [token.lower() if token.lower() in vocab else unk_token for token in sent] + ['<EOS>']
        test_trigrams.extend(nltk.trigrams(sent))
        test_trigram_count += len(sent) 
        


    


### Evaluating sum of ln prob ###

In [43]:
total_prob_sum = 0.0
ln_prob_sum = 0.0

for trigram in test_trigrams:
    w1, w2, w3 = trigram
    w1_lower, w2_lower, w3_lower = w1.lower(), w2.lower(), w3.lower()

    prob = trigram_smoothed_probs[(w1_lower, w2_lower)][w3_lower] if w3_lower in trigram_smoothed_probs[(w1_lower, w2_lower)] else (k / (sum(trigram_counts[(w1_lower, w2_lower)].values()) + k * len(vocab)))
    ln_prob_sum += math.log(prob)

### Print perplexity ###

In [44]:
perplexity = math.exp(-1 * (ln_prob_sum / test_trigram_count))
print(perplexity)

857.1584334159471


### Function to generate sentences based on 3 starting word of the model checking start with \<BOS > ####

In [45]:
def generate_sentence(test_trigrams, trigram_smoothed_probs, start_word, max_length=60, unk_token='<unk>'):
    generated_sentence = ['<bos>', start_word]
    while len(generated_sentence) < max_length and generated_sentence[-1] != '<eos>':
        w1, w2 = generated_sentence[-2], generated_sentence[-1]
        next_word_candidates = list(trigram_smoothed_probs[(w1, w2)].keys())
        next_word_candidates = [word for word in next_word_candidates if word != unk_token]  # Filter out <unk> token
        
        if next_word_candidates:
            next_word_probs = [trigram_smoothed_probs[(w1, w2)][word] for word in next_word_candidates]
            next_word = random.choices(next_word_candidates, weights=next_word_probs)[0]
        else:
            break  # If there are no valid next word candidates, stop generating the sentence
        
        generated_sentence.append(next_word)
    
    if generated_sentence[-1] == '<eos>':
        generated_sentence.pop()  # Remove <EOS> token from the generated sentence
    
    return generated_sentence[1:]  # Exclude the <BOS> token from the output

### Generate the sentences with starting words 'if', 'an, 'for' ###

The generated sentences are better than bigram sentences,although not satisfying at all, but the trigram does not always find a possible next canditate so the use of break is needed to not create an infinite loop. Also the pereplexity seems to be higher for the lowercase model which indicates faulty code or small-unusual dataset.

In [34]:
start_words = ['if', 'an', 'for']
generated_sentences = []

for start_word in start_words:
    generated_sentence = generate_sentence(test_trigrams, trigram_smoothed_probs, start_word)
    generated_sentences.append(generated_sentence)
    print(' '.join(generated_sentence))

if president bush *?* , as members here always have *?* historically .
an index of economic news had little effect on consumers , '' which he declined *-2 to be a power of excision over unconstitutional conditions in legislation that *t*-1 produced the ford
for 10 years ago , nearly the same sounds that the japanese term for the military
