# Trigram language model with add-k smoothing (k = 0.01) #

In [1]:
import nltk
nltk.download('treebank')
from nltk.corpus import treebank
from collections import defaultdict, Counter
import math
import random

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


### Print the number of files in the corpus ###


In [2]:
# File identifiers of the NLTK treebank corpus; the cell displays how many there are.
files = treebank.fileids()
len(files)

199

### Print the sentences of the first file ###

In [3]:
# Tokenized sentences of the first corpus file.
first_file_id = files[0]
treebank.sents(first_file_id)

[['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'], ['Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group', '.']]

### Split into training and test dataset ###

In [4]:
# The first 170 files form the training set; everything after them is held out for testing.
split_index = 170
corpus_file_ids = treebank.fileids()
train_files, test_files = corpus_file_ids[:split_index], corpus_file_ids[split_index:]

### Making sure that they are the correct length ###

In [5]:
# Sanity check: one line per split, training size first, then test size.
for split in (train_files, test_files):
    print(len(split))

170
29


### Build the vocabulary: tokens that occur fewer than 3 times in the training data are replaced with \<UNK> ###
 

In [6]:
# Count every token occurrence across all training sentences.
token_counter = Counter()
for fileid in train_files:
    for sentence in treebank.sents(fileid):
        token_counter.update(sentence)

# Tokens below the frequency threshold are left out of the vocabulary;
# later cells map them to the unknown-word placeholder.
unk_token = "<UNK>"
min_count = 3
vocab = {word for word, freq in token_counter.items() if freq >= min_count}

### Creates a list of trigrams with boundary markers from sentences in training files ###

In [27]:
# Collect the trigrams of every training sentence after mapping
# out-of-vocabulary tokens to unk_token and adding boundary markers.
train_trigrams = []
for fileid in train_files:
    for raw_sentence in treebank.sents(fileid):
        mapped = [tok if tok in vocab else unk_token for tok in raw_sentence]
        padded = ['<BOS>', *mapped, '<EOS>']
        train_trigrams.extend(nltk.trigrams(padded))


### Computing smoothed trigram probabilities and collecting the test trigrams ###

In [28]:
k = 0.01

# Group the training trigrams by their two-word context:
# (w1, w2) -> Counter mapping each following word to its frequency.
trigram_counts = defaultdict(Counter)
for first, second, third in train_trigrams:
    trigram_counts[(first, second)][third] += 1

# Add-k estimate for every observed trigram:
#   P(w3 | w1, w2) = (count(w1, w2, w3) + k) / (count(w1, w2) + k * len(vocab))
# NOTE(review): len(vocab) presumably does not include '<UNK>' and '<EOS>', which can
# also occur as the third word, so a context's distribution may not sum to exactly 1
# -- confirm whether that is intended.
trigram_smoothed_probs = defaultdict(Counter)
for context, following_counts in trigram_counts.items():
    total_count = sum(following_counts.values()) + k * len(vocab)
    for word, count in following_counts.items():
        trigram_smoothed_probs[context][word] = (count + k) / total_count

# Build the test trigrams with the same preprocessing as the training data.
test_trigrams = []
test_trigram_count = 0
for fileid in test_files:
    for raw_sentence in treebank.sents(fileid):
        mapped = [tok if tok in vocab else unk_token for tok in raw_sentence]
        padded = ['<BOS>', *mapped, '<EOS>']
        test_trigrams.extend(nltk.trigrams(padded))
        # Length minus two: the token count without the <BOS>/<EOS> markers.
        test_trigram_count += len(padded) - 2
        


    


### Summing the natural-log probabilities of the test trigrams ###

In [29]:
# Accumulate the natural-log probability of every test trigram under the
# add-k smoothed model.
ln_prob_sum = 0.0
vocab_size = len(vocab)
for w1, w2, w3 in test_trigrams:
    context = (w1, w2)
    # Use .get() instead of indexing: both tables are defaultdicts, so indexing
    # them with a context that never occurred in training would insert an empty
    # entry and silently grow the trained model during evaluation.
    context_probs = trigram_smoothed_probs.get(context)
    if context_probs is not None and w3 in context_probs:
        prob = context_probs[w3]
    else:
        # Unseen trigram: its count is 0, so only the smoothing mass k remains
        # in the numerator. An unseen context contributes a total count of 0.
        context_counts = trigram_counts.get(context)
        seen_total = sum(context_counts.values()) if context_counts else 0
        prob = k / (seen_total + k * vocab_size)

    ln_prob_sum += math.log(prob)

### Print perplexity ###

In [30]:
# Perplexity is the exponential of the negated mean log-probability per test trigram.
mean_ln_prob = ln_prob_sum / test_trigram_count
perplexity = math.exp(-mean_ln_prob)
print(perplexity)

463.80467915524156


### Function that generates a sentence from a given starting word, using \<BOS> as the initial context ###

In [31]:
def generate_sentence(test_trigrams, trigram_smoothed_probs, start_word, unk_token='<UNK>', max_length=None):
    """Sample a sentence from the trigram model, beginning with ``start_word``.

    The initial context is ``('<BOS>', start_word)``. Each following word is drawn
    with ``random.choices``, weighted by its smoothed probability in the current
    two-word context. Generation stops when ``'<EOS>'`` is drawn, when the current
    context offers no candidate other than ``unk_token``, or when ``max_length``
    words have been produced.

    Args:
        test_trigrams: Not used by the function; kept so existing calls keep working.
        trigram_smoothed_probs: Mapping ``(w1, w2) -> {w3: probability}``. It is
            only read, never modified.
        start_word: First word of the generated sentence.
        unk_token: Token that is never emitted as a next word.
        max_length: Optional cap on the number of generated words. The start word
            is always kept. ``None`` (the default) means no cap.

    Returns:
        The generated words as a list, without the ``'<BOS>'``/``'<EOS>'`` markers.
    """
    generated_sentence = ['<BOS>', start_word]
    while generated_sentence[-1] != '<EOS>':
        # len - 1 is the number of words produced so far (excluding <BOS>).
        if max_length is not None and len(generated_sentence) - 1 >= max_length:
            break

        context = (generated_sentence[-2], generated_sentence[-1])
        # .get() rather than indexing: indexing a defaultdict with an unseen context
        # would insert an empty entry into the model, and a plain dict would raise.
        distribution = trigram_smoothed_probs.get(context, {})
        candidates = [word for word in distribution if word != unk_token]
        if not candidates:
            # Dead end: no usable continuation for this context.
            break

        weights = [distribution[word] for word in candidates]
        generated_sentence.append(random.choices(candidates, weights=weights)[0])

    if generated_sentence[-1] == '<EOS>':
        generated_sentence.pop()  # Remove <EOS>

    return generated_sentence[1:]  # Remove <BOS>

### Generate the sentences with starting words 'If', 'An', 'For' ###

The generated sentences are better than the bigram sentences, although still far from satisfying. The trigram model does not always find a possible next candidate, so a `break` is needed to avoid an infinite loop.

In [34]:
# Seed the global random generator so that re-running the notebook from a fresh
# kernel produces the same sampled sentences.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

start_words = ['If', 'An', 'For']
generated_sentences = []

# Sample one sentence per starting word, keep it, and print it as plain text.
for start_word in start_words:
    generated_sentence = generate_sentence(test_trigrams, trigram_smoothed_probs, start_word)
    generated_sentences.append(generated_sentence)
    print(' '.join(generated_sentence))

If , when terms are scheduled *-3 to cover all transaction costs .
An appeal is expected *-4 to be sold *-1 at $ 90 *U* a share , compared with $ 6 million *U* in 1990 .
For their part , this exclusive club has taken measures *-1 to accommodate Japanese business interests -LCB- in the fiscal year ending June 30 , 1990 .
