# Creation of a bigram model with a k-smoothing of 0.1 #

In [1]:
import nltk
nltk.download('treebank')
from nltk.corpus import treebank
from collections import defaultdict, Counter
import math
import random

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


### Print file length ###


In [2]:
files=treebank.fileids()
len(files)

199

### Print first sentence ###

In [3]:
treebank.sents(files[0])

[['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'], ['Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group', '.']]

### Split into training and test dataset ###

In [4]:
train_files = treebank.fileids()[:170]
test_files = treebank.fileids()[170:]

### Making sure that they are the correct length ###

In [5]:
print(len(train_files))
print(len(test_files))

170
29


### If word count in vocabulary <3 change with \<UNK> ###
 

In [6]:
token_counter = Counter()
for file in train_files:
    for sent in treebank.sents(file):
        token_counter.update([token for token in sent])

unk_token = "<UNK>"
vocab = {token for token, count in token_counter.items() if count >= 3}


###  Creates bigrams using \<BOS> and \<EOS> to ensure correct bigram creation ###

In [7]:
train_bigrams = []
for file in train_files:
    for sent in treebank.sents(file):
        sent = ['<BOS>'] + [token if token in vocab else unk_token for token in sent] + ['<EOS>']
        train_bigrams.extend(nltk.bigrams(sent))

### Bigram Language Model with Add-k Smoothing  ###

In [14]:
k = 1
bigram_counts = defaultdict(Counter)
for bigram in train_bigrams:
    bigram_counts[bigram[0]][bigram[1]] += 1

bigram_smoothed_probs = defaultdict(Counter)
for w1 in bigram_counts:
    total_count = sum(bigram_counts[w1].values()) + k * len(vocab)
    for w2 in bigram_counts[w1]:
        bigram_smoothed_probs[w1][w2] = (bigram_counts[w1][w2] + k) / total_count
        
test_bigrams = []
test_bigram_count = 0
for file in test_files:
    for sent in treebank.sents(file):
        sent = ['<BOS>'] + [token if token in vocab else unk_token for token in sent] + ['<EOS>']
        test_bigrams.extend(nltk.bigrams(sent))
        test_bigram_count += len(sent) - 2  # Subtract 2 because <BOS> and <EOS>

    


### Evaluating test data log probability ###

In [13]:
ln_prob_sum = 0.0
for bigram in test_bigrams:
    w1, w2 = bigram
    prob = bigram_smoothed_probs[w1][w2] if w2 in bigram_smoothed_probs[w1] else (k / (sum(bigram_counts[w1].values()) + k * len(vocab)))
    ln_prob_sum += math.log(prob)

### Print perplexity ###

In [11]:
perplexity = math.exp(-1 * (ln_prob_sum / test_bigram_count))
print(perplexity)

486.42766484323977


### Function to generate sentences based on starting word of the model checking start with \<BOS > ####

In [16]:
def generate_sentence(test_bigrams, bigram_model, start_word):
    if ('<BOS>', start_word) not in [(bigram[0], bigram[1]) for bigram in test_bigrams if bigram[0] == '<BOS>']:
        raise ValueError("The provided start_word should be the second word of a bigram where the first word is '<BOS>' in the bigram model.")
    
    sentence = [start_word]
    while sentence[-1] != '<EOS>':
        next_word_candidates = list(bigram_model[sentence[-1]].keys())
        next_word_probs = list(bigram_model[sentence[-1]].values())
        next_word = random.choices(next_word_candidates, next_word_probs)[0]
        
        if next_word == '<UNK>':
            continue
        
        sentence.append(next_word)
    
    return sentence[:-1] # exclude <EOS>

### Generate the sentences with starting words 'If', 'An, 'For' ###

The sentences do not appear to convey any meaningful information or follow a coherent narrative or theme. 

In [18]:
start_words = ['If', 'An', 'For']
generated_sentences = []

for start_word in start_words:
    generated_sentence = generate_sentence(test_bigrams,bigram_smoothed_probs, start_word)
    generated_sentences.append(generated_sentence)
    print(' '.join(generated_sentence))

If those returns are in their stock when the prospects .
An equal fiscal 1990 would result , Wall Street , says *T*-1 * to be able *-1 with its own tax .
For the Old Guard -- even in Article II '' Chaplin 's and the Foreign Ministry official at $ 75 million *U* -RRB- in question now , construction , also could get *T*-1 ? '' said 0 *T*-1 open a default to a great purpose , to compromise measure the banks said 0 he 's car purchase grain supply of specialty retail or history , a mental illness or at Greenville in intellectual-property rights of small American member of a recent example , regulatory capital .
