# Creation of a bigram model with a k-smoothing of 1 #

In [12]:
import nltk
nltk.download('treebank')
from nltk.corpus import treebank
from collections import defaultdict, Counter
import math
import random

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


### Print file length ###


In [13]:
files=treebank.fileids()
len(files)

199

### Print first sentence ###

In [14]:
treebank.sents(files[0])

[['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'], ['Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group', '.']]

### Split into training and test dataset ###

In [15]:
train_files = treebank.fileids()[:170]
test_files = treebank.fileids()[170:]

### Making sure that they are the correct length ###

In [16]:
print(len(train_files))
print(len(test_files))

170
29


### Create a vocab for words >3 ###
 

In [17]:
token_counter = Counter()

for file in train_files:
    for sent in treebank.sents(file):
        token_counter.update([token.lower() for token in sent])

unk_token = "<UNK>"
vocab = {token.lower() for token, count in token_counter.items() if count >= 3}



### Creates a list of bigrams with boundary markers from sentences in training files ###

In [18]:
train_bigrams = []
for file in train_files:
    for sent in treebank.sents(file):
        sent = ['<BOS>'] + [token.lower() if token.lower() in vocab else unk_token for token in sent] + ['<EOS>']
        train_bigrams.extend(nltk.bigrams(sent))


### Computing smoothed bigram probabilities  ###

In [29]:
k = 0.01
bigram_counts = defaultdict(Counter)
for bigram in train_bigrams:
    bigram_counts[bigram[0].lower()][bigram[1].lower()] += 1


bigram_smoothed_probs = defaultdict(Counter)
for w1 in bigram_counts:
    total_count = sum(bigram_counts[w1].values()) + k * len(vocab)
    for w2 in bigram_counts[w1]:
        bigram_smoothed_probs[w1][w2] = (bigram_counts[w1][w2] + k) / total_count

       
test_bigrams = []
test_bigram_count = 0
for file in test_files:
    for sent in treebank.sents(file):
        sent = ['<BOS>'] + [token.lower() if token.lower() in vocab else unk_token for token in sent] + ['<EOS>'] # Replace with UNK also
        test_bigrams.extend(nltk.bigrams(sent))
        test_bigram_count += len(sent) 
        

    


### Evaluating sum of ln prob ###

In [30]:
total_prob_sum=0.0        
ln_prob_sum = 0.0

for bigram in test_bigrams:
    w1, w2 = bigram
    w1_lower, w2_lower = w1.lower(), w2.lower()
    prob = bigram_smoothed_probs[w1_lower][w2_lower] if w2_lower in bigram_smoothed_probs[w1_lower] else (k / (sum(bigram_counts[w1_lower].values()) + k * len(vocab)))
    ln_prob_sum += math.log(prob)

### Print perplexity ###

In [31]:
perplexity = math.exp(-1 * (ln_prob_sum / test_bigram_count))
print(perplexity)

119.64154057015432


### Function to generate sentences based on 3 starting word of the model checking start with \<BOS > ####

In [23]:
def generate_sentence(test_bigrams, bigram_model, start_word):
    if ('<BOS>', start_word.lower()) not in [(bigram[0], bigram[1].lower()) for bigram in test_bigrams if bigram[0] == '<BOS>']:
        raise ValueError("The provided start_word should be the second word of a bigram where the first word is '<BOS>' in the bigram model.")
    
    sentence = [ start_word.lower()]
    while sentence[-1] != '<eos>':
        next_word_candidates = [word.lower() for word in list(bigram_model[sentence[-1]].keys())]
        next_word_probs = list(bigram_model[sentence[-1]].values())
        
        if not next_word_candidates:
            sentence.append('<EOS>')
            break
        
        next_word = random.choices(next_word_candidates, next_word_probs)[0]
        
        if next_word == '<unk>':
            continue
        
        sentence.append(next_word)
    
    return sentence[:-1] # exclude <EOS>


### Generate the sentences with starting words 'if', 'an, 'for' ###

The sentences do not appear to convey any meaningful information or follow a coherent narrative or theme. Also the pereplexity seems to be higher for the lowercase model which indicates faulty code or small-unusual dataset.

In [24]:
start_words = ['if', 'an', 'the']
generated_sentences = []

for start_word in start_words:
    generated_sentence = generate_sentence(test_bigrams, bigram_smoothed_probs, start_word.lower())
    generated_sentences.append(generated_sentence)
    print(' '.join(generated_sentence))

if after dealers ' report , merrill lynch , $ 130 million *u* ; 8 % 60 *u* for concern at which most english , dealers .
an estimated 0 they could not a way to a joint venture .
the chairman , giant montedison acquisition of *-4 to foreign country funds fell 20 % term as evidence of credit to veto at 98 *u* a leading political ties when mr. kaminski , comments did n't important as an american educators , vice minister for soviet union troubles , and homelessness is considering *-1 , with them -- like they `` something happen during a downturn reflects a few days ; iowa and white house 0 for the purchases *t*-1 .
