In [1]:
import nltk
import math
import random
import collections
import numpy as np
import pandas as pd

In [2]:
# Load Data

with open("Dataset/en_US.twitter.txt", "r", encoding="utf8") as f:
    data = f.read()
    
print("No of letters :", len(data))

No of letters : 3335477


In [3]:
# Pre Processing text data

def pre_processing(data):
    
    # String to Sentences
    sentences = data.split("\n")
    sentences = [text.strip() for text in sentences]
    sentences = [text for text in sentences if len(text) > 0]
    
    # Tokenize sentences into tokens
    tokenized_sentences = [nltk.word_tokenize(text.lower()) for text in sentences]
    
    return tokenized_sentences

pre_processing("Sky is blue.\nLeaves are green\nRoses are red.")

[['sky', 'is', 'blue', '.'],
 ['leaves', 'are', 'green'],
 ['roses', 'are', 'red', '.']]

In [4]:
# train and test sets

tokenized_data = pre_processing(data)

random.seed(69)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [5]:
# Helper Functions

def count_words(tokenized_data):
    
    data = []
    [data.extend(text_l) for text_l in tokenized_data]
    
    return dict(collections.Counter(data))

def get_closed_vocab(tokenized_data, threshold):
    
    vocab = count_words(tokenized_data)
    closed_vocab = set([word for word in vocab if vocab[word] >= threshold])
    
    return closed_vocab

def replace_oov(tokenized_data, closed_vocab):
    
    replaced_tokenized_data = [[word if word in closed_vocab else "<unk>" for word in sentence] for sentence in tokenized_data]
    return replaced_tokenized_data
    
print(count_words([['sky', 'is', 'blue', '.'], ['leaves', 'are', 'green', '.'], ['roses', 'are', 'red', '.']]))
print(get_closed_vocab([['sky', 'is', 'blue', '.'], ['leaves', 'are', 'green', '.'], ['roses', 'are', 'red', '.']], 2))
print(replace_oov([["dogs", "run"], ["cats", "sleep"]], {"dogs", "sleep"}))

{'sky': 1, 'is': 1, 'blue': 1, '.': 3, 'leaves': 1, 'are': 2, 'green': 1, 'roses': 1, 'red': 1}
{'are', '.'}
[['dogs', '<unk>'], ['<unk>', 'sleep']]


In [6]:
# Pre Processing tokenized_data

def pre_processing(train_data, test_data, threshold):
    
    vocab = get_closed_vocab(train_data, threshold)
    
    train_data_replaced = replace_oov(train_data, vocab)
    test_data_replaced = replace_oov(test_data, vocab)
    
    return train_data_replaced, test_data_replaced, vocab

In [7]:
# Pre Processing on train and test split

train_data_processed, test_data_processed, vocab = pre_processing(train_data, test_data, threshold=2)

### N-Gram

In [8]:
# Helper Functions 

def get_n_gram(data, n, start_token = "<s>", end_token = "<e>"):
    
    pairs = []
    
    for sentence in data:
        
        sentence = n * [start_token] + sentence + [end_token]
        
        for idx in range(len(sentence)-n+1):
            pairs.append(tuple(sentence[idx: idx+n]))
            
    return dict(collections.Counter(pairs))
        
# print("Uni-gram :", get_n_gram([['i', 'like', 'a', 'cat'], ['this', 'dog', 'is', 'like', 'a', 'cat']], 1))
# print("Bi-gram :", get_n_gram([['i', 'like', 'a', 'cat'], ['this', 'dog', 'is', 'like', 'a', 'cat']], 2))


def estimate_probability(word, previous_n_gram, n_gram, n_plus1_gram, vocabulary_size, k=1.0):
    
    previous_n_gram = tuple(previous_n_gram)
    
    string_tuple = previous_n_gram + (word, )
    
    numerator = n_plus1_gram.get(string_tuple, 0) + k
    denominator = n_gram.get(previous_n_gram, 0) + k * vocabulary_size
    
    probability = numerator / denominator
    
    return probability

def estimate_probabilities(previous_n_gram, n_gram, n_plus1_gram, vocab, k=1.0):
    
    previous_n_gram = tuple(previous_n_gram)
    
    vocab = vocab.union({"<e>", "<unk>"})
    vocab_size = len(vocab)
    
    probabilities = {}
    for word in vocab:
        probabilities[word] = estimate_probability(word, previous_n_gram, n_gram, n_plus1_gram, vocab_size, k=k)
    
    return probabilities

### Count and probability matrices

In [9]:
def get_count_matrix(n_plus1_gram, vocab):
    
    vocab = vocab.union({"<e>", "<unk>"})
    
    n_grams = list(set([val[0:-1] for val in n_plus1_gram.keys()]))
    
    row_index = {n_gram:i for i, n_gram in enumerate(n_grams)}
    col_index = {word: i for i, word in enumerate(vocab)}
    
    count_matrix = np.zeros((len(n_grams), len(vocab)))
    
    for key, value in n_plus1_gram.items():
        
        n_gram = key[0: -1]
        word = key[-1]
        
        if word not in vocab:
            continue
            
        i = row_index[n_gram]
        j = col_index[word]
        count_matrix[i][j] = value
        
    count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=list(vocab))
    return count_matrix

sentences = [['i', 'like', 'a', 'cat'], ['this', 'dog', 'is', 'like', 'a', 'cat']]
words = set(sentences[0] + sentences[1])
bigram = get_n_gram(sentences, 2)

get_count_matrix(bigram, words)

Unnamed: 0,like,i,<e>,<unk>,a,this,cat,is,dog
"(like,)",0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
"(is,)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>,)",0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"(cat,)",0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
"(i,)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(dog,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
"(a,)",0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
"(this,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
def get_probability_matrix(n_plus1_gram, vocab, k):
    
    count_matrix = get_count_matrix(n_plus1_gram, vocab)
    count_matrix += k
    
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    return prob_matrix

sentences = [['i', 'like', 'a', 'cat'], ['this', 'dog', 'is', 'like', 'a', 'cat']]
words = set(sentences[0] + sentences[1])
bigram = get_n_gram(sentences, 2)

get_probability_matrix(bigram, words, k=1)

Unnamed: 0,like,i,<e>,<unk>,a,this,cat,is,dog
"(like,)",0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909,0.090909
"(is,)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>,)",0.090909,0.181818,0.090909,0.090909,0.090909,0.181818,0.090909,0.090909,0.090909
"(cat,)",0.090909,0.090909,0.272727,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(i,)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(dog,)",0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.1
"(a,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909
"(this,)",0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.2


### Auto Complete System

In [11]:
def suggest_a_word(previous_tokens, n_gram, n_plus1_gram, vocab, k=1.0, start_with=None):
    
    n = len(list(n_gram.keys())[0]) 
    
    previous_n_gram = previous_tokens[-n:]
    
    probabilities = estimate_probabilities(previous_n_gram, n_gram, n_plus1_gram, vocab, k=k)
    
    suggestion = None
    max_prob = 0
    
    for word, prob in probabilities.items():
        
        if start_with != None:
            if not word.startswith(start_with):
                continue
        
        if prob > max_prob:
            suggestion = word            
            max_prob = prob

    return suggestion, max_prob

# sentences = [['i', 'like', 'a', 'cat'], ['this', 'dog', 'is', 'like', 'a', 'cat']]
# unique_words = set(sentences[0] + sentences[1])

# unigram_counts = get_n_gram(sentences, 1)
# bigram_counts = get_n_gram(sentences, 2)

# previous_tokens = ["i", "like"]

# suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0)
# suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0, start_with="c")

In [12]:
# Play ground 

previous_tokens = input("Enter Text : ").lower()
previous_tokens = nltk.word_tokenize(previous_tokens)

unigram = get_n_gram(train_data_processed, 1)
bigram = get_n_gram(train_data_processed, 2)

suggest_a_word(previous_tokens, unigram, bigram, vocab, k=1.0)

Enter Text : Hello World !


('<e>', 0.21491718889883618)

In [13]:
def choose_a_word(sentence, n_gram, n_plus1_gram, vocab, k=1.0, options=None):
    
    previous_tokens = sentence.lower()
    previous_tokens = nltk.word_tokenize(previous_tokens)
    
    n = len(list(n_gram.keys())[0]) 
    
    previous_n_gram = previous_tokens[-n:]
    
    probabilities = estimate_probabilities(previous_n_gram, n_gram, n_plus1_gram, vocab, k=k)
    
    output = {word: probabilities.get(word, -1) for word in options}
    return output

In [14]:
bigram = get_n_gram(train_data_processed, 2)
trigram = get_n_gram(train_data_processed, 3)


questions = """Mr Patrick is our new (principle/principal).
The company (excepted/accepted) all the terms.
Please don’t keep your dog on the (lose/loose).
The (later/latter) is my best friend.
I need some (stationary/stationery) products for my craftwork.
The actor (excepted/accepted) the Oscar.
I will call you (later/latter) in the evening.
Covid (affects/effects) the lungs.
The (council/counsel) of the ministers were sworn in yesterday.
Robert (too/to) wants to accompany us to the park.
Mia will (council/counsel) me about choosing fashion as my career.
The (bear/bare) at the zoo was very playful.
The sheep have a lot of (fur/far) that keeps them warm.
The hot spring is at the (furthest/ farthest) corner of the street.
Can you (advice/advise) me on how to study for exams?
The team will (loose/lose) the match if they don’t play well.
Can you go (to/too) the market for me?
The teachers asked the students to keep (quite/quiet).
The (heap/hip) of garbage should be cleaned immediately.
This is (there/their) house. """

questions = questions.split("\n")
for question in questions:
    
    question = question.replace(")", "(")
    question = question.split("(")
    
    print(choose_a_word(question[0], bigram, trigram, vocab, k=1.0, options = question[1].split("/")))

{'principle': 6.718172657037286e-05, 'principal': 6.718172657037286e-05}
{'excepted': -1, 'accepted': 6.738544474393532e-05}
{'lose': 6.419309282321222e-05, 'loose': 6.419309282321222e-05}
{'later': 6.739906989283548e-05, 'latter': 6.739906989283548e-05}
{'stationary': -1, 'stationery': -1}
{'excepted': -1, 'accepted': 6.739906989283548e-05}
{'later': 6.734006734006734e-05, 'latter': 6.734006734006734e-05}
{'affects': 6.739906989283548e-05, 'effects': 6.739906989283548e-05}
{'council': 6.739906989283548e-05, 'counsel': 6.739906989283548e-05}
{'too': 6.739906989283548e-05, 'to': 6.739906989283548e-05}
{'council': 6.739906989283548e-05, 'counsel': 6.739906989283548e-05}
{'bear': 6.739906989283548e-05, 'bare': 6.739906989283548e-05}
{'fur': -1, 'far': 6.691648822269808e-05}
{'furthest': -1, ' farthest': -1}
{'advice': 6.697923643670462e-05, 'advise': 6.697923643670462e-05}
{'loose': 6.739452756436177e-05, 'lose': 6.739452756436177e-05}
{'to': 0.0006722237160527024, 'too': 0.00013444474321

### Perplexity Score

In [15]:
def calculate_perplexity(sentence, n_gram, n_plus1_gram, vocab, k=1.0):
    
    n = len(list(n_gram.keys())[0]) 
    
    sentence = ["<s>"] * n + sentence + ["<e>"]
    sentence = tuple(sentence)
    
    N = len(sentence)
    
    product_pi = 1.0
    for i in range(n, N):

        list_t = sentence[i-n:i]    
        
        word = sentence[i]
        probability = estimate_probability(word, list_t, n_gram, n_plus1_gram, len(vocab), k=1)
        
        product_pi *= 1 / probability

    perplexity = product_pi**(1/float(N))
    
    return perplexity


bigram = get_n_gram(train_data_processed, 2)
trigram = get_n_gram(train_data_processed, 3)

calculate_perplexity(test_data_processed[69], bigram, trigram, vocab, k=1.0)

2247.9099690172125