In [1]:
import numpy as np
import pickle

In [2]:
with open("data_and_vocab.p", "rb") as f:
    train_data, test_data, vocab = pickle.load(f)

In [3]:
len(train_data)

42577

In [4]:
def get_n_gram(sentences, n_gram):

    n_gram_count = {}

    for sent in sentences:
        s = ["<s>"] * (n_gram - 1) + sent + ["<e>"]

        s = tuple(s)

        m = len(s) if n_gram == 1 else len(s) - (n_gram - 1)

        for i in range(m):
            n_gram_count[s[i:i+n_gram]] = n_gram_count.get(s[i:i+n_gram], 0) + 1

    return n_gram_count

In [5]:
bigram_counts = get_n_gram(train_data, 2)
unigram_count = get_n_gram(train_data, 1)

In [6]:
def estimate_probability(word, previous_n_grams, n_gram_counts, n_plus1_gram_counts, vocab_size, k = 1.0):
    prev = tuple(previous_n_grams)

    denominator = n_gram_counts.get(prev, 0) + k * vocab_size

    ngram = prev + (word,)

    numerator = n_plus1_gram_counts.get(ngram, 0) + k

    return numerator / denominator


In [7]:
gram = list(bigram_counts.keys())[200]
print(gram)
previous_gram = gram[:-1]
previous_gram

estimate_probability(gram[1], previous_gram, unigram_count, bigram_counts, len(vocab))

('drop', 'kar')


0.0011949810794662419

In [115]:
def estimate_all_probabilities(given_words, n_gram_count, n_plus1_gram_count, vocabulary, k = 1.0):
    n = len(list(n_plus1_gram_count.keys())[0])
    
    if isinstance(given_words, str):
        previous_gram = tuple([given_words,])
    else:
        previous_gram = tuple(given_words)

    vocabulary = list(vocab.keys()) + ["<e>", "<UNK>"]

    
    probabilities = {}
    for word in vocabulary:
        x = estimate_probability(word, previous_gram, n_gram_count, n_plus1_gram_count, len(vocabulary))
        probabilities[word] = x

    return probabilities

In [9]:
x = estimate_all_probabilities("drop", unigram_count, bigram_counts, vocab)

In [10]:
x_items = list(x.items())

z = sorted(x_items, key=lambda s: s[1], reverse=True)

In [11]:
z[:5]

[('kar', 0.0011945052757316346),
 ('karke', 0.0007963368504877563),
 ('ka', 0.0005972526378658173),
 ('karunga', 0.0005972526378658173),
 ('one', 0.0005972526378658173)]

# Create count and probability matrices

In [12]:
import pandas as pd

def make_count_matrix(n_plus1_gram_counts, vocabulary):
    vocab = list(vocabulary.keys()) + ["<e>", "<UNK>"]

    n_grams = []

    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[:-1]
        n_grams.append(n_gram)

    n_grams = list(set(n_grams))

    rows = {gram : i for i, gram in enumerate(n_grams)}

    cols = {word : j for j, word in enumerate(vocab)}

    n_rows = len(rows)
    n_cols = len(cols)

    count_matrix = np.zeros((n_rows, n_cols))

    for n_plus1_gram, count in n_plus1_gram_counts.items():
        ngram = n_plus1_gram[:-1]
        word = n_plus1_gram[-1]
        if word not in vocab:
            continue

        i = rows.get(ngram)
        j = cols.get(word)

        count_matrix[i, j] = count

    return pd.DataFrame(count_matrix, index = n_grams, columns = vocab)    

In [13]:
#trigram_counts = get_n_gram(train_data, 3)
cmatrix = make_count_matrix(bigram_counts, vocab)

In [14]:
cmatrix.head()

Unnamed: 0,ha,to,me,ho,hai,he,b,ni,kar,aur,...,fed,stars,html5,hcl,woman,ankush,reg,domain,<e>,<UNK>
"(jyada,)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
"(topic,)",8.0,1.0,1.0,1.0,5.0,3.0,1.0,7.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,5.0
"(chalaya,)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(air,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(thoda,)",0.0,12.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,13.0


In [15]:
cmatrix += 1
prob_matrix = cmatrix.div(cmatrix.sum(axis = 1), axis = 0)

In [16]:
prob_matrix.head()

Unnamed: 0,ha,to,me,ho,hai,he,b,ni,kar,aur,...,fed,stars,html5,hcl,woman,ankush,reg,domain,<e>,<UNK>
"(jyada,)",0.0002,0.000399,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,...,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.000399
"(topic,)",0.001744,0.000388,0.000388,0.000388,0.001163,0.000775,0.000388,0.00155,0.000388,0.000388,...,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.000194,0.00465,0.001163
"(chalaya,)",0.0002,0.0002,0.0002,0.0002,0.0004,0.0002,0.0002,0.0002,0.0002,0.0002,...,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002
"(air,)",0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,...,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199
"(thoda,)",0.000193,0.002512,0.000193,0.000193,0.000193,0.000386,0.000193,0.000193,0.000193,0.000773,...,0.000193,0.000193,0.000193,0.000193,0.000193,0.000193,0.000193,0.000193,0.002125,0.002705


# Perplexity

In [17]:
def compute_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocab_size, k = 1.0):
    n = len(list(n_gram_counts.keys())[0])

    sentence = ["<s>"] * n + sentence + ["<e>"]

    sentence = tuple(sentence)

    N = len(sentence)

    logsum = 0

    for t in range(n, N):
        n_gram = sentence[t - n : t]
        word = sentence[t]

        prob = estimate_probability(word, n_gram, n_gram_counts, n_plus1_gram_counts, vocab_size, k = k)

        logsum += np.log(prob)
    
    logsum = (1 / N) * logsum
    perplexity = 2 ** (-logsum)

    return perplexity

In [23]:
def compute_perplexity_over_sentences(sentences, vocab, n_gram_counts, n_plus1_gram_counts, k = 1):
    vocab = list(vocab.keys()) + ["<e>", "<UNK>"]
    vocab_size = len(vocab)

    p = 0

    for sentence in sentences:
        p += compute_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocab_size, k = k)

    return p / len(sentences)

In [25]:
train_perplexity = compute_perplexity_over_sentences(train_data, vocab, unigram_count, bigram_counts)
test_perplexity = compute_perplexity_over_sentences(test_data, vocab, unigram_count, bigram_counts)

In [28]:
print(f"Perplexity of Training set : {train_perplexity:.5f}\nPerplexity of Testing set : {test_perplexity:.5f}")

Perplexity of Training set : 36.53088
Perplexity of Testing set : 40.98047


# suggest words

In [116]:
def suggest_word(previous_words, n_gram_counts, n_plus1_gram_counts, vocabulary, k = 1.0, start_with = None):
    n = len(list(n_gram_counts.keys())[0])

    previous_n_token = previous_words[-n:]

    probs = estimate_all_probabilities(previous_n_token, n_gram_counts, n_plus1_gram_counts, vocabulary, k = k)

    max_prob = 0
    suggestion = None

    for word, prob in probs.items():
        if start_with:
            if not word.startswith(start_with):
                continue

        if prob > max_prob:
            suggestion = word
            max_prob = prob

    return suggestion, max_prob

In [117]:
import re

def clean_text(text):
    text = text.strip().lower()
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r"https?://.+", r"", text)
    text = re.sub("([.,!?():;])", r' ', text)
    text = text.replace("\n", "")
    return re.sub(r' +', r' ', text)

def preprocess(data, vocab):
    processed_sent = []
    for j, token in enumerate(data):
        if token not in vocab.keys():
            processed_sent.append("<UNK>")
        else:
            processed_sent.append(token)

    return processed_sent

In [118]:
x = "Kaha jaa rahe"
cleaned = clean_text(x).split()
text = preprocess(cleaned, vocab)

In [122]:
suggest_word(text, unigram_count, bigram_counts, vocab, start_with="hai")

('hai', 0.00633619083115915)

# Multiple Suggestions

In [126]:
n_gram_counts_list = []

for n in range(1, 6):
    print("Computing n grm counts for n = ", n, "...")
    n_model_counts = get_n_gram(train_data, n)
    n_gram_counts_list.append(n_model_counts)

Computing n grm counts for n =  1 ...
Computing n grm counts for n =  2 ...
Computing n grm counts for n =  3 ...
Computing n grm counts for n =  4 ...
Computing n grm counts for n =  5 ...


In [127]:
def get_multiple_suggestions(previous_words, n_gram_counts_list, vocabulary, k = 1.0, start_with=None):
    model_count = len(n_gram_counts_list)

    suggetions = []
    for i in range(model_count - 1):
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i + 1]

        s = suggest_word(previous_words, n_gram_counts, n_plus1_gram_counts, vocabulary, k = k, start_with=start_with)

        suggetions.append(s)
    
    return suggetions

In [128]:
get_multiple_suggestions(text, n_gram_counts_list, vocab)

[('ho', 0.026649273201639956),
 ('ha', 0.0011961722488038277),
 ('the', 0.0003997601439136518),
 ('ha', 0.00019992003198720512)]