# **LLM for Auto-Complete**

In [1]:
import nltk
import re
import random
import math
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
nltk.data.path.append('/home/jerlshin/my_nltk_data/')

In [2]:
nltk.data.path  

['/home/jerlshin/nltk_data',
 '/home/jerlshin/env_ai/nltk_data',
 '/home/jerlshin/env_ai/share/nltk_data',
 '/home/jerlshin/env_ai/lib/nltk_data',
 '/usr/share/nltk_data',
 '/usr/local/share/nltk_data',
 '/usr/lib/nltk_data',
 '/usr/local/lib/nltk_data',
 '/home/jerlshin/my_nltk_data/']

### Data Sources

In [3]:
with open("en_US.twitter.txt", 'r') as f:
    data = f.read()

print(type(data))
print(len(data))

print(data[:300])

<class 'str'>
3335477
How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.
When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.
they've decided its more fun if I don't.
So Tired D; Played Lazer Tag & Ran A 


### Corpus preprocessing

In [4]:
def split_to_sentences(data):
    sentences = data.split('.\n')

    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]

    return sentences

def tokenize_sentences(sentences):
    tokenized_sentences = []

    for sentence in sentences:
        sentence = sentence.lower()
        tokenized = nltk.word_tokenize(sentence)
        tokenized_sentences.append(tokenized)
    
    return tokenized_sentences


def get_tokenized_data(data):
    sentences = split_to_sentences(data)
    tokenized_sentences = tokenize_sentences(sentences)

    return tokenized_sentences

In [5]:
x = "I am Jerlshin.\nNever Ever give up.\nI am the beast.\nHa ha ha.\n"
get_tokenized_data(x)

[['i', 'am', 'jerlshin'],
 ['never', 'ever', 'give', 'up'],
 ['i', 'am', 'the', 'beast'],
 ['ha', 'ha', 'ha']]

In [6]:
'''Trigram'''
def sentence_to_n_gram(tokenized_sentence, n):
    for i in range(len(tokenized_sentence) -n +1):
        trigram = tokenized_sentence[i:i+n]
        print(trigram)

n = 3 # n-gram
tokenized_sentence = ['never', 'ever', 'give', 'up', '.']
tokenized_sentence = ["<s>"] * (n-1) + tokenized_sentence + ["</s>"]

sentence_to_n_gram(tokenized_sentence, n)


['<s>', '<s>', 'never']
['<s>', 'never', 'ever']
['never', 'ever', 'give']
['ever', 'give', 'up']
['give', 'up', '.']
['up', '.', '</s>']


In [7]:
def count_words(tokenize_sentences):
    word_counts = {}

    for sentence in tokenize_sentences:
        for token in sentence:
            if token not in word_counts.keys():
                word_counts[token] = 1
            else:
                word_counts[token] += 1
    
    return word_counts

In [8]:
tokenized_sentences = get_tokenized_data(x)
count_words(tokenized_sentences)

{'i': 2,
 'am': 2,
 'jerlshin': 1,
 'never': 1,
 'ever': 1,
 'give': 1,
 'up': 1,
 'the': 1,
 'beast': 1,
 'ha': 3}

### Data Split

In [9]:
def train_validation_test_split(data, train_percent, validation_percent):
    # Splits the data 
    # return train, valid, test data
    random.seed(42)
    random.shuffle(data)

    train_size = int(len(data) * train_percent / 100)
    train_data = data[0:train_size]

    validation_size = int(len(data) * validation_percent / 100)
    validation_data = data[train_size:train_size + validation_size]
    
    test_data = data[train_size + validation_size:]
    
    return train_data, validation_data, test_data

In [10]:
# for instance 
tokenized_data = get_tokenized_data(data)
random.seed(87)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [11]:
len(train_data), len(test_data)

(11476, 2870)

#### Out of Voabulary words

In [12]:
def get_words_with_nplus_frequency(
        tokenized_sentences,
        count_threshold
):
    closed_vocab = []

    word_counts = count_words(tokenized_sentences)

    for word, cnt in word_counts.items():
        if cnt >= count_threshold:
            closed_vocab.append(word)

    return closed_vocab

In [13]:
def replace_oov_words_by_unk(
        tokenized_sentences, 
        vocabulary, 
        unknown_token="<unk>"
):
    vocabulary = set(vocabulary)
    replaced_tokenized_sentences = []

    for sentence in tokenized_sentences:
        replaced_sentences = []
        
        for token in sentence:
            if token in vocabulary:
                replaced_sentences.append(token)
            else:
                replaced_sentences.append(unknown_token)
    
        replaced_tokenized_sentences.append(replaced_sentences)

    return replaced_tokenized_sentences


In [14]:
def preprocess_data(train_data, test_data, count_threshold):
    vocabulary = get_words_with_nplus_frequency(train_data, count_threshold)
    train_data_replaced = replace_oov_words_by_unk(train_data, vocabulary)
    test_data_replaced = replace_oov_words_by_unk(test_data, vocabulary)

    return train_data_replaced, test_data_replaced, vocabulary

minimum_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data, 
                                                                        test_data, 
                                                                        minimum_freq)

### Developing n-gram based Language Model

In [15]:
def count_n_grams(data, n, start_token='<s>', end_token='</s>'):
    """Count all n-grams in the data

    Args:
        data (list): list of words
        n (int): words in seq (order of the gram )
        start_token (str, optional)
        end_token (str, optional)
    """
    n_grams = {}

    for sentence in data:
        sentence = [start_token] * n + sentence + [end_token] # augmentation
        sentence = tuple(sentence) # conver the list to tuple so that the seq of words can be used as a key in the dict 

        '''
        i - indicate the start of the n-gram from index 0 to the last index where the end of the n-gram is withing the sentence
        '''
        m = len(sentence) if n==1 else len(sentence) - 1

        for i in range(m): 
            n_gram = sentence[i:i+n] # n - order of the gram 

            if n_gram in n_grams.keys():
                n_grams[n_gram] += 1 # increase the count of the n-gram
            else:
                n_grams[n_gram] = 1 # initialize the n-gram count 
        
    return n_grams


### Smoothing

* K-smoothing 

\begin{equation*}
\hat{P}=\frac{C(w_{n-1}, w_{n})+k}{C(w_{n-1})+k*v}
\end{equation*}

* Good-Turing smoothing 
* Knesser-Ney smoothing 



In [16]:
def add_k_smmoothing_probability(k, vocabulary_size, n_gram_count, n_gram_prefix_count):
    numerator = n_gram_count + k
    denominator = n_gram_prefix_count + k * vocabulary_size
    return numerator / denominator

### Estimate the probability

Estimate the prob of the word given the prior 'n' words using the n-grams counts


$$ \hat{P}(w_t | w_{t-1}\dots w_{t-n}) = \frac{C(w_{t-1}\dots w_{t-n}, w_n) + k}{C(w_{t-1}\dots w_{t-n}) + k|V|} \tag{3} $$




In [17]:
def estimate_probability(
        word, previous_n_gram,
        n_gram_counts,
        n_plus1_gram_counts, 
        vocabulary_size, 
        k=1.0
):
    """Estimate the prob of the nest word using the n-grams counts with k-smoothing

    Args:
        word (str): next_word
        preprocess_n_gram (str): a seq of words of lenght n
        n_gram_counts (dict): dict that maps a tuple of n-words to its freq
        n_plus1_gram_counts (dict): countsof n+1 grams 
        vocabulary_size (int)
        k (float, optional): pos const, smoothing parameter. Defaults to 1.0
    """
    # convert to use it as a dict key 
    previous_n_gram = tuple(previous_n_gram)

    # if the prev n-grams exits in the dict of n-grams counts, get its count, else to 0
    previous_n_gram_count = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts else 0

    denominator = previous_n_gram_count + k * vocabulary_size

    # define n+1 gram as the prev as the n-gram + current word as a tuple
    n_plus1_gram = previous_n_gram + (word, ) # for the next gram

    # see the next gram if the gram exits there 
    n_plus1_gram_count = n_plus1_gram_counts[n_plus1_gram] if n_plus1_gram in n_plus1_gram_counts else 0


    numerator = n_plus1_gram_count + k

    probability = numerator / denominator

    return probability



In [18]:
'''Estimating the total probabilities'''

def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    previous_n_gram = tuple(previous_n_gram)
    
    vocabulary = vocabulary + ["<e>", "<unk>"]
    vocabulary_size = len(vocabulary)
    
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram, 
                                           n_gram_counts, n_plus1_gram_counts, 
                                           vocabulary_size, k=k)
        probabilities[word] = probability

    return probabilities

### Count Matrix

In [19]:
def make_count_matrix(n_plus1_gram_counts, vocabulary):
    vocabulary = vocabulary + ["<e>", "<unk>"]

    n_grams = []
    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[0:-1]
        n_grams.append(n_gram)
    
    n_grams = list(set(n_grams))

    # making the matrix
    row_idex = {n_gram:i for i, n_gram in enumerate(n_grams)}
    col_idex = {word:j for j, word in enumerate(vocabulary)}

    n_row = len(n_grams)
    n_col = len(vocabulary)

    count_matrix = np.zeros((n_row, n_col))

    for n_plus1_gram, count in n_plus1_gram_counts.items():
        n_gram = n_plus1_gram[0:-1] # excluding the last word 
        word = n_plus1_gram[-1] # make it as the word 

        if word not in vocabulary:
            continue

        i = row_idex[n_gram]
        j = col_idex[word]

        count_matrix[i, j] = count

    count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)
    return count_matrix


### Probability Matrix

In [20]:
def make_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    count_matrix = make_count_matrix(n_plus1_gram_counts, unique_words)
    count_matrix += k
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    return prob_matrix

#### Back-off
- If higher order n-grams prob is missing the lower order n-1 gram is used to get the information. 

### Interpolation
- Alternative for Backoff. We use weightes prob of n-grams of all orders every time, not just when high order information is missing.

### Language Model Evaluation

#### **Perplexity**

Evaluation score for the language model.

- N - lenght of the sentence
- n - no of words in n-gram


Implementing the m-th order root of a variable

\begin{equation*}
PP(W)=\sqrt[M]{\prod_{i=1}^{m}{\frac{1}{P(w_i|w_{i-1})}}}
\end{equation*}


As in math, the indexing starts at 0, the range of t changes to t=n to N-1

- The more the N-grams tell us about the sentence, the lower the perlexity score will be 

In [21]:
def calculate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """
    Calculate perplexity for a list of sentences
    
    Args:
        sentence: List of strings
        n_gram_counts: Dictionary of counts of (n+1)-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary_size: number of unique words in the vocabulary
        k: Positive smoothing constant
    
    Returns:
        Perplexity score
    """
    n = len(list(n_gram_counts.keys())[0]) 
    
    sentence = ["<s>"] * n + sentence + ["<e>"]
    
    sentence = tuple(sentence)
    
    N = len(sentence)
    
    product_pi = 1.0
    
    for t in range(n, N): 
        n_gram = sentence[t-n:t]
        
        word = sentence[t]
        
        probability = estimate_probability(word,n_gram, n_gram_counts, n_plus1_gram_counts, len(unique_words), k=1)

        product_pi *= 1 / probability

    perplexity = product_pi**(1/float(N))
    
    return perplexity

In [22]:
# test your code

sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)


perplexity_train1 = calculate_perplexity(sentences[0],
                                         unigram_counts, bigram_counts,
                                         len(unique_words), k=1.0)
print(f"Perplexity for first train sample: {perplexity_train1:.4f}")

test_sentence = ['i', 'like', 'a', 'dog']
perplexity_test = calculate_perplexity(test_sentence,
                                       unigram_counts, bigram_counts,
                                       len(unique_words), k=1.0)
print(f"Perplexity for test sample: {perplexity_test:.4f}")

Perplexity for first train sample: 3.3674
Perplexity for test sample: 3.9654


### Auto-Complete System

In [23]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    """
    Get suggestion for the next word
    
    Args:
        previous_tokens: The sentence you input where each token is a word. Must have length > n 
        n_gram_counts: Dictionary of counts of (n+1)-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary: List of words
        k: positive constant, smoothing parameter
        start_with: If not None, specifies the first few letters of the next word
        
    Returns:
        A tuple of 
          - string of the most likely next word
          - corresponding probability
    """
    
    n = len(list(n_gram_counts.keys())[0]) 
    
    previous_n_gram = previous_tokens[-n:]

    probabilities = estimate_probabilities(previous_n_gram,
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k)
    
    suggestion = None
    
    max_prob = 0
    
    for word, prob in probabilities.items(): 
        if start_with != None: 
            
            if not word.startswith(start_with): 
                continue  
        
        if prob > max_prob:
            suggestion = word
            
            max_prob = prob

    
    return suggestion, max_prob

In [32]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None, top_2=False):
    model_counts = len(n_gram_counts_list)
    suggestions = []
    for i in range(model_counts-1):
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i+1]
        
        suggestion = suggest_a_word(previous_tokens, n_gram_counts,
                                    n_plus1_gram_counts, vocabulary,
                                    k=k, start_with=start_with)
        suggestions.append(suggestion)
    
    if top_2 is True:
        return suggestions[:2]
    else:
        return suggestions

In [33]:
# test your code
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
trigram_counts = count_n_grams(sentences, 3)
quadgram_counts = count_n_grams(sentences, 4)
qintgram_counts = count_n_grams(sentences, 5)

n_gram_counts_list = [unigram_counts, bigram_counts, trigram_counts, quadgram_counts, qintgram_counts]
previous_tokens = ["i", "like"]
tmp_suggest3 = get_suggestions(previous_tokens, n_gram_counts_list, unique_words, k=1.0)

print(f"The previous words are 'i like', the suggestions are:")
display(tmp_suggest3)

The previous words are 'i like', the suggestions are:


[('a', 0.2727272727272727),
 ('a', 0.2),
 ('dog', 0.1111111111111111),
 ('dog', 0.1111111111111111)]

In [34]:
n_gram_counts_list = []
for n in range(1, 6):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = count_n_grams(train_data_processed, n)
    n_gram_counts_list.append(n_model_counts)

Computing n-gram counts with n = 1 ...
Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...


In [35]:
previous_tokens = ["i", "am", "to"]
tmp_suggest4 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest4)

The previous words are ['i', 'am', 'to'], the suggestions are:


[('the', 0.02807799036074193),
 ('please', 0.00013528138528138528),
 ('please', 0.00013533631073216944),
 ('it', 6.76773145641581e-05)]

In [28]:
previous_tokens = ["i", "want", "to", "go"]
tmp_suggest5 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest5)

The previous words are ['i', 'want', 'to', 'go'], the suggestions are:


[('to', 0.014203478913778),
 ('to', 0.004581977554950528),
 ('to', 0.0008780224233418885),
 ('to', 0.00040565208572780745)]

In [29]:
previous_tokens = ["hey", "how", "are"]
tmp_suggest6 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest6)

The previous words are ['hey', 'how', 'are'], the suggestions are:


[('you', 0.023950729927007298),
 ('you', 0.0041748030435660895),
 ('you', 0.00013534546931041484),
 ('it', 6.76773145641581e-05)]

In [30]:
previous_tokens = ["hey", "how", "are", "you"]
tmp_suggest7 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest7)

The previous words are ['hey', 'how', 'are', 'you'], the suggestions are:


[("'re", 0.024745032708951283),
 ('?', 0.0029615004935834156),
 ('?', 0.0017523758172137225),
 ('leaving', 0.00013534546931041484)]

In [31]:
previous_tokens = ["hey", "how", "are", "you"]
tmp_suggest8 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with="d")

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest8)

The previous words are ['hey', 'how', 'are', 'you'], the suggestions are:


[('do', 0.00837024094916907),
 ('doing', 0.0017769002961500495),
 ('doing', 0.00047179348924984834),
 ('dude', 6.767273465520742e-05)]