In [2377]:
import re
import nltk
import numpy as np
from sklearn.model_selection import train_test_split

# Folder that holds the corpus; it is also registered as an extra NLTK data location below.
data_dir = "./archive"
file_path = data_dir + "/data_0.txt"

# Let NLTK search the project folder for resources as well, then fetch the
# 'punkt' resource before any tokenization happens.
nltk.data.path.append(data_dir)
nltk.download('punkt')

# Read the whole corpus into memory. The encoding is pinned so the text is decoded
# the same way on every platform instead of depending on the locale's default.
with open(file_path, "r", encoding="utf-8") as f:
    data = f.read()

def preprocess_pipeline(data) -> 'list':
    """Turn a raw multi-line corpus string into a list of token lists.

    Each non-blank line of ``data`` is treated as one sentence. Per sentence:

    * the characters ``( ) / " ? : ! ; -`` are surrounded by spaces;
    * a comma is split from a neighbouring character only when that
      neighbour is not a digit (so ``1,000`` stays in one piece);
    * runs of spaces are collapsed and the text is lower-cased;
    * a trailing ``.`` on any whitespace-separated chunk is detached;
    * the result is handed to ``nltk.word_tokenize``.

    Args:
        data: The corpus as a single string, sentences separated by newlines.

    Returns:
        A list with one list of tokens per non-blank input line.
    """
    stripped_lines = (raw_line.strip() for raw_line in data.split('\n'))

    tokenized = []
    for line in stripped_lines:
        # Blank (or whitespace-only) lines carry no sentence.
        if not line:
            continue

        spaced = re.sub(r'([\(\)/"?:!;-])', r' \g<1> ', line)
        spaced = re.sub(r'([^0-9]),', r'\g<1> ,', spaced)
        spaced = re.sub(r',([^0-9])', r', \g<1>', spaced)
        spaced = re.sub(r'  +', ' ', spaced).lower()

        # Detach a final period from every chunk; empty chunks come from
        # leading/trailing spaces and are dropped.
        chunks = [
            chunk[:-1] + ' .' if chunk.endswith('.') else chunk
            for chunk in spaced.split(' ')
            if chunk
        ]

        tokenized.append(nltk.word_tokenize(' '.join(chunks)))

    return tokenized

# Tokenize the whole corpus: one list of tokens per non-blank line of `data`.
tokenized_sentences = preprocess_pipeline(data)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kunalmehra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2378]:
# Hold out a test set first. No test_size is passed, so scikit-learn's default
# proportion applies; the fixed random_state makes the split repeatable.
train, test = train_test_split(tokenized_sentences, random_state=42)

# Split what remains once more to carve out a validation set (`train` is rebound
# to the smaller remainder).
train, val = train_test_split(train, random_state=42)

In [2379]:
def count_words(sentences) -> 'dict':
    """Tally how many times each token occurs across all sentences.

    Args:
        sentences: An iterable of token sequences.

    Returns:
        A dict mapping every token seen to its number of occurrences,
        in order of first appearance.
    """
    frequencies = {}

    for tokens in sentences:
        for word in tokens:
            # Unseen words start from zero.
            frequencies[word] = frequencies.get(word, 0) + 1

    return frequencies


In [2380]:
def handle_OOVocab(tokenized_sentences, count_threshold) -> 'list':
    """Build the closed vocabulary: words that occur often enough to be kept.

    Args:
        tokenized_sentences: An iterable of token sequences.
        count_threshold: Minimum number of occurrences (inclusive) for a
            word to be part of the vocabulary.

    Returns:
        A list of the qualifying words, in order of first appearance.
    """
    frequencies = count_words(tokenized_sentences)

    return [
        term
        for term, occurrences in frequencies.items()
        if occurrences >= count_threshold
    ]


In [2381]:
# Minimum number of occurrences in the training split for a word to be kept.
# With a threshold of 1 every word that appears in `train` qualifies.
count_threshold = 1

# Closed vocabulary, built from the training split only
vocab = handle_OOVocab(train, count_threshold)


In [2382]:
def unk_tokenize(tokenized_sentences, vocabulary, unknown_token = "<unk>") -> 'list':
    """Replace every out-of-vocabulary token with a placeholder.

    Args:
        tokenized_sentences: An iterable of token sequences.
        vocabulary: The collection of words to keep unchanged.
        unknown_token: The marker substituted for any other token.

    Returns:
        A new list of token lists with the same shape as the input; the
        input itself is left untouched.
    """
    # A set makes each membership check constant-time.
    known_words = set(vocabulary)

    return [
        [word if word in known_words else unknown_token for word in tokens]
        for tokens in tokenized_sentences
    ]


# Map both splits onto the closed vocabulary; tokens outside `vocab` become "<unk>".
final_train = unk_tokenize(train, vocab)

final_test = unk_tokenize(test, vocab)


In [2383]:
#Function to map n-grams to their respective frequencies in the dataset

def count_n_grams(data, n, start_token = "<s>", end_token = "<e>") -> 'dict':
    """Count every n-gram that occurs in the given sentences.

    Each sentence is padded with ``n`` start tokens in front and one end
    token at the back before the windows of length ``n`` are taken.

    Args:
        data: An iterable of token sequences.
        n: Length of the n-grams to count (expected to be at least 1).
        start_token: Marker prepended ``n`` times to every sentence.
        end_token: Marker appended once to every sentence.

    Returns:
        A dict mapping each n-gram (a tuple of exactly ``n`` tokens) to the
        number of times it occurs.
    """
    n_grams = {}

    for sentence in data:
        padded = tuple([start_token] * n + list(sentence) + [end_token])

        # Only full windows are counted. A bound of len(padded) - 1 is correct
        # for bigrams only; for n >= 3 it would also emit truncated tuples
        # (shorter than n) from the tail of the sentence.
        for i in range(len(padded) - n + 1):
            n_gram = padded[i:i + n]
            n_grams[n_gram] = n_grams.get(n_gram, 0) + 1

    return n_grams


In [2384]:

def get_prob(word, previous_n_gram, n_gram_counts, nplus1_gram_counts, vocabulary_size, k=1) -> 'float':
    """Estimate P(word | previous_n_gram) with add-k smoothing.

    The estimate is::

        (count(previous_n_gram + word) + k) / (count(previous_n_gram) + k * vocabulary_size)

    where an n-gram missing from a count table counts as zero.

    Args:
        word: Candidate next word.
        previous_n_gram: Sequence of the preceding tokens (the context).
        n_gram_counts: Dict mapping context tuples to their counts.
        nplus1_gram_counts: Dict mapping (context + word) tuples to their counts.
        vocabulary_size: Number of distinct words the distribution ranges over.
        k: Smoothing constant added to every count; defaults to 1.

    Returns:
        The smoothed conditional probability as a float.
    """
    context = tuple(previous_n_gram)

    # Unseen contexts / continuations contribute a count of zero.
    context_count = n_gram_counts.get(context, 0)
    continuation_count = nplus1_gram_counts.get(context + (word,), 0)

    numerator = continuation_count + k
    denominator = context_count + k * vocabulary_size

    return numerator / denominator



def get_probs(previous_n_gram, n_gram_counts, nplus1_gram_counts, vocabulary) -> 'dict':
    """Compute the smoothed probability of every candidate next word.

    The candidates are the words of ``vocabulary`` followed by the markers
    ``"<e>"`` and ``"<unk>"``; the size of that extended list is the
    vocabulary size passed to ``get_prob``.

    Args:
        previous_n_gram: Sequence of the preceding tokens (the context).
        n_gram_counts: Dict mapping context tuples to their counts.
        nplus1_gram_counts: Dict mapping (context + word) tuples to their counts.
        vocabulary: List of known words; it is not modified.

    Returns:
        A dict mapping each candidate word to its probability.
    """
    context = tuple(previous_n_gram)

    # Concatenation builds a new list, leaving the caller's vocabulary intact.
    candidates = vocabulary + ["<e>", "<unk>"]
    candidate_total = len(candidates)

    return {
        candidate: get_prob(candidate, context, n_gram_counts, nplus1_gram_counts, candidate_total)
        for candidate in candidates
    }


In [2385]:
def get_suggestion(previous_tokens, n_gram_counts, nplus1_gram_counts, vocabulary, start_with=None, start_token="<s>"):
    """Suggest the most probable next word for a given context.

    The n-gram order ``n`` is inferred from the length of the first key of
    ``n_gram_counts``; the last ``n`` tokens of ``previous_tokens`` form the
    context. When fewer than ``n`` tokens are available the context is
    left-padded with ``start_token``. Without the padding the context tuple
    would be shorter than every key in the count tables, could never match,
    and all candidates would receive the same probability.

    Args:
        previous_tokens: Sequence of tokens typed so far.
        n_gram_counts: Dict mapping n-gram tuples to counts (must be non-empty).
        nplus1_gram_counts: Dict mapping (n+1)-gram tuples to counts.
        vocabulary: List of known words.
        start_with: Optional prefix; when given, only candidates starting
            with it are considered.
        start_token: Marker used to pad a context shorter than ``n``.

    Returns:
        A ``(word, probability)`` tuple for the best candidate, or
        ``(None, 0)`` when no candidate matches ``start_with``.

    Raises:
        IndexError: If ``n_gram_counts`` is empty, since ``n`` cannot be inferred.
    """
    # Peek at a single key instead of copying every key into a list.
    try:
        n = len(next(iter(n_gram_counts)))
    except StopIteration:
        raise IndexError("n_gram_counts is empty; cannot infer the n-gram order") from None

    # Most recent 'n' words, padded on the left if the history is too short.
    previous_n_gram = list(previous_tokens[-n:])
    missing = n - len(previous_n_gram)
    if missing > 0:
        previous_n_gram = [start_token] * missing + previous_n_gram

    probabilities = get_probs(previous_n_gram, n_gram_counts, nplus1_gram_counts, vocabulary)

    suggestion = None
    max_prob = 0

    for word, prob in probabilities.items():
        # Skip candidates that do not match the requested prefix.
        if start_with is not None and not word.startswith(start_with):
            continue

        # Strict comparison: on ties the earliest candidate wins.
        if prob > max_prob:
            suggestion = word
            max_prob = prob

    return suggestion, max_prob


def get_next_word(previous_tokens, n_gram_counts_list, vocabulary, start_with=None):
    """Collect one suggestion from every pair of consecutive count tables.

    Entry ``i`` of ``n_gram_counts_list`` is used as the context table and
    entry ``i + 1`` as the continuation table, for every adjacent pair.

    Args:
        previous_tokens: Sequence of tokens typed so far.
        n_gram_counts_list: Count tables ordered by increasing n-gram length.
        vocabulary: List of known words.
        start_with: Optional prefix the suggested word must begin with.

    Returns:
        A list of whatever ``get_suggestion`` returns for each adjacent pair;
        empty when fewer than two tables are supplied.
    """
    # Pair every table with its successor: (1-gram, 2-gram), (2-gram, 3-gram), ...
    adjacent_tables = zip(n_gram_counts_list, n_gram_counts_list[1:])

    return [
        get_suggestion(previous_tokens, context_counts, continuation_counts, vocabulary, start_with=start_with)
        for context_counts, continuation_counts in adjacent_tables
    ]


# Example context: the most recent tokens typed so far.
previous_tokens = ['kar','sakte','hai']


# Build count tables for n-gram lengths 1 through 5 from the unk-tokenized training
# split; consecutive tables form the (n, n+1) pairs that get_next_word consumes.
n_gram_counts_list = []
for n in range(1, 6):
    n_model_counts = count_n_grams(final_train, n)
    n_gram_counts_list.append(n_model_counts)

# print(n_gram_counts_list)

# One suggestion per adjacent pair of tables, i.e. four suggestions for five tables.
suggestion = get_next_word(previous_tokens, n_gram_counts_list, vocab)

print(suggestion)


[('.', 0.03466204506065858), ('.', 0.007366482504604052), ('.', 0.007366482504604052), ('jab', 0.0018552875695732839)]
