Aim: To implement next-word prediction using n-grams.

# Bigrams

In [None]:
# Toy training corpus for the n-gram models; the embedded newline separates its two lines.
corpus = (
    'the cat is red the cat is green the cat is blue the dog is brown.\n'
    'sam is a student.'
)

In [None]:
# Persist the corpus to corpus.txt so that later cells can read it back from disk
with open('corpus.txt', 'w+') as corpus_file:
    corpus_file.write(corpus)

In [None]:
import re
from pprint import pprint

In [None]:
def _conditional_prob(s, n):
    return float(s.count(f'{n[0]} {n[1]}') / s.count(n[0]))

In [None]:
def get_conditional_prob(s, ngrams):
    """Build a lookup table of conditional probabilities for the given bigrams.

    Args:
        s: Corpus text forwarded to ``_conditional_prob``.
        ngrams: Iterable of hashable bigrams.

    Returns:
        dict: each bigram mapped to the value ``_conditional_prob`` returns
        for it.
    """
    return {bigram: _conditional_prob(s, bigram) for bigram in ngrams}

In [None]:
def tokenize(corpus):
    """Split ``corpus`` on single space characters and drop the empty pieces.

    Only the space character is a separator; other whitespace such as tabs
    or newlines stays inside the tokens.

    Returns:
        list[str]: the non-empty tokens in their original order.
    """
    pieces = corpus.split(' ')
    # filter(None, ...) discards the empty strings produced by repeated spaces.
    return list(filter(None, pieces))

In [None]:
def get_n_grams(s, tokens, n):
    """Return the distinct n-grams found in ``tokens``, in first-seen order.

    Args:
        s: Source text. Kept only for backward compatibility; the n-grams
            are built from ``tokens`` alone.
        tokens: Sliceable sequence of tokens.
        n: Number of tokens in each n-gram.

    Returns:
        list[tuple]: the unique n-grams as tuples of ``n`` tokens; empty
        when ``tokens`` holds fewer than ``n`` items.
    """
    # Sliding window: zip the token list against itself shifted by 0..n-1
    # positions so that each zipped tuple is one n-gram.
    windows = zip(*[tokens[i:] for i in range(n)])
    # dict.fromkeys removes duplicates while keeping first-seen order, which
    # makes the result reproducible (a set gives no ordering guarantee).
    return list(dict.fromkeys(windows))

In [None]:
def predict_next_word(s, conditional_prob, tokens, word):
    """Score every token as a possible successor of ``word``.

    Args:
        s: Corpus text forwarded to ``_conditional_prob``.
        conditional_prob: Accepted but not consulted; every score is
            recomputed from ``s``.
        tokens: Iterable of candidate next words.
        word: The context word.

    Returns:
        dict: ``(word, token)`` mapped to the value ``_conditional_prob``
        returns for that pair, for every token in ``tokens``.
    """
    return {
        (word, candidate): _conditional_prob(s, (word, candidate))
        for candidate in tokens
    }

In [None]:
# Read the corpus back from disk, one list entry per line.
with open(r'corpus.txt', 'r') as f:
    all_lines = f.readlines()
# Flatten to a single string (newlines become spaces) for probability counting.
corpus = ''.join([x.replace('\n', ' ') for x in all_lines])

# Tokenise every line into one running token list.
tokens = []
for line in all_lines:
    tokens.extend(tokenize(line.replace('\n', '')))

# Build the bigrams once from the complete token list. Calling get_n_grams
# inside the per-line loop re-scanned every token collected so far, so the
# bigrams of earlier lines were appended again for each additional line.
n_grams = get_n_grams(corpus, tokens, 2)

conditional_prob = get_conditional_prob(corpus, n_grams)
print('Conditional Probabilities: ')
pprint(conditional_prob)
word = input('Enter a word: ')
predictions = predict_next_word(corpus, conditional_prob, tokens, word)
print('All predictions with probability are: ')
pprint(predictions)

Conditional Probabilities: 
{('a', 'student.'): 0.2,
 ('blue', 'the'): 1.0,
 ('brown.', 'sam'): 1.0,
 ('cat', 'is'): 1.0,
 ('dog', 'is'): 1.0,
 ('green', 'the'): 1.0,
 ('is', 'a'): 0.2,
 ('is', 'blue'): 0.2,
 ('is', 'brown.'): 0.2,
 ('is', 'green'): 0.2,
 ('is', 'red'): 0.2,
 ('red', 'the'): 1.0,
 ('sam', 'is'): 1.0,
 ('the', 'cat'): 0.75,
 ('the', 'dog'): 0.25}
Enter a word: dog
All predictions with probability are: 
{('dog', 'a'): 0.0,
 ('dog', 'blue'): 0.0,
 ('dog', 'brown.'): 0.0,
 ('dog', 'cat'): 0.0,
 ('dog', 'dog'): 0.0,
 ('dog', 'green'): 0.0,
 ('dog', 'is'): 1.0,
 ('dog', 'red'): 0.0,
 ('dog', 'sam'): 0.0,
 ('dog', 'student.'): 0.0,
 ('dog', 'the'): 0.0}


In [None]:
# Query the bigram model again with another word; relies on corpus,
# conditional_prob and tokens left in the kernel by the bigram driver cell.
word = input('Enter a word: ')
predictions = predict_next_word(corpus, conditional_prob, tokens, word)
print('All predictions with probability are: ')
pprint(predictions)


Enter a word: sam
All predictions with probability are: 
{('sam', 'a'): 0.0,
 ('sam', 'blue'): 0.0,
 ('sam', 'brown.'): 0.0,
 ('sam', 'cat'): 0.0,
 ('sam', 'dog'): 0.0,
 ('sam', 'green'): 0.0,
 ('sam', 'is'): 1.0,
 ('sam', 'red'): 0.0,
 ('sam', 'sam'): 0.0,
 ('sam', 'student.'): 0.0,
 ('sam', 'the'): 0.0}


# Reference for the trigram implementation
https://github.com/susantabiswas/Word-Prediction-Ngram/blob/master/Knesser%20Ney.ipynb

# Trigram implementation

In [None]:
def trigram_conditional_prob(s, n):
    """Estimate P(n[2] | n[0], n[1]) from the whitespace-separated words of ``s``.

    Whole tokens are compared rather than substrings, so a context is not
    matched when it merely appears inside longer words.

    Args:
        s: Corpus text.
        n: Trigram given as a 3-item sequence ``(word1, word2, next_word)``;
            each item is expected to be a single token.

    Returns:
        float: (occurrences of ``n[0] n[1] n[2]``) divided by (occurrences
        of ``n[0] n[1]``), or ``0.0`` when the two-word context never occurs
        in ``s`` (instead of dividing by zero).
    """
    words = s.split()
    context = (n[0], n[1])
    context_count = 0
    trigram_count = 0
    for i in range(len(words) - 1):
        if (words[i], words[i + 1]) != context:
            continue
        context_count += 1
        # A context at the very end of the corpus has no following word.
        if i + 2 < len(words) and words[i + 2] == n[2]:
            trigram_count += 1
    if context_count == 0:
        # Unseen context: no evidence, so report zero probability.
        return 0.0
    return trigram_count / context_count

In [None]:
def get_conditional_prob_tri(s, ngrams):
    """Build a lookup table of conditional probabilities for the given trigrams.

    Args:
        s: Corpus text forwarded to ``trigram_conditional_prob``.
        ngrams: Iterable of hashable trigrams.

    Returns:
        dict: each trigram mapped to the value ``trigram_conditional_prob``
        returns for it.
    """
    return {trigram: trigram_conditional_prob(s, trigram) for trigram in ngrams}

In [None]:
def predict_next_word_tri(s, conditional_prob, tokens, word):
    """Score every token as a possible successor of a two-word context.

    Args:
        s: Corpus text forwarded to ``trigram_conditional_prob``.
        conditional_prob: Accepted but not consulted; every score is
            recomputed from ``s``.
        tokens: Iterable of candidate next words.
        word: Indexable context; ``word[0]`` and ``word[1]`` are the two
            preceding words.

    Returns:
        dict: ``(word[0], word[1], token)`` mapped to the value
        ``trigram_conditional_prob`` returns for it, for every token in
        ``tokens``.
    """
    def scored(candidate):
        # Build the candidate trigram and pair it with its probability.
        trigram = (word[0], word[1], candidate)
        return trigram, trigram_conditional_prob(s, trigram)

    return dict(map(scored, tokens))

In [None]:
# Read the corpus back from disk, one list entry per line.
with open(r'corpus.txt', 'r') as f:
    all_lines = f.readlines()
# Flatten to a single string (newlines become spaces) for probability counting.
corpus = ''.join([x.replace('\n', ' ') for x in all_lines])

# Tokenise every line into one running token list.
tokens = []
for line in all_lines:
    tokens.extend(tokenize(line.replace('\n', '')))

# Build the trigrams once from the complete token list. Calling get_n_grams
# inside the per-line loop re-scanned every token collected so far, so the
# trigrams of earlier lines were appended again for each additional line.
n_grams = get_n_grams(corpus, tokens, 3)

conditional_prob = get_conditional_prob_tri(corpus, n_grams)
print('Conditional Probabilities: ')
pprint(conditional_prob)
word = input('Enter two words to input ').split(" ")
predictions = predict_next_word_tri(corpus, conditional_prob, tokens, word)
print('All predictions with probability are: ')
pprint(predictions)

Conditional Probabilities: 
{('blue', 'the', 'dog'): 1.0,
 ('brown.', 'sam', 'is'): 1.0,
 ('cat', 'is', 'blue'): 0.3333333333333333,
 ('cat', 'is', 'green'): 0.3333333333333333,
 ('cat', 'is', 'red'): 0.3333333333333333,
 ('dog', 'is', 'brown.'): 1.0,
 ('green', 'the', 'cat'): 1.0,
 ('is', 'a', 'student.'): 1.0,
 ('is', 'blue', 'the'): 1.0,
 ('is', 'brown.', 'sam'): 1.0,
 ('is', 'green', 'the'): 1.0,
 ('is', 'red', 'the'): 1.0,
 ('red', 'the', 'cat'): 1.0,
 ('sam', 'is', 'a'): 1.0,
 ('the', 'cat', 'is'): 1.0,
 ('the', 'dog', 'is'): 1.0}
Enter two words to input the cat
All predictions with probability are: 
{('the', 'cat', 'a'): 0.0,
 ('the', 'cat', 'blue'): 0.0,
 ('the', 'cat', 'brown.'): 0.0,
 ('the', 'cat', 'cat'): 0.0,
 ('the', 'cat', 'dog'): 0.0,
 ('the', 'cat', 'green'): 0.0,
 ('the', 'cat', 'is'): 1.0,
 ('the', 'cat', 'red'): 0.0,
 ('the', 'cat', 'sam'): 0.0,
 ('the', 'cat', 'student.'): 0.0,
 ('the', 'cat', 'the'): 0.0}
