# Import Libraries

In [1]:
import nltk
from nltk import bigrams,trigrams 
from nltk.corpus import reuters
from collections import Counter, defaultdict
from gensim.test.utils import datapath
from gensim.corpora import WikiCorpus

# Data-set

In [2]:
dataset_path = datapath('enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2')
wiki_sentences = WikiCorpus(dataset_path).get_texts()



In [3]:
wiki_sentences

<generator object WikiCorpus.get_texts at 0x000001F14B9A43C8>


# get Reuters DataSet

In [4]:
nltk.download('punkt')
nltk.download('reuters')
reuters_sentences  = reuters.sents()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [5]:
reuters_sentences

[['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.'], ['They', 'told', 'Reuter', 'correspondents', 'in', 'Asian', 'capitals', 'a', 'U', '.', 'S', '.', 'Move', 'against', 'Japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'U', '.', 'S', '.', 'And', 'lead', 'to', 'curbs', 'on', 'American', 'imports', 'of', 'their', 'products', '.'], ...]

# Model

In [6]:
def calculate_probabilities(sentence_model):
  for next_word in sentence_model:
    next_words = sentence_model[next_word]
    total_word_count = float(sum(next_words.values()))
    for previous_word in next_words:
      next_words[previous_word]/=total_word_count

In [7]:
def calculate_sigle_word_probability(sentence_model,word_count):
  for word in sentence_model:
    sentence_model[word]/=word_count

In [8]:
def convert_to_lower(pa):
  if type(pa)==str:
    return pa.lower()
  return pa

In [9]:
sentence_model4 = defaultdict(lambda: set())
sentence_model5 = defaultdict(lambda: set())

def calculate_word_count(sentence_model1,sentence_model2,sentence_model3,sentences):
  word_count = 0
  for sentence in sentences:
    for word in sentence:
      word_count += 1
      sentence_model1[word] += 1
    for previous_word2, previous_word1, next_word in trigrams(sentence, pad_right=True, pad_left=True):
      previous_word1 = convert_to_lower(previous_word1)
      previous_word2 = convert_to_lower(previous_word2)
      next_word = convert_to_lower(next_word)
      sentence_model2[next_word][previous_word1] += 1
      sentence_model3[next_word][previous_word2] += 1
      sentence_model4[previous_word1].add(next_word)
      sentence_model5[previous_word2].add(next_word)

  return word_count

In [10]:
sentence_model1 = defaultdict(lambda:0)
sentence_model2 = defaultdict(lambda: defaultdict(lambda:0))
sentence_model3 = defaultdict(lambda: defaultdict(lambda:0))

In [11]:
wiki_word_count = calculate_word_count(sentence_model1,sentence_model2,sentence_model3,wiki_sentences)

In [12]:
wiki_word_count

452944

In [13]:
reuters_word_count = calculate_word_count(sentence_model1, sentence_model2, sentence_model3, reuters_sentences)

In [14]:
reuters_word_count

1720917

In [15]:
calculate_probabilities(sentence_model2)

In [16]:
calculate_probabilities(sentence_model3)

In [17]:
total_word = wiki_word_count + reuters_word_count
calculate_sigle_word_probability(sentence_model1, total_word)

In [18]:
max_probability_words = []

def make_word_suggestion_by_trigram(previous_word2, previous_word1):
  for next_word in sentence_model4[previous_word1] & sentence_model5[previous_word2]:
    naiveBias_trigram_weight = sentence_model1[next_word] * sentence_model2[next_word][previous_word1]*sentence_model3[next_word][previous_word2]
    max_probability_words.append((next_word, naiveBias_trigram_weight))

In [19]:
make_word_suggestion_by_trigram('my','name')
max_probability_words.sort(key=lambda o:o[1],reverse=True)

In [20]:
print(*max_probability_words[:10])

('is', 2.684929364088014e-08) ('to', 1.1753701280369164e-09) (',', 5.59438489695973e-10) ('in', 4.6278217257881707e-10) ('and', 4.5063509855288127e-10) ('for', 3.738790783749001e-10) ('would', 3.5383441884561533e-10) ('will', 2.8735045390861837e-10) ('that', 2.0690529880634479e-10) ('or', 1.711572663102755e-10)


In [21]:
while(True):
    text = input("Enter your line: ")
    if text == "stop":
        print("The Program Stopped.....")
        break
    
    else:
        try:
            max_probability_words = []
            text = text.split(" ")
            make_word_suggestion_by_trigram(text[0],text[1])
            max_probability_words.sort(key=lambda o:o[1],reverse=True)
            print(*max_probability_words[:10])
            
        except:
            continue

Enter your line: i am
('afraid', 1.0350247784931974e-06) ('sure', 8.036310732474202e-07) ('astonished', 4.600110126636432e-07) ('speculating', 3.6800881013091455e-07) ('deeply', 1.8818632336239947e-07) ('convinced', 1.5333700422121436e-07) ('confident', 1.2893042223444673e-07) ('inclined', 4.600110126636432e-08) ('hopeful', 4.600110126636432e-08) ('sceptical', 3.833425105530359e-08)
Enter your line: do you
('want', 1.457018352093729e-07) ('have', 7.580336901109201e-08) ('think', 2.6624154551874957e-08) ('look', 2.3725843203914056e-08) ('need', 2.1517623247651324e-08) ('know', 1.9034938455047303e-08) ("'", 1.2241932387039204e-08) ('see', 8.059507073354229e-09) ('could', 7.793242782937693e-09) ('make', 5.675392260577992e-09)
Enter your line: this is
('not', 2.067997280891107e-06) ('the', 9.009376019611534e-07) ('expected', 7.882243308937478e-07) ('due', 7.304506050183651e-07) ('also', 7.127958871912021e-07) ('a', 5.638395931369035e-07) ('likely', 4.876159326203692e-07) ('conspicuous', 4.