<a href="https://colab.research.google.com/github/Gituhin/Word-Prediction/blob/main/last_word_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing necessary libraries

In [None]:
from google.colab import auth
auth.authenticate_user()
import string 
import gspread
from oauth2client.client import GoogleCredentials
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd

loading the training data set

In [None]:
# Build a gspread client from the application-default Google credentials.
# NOTE(review): oauth2client is deprecated upstream -- confirm that the installed
# gspread version still accepts these credentials.
gc = gspread.authorize(GoogleCredentials.get_application_default())
# Open the spreadsheet named '40' and take its `sheet1` worksheet (the training set).
worksheet = gc.open('40').sheet1
# Fetch all cell values of that worksheet.
rows = worksheet.get_all_values()
# Convert the fetched rows to a DataFrame (nothing is displayed here).
# Later cells read the sentence text from column 0 and start at row 1.
data=pd.DataFrame.from_records(rows)

function for preprocessing a line

In [None]:
def preprocessline(data, c, stop_words=None, stemmer=None):
  """Turn one sentence of a data set into a list of stemmed content words.

  The text is read from ``data[0][c]``, stripped of the characters in
  ``string.punctuation``, lower-cased, split on whitespace, filtered against
  the stop-word list and finally stemmed.

  Parameters
  ----------
  data : object indexable as ``data[0][c]``
      Container holding the sentences (the notebook passes a DataFrame).
  c : index/label
      Which entry of ``data[0]`` to process.
  stop_words : iterable of str, optional
      Words to drop. Defaults to ``stopwords.words('english')`` from NLTK.
  stemmer : object with a ``stem(word)`` method, optional
      Defaults to a fresh ``PorterStemmer()``.

  Returns
  -------
  list of str
      The stemmed, non-stop words in their original order.
  """
  step=str.maketrans('','',string.punctuation)    # translation table that deletes punctuation
  line=data[0][c].translate(step).lower()

  if stop_words is None:
    stop_words=stopwords.words('english')
  stop_words=set(stop_words)                      # set gives O(1) membership tests
  if stemmer is None:
    stemmer=PorterStemmer()

  # Single pass: the previous pop-and-restart loop reset the index to 0 and then
  # incremented it, so a stop word sitting at position 0 was never removed.
  return [stemmer.stem(word) for word in line.split() if word not in stop_words]

Creating set of unique words (to use as references for creating vocabulary)

In [None]:
# Gather every distinct processed word of the training rows; a set is used so
# that repeated words are stored only once. Row 0 is skipped.
word_setfinal=set()
for row_no in range(1, len(data)):
  for token in preprocessline(data, row_no):
    word_setfinal.add(token)

creating vocabulary

In [None]:
# vocabulary maps each unique word to the number of times it occurs in the
# training rows. Every word starts at frequency 0.
vocabulary=dict.fromkeys(word_setfinal, 0)

for i in range(1, len(data)):
  # Bump the counter of each word of the row directly instead of scanning the
  # whole vocabulary against every word of the row; the counts are the same.
  for wd in preprocessline(data, i):
    if wd in vocabulary:            # only words of word_setfinal are counted
      vocabulary[wd]+=1

Prior distribution of words

In [None]:
# Prior distribution: each word's frequency divided by the total of all
# frequencies stored in the vocabulary.
total_sum=sum(vocabulary.values())      # total number of counted words in the document
prob_words={word: count/total_sum for word, count in vocabulary.items()}

class conditional probabilities of words

In [None]:
# Process every training row once and keep the resulting word lists, so the
# cells below do not have to call preprocessline again.
doc_lst=[preprocessline(data, row_no) for row_no in range(1, len(data))]

# sent[word] = number of processed sentences in which the word appears
sent={
  term: sum(1 for sentence in doc_lst if term in sentence)
  for term in word_setfinal
}

In [None]:
# cc_prob is a list of (word, table) tuples. For a tuple (wod1, d), d[wod2] is
#   (sentences containing both wod1 and wod2) / (sentences containing wod1),
# or 0.0001 when the two words never share a sentence.
sentence_sets=[set(sentence) for sentence in doc_lst]    # O(1) membership per sentence
cc_prob=[]
for wod1 in word_setfinal:
  # Only sentences containing wod1 can contribute to a co-occurrence count.
  with_wod1=[members for members in sentence_sets if wod1 in members]
  d=dict()
  for wod2 in word_setfinal:
    c=sum(1 for members in with_wod1 if wod2 in members)
    # Assigned once per pair, after counting (it used to be rewritten for every sentence).
    d[wod2]=c/sent[wod1] if c!=0 else 0.0001
  cc_prob.append((wod1, d))


Applying on test data

In [None]:
# Open the spreadsheet named '10' and take its `sheet1` worksheet (the test set).
worksheet1 = gc.open('10').sheet1
# Fetch all cell values. NOTE: this rebinds `rows`, which held the training rows.
rows = worksheet1.get_all_values()
# Convert the fetched rows to a DataFrame (nothing is displayed here).
data_test=pd.DataFrame.from_records(rows)

posterior distribution of words for a sentence and predicting most likely word

In [None]:
def idx(lst, word):
  """Return the position of the first entry of ``lst`` whose element 0 equals ``word``.

  Parameters
  ----------
  lst : sequence of indexable entries
      In this notebook, the list of (word, table) tuples of class conditional
      probabilities.
  word : str
      The word to look up.

  Returns
  -------
  int
      Index of the first matching entry.

  Raises
  ------
  IndexError
      If no entry matches. The exception type is what the unbounded scan
      raised before; the message now names the missing word.
  """
  for position, entry in enumerate(lst):
    if entry[0]==word:
      return position
  raise IndexError(f"{word!r} not found in lookup list")

In [None]:
# For every test sentence (row 0 is skipped), score each word of the training
# vocabulary and print the highest-scoring one.
for f in range(1, len(data_test)):
  p_list={}          # candidate word -> score for this sentence
  words=preprocessline(data_test, f)

  # Resolve each sentence word once: its conditional-probability table, or
  # None when the word never occurred in the training set.
  tables=[cc_prob[idx(cc_prob, w)][1] if w in word_setfinal else None for w in words]

  for wd in word_setfinal:
    prob=1
    for table in tables:
      if table is None:
        # Unseen word: contribute a small factor. Previously prob was *reset* to
        # 0.0001 (dropping the factors gathered so far) and then multiplied by
        # the table of a stale index from an earlier iteration or cell.
        prob*=0.0001
      else:
        prob*=table[wd]
    prob=prob*prob_words[wd]      # weight by the prior of the candidate word
    p_list[wd]=prob
  probable_word = max(p_list, key=p_list.get)
  print('The most suitable word is', probable_word)