In [171]:
import nltk
import os
import string
from nltk.corpus import stopwords

In [172]:
# Download the 'punkt' tokenizer data (presumably what nltk.word_tokenize needs — confirm for the installed NLTK version)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\EGYPT_LAPTOP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [173]:
# Download the stop-word corpus that stopwords.words("english") reads from
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\EGYPT_LAPTOP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [174]:
def readingData(directory_path, encoding=None):
    """Read the whole content of every regular file directly inside a folder.

    Sub-directories are skipped; the folder is not walked recursively.

    Parameters
    ----------
    directory_path : str
        Folder that holds the text files.
    encoding : str, optional
        Text encoding handed to ``open``. The default ``None`` keeps the
        platform's default encoding.

    Returns
    -------
    list of str
        One string per file (the file's full text, not split into
        sentences), in the order ``os.listdir`` reports the entries.
    """
    filesSentences = []
    for filename in os.listdir(directory_path):
        f = os.path.join(directory_path, filename)
        if os.path.isfile(f):
            # The context manager closes the handle even when read() raises
            with open(f, "r", encoding=encoding) as textFile:
                filesSentences.append(textFile.read())

    return filesSentences

In [175]:
def tokenization(data):
    """Split every text in ``data`` into word tokens.

    ``data`` is an iterable of strings. The result is a list holding one
    token list per input string, each produced by ``nltk.word_tokenize``.
    """
    return [nltk.word_tokenize(text) for text in data]

In [176]:
def removeStopWords(sentence):
    """Return the words of ``sentence`` that are not English stop words.

    The stop-word list is read from NLTK's ``stopwords`` corpus on every
    call. Matching is exact, so it is case-sensitive. Word order is kept.
    """
    english_stops = frozenset(stopwords.words("english"))
    return [token for token in sentence if token not in english_stops]

In [177]:
def cleaning(tokenizedData):
    """Normalise tokenised documents before n-gram counting.

    For every document, in this order:
      1. the characters in ``string.punctuation`` are stripped from each token,
      2. tokens left empty by step 1 are dropped,
      3. the remaining tokens are lower-cased,
      4. stop words are removed with ``removeStopWords``.

    Parameters
    ----------
    tokenizedData : iterable of list of str
        One token list per document.

    Returns
    -------
    list of list of str
        A new list of new token lists. ``tokenizedData`` itself is left
        untouched, so the raw tokens stay available to the caller and
        calling this twice on the same input gives the same result.
    """
    # Translation table that deletes every punctuation character
    strip_punctuation = str.maketrans("", "", string.punctuation)

    cleaned = []
    for tokens in tokenizedData:
        stripped = (word.translate(strip_punctuation) for word in tokens)
        lowered = [word.lower() for word in stripped if word]
        cleaned.append(removeStopWords(lowered))

    return cleaned

In [178]:
# Build the corpus: load the text of every file in ./dataset, split each text
# into word tokens, then normalise those tokens.
filesSentences = readingData('./dataset')
tokenizedData = tokenization(filesSentences)
cleanedData = cleaning(tokenizedData)

In [179]:
def n_grams(data, n):
    """Count every run of ``n`` consecutive tokens in ``data``.

    ``data`` is an iterable of token sequences. The result is a dict mapping
    each n-gram, stored as a tuple of tokens, to how many times it occurs.
    An n-gram never spans two sequences.
    """
    counts = {}

    for tokens in data:
        tokens = tuple(tokens)
        last_start = len(tokens) - n  # highest index an n-gram can start at

        for start in range(last_start + 1):
            gram = tokens[start:start + n]
            # dict.get supplies 0 the first time a given n-gram is seen
            counts[gram] = counts.get(gram, 0) + 1

    return counts

In [180]:
def calcProb(nGramFreqDic, nPlusOneFreqDic):
    """Estimate P(last word | preceding words) for every (n+1)-gram.

    Each probability is ``count(key) / count(key without its last word)``.

    Parameters
    ----------
    nGramFreqDic : dict
        Maps n-gram tuples to their counts.
    nPlusOneFreqDic : dict
        Maps (n+1)-gram tuples to their counts.

    Returns
    -------
    dict
        Same keys as ``nPlusOneFreqDic``. The value is 0 when either count
        is zero or the prefix does not appear in ``nGramFreqDic``.
    """
    probs = {}
    for key, numerator in nPlusOneFreqDic.items():
        # A prefix missing from nGramFreqDic counts as 0; a bare .get() would
        # return None, which float() rejects with a TypeError.
        denominator = nGramFreqDic.get(key[:-1], 0)
        if not numerator or not denominator:
            probs[key] = 0
        else:
            probs[key] = float(numerator) / float(denominator)

    return probs

In [181]:
def getSuggestedWords(previousTokens, probDic, maxSuggestions=10):
    """Return the most probable words that follow ``previousTokens``.

    Parameters
    ----------
    previousTokens : sequence of str
        The words typed so far. Any sequence is accepted and converted to a
        tuple, because the keys of ``probDic`` are tuples.
    probDic : dict
        Maps n-gram tuples to probabilities.
    maxSuggestions : int, optional
        Upper bound on the number of words returned (default 10).

    Returns
    -------
    list of str
        The word that follows the prefix in each matching key, ordered from
        most to least probable and cut to ``maxSuggestions`` entries.
    """
    prefix = tuple(previousTokens)
    prefixLength = len(prefix)

    suggestedWords = {}
    for key, probability in probDic.items():
        # A key no longer than the prefix has no following word to offer;
        # indexing it would raise IndexError.
        if len(key) > prefixLength and key[:prefixLength] == prefix:
            suggestedWords[key[prefixLength]] = probability

    ranked = sorted(suggestedWords.items(), key=lambda item: item[1], reverse=True)

    # Slicing keeps at most maxSuggestions words (the former counter loop let
    # one extra word through); a negative limit is treated as zero.
    return [word for word, _ in ranked[:max(maxSuggestions, 0)]]

In [182]:
# Frequency tables for 1-, 2- and 3-word sequences of the cleaned corpus
uniGramFreqDic = n_grams(cleanedData, 1)
biGramFreqDic = n_grams(cleanedData, 2)
triGramFreqDic = n_grams(cleanedData, 3)

In [183]:
def triGramModel(previousTokens):
    """Suggest next words from the trigram probabilities.

    The probabilities are recomputed on every call from the module-level
    ``biGramFreqDic`` and ``triGramFreqDic`` tables, then filtered by
    ``previousTokens`` through ``getSuggestedWords``.
    """
    context = tuple(previousTokens)
    return getSuggestedWords(context, calcProb(biGramFreqDic, triGramFreqDic))

In [184]:
def biGramModel(previousTokens):
    """Suggest next words from the bigram probabilities.

    The probabilities are recomputed on every call from the module-level
    ``uniGramFreqDic`` and ``biGramFreqDic`` tables, then filtered by
    ``previousTokens`` through ``getSuggestedWords``.
    """
    context = tuple(previousTokens)
    return getSuggestedWords(context, calcProb(uniGramFreqDic, biGramFreqDic))

In [185]:
def autoFill(text):
    """Suggest next words for the text typed so far.

    Parameters
    ----------
    text : list of str or str
        The typed text. A list is what ``tokenization`` expects; a bare
        string is wrapped into a one-element list. Only the first entry is
        used for the lookup.

    Returns
    -------
    list
        Suggestions from the trigram model (using the last two cleaned
        words) when at least two words remain after cleaning, from the
        bigram model when exactly one remains, and an empty list when
        nothing remains (empty input or stop words only).
    """
    if isinstance(text, str):
        text = [text]

    cleanedTokens = cleaning(tokenization(text))

    # Always hand back a list so callers can iterate without a None check
    if not cleanedTokens or not cleanedTokens[0]:
        return []

    typedWords = cleanedTokens[0]
    if len(typedWords) >= 2:
        return triGramModel(typedWords[-2:])
    return biGramModel(typedWords)

# GUI

In [199]:
import tkinter as tk
from tkinter.ttk import *
from PIL import ImageTk, Image

In [200]:
# Root window of the application, with a white background
window = tk.Tk()
window.configure(bg='white')

In [201]:
# Logo image, shown in a label with a white background.
# NOTE(review): no parent widget is passed to tk.Label, so it presumably
# attaches to Tkinter's default root — confirm that this is `window`.
# NOTE(review): keep `img` referenced for as long as the label is shown;
# Tkinter images are known to go blank once the PhotoImage is garbage-collected.
img = ImageTk.PhotoImage(Image.open("./assets/google.png"))
imageLabel = tk.Label(image = img)
imageLabel.configure(bg='white')

In [202]:
# Container for the search row (search icon, entry box, camera icon)
frame = tk.Frame(window)
frame.configure(bg='white')

In [203]:
# Search icon: load it, resize it to 30x40 pixels with LANCZOS resampling,
# and place it in a white label inside the search row.
searchImage = (Image.open("./assets/search.png"))
resized_searchImage = searchImage.resize((30,40), Image.Resampling.LANCZOS)
new_searchImage = ImageTk.PhotoImage(resized_searchImage)
searchImageLabel = tk.Label(frame, image = new_searchImage)
searchImageLabel.configure(bg='white')

In [204]:
# Entry box for the typed query (inside the search row) and the listbox that
# displays the suggestions (a direct child of the window).
inputText = tk.Entry(frame, width = 35,  font=('Arial 24'))
listbox = tk.Listbox(window, width = 100, font=('Arial 24'))

def update(data):
    """Replace the listbox content with ``data`` and make the listbox visible.

    Parameters
    ----------
    data : iterable or None
        Suggestions to display, in order. ``None`` is treated as "no
        suggestions" so a lookup that yields nothing just clears the list
        instead of raising a TypeError.
    """
    listbox.delete(0, tk.END)
    for item in data or ():
        listbox.insert(tk.END, item)
    listbox.pack(side='left', padx=450, pady=10)

def fillout(event):
    """Append the suggestion selected in the listbox to the end of the entry box.

    Does nothing when the listbox has no selection. ``event`` is accepted
    because this is used as an event callback, but it is not read.
    """
    selection = listbox.curselection()
    if selection == ():
        return
    inputText.insert(tk.END, listbox.get(selection))

def check(event):
    """Look up suggestions for the entry box's current text and display them.

    The text is passed to ``autoFill`` as a one-element list and the result
    goes straight to ``update``. ``event`` is accepted because this is used
    as an event callback, but it is not read.
    """
    update(autoFill([inputText.get()]))

# Selecting a suggestion appends it to the entry box; pressing the space key
# in the entry box refreshes the suggestions.
listbox.bind("<<ListboxSelect>>", fillout)
inputText.bind("<space>", check)

'3208798791680check'

In [205]:
# Camera icon: load it, resize it to 30x40 pixels with LANCZOS resampling,
# and place it in a white label inside the search row.
cameraImage = (Image.open("./assets/camera.png"))
resized_cameraImage= cameraImage.resize((30,40), Image.Resampling.LANCZOS)
new_cameraImage= ImageTk.PhotoImage(resized_cameraImage)
cameraImageLabel = tk.Label(frame, image = new_cameraImage)
cameraImageLabel.configure(bg='white')

In [206]:
# Layout. The logo is packed first, then the search row's widgets are packed
# against the left side in this order (search icon, entry box, camera icon),
# and finally the row's frame itself is packed. Pack order determines placement.
imageLabel.pack(padx= 10, pady=90)
searchImageLabel.pack(side='left', expand = True)
inputText.pack(side='left', expand = True)
cameraImageLabel.pack(side='left', expand = True)
frame.pack()

In [207]:
# Start the Tk event loop; this call blocks until the window is closed
window.mainloop()