In [1]:
#!pip install contractions
import operator
import contractions     
import re           
import nltk                   
nltk.download('stopwords')             # Download Stopwords.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords            # Import stopwords.
from nltk.tokenize import word_tokenize # Import Tokenizer.
from nltk.stem.wordnet import WordNetLemmatizer         # Import Lemmatizer.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Opening and reading a sample of the data files

In [2]:
import os
import random

path = "sample_dataset"
#sample_size = 1000

# os.listdir() returns entries in arbitrary order. Sorting makes the order in
# which documents are read -- and therefore the insertion order of the
# frequency table built from them, which breaks ties between equally likely
# predictions -- reproducible from one run/machine to the next.
file_list = sorted(os.listdir(path))
#sample_files = random.sample(file_list, sample_size)

# Read every regular file of the dataset folder into memory, one string per file.
data = []
for file_name in file_list:
    file_path = os.path.join(path, file_name)
    # Sub-directories cannot be opened as text files; skip them instead of crashing.
    if not os.path.isfile(file_path):
        continue
    # NOTE(review): no explicit encoding is given, so the platform default is
    # used -- confirm the dataset's encoding and pass encoding=... if needed.
    with open(file_path, "r") as f:
        data.append(f.read())

### Preparing Data 

In [3]:
def removePunctuation(words):
  """Strip punctuation from every token and drop tokens that end up empty.

  Any character that is neither a word character (letter, digit, underscore)
  nor whitespace is deleted from each token.

  Args:
    words: iterable of string tokens.

  Returns:
    A new list with the cleaned, non-empty tokens in their original order.
  """
  cleaned = (re.sub(r'[^\w\s]', '', token) for token in words)
  return [token for token in cleaned if token != '']

def removeStopwords(words):
  """Return the tokens of `words` that are not English stop words.

  Args:
    words: iterable of string tokens.

  Returns:
    A new list with the surviving tokens in their original order.
  """
  # A set gives O(1) membership tests; the original scanned the whole
  # stop-word list once per token.
  stopwordSet = set(stopwords.words('english'))
  return [word for word in words if word not in stopwordSet]

def lemmatize(words):
  """Lemmatize every token as a verb (pos='v') with a WordNetLemmatizer.

  Args:
    words: iterable of string tokens.

  Returns:
    A new list holding the lemma of each token, in the original order.
  """
  verbLemmatizer = WordNetLemmatizer()
  return [verbLemmatizer.lemmatize(token, pos='v') for token in words]

In [4]:
def preprocessing(data):
  """Turn a raw text string into a list of normalized tokens.

  Steps, in order: expand contractions (e.g. "you're" -> "you are"),
  lowercase, tokenize with nltk.word_tokenize, then pass the tokens through
  removePunctuation, removeStopwords and lemmatize.

  Args:
    data: the raw text.

  Returns:
    The list of tokens produced by the last step.
  """
  expanded = contractions.fix(data)
  tokens = nltk.word_tokenize(expanded.lower())

  # Each cleaning step consumes and returns a list of tokens.
  for step in (removePunctuation, removeStopwords, lemmatize):
    tokens = step(tokens)

  return tokens

In [5]:
def prepareInput(inputSeq, contextSize=2):
  """Reduce the typed text to the context string used for n-gram lookup.

  The text is preprocessed first and only then cut down to its last
  `contextSize` tokens. Truncating the raw text before preprocessing (as was
  done previously) lost the context whenever the last words were stop words
  (e.g. "president of the" became an empty string) and could leave more than
  `contextSize` tokens after contraction expansion.

  Args:
    inputSeq: the raw text typed so far.
    contextSize: maximum number of trailing tokens to keep. The default of 2
      matches the trigram tables built by prepareNgrams.

  Returns:
    The kept tokens joined by single spaces ("" when no token survives).
  """
  tokens = preprocessing(inputSeq)
  # Computing the start index explicitly keeps contextSize=0 meaning
  # "no tokens" (tokens[-0:] would return the whole list).
  firstKept = max(len(tokens) - contextSize, 0)
  return " ".join(tokens[firstKept:])

### Preparing n-grams and the frequency table

In [6]:
def generateNgrams(data, n):
    """Return every run of n consecutive items of `data` as a tuple.

    A sequence of length L yields L - n + 1 tuples, in order of their start
    position (e.g. n=3 and L=6 give 4 tuples); nothing is produced when the
    sequence is shorter than n.

    Args:
        data: sliceable sequence of tokens.
        n: size of each n-gram.

    Returns:
        A list of n-item tuples.
    """
    windowCount = len(data) - n + 1
    return [tuple(data[start:start + n]) for start in range(windowCount)]

In [7]:
def calcFrequencies(ngrams):
  """Count how many times each n-gram occurs.

  Args:
    ngrams: iterable of hashable n-grams (tuples of tokens).

  Returns:
    A plain dict mapping each distinct n-gram to its count, keyed in
    first-seen order.
  """
  counts = {}
  for gram in ngrams:
    counts[gram] = counts.get(gram, 0) + 1
  return counts

In [8]:
def calcProbabilities(frequencyTable,inputSeq):
  """Compute the relative frequency of every word that follows `inputSeq`.

  The context is `inputSeq` split on single spaces. For each n-gram in the
  table that consists of the context plus one more word, that word's value is
  count(context + word) / count(context).

  Args:
    frequencyTable: dict mapping n-gram tuples to their counts.
    inputSeq: space-separated context string.

  Returns:
    A dict mapping each following word to its relative frequency, in table
    order; empty when the context itself is not a key of the table.
  """
  context = tuple(inputSeq.split(" "))

  try:
    contextCount = frequencyTable[context]
  except KeyError:
    # Unseen context: there is nothing to predict from.
    return {}

  return {
    ngram[-1]: count / contextCount
    for ngram, count in frequencyTable.items()
    if ngram[:-1] == context
  }

In [9]:
def predict(inputSeq,frequencyTable):
  """Return up to ten of the most probable next words for `inputSeq`.

  Args:
    inputSeq: space-separated context string.
    frequencyTable: dict mapping n-gram tuples to their counts.

  Returns:
    A dict of word -> probability ordered from most to least probable
    (ties keep the order calcProbabilities produced), holding at most
    ten entries.
  """
  candidates = calcProbabilities(frequencyTable, inputSeq)
  ranked = sorted(candidates.items(), key=lambda pair: pair[1], reverse=True)
  # Slicing is a no-op when there are ten or fewer candidates.
  return dict(ranked[:10])

In [10]:
def prepareNgrams(data):
  """Build the combined unigram, bigram and trigram list for all records.

  Every record is preprocessed into tokens, then n-grams of size 1, 2 and 3
  are generated from it.

  Args:
    data: iterable of raw text records.

  Returns:
    One list holding all unigrams (record by record), followed by all
    bigrams, followed by all trigrams.
  """
  tokenizedRecords = [preprocessing(record) for record in data]

  ngrams = []
  for size in (1, 2, 3):
    for tokens in tokenizedRecords:
      ngrams.extend(generateNgrams(tokens, size))
  return ngrams

In [11]:
def getPredictionByChar(predictionResult,inputSeqSubstr):
    """Keep only the predicted words that start with the typed fragment.

    Args:
        predictionResult: iterable of candidate words (e.g. the dict returned
            by predict, whose keys are the words).
        inputSeqSubstr: the partially typed word.

    Returns:
        A list of matching words in their original order; an empty list when
        the fragment is the empty string.
    """
    if inputSeqSubstr == "":
        return []
    return [candidate for candidate in predictionResult if candidate.startswith(inputSeqSubstr)]

In [12]:
# Build the n-gram list from every loaded document, then count how often each
# distinct n-gram occurs. frequencyTable is the lookup table the GUI callback
# passes to predict().
ngrams = prepareNgrams(data)
frequencyTable = calcFrequencies(ngrams)

# GUI

In [None]:
import tkinter as tk

def suggest_results(*args):
    """Refresh the listbox with suggestions for the current search text.

    Two kinds of suggestions are listed, in this order:
      1. completions of the word being typed: next-word predictions for the
         text before the last space, filtered by the characters typed after
         that space;
      2. next-word predictions for the whole text typed so far.

    The positional arguments supplied by the variable trace are ignored.
    """
    searchTerm = searchVar.get()
    splitAt = searchTerm.rfind(" ")

    # 1. Completions of the partially typed last word (needs at least one space).
    completions = []
    if splitAt != -1:
        context = prepareInput(searchTerm[:splitAt])
        completions = getPredictionByChar(predict(context, frequencyTable), searchTerm[splitAt+1:])

    # 2. Words predicted to follow everything typed so far.
    nextWords = list(predict(prepareInput(searchTerm), frequencyTable).keys())

    listbox.delete(0, tk.END)

    # A completion replaces the fragment after the last space.
    for word in completions:
        listbox.insert(tk.END, searchTerm[:splitAt] + " " + word)

    # A next-word suggestion is appended to the text; one trailing space is
    # dropped so the separator is not doubled.
    if searchTerm[-1:] == " ":
        base = searchTerm[:-1]
    else:
        base = searchTerm
    for word in nextWords:
        listbox.insert(tk.END, base + " " + word)

def replace_selected_suggestion(event):
    """Copy the suggestion selected in the listbox into the search bar.

    Does nothing when the listbox has no selection.

    Args:
        event: the event object passed by the listbox binding (unused).
    """
    picked = listbox.curselection()
    if not picked:
        return
    # Overwrite the typed text with the first selected suggestion.
    searchVar.set(listbox.get(picked[0]))


# Build the main window.
root = tk.Tk()
root.title("Auto-Filling Program")
try:
    root.iconbitmap("search.ico")
except tk.TclError:
    # The window icon is purely cosmetic: keep the app usable when the icon
    # file is missing or the platform cannot load it.
    pass
root.geometry("370x320")


label = tk.Label(root, text="Search", font=("Arial", 20, "bold"), pady=5)
label.pack()
label.config(fg="#857DB1")

# create a search bar
searchVar = tk.StringVar()
searchEntry = tk.Entry(root, textvariable=searchVar, width=30, font=("Arial", 15), borderwidth=5)
searchEntry.pack(fill=tk.X, expand=True)


# create a listbox
listbox = tk.Listbox(root, width=30, font=("Arial", 15), borderwidth=5)
listbox.config(selectmode=tk.SINGLE, selectbackground="light blue")
listbox.pack(fill=tk.BOTH, expand=True)


# selecting a suggestion copies it into the search bar
listbox.bind("<<ListboxSelect>>", replace_selected_suggestion)

# update the listbox whenever the text of the search bar is written to;
# trace_add("write", ...) replaces the deprecated trace("w", ...)
searchVar.trace_add("write", suggest_results)


# hand control to tkinter until the window is closed
root.mainloop()