In [None]:
!pip install textblob
!pip install --upgrade spacy
!pip install wordsegment
!pip install jaro-winkler

import nltk
from nltk import FreqDist
from nltk.corpus import words, names, wordnet, brown, gutenberg, stopwords
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.tokenize import RegexpTokenizer, word_tokenize
import numpy as np
import re
import pandas as pd
from textblob import TextBlob
from wordsegment import load, segment
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE, Laplace, AbsoluteDiscountingInterpolated, KneserNeyInterpolated
from nltk.text import Text

from nltk.metrics.distance import edit_distance
import jaro
from datetime import datetime 
import pandas as pd
from nltk.corpus import wordnet
import joblib 
from collections import Counter
from sklearn.model_selection import train_test_split
import tkinter as tk

# Download the NLTK resources this notebook relies on.
# Each resource is requested exactly once ('punkt' was previously requested twice).
for _resource in (
    'words',
    'names',
    'wordnet',
    'omw-1.4',
    'punkt',
    'averaged_perceptron_tagger',
    'gutenberg',
    'brown',
    'stopwords',
):
    nltk.download(_resource)

class spelling_correction():

    def __init__(self, distance_type, language_model_type=1, ngram=2, weight=0.5, threshold=1.e-01):
        """Load every model/data file, store the tuning knobs and start the GUI.

        Args:
            distance_type: 1, 2 or 3; selects the edit-distance variant used
                by ``generation_edit_distance``.
            language_model_type: 1, 2 or 3; together with ``ngram`` selects
                the model returned by ``language_model``.
            ngram: 2 or 3 (order of the n-gram model).
            weight: weight given to the language-model probability when it is
                blended with the noisy-channel probability (which receives
                ``1 - weight``).
            threshold: minimum bigram score ``segmenting_token`` requires
                before it splits one token into two words.

        Note:
            ``gui_layout`` runs the Tk main loop, so this constructor only
            returns after the window has been closed.
        """
        super().__init__()

        # Load dictionary, corpus, name
        self.dictionary = joblib.load(r'C:\Users\User\Desktop\Software learning\Assignment\Models\dictionary.joblib')
        self.wwn = joblib.load(r'C:\Users\User\Desktop\Software learning\Assignment\Models\wwn.joblib')
        self.named_entity = joblib.load(r'C:\Users\User\Desktop\Software learning\name (4).joblib')

        self.confusion_sets_common = None
        self.confusion_sets_synonym = None
        self.distance_type = distance_type

        # Load confusion sets; every cell is cast to the pandas "string" dtype
        # so the .str accessor can be used on each row.
        self.confusion_sets_1 = pd.read_excel(r'C:\Users\User\Desktop\Software learning\Assignment\confusion sets.xlsx', sheet_name = "Conso (crafted)")
        self.confusion_sets_2 = pd.read_csv(r'C:\Users\User\Desktop\Software learning\Assignment\confusion_sets_2.csv', low_memory = False)
        self.confusion_sets_1 = self.confusion_sets_1.astype("string")
        self.confusion_sets_2 = self.confusion_sets_2.astype("string")

        # Load ngram
        self.laplace_bigram = joblib.load(r'C:\Users\User\Desktop\Software learning\Assignment\Models\laplace_bigram.joblib')
        self.laplace_trigram = joblib.load(r'C:\Users\User\Desktop\Software learning\Assignment\Models\laplace_trigram.joblib')
        self.absolute_bigram = joblib.load(r'C:\Users\User\Desktop\Software learning\Assignment\Models\absolute_bigram.joblib')
        self.absolute_trigram = joblib.load(r'C:\Users\User\Desktop\Software learning\Assignment\Models\absolute_trigram.joblib')
        self.kneser_bigram = joblib.load(r'C:\Users\User\Desktop\Software learning\Assignment\Models\kneser_bigram.joblib')
        self.kneser_trigram = joblib.load(r'C:\Users\User\Desktop\Software learning\Assignment\Models\kneser_trigram.joblib')
        self.language_model_type = language_model_type
        self.n_gram = ngram

        # Formula of noisy channel: numerator tables (columns "pattern" and
        # "count" are read by noisy_prob) and their denominator lookups.
        self.num_deletion = pd.read_csv(r'C:\Users\User\Desktop\Software learning\Assignment\deletion_cm.csv')
        self.num_insertion = pd.read_csv(r'C:\Users\User\Desktop\Software learning\Assignment\insertion_cm.csv')
        self.num_substitution = pd.read_csv(r'C:\Users\User\Desktop\Software learning\Assignment\substitution_cm.csv')
        self.num_transposition = pd.read_csv(r'C:\Users\User\Desktop\Software learning\Assignment\transposition_cm.csv')
        self.den_del = joblib.load(r'C:\Users\User\Desktop\Software learning\Assignment\deletion_denominator')
        self.den_ins = joblib.load(r'C:\Users\User\Desktop\Software learning\Assignment\insertion_denominator')
        self.den_sub = joblib.load(r'C:\Users\User\Desktop\Software learning\Assignment\substitution_denominator')
        self.den_tra = joblib.load(r'C:\Users\User\Desktop\Software learning\Assignment\transposition_denominator')

        self.weight = weight
        self.threshold = threshold
        # A dictionary word whose normalised probability is not above this
        # value is reported as a real-word error.
        self.true_prob = 0.0001

        # GUI state read by the button/listbox callbacks. Initialised here so
        # a callback that fires before the first check finds an empty default
        # instead of a missing attribute.
        self.input = ""
        self.idx = "1.0"
        self.error = []
        self.filtered_list = []

        # Printed before gui_layout() because that call blocks in the Tk main
        # loop until the window is closed.
        print("\nStatus: Ready\n")
        self.gui_layout()


    def gui_layout(self):
        """Build the main window and run the Tk event loop.

        Creates a header bar, a left frame (text box plus "Check" and "Clear"
        buttons) and a right frame (suggestion listbox plus a search row).
        The widgets the callbacks need are stored on ``self``. The method then
        blocks in ``window.mainloop()`` until the window is closed.
        """
        window = tk.Tk()
        window.configure(bg="Honeydew")
        window.geometry('{}x{}'.format(1530, 780))
        window.title('Spelling Corrector')

        window.grid_rowconfigure(1, weight=1)
        window.grid_columnconfigure(0, weight=1)

        # -------------------------------------------------------------------------------
        # First container: header bar
        self.top_frame = tk.Frame(master=window, bg="MediumSeaGreen")
        self.top_frame.pack(fill=tk.X)
        logo = tk.Label(self.top_frame, text="English Checker", font=("Arial Bold", 18),
                         fg='White', bg='MediumSeaGreen')
        logo.grid(row=0, padx=20, ipady=10)

        hr_space = tk.Frame(window, bg="Honeydew", height=60)
        hr_space.pack(fill=tk.X)

        # -------------------------------------------------------------------------------
        # Second container
        # ****************************
        # Create left frame
        self.left_frame = tk.Frame(master=window, bg="light yellow", width=800, height=60,
                        borderwidth=2)
        self.left_frame.pack(side=tk.LEFT, anchor="nw", padx=40)

        # Inside Left Frame - Header
        ed_header = tk.Label(self.left_frame, text="Texts", font=("Arial Bold", 15),
                                fg='black', bg='light yellow', pady=5)
        ed_header.pack(padx=15, side=tk.TOP, anchor="nw")

        # Inside Left Frame - Text Box
        self.txt = tk.Text(self.left_frame, width=100, height=20, bg="snow",
                     bd=1, relief=tk.GROOVE, padx=8, pady=5, font=("Arial", 12))
        self.txt.focus()
        self.txt.pack(side=tk.TOP, anchor="nw", padx=20, pady=10)

        # Inside Left Frame - Check Button: clear old suggestions, then re-check
        # the whole text from the start.
        self.check_button = tk.Button(self.left_frame, text="Check", bg="Gainsboro", fg="black",
                                      padx=10, command=lambda: [self.clear_output(), self.get_input()], pady=5)
        self.check_button.pack(side=tk.TOP, anchor="nw", padx=20, pady=(0, 10))

        # ****************************
        # Create right frame
        self.right_frame = tk.Frame(master=window, bg="light goldenrod yellow", width=500, height=70)  # Snow
        self.right_frame.pack(side=tk.RIGHT, anchor="ne", padx=40)

        # Inside Right Frame - Header
        sg_header = tk.Label(self.right_frame, text="Suggestions: ", font=("Arial Bold", 15),
                                fg='black', bg='light goldenrod yellow', pady=5)
        sg_header.pack(padx=15, side=tk.TOP, anchor="nw")

        # Inside Right Frame - Dictionary Box
        self.lb = tk.Listbox(self.right_frame, bg="White", fg="black", width=60,
                               relief=tk.GROOVE, selectmode=5, font=12, activestyle='none')

        def on_select(event):
            # <<ListboxSelect>> can be delivered while nothing is selected
            # (for example after the selection is cleared); asking the
            # listbox for an empty index raises TclError, so ignore it.
            selection = self.lb.curselection()
            if not selection:
                return
            self.choose_correction(self.lb.get(selection[0]))

        self.lb.bind("<<ListboxSelect>>", on_select)
        self.lb.pack(padx=15, pady=10)

        # Create search frame
        self.search_frame = tk.Frame(master=self.right_frame, bg='light goldenrod yellow', width=50, height=35)
        self.search_frame.pack(side=tk.TOP)

        # Inside Search Frame - Search Box Header
        self.search = tk.Button(self.search_frame, text="Search", bg="Gainsboro", padx=6.8, pady=5,
                            fg="black", relief=tk.GROOVE, command=self.Search)
        self.search.pack(padx=5, pady=5, side=tk.LEFT)

        # Inside Search Frame - Search Box; its content is read through
        # self.user_search by Search().
        self.user_search = tk.StringVar()
        self.search_box = tk.Entry(self.search_frame, bg="White", fg="black",
                           font=11, textvariable=self.user_search)
        self.search_box.pack(padx=5, pady=5, side=tk.RIGHT)

        # -------------------------------------------------------------------------------------------
        # Third container
        left_space = tk.Frame(self.left_frame, bg="light yellow", height=20)
        left_space.pack(side=tk.TOP)

        # Clear button space
        cb_space = tk.Frame(self.left_frame, bg="light yellow", height=30, width=800)
        cb_space.pack(side=tk.TOP)

        # Clear button
        self.clear_button = tk.Button(master=cb_space, text="Clear", bg="Gainsboro", fg="black",
                                      command=self.clear_input, pady=5, width=6)
        self.clear_button.pack(side=tk.TOP, pady=10)

        # Blocks here until the window is closed.
        window.mainloop()

        
    def get_input(self, idx="1.0"):
        self.idx = idx
        self.input = self.txt.get(self.idx, "end-1c")
        self.spelling_correction()
        
    
    def clear_output(self):
        """Remove every entry from the suggestion listbox."""
        suggestions = self.lb
        suggestions.delete(0, tk.END)
    
    
    def clear_input(self):
        """Empty the text box and then the suggestion listbox."""
        text_box = self.txt
        text_box.delete("1.0", tk.END)
        # Suggestions belong to the text that was just removed.
        self.clear_output()
        
    def Search(self):
        us = self.user_search.get()
        for x in range(len(self.filtered_list)):
            if us == self.filtered_list[x]:
                self.lb.selection_set(x)
        
        
    def choose_correction(self, selected):
        """Replace the flagged word with ``selected`` and re-check the rest.

        The first entry of ``self.error`` is located in the text box
        (case-insensitive search starting at ``self.idx``), replaced by
        ``selected``, and checking resumes right after the replacement.

        Args:
            selected: the suggestion picked in the listbox.
        """
        pending = getattr(self, "error", None)
        if not pending:
            # Nothing is waiting for a correction.
            return
        wrong_word = pending[0]

        # Tk's text search returns an empty string when there is no match;
        # using that as an index would raise TclError, so leave the UI as is.
        idx = self.txt.search(wrong_word, self.idx, nocase=1, stopindex=tk.END)
        if not idx:
            return

        self.lb.delete(0, tk.END)
        self.txt.tag_remove("highlight", "1.0", tk.END)
        print("starting index: ", idx)
        # "<index>+<n>c" is the Tk index n characters after <index>.
        lastidx = '%s+%dc' % (idx, len(wrong_word))
        print("input last index: ", lastidx)
        self.txt.delete(idx, lastidx)
        self.txt.insert(idx, selected)
        lastidx = '%s+%dc' % (idx, len(selected))
        print("candidate last index:", lastidx)
        self.txt.tag_delete('highlight')
        # Continue checking from the end of the inserted word.
        self.get_input(str(lastidx))
        

    def detect_name(self, token = None):
        if (token in self.named_entity) | (self.input in self.named_entity):
            return True
        else:
            return False


    def detect_dictionary(self, token = None):
        if (token in self.dictionary) | (self.input in self.dictionary):
            return True
        else:
            return False


    def segmenting_token(self, token = None):
        candidates = []
        bigram_model = self.kneser_bigram

        for index in range(len(token)):
            if token[index:] in self.wwn:
                candidates.append([token[:index], token[index:]])
            elif token[:index] in self.wwn:
                candidates.append([token[:index], token[index:]])

        if len(candidates) == 0:
            output = [token]
        elif len(candidates) == 1:
            output = [word for sublist in candidates for word in sublist]
        elif len(candidates) >= 2:
            df = pd.DataFrame(data = candidates[0:], columns = ["candidate1", "candidate2"])
            df["prob"] = df.apply(lambda row: self.kneser_bigram.score(row["candidate2"], row["candidate1"].split()), axis = 1)
            df["rank"] = df["prob"].rank(axis = 0, method = "max", ascending = False)
            df["threshold"] = np.where(df["prob"] > self.threshold, 1, 0)
            df.loc[(df["rank"] == 1) & (df["threshold"] == 1), "selection"] = 1
            df_output = df.loc[df["selection"] == 1, ["candidate1", "candidate2"]]

            if df_output.shape[0] == 0:
                output = [token]
            else:
                output = df_output.iloc[0].tolist()
                output = [word for word in output if (word != "") | (word != " ")]

        return output # a list


    def generation_confusion_sets(self, token = None): 
        index_1 = self.confusion_sets_1.loc[self.confusion_sets_1.apply(lambda row: row.str.fullmatch(token, case = False).any(), axis = 1)].index
        output_confusion_sets_1 = self.confusion_sets_1.loc[self.confusion_sets_1.index.isin(index_1)].dropna(axis = 1).to_numpy().reshape(-1).tolist()

        index_2 = self.confusion_sets_2[self.confusion_sets_2.apply(lambda row: row.str.fullmatch(token, case = False).any(), axis = 1)].index
        output_confusion_sets_2 = self.confusion_sets_2.loc[self.confusion_sets_2.index.isin(index_2)].dropna(axis = 1).to_numpy().reshape(-1).tolist()

        output_confusion_sets = list(set(output_confusion_sets_1 + output_confusion_sets_2))
        return output_confusion_sets 


    def generation_edit_distance(self, token = None): 

        max_length_list = [len(token) - 1,  len(token), len(token) + 1]
        df = pd.DataFrame(self.dictionary, columns = ["original"])
        df["length"] = df["original"].apply(lambda x: len(x))
        df = df.loc[df["length"].isin(max_length_list)]

        # generation
        if self.distance_type == 1:  # Levenshtein (allow substitution)
            df["edit_distance"] = df["original"].apply(lambda x: edit_distance(token, x))
            df.sort_values(by = "edit_distance", ascending = True, inplace = True)

        elif self.distance_type == 2: # Levenshtein (disallow substitution)
            df["edit_distance"] = df["original"].apply(lambda x: edit_distance(token, x, substitution_cost = 2))
            df.sort_values(by = "edit_distance", ascending = True, inplace = True)

        elif self.distance_type == 3: # Damerau-Levenshtein 
            df["edit_distance"] = df["original"].apply(lambda x: edit_distance(token, x, transpositions = True))
            df.sort_values(by = "edit_distance", ascending = True, inplace = True)

        else:
            raise ValueError("Distance type is out of option.")

        df_edit_1 = df.loc[df["edit_distance"] <= 1]
        df_edit_2 = df.loc[df["edit_distance"] == 2]

        df_edit_1 = df_edit_1["original"].to_numpy().flatten().tolist()
        df_edit_2 = df_edit_2["original"].to_numpy().flatten().tolist()

        return df_edit_1, df_edit_2 # output_edit_distance is a list 


    def language_model(self):
        if (self.language_model_type == 1) & (self.n_gram == 2):
            lm = self.laplace_bigram
        elif (self.language_model_type == 1) & (self.n_gram == 3):
            lm = self.laplace_trigram

        elif (self.language_model_type == 2) & (self.n_gram == 2):
            lm = self.absolute_bigram
        elif (self.language_model_type == 2) & (self.n_gram == 3):
            lm = self.absolute_trigram

        elif (self.language_model_type == 3) & (self.n_gram == 2):
            lm = self.kneser_bigram
        elif (self.language_model_type == 3) & (self.n_gram == 3):
            lm = self.kneser_trigram
        else:
            raise ValueError("Invalid type")

        return lm


    def noisy_pattern(self, candidate, typed):

        index_e = []
        x = ""
        y = ""
        edit_type = ""

        if len(candidate) == len(typed):
            for i in range(len(candidate)):
                if candidate[i] == typed[i]:
                    pass
                else:
                    a = i
                    index_e.append(a)

        elif len(candidate) != len(typed):
            if len(candidate) > len(typed):
                edit_type = "deletion"
                for i in range(len(candidate)):
                    if i <= len(candidate) - 2:
                        if candidate[i] == typed[i]:
                                pass

                        else:
                            if i == 0:
                                x = "#"
                                y = candidate[i]
                                break

                            else:
                                x = candidate[i-1]
                                y = candidate[i]
                                break

                    elif i == len(candidate) - 1:
                        x = candidate[i-1]
                        y = candidate[i]
                        break

            elif len(candidate) < len(typed):
                edit_type = "insertion"
                for i in range(len(typed)):
                    if i <= len(typed) - 2:
                        if typed[i] == candidate[i]:
                                pass
                        else:
                            if i == 0:
                                x = "#"
                                y = typed[i]
                                break

                            else:
                                x = typed[i-1]
                                y = typed[i] 
                                break

                    elif i == len(typed) - 1:
                        x = typed[i-1]
                        y = typed[i]


        if len(index_e) == 1: # must be substitution
            edit_type = "substitution"
            x = typed[index_e[0]]
            y = candidate[index_e[0]]

        elif len(index_e) == 2: # must be transposition
            if (candidate[index_e[0]] == typed[index_e[1]]) & (candidate[index_e[1]] == typed[index_e[0]]):
                edit_type = "transposition"  
                x = candidate[index_e[0]]
                y = candidate[index_e[1]]

        return edit_type, x, y


    def noisy_prob(self, candidate = None, edit_type = None, x = None, y = None):

        numerator = None
        denominator = None

        num = None   # a dataframe for numerator
        den = None  # for denominator

        x_y = x + y

        if edit_type == "deletion":
            num = self.num_deletion
            den = self.den_del

            if num.loc[num["pattern"] == x_y, "count"].shape[0] > 0:
                numerator = num.loc[num["pattern"] == x_y, "count"].iloc[0]
            else:
                pass

            if x_y in den.keys():
                denominator = den[x_y]
            else: 
                pass

        elif edit_type == "insertion":
            num = self.num_insertion
            den = self.den_ins

            if num.loc[num["pattern"] == x_y, "count"].shape[0] > 0:
                numerator = num.loc[num["pattern"] == x_y, "count"].iloc[0]
            else:
                pass

            if x_y in den.keys():
                denominator = den[x]
            else: 
                pass

        elif edit_type == "substitution":
            num = self.num_substitution
            den = self.den_sub

            if num.loc[num["pattern"] == x_y, "count"].shape[0] > 0:
                numerator = num.loc[num["pattern"] == x_y, "count"].iloc[0]
            else:
                pass

            if x_y in den.keys():
                denominator = den[y]
            else: 
                pass

        elif edit_type == "transposition":
            num = self.num_transposition
            den = self.den_tra

            if num.loc[num["pattern"] == x_y, "count"].shape[0] > 0:
                numerator = num.loc[num["pattern"] == x_y, "count"].iloc[0]
            else:
                pass

            if x_y in den.keys():
                denominator = den[x_y]
            else: 
                pass

        # replace with zero in order to avoid a zero division error
        if (denominator == None) | (numerator == None):
            noisy_prob = 0
        else:
            noisy_prob = numerator / denominator 

        return noisy_prob


    def noisy_prob_compute(self, candidate_1, token):

        df_edit_1 = pd.DataFrame(data = list(zip(candidate_1, ["edit_1"] * len(candidate_1))), columns = ["candidates", "edit_distance"])
        df_edit_1["edit_type"] = df_edit_1["candidates"].apply(lambda row: self.noisy_pattern(row, token))

        df_edit_1["x"] = df_edit_1["edit_type"].apply(lambda row: row[1]) 
        df_edit_1["y"] = df_edit_1["edit_type"].apply(lambda row: row[2]) 
        df_edit_1["edit_type"] = df_edit_1["edit_type"].apply(lambda row: row[0]) 

        df_edit_1["noisy_prob"] = df_edit_1.apply(lambda row: self.noisy_prob(candidate = row["candidates"], 
                                                                            edit_type = row["edit_type"], 
                                                                            x = row["x"], y = row["y"]), axis = 1)
        return df_edit_1


    def lm_prob_compute(self, df_edit, token_i1, token_i2, index, token_length):
        lm = self.language_model()
        
        if (token_length==1):
            df_edit["prior_prob"] = df_edit["candidates"].apply(lambda row:lm.score(row))
        else:
            if (index == 0) & (self.n_gram == 2):
                df_edit["prior_prob"] = df_edit["candidates"].apply(lambda row: lm.score(row, "<s>".split()))
            elif (index == 0) & (self.n_gram == 3):
                df_edit["prior_prob"] = df_edit["candidates"].apply(lambda row: lm.score(row, "<s> <s>".split()))
            elif (index != 0) & (self.n_gram == 2):
                df_edit["prior_prob"] = df_edit["candidates"].apply(lambda row: lm.score(row, token_i1.split()))
            elif (index == 1) & (self.n_gram == 3):
                df_edit["prior_prob"] = df_edit["candidates"].apply(lambda row: lm.score(row, " ".join(map(str, ["<s>", token_i1])).split()))
            elif (index != 0) & (self.n_gram == 3):
                df_edit["prior_prob"] = df_edit["candidates"].apply(lambda row: lm.score(row, " ".join(map(str, [token_i2, token_i1])).split()))
            else:
                raise ValueError("Invalid")

        return df_edit


    def print_candidate_words(self, candidate_list = None):
        self.filtered_list = candidate_list
        for x in range(1, len(self.filtered_list) + 1, 1):
            self.lb.insert(x-1, self.filtered_list[x-1])
        
        return self.filtered_list


    def spelling_correction(self):
        """Check ``self.input`` and offer suggestions for the first fixable error.

        Steps:
            1. Tokenize the input. Tokens whose lower-cased form is a known
               name keep their casing; all others are lower-cased.
            2. Refuse inputs longer than 500 tokens (error dialog).
            3. Split run-together words with ``segmenting_token``.
            4. For each token in order: skip names; run the real-word check
               for dictionary words; treat everything else as a non-word
               error.
            5. Stop at the first error for which suggestions were shown. The
               flow continues from ``choose_correction`` once the user picks
               one.

        Side effects:
            Resets ``self.error``, highlights flagged words in the text box
            and fills the suggestion listbox. After the call ``self.error``
            holds only the word the suggestions belong to, if any.
        """
        print("Running spelling_correction")
        self.error = []

        tokens = [word if word.lower() in self.named_entity else word.lower()
                  for word in word_tokenize(self.input)]

        if len(tokens) > 500:
            # "import tkinter as tk" does not load the messagebox submodule,
            # so tk.messagebox is not guaranteed to exist; import it here.
            from tkinter import messagebox
            messagebox.showerror("Warning", "You have exceeded the 500 words limit! Please adjust your word counts.")
            return

        segmented = [self.segmenting_token(token = token) for token in tokens]
        # One single-item list per non-empty piece.
        tokens = [[word] for pieces in segmented for word in pieces if word != ""]
        print("Finished Tokenization")

        for i in range(len(tokens)):
            token = tokens[i][0]

            print("Checking the existence of token in name")
            # Step 1: check if it is a name
            if self.detect_name(token = token.lower()) is True:
                print("Token is found in NAME")
                continue

            # Step 1.1: check if it is a real word
            print("Checking dictionary")
            if self.detect_dictionary(token = token) is True:
                print("Word is found in dictionary, proceed with real word detection")
                suggestions_shown = self._check_real_word(tokens, i)
            # Step 1.2: non-word errors
            else:
                print("The word has non word error")
                suggestions_shown = self._check_non_word(tokens, i)

            if suggestions_shown:
                break

    def _context_tokens(self, tokens, i):
        """Return (token_i1, token_i2): the tokens before position ``i`` that the n-gram model needs."""
        if (i != 0) and (self.n_gram == 2):
            return tokens[i-1][0], None
        if (i == 1) and (self.n_gram == 3):
            return tokens[i-1][0], None
        if (i != 0) and (self.n_gram == 3):
            return tokens[i-1][0], tokens[i-2][0]
        # Other cases are handled inside lm_prob_compute.
        return None, None

    def _closest_edit_2(self, token, candidates_2, limit):
        """Return the ``limit`` edit-distance-2 candidates most similar to ``token`` by Jaro-Winkler score."""
        df_edit_2 = pd.DataFrame(data = list(zip(candidates_2, ["edit_2"] * len(candidates_2))), columns = ["candidates", "edit_distance"])
        df_edit_2["jaro_score"] = df_edit_2["candidates"].apply(lambda word: jaro.jaro_winkler_metric(token, word))
        df_edit_2 = df_edit_2.sort_values(by = "jaro_score", ascending = False)
        df_edit_2.reset_index(drop = True, inplace = True)
        return df_edit_2.iloc[0:limit]

    def _candidate_frame(self, token, candidate_1, candidate_2, con_set = ()):
        """Combine distance-1, topped-up distance-2 and confusion-set candidates into one frame."""
        frames = []
        if len(candidate_1) != 0:
            frames.append(self.noisy_prob_compute(candidate_1 = candidate_1, token = token))
        if len(candidate_1) < 5:
            # Top up to five candidates with the closest distance-2 words.
            frames.append(self._closest_edit_2(token, candidate_2, 5 - len(candidate_1)))
        if len(con_set) != 0:
            frames.append(pd.DataFrame(data = list(zip(con_set, ["con_set"] * len(con_set))), columns = ["candidates", "edit_distance"]))
        # At least one frame always exists: an empty candidate_1 forces the
        # distance-2 frame to be added.
        return pd.concat(frames, ignore_index = True)

    def _rank_candidates(self, df_edit, tokens, i):
        """Score the candidates in context and return them sorted best first."""
        token_i1, token_i2 = self._context_tokens(tokens, i)
        df_edit = self.lm_prob_compute(df_edit = df_edit, token_i1 = token_i1, token_i2 = token_i2, index = i, token_length = len(tokens))

        # Only distance-1 rows have a noisy-channel probability; all other
        # rows are ranked by the language model alone.
        df_edit["sum_prob"] = df_edit.apply(lambda row: (row["prior_prob"] * self.weight) + ((row["noisy_prob"]) * (1 - self.weight)) if row["edit_distance"] == "edit_1" else
                                            row["prior_prob"], axis = 1)
        return df_edit.sort_values(by = "sum_prob", ascending = False)

    def _flag_error(self, token):
        """Record ``token`` in ``self.error`` and highlight the recorded words in the text box."""
        self.error.append(token)
        for err in self.error:
            idx = self.txt.search(err, self.idx, nocase=1, stopindex=tk.END)
            if not idx:
                # Tk returns an empty string when nothing matches; there is
                # nothing to highlight then.
                continue
            lastidx = '%s+%dc' % (idx, len(err))
            self.txt.tag_add("highlight", idx, lastidx)
            self.txt.tag_config("highlight", background="yellow", foreground="red")

    def _offer_candidates(self, df_edit):
        """Show at most the five best candidates of a ranked frame in the listbox."""
        candidates = df_edit["candidates"].to_numpy().flatten().tolist()[:5]
        return self.print_candidate_words(candidate_list = candidates)

    def _check_real_word(self, tokens, i):
        """Run the real-word check on ``tokens[i]``; return True when suggestions were shown."""
        token = tokens[i][0]

        # generate candidates
        con_set = self.generation_confusion_sets(token = token)
        candidate_1, candidate_2 = self.generation_edit_distance(token = token)

        # Keep each word in exactly one group.
        con_set = [word for word in con_set if (word not in candidate_1) and (word not in candidate_2)]
        candidate_1 = [word for word in candidate_1 if (word not in candidate_2) and (word not in con_set)]
        candidate_2 = [word for word in candidate_2 if (word not in candidate_1) and (word not in con_set)]

        df_edit = self._rank_candidates(self._candidate_frame(token, candidate_1, candidate_2, con_set), tokens, i)

        total_prob = df_edit["sum_prob"].sum()
        df_edit["normalise_prob"] = df_edit["sum_prob"] / total_prob

        own_prob = df_edit.loc[df_edit["candidates"] == token, "normalise_prob"]
        if own_prob.shape[0] == 0:
            # The word itself was not among the scored candidates, so it
            # cannot be judged; leave it alone instead of failing.
            print("Token is correct!")
            return False

        if own_prob.iloc[0] > self.true_prob:
            print("Token is correct!")
            return False

        print("The word has real word error")
        self._flag_error(token)
        self._offer_candidates(df_edit)
        return True

    def _check_non_word(self, tokens, i):
        """Flag ``tokens[i]`` as a non-word; return True when suggestions were shown."""
        token = tokens[i][0]
        self._flag_error(token)

        candidates_1, candidates_2 = self.generation_edit_distance(token = token)
        df_edit = self._candidate_frame(token, candidates_1, candidates_2)

        if df_edit.shape[0] == 0:
            # Nothing to suggest: the word stays highlighted, but it must not
            # remain in self.error. choose_correction replaces self.error[0],
            # so a later suggestion would otherwise overwrite this word.
            self.error.pop()
            return False

        self._offer_candidates(self._rank_candidates(df_edit, tokens, i))
        return True


    def tuning(self, token_t, token_t_1, token_t_2, index, token_length = None):
        """Return up to five ranked correction candidates for ``token_t``.

        This follows the same candidate generation and ranking as the
        non-word branch of the checker, but returns the result instead of
        writing to the listbox.

        Args:
            token_t: the word to correct.
            token_t_1: the token immediately before it.
            token_t_2: the token two positions before it (trigram only).
            index: position of ``token_t`` in its sentence.
            token_length: number of tokens in the sentence, forwarded to
                ``lm_prob_compute``. The default ``None`` selects
                context-based scoring.

        Returns:
            list: at most five candidate words, best first. Empty when no
            candidate exists.
        """
        candidates_1, candidates_2 = self.generation_edit_distance(token = token_t)

        if len(candidates_1) == 0:
            df_edit = pd.DataFrame(data = list(zip(candidates_2, ["edit_2"] * len(candidates_2))), columns = ["candidates", "edit_distance"])
        elif len(candidates_1) >= 5:
            df_edit = self.noisy_prob_compute(candidate_1 = candidates_1, token = token_t)
        else:
            # Fewer than five distance-1 candidates: top up with the
            # distance-2 words most similar to the token by Jaro-Winkler score.
            df_edit_2 = pd.DataFrame(data = list(zip(candidates_2, ["edit_2"] * len(candidates_2))), columns = ["candidates", "edit_distance"])
            df_edit_2["jaro_score"] = df_edit_2["candidates"].apply(lambda row: jaro.jaro_winkler_metric(token_t, row))
            df_edit_2 = df_edit_2.sort_values(by = "jaro_score", ascending = False)
            df_edit_2.reset_index(drop = True, inplace = True)
            df_edit_1 = self.noisy_prob_compute(candidate_1 = candidates_1, token = token_t)
            df_edit_2 = df_edit_2.iloc[0:5 - df_edit_1.shape[0]]
            df_edit = pd.concat([df_edit_1, df_edit_2])

        # token_length is a required argument of lm_prob_compute; the call
        # used to omit it and failed with a TypeError.
        df_edit = self.lm_prob_compute(df_edit = df_edit, token_i1 = token_t_1, token_i2 = token_t_2, index = index, token_length = token_length)

        if df_edit.shape[0] == 0:
            return []

        # Only distance-1 rows have a noisy-channel probability.
        df_edit["sum_prob"] = df_edit.apply(lambda row: (row["prior_prob"] * self.weight) + ((row["noisy_prob"]) * (1 - self.weight)) if row["edit_distance"] == "edit_1" else
                                            row["prior_prob"], axis = 1)

        df_edit = df_edit.sort_values(by = "sum_prob", ascending = False)

        return df_edit["candidates"].to_numpy().flatten().tolist()[:5]
          
          
if __name__ == "__main__":
  # The constructor builds the window and runs the Tk main loop, so it only
  # returns once the window has been closed. Checks are started by the
  # "Check" button. Calling .spelling_correction() afterwards would run
  # against widgets that no longer exist.
  gui = spelling_correction(distance_type = 3, language_model_type = 3, ngram = 3, weight = 0.8, threshold = 1.e-02)

