# Setup

In [None]:
import pandas as pd
import numpy as np
import csv
import string

from sklearn.model_selection import train_test_split

!pip install transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Load our model

In [None]:
# Class index -> label name; the pipeline reports predictions with these names.
# NOTE(review): confirm that index 0 really means "correct" for this checkpoint.
id2label = {0: "correct", 1: "incorrect"}
# Built as the exact inverse of id2label so the two mappings cannot disagree.
label2id = {label: index for index, label in id2label.items()}

# Model Architecture
myTokenizer = AutoTokenizer.from_pretrained("mideind/IceBERT")
myModel = AutoModelForSequenceClassification.from_pretrained("refactored_model_v2_318k", num_labels=2, id2label=id2label, label2id=label2id)
# Text-classification pipeline used by the functions below to label sentences
classifier = pipeline("text-classification", model=myModel, tokenizer=myTokenizer)

# Load our Dataset

In [None]:
# Read the semicolon-separated table; header=None means columns are numbered 0..n
# NOTE(review): confirm the file really is Latin-1 encoded; a UTF-8 file read
# this way would garble every non-ASCII letter.
bin_data = pd.read_csv(
    "./original_datasets/Storasnid_beygm.csv",
    sep=";",
    header=None,
    encoding='latin-1',
    low_memory=False,
)
# Keep only the columns used further down by discarding the unused ones
case_data = bin_data.drop(columns=[0, 2, 5, 6, 7])
case_data

# Functions meant to be abstracted from the interface

In [None]:
def find_alternate_cases(myText):
    """Collect the alternative forms of every word in a sentence.

    The sentence is split on whitespace and each token is looked up in
    column 3 of the module-level ``case_data`` table. For every distinct
    column-1 value (treated as a word id) found on a matching row, all
    other distinct column-3 values stored under that id are gathered.

    Args:
        myText: The sentence to analyse.

    Returns:
        A list of ``[word, alternatives]`` pairs in sentence order, where
        ``alternatives`` is a NumPy array of the other forms (it may be
        empty). A token yields one pair per matching word id, so it can
        appear several times; tokens that are not in the table yield
        nothing.
    """
    forms = case_data[3]
    word_ids = case_data[1]

    # Each lookup scans the whole table, so a repeated token reuses the
    # result of its first lookup instead of scanning again.
    lookup_cache = {}
    words_to_test = []
    for word in myText.split():
        if word not in lookup_cache:
            alternatives_per_id = []
            for word_id in word_ids[forms == word].unique():
                candidates = forms[word_ids == word_id].unique()
                # The token itself is not an alternative to itself
                alternatives_per_id.append(candidates[candidates != word])
            lookup_cache[word] = alternatives_per_id
        for alternatives in lookup_cache[word]:
            words_to_test.append([word, alternatives])
    return words_to_test

def test_alternate_cases(test_sentence, words_to_test):
    """Look for single-word replacements that make a sentence classify as correct.

    The sentence is first run through the module-level ``classifier`` and the
    verdict is printed. If it is not labelled 'correct', every alternative
    form in ``words_to_test`` is substituted at every position where its
    word occurs, and each resulting sentence is classified in turn.

    Args:
        test_sentence: The sentence to check.
        words_to_test: Entries whose first item is a word and whose second
            item is an iterable of alternative forms for that word.

    Returns:
        ``False`` when the original sentence is already labelled 'correct';
        otherwise a (possibly empty) list of ``[sentence, score]`` pairs for
        the rewritten sentences labelled 'correct'.
    """
    prediction = classifier(test_sentence)
    print("I think this sentence is",prediction[0]['label'],"--> ",test_sentence)
    if prediction[0]['label'] == 'correct':
        return False

    # Create an array to hold our suggestions in
    suggestions = []
    word_array = test_sentence.split()
    # (position, form) pairs already classified; a word may be listed more
    # than once, and the same substitution must not be evaluated twice
    already_tried = set()

    for entry in words_to_test:
        word, alternatives = entry[0], entry[1]
        # A word can occur several times in the sentence; list.index() would
        # only ever find the first occurrence, so collect every position
        positions = [i for i, token in enumerate(word_array) if token == word]
        for position in positions:
            for case in alternatives:
                if (position, case) in already_tried:
                    continue
                already_tried.add((position, case))
                candidate_words = word_array.copy()
                candidate_words[position] = case
                # Test the sentence, and if it scores correct, log it
                candidate = " ".join(candidate_words)
                prediction = classifier(candidate)
                if prediction[0]['label'] == 'correct':
                    suggestions.append([candidate, prediction[0]['score']])
    return suggestions
            
def print_suggestions(suggestions):
    """Print the highest-scoring suggestion, if there is one.

    Args:
        suggestions: List of ``[sentence, score]`` pairs. As a side effect
            the list is sorted in place by ascending score.

    Nothing is printed when ``suggestions`` is empty or ``False``.
    """
    if not suggestions:
        # No candidate to show; indexing [-1] below would fail
        return
    suggestions.sort(key=lambda entry: entry[1])
    # After the ascending sort the best-scoring sentence is the last entry
    print("             Did you mean --> ", suggestions[-1][0])

# GEC Interface

In [None]:
# Open the csv file, and proofread the sentence in the first column of each row.
# newline='' leaves newline handling to the csv module, as its documentation asks.
with open('./original_datasets/ChatGPT_comparison.csv', newline='') as csv_file:
    file_to_be_proofread = csv.reader(csv_file)
    for line in file_to_be_proofread:
        if not line:
            # csv.reader returns [] for a blank line; there is no sentence to read
            continue
        my_text = line[0]
        words_to_test = find_alternate_cases(my_text)
        suggestions = test_alternate_cases(my_text,words_to_test)
        # Both False (sentence judged correct) and [] (nothing found) are skipped
        if suggestions:
            print_suggestions(suggestions)

# Validation

In [None]:
def _read_first_column(path):
    """Return ``[[first_field], ...]`` for each non-blank row after the header."""
    # newline='' leaves newline handling to the csv module
    with open(path, newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header
        # Blank lines come back as [] and have no first field to read
        return [[row[0]] for row in reader if row]


correct_sentences = _read_first_column('./generated_datasets/correct_sentence_data.csv')
incorrect_sentences = _read_first_column('./generated_datasets/incorrect_sentence_data.csv')

# The two files are paired by row position, so they must line up one-to-one
if len(correct_sentences) != len(incorrect_sentences):
    raise ValueError(
        "Sentence files differ in length: {} correct vs {} incorrect".format(
            len(correct_sentences), len(incorrect_sentences)
        )
    )

# Outcome tallies, one increment per sentence pair
correct_suggestions = 0    # top suggestion equals the reference sentence
incorrect_suggestions = 0  # top suggestion differs from the reference sentence
no_suggestions = 0         # no rewritten sentence was labelled 'correct'
returned_false = 0         # the incorrect sentence itself was labelled 'correct'

# [incorrect_sentence, suggested_sentence, correct_sentence] for every wrong suggestion
mismatches = []

for i, (correct_row, incorrect_row) in enumerate(zip(correct_sentences, incorrect_sentences)):
    if i % 10 == 0:
        print("We are at sentence: ",i)
    correct_sentence = correct_row[0]
    incorrect_sentence = incorrect_row[0]
    words_to_test = find_alternate_cases(incorrect_sentence)
    suggestions = test_alternate_cases(incorrect_sentence,words_to_test)
    if suggestions is False:
        # The classifier accepted the incorrect sentence, so nothing was tried
        returned_false += 1
    elif not suggestions:
        no_suggestions += 1
    else:
        # Ascending sort by score puts the best suggestion last
        suggestions.sort(key=lambda entry: entry[1])
        suggested_sentence = suggestions[-1][0]
        if suggested_sentence == correct_sentence:
            correct_suggestions += 1
        else:
            incorrect_suggestions += 1
            mismatches.append([incorrect_sentence,suggested_sentence,correct_sentence])

In [None]:
# Every tally is printed next to its share of all incorrect sentences
total = len(incorrect_sentences)

# "misclassified" counts the sentences for which test_alternate_cases returned False
tallies = (
    ("correct_suggestions :", correct_suggestions),
    ("incorrect_suggestions :", incorrect_suggestions),
    ("no_suggestions :", no_suggestions),
    ("misclassified :", returned_false),
)
for label, count in tallies:
    print(label, count, " : ", 1/total*count)

In [None]:
# Show each wrong suggestion as: incorrect input, suggested sentence, reference
rule = "----------------------------------"
for incorrect, suggested, corrected in mismatches:
    print(rule, incorrect, "-", suggested, "-", corrected, rule, sep="\n")