### Detection

In [2]:
from detection import *

# Detection pipeline: label tokens, split into train/test, build features,
# train an SVM and evaluate it.
# NOTE(review): labelTesseract, div_train, compute_bigram, buildFeatures, SVC and pd
# are presumably all provided by `from detection import *` -- confirm.
words, label = labelTesseract()
train_data, test_data, train_label, test_label = div_train(words, label)
bigram_dict = compute_bigram()
# The same bigram dictionary is used to featurize both the train and the test split.
featureMatrix_train = buildFeatures(train_data, bigram_dict)
featureMatrix_test = buildFeatures(test_data, bigram_dict)

# Debug aid: the string literal below is disabled code; remove the triple quotes
# to print the first rows of the training feature matrix.
'''
head = featureMatrix_train.head()
print(head.to_string())
'''

# Build the classifier: an RBF-kernel SVM fitted on the training features/labels.
svm_class = SVC(kernel='rbf', verbose=True, gamma='scale')
svm_class.fit(featureMatrix_train, train_label)

# Predict a label for every test token.
prediction = svm_class.predict(featureMatrix_test)

# Pair each test token with its predicted label; a later cell selects the rows
# with label == 0 from this frame as the detected typos.
output = pd.DataFrame({'data': test_data,
                       'label': prediction})

print(output[:20])

##### Evaluation
# Confusion matrix and per-class precision/recall/F1 on the test split.
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(test_label, prediction))
print(classification_report(test_label, prediction))

[LibSVM]               data  label
0          proposal      1
1                1n      0
2           unclear      1
3              been      1
4          adequacy      1
5                 7      0
6               MCA      1
7               the      1
8               you      1
9   recommendatlons      0
10            thelr      1
11              the      1
12              Los      1
13            afflx      1
14    understandlng      0
15             that      1
16        posltlons      0
17              the      1
18         Drlnklng      0
19       1ndustrlal      0
[[12204  4508]
 [ 2622 24967]]
              precision    recall  f1-score   support

           0       0.82      0.73      0.77     16712
           1       0.85      0.90      0.88     27589

   micro avg       0.84      0.84      0.84     44301
   macro avg       0.84      0.82      0.82     44301
weighted avg       0.84      0.84      0.84     44301



In [72]:
# Tokens the classifier labelled 0 (treated as typos), re-indexed from 0.
typos = output.loc[output['label'] == 0, 'data'].reset_index(drop=True)

In [76]:
# Remove every token that is a plain integer (anything int() can parse);
# all other tokens are kept in their original order.
cleaned_typos = []
for typo in typos:
    try:
        int(typo)
    except (ValueError, TypeError):
        # Not parseable as an integer -> keep it. Only conversion failures are
        # caught, so KeyboardInterrupt and genuine bugs are no longer swallowed.
        cleaned_typos.append(typo)
cleaned_typos = pd.Series(cleaned_typos)

In [80]:
import re
import string

# Remove punctuation and digits from every token, then drop tokens that became empty.
# re.escape is required: string.punctuation contains ']', '\\', '^' and '-', which are
# special inside a character class (unescaped, the backslash was silently left in).
punctuation_pattern = "[{}]".format(re.escape(string.punctuation + '‘'))
# regex=True is passed explicitly: recent pandas versions default to literal matching,
# which would turn both replacements into no-ops.
cleaned_typos = cleaned_typos.str.replace(punctuation_pattern, '', regex=True)
cleaned_typos = cleaned_typos.str.replace(r'\d', '', regex=True)
cleaned_typos = cleaned_typos[cleaned_typos != '']

In [92]:
# Persist the cleaned typo list as 'cleaned_typos.csv' (relative path, so it is
# written to the current working directory).
cleaned_typos.to_csv('cleaned_typos.csv')

### Find Candidates

In [462]:
fname = 'CSW_corpus.txt'
with open(fname) as file:
    # One pass over the file: trim surrounding whitespace and lower-case every
    # line, then discard the first 2 rows.
    corpus = [line.strip().lower() for line in file][2:]

In [463]:
import numpy as np
import pandas as pd
from collections import Counter 
from nltk import edit_distance

def typo_classification(typo,correct):
    """Classify how `typo` differs from `correct`.

    Returns one of four labels:
      * 'insertion'     - `typo` is longer than `correct`
      * 'deletion'      - `typo` is shorter than `correct`
      * 'reversal'      - same length and exactly the same multiset of characters
                          (note: identical strings also fall in this bucket)
      * 'subsititution' - same length but a different multiset of characters
                          (the label spelling is what callers compare against)
    """
    length_gap = len(typo) - len(correct)
    if length_gap > 0:
        return 'insertion'
    if length_gap < 0:
        return 'deletion'

    # Equal length: tell the two remaining cases apart by character counts.
    same_letters = Counter(typo) == Counter(correct)
    return 'reversal' if same_letters else 'subsititution'

def find_candidates(typo,corpus):
    """Collect corpus words that are plausible corrections of `typo`.

    A word qualifies when its edit distance to `typo` is exactly 1, or when the
    distance is exactly 2 and `typo_classification` labels the pair 'reversal'.

    Parameters:
        typo:   the misspelled word.
        corpus: iterable of candidate words.

    Returns:
        (candidates, candi_type): two parallel lists - the accepted words and
        the `typo_classification` label of each one.
    """
    candidates = []
    candi_type = []
    typo_len = len(typo)
    for word in corpus:
        # Edit distance is never smaller than the length difference, and both
        # accepted cases need a difference of at most 1 (distance 1, or an
        # equal-length reversal), so skip the expensive distance computation.
        if abs(len(word) - typo_len) > 1:
            continue
        ed = edit_distance(typo,word)
        if ed != 1 and ed != 2:
            continue
        word_type = typo_classification(typo,word)
        if ed == 1 or word_type == 'reversal':
            candidates.append(word)
            candi_type.append(word_type)
    return candidates,candi_type

def find_position(typo,candidates):
    """Locate where `typo` differs from each candidate correction.

    For every candidate the pair is classified with `typo_classification` and
    the first differing position is recorded as one or more rows of the form
    ``[typo, candidate, old, new, index, type]``:

      * deletion:      old is "@", new is the character missing from the typo.
                       If that character repeats the one before it, the row is
                       emitted for both positions (index and index - 1).
      * insertion:     old is the extra character in the typo, new is "@".
                       If the extra character repeats the previous one (or the
                       previous two), a row is emitted for each position of the run.
      * subsititution: old/new are the first pair of differing characters.
      * reversal:      old is the first adjacent pair that appears swapped in
                       the candidate, new is that pair reversed.

    Look-behind comparisons are only made when the earlier index exists, so a
    difference at position 0 or 1 never wraps around to the end of the string
    and never produces a negative index. The `typo` argument is not modified.

    Parameters:
        typo:       the misspelled word.
        candidates: iterable of candidate corrections.

    Returns:
        list of rows as described above (empty when nothing matched).
    """
    position = []
    for corr in candidates:
        typo_type = typo_classification(typo,corr)

        if typo_type == 'deletion':
            for i, corr_char in enumerate(corr):
                # Past the end of the typo there is nothing to match against.
                typo_char = typo[i] if i < len(typo) else None
                if corr_char == typo_char:
                    continue
                position.append([typo,corr,"@",corr_char,i,typo_type])
                # Double letter: the missing one could be either of the pair.
                if i > 0 and corr_char == corr[i-1]:
                    position.append([typo,corr,"@",corr_char,i-1,typo_type])
                break

        elif typo_type == 'insertion':
            # The typo is longer, so typo[i] is valid for every i <= len(corr).
            for i in range(len(corr) + 1):
                corr_char = corr[i] if i < len(corr) else None
                extra = typo[i]
                if corr_char == extra:
                    continue
                position.append([typo,corr,extra,"@",i,typo_type])
                # Repeated letters: any member of the run may be the extra one.
                if i > 0 and extra == typo[i-1]:
                    position.append([typo,corr,extra,"@",i-1,typo_type])
                    if i > 1 and extra == typo[i-2]:
                        position.append([typo,corr,extra,"@",i-2,typo_type])
                break

        elif typo_type == 'subsititution':
            # Same length here, so zip covers every position.
            for i, (typo_char, corr_char) in enumerate(zip(typo, corr)):
                if typo_char != corr_char:
                    position.append([typo,corr,typo_char,corr_char,i,typo_type])
                    break

        elif typo_type == 'reversal':
            for i in range(len(corr) - 1):
                if typo[i] == corr[i+1] and typo[i+1] == corr[i]:
                    typo_comb = typo[i] + typo[i+1]
                    position.append([typo,corr,typo_comb,typo_comb[::-1],i,typo_type])
                    break
    return position

In [464]:
# Input: a lowercase typo (the corpus was lower-cased when it was loaded, so the
# typo must be lowercase to match).
typo = 'ambitios'

# Find the candidate corrections in the corpus, then locate the differing
# character(s) for each candidate.
candidates,cand_type = find_candidates(typo,corpus)
correction = find_position(typo,candidates)

In [461]:
# Tabulate the rows produced by find_position. The column names are passed to the
# constructor so that an empty result (no candidates found) still yields a frame
# with the expected columns instead of raising on a length mismatch.
correction = pd.DataFrame(correction,
                          columns=['Typo','Correction','old','new','index','type'])
correction

Unnamed: 0,Typo,Correction,old,new,index,type
0,ambitios,ambition,s,n,7,subsititution
1,ambitios,ambitions,@,n,7,deletion
2,ambitios,ambitious,@,u,7,deletion
