# CDM_SymSpell

In [1]:
# -*- coding: utf-8 -*-

"""

@ author: Taehyeong Kim
@ e-mail: taehyeong93@korea.ac.kr

"""

import utils.preprocessing
from symspellpy.symspellpy import SymSpell

import pandas as pd
from keras.preprocessing.text import text_to_word_sequence

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw sample export (the CSV is CP949-encoded).
raw_df = pd.read_csv("data/sample data.csv", encoding="CP949")

# Project preprocessing step: column name transform, missing value removal,
# lowercasing, organism check and hangul removal.
df = utils.preprocessing.Preprocessing().organ_result(raw_df)

def organ_extract(df):
    """Collect the text that follows a ':' or ';' separator in each row.

    For every position in ``df`` the first item of the row value
    (``df.iat[i][0]``) is inspected. If it contains ':' the stripped text
    between the first and second ':' is collected; the same is then done
    for ';'. A row containing both separators therefore contributes two
    entries, and a row containing neither contributes none.

    Parameters
    ----------
    df : pandas.Series
        Positionally indexable container whose values expose the text to
        parse at index 0.

    Returns
    -------
    list of str
        Extracted fragments in row order (may include empty strings).
    """
    extracted = []

    for row_idx in range(len(df)):
        text = df.iat[row_idx][0]
        # ':' is handled before ';' so the output order matches per row.
        for separator in (":", ";"):
            if separator in text:
                extracted.append(text.split(separator)[1].strip())

    return extracted

# Pull the organism names out of the preprocessed column, dropping empty results.
organ_list = [name for name in organ_extract(df["organ_result"]) if name]

# Distinct organism names in sorted order, kept for manual inspection.
organ_check = sorted(set(organ_list))
# Optional export of the distinct names for review:
# pd.Series(organ_check).to_csv("organ_check.csv", index = False, header=False, encoding = "CP949")

print("Organism count:" + str(len(organ_list)))

# Tokens that are excluded when the word and sentence corpora are built.
stop_words = ["ss", "spp", "ssp", "mrsa", "mssa", "group"]
print("Stop words:" + str(stop_words))

--- Preprocessing ---
Organism count:27544
Stop words:['ss', 'spp', 'ssp', 'mrsa', 'mssa', 'group']


### 1. Extraction of data using regular expressions

In [3]:
def _content_tokens(text):
    """Tokenize one organism name and keep only the informative tokens.

    A token is kept when it is not listed in ``stop_words`` and is longer
    than three characters.
    """
    return [
        token
        for token in text_to_word_sequence(text)  # Tokenizer
        if token not in stop_words and len(token) > 3
    ]


# Tokenize every organism name exactly once; both views below are derived
# from this single pass instead of repeating the tokenize/filter logic.
organ_tokens = [_content_tokens(name) for name in organ_list]

# Word: flat list of every kept token (duplicates preserved)
organ_word = [token for tokens in organ_tokens for token in tokens]

# Sentence: one space-joined string per organism name, aligned with organ_list
organ_sentence = [" ".join(tokens) for tokens in organ_tokens]

In [4]:
# Vocabulary: the distinct kept tokens in sorted order.
organ_corpus = sorted(set(organ_word))
len(organ_corpus)

320

### 2. Misspelling Detection

In [5]:
# Reference word list: first column of the detection dictionary file.
dict_corpus = list(pd.read_csv("data/misspelling_detection.txt", header=None)[0])

# Words that must not be reported as misspellings: every dictionary entry
# plus a few manually excluded tokens (presumably vetted false positives --
# TODO confirm). Using a set difference instead of list.remove() means a
# manually excluded token that does not occur in the data no longer raises
# ValueError, and avoids the repeated linear scans of the list.
known_words = set(dict_corpus)
known_words.update(["oneday", "nonenterococci", "nonenterococcus"])

# Candidate misspellings: distinct corpus tokens that are not known words.
misspell = sorted(set(organ_word) - known_words)

# Manually added candidate; appended after sorting, so it stays last.
misspell.append("perosis")
misspell

['adecarboxylate',
 'chromogens',
 'ferentum',
 'flavbacterium',
 'koneensis',
 'ochrobacterium',
 'orytihabitans',
 'papatyphi',
 'parpinfluenzae',
 'pseudodiphthericum',
 'shingobacterium',
 'sstreptococcus',
 'stacherbrandfii',
 'stapylococcus',
 'urealyticm',
 'perosis']

### 3. SymSpell

In [6]:
acronym_corpus_path = "data/Dorlands Dictionary of Medical Acronyms & Abbreviations.txt"
frequency_dict_path = "data/frequency_dictionary_en_82_765.txt"

sym_spell = SymSpell(max_dictionary_edit_distance=3)

# symspellpy reports a missing file by returning False rather than raising.
# Fail loudly here, because with an empty dictionary every lookup below
# would silently return meaningless suggestions.
if not sym_spell.create_dictionary(acronym_corpus_path, encoding="UTF-8"):
    raise FileNotFoundError("Corpus file not found: " + acronym_corpus_path)

# Frequency dictionary layout: term in column 0, count in column 1.
if not sym_spell.load_dictionary(frequency_dict_path, 0, 1):
    raise FileNotFoundError("Dictionary file not found: " + frequency_dict_path)

# Print "term, edit distance, count" for each suggestion returned for each
# detected misspelling.
for mis in misspell:
    suggestions = sym_spell.lookup_compound(mis, max_edit_distance=1)
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance, suggestion.count))

ad carboxylate, 1, 0
chromogen, 1, 23952
fere tum, 1, 0
fla bacterium, 1, 0
one ennis, 3, 902
och bacterium, 2, 1
orytihabitans, 0, 0
papa phi, 2, 33
par influenzae, 1, 0
pseudodiphthericum, 0, 0
thing bacterium, 2, 63
streptococcus, 1, 832696
stacherbrandfii, 0, 0
staphylococcus, 1, 797144
urea lytic, 2, 0
persis, 1, 69405


In [7]:
%%time
# Run compound correction over every distinct corpus word and collect the
# suggested terms in corpus order.
organ_rev = [
    suggestion.term
    for word in organ_corpus
    for suggestion in sym_spell.lookup_compound(word, max_edit_distance=3)
]

# Number of distinct terms remaining after correction.
len(set(organ_rev))

Wall time: 11 s


313