# CDM_SUSC

In [1]:
# -*- coding: utf-8 -*-

"""

@ author: Taehyeong Kim, Fusion Data Analytics and Artificial Intelligence Lab

"""

import utils.preprocessing
from gensim.models import FastText
from pyxdameraulevenshtein import damerau_levenshtein_distance

import collections
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import text_to_word_sequence

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Data read
df = pd.read_csv("data/sample data.csv", encoding = "CP949")
# Column Name Transform, Missing Value Remove, Lowercase, Organism Check, remove hangul
df = utils.preprocessing.Preprocessing().organ_result(df)

def organ_extract(df):
    
    temp=[]
    
    for _ in range(len(df)):
        
        if ':' in df.iat[_][0]:
            temp.append(df.iat[_][0].split(":")[1].strip())
        if ';' in df.iat[_][0]:
            temp.append(df.iat[_][0].split(";")[1].strip())
            
    return temp

organ_list = organ_extract(df["organ_result"])
organ_list = [x for x in organ_list if x]

# organ_check = sorted(list(set(organ_list)))
# pd.Series(organ_check).to_csv("Experiments/organ_check.csv", index = False, header=False, encoding = "CP949")

print("Organism count:" + str(len(organ_list)))

stop_words=["ss","spp","ssp","mrsa","mssa","group"]
print("Stop words:" + str(stop_words))

--- Preprocessing ---
Organism count:27544
Stop words:['ss', 'spp', 'ssp', 'mrsa', 'mssa', 'group']


### 1. Extraction of data using regular expression

In [3]:
# Word
organ_word=[]
for _ in range(len(organ_list)):
    
    temp_words = text_to_word_sequence(organ_list[_]) #Tokenizer
    
    for _ in temp_words:
        if _ not in stop_words:
            if len(_)>3:
                organ_word.append(_)
                
# Sentence
organ_sentence=[]
for _ in range(len(organ_list)):
    
    temp_token=[]
    temp_words = text_to_word_sequence(organ_list[_]) #Tokenizer
    
    for _ in temp_words:
        if _ not in stop_words:
            if len(_)>3:
                temp_token.append(_)
                
    organ_sentence.append(temp_token)

for _ in range(len(organ_sentence)):
    organ_sentence[_] =  " ".join(organ_sentence[_])

In [4]:
organ_corpus = sorted(list(set(organ_word)))
len(organ_corpus)

320

### 2. Misspelling Detection

In [5]:
dict_corpus=list(pd.read_csv("data/misspelling_detection.txt", header=None)[0])
misspell=sorted(list(set(organ_word)))
for _ in dict_corpus:
    if _ in misspell: 
        misspell.remove(_)

misspell.remove('oneday')
misspell.remove("nonenterococci")
misspell.remove("nonenterococcus")
misspell.append("perosis")

misspell

['adecarboxylate',
 'chromogens',
 'ferentum',
 'flavbacterium',
 'koneensis',
 'ochrobacterium',
 'orytihabitans',
 'papatyphi',
 'parpinfluenzae',
 'pseudodiphthericum',
 'shingobacterium',
 'sstreptococcus',
 'stacherbrandfii',
 'stapylococcus',
 'urealyticm',
 'perosis']

In [6]:
vocab_m={}
for _ in misspell:
    vocab_m[_]=organ_word.count(_)
len(vocab_m)

16

In [7]:
vocab_c=collections.Counter(organ_word)
for _ in misspell:
    del vocab_c[_]
len(vocab_c)

304

### 4. BioWordVec

In [8]:
# # Token

# organ_token=[]

# for _ in range(len(organ_list)):
    
#     temp_token=[]
#     temp_words = text_to_word_sequence(organ_list[_]) #Tokenizer
    
#     for _ in temp_words:
#         if _ not in stop_words:
#             if len(_)>3:
#                 temp_token.append(_)
                
#     organ_token.append(temp_token)
               
# organ_token

In [9]:
# %%time
# local_model = FastText(size=200, sg=1, min_count=1)
# local_model.build_vocab(sentences=organ_token)
# local_model.train(sentences=organ_token, total_examples=len(organ_token), epochs=200)
# print("epochs: " + str(local_model.epochs))

In [10]:
%%time
bio_model = FastText.load_fasttext_format('./pretrained/BioWordVec_PubMed_MIMICIII_d200.bin')
print("epochs: " + str(bio_model.epochs))

epochs: 5
Wall time: 24min 29s


In [11]:
# %%time
# bio_model.build_vocab(sentences=organ_token, update=True)
# bio_model.train(organ_token, total_examples=len(organ_token), epochs=100)

### 5. Unsupervised Dictionary

In [12]:
%%time

similar_n=30
fasttext_min_similarity = 0.80
edit_distance_threshold = 3

nonalphabetic = re.compile(r'[^a-zA-Z]')

def include_spell_mistake(word, similar_word, score):
    
    return (score > fasttext_min_similarity 
            and damerau_levenshtein_distance(word, similar_word) <= edit_distance_threshold
            and len(similar_word) > 3
            and word[0] == similar_word[0]
            and nonalphabetic.search(similar_word) is None)

restrict_vocab_size = np.arange(100000,1000000,100000)
rank=1

word_to_mistakes = collections.defaultdict(list)
for word, freq in vocab_m.items():
        
    if len(word) <= 3 or nonalphabetic.search(word) is not None:
        continue
    
    for i in range(len(restrict_vocab_size)):
        similar_pre = bio_model.wv.most_similar(word, topn=similar_n,
                                                restrict_vocab=restrict_vocab_size[i])
    
        for similar_p in similar_pre:
            if include_spell_mistake(word, similar_p[0], similar_p[1]) and len(word_to_mistakes[word])<rank:
                word_to_mistakes[word].append(similar_p[0])
            
len(word_to_mistakes)

Wall time: 1min 9s


13

In [13]:
word_to_mistakes

defaultdict(list,
            {'adecarboxylate': ['adecarboxylata'],
             'chromogens': ['chromogen'],
             'flavbacterium': ['flavobacterium'],
             'koneensis': ['koreensis'],
             'ochrobacterium': ['ochrobactrum'],
             'orytihabitans': ['oryzihabitans'],
             'papatyphi': ['paratyphi'],
             'parpinfluenzae': ['parainfluenzae'],
             'pseudodiphthericum': ['pseudodiphtheriticum'],
             'shingobacterium': ['sphingobacterium'],
             'sstreptococcus': ['streptococcus'],
             'stapylococcus': ['staphylococcus'],
             'urealyticm': ['urealyticum']})

### 6. Similarity Score

#### index

In [14]:
# BioWordVec

temp=[]
for word, mistakes in word_to_mistakes.items():
    for mistake in mistakes:
        
        p_score = bio_model.wv.similarity(word, mistake)
        
        if mistake != word:
            temp.append([word, mistake, p_score])

df=pd.DataFrame(temp, columns=["Misspell","Correct","score"])
df=df.sort_values(["Misspell","score"], ascending=True)
df

Unnamed: 0,Misspell,Correct,score
0,adecarboxylate,adecarboxylata,0.904413
1,chromogens,chromogen,0.936553
2,flavbacterium,flavobacterium,0.831663
3,koneensis,koreensis,0.873674
4,ochrobacterium,ochrobactrum,0.928374
5,orytihabitans,oryzihabitans,0.900719
6,papatyphi,paratyphi,0.890885
7,parpinfluenzae,parainfluenzae,0.860117
8,pseudodiphthericum,pseudodiphtheriticum,0.929961
9,shingobacterium,sphingobacterium,0.928626


In [15]:
word = 'choromogenes'
mistake = 'chromogens'
bio_model.wv.similarity(word, mistake)

0.7128164

In [16]:
word = 'fermentum'
mistake = 'ferentum'
bio_model.wv.similarity(word, mistake)

0.47425902

In [17]:
word = 'peroris'
mistake = 'perosis'
bio_model.wv.similarity(word, mistake)

0.42142314

In [18]:
word = 'stackebrandtii'
mistake = 'stacherbrandfii'
bio_model.wv.similarity(word, mistake)

0.59232664