In [1]:
# Import Dependencies
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
%matplotlib inline

In [2]:
# Read Raw Dataset
df = pd.read_excel('raw_data.xlsx')

In [3]:
df

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall
...,...,...,...
1861,,,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,,,UMLS:C0242453_prostatism
1863,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1864,,,UMLS:C0871754_frail


In [4]:
df.head

<bound method NDFrame.head of                                  Disease  Count of Disease Occurrence  \
0     UMLS:C0020538_hypertensive disease                       3363.0   
1                                    NaN                          NaN   
2                                    NaN                          NaN   
3                                    NaN                          NaN   
4                                    NaN                          NaN   
...                                  ...                          ...   
1861                                 NaN                          NaN   
1862                                 NaN                          NaN   
1863       UMLS:C0011127_decubitus ulcer                         42.0   
1864                                 NaN                          NaN   
1865                                 NaN                          NaN   

                                              Symptom  
0                            UMLS:C00

In [5]:
df.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


In [6]:
df.tail()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
1861,,,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,,,UMLS:C0242453_prostatism
1863,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1864,,,UMLS:C0871754_frail
1865,,,UMLS:C0015967_fever


In [7]:
# Fill all NaN with the values above
data = df.fillna(method='ffill')

In [8]:
df.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


In [9]:
df.to_csv('sanjay.csv')

In [10]:
df.to_csv('cleaned_data.csv')  # Save dataframe as data.csv in the current directory

In [11]:
df.to_csv(r'C:\Users\chara\Desktop\Mach\cleaned_data.csv')  # Save dataframe as cleandata.csv in the specified directory

In [12]:
df.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


In [13]:
# Process Disease and Symptom Names
def process_data(data):
    data_list = []
    data_name = data.replace('^','_').split('_')
    n = 1
    for names in data_name:
        if (n % 2 == 0):
            data_list.append(names)
        n += 1
    return data_list

In [14]:
# Data Cleanup
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, row in data.iterrows():
    
    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_data(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        # Placeholder block
        pass  # replace this with your actual code for processing symptoms



In [15]:
# See that the data is Processed Correctly
disease_symptom_dict

defaultdict(list, {})

In [16]:
# Count of Disease Occurence w.r.t each Disease
disease_symptom_count

{}

In [17]:
# See that the data is Processed Correctly
disease_symptom_dict
# Count of Disease Occurence w.r.t each Disease
disease_symptom_count
# Save cleaned data as CSV
f = open('cleaned_data.csv', 'w')

with f:
    writer = csv.writer(f)
    for key, val in disease_symptom_dict.items():
        for i in range(len(val)):
            writer.writerow([key, val[i], disease_symptom_count[key]])

In [18]:
# Read Cleaned Data as DF
df = pd.read_csv('sanjay.csv')

# Assign new column names
df.columns = ['Index', 'Disease', 'Count of Disease Occurrence', 'Symptom']

# Display the first few rows
df.head()


Unnamed: 0,Index,Disease,Count of Disease Occurrence,Symptom
0,0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,1,,,UMLS:C0392680_shortness of breath
2,2,,,UMLS:C0012833_dizziness
3,3,,,UMLS:C0004093_asthenia
4,4,,,UMLS:C0085639_fall


In [19]:
# Remove any rows with empty values
df.replace(float('nan'), np.nan, inplace=True)
df.dropna(inplace=True)

In [20]:
from sklearn import preprocessing

In [21]:
n_unique = len(df['Symptom'].unique())
n_unique

79

In [22]:
df.dtypes

Index                            int64
Disease                         object
Count of Disease Occurrence    float64
Symptom                         object
dtype: object

In [23]:
# Encode the Labels
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['Symptom'])
print(integer_encoded)

[ 4 27 61  4  7 60 13 34  4 25 16 16 35 60 16 60 29 25 60 49 25 16 32 25
 16 30 16 67 60 29 60 33 67  2 60 62  4 61 25 12  2 16 72 21 60 25  7 13
 57 52 67 50 33 41 37  0 18 36 25 43 23 50  0 28  0 25 70 16 37 75 37  7
  2 60 26 68 15 55 58 16 76 50 64 48 78 42 51 56 60 10  9  3 60 47 38 40
 42 27 11 44 19 67  0 42 63 24 67 65 20 60  6 71 16  5 46 31 14 77 74  8
 56 69 39 73  0  1 22 33 53 66 17 54 59 45]


In [24]:
# One Hot Encode the Labels
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]




In [25]:
onehot_encoded[0]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [26]:
cols = np.asarray(df['Symptom'].unique())
cols

array(['UMLS:C0008031_pain chest', 'UMLS:C0032617_polyuria',
       'UMLS:C0424000_feeling suicidal', 'UMLS:C0010200_cough',
       'UMLS:C0392680_shortness of breath', 'UMLS:C0013362_dysarthria',
       'UMLS:C0043144_wheezing', 'UMLS:C0030193_pain',
       'UMLS:C0015967_fever', 'UMLS:C0085593_chill',
       'UMLS:C0036572_seizure', 'UMLS:C0233481_worry',
       'UMLS:C0041834_erythema', 'UMLS:C0038999_swelling',
       'UMLS:C0577559_mass of body structure', 'UMLS:C0042963_vomiting',
       'UMLS:C0003962_ascites', 'UMLS:C0438696_suicidal',
       'UMLS:C0013144_drowsiness^UMLS:C0234450_sleepy',
       'UMLS:C0856054_mental status changes',
       'UMLS:C0020461_hyperkalemia', 'UMLS:C0242453_prostatism',
       'UMLS:C0234518_speech slurred',
       'UMLS:C0233762_hallucinations auditory',
       'UMLS:C0221166_paraparesis', 'UMLS:C0085619_orthopnea',
       'UMLS:C0000737_pain abdominal', 'UMLS:C0018965_hematuria',
       'UMLS:C0085606_urgency of\xa0micturition',
       'UMLS:C023

In [27]:
# Create a new dataframe to save OHE labels
df_ohe = pd.DataFrame(columns = cols)
df_ohe.head()


Unnamed: 0,UMLS:C0008031_pain chest,UMLS:C0032617_polyuria,UMLS:C0424000_feeling suicidal,UMLS:C0010200_cough,UMLS:C0392680_shortness of breath,UMLS:C0013362_dysarthria,UMLS:C0043144_wheezing,UMLS:C0030193_pain,UMLS:C0015967_fever,UMLS:C0085593_chill,...,UMLS:C0086439_hypokinesia,UMLS:C0859032_moan,UMLS:C0002416_ambidexterity,UMLS:C0020639_hypoproteinemia,UMLS:C0238705_left atrial hypertrophy,UMLS:C0549483_abscess bacterial,UMLS:C0016204_flatulence,UMLS:C0240233_loose associations,UMLS:C0277794_extreme exhaustion,UMLS:C0232257_systolic murmur


In [28]:
for i in range(len(onehot_encoded)):
    df_ohe.loc[i] = onehot_encoded[i]

In [29]:
df_ohe.head()

Unnamed: 0,UMLS:C0008031_pain chest,UMLS:C0032617_polyuria,UMLS:C0424000_feeling suicidal,UMLS:C0010200_cough,UMLS:C0392680_shortness of breath,UMLS:C0013362_dysarthria,UMLS:C0043144_wheezing,UMLS:C0030193_pain,UMLS:C0015967_fever,UMLS:C0085593_chill,...,UMLS:C0086439_hypokinesia,UMLS:C0859032_moan,UMLS:C0002416_ambidexterity,UMLS:C0020639_hypoproteinemia,UMLS:C0238705_left atrial hypertrophy,UMLS:C0549483_abscess bacterial,UMLS:C0016204_flatulence,UMLS:C0240233_loose associations,UMLS:C0277794_extreme exhaustion,UMLS:C0232257_systolic murmur
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
len(df_ohe)

134

In [31]:
# Disease Dataframe
df_disease = df['Disease']
df_disease.head()

0                    UMLS:C0020538_hypertensive disease
12                               UMLS:C0011847_diabetes
26    UMLS:C0011570_depression mental^UMLS:C0011581_...
47    UMLS:C0010054_coronary arteriosclerosis^UMLS:C...
56                              UMLS:C0032285_pneumonia
Name: Disease, dtype: object

In [32]:
# Concatenate OHE Labels with the Disease Column
df_concat = pd.concat([df_disease,df_ohe], axis=1)
df_concat.head()

Unnamed: 0,Disease,UMLS:C0008031_pain chest,UMLS:C0032617_polyuria,UMLS:C0424000_feeling suicidal,UMLS:C0010200_cough,UMLS:C0392680_shortness of breath,UMLS:C0013362_dysarthria,UMLS:C0043144_wheezing,UMLS:C0030193_pain,UMLS:C0015967_fever,...,UMLS:C0086439_hypokinesia,UMLS:C0859032_moan,UMLS:C0002416_ambidexterity,UMLS:C0020639_hypoproteinemia,UMLS:C0238705_left atrial hypertrophy,UMLS:C0549483_abscess bacterial,UMLS:C0016204_flatulence,UMLS:C0240233_loose associations,UMLS:C0277794_extreme exhaustion,UMLS:C0232257_systolic murmur
0,UMLS:C0020538_hypertensive disease,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Concatenate OHE Labels with the Disease Column
df_concat = pd.concat([df_disease,df_ohe], axis=1)
df_concat.head()
df_concat.drop_duplicates(keep='first',inplace=True)

In [34]:
df_concat.head()

Unnamed: 0,Disease,UMLS:C0008031_pain chest,UMLS:C0032617_polyuria,UMLS:C0424000_feeling suicidal,UMLS:C0010200_cough,UMLS:C0392680_shortness of breath,UMLS:C0013362_dysarthria,UMLS:C0043144_wheezing,UMLS:C0030193_pain,UMLS:C0015967_fever,...,UMLS:C0086439_hypokinesia,UMLS:C0859032_moan,UMLS:C0002416_ambidexterity,UMLS:C0020639_hypoproteinemia,UMLS:C0238705_left atrial hypertrophy,UMLS:C0549483_abscess bacterial,UMLS:C0016204_flatulence,UMLS:C0240233_loose associations,UMLS:C0277794_extreme exhaustion,UMLS:C0232257_systolic murmur
0,UMLS:C0020538_hypertensive disease,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
len(df_concat)

205

In [36]:
cols = df_concat.columns
cols

Index(['Disease', 'UMLS:C0008031_pain chest', 'UMLS:C0032617_polyuria',
       'UMLS:C0424000_feeling suicidal', 'UMLS:C0010200_cough',
       'UMLS:C0392680_shortness of breath', 'UMLS:C0013362_dysarthria',
       'UMLS:C0043144_wheezing', 'UMLS:C0030193_pain', 'UMLS:C0015967_fever',
       'UMLS:C0085593_chill', 'UMLS:C0036572_seizure', 'UMLS:C0233481_worry',
       'UMLS:C0041834_erythema', 'UMLS:C0038999_swelling',
       'UMLS:C0577559_mass of body structure', 'UMLS:C0042963_vomiting',
       'UMLS:C0003962_ascites', 'UMLS:C0438696_suicidal',
       'UMLS:C0013144_drowsiness^UMLS:C0234450_sleepy',
       'UMLS:C0856054_mental status changes', 'UMLS:C0020461_hyperkalemia',
       'UMLS:C0242453_prostatism', 'UMLS:C0234518_speech slurred',
       'UMLS:C0233762_hallucinations auditory', 'UMLS:C0221166_paraparesis',
       'UMLS:C0085619_orthopnea', 'UMLS:C0000737_pain abdominal',
       'UMLS:C0018965_hematuria', 'UMLS:C0085606_urgency of micturition',
       'UMLS:C0231807_dyspnea 

In [37]:
cols = cols[1:]

In [38]:
# Since, every disease has multiple symptoms, combine all symptoms per disease per row
df_concat = df_concat.groupby('Disease').sum()
df_concat = df_concat.reset_index()
df_concat[:5]

Unnamed: 0,Disease,UMLS:C0008031_pain chest,UMLS:C0032617_polyuria,UMLS:C0424000_feeling suicidal,UMLS:C0010200_cough,UMLS:C0392680_shortness of breath,UMLS:C0013362_dysarthria,UMLS:C0043144_wheezing,UMLS:C0030193_pain,UMLS:C0015967_fever,...,UMLS:C0086439_hypokinesia,UMLS:C0859032_moan,UMLS:C0002416_ambidexterity,UMLS:C0020639_hypoproteinemia,UMLS:C0238705_left atrial hypertrophy,UMLS:C0549483_abscess bacterial,UMLS:C0016204_flatulence,UMLS:C0240233_loose associations,UMLS:C0277794_extreme exhaustion,UMLS:C0232257_systolic murmur
0,UMLS:C0001175_acquired immuno-deficiency syndr...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,UMLS:C0001418_adenocarcinoma,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,UMLS:C0001511_adhesion,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,UMLS:C0001973_chronic alcoholic intoxication,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,UMLS:C0002395_Alzheimer's disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
len(df_concat)

134

In [40]:
df_concat.to_csv("training_dataset.csv", index=False)

In [41]:
# One Hot Encoded Features
X = df_concat[cols]

# Labels
y = df_concat['Disease']

## Model Training

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [43]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [44]:
len(X_train), len(y_train)

(107, 107)

In [45]:
len(X_test), len(y_test)

(27, 27)

In [46]:
dt = DecisionTreeClassifier()
clf_dt=dt.fit(X, y)

In [47]:
clf_dt.score(X, y)

0.08955223880597014

In [48]:
export_graphviz(dt, 
                out_file='./tree.dot', 
                feature_names=cols)

In [49]:
!pip install graphviz

