In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv("diabetic_data.csv",header='infer')

# Readmitted Classification

### Preprocessing : getting rid of terminal ill patient records / null values

In [4]:
df = df.drop(df[df["discharge_disposition_id"] == 11 ].index)
df = df.drop(df[df["discharge_disposition_id"] == 13 ].index)
df = df.drop(df[df["discharge_disposition_id"] == 14 ].index)
df = df.drop(df[df["discharge_disposition_id"] == 19 ].index)
df = df.drop(df[df["discharge_disposition_id"] == 20 ].index)
df = df.drop(df[df["discharge_disposition_id"] == 21 ].index)


#11,Expired
#13,Hospice / home
#14,Hospice / medical facility
#19,"Expired at home. Medicaid only, hospice."
#20,"Expired in a medical facility. Medicaid only, hospice."
#21,"Expired, place unknown. Medicaid only, hospice."



In [5]:
df.drop(['race','encounter_id','patient_nbr'],axis=1,inplace=True)

# diabetesMed recoding (Yes = 1, No = 0)

In [6]:
df.diabetesMed.value_counts()

Yes    76719
No     22624
Name: diabetesMed, dtype: int64

In [7]:
df

Unnamed: 0,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,...,No,No,No,No,No,No,No,No,No,NO
1,Female,[10-20),?,1,1,7,3,?,?,59,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,Female,[20-30),?,1,1,7,2,?,?,11,...,No,No,No,No,No,No,No,No,Yes,NO
3,Male,[30-40),?,1,1,7,2,?,?,44,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Male,[40-50),?,1,1,7,1,?,?,51,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,Male,[70-80),?,1,3,7,3,MC,?,51,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,Female,[80-90),?,1,4,5,5,MC,?,33,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,Male,[70-80),?,1,1,7,1,MC,?,53,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,Female,[80-90),?,2,3,7,10,MC,Surgery-General,45,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [8]:
df.loc[df.diabetesMed=='Yes','diabetesMed'] = 1
df.loc[df.diabetesMed=='No','diabetesMed'] = 0


In [9]:
df.diabetesMed.value_counts()

1    76719
0    22624
Name: diabetesMed, dtype: int64

# Scrapping ICD9 codes for diag1, diag2 and diag3

In [10]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests

In [11]:
url="https://en.wikipedia.org/wiki/List_of_ICD-9_codes_001%E2%80%93139:_infectious_and_parasitic_diseases"

In [12]:
page=requests.get(url)

html_soup=BeautifulSoup(page.content,'html.parser')

In [13]:
table = html_soup.find_all('table',{"class":"wikitable"})

In [14]:
df_list = pd.read_html(str(table),header=None)[0]

In [15]:
df_list.drop(labels="Chapter",inplace=True,axis=1)


# Using value groups instead of specific codes

In [16]:
pd.set_option('max_colwidth',500)
df_list

Unnamed: 0,Block,Title
0,001–139,Infectious and Parasitic Diseases
1,140–239,Neoplasms
2,240–279,"Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders"
3,280–289,Diseases of the Blood and Blood-forming Organs
4,290–319,Mental Disorders
5,320–389,Diseases of the Nervous System and Sense Organs
6,390–459,Diseases of the Circulatory System
7,460–519,Diseases of the Respiratory System
8,520–579,Diseases of the Digestive System
9,580–629,Diseases of the Genitourinary System


In [17]:
with open ('value_counts_diag1.txt',"w") as f:
    for i in df.diag_1.value_counts():
        f.write(str(i)+'\n')




In [18]:
def add_prefix (prefix,lst):
    newlst=[]
    for i in lst:
        i=prefix+str(i)
        newlst.append(i)
    return newlst



In [19]:
df["diag_2"]=df["diag_2"].apply(lambda x: x.split(".",1)[0])
df["diag_1"]=df["diag_1"].apply(lambda x: x.split(".",1)[0])
df["diag_3"]=df["diag_3"].apply(lambda x: x.split(".",1)[0])


In [20]:

Infectious_Parasitic_Diseases=[str(x) for x in range(1,140)]
Neoplasms=[str(x) for x in range(140,240)]
Endo_Nutri_Metabo_Immun=[str(x) for x in range(240,280)]
Blood_sick=[str(x) for x in range(280,290)]
Mental=[str(x) for x in range(290,320)]
Nervous_System=[str(x) for x in range(320,390)]
Circulatory_System=[str(x) for x in range(390,460)]
Respiratory_System=[str(x) for x in range(460,520)]
Digestive_System=[str(x) for x in range(520,580)]
Genitourinary_System=[str(x) for x in range(580,630)]
Pregnancy=[str(x) for x in range(630,680)]
Skin=[str(x) for x in range(680,710)]
Musculoskeletal_System=[str(x) for x in range(710,740)]
Congenital=[str(x) for x in range(740,760)]
Perinatal_Period=[str(x) for x in range(760,780)]
Ill_defined=[str(x) for x in range(780,800)]
Injury_Poisoning=[str(x) for x in range(800,1000)]
External_Inj_Pois=add_prefix('E',[str(x).zfill(2) for x in range(800,1000)])
External_work=add_prefix('V',[str(x).zfill(2) for x in range(1,92)])
#Morpho_Neo





In [21]:
#df.loc[df.diag_2.isin(Infectious_Parasitic_Diseases),["diag_2","diag_1","diag_3"]]=df_list["Title"][0]
df.loc[df.diag_2.isin(Infectious_Parasitic_Diseases),"diag_2"]=df_list["Title"][0]
df.loc[df.diag_2.isin(Neoplasms),"diag_2"]=df_list["Title"][1]
df.loc[df.diag_2.isin(Endo_Nutri_Metabo_Immun),"diag_2"]=df_list["Title"][2]
df.loc[df.diag_2.isin(Blood_sick),"diag_2"]=df_list["Title"][3]
df.loc[df.diag_2.isin(Mental),"diag_2"]=df_list["Title"][4]
df.loc[df.diag_2.isin(Nervous_System),"diag_2"]=df_list["Title"][5]

df.loc[df.diag_2.isin(Circulatory_System),"diag_2"]=df_list["Title"][6]
df.loc[df.diag_2.isin(Respiratory_System),"diag_2"]=df_list["Title"][7]
df.loc[df.diag_2.isin(Digestive_System),"diag_2"]=df_list["Title"][8]
df.loc[df.diag_2.isin(Genitourinary_System),"diag_2"]=df_list["Title"][9]
df.loc[df.diag_2.isin(Pregnancy),"diag_2"]=df_list["Title"][10]
df.loc[df.diag_2.isin(Skin),"diag_2"]=df_list["Title"][11]
df.loc[df.diag_2.isin(Musculoskeletal_System),"diag_2"]=df_list["Title"][12]
df.loc[df.diag_2.isin(Congenital),"diag_2"]=df_list["Title"][13]
df.loc[df.diag_2.isin(Perinatal_Period),"diag_2"]=df_list["Title"][14]
df.loc[df.diag_2.isin(Ill_defined),"diag_2"]=df_list["Title"][15]
df.loc[df.diag_2.isin(Injury_Poisoning),"diag_2"]=df_list["Title"][16]
df.loc[df.diag_2.isin(External_Inj_Pois),"diag_2"]=df_list["Title"][17]
df.loc[df.diag_2.isin(External_work),"diag_2"]=df_list["Title"][18]

In [22]:
df.diag_2.unique()

array(['?',
       'Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders',
       'Neoplasms', 'Diseases of the Circulatory System',
       'Diseases of the Respiratory System',
       'Diseases of the Blood and Blood-forming Organs',
       'Injury and Poisoning',
       'Diseases of the Musculoskeletal System and Connective Tissue',
       'Diseases of the Genitourinary System',
       'Infectious and Parasitic Diseases',
       'Diseases of the Digestive System',
       'Diseases of the Skin and Subcutaneous Tissue',
       'Supplementary Classification of Factors influencing Health Status and Contact with Health Services',
       'Diseases of the Nervous System and Sense Organs',
       'Symptoms, Signs and Ill-defined Conditions', 'Mental Disorders',
       'Complications of Pregnancy, Childbirth, and the Puerperium',
       'Congenital Anomalies',
       'Supplementary Classification of External Causes of Injury and Poisoning'],
      dtype=object)

In [23]:
df.loc[df.diag_1.isin(Infectious_Parasitic_Diseases),"diag_1"]=df_list["Title"][0]
df.loc[df.diag_1.isin(Neoplasms),"diag_1"]=df_list["Title"][1]
df.loc[df.diag_1.isin(Endo_Nutri_Metabo_Immun),"diag_1"]=df_list["Title"][2]
df.loc[df.diag_1.isin(Blood_sick),"diag_1"]=df_list["Title"][3]
df.loc[df.diag_1.isin(Mental),"diag_1"]=df_list["Title"][4]
df.loc[df.diag_1.isin(Nervous_System),"diag_1"]=df_list["Title"][5]

df.loc[df.diag_1.isin(Circulatory_System),"diag_1"]=df_list["Title"][6]
df.loc[df.diag_1.isin(Respiratory_System),"diag_1"]=df_list["Title"][7]
df.loc[df.diag_1.isin(Digestive_System),"diag_1"]=df_list["Title"][8]
df.loc[df.diag_1.isin(Genitourinary_System),"diag_1"]=df_list["Title"][9]
df.loc[df.diag_1.isin(Pregnancy),"diag_1"]=df_list["Title"][10]
df.loc[df.diag_1.isin(Skin),"diag_1"]=df_list["Title"][11]
df.loc[df.diag_1.isin(Musculoskeletal_System),"diag_1"]=df_list["Title"][12]
df.loc[df.diag_1.isin(Congenital),"diag_1"]=df_list["Title"][13]
df.loc[df.diag_1.isin(Perinatal_Period),"diag_1"]=df_list["Title"][14]
df.loc[df.diag_1.isin(Ill_defined),"diag_1"]=df_list["Title"][15]
df.loc[df.diag_1.isin(Injury_Poisoning),"diag_1"]=df_list["Title"][16]
df.loc[df.diag_1.isin(External_Inj_Pois),"diag_1"]=df_list["Title"][17]
df.loc[df.diag_1.isin(External_work),"diag_1"]=df_list["Title"][18]

In [24]:
df.diag_1.unique()

array(['Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders',
       'Complications of Pregnancy, Childbirth, and the Puerperium',
       'Infectious and Parasitic Diseases', 'Neoplasms',
       'Diseases of the Circulatory System',
       'Diseases of the Respiratory System', 'Injury and Poisoning',
       'Diseases of the Skin and Subcutaneous Tissue',
       'Diseases of the Musculoskeletal System and Connective Tissue',
       'Diseases of the Digestive System',
       'Supplementary Classification of Factors influencing Health Status and Contact with Health Services',
       'Symptoms, Signs and Ill-defined Conditions',
       'Diseases of the Genitourinary System', 'Mental Disorders',
       'Diseases of the Nervous System and Sense Organs',
       'Diseases of the Blood and Blood-forming Organs', '?',
       'Congenital Anomalies',
       'Supplementary Classification of External Causes of Injury and Poisoning'],
      dtype=object)

In [25]:
df.loc[df.diag_3.isin(Infectious_Parasitic_Diseases),"diag_3"]=df_list["Title"][0]
df.loc[df.diag_3.isin(Neoplasms),"diag_3"]=df_list["Title"][1]
df.loc[df.diag_3.isin(Endo_Nutri_Metabo_Immun),"diag_3"]=df_list["Title"][2]
df.loc[df.diag_3.isin(Blood_sick),"diag_3"]=df_list["Title"][3]
df.loc[df.diag_3.isin(Mental),"diag_3"]=df_list["Title"][4]
df.loc[df.diag_3.isin(Nervous_System),"diag_3"]=df_list["Title"][5]

df.loc[df.diag_3.isin(Circulatory_System),"diag_3"]=df_list["Title"][6]
df.loc[df.diag_3.isin(Respiratory_System),"diag_3"]=df_list["Title"][7]
df.loc[df.diag_3.isin(Digestive_System),"diag_3"]=df_list["Title"][8]
df.loc[df.diag_3.isin(Genitourinary_System),"diag_3"]=df_list["Title"][9]
df.loc[df.diag_3.isin(Pregnancy),"diag_3"]=df_list["Title"][10]
df.loc[df.diag_3.isin(Skin),"diag_3"]=df_list["Title"][11]
df.loc[df.diag_3.isin(Musculoskeletal_System),"diag_3"]=df_list["Title"][12]
df.loc[df.diag_3.isin(Congenital),"diag_3"]=df_list["Title"][13]
df.loc[df.diag_3.isin(Perinatal_Period),"diag_3"]=df_list["Title"][14]
df.loc[df.diag_3.isin(Ill_defined),"diag_3"]=df_list["Title"][15]
df.loc[df.diag_3.isin(Injury_Poisoning),"diag_3"]=df_list["Title"][16]
df.loc[df.diag_3.isin(External_Inj_Pois),"diag_3"]=df_list["Title"][17]
df.loc[df.diag_3.isin(External_work),"diag_3"]=df_list["Title"][18]

In [26]:
df.diag_3.unique()

array(['?',
       'Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders',
       'Supplementary Classification of Factors influencing Health Status and Contact with Health Services',
       'Diseases of the Circulatory System',
       'Infectious and Parasitic Diseases',
       'Diseases of the Respiratory System', 'Injury and Poisoning',
       'Neoplasms', 'Diseases of the Genitourinary System',
       'Diseases of the Musculoskeletal System and Connective Tissue',
       'Symptoms, Signs and Ill-defined Conditions',
       'Diseases of the Digestive System',
       'Diseases of the Skin and Subcutaneous Tissue', 'Mental Disorders',
       'Congenital Anomalies',
       'Supplementary Classification of External Causes of Injury and Poisoning',
       'Diseases of the Nervous System and Sense Organs',
       'Complications of Pregnancy, Childbirth, and the Puerperium',
       'Diseases of the Blood and Blood-forming Organs'], dtype=object)

# Recoding nominal features

In [30]:
#df=df.drop(['diag_1','diag_2','diag_3'],axis=1)

dummied_full=pd.get_dummies(df)
dummied_full

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_0,diabetesMed_1,readmitted_<30,readmitted_>30,readmitted_NO
0,6,25,1,1,41,0,1,0,0,0,...,0,1,0,0,1,1,0,0,0,1
1,1,1,7,3,59,0,18,0,0,0,...,0,1,0,1,0,0,1,0,1,0
2,1,1,7,2,11,5,13,2,0,1,...,0,1,0,0,1,0,1,0,0,1
3,1,1,7,2,44,1,16,0,0,0,...,0,1,0,1,0,0,1,0,0,1
4,1,1,7,1,51,0,8,0,0,0,...,0,1,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1,3,7,3,51,0,16,0,0,0,...,0,1,0,1,0,0,1,0,1,0
101762,1,4,5,5,33,3,18,0,0,1,...,0,1,0,0,1,0,1,0,0,1
101763,1,1,7,1,53,0,9,1,0,0,...,0,1,0,1,0,0,1,0,0,1
101764,2,3,7,10,45,2,21,0,0,1,...,0,1,0,1,0,0,1,0,0,1


# Writing all new features

In [34]:
with open ('new_features.txt',"w") as f:
    for i in dummied_full.columns:
        f.write(str(i)+'\n')


# Trying out models

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [36]:
target=df["readmitted"]

In [37]:
vars=dummied_full.drop(['readmitted_<30','readmitted_>30','readmitted_NO'],axis=1)

In [38]:
train,test_val,target_train,target_test_val=train_test_split(vars,target,test_size=0.33,random_state=42)

test,validation,target_test,target_validation=train_test_split(test_val,target_test_val,test_size=0.1,random_state=42)

In [39]:
#clf=svm.SVC()
#clf.fit(train,target_train)

In [40]:
#t=clf.predict(test)

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [42]:
clf = RandomForestClassifier(max_depth=100,max_features="sqrt",bootstrap=True,oob_score=False)

In [43]:
clf.fit(train,target_train)

RandomForestClassifier(max_depth=100, max_features='sqrt')

In [44]:
predictions=clf.predict(test)

In [45]:
from sklearn import metrics

In [46]:
print(metrics.accuracy_score(target_test,predictions))

0.5713268937468226


In [47]:
print(metrics.confusion_matrix(target_test,predictions))

[[   28  1364  2032]
 [   34  3907  6619]
 [   11  2588 12922]]


# No Bootstrap

In [48]:
clf2=RandomForestClassifier(max_depth=200,max_features="sqrt",oob_score=False)

In [49]:
clf2.fit(train,target_train)

RandomForestClassifier(max_depth=200, max_features='sqrt')

In [50]:
pred2=clf2.predict(test)

In [51]:
print(metrics.accuracy_score(target_test,pred2))

0.5720386375190646


In [52]:
print(metrics.confusion_matrix(target_test,pred2))

[[   33  1355  2036]
 [   28  3920  6612]
 [   11  2585 12925]]


# PCA

# Try cor matrix