In [38]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

import numpy as np

from scipy.stats import uniform, randint

from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [39]:
# Load the raw encounter table; column names come from the file's header row.
df = pd.read_csv("diabetic_data.csv", header="infer")

# Keep a single row per patient (the first one found in the file) and discard
# payer_code and weight, which are removed because of their high share of
# missing values.
df = df.drop_duplicates(subset=["patient_nbr"]).drop(columns=["payer_code", "weight"])

# Collapse the outcome to two classes: readmitted within 30 days vs. everything else.
df.loc[df["readmitted"] == "<30", "readmitted"] = "readmitted"
df.loc[df["readmitted"].isin([">30", "NO"]), "readmitted"] = "Otherwise"

# Remove the records of terminally ill patients, identified here by these
# discharge_disposition_id values.
excluded_dispositions = [11, 13, 14, 19, 20, 21]
is_excluded = df["discharge_disposition_id"].isin(excluded_dispositions)
df = df.drop(index=df.index[is_excluded])

# Identifier-type columns and the race column are not kept as features.
df = df.drop(columns=["race", "encounter_id", "patient_nbr"])

# Binary-encode diabetesMed: Yes -> 1, No -> 0.
# NOTE(review): the values are written into the existing column, so its dtype is
# presumably still object afterwards and a later pd.get_dummies(df) may split it
# again into two indicator columns - confirm this is intended.
for answer, flag in (("Yes", 1), ("No", 0)):
    df.loc[df["diabetesMed"] == answer, "diabetesMed"] = flag

# Grouping ICD9 codes by categories/intervals

In [40]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests

In [41]:
from io import StringIO  # imported here so this cell runs on its own

# Wikipedia page whose first "wikitable" is read below as the ICD-9 chapter
# list (one row per chapter, with a code Block and a Title).
url="https://en.wikipedia.org/wiki/List_of_ICD-9_codes_001%E2%80%93139:_infectious_and_parasitic_diseases"

# Use a timeout so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page as if it were data.
page = requests.get(url, timeout=30)
page.raise_for_status()

html_soup = BeautifulSoup(page.content, 'html.parser')
table = html_soup.find_all('table', {"class": "wikitable"})

# read_html is given a file-like object: passing literal HTML as a plain string
# is deprecated in recent pandas releases.
df_list = pd.read_html(StringIO(str(table)), header=None)[0]
df_list = df_list.drop(columns="Chapter")

# NOTE(review): the diagnosis-grouping cell looks up Title rows 0 through 18 of
# this table, so it relies on the page still listing at least 19 chapters.
pd.set_option('display.max_colwidth', 500)
df_list

Unnamed: 0,Block,Title
0,001–139,Infectious and Parasitic Diseases
1,140–239,Neoplasms
2,240–279,"Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders"
3,280–289,Diseases of the Blood and Blood-forming Organs
4,290–319,Mental Disorders
5,320–389,Diseases of the Nervous System and Sense Organs
6,390–459,Diseases of the Circulatory System
7,460–519,Diseases of the Respiratory System
8,520–579,Diseases of the Digestive System
9,580–629,Diseases of the Genitourinary System


In [42]:
def add_prefix(prefix, lst):
    """Return a new list with ``prefix`` prepended to the string form of each item.

    Parameters
    ----------
    prefix : str
        Text placed in front of every item.
    lst : iterable
        Items to convert with ``str()`` and prefix; it is not modified.

    Returns
    -------
    list of str
        One prefixed string per input item, in the original order.
    """
    return [prefix + str(item) for item in lst]

# Diagnosis columns, listed in the order in which they are processed.
diag_columns = ["diag_2", "diag_1", "diag_3"]

# Keep only the part of each code before the first "." (e.g. "123.45" -> "123").
# The .str accessor leaves missing values untouched instead of raising on them.
for diag_col in diag_columns:
    df[diag_col] = df[diag_col].str.split(".", n=1).str[0]


# ICD-9 code lists, one per chapter. Numeric codes are written without zero
# padding; E codes cover E800-E999 and V codes cover V01-V91 (zero-padded to
# two digits).
Infectious_Parasitic_Diseases=[str(x) for x in range(1,140)]
Neoplasms=[str(x) for x in range(140,240)]
Endo_Nutri_Metabo_Immun=[str(x) for x in range(240,280)]
Blood_sick=[str(x) for x in range(280,290)]
Mental=[str(x) for x in range(290,320)]
Nervous_System=[str(x) for x in range(320,390)]
Circulatory_System=[str(x) for x in range(390,460)]
Respiratory_System=[str(x) for x in range(460,520)]
Digestive_System=[str(x) for x in range(520,580)]
Genitourinary_System=[str(x) for x in range(580,630)]
Pregnancy=[str(x) for x in range(630,680)]
Skin=[str(x) for x in range(680,710)]
Musculoskeletal_System=[str(x) for x in range(710,740)]
Congenital=[str(x) for x in range(740,760)]
Perinatal_Period=[str(x) for x in range(760,780)]
Ill_defined=[str(x) for x in range(780,800)]
Injury_Poisoning=[str(x) for x in range(800,1000)]
External_Inj_Pois=add_prefix('E',range(800,1000))
External_work=add_prefix('V',[str(x).zfill(2) for x in range(1,92)])

# Position k in this list is paired with the Title in row k of df_list.
chapter_code_lists = [
    Infectious_Parasitic_Diseases,
    Neoplasms,
    Endo_Nutri_Metabo_Immun,
    Blood_sick,
    Mental,
    Nervous_System,
    Circulatory_System,
    Respiratory_System,
    Digestive_System,
    Genitourinary_System,
    Pregnancy,
    Skin,
    Musculoskeletal_System,
    Congenital,
    Perinatal_Period,
    Ill_defined,
    Injury_Poisoning,
    External_Inj_Pois,
    External_work,
]

# Replace every code that belongs to a chapter with that chapter's title.
# Values matching no list are left as they are.
chapter_titles = df_list["Title"]
for diag_col in diag_columns:
    for row_label, codes in enumerate(chapter_code_lists):
        df.loc[df[diag_col].isin(codes), diag_col] = chapter_titles[row_label]

# One-hot encoding features

In [43]:
# Two encoding passes: the first call uses pandas' default column selection,
# the second explicitly expands the three id columns into indicator columns.
id_feature_columns = ["admission_type_id", "discharge_disposition_id", "admission_source_id"]
dummied_full = pd.get_dummies(pd.get_dummies(df), columns=id_feature_columns)

# Drop these two indicator columns from the encoded frame.
dummied_full = dummied_full.drop(columns=["examide_No", "citoglipton_No"])

# Rename columns whose names contain ">", "<" or "[": ">N" becomes "OverN",
# "<N" becomes "LessN" and "[a-b)" becomes "(a-b)".
# NOTE(review): presumably needed because a downstream model rejects these
# characters in feature names - confirm.
column_renames = {
    'weight_>200': 'weight_Over200',
    'max_glu_serum_>200': 'max_glu_serum_Over200',
    "max_glu_serum_>300": "max_glu_serum_Over300",
    "A1Cresult_>7": "A1Cresult_Over7",
    "A1Cresult_>8": "A1Cresult_Over8",
    "readmitted_<30": "readmitted_Less30",
    "readmitted_>30": "readmitted_Over30",
}
# Age brackets are ten wide: [0-10) ... [90-100).
column_renames.update(
    (f"age_[{low}-{low + 10})", f"age_({low}-{low + 10})")
    for low in range(0, 100, 10)
)
# Weight brackets are twenty-five wide: [0-25) ... [175-200).
column_renames.update(
    (f"weight_[{low}-{low + 25})", f"weight_({low}-{low + 25})")
    for low in range(0, 200, 25)
)

# rename() skips mapping keys that are not present among the columns.
dummied_full = dummied_full.rename(columns=column_renames)


In [44]:
# Replace the two readmitted_* indicator columns with a single "Target" column
# holding the original readmitted label (aligned on the row index).
dummied_full = dummied_full.drop(columns=["readmitted_Otherwise", "readmitted_readmitted"])
dummied_full["Target"] = df["readmitted"]

dummied_full.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,gender_Female,gender_Male,...,admission_source_id_9,admission_source_id_10,admission_source_id_11,admission_source_id_13,admission_source_id_14,admission_source_id_17,admission_source_id_20,admission_source_id_22,admission_source_id_25,Target
0,1,41,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,Otherwise
1,3,59,0,18,0,0,0,9,1,0,...,0,0,0,0,0,0,0,0,0,Otherwise
2,2,11,5,13,2,0,1,6,1,0,...,0,0,0,0,0,0,0,0,0,Otherwise
3,2,44,1,16,0,0,0,7,0,1,...,0,0,0,0,0,0,0,0,0,Otherwise
4,1,51,0,8,0,0,0,5,0,1,...,0,0,0,0,0,0,0,0,0,Otherwise


In [45]:
# Persist the encoded frame; the row index is not written to the file.
dummied_full.to_csv(path_or_buf="Preprocessed06_12.csv", index=False)