# Preprocessing of diabetic_data.csv


In [2]:
import IPython
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import StandardScaler

In [3]:
ORIGINAL_DF = pd.read_csv("diabetic_data.csv",index_col=0, na_values='?', low_memory=False)
df = ORIGINAL_DF.copy(deep=True)

### Examining Dataset

In [7]:
str(df)

'              patient_nbr             race  gender       age weight  \\\nencounter_id                                                          \n2278392           8222157        Caucasian  Female    [0-10)    NaN   \n149190           55629189        Caucasian  Female   [10-20)    NaN   \n64410            86047875  AfricanAmerican  Female   [20-30)    NaN   \n500364           82442376        Caucasian    Male   [30-40)    NaN   \n16680            42519267        Caucasian    Male   [40-50)    NaN   \n35754            82637451        Caucasian    Male   [50-60)    NaN   \n55842            84259809        Caucasian    Male   [60-70)    NaN   \n63768           114882984        Caucasian    Male   [70-80)    NaN   \n12522            48330783        Caucasian  Female   [80-90)    NaN   \n15738            63555939        Caucasian  Female  [90-100)    NaN   \n28236            89869032  AfricanAmerican  Female   [40-50)    NaN   \n36900            77391171  AfricanAmerican    Male   [60-70)  

In [4]:
print('Dataset Readmitted distro: {}'.format(Counter(df["readmitted"])))
missingvalues = pd.DataFrame(df.isnull().sum(axis=0), columns = ['number_of_missing_values'])
missingvalues['feature'] = missingvalues.index
missingvalues = missingvalues[['feature','number_of_missing_values']].reset_index (drop = True)
missingvalues[missingvalues.number_of_missing_values > 0]

Dataset Readmitted distro: Counter({'NO': 54864, '>30': 35545, '<30': 11357})


Unnamed: 0,feature,number_of_missing_values
1,race,2273
4,weight,98569
9,payer_code,40256
10,medical_specialty,49949
17,diag_1,21
18,diag_2,358
19,diag_3,1423


In [5]:
# DECISION 1: We will classify between earlier readmissions and the rest
df['readmitted'] = df['readmitted'].apply(lambda x: 0 if x == "<30" else 1)

In [6]:
# DECISION 2: We will drop variables with a massive number of NAs
# DECISION 2.1: We will conserve a weight variable which indicates whether weight was measured or not, as it might be interesting for study
df['weight'] = df['weight'].apply(lambda x: 0 if pd.isna(x) else 1)
df = df.drop(['payer_code', 'medical_specialty'], axis = 1)

In [7]:
categ = []
numer = []
for col in df.columns:
    if df[col].dtype == object and col != "patient_nbr":
        categ.append(col)
    else:
        numer.append(col)
print("Categorical: ", categ)
print("Numerical: ", numer)

Categorical:  ['race', 'gender', 'age', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']
Numerical:  ['patient_nbr', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'readmitted']


In [8]:
categNoDiag = list(filter(lambda x: x != 'diag_1' and x != 'diag_2' and x != 'diag_3', categ))
for name in categNoDiag:
    print("Dataset "+ name + " distro: \n{}\n".format(Counter(df[name])))

Dataset race distro: 
Counter({'Caucasian': 76099, 'AfricanAmerican': 19210, nan: 2273, 'Hispanic': 2037, 'Other': 1506, 'Asian': 641})

Dataset gender distro: 
Counter({'Female': 54708, 'Male': 47055, 'Unknown/Invalid': 3})

Dataset age distro: 
Counter({'[70-80)': 26068, '[60-70)': 22483, '[50-60)': 17256, '[80-90)': 17197, '[40-50)': 9685, '[30-40)': 3775, '[90-100)': 2793, '[20-30)': 1657, '[10-20)': 691, '[0-10)': 161})

Dataset max_glu_serum distro: 
Counter({'None': 96420, 'Norm': 2597, '>200': 1485, '>300': 1264})

Dataset A1Cresult distro: 
Counter({'None': 84748, '>8': 8216, 'Norm': 4990, '>7': 3812})

Dataset metformin distro: 
Counter({'No': 81778, 'Steady': 18346, 'Up': 1067, 'Down': 575})

Dataset repaglinide distro: 
Counter({'No': 100227, 'Steady': 1384, 'Up': 110, 'Down': 45})

Dataset nateglinide distro: 
Counter({'No': 101063, 'Steady': 668, 'Up': 24, 'Down': 11})

Dataset chlorpropamide distro: 
Counter({'No': 101680, 'Steady': 79, 'Up': 6, 'Down': 1})

Dataset glim

In [9]:
#Decision 3: Drop citoglipton and examide. (constants)
df = df.drop(['citoglipton', 'examide'], axis = 1)

In [10]:
#Decision 4: Input NA in unknown-gender records
df["gender"]=df["gender"].replace("Unknown/Invalid", np.NaN)

In [11]:
#Decision 5: reduction of admission_type_id, discharge_disposition_id, admission_source_id 
#As seen in study, some of these values SHOULD be grouped as they pretty much mean the same. (Urgent care, emergency and similar)
#Decision 5.1: discharge_disposition_id = 11 is death during hospitalisation. Not even interesting for tests
print("Previously:")
for name in ['admission_source_id', 'discharge_disposition_id', 'admission_type_id']:
    print("Dataset "+ name + " distro: \n{}".format(Counter(df[name])))
valid = list(set(df[df['discharge_disposition_id'] != 11].index))
df = df[df['discharge_disposition_id'] != 11]
    
df['admission_type_id'] = df['admission_type_id'].replace(2,1)
df['admission_type_id'] = df['admission_type_id'].replace(7,1)
df['admission_type_id'] = df['admission_type_id'].replace(6,5)
df['admission_type_id'] = df['admission_type_id'].replace(8,5)

df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(6,1)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(8,1)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(9,1)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(13,1)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(3,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(4,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(5,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(14,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(22,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(23,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(24,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(12,10)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(15,10)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(16,10)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(17,10)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(25,18)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(26,18)

df['admission_source_id'] = df['admission_source_id'].replace(2,1)
df['admission_source_id'] = df['admission_source_id'].replace(3,1)
df['admission_source_id'] = df['admission_source_id'].replace(5,4)
df['admission_source_id'] = df['admission_source_id'].replace(6,4)
df['admission_source_id'] = df['admission_source_id'].replace(10,4)
df['admission_source_id'] = df['admission_source_id'].replace(22,4)
df['admission_source_id'] = df['admission_source_id'].replace(25,4)
df['admission_source_id'] = df['admission_source_id'].replace(15,9)
df['admission_source_id'] = df['admission_source_id'].replace(17,9)
df['admission_source_id'] = df['admission_source_id'].replace(20,9)
df['admission_source_id'] = df['admission_source_id'].replace(21,9)
df['admission_source_id'] = df['admission_source_id'].replace(13,11)
df['admission_source_id'] = df['admission_source_id'].replace(14,11)
print("\n Now:")
for name in ['admission_source_id', 'discharge_disposition_id', 'admission_type_id']:
    print("Dataset "+ name + " distro: \n{}".format(Counter(df[name])))

Previously:
Dataset admission_source_id distro: 
Counter({7: 57494, 1: 29565, 17: 6781, 4: 3187, 6: 2264, 2: 1104, 5: 855, 3: 187, 20: 161, 9: 125, 8: 16, 22: 12, 10: 8, 14: 2, 11: 2, 25: 2, 13: 1})
Dataset discharge_disposition_id distro: 
Counter({1: 60234, 3: 13954, 6: 12902, 18: 3691, 2: 2128, 22: 1993, 11: 1642, 5: 1184, 25: 989, 4: 815, 7: 623, 23: 412, 13: 399, 14: 372, 28: 139, 8: 108, 15: 63, 24: 48, 9: 21, 17: 14, 16: 11, 19: 8, 10: 6, 27: 5, 12: 3, 20: 2})
Dataset admission_type_id distro: 
Counter({1: 53990, 3: 18869, 2: 18480, 6: 5291, 5: 4785, 8: 320, 7: 21, 4: 10})

 Now:
Dataset admission_source_id distro: 
Counter({7: 56370, 1: 30592, 9: 6930, 4: 6212, 8: 15, 11: 5})
Dataset discharge_disposition_id distro: 
Counter({1: 73664, 2: 20906, 18: 4680, 7: 623, 28: 139, 10: 97, 19: 8, 27: 5, 20: 2})
Dataset admission_type_id distro: 
Counter({1: 71136, 3: 18740, 5: 10238, 4: 10})


In [12]:
# Decision 6: One-hot-encode A1Cresult and max_glu_serum into Normal and Abnormal. No-test = neither.
print("Previously:")
for name in ["A1Cresult", "max_glu_serum"]:
    print("Dataset "+ name + " distro: \n{}".format(Counter(df[name])))
    
df["A1C_Abnormal"] = df["A1Cresult"].apply(lambda x: 0 if x in ["Norm", "None"] else 1)
df["A1C_Normal"] = df["A1Cresult"].apply(lambda x: 1 if x == "Norm" else 0)

df["MAXGLU_Abnormal"] = df["max_glu_serum"].apply(lambda x: 0 if x in ["Norm", "None"] else 1)
df["MAXGLU_Normal"] = df["max_glu_serum"].apply(lambda x: 1 if x == "Norm" else 0)

print("\nNow:")
for name in ["A1C_Abnormal", "A1C_Normal", "MAXGLU_Abnormal", "MAXGLU_Normal"]:
    print("Dataset "+ name + " distro: \n{}".format(Counter(df[name])))

Previously:
Dataset A1Cresult distro: 
Counter({'None': 83247, '>8': 8151, 'Norm': 4942, '>7': 3784})
Dataset max_glu_serum distro: 
Counter({'None': 94899, 'Norm': 2574, '>200': 1440, '>300': 1211})

Now:
Dataset A1C_Abnormal distro: 
Counter({0: 88189, 1: 11935})
Dataset A1C_Normal distro: 
Counter({0: 95182, 1: 4942})
Dataset MAXGLU_Abnormal distro: 
Counter({0: 97473, 1: 2651})
Dataset MAXGLU_Normal distro: 
Counter({0: 97550, 1: 2574})


In [13]:
# + Dropping of original:
df = df.drop(["max_glu_serum","A1Cresult"], axis=1)

In [14]:
# Decision 7: Age to numerical, ORDERED. (Same as inputting mean of interval)
df['age'] = df['age'].replace('[0-10)', 0)
df['age'] = df['age'].replace('[10-20)', 1)
df['age'] = df['age'].replace('[20-30)', 2)
df['age'] = df['age'].replace('[30-40)', 3)
df['age'] = df['age'].replace('[40-50)', 4)
df['age'] = df['age'].replace('[50-60)', 5)
df['age'] = df['age'].replace('[60-70)', 6)
df['age'] = df['age'].replace('[70-80)', 7)
df['age'] = df['age'].replace('[80-90)', 8)
df['age'] = df['age'].replace('[90-100)', 9)

In [15]:
df.head().T

encounter_id,2278392,149190,64410,500364,16680
patient_nbr,8222157,55629189,86047875,82442376,42519267
race,Caucasian,Caucasian,AfricanAmerican,Caucasian,Caucasian
gender,Female,Female,Female,Male,Male
age,0,1,2,3,4
weight,0,0,0,0,0
admission_type_id,5,1,1,1,1
discharge_disposition_id,18,1,1,1,1
admission_source_id,1,7,7,7,7
time_in_hospital,1,3,2,2,1
num_lab_procedures,41,59,11,44,51


In [16]:
# Decision 8: One-hot encoding of gender:
df = df[df['gender'].notna()] # 3 cases, irrelevant to drop
df["Female"] = df['gender'].apply(lambda x: 1 if x == "Female" else 0)

In [17]:
#drop gender
df = df.drop(["gender"], axis=1)

In [18]:
df.head().T

encounter_id,2278392,149190,64410,500364,16680
patient_nbr,8222157,55629189,86047875,82442376,42519267
race,Caucasian,Caucasian,AfricanAmerican,Caucasian,Caucasian
age,0,1,2,3,4
weight,0,0,0,0,0
admission_type_id,5,1,1,1,1
discharge_disposition_id,18,1,1,1,1
admission_source_id,1,7,7,7,7
time_in_hospital,1,3,2,2,1
num_lab_procedures,41,59,11,44,51
num_procedures,0,0,5,1,0


In [19]:
# Decision 9: Meds will be re-mapped to 0 if he does not take it, 1 otherwise (independently of dosage change)   
meds = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide', 'metformin-pioglitazone','metformin-rosiglitazone', 'glimepiride-pioglitazone', 'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide']
for med in meds:
    df[med] = df[med].apply(lambda x: 0 if x =="No" else 1)

In [20]:
# Decision 10: We transform diagnosis_1 into a higher hierarchy (less classes) and one-hot encode it. Diag 2 and 3 worsen the results, so we drop them
df = df[df['diag_1'].notna()] # 21 cases, irrelevant to drop
df['temp_diag1'] = df['diag_1']
df.loc[df['diag_1'].str.contains('V', na=True), ['temp_diag1']] = -9 # there are no NAs, just a quick hack. Using neg numbers
df.loc[df['diag_1'].str.contains('E', na=True), ['temp_diag1']] = -9 # to avoid collision with rest when transforming

df['temp_diag1'] = df['temp_diag1'].astype(float)

df["temp_diag1"] = df["temp_diag1"].apply(lambda x: -1 if (x >= 390 and x < 460) or (np.floor(x) == 785) else x)
df["temp_diag1"] = df["temp_diag1"].apply(lambda x: -2 if (x >= 460 and x < 520) or (np.floor(x) == 786) else x)
df["temp_diag1"] = df["temp_diag1"].apply(lambda x: -3 if (x >= 520 and x < 580) or (np.floor(x) == 787) else x)
df["temp_diag1"] = df["temp_diag1"].apply(lambda x: -4 if (np.floor(x) == 250) else x)
df["temp_diag1"] = df["temp_diag1"].apply(lambda x: -5 if (x >= 800 and x < 1000) else x)
df["temp_diag1"] = df["temp_diag1"].apply(lambda x: -6 if (x >= 710 and x < 740) else x)
df["temp_diag1"] = df["temp_diag1"].apply(lambda x: -7 if (x >= 580 and x < 630) or (np.floor(x) == 788) else x)
df["temp_diag1"] = df["temp_diag1"].apply(lambda x: -8 if (x >= 140 and x < 240) else x)
df["temp_diag1"] = df["temp_diag1"].apply(lambda x: -9 if (x >= 0) else x) #default

dummiesDiag1 = pd.get_dummies(-df['temp_diag1'], drop_first = False)
for d in list(dummiesDiag1):
    df["Diag_"+str(int(d))] = dummiesDiag1[d]
df = df.drop(["temp_diag1"], axis = 1)

In [21]:
# Drop diagnosis:
df = df.drop(["diag_1","diag_2", "diag_3"], axis=1)

In [22]:
df.head().T

encounter_id,2278392,149190,64410,500364,16680
patient_nbr,8222157,55629189,86047875,82442376,42519267
race,Caucasian,Caucasian,AfricanAmerican,Caucasian,Caucasian
age,0,1,2,3,4
weight,0,0,0,0,0
admission_type_id,5,1,1,1,1
discharge_disposition_id,18,1,1,1,1
admission_source_id,1,7,7,7,7
time_in_hospital,1,3,2,2,1
num_lab_procedures,41,59,11,44,51
num_procedures,0,0,5,1,0


In [23]:
# Decision 11: Encode rest of categoricals (no ids yet)
df["change"] = df["change"].apply(lambda x: 0 if x== "No" else 1)
df["diabetesMed"] = df["diabetesMed"].apply(lambda x: 0 if x== "No" else 1)

In [24]:
# Decision 12: One hot encode Ids. admission_type_id, discharge_disposition_id, admission_source_id
dummiesADM_S = pd.get_dummies(df['admission_source_id'], drop_first = False)
for d in list(dummiesADM_S):
    df["ADM_S_"+str(d)] = dummiesADM_S[d]
dummiesADM_T = pd.get_dummies(df['admission_type_id'], drop_first = False)
for d in list(dummiesADM_T):
    df["ADM_T_"+str(d)] = dummiesADM_T[d]
dummiesDIS_D = pd.get_dummies(df['discharge_disposition_id'], drop_first = False)
for d in list(dummiesDIS_D):
    df["DIS_D_"+str(d)] = dummiesDIS_D[d]

In [25]:
df = df.drop(["admission_type_id","discharge_disposition_id", "admission_source_id"], axis=1)

In [26]:
df.head().T

encounter_id,2278392,149190,64410,500364,16680
patient_nbr,8222157,55629189,86047875,82442376,42519267
race,Caucasian,Caucasian,AfricanAmerican,Caucasian,Caucasian
age,0,1,2,3,4
weight,0,0,0,0,0
time_in_hospital,1,3,2,2,1
num_lab_procedures,41,59,11,44,51
num_procedures,0,0,5,1,0
num_medications,1,18,13,16,8
number_outpatient,0,0,2,0,0
number_emergency,0,0,0,0,0


In [27]:
print(df.shape)
missingvalues = pd.DataFrame(df.isnull().sum(axis=0), columns = ['number_of_missing_values'])
missingvalues['feature'] = missingvalues.index
missingvalues = missingvalues[['feature','number_of_missing_values']].reset_index (drop = True)
missingvalues[missingvalues.number_of_missing_values > 0]

(100100, 69)


Unnamed: 0,feature,number_of_missing_values
1,race,2235


In [28]:
# DECISION 13: Drop all NAs from race
df = df[df['race'].notna()] 
dummiesRace = pd.get_dummies(df['race'], drop_first = False)

for race in list(dummiesRace):
    df["race_"+race] = dummiesRace[race]

In [29]:
df =df.drop("race", axis = 1)

In [30]:
#Drop patient_nbr, unuseful
df = df.drop("patient_nbr", axis=1)
df = df.drop("metformin-rosiglitazone", axis=1)

In [31]:
# One last look at data:
#df.head().T
pd.set_option('display.max_columns', 500)
df.describe(include="all")

Unnamed: 0,age,weight,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,A1C_Abnormal,A1C_Normal,MAXGLU_Abnormal,MAXGLU_Normal,Female,Diag_1,Diag_2,Diag_3,Diag_4,Diag_5,Diag_6,Diag_7,Diag_8,Diag_9,ADM_S_1,ADM_S_4,ADM_S_7,ADM_S_8,ADM_S_9,ADM_S_11,ADM_T_1,ADM_T_3,ADM_T_4,ADM_T_5,DIS_D_1,DIS_D_2,DIS_D_7,DIS_D_10,DIS_D_18,DIS_D_19,DIS_D_20,DIS_D_27,DIS_D_28,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other
count,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0,97865.0
mean,6.081868,0.030859,4.391713,42.919726,1.331978,15.986624,0.373474,0.201839,0.64028,7.426342,0.198529,0.015501,0.00702,0.000858,0.050999,1e-05,0.125888,0.105227,0.000204,0.072825,0.06324,0.003076,0.000388,3.1e-05,0.000399,0.531988,0.006867,0.000133,1e-05,1e-05,0.463393,0.770592,0.885924,0.1185,0.049619,0.026792,0.026036,0.538926,0.298493,0.140663,0.094038,0.086895,0.068789,0.04918,0.050713,0.032923,0.178307,0.304665,0.058591,0.567302,0.000153,0.069238,5.1e-05,0.708783,0.187789,0.000102,0.103326,0.735493,0.208266,0.006233,0.000991,0.047453,8.2e-05,2e-05,5.1e-05,0.00141,0.192909,0.006458,0.764798,0.020661,0.015174
std,1.595037,0.172936,2.97642,19.64047,1.698432,8.08455,1.273538,0.945136,1.270368,1.930904,0.398894,0.123535,0.083491,0.029285,0.219996,0.003197,0.331724,0.306847,0.014294,0.25985,0.243396,0.055374,0.019701,0.005537,0.019959,0.498978,0.08258,0.011525,0.003197,0.003197,0.498661,0.420454,0.317905,0.323201,0.217158,0.161476,0.159243,0.498485,0.457599,0.347676,0.291883,0.281683,0.253096,0.216245,0.219412,0.178436,0.382773,0.460268,0.234859,0.495452,0.012379,0.25386,0.007148,0.454326,0.390546,0.010108,0.304386,0.441073,0.40607,0.078704,0.031467,0.212607,0.009041,0.004521,0.007148,0.037525,0.394584,0.080101,0.424127,0.142248,0.122245
min,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,0.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,6.0,0.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,7.0,0.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,9.0,1.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Train_test splitting

In [32]:
# We should only deal with race missings AFTER the split. We will also make a very simple classifier for inputting the race.
inputSet = df.drop("readmitted", axis=1)
outputSet = df["readmitted"]
X_train, X_test, y_train, y_test = train_test_split(inputSet, outputSet, random_state=42,test_size=0.3)

### Standarization and logarithmic transformations

In [33]:
numericInterestingVariables =['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']
for var in numericInterestingVariables:
    if (abs(X_train[var].skew()) >2) & (abs(X_train[var].kurtosis()) >2):
        print(var, " needs log")

number_outpatient  needs log
number_emergency  needs log
number_inpatient  needs log


In [34]:
#DECISION 14: Applying logarithmics to these variables
X_train["number_outpatient"] =  np.log1p(X_train["number_outpatient"])
X_train["number_emergency"] =  np.log1p(X_train["number_emergency"])
X_train["number_inpatient"] =  np.log1p(X_train["number_inpatient"])

X_test["number_outpatient"] =  np.log1p(X_test["number_outpatient"])
X_test["number_emergency"] =  np.log1p(X_test["number_emergency"])
X_test["number_inpatient"] =  np.log1p(X_test["number_inpatient"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [35]:
#df.describe().T
maxes = df.describe().T["max"]
Numeric = maxes[maxes!=1.0].index.tolist()
df[Numeric].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,97865.0,6.081868,1.595037,0.0,5.0,6.0,7.0,9.0
time_in_hospital,97865.0,4.391713,2.97642,1.0,2.0,4.0,6.0,14.0
num_lab_procedures,97865.0,42.919726,19.64047,1.0,31.0,44.0,57.0,132.0
num_procedures,97865.0,1.331978,1.698432,0.0,0.0,1.0,2.0,6.0
num_medications,97865.0,15.986624,8.08455,1.0,10.0,15.0,20.0,81.0
number_outpatient,97865.0,0.373474,1.273538,0.0,0.0,0.0,0.0,42.0
number_emergency,97865.0,0.201839,0.945136,0.0,0.0,0.0,0.0,76.0
number_inpatient,97865.0,0.64028,1.270368,0.0,0.0,0.0,1.0,21.0
number_diagnoses,97865.0,7.426342,1.930904,1.0,6.0,8.0,9.0,16.0


In [36]:
#Decision 15: Standarization of numerical variables (not binary)
scaler = StandardScaler().fit(X_train[Numeric])
X_train[Numeric] = scaler.transform(X_train[Numeric])
X_test[Numeric]  = scaler.transform(X_test[Numeric])

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  aft

In [37]:
X_train.head()
X_train[Numeric].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,68505.0,-2.775728e-16,1.000007,-3.82556,-0.681085,-0.05219,0.576705,1.834495
time_in_hospital,68505.0,-3.095124e-16,1.000007,-1.140566,-0.804003,-0.130877,0.54225,3.234756
num_lab_procedures,68505.0,-1.011325e-15,1.000007,-2.139982,-0.61069,0.052003,0.714696,4.537924
num_procedures,68505.0,8.670371e-16,1.000007,-0.785799,-0.785799,-0.198719,0.388362,2.736683
num_medications,68505.0,1.252645e-16,1.000007,-1.853779,-0.742479,-0.125091,0.492298,8.024437
number_outpatient,68505.0,5.535881e-16,1.000007,-0.402535,-0.402535,-0.402535,-0.402535,8.306451
number_emergency,68505.0,2.480981e-16,1.000007,-0.326184,-0.326184,-0.326184,-0.326184,13.345836
number_inpatient,68505.0,2.488433e-15,1.000007,-0.64189,-0.64189,-0.64189,0.711977,5.395589
number_diagnoses,68505.0,1.216213e-16,1.000007,-3.328737,-0.739071,0.296796,0.814729,4.440262


In [38]:
FinalTrainDataset = X_train
FinalTrainDataset["readmitted"] = y_train
FinalTestDataset = X_test
FinalTestDataset["readmitted"] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [39]:
FinalTrainDataset.to_csv('./Train.csv')
FinalTestDataset.to_csv('./Test.csv')