In [1]:
%reload_ext autoreload
%autoreload 2

import os
import zipfile

import numpy as np
import pandas as pd
import requests
import sklearn.model_selection

from sarpu.data_processing import *
from sarpu.paths_and_names import *

# Input Data

In [2]:
# Names and locations
# data_folder: root folder handed to the path helpers below
# data_name:   short name of this dataset
# url:         zip archive that is downloaded when the raw CSV is not on disk yet
data_folder = "../../Data/"
data_name = "diabetes"
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip"

In [3]:
# Creation information
# Both values are passed to StratifiedShuffleSplit further down:
# nb_splits as n_splits, test_size as test_size.
nb_splits, test_size = 5, 0.2

# Prepare folders

In [4]:
# Prepare folders
# Resolve the three dataset folders and make sure they exist.
# os.makedirs(..., exist_ok=True) replaces the `!mkdir -p` shell escapes: it
# creates missing parents and tolerates existing folders in the same way, but
# does not depend on a POSIX shell and is safe for paths containing spaces.
data_folder_original = original_data_path(data_folder, data_name)
data_folder_processed = processed_data_path(data_folder, data_name)
data_folder_partitions = partitions_data_path(data_folder, data_name)

for folder in (data_folder_original, data_folder_processed, data_folder_partitions):
    os.makedirs(folder, exist_ok=True)

# Download

In [5]:
# Download and unzip the data (skipped when the CSV is already on disk)

# The archive is stored under the last path component of the URL.
archive = os.path.join(data_folder_original, url.split("/")[-1])
# The zip unpacks into a folder named after the archive without its extension.
unprocessed_data_path = os.path.splitext(archive)[0] + "/diabetic_data.csv"

if not os.path.exists(unprocessed_data_path):
    response = requests.get(url, allow_redirects=True, timeout=60)
    # Fail loudly on an HTTP error instead of saving an error page as the zip.
    response.raise_for_status()
    # Context managers guarantee the file handles are flushed and closed.
    with open(archive, "wb") as archive_file:
        archive_file.write(response.content)
    # zipfile replaces `!unzip`: no external tool and no interactive overwrite prompt.
    with zipfile.ZipFile(archive) as zipped:
        zipped.extractall(data_folder_original)

Archive:  ../../Data/diabetes/original/dataset_diabetes.zip
  inflating: ../../Data/diabetes/original/dataset_diabetes/diabetic_data.csv  
  inflating: ../../Data/diabetes/original/dataset_diabetes/IDs_mapping.csv  


In [6]:
# Read the raw data into a pandas DataFrame and clean it.
# Leaves behind: df (cleaned frame), header (column names before the final
# missing-value filter), non_multival and multival (column-name lists).

df = pd.read_csv(unprocessed_data_path).dropna()

# Remove ids
df = df.drop(["encounter_id", "patient_nbr"], axis=1)

# Remove attributes with too many missing values,
# like in http://www.hindawi.com/journals/bmri/2014/781670/
df = df.drop(["weight", "payer_code"], axis=1)

# Add value "Missing" for medical specialty,
# like in http://www.hindawi.com/journals/bmri/2014/781670/
df.loc[df["medical_specialty"] == "?", "medical_specialty"] = "Missing"

# Only keep the 10 most common diagnoses, discharge dispositions, admission
# sources and medical specialties; replace others by "other"
for column in ["diag_1", "diag_2", "diag_3",
               "discharge_disposition_id", "admission_source_id",
               "medical_specialty"]:
    df = keep_k_most_common(df, column, 10)

# Remove columns whose most frequent value covers more than 95% of the rows
n = df.shape[0]
for column in df.columns.values:
    if df[column].value_counts().values[0] / n > 0.95:
        df = df.drop(column, axis=1)

# Replace each age bracket by a number: "[0-10)" -> 1, ..., "[90-100)" -> 10.
# The loop starts at 1; i = 0 would look for "[-10-0)", which never occurs.
for i in range(1, 11):
    max_age = 10 * i
    min_age = max_age - 10
    df.loc[df["age"] == "[" + str(min_age) + "-" + str(max_age) + ")", "age"] = i

header = df.columns.values
print(header)

# Remove rows with missing values ("?" and "Unknown/Invalid" mark them).
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
df = df.replace("?", np.nan).replace("Unknown/Invalid", np.nan).dropna()

# Columns kept as plain numbers; all remaining features (except the target
# "readmitted") are handled as multivalued by the binarization cell.
non_multival = ["age", "time_in_hospital"] + [c for c in header if c.startswith("num")]
multival = [c for c in header if c not in non_multival and c != "readmitted"]
print(multival)

df


['race' 'gender' 'age' 'admission_type_id' 'discharge_disposition_id'
 'admission_source_id' 'time_in_hospital' 'medical_specialty'
 'num_lab_procedures' 'num_procedures' 'num_medications'
 'number_outpatient' 'number_emergency' 'number_inpatient' 'diag_1'
 'diag_2' 'diag_3' 'number_diagnoses' 'max_glu_serum' 'A1Cresult'
 'metformin' 'glimepiride' 'glipizide' 'glyburide' 'pioglitazone'
 'rosiglitazone' 'insulin' 'change' 'diabetesMed' 'readmitted']
['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'change', 'diabetesMed']


Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,...,metformin,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,Caucasian,Female,1,6,25,1,1,other,41,0,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,2,1,1,7,3,Missing,59,0,...,No,No,No,No,No,No,Up,Ch,Yes,>30
2,AfricanAmerican,Female,3,1,1,7,2,Missing,11,5,...,No,No,Steady,No,No,No,No,No,Yes,NO
3,Caucasian,Male,4,1,1,7,2,Missing,44,1,...,No,No,No,No,No,No,Up,Ch,Yes,NO
4,Caucasian,Male,5,1,1,7,1,Missing,51,0,...,No,No,Steady,No,No,No,Steady,Ch,Yes,NO
5,Caucasian,Male,6,2,1,2,3,Missing,31,6,...,No,No,No,No,No,No,Steady,No,Yes,>30
6,Caucasian,Male,7,3,1,2,4,Missing,70,1,...,Steady,Steady,No,No,No,No,Steady,Ch,Yes,NO
7,Caucasian,Male,8,1,1,7,5,Missing,73,0,...,No,No,No,Steady,No,No,No,No,Yes,>30
8,Caucasian,Female,9,2,1,4,13,Missing,68,2,...,No,No,Steady,No,No,No,Steady,Ch,Yes,NO
9,Caucasian,Female,10,3,3,4,12,InternalMedicine,33,3,...,No,No,No,No,No,Steady,Steady,Ch,Yes,NO


In [7]:
# Class distribution: number of rows for each value of the "readmitted" target

df["readmitted"].value_counts()


NO     53316
>30    35007
<30    11169
Name: readmitted, dtype: int64

In [8]:
# Make <30 the positive class: "class" is 1 where readmitted is "<30", else 0
df["class"] = (df["readmitted"] == "<30").astype("int64")

# The original target column is no longer needed once "class" exists
df = df.drop(columns="readmitted")

# Row counts per class, shown for every remaining column
df.groupby("class").count()

Unnamed: 0_level_0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,...,A1Cresult,metformin,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,88323,88323,88323,88323,88323,88323,88323,88323,88323,88323,...,88323,88323,88323,88323,88323,88323,88323,88323,88323,88323
1,11169,11169,11169,11169,11169,11169,11169,11169,11169,11169,...,11169,11169,11169,11169,11169,11169,11169,11169,11169,11169


In [9]:
# Binarize multivalued features:
#   more than 2 distinct values -> handed to binarize()
#   exactly 2 distinct values   -> recoded in place as -1 / +1
#   fewer than 2                -> dropped, the column carries no information

for column in multival:
    # Sort the distinct values so the -1/+1 assignment is the same on every
    # run: iterating a set of strings has no stable order across Python
    # processes. key=str keeps columns that mix ints and strings sortable.
    values = sorted(set(df[column]), key=str)
    if len(values) > 2:
        df = binarize(df, column)
    elif len(values) == 2:
        # Build both masks before writing, so the second comparison can never
        # match cells that were just overwritten with -1.
        is_first = df[column] == values[0]
        is_second = df[column] == values[1]
        df.loc[is_first, column] = -1
        df.loc[is_second, column] = 1
    else:  # drop useless features
        print(column, values)
        df = df.drop(column, axis=1)

# df

In [10]:
# Normalize: make every column numeric, then min-max scale each one to [-1, 1]
df = df.apply(pd.to_numeric)

col_min = df.min()
col_span = df.max() - col_min
normalized_df = (df.astype(float) - col_min) / col_span * 2 - 1

# The label must stay as it was, so copy it back unscaled
normalized_df["class"] = df["class"]
df = normalized_df
# df

In [11]:
# Move class to the back so the label is the last column

cols = list(df.columns.values)  # all column names, in their current order
cols.remove("class")            # feature columns only
df = df.loc[:, cols + ["class"]]

df

Unnamed: 0,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,...,pioglitazone-No,rosiglitazone-Steady,rosiglitazone-Up,rosiglitazone-Down,rosiglitazone-No,insulin-Steady,insulin-Up,insulin-Down,insulin-No,class
0,-1.0,-1.000000,-1.000000,-0.389313,-1.000000,-1.000,-1.000000,-1.000000,-1.000000,-1.000000,...,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,0
1,-1.0,-0.777778,-0.692308,-0.114504,-1.000000,-0.575,-1.000000,-1.000000,-1.000000,0.066667,...,1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,0
2,-1.0,-0.555556,-0.846154,-0.847328,0.666667,-0.700,-0.904762,-1.000000,-0.904762,-0.333333,...,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,0
3,1.0,-0.333333,-0.846154,-0.343511,-0.666667,-0.625,-1.000000,-1.000000,-1.000000,-0.200000,...,1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,0
4,1.0,-0.111111,-1.000000,-0.236641,-1.000000,-0.825,-1.000000,-1.000000,-1.000000,-0.466667,...,1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,0
5,1.0,0.111111,-0.692308,-0.541985,1.000000,-0.625,-1.000000,-1.000000,-1.000000,0.066667,...,1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,0
6,1.0,0.333333,-0.538462,0.053435,-0.666667,-0.500,-1.000000,-1.000000,-1.000000,-0.200000,...,1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,0
7,1.0,0.555556,-0.384615,0.099237,-1.000000,-0.725,-1.000000,-1.000000,-1.000000,-0.066667,...,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,0
8,-1.0,0.777778,0.846154,0.022901,-0.333333,-0.325,-1.000000,-1.000000,-1.000000,-0.066667,...,1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,0
9,-1.0,1.000000,0.692308,-0.511450,0.000000,-0.575,-1.000000,-1.000000,-1.000000,-0.066667,...,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,0


In [12]:
# Convert the frame to numpy: every column but the last goes into x,
# the last column becomes the integer label vector y

import numpy as np

xy = df.values
x, y = xy[:, :-1], xy[:, -1].astype(int)

# Feature rows labelled 1 and feature rows labelled 0
is_positive = (y == 1)
is_negative = (y == 0)
x_pos = x[is_positive]
x_neg = x[is_negative]

In [13]:
# Save data and true classes as text files (labels written as integers)
features_file = data_path(data_folder, data_name)
np.savetxt(features_file, x)

labels_file = classlabels_path(data_folder, data_name)
np.savetxt(labels_file, y, fmt="%d")

# Different dataset partitions (train/test and class prior)

In [14]:
# Draw nb_splits stratified shuffle splits of (x, y) with a fixed random_state,
# and keep them as a list of (train indices, test indices) pairs
sss = sklearn.model_selection.StratifiedShuffleSplit(
    n_splits=nb_splits,
    test_size=test_size,
    random_state=0,
)
splits = [(train_part, test_part) for train_part, test_part in sss.split(x, y)]

In [15]:
# Save partitions, one file per split.
# Encoding per row: 0 means not in data, 1 means in train partition,
# 2 means in test partition

for split_nr, (train_idx, test_idx) in enumerate(splits):
    partition = np.zeros(y.shape, dtype=int)
    partition[train_idx] = 1
    partition[test_idx] = 2
    np.savetxt(partition_path(data_folder, data_name, split_nr), partition, fmt="%d")
   