# Data prep

Este notebook é dedicado ao data prep e exportação de um csv com os dados trabalhados ('diabetic_data_df.csv') e as colunas utilizadas ('col2use.csv')

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
# Load the raw dataset from 'diabetic_data.csv' in the working directory.
# NOTE(review): pandas >= 2.0 parses the literal string "None" as NaN by default.
# The one-hot column list recorded further down in this notebook contains
# 'max_glu_serum_None' / 'A1Cresult_None', so that output was presumably produced
# with an older pandas -- confirm those columns still appear on your version.
df = pd.read_csv('diabetic_data.csv')

## Algumas análises

In [3]:
# Report how many rows the loaded dataset contains.
print(f'Number of samples: {len(df)}')

Number of samples: 101766


In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Count the number of rows for each distinct value of the 'readmitted' column.
df.groupby('readmitted').size()

In [None]:
# 'discharge_disposition_id' tells us where the patient went after the hospitalization;
# count the number of rows for each id.
df.groupby('discharge_disposition_id').size()

In [None]:
# Preview the columns ten at a time: columns 0-9.
df.iloc[:, :10].head()

In [None]:
# Preview the columns ten at a time: columns 10-19.
df.iloc[:, 10:20].head()

In [None]:
# Preview the columns ten at a time: columns 20-29.
df.iloc[:, 20:30].head()

In [None]:
# Preview the columns ten at a time: columns 30-39.
df.iloc[:, 30:40].head()

In [None]:
# Preview the columns ten at a time: column 40 onwards.
df.iloc[:, 40:].head()

In [None]:
# For every column, list its distinct values when there are fewer than 30 of them;
# otherwise only report how many distinct values it has.
for col_name in df.columns:
    distinct = df[col_name].unique()

    if len(distinct) >= 30:
        # too many values to list: print only the count
        print(f'{col_name}: {len(distinct)} unique values')
    else:
        print(col_name)
        print(distinct)

## Trabalhando a base

In [4]:
# According to IDs_mapping.csv, discharge_disposition_id values 11, 13, 14, 19, 20
# and 21 are related to death or hospice; remove those rows from the modelling data.
# The explicit .copy() makes `df` an independent frame, so the column assignments
# in the following cells do not trigger pandas' SettingWithCopyWarning.
df = df.loc[~df['discharge_disposition_id'].isin([11, 13, 14, 19, 20, 21])].copy()

In [5]:
# Binary classification target: 1 where 'readmitted' equals '<30', 0 otherwise.
df['OUTPUT_LABEL'] = df['readmitted'].eq('<30').astype('int')

In [6]:
# Turn every '?' placeholder in the frame into a missing value (NaN).
df = df.replace(to_replace='?', value=np.nan)

In [7]:
# The 'race', 'payer_code' and 'medical_specialty' columns have missing data;
# fill the gaps with the 'UNK' label.
for col_name in ('race', 'payer_code', 'medical_specialty'):
    df[col_name] = df[col_name].fillna('UNK')

In [8]:
# Specialties that keep their own category in the new 'med_spec' column
# (described by the original author as the 10 most frequent values of
# 'medical_specialty'); every other specialty is grouped under 'Other'.
top_10 = [
    'UNK',
    'InternalMedicine',
    'Emergency/Trauma',
    'Family/GeneralPractice',
    'Cardiology',
    'Surgery-General',
    'Nephrology',
    'Orthopedics',
    'Orthopedics-Reconstructive',
    'Radiologist',
]

# Keep the specialty when it is in the list, otherwise fall back to 'Other'.
df['med_spec'] = df['medical_specialty'].where(
    df['medical_specialty'].isin(top_10), 'Other'
)

To convert our categorical features to numbers, we will use a technique called one-hot encoding. In one-hot encoding, you create a new column for each unique value in that column. Then the value of the column is 1 if the sample has that unique value or 0 otherwise. For example, for the column race, we would create new columns ('race_Caucasian','race_AfricanAmerican', etc). If the patient's race is Caucasian, the patient gets a 1 under 'race_Caucasian' and 0 under the rest of the race columns. To create these one-hot encoding columns, we can use the get_dummies function.

Now the problem is that if we create a column for each unique value, we have correlated columns. In other words, the value in one column can be figured out by looking at the rest of the columns. For example, if the sample is not AfricanAmerican, Asian, Caucasian, Hispanic or Other, it must be UNK. To deal with this, we can use the drop_first option, which will drop the first categorical value for each column.

By default, the get_dummies function only encodes string (object) or categorical columns and leaves numerical columns unchanged. To have the numerical ID columns one-hot encoded as well, we convert them into strings first and then it will work properly.

In [9]:
# Categorical columns that are one-hot encoded as they are.
cols_cat = ['race', 'gender',
       'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed','payer_code']

# ID columns that are cast to str below so that get_dummies encodes them too.
cols_cat_num = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']

df[cols_cat_num] = df[cols_cat_num].astype('str')

# drop_first=True drops the first level of each feature to avoid fully redundant columns.
# dtype is pinned to uint8 so the indicator columns hold 0/1 on every pandas version
# (pandas >= 2.0 would otherwise return booleans).
df_cat = pd.get_dummies(
    df[cols_cat + cols_cat_num + ['med_spec']],
    drop_first=True,
    dtype='uint8',
)

# Remove dummy columns left over from a previous run of this cell before concatenating,
# so that re-running the cell does not duplicate them in `df`.
df = pd.concat([df.drop(columns=df_cat.columns, errors='ignore'), df_cat], axis=1)

# Save the names of the one-hot encoded columns in 'cols_all_cat'.
cols_all_cat = list(df_cat.columns)
print (df_cat.columns.tolist())

['race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Other', 'race_UNK', 'gender_Male', 'gender_Unknown/Invalid', 'max_glu_serum_>300', 'max_glu_serum_None', 'max_glu_serum_Norm', 'A1Cresult_>8', 'A1Cresult_None', 'A1Cresult_Norm', 'metformin_No', 'metformin_Steady', 'metformin_Up', 'repaglinide_No', 'repaglinide_Steady', 'repaglinide_Up', 'nateglinide_No', 'nateglinide_Steady', 'nateglinide_Up', 'chlorpropamide_No', 'chlorpropamide_Steady', 'chlorpropamide_Up', 'glimepiride_No', 'glimepiride_Steady', 'glimepiride_Up', 'acetohexamide_Steady', 'glipizide_No', 'glipizide_Steady', 'glipizide_Up', 'glyburide_No', 'glyburide_Steady', 'glyburide_Up', 'tolbutamide_Steady', 'pioglitazone_No', 'pioglitazone_Steady', 'pioglitazone_Up', 'rosiglitazone_No', 'rosiglitazone_Steady', 'rosiglitazone_Up', 'acarbose_No', 'acarbose_Steady', 'acarbose_Up', 'miglitol_No', 'miglitol_Steady', 'miglitol_Up', 'troglitazone_Steady', 'tolazamide_Steady', 'tolazamide_Up', 'insulin_No', 'insulin_Steady', 'insul

In [10]:
# Turn the categorical age and weight columns into numerical features.
# Each age bracket label is mapped to the lower bound of the bracket.
age_id = {'[0-10)':0,
          '[10-20)':10,
          '[20-30)':20,
          '[30-40)':30,
          '[40-50)':40,
          '[50-60)':50,
          '[60-70)':60,
          '[70-80)':70,
          '[80-90)':80,
          '[90-100)':90}
# Series.map does the label -> number lookup directly and yields a numeric column,
# instead of relying on the deprecated object downcasting done by Series.replace.
# Any label that is not a key of `age_id` becomes NaN.
df['age_group'] = df['age'].map(age_id)

# 1 when a weight value is present, 0 when it is missing.
df['has_weight'] = df['weight'].notnull().astype('int')

# Names of the two engineered columns.
cols_extra = ['age_group','has_weight']

In [11]:
# Numerical columns that are used as features without transformation.
cols_num = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
]

In [12]:
# Full feature list: numerical + one-hot encoded + engineered columns.
col2use = cols_num + cols_all_cat + cols_extra

# Export the feature names to 'col2use.csv' (written with the default index column).
col2use_df = pd.DataFrame({'col2use': col2use})
col2use_df.to_csv('col2use.csv')

# df_data is the final dataframe: the selected features plus the binary target.
df_data = df[col2use + ['OUTPUT_LABEL']]

In [13]:
# Export the final dataframe to CSV, without writing the index column.
df_data.to_csv("diabetic_data_df.csv", index=False)

In [None]:
# Preview the first rows of the exported frame.
df_data.head()