## Projeto Analysis of Diabetes Risk Factor

### Modelagem de Dados.

#### Importando as bibliotecas necessárias.

In [1]:
import pandas as pd
import glob

### Carregando todos os dados brutos.

#### Criando uma estrutura de Dataframe vazia.

In [14]:
# Empty placeholder frame; the loading cell below fills it with the CSV contents.
df_diabetes_health_indcators = pd.DataFrame()

#### Carregando dados locais para o Dataframe.

In [15]:
# Read every CSV file found in the landing folder. Forward slashes are accepted
# by glob on Windows as well as on POSIX systems, and sorting the matches makes
# the resulting row order reproducible across machines.
landing_frames = [
    pd.read_csv(csv_path)
    for csv_path in sorted(glob.glob("../datalake/landing/*.csv"))
]

# Concatenate once instead of re-copying the accumulated frame for each file.
# Each file's own row labels are kept, so labels repeat across files.
# When no file is found, the empty frame created earlier is left untouched.
if landing_frames:
    df_diabetes_health_indcators = pd.concat(landing_frames)

In [16]:
# Preview the first five rows of the concatenated raw data.
df_diabetes_health_indcators.head(n=5)

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0,
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0,
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0,
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0,
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0,


### Pré-processamento e limpeza

#### Organizando coluna alvo.
Dentre os arquivos que foram carregados para o dataset, em alguns a coluna alvo é **Diabetes_012**, que possui os valores:
- 0 = sem diabetes 
- 1 = pré-diabetes 
- 2 = diabetes

Já em outros arquivos, a coluna alvo é **Diabetes_binary**, que possui os valores:
- 0 = sem diabetes 
- 1 = pré-diabetes ou diabetes

Por questões de definição do problema, vamos considerar a opção binária. Logo, os valores da coluna **Diabetes_012** serão mapeados para **Diabetes_binary** e, em seguida, a coluna **Diabetes_012** será removida. O mapeamento é:
- 0 = 0
- 1 ou 2 = 1

In [17]:
def map_target_column(Diabetes_012, Diabetes_binary):
    """Collapse the three-class diabetes label into the binary label.

    Args:
        Diabetes_012: Three-class label (0, 1 or 2), or NaN when the row
            has no three-class label.
        Diabetes_binary: Existing binary label, used as the fallback.

    Returns:
        0 when ``Diabetes_012`` equals 0, 1 when it equals 1 or 2, and
        ``Diabetes_binary`` unchanged for any other value (NaN, negative
        numbers, or non-negative numbers outside {0, 1, 2}).
    """
    # NaN fails every ordering comparison, so rows without a three-class label
    # fall straight back to the binary label here.
    if not Diabetes_012 >= 0:
        return Diabetes_binary
    if Diabetes_012 == 0:
        return 0
    return 1 if Diabetes_012 in (1, 2) else Diabetes_binary

In [18]:
# Column-wise version of the rule implemented by map_target_column: rows whose
# Diabetes_012 is 0, 1 or 2 get 0 (value 0) or 1 (values 1 and 2); every other
# row, including those where Diabetes_012 is NaN, keeps its Diabetes_binary value.
diabetes_012 = df_diabetes_health_indcators['Diabetes_012']
has_three_class_label = diabetes_012.isin([0, 1, 2]).to_numpy()
binary_from_three_class = (diabetes_012 > 0).astype(float).to_numpy()

# Plain arrays are handed to ``where`` so that no index alignment is attempted;
# the frame may carry repeated row labels after several files are concatenated.
df_diabetes_health_indcators['Diabetes_binary'] = (
    df_diabetes_health_indcators['Diabetes_binary']
    .where(~has_three_class_label, binary_from_three_class)
)

In [20]:
# Diabetes_012 has already been folded into Diabetes_binary, so it is dropped.
# The result is rebound instead of using inplace=True, which keeps the step a
# plain expression and avoids mutating a frame other names may still point to.
df_diabetes_health_indcators = df_diabetes_health_indcators.drop(columns=['Diabetes_012'])

#### Renomeando atributos

In [26]:
# Raw CSV column name -> descriptive snake_case name.
# NOTE(review): a few target names look unintended ('high_blood_preassure',
# 'heart_diseaseor_attack', and 'physical_activity_in past_30_days', which
# contains a space). They are kept as-is because later cells select columns by
# these exact strings.
column_aliases = {
    'Diabetes_binary': 'diabetes',
    'HighBP': 'high_blood_preassure',
    'HighChol': 'high_cholesterol',
    'CholCheck': 'cholesterol_check',
    'BMI': 'body_mass_index',
    'Smoker': 'smoker',
    'Stroke': 'stroke',
    'HeartDiseaseorAttack': 'heart_diseaseor_attack',
    'PhysActivity': 'physical_activity_in past_30_days',
    'Fruits': 'at_least_one_fruit_a_day',
    'Veggies': 'at_least_one_veggies_a_day',
    'HvyAlcoholConsump': 'high_consumption_of_alcohol',
    'AnyHealthcare': 'any_healthcare',
    'NoDocbcCost': 'no_doctor_because_cost',
    'GenHlth': 'general_health_scale',
    'MentHlth': 'days_of_poor_mental_health',
    'PhysHlth': 'physical_illness_injury_days',
    'DiffWalk': 'serious_difficulty_walking',
    'Sex': 'sex',
    'Age': 'age',
    'Education': 'education',
    'Income': 'income',
}

df_diabetes_health_indcators = df_diabetes_health_indcators.rename(
    mapper=column_aliases, axis='columns'
)

#### Organizando o dataset.

In [27]:
# Desired column order: the feature columns first, the 'diabetes' target last.
ordered_columns = [
    'high_blood_preassure',
    'high_cholesterol',
    'cholesterol_check',
    'body_mass_index',
    'stroke',
    'heart_diseaseor_attack',
    'smoker',
    'physical_activity_in past_30_days',
    'at_least_one_fruit_a_day',
    'at_least_one_veggies_a_day',
    'high_consumption_of_alcohol',
    'any_healthcare',
    'no_doctor_because_cost',
    'general_health_scale',
    'days_of_poor_mental_health',
    'physical_illness_injury_days',
    'serious_difficulty_walking',
    'sex',
    'age',
    'education',
    'income',
    'diabetes',
]

# Selecting with a list raises KeyError if any listed column is missing.
df_diabetes_health_indcators = df_diabetes_health_indcators[ordered_columns]

In [28]:
# Preview the first five rows after renaming and reordering the columns.
df_diabetes_health_indcators.head(n=5)

Unnamed: 0,high_blood_preassure,high_cholesterol,cholesterol_check,body_mass_index,stroke,heart_diseaseor_attack,smoker,physical_activity_in past_30_days,at_least_one_fruit_a_day,at_least_one_veggies_a_day,...,no_doctor_because_cost,general_health_scale,days_of_poor_mental_health,physical_illness_injury_days,serious_difficulty_walking,sex,age,education,income,diabetes
0,1.0,1.0,1.0,40.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0,0.0
1,0.0,0.0,0.0,25.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0,0.0
2,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0,0.0
3,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0,0.0
4,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0,0.0


#### Verificando os registros missing.

In [29]:
# Count the missing values in each column.
df_diabetes_health_indcators.isna().sum()

high_blood_preassure                 0
high_cholesterol                     0
cholesterol_check                    0
body_mass_index                      0
stroke                               0
heart_diseaseor_attack               0
smoker                               0
physical_activity_in past_30_days    0
at_least_one_fruit_a_day             0
at_least_one_veggies_a_day           0
high_consumption_of_alcohol          0
any_healthcare                       0
no_doctor_because_cost               0
general_health_scale                 0
days_of_poor_mental_health           0
physical_illness_injury_days         0
serious_difficulty_walking           0
sex                                  0
age                                  0
education                            0
income                               0
diabetes                             0
dtype: int64

#### Alterando os tipos de dados.

In [30]:
# Show column dtypes and non-null counts before the type conversion.
df_diabetes_health_indcators.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 578052 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   high_blood_preassure               578052 non-null  float64
 1   high_cholesterol                   578052 non-null  float64
 2   cholesterol_check                  578052 non-null  float64
 3   body_mass_index                    578052 non-null  float64
 4   stroke                             578052 non-null  float64
 5   heart_diseaseor_attack             578052 non-null  float64
 6   smoker                             578052 non-null  float64
 7   physical_activity_in past_30_days  578052 non-null  float64
 8   at_least_one_fruit_a_day           578052 non-null  float64
 9   at_least_one_veggies_a_day         578052 non-null  float64
 10  high_consumption_of_alcohol        578052 non-null  float64
 11  any_healthcare                     5780

In [34]:
# Names of the columns to cast to integer: every column except body_mass_index.
columns = df_diabetes_health_indcators.columns.tolist()
columns.remove('body_mass_index')

In [35]:
# Cast every column listed in ``columns`` to integer in a single call instead of
# assigning the columns back one at a time. Columns that are not listed keep
# their current dtype.
df_diabetes_health_indcators = df_diabetes_health_indcators.astype(
    {column: int for column in columns}
)

In [36]:
# Show column dtypes again to confirm the result of the integer conversion.
df_diabetes_health_indcators.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 578052 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   high_blood_preassure               578052 non-null  int32  
 1   high_cholesterol                   578052 non-null  int32  
 2   cholesterol_check                  578052 non-null  int32  
 3   body_mass_index                    578052 non-null  float64
 4   stroke                             578052 non-null  int32  
 5   heart_diseaseor_attack             578052 non-null  int32  
 6   smoker                             578052 non-null  int32  
 7   physical_activity_in past_30_days  578052 non-null  int32  
 8   at_least_one_fruit_a_day           578052 non-null  int32  
 9   at_least_one_veggies_a_day         578052 non-null  int32  
 10  high_consumption_of_alcohol        578052 non-null  int32  
 11  any_healthcare                     5780