In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [9]:
# Load the dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv("../alzheimers_disease_data.csv")

In [10]:
# Number of missing values in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)

PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTasks    0
Forgetfu

In [11]:
# Number of rows that are exact duplicates of an earlier row.
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

0


In [12]:
# Data type of every column, to spot columns parsed with an unexpected type.
column_dtypes = df.dtypes
print(column_dtypes)

PatientID                      int64
Age                            int64
Gender                         int64
Ethnicity                      int64
EducationLevel                 int64
BMI                          float64
Smoking                        int64
AlcoholConsumption           float64
PhysicalActivity             float64
DietQuality                  float64
SleepQuality                 float64
FamilyHistoryAlzheimers        int64
CardiovascularDisease          int64
Diabetes                       int64
Depression                     int64
HeadInjury                     int64
Hypertension                   int64
SystolicBP                     int64
DiastolicBP                    int64
CholesterolTotal             float64
CholesterolLDL               float64
CholesterolHDL               float64
CholesterolTriglycerides     float64
MMSE                         float64
FunctionalAssessment         float64
MemoryComplaints               int64
BehavioralProblems             int64
A

In [13]:
# Conclusion of the null / duplicate / dtype checks, emitted as cell output.
mensagem_verificacao = 'Após verificar os valores e tipos de dados nas colunas não foram identificadas duplicatas ou inconsistências'
print(mensagem_verificacao)

Após verificar os valores e tipos de dados nas colunas não foram identificadas duplicatas ou inconsistências


In [5]:
# Remove the "DoctorInCharge" column from the working frame.
# errors="ignore" keeps this cell re-runnable: because the result is assigned
# back to `df`, a second execution would otherwise raise KeyError (the column
# is already gone after the first run).
df = df.drop(columns=["DoctorInCharge"], errors="ignore")

# Remover categóricas inúteis para a análise #

#### Criar novas features relevantes para o problema
Pensando no problema que queremos solucionar, optamos por seguir um caminho para classificar a presença de Alzheimer somente com fatores diretos de risco da doença. Esses fatores têm impacto estabelecido no Alzheimer, seja pela contribuição ao declínio cognitivo, à progressão da doença ou pela associação em estudos epidemiológicos.

Logo, selecionamos as colunas que julgamos serem fatores que podem ter relação indireta com o Alzheimer, mas cuja influência depende de interações com outros fatores, e analisamos as suas correlações com o diagnóstico da doença.

* Fatores Indiretos: 'PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'Smoking', 'PhysicalActivity', 'DietQuality', 'AlcoholConsumption', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolTriglycerides', 'HeadInjury', 'PersonalityChanges', 'BehavioralProblems'


In [10]:
# Candidate columns whose correlation with the target is inspected.
colunas_especificas = [
    'PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel',
    'Smoking', 'PhysicalActivity', 'DietQuality', 'AlcoholConsumption',
    'CholesterolTotal', 'CholesterolLDL', 'CholesterolTriglycerides',
    'HeadInjury', 'PersonalityChanges', 'BehavioralProblems',
]

# Pairwise correlation matrix over the candidates plus the 'Diagnosis' target.
matriz_correlacao = df[[*colunas_especificas, 'Diagnosis']].corr()

# Keep only the 'Diagnosis' column, restricted to the candidate rows
# (this leaves out the target's correlation with itself).
correlacao_com_diagnosis = matriz_correlacao.loc[colunas_especificas, 'Diagnosis']

print(correlacao_com_diagnosis)

PatientID                   0.041019
Age                        -0.005488
Gender                     -0.020975
Ethnicity                  -0.014782
EducationLevel             -0.043966
Smoking                    -0.004865
PhysicalActivity            0.005945
DietQuality                 0.008506
AlcoholConsumption         -0.007618
CholesterolTotal            0.006394
CholesterolLDL             -0.031976
CholesterolTriglycerides    0.022672
HeadInjury                 -0.021411
PersonalityChanges         -0.020627
BehavioralProblems          0.224350
Name: Diagnosis, dtype: float64


Observamos que as correlações das colunas foram, em sua maioria, muito baixas. Apenas a coluna de problemas comportamentais apresentou uma correlação ligeiramente maior, mas sua utilidade é limitada devido à falta de detalhes sobre os tipos de problemas registrados no dataset.
Assim, optamos por remover essas colunas, pois elas não contribuem significativamente para o processo de classificação.

In [11]:
# Remove the candidate columns listed in `colunas_especificas` from the frame.
# errors="ignore" keeps this cell re-runnable: because the result is assigned
# back to `df`, a second execution would otherwise raise KeyError (the columns
# are already gone after the first run).
df = df.drop(columns=colunas_especificas, errors="ignore")
df

Unnamed: 0,BMI,SleepQuality,FamilyHistoryAlzheimers,CardiovascularDisease,Diabetes,Depression,Hypertension,SystolicBP,DiastolicBP,CholesterolHDL,MMSE,FunctionalAssessment,MemoryComplaints,ADL,Confusion,Disorientation,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,22.927749,9.025679,0,0,1,1,0,142,72,33.682563,21.463532,6.518877,0,1.725883,0,0,1,0,0
1,26.827681,7.151293,0,0,0,0,0,115,64,79.028477,20.613267,7.118696,0,2.592424,0,0,0,1,0
2,17.795882,9.673574,1,0,0,0,0,99,116,69.772292,7.356249,5.895077,0,7.119548,0,1,1,0,0
3,33.800817,8.392554,0,0,0,0,0,118,115,68.457491,13.991127,8.965106,0,6.481226,0,0,0,0,0
4,20.716974,5.597238,0,0,0,0,0,94,117,56.874305,13.517609,6.045039,0,0.014691,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,39.121757,7.535540,0,0,0,0,0,122,101,60.943092,1.201190,0.238667,0,4.492838,1,0,0,0,1
2145,17.857903,8.555256,0,0,0,0,0,152,106,93.649735,6.458060,8.687480,0,9.204952,0,0,0,0,1
2146,15.476479,5.769464,0,0,0,0,0,115,118,99.678209,17.011003,1.972137,0,5.036334,0,0,0,0,1
2147,15.299911,8.322874,0,1,0,0,0,103,96,81.281111,4.030491,5.173891,0,3.785399,0,0,0,1,1


In [16]:
# Rescale every remaining column with MinMaxScaler (default range [0, 1]).
# Every column of `df` is passed in, so the 'Diagnosis' target is scaled too.
# NOTE(review): the scaler is fitted on the whole frame; if a train/test split
# follows later, consider fitting on the training part only — confirm.
scaler = MinMaxScaler()

# fit_transform returns a plain array, so the column labels are re-attached.
# index=df.index also keeps the original row labels; without it the result
# silently gets a fresh 0..n-1 index and no longer aligns with `df` once rows
# have been filtered or re-indexed.
df_normalized = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

print(df_normalized)

           BMI  SleepQuality  FamilyHistoryAlzheimers  CardiovascularDisease  \
0     0.316960      0.837564                      0.0                    0.0   
1     0.473058      0.525021                      0.0                    0.0   
2     0.111553      0.945597                      1.0                    0.0   
3     0.752163      0.731994                      0.0                    0.0   
4     0.228472      0.265892                      0.0                    0.0   
...        ...           ...                      ...                    ...   
2144  0.965137      0.589092                      0.0                    0.0   
2145  0.114035      0.759124                      0.0                    0.0   
2146  0.018717      0.294609                      0.0                    0.0   
2147  0.011650      0.720376                      0.0                    1.0   
2148  0.731706      0.979802                      0.0                    0.0   

      Diabetes  Depression  Hypertensio

In [18]:
# Preview the first five rows of the scaled frame.
df_normalized.head(n=5)

Unnamed: 0,BMI,SleepQuality,FamilyHistoryAlzheimers,CardiovascularDisease,Diabetes,Depression,Hypertension,SystolicBP,DiastolicBP,CholesterolHDL,MMSE,FunctionalAssessment,MemoryComplaints,ADL,Confusion,Disorientation,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,0.31696,0.837564,0.0,0.0,1.0,1.0,0.0,0.58427,0.20339,0.171039,0.715606,0.652102,0.0,0.172486,0.0,0.0,1.0,0.0,0.0
1,0.473058,0.525021,0.0,0.0,0.0,0.0,0.0,0.280899,0.067797,0.738026,0.687251,0.712108,0.0,0.259154,0.0,0.0,0.0,1.0,0.0
2,0.111553,0.945597,1.0,0.0,0.0,0.0,0.0,0.101124,0.949153,0.62229,0.245145,0.589697,0.0,0.711936,0.0,1.0,1.0,0.0,0.0
3,0.752163,0.731994,0.0,0.0,0.0,0.0,0.0,0.314607,0.932203,0.605851,0.46641,0.896823,0.0,0.648094,0.0,0.0,0.0,0.0,0.0
4,0.228472,0.265892,0.0,0.0,0.0,0.0,0.0,0.044944,0.966102,0.461019,0.450619,0.604699,0.0,0.001341,0.0,0.0,1.0,0.0,0.0
