In [4]:
import pandas as pd


In [5]:
# Load the raw dataset from a CSV located one directory above the working directory.
df = pd.read_csv('../alzheimers_disease_data.csv')


In [6]:
# Drop the 'DoctorInCharge' column, which is not used in the analysis.
# errors="ignore" keeps this cell re-runnable: because `df` is reassigned in
# place, a second execution would otherwise raise KeyError once the column
# has already been removed.
df = df.drop(columns=["DoctorInCharge"], errors="ignore")

# Remover categóricas inúteis para a análise #

In [7]:
# Preview the first five rows with numeric values rounded to zero decimals.
# The result is not assigned, so `df` itself keeps its original precision.
df.head(5).round(decimals=0)

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,4751,73,0,0,2,23.0,0,13.0,6.0,1.0,...,7.0,0,0,2.0,0,0,0,1,0,0
1,4752,89,0,0,0,27.0,0,5.0,8.0,1.0,...,7.0,0,0,3.0,0,0,0,0,1,0
2,4753,73,0,3,1,18.0,0,20.0,8.0,2.0,...,6.0,0,0,7.0,0,1,0,1,0,0
3,4754,74,1,0,1,34.0,1,12.0,8.0,7.0,...,9.0,0,1,6.0,0,0,0,0,0,0
4,4755,89,0,0,0,21.0,0,18.0,6.0,1.0,...,6.0,0,0,0.0,0,0,1,1,0,0


# Arredondamos os valores apenas na visualização, pois eles já estavam padronizados.

#### Criar novas features relevantes para o problema
Pensando no problema que queremos solucionar, optamos por seguir um caminho que classifica a presença de Alzheimer somente com fatores diretos de risco da doença. Esses fatores têm impacto estabelecido sobre a doença, seja pela contribuição ao declínio cognitivo, à progressão da doença ou pela associação em estudos epidemiológicos.

Logo, selecionamos as colunas que julgamos serem fatores que podem ter relação indireta com o Alzheimer, mas cuja influência depende de interações com outros fatores, e analisamos as suas correlações com o diagnóstico da doença.

* Fatores Indiretos: 'PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'Smoking', 'PhysicalActivity', 'DietQuality', 'AlcoholConsumption', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolTriglycerides', 'HeadInjury', 'PersonalityChanges', 'BehavioralProblems'


In [10]:
# Candidate (indirect-factor) columns whose correlation with the target is inspected.
# This list is reused later to drop the same columns from `df`.
colunas_especificas = [
    'PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel',
    'Smoking', 'PhysicalActivity', 'DietQuality', 'AlcoholConsumption',
    'CholesterolTotal', 'CholesterolLDL', 'CholesterolTriglycerides',
    'HeadInjury', 'PersonalityChanges', 'BehavioralProblems',
]

# Build the correlation matrix over the candidates plus 'Diagnosis' (pandas'
# default method), then keep only the 'Diagnosis' column restricted to the
# candidate rows, which leaves out the target's correlation with itself.
correlacao_com_diagnosis = (
    df[[*colunas_especificas, 'Diagnosis']]
    .corr()
    .loc[colunas_especificas, 'Diagnosis']
)

print(correlacao_com_diagnosis)

PatientID                   0.041019
Age                        -0.005488
Gender                     -0.020975
Ethnicity                  -0.014782
EducationLevel             -0.043966
Smoking                    -0.004865
PhysicalActivity            0.005945
DietQuality                 0.008506
AlcoholConsumption         -0.007618
CholesterolTotal            0.006394
CholesterolLDL             -0.031976
CholesterolTriglycerides    0.022672
HeadInjury                 -0.021411
PersonalityChanges         -0.020627
BehavioralProblems          0.224350
Name: Diagnosis, dtype: float64


Observamos que as correlações das colunas com o diagnóstico foram, em sua maioria, muito baixas. Apenas a coluna de problemas comportamentais (BehavioralProblems, ≈ 0,22) apresentou uma correlação ligeiramente maior, mas sua utilidade é limitada devido à falta de detalhes sobre os tipos de problemas registrados no dataset.
Assim, optamos por remover essas colunas, pois elas não contribuem significativamente para o processo de classificação.

In [11]:
# Remove the candidate columns listed in `colunas_especificas` from the frame.
# errors="ignore" makes the cell idempotent: `df` is reassigned in place, so
# re-running the cell would otherwise raise KeyError because the columns are
# already gone.
df = df.drop(columns=colunas_especificas, errors="ignore")

# Display the resulting frame as the cell output.
df

Unnamed: 0,BMI,SleepQuality,FamilyHistoryAlzheimers,CardiovascularDisease,Diabetes,Depression,Hypertension,SystolicBP,DiastolicBP,CholesterolHDL,MMSE,FunctionalAssessment,MemoryComplaints,ADL,Confusion,Disorientation,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,22.927749,9.025679,0,0,1,1,0,142,72,33.682563,21.463532,6.518877,0,1.725883,0,0,1,0,0,XXXConfid
1,26.827681,7.151293,0,0,0,0,0,115,64,79.028477,20.613267,7.118696,0,2.592424,0,0,0,1,0,XXXConfid
2,17.795882,9.673574,1,0,0,0,0,99,116,69.772292,7.356249,5.895077,0,7.119548,0,1,1,0,0,XXXConfid
3,33.800817,8.392554,0,0,0,0,0,118,115,68.457491,13.991127,8.965106,0,6.481226,0,0,0,0,0,XXXConfid
4,20.716974,5.597238,0,0,0,0,0,94,117,56.874305,13.517609,6.045039,0,0.014691,0,0,1,0,0,XXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,39.121757,7.535540,0,0,0,0,0,122,101,60.943092,1.201190,0.238667,0,4.492838,1,0,0,0,1,XXXConfid
2145,17.857903,8.555256,0,0,0,0,0,152,106,93.649735,6.458060,8.687480,0,9.204952,0,0,0,0,1,XXXConfid
2146,15.476479,5.769464,0,0,0,0,0,115,118,99.678209,17.011003,1.972137,0,5.036334,0,0,0,0,1,XXXConfid
2147,15.299911,8.322874,0,1,0,0,0,103,96,81.281111,4.030491,5.173891,0,3.785399,0,0,0,1,1,XXXConfid
