In [37]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

# Seleção de atributos Random Forest

In [44]:
def calcular_importancia_atributos(
    caminho_csv,
    coluna_alvo='diagnostico',
    colunas_ignoradas=('patientunitstayid',),
    test_size=0.3,
    n_estimators=100,
    random_state=42,
):
    """Rank the attributes of a CSV dataset by Random Forest importance.

    The CSV is loaded, the target and ignored columns are removed from the
    attribute matrix, a RandomForestClassifier is fitted on the training
    part of a train/test split, and the resulting feature importances are
    rescaled to percentages.

    Parameters
    ----------
    caminho_csv : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    coluna_alvo : str, default 'diagnostico'
        Name of the column holding the class label.
    colunas_ignoradas : str or iterable of str, default ('patientunitstayid',)
        Columns that are neither attributes nor the target and must be
        dropped before training.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``; only the training part is used.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 42
        Seed used for both the split and the forest.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Feature'`` and ``'Importance'`` (percent, summing to 100),
        sorted from most to least important. The index keeps each
        attribute's original column position.

    Raises
    ------
    KeyError
        If ``coluna_alvo`` or any of ``colunas_ignoradas`` is missing from
        the CSV.
    """
    df = pd.read_csv(caminho_csv)

    # Accept a single column name as well as a collection of names.
    if isinstance(colunas_ignoradas, str):
        colunas_ignoradas = [colunas_ignoradas]

    X = df.drop(columns=[*colunas_ignoradas, coluna_alvo])
    y = df[coluna_alvo]

    # Only the training part is needed; the split is kept so the forest is
    # fitted on the same rows as before for a given seed.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X_train, y_train)

    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf.feature_importances_,
    })

    # Rescale to percentages. When every importance is zero the division
    # would yield NaN, so the zeros are left untouched in that case.
    total = feature_importance_df['Importance'].sum()
    if total > 0:
        feature_importance_df['Importance'] = feature_importance_df['Importance'] / total * 100

    return feature_importance_df.sort_values(by='Importance', ascending=False)

Média

In [46]:
# Attribute ranking computed from 'df_media.csv', printed as a plain-text table.
resultado_media = calcular_importancia_atributos(caminho_csv='df_media.csv')
print(resultado_media)

                       Feature  Importance
3                          WBC    5.275248
24                     glucose    4.882127
20                   platelets    4.734107
1                          age    4.257838
13                         MAP    3.861704
18                  hemoglobin    3.813737
9                    heartRate    3.589608
23                    chloride    3.583465
11                    systolic    3.552557
28                     calcium    3.548588
12                   diastolic    3.542493
14           bloodUreaNitrogen    3.527684
27                   potassium    3.512560
7                    pao2/fio2    3.419482
4                       lymphs    3.344197
25                      sodium    3.329015
8                  bicarbonate    3.215329
16                   aspartate    3.212132
22                     albumin    3.195020
5                         resp    3.112261
30                   magnesium    3.042708
29                   phosphate    2.945668
6          

Valor Normal

In [49]:
# Attribute ranking computed from 'df_normal.csv', printed as a plain-text table.
resultado_normal = calcular_importancia_atributos(caminho_csv='df_normal.csv')
print(resultado_normal)

                       Feature  Importance
3                          WBC    5.263414
24                     glucose    4.903594
20                   platelets    4.893969
1                          age    4.520996
13                         MAP    4.046478
18                  hemoglobin    3.897737
12                   diastolic    3.891620
14           bloodUreaNitrogen    3.838085
9                    heartRate    3.789451
23                    chloride    3.690391
27                   potassium    3.657954
28                     calcium    3.557157
25                      sodium    3.428083
11                    systolic    3.241221
5                         resp    3.220572
22                     albumin    3.178299
16                   aspartate    3.173694
30                   magnesium    3.168415
4                       lymphs    3.163739
8                  bicarbonate    3.142442
7                    pao2/fio2    3.079086
29                   phosphate    3.066413
15         

KNN

In [None]:
# Attribute ranking computed from 'df_knn.csv', printed as a plain-text table.
resultado_knn = calcular_importancia_atributos(caminho_csv='df_knn.csv')
print(resultado_knn)

Mediana

In [50]:
# Attribute ranking computed from 'df_mediana.csv', printed as a plain-text table.
resultado_mediana = calcular_importancia_atributos(caminho_csv='df_mediana.csv')
print(resultado_mediana)

                       Feature  Importance
3                          WBC    5.358714
24                     glucose    5.026595
20                   platelets    4.633613
1                          age    4.510492
13                         MAP    3.975287
18                  hemoglobin    3.856686
12                   diastolic    3.710093
14           bloodUreaNitrogen    3.696965
9                    heartRate    3.561785
23                    chloride    3.560826
28                     calcium    3.548872
27                   potassium    3.409911
11                    systolic    3.405850
25                      sodium    3.375326
7                    pao2/fio2    3.372208
8                  bicarbonate    3.363864
4                       lymphs    3.302560
22                     albumin    3.134210
5                         resp    3.080153
29                   phosphate    3.015761
30                   magnesium    2.997232
16                   aspartate    2.991264
6          

Moda

In [51]:
# Attribute ranking computed from 'df_moda.csv', printed as a plain-text table.
resultado_moda = calcular_importancia_atributos(caminho_csv='df_moda.csv')
print(resultado_moda)

                       Feature  Importance
3                          WBC    5.144613
24                     glucose    4.962525
20                   platelets    4.822587
1                          age    4.424613
13                         MAP    3.905742
12                   diastolic    3.808302
18                  hemoglobin    3.749839
9                    heartRate    3.712127
14           bloodUreaNitrogen    3.700973
28                     calcium    3.615421
23                    chloride    3.590564
11                    systolic    3.569384
25                      sodium    3.532090
27                   potassium    3.455954
4                       lymphs    3.426686
7                    pao2/fio2    3.418186
8                  bicarbonate    3.212435
30                   magnesium    3.031362
22                     albumin    3.031052
16                   aspartate    3.025758
2                      lactate    2.903939
6                    spo2/fio2    2.894039
5          

Regressão

In [47]:
# Attribute ranking computed from 'df_regressao.csv', printed as a plain-text table.
resultado_regressao = calcular_importancia_atributos(caminho_csv='df_regressao.csv')
print(resultado_regressao)

                       Feature  Importance
5                         resp    5.073191
7                    pao2/fio2    4.212976
22                     albumin    4.040771
13                         MAP    4.033488
6                    spo2/fio2    4.022785
3                          WBC    3.970918
9                    heartRate    3.956352
20                   platelets    3.915517
21                         PTT    3.893423
11                    systolic    3.838784
10                    troponin    3.614136
24                     glucose    3.471149
12                   diastolic    3.458119
4                       lymphs    3.426243
2                      lactate    3.417517
16                   aspartate    3.239417
29                   phosphate    3.200868
1                          age    3.139938
15                     alanine    3.134407
23                    chloride    3.060657
30                   magnesium    3.026018
18                  hemoglobin    2.973013
28         

Seleção de atributos Chi Square

In [42]:
# NOTE(review): `df` is not created by any visible cell, so on a fresh kernel it
# must be loaded (e.g. with pd.read_csv) before this cell runs — confirm which
# dataset is intended.
y = df['diagnostico']  # class labels
X = df.drop(columns=['patientunitstayid', 'diagnostico'])  # candidate attributes

# chi2 yields a pair: element 0 holds the chi-square statistics and element 1
# the p-values, one entry per column of X.
chi_scores = chi2(X, y)

# One row per attribute, strongest chi-square statistic first.
chi2_df = pd.DataFrame(
    {
        'Feature': X.columns,
        'Chi2 Score': chi_scores[0],
        'p-value': chi_scores[1],
    }
).sort_values(by='Chi2 Score', ascending=False)

# Show the ranking
print(chi2_df)

                       Feature    Chi2 Score        p-value
9                    heartRate  7.419040e+02  2.310822e-163
14           bloodUreaNitrogen  7.348758e+02  7.798212e-162
4                       lymphs  7.320303e+02  3.241363e-161
13                         MAP  6.304400e+02  4.009892e-139
11                    systolic  4.647851e+02  4.371047e-103
5                         resp  2.417823e+02   1.607287e-54
16                   aspartate  2.362084e+02   2.639210e-53
24                     glucose  2.335921e+02   9.817249e-53
20                   platelets  1.453978e+02   1.757863e-33
12                   diastolic  1.270146e+02   1.844087e-29
21                         PTT  8.277258e+01   9.205825e-20
7                    pao2/fio2  5.671726e+01   5.032001e-14
22                     albumin  4.737237e+01   5.870441e-12
1                          age  3.746776e+01   9.293649e-10
15                     alanine  3.367098e+01   6.526703e-09
8                  bicarbonate  2.905469