In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import joblib

from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score


In [2]:
# Load the precomputed cross-validation splits from disk. The value shown
# below is a list of five (train indices, validation indices) array pairs.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load 'folds.pkl' from a trusted source.
folds = joblib.load('folds.pkl')


[(array([     0,      1,      2, ..., 629997, 629998, 629999],
        shape=(504000,)),
  array([     9,     16,     19, ..., 629976, 629987, 629993],
        shape=(126000,))),
 (array([     0,      1,      4, ..., 629997, 629998, 629999],
        shape=(504000,)),
  array([     2,      3,      6, ..., 629977, 629982, 629989],
        shape=(126000,))),
 (array([     0,      2,      3, ..., 629995, 629996, 629999],
        shape=(504000,)),
  array([     1,      4,     12, ..., 629994, 629997, 629998],
        shape=(126000,))),
 (array([     1,      2,      3, ..., 629997, 629998, 629999],
        shape=(504000,)),
  array([     0,      5,     10, ..., 629988, 629990, 629995],
        shape=(126000,))),
 (array([     0,      1,      2, ..., 629995, 629997, 629998],
        shape=(504000,)),
  array([     7,      8,     11, ..., 629992, 629996, 629999],
        shape=(126000,)))]

In [17]:
# Read the dataset and separate the features from the label column.
df = pd.read_csv('df_att.csv')
target = 'Heart Disease'

X = df.drop(columns=target)
# Encode the label as 1 ("Presence") / 0 ("Absence").
y = df[target].map({"Presence": 1, "Absence": 0})

# Split the feature names by dtype so each group can get its own preprocessing.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

y

0         1
1         0
2         0
3         0
4         1
         ..
629995    0
629996    0
629997    1
629998    1
629999    0
Name: Heart Disease, Length: 630000, dtype: int64

In [18]:
# Hyperparameters passed to the MLPClassifier when the pipeline is built.
params = dict(
    hidden_layer_sizes=(128,),
    alpha=5.894589851301599e-05,
    learning_rate_init=0.0007594934857959463,
    activation='relu',
)


In [19]:
# Column-wise preprocessing: standardize the numeric columns and one-hot
# encode the non-numeric ones. Categories not seen during fit are ignored
# at transform time instead of raising.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

In [20]:
# Full model: preprocessing followed by an MLP classifier.
# `params` supplies hidden_layer_sizes, activation, alpha and
# learning_rate_init; the remaining settings are fixed here.
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', MLPClassifier(
        solver='adam',
        max_iter=300,
        early_stopping=True,
        random_state=42,
        **params,
    )),
])

In [23]:
# Seeded, shuffled 5-fold stratified splitter.
# NOTE(review): the training loop in this notebook iterates over the
# precomputed `folds`, not over `skf` -- confirm whether `skf` is still needed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One out-of-fold prediction slot per row of X, initialised to zero.
oof_preds2 = np.zeros(X.shape[0])




In [24]:
# Out-of-fold prediction loop: for each precomputed (train, validation) split,
# fit the pipeline on the training rows and store the predicted probability of
# class 1 for the validation rows.
for fold, (train_idx, val_idx) in enumerate(folds):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    final_pipeline.fit(X_train, y_train)
    # Write into `oof_preds2`, the buffer allocated before the loop and scored
    # afterwards. The previous target `oof_preds_mlpc` is never defined in this
    # notebook, so it raised NameError on a fresh kernel.
    oof_preds2[val_idx] = final_pipeline.predict_proba(X_val)[:, 1]

    print(f'Fold {fold} concluído')



Fold 0 concluído
Fold 1 concluído
Fold 2 concluído
Fold 3 concluído
Fold 4 concluído


In [25]:
# Inspect the out-of-fold probability vector (one entry per row of X).
oof_preds2

array([0.99704828, 0.01107352, 0.01694481, ..., 0.95260064, 0.46022769,
       0.00292729], shape=(630000,))

In [26]:
# ROC AUC of the out-of-fold probabilities against the true labels.
oof_auc = roc_auc_score(y, oof_preds2)
# End the cell with the value so the score is actually shown in the notebook.
oof_auc

In [28]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,58,M,Dangerous,152,239,N,N,158,Y,3.6,Flat,2,7
1,52,M,Light,125,325,N,P,171,N,0.0,Upsloping,0,3
2,56,F,Common,160,188,N,P,151,N,0.0,Upsloping,0,3
3,44,F,Hard,134,229,N,P,150,N,1.0,Flat,0,3
4,58,M,Dangerous,140,234,N,P,125,Y,3.8,Flat,3,3


In [29]:
# Persist the out-of-fold probabilities. Save `oof_preds2`, the array that is
# allocated, displayed and scored in this notebook; the previously referenced
# `oof_preds` is not defined anywhere in it.
np.save('mlp_oof.npy', oof_preds2)
