In [45]:
import matplotlib.pyplot as plt
import pandas as pd
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, accuracy_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier 

In [2]:
RANDOM_STATE = 42
warnings.filterwarnings('ignore')
train = pd.read_csv('heart_adapt_train.csv')
test = pd.read_csv('heart_adapt_test.csv')
train.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,74.0,M,NAP,138.0,,0,Normal,116,N,0.2,Up,0
1,58.0,M,NAP,132.0,224.0,0,LVH,173,N,3.2,Up,1
2,44.0,M,ATA,150.0,288.0,0,Normal,150,Y,3.0,Flat,1
3,50.0,M,ASY,144.0,349.0,0,LVH,120,Y,1.0,Up,1
4,,M,ASY,145.0,248.0,0,Normal,96,Y,2.0,Flat,1


In [4]:
target_train = train['HeartDisease']
features_train = train.drop('HeartDisease', axis=1)
target_test = test['HeartDisease']
features_test = test.drop('HeartDisease',axis=1)

Проверка баланса

In [5]:
round((target_train.value_counts(normalize=True)*100),2)

HeartDisease
1    64.69
0    35.31
Name: proportion, dtype: float64

явный дисбаланс. нужно будет применять гиперпараметр class_weight='balanced' 

In [6]:
pd.get_dummies(features_train, drop_first=True)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,74.0,138.0,,0,116,0.2,True,False,True,False,True,False,False,False,True
1,58.0,132.0,224.0,0,173,3.2,True,False,True,False,False,False,False,False,True
2,44.0,150.0,288.0,0,150,3.0,True,True,False,False,True,False,True,True,False
3,50.0,144.0,349.0,0,120,1.0,True,False,False,False,False,False,True,False,True
4,,145.0,248.0,0,96,2.0,True,False,False,False,True,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
584,58.0,115.0,,1,138,0.5,True,False,False,False,True,False,False,False,True
585,56.0,132.0,184.0,0,105,2.1,True,False,False,False,False,False,True,True,False
586,52.0,134.0,201.0,0,158,0.8,True,True,False,False,True,False,False,False,True
587,49.0,131.0,142.0,0,127,1.5,True,False,True,False,True,False,True,True,False


In [7]:
features_train_nonclass = pd.get_dummies(features_train, drop_first=True)
features_train_nonclass.columns = features_train_nonclass.columns.str.lower()

def convert_bool_to_num(df):
    bool_column = df.select_dtypes(include=bool).columns
    for column in bool_column:
        df[column]=df[column].astype(int)
    return df

In [10]:
features_train_nonclass = convert_bool_to_num(features_train_nonclass)
features_train_nonclass.info()
features_train_nonclass.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589 entries, 0 to 588
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                533 non-null    float64
 1   restingbp          588 non-null    float64
 2   cholesterol        462 non-null    float64
 3   fastingbs          589 non-null    int64  
 4   maxhr              589 non-null    int64  
 5   oldpeak            589 non-null    float64
 6   sex_m              589 non-null    int32  
 7   chestpaintype_ata  589 non-null    int32  
 8   chestpaintype_nap  589 non-null    int32  
 9   chestpaintype_ta   589 non-null    int32  
 10  restingecg_normal  589 non-null    int32  
 11  restingecg_st      589 non-null    int32  
 12  exerciseangina_y   589 non-null    int32  
 13  st_slope_flat      589 non-null    int32  
 14  st_slope_up        589 non-null    int32  
dtypes: float64(4), int32(9), int64(2)
memory usage: 48.4 KB


Unnamed: 0,age,restingbp,cholesterol,fastingbs,maxhr,oldpeak,sex_m,chestpaintype_ata,chestpaintype_nap,chestpaintype_ta,restingecg_normal,restingecg_st,exerciseangina_y,st_slope_flat,st_slope_up
0,74.0,138.0,,0,116,0.2,1,0,1,0,1,0,0,0,1
1,58.0,132.0,224.0,0,173,3.2,1,0,1,0,0,0,0,0,1
2,44.0,150.0,288.0,0,150,3.0,1,1,0,0,1,0,1,1,0
3,50.0,144.0,349.0,0,120,1.0,1,0,0,0,0,0,1,0,1
4,,145.0,248.0,0,96,2.0,1,0,0,0,1,0,1,1,0


In [15]:
class DataPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
       X = pd.get_dummies(X, drop_first=True)
       X.columns = X.columns.str.lower()
       X = self.convert_bool_to_numeric(X)
       X.fillna(X.mean(), inplace=True)
       return X
    
    def convert_bool_to_numeric(self, df):
        bool_columns = df.select_dtypes(include=bool).columns
        for column in bool_columns:
            df[column] = df[column].astype(int)
        return df
                    

In [16]:
DP = DataPreprocessor()
DP.fit(features_train)
features_train_DP=DP.transform(features_train)
features_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589 entries, 0 to 588
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             533 non-null    float64
 1   Sex             589 non-null    object 
 2   ChestPainType   589 non-null    object 
 3   RestingBP       588 non-null    float64
 4   Cholesterol     462 non-null    float64
 5   FastingBS       589 non-null    int64  
 6   RestingECG      589 non-null    object 
 7   MaxHR           589 non-null    int64  
 8   ExerciseAngina  589 non-null    object 
 9   Oldpeak         589 non-null    float64
 10  ST_Slope        589 non-null    object 
dtypes: float64(4), int64(2), object(5)
memory usage: 50.7+ KB


In [17]:
features_train_nonclass.fillna(features_train_nonclass.mean(), inplace=True)
features_train_nonclass.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589 entries, 0 to 588
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                589 non-null    float64
 1   restingbp          589 non-null    float64
 2   cholesterol        589 non-null    float64
 3   fastingbs          589 non-null    int64  
 4   maxhr              589 non-null    int64  
 5   oldpeak            589 non-null    float64
 6   sex_m              589 non-null    int32  
 7   chestpaintype_ata  589 non-null    int32  
 8   chestpaintype_nap  589 non-null    int32  
 9   chestpaintype_ta   589 non-null    int32  
 10  restingecg_normal  589 non-null    int32  
 11  restingecg_st      589 non-null    int32  
 12  exerciseangina_y   589 non-null    int32  
 13  st_slope_flat      589 non-null    int32  
 14  st_slope_up        589 non-null    int32  
dtypes: float64(4), int32(9), int64(2)
memory usage: 48.4 KB


масштабирование признаков

In [22]:
scaler = StandardScaler()
scaler.fit(features_train_DP)
features_train_DP_scaler = pd.DataFrame(scaler.transform(features_train_DP), columns=scaler.feature_names_in_)
features_train_DP_scaler

Unnamed: 0,age,restingbp,cholesterol,fastingbs,maxhr,oldpeak,sex_m,chestpaintype_ata,chestpaintype_nap,chestpaintype_ta,restingecg_normal,restingecg_st,exerciseangina_y,st_slope_flat,st_slope_up
0,2.186049,0.246610,1.096468e-15,-0.589768,-0.758105,-0.689009,0.484563,-0.441278,2.008529,-0.223407,0.840979,-0.500531,-0.901281,-1.109532,1.276505
1,0.419980,-0.072203,-4.172660e-01,-0.589768,1.529085,2.113660,0.484563,-0.441278,2.008529,-0.223407,-1.189090,-0.500531,-0.901281,-1.109532,1.276505
2,-1.125331,0.884236,8.172468e-01,-0.589768,0.606184,1.926815,0.484563,2.266146,-0.497877,-0.223407,0.840979,-0.500531,1.109532,0.901281,-0.783389
3,-0.463055,0.565423,1.993892e+00,-0.589768,-0.597600,0.058369,0.484563,-0.441278,-0.497877,-0.223407,-1.189090,-0.500531,1.109532,-1.109532,1.276505
4,0.000000,0.618558,4.567631e-02,-0.589768,-1.560627,0.992592,0.484563,-0.441278,-0.497877,-0.223407,0.840979,-0.500531,1.109532,0.901281,-0.783389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
584,0.419980,-0.975506,1.096468e-15,1.695582,0.124670,-0.408742,0.484563,-0.441278,-0.497877,-0.223407,0.840979,-0.500531,-0.901281,-1.109532,1.276505
585,0.199221,-0.072203,-1.188836e+00,-0.589768,-1.199492,1.086014,0.484563,-0.441278,-0.497877,-0.223407,-1.189090,-0.500531,1.109532,0.901281,-0.783389
586,-0.242296,0.034068,-8.609190e-01,-0.589768,0.927193,-0.128475,0.484563,2.266146,-0.497877,-0.223407,0.840979,-0.500531,-0.901281,-1.109532,1.276505
587,-0.573434,-0.125338,-1.998986e+00,-0.589768,-0.316717,0.525481,0.484563,-0.441278,2.008529,-0.223407,0.840979,-0.500531,1.109532,0.901281,-0.783389


In [29]:
model = DecisionTreeClassifier(random_state=RANDOM_STATE, class_weight='balanced')
model.fit(features_train_DP_scaler, target_train)


In [31]:
print(f'Качество модели на тренировочных данных {roc_auc_score(target_train, model.predict_proba(features_train_DP_scaler)[:, 1])}')

Качество модели на тренировочных данных 1.0


Создание конвеера и настройка трансформера  на обучающей выборке

In [32]:
pipe = Pipeline([
    ('ohe_types', DataPreprocessor()),
    ('scaler', StandardScaler()),
    ('classify', DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE))
])
pipe.fit(X=features_train, y = target_train)

In [35]:
print(f'Качество модели на тренировочных данных {roc_auc_score(target_test, pipe.predict_proba(features_test)[:, 1])}')

Качество модели на тренировочных данных 0.8044994375703037


Модель переобучается

In [36]:
params = [
    {
        'classify': [LogisticRegression(class_weight='balanced', random_state=RANDOM_STATE)],
        'classify__penalty': ['l1', 'l2'],
        'classify__C': (range(1, 11))
    },
    {
        'classify': [DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE)],
        'classify__max_depth': [2, 5, 10, 20],
        'classify__min_samples_leaf': [1, 2]
    },
    {
        'classify': [RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE)],
        'classify__min_samples_leaf': [1, 2],
        'classify__max_depth': [2, 5, 10, 20],
        'classify__n_estimators': (range(10, 51, 10))
    }
]

In [37]:
grid_search = GridSearchCV(pipe, param_grid=params, cv=5, scoring='roc_auc')
grid_search.fit(X=features_train, y=target_train)

слава яйцам оно заработало

In [39]:
print(f'Качество лучшей модели на тренировочных данных {roc_auc_score(target_test, grid_search.predict_proba(features_test)[:, 1])}')

Качество лучшей модели на тренировочных данных 0.9140607424071991


In [43]:
best_model = grid_search.best_estimator_

In [44]:
target_testing =  best_model.predict(features_test)

In [46]:
roc_auc = roc_auc_score(target_test, target_testing)
accuracy = accuracy_score(target_test, target_testing)
recal = recall_score(target_test, target_testing)
f1 = f1_score(target_test, target_testing)

In [47]:
print(f'Лучшие параметры модели: {grid_search.best_params_}')
print(f'Лучший ROC AUC: {grid_search.best_score_}')
print(f'ROC AUC на тестовой выборке: {roc_auc}')
print(f'Точность на тестовой выборке: {accuracy}')
print(f'Полнота на тестовой выборке: {recal}')
print(f'F1 на тестовой выборке: {f1}')

Лучшие параметры модели: {'classify': RandomForestClassifier(class_weight='balanced', max_depth=10,
                       min_samples_leaf=2, n_estimators=50, random_state=42), 'classify__max_depth': 10, 'classify__min_samples_leaf': 2, 'classify__n_estimators': 50}
Лучший ROC AUC: 0.9292024406915294
ROC AUC на тестовой выборке: 0.863835770528684
Точность на тестовой выборке: 0.8781725888324873
Полнота на тестовой выборке: 0.9133858267716536
F1 на тестовой выборке: 0.9062500000000001
