In [87]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
from sklearn.model_selection import ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import  BaggingClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import RidgeClassifier

In [2]:
data = pd.read_csv('heart.csv')

In [4]:
data.head(3)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0


In [31]:
data.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

In [33]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,918.0,53.510893,9.432617,28.0,47.0,54.0,60.0,77.0
RestingBP,918.0,132.396514,18.514154,0.0,120.0,130.0,140.0,200.0
Cholesterol,918.0,198.799564,109.384145,0.0,173.25,223.0,267.0,603.0
FastingBS,918.0,0.233115,0.423046,0.0,0.0,0.0,0.0,1.0
MaxHR,918.0,136.809368,25.460334,60.0,120.0,138.0,156.0,202.0
Oldpeak,918.0,0.887364,1.06657,-2.6,0.0,0.6,1.5,6.2
HeartDisease,918.0,0.553377,0.497414,0.0,0.0,1.0,1.0,1.0


In [34]:
data.describe(include=object).T

Unnamed: 0,count,unique,top,freq
Sex,918,2,M,725
ChestPainType,918,4,ASY,496
RestingECG,918,3,Normal,552
ExerciseAngina,918,2,N,547
ST_Slope,918,3,Flat,460


In [35]:
print(data["HeartDisease"].value_counts())

HeartDisease
1    508
0    410
Name: count, dtype: int64


In [7]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns='HeartDisease'),
                                                    data['HeartDisease'],
                                                    test_size=0.3,
                                                    random_state=42)
X_train.shape, X_test.shape

((642, 11), (276, 11))

In [8]:
categorials = X_train.select_dtypes('object').columns
categorials

Index(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object')

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [18]:
integer = X_train.select_dtypes(['int', 'float']).columns.tolist()

In [19]:
integer

['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

In [20]:
dummy_train = pd.get_dummies(X_train[categorials], columns=categorials)
dummy_test = pd.get_dummies(X_test[categorials], columns=categorials)

dummy_cols = list(dummy_train)

dummy_train = dummy_train[dummy_cols]
dummy_test = dummy_test[dummy_cols]

X_train = pd.concat([X_train[integer], dummy_train], axis=1)

X_test = pd.concat([X_test[integer], dummy_test], axis=1)

In [27]:
dt = DecisionTreeClassifier(random_state=10)
dt.fit(X_train, y_train)
dt.score(X_train, y_train)

1.0

In [41]:
classification_report(y_train, dt.predict(X_train))

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00       298\n           1       1.00      1.00      1.00       344\n\n    accuracy                           1.00       642\n   macro avg       1.00      1.00      1.00       642\nweighted avg       1.00      1.00      1.00       642\n'

Ну тут за хлебом ходить не нужно, понятно, что модель переобучилась

In [28]:
dt.score(X_test, y_test)

0.7355072463768116

In [54]:
target_names = ['class 0', 'class 1']
classification_report(y_test, dt.predict(X_test), target_names = target_names)

'              precision    recall  f1-score   support\n\n     class 0       0.64      0.81      0.71       112\n     class 1       0.84      0.68      0.75       164\n\n    accuracy                           0.74       276\n   macro avg       0.74      0.75      0.73       276\nweighted avg       0.76      0.74      0.74       276\n'

In [62]:
len(y_test[y_test==1])

164

в болезнях сердца лучше перебдеть, чем недобдеть, поэтому нас больше тут интерисует recall (у class 1), он 64%, так что пойдем в лес

In [63]:
rfc = RandomForestClassifier(random_state=10)
rfc.fit(X_train, y_train)

In [64]:
rfc.score(X_train, y_train)

1.0

In [65]:
rfc.score(X_test, y_test)

0.894927536231884

In [66]:
classification_report(y_test, rfc.predict(X_test), target_names = target_names)

'              precision    recall  f1-score   support\n\n     class 0       0.85      0.89      0.87       112\n     class 1       0.92      0.90      0.91       164\n\n    accuracy                           0.89       276\n   macro avg       0.89      0.89      0.89       276\nweighted avg       0.90      0.89      0.90       276\n'

У класса 1 хороший recall 90% на random forest, вполне себе пригодная модель

In [73]:
imp = pd.Series(rfc.feature_importances_)
imp.index = pd.Series(X_train.columns)
imp = imp.sort_values(ascending=False)
imp[:3]
# Какой-то верхний наклон самый важный типа (без понятия шо это)

ST_Slope_Up      0.152730
Oldpeak          0.102279
ST_Slope_Flat    0.097955
dtype: float64

In [76]:
bagging = BaggingClassifier(DecisionTreeClassifier(),
                            max_samples=0.6,
                            max_features=0.5,
                            random_state=10)

In [77]:
bagging.fit(X_train, y_train)

In [78]:
bagging.score(X_train, y_train)

0.9439252336448598

In [79]:
bagging.score(X_test, y_test)

0.8514492753623188

In [80]:
classification_report(y_test, bagging.predict(X_test), target_names = target_names)

'              precision    recall  f1-score   support\n\n     class 0       0.79      0.87      0.83       112\n     class 1       0.90      0.84      0.87       164\n\n    accuracy                           0.85       276\n   macro avg       0.85      0.85      0.85       276\nweighted avg       0.86      0.85      0.85       276\n'

recall class 1 84% пока что лучший результат из полученных

In [88]:
stclf = StackingClassifier(
    [
        ('DecisionTreeClassifier', DecisionTreeClassifier()),
        ('RandomForestClassifier', RandomForestClassifier()),
        ('LinearSVC', LinearSVC())
    ], RidgeClassifier())

In [89]:
stclf.fit(X_train, y_train)

In [90]:
print(f'Score on train  {stclf.score(X_train, y_train)}')
print(f'Score on test  {stclf.score(X_test, y_test)}')

Score on train  0.9890965732087228
Score on test  0.8804347826086957


In [91]:
classification_report(y_test, stclf.predict(X_test), target_names = target_names)

'              precision    recall  f1-score   support\n\n     class 0       0.84      0.88      0.86       112\n     class 1       0.91      0.88      0.90       164\n\n    accuracy                           0.88       276\n   macro avg       0.87      0.88      0.88       276\nweighted avg       0.88      0.88      0.88       276\n'

recall class 1 88% и это лучший из полученных результатов

In [106]:
# мне очень понравился Стекинг и я решила замутить еще 1 модель
from sklearn.svm import NuSVC
from sklearn.svm import SVC
stclf1 = StackingClassifier(
    [
        ('DecisionTreeClassifier', DecisionTreeClassifier()),
        ('RandomForestClassifier', RandomForestClassifier()),
        ('LinearSVC', LinearSVC()),
        ('NuSVC', NuSVC()),
        ('SVC', SVC())
    ], RidgeClassifier())

In [107]:
stclf1.fit(X_train, y_train)

In [108]:
print(f'Score on train  {stclf1.score(X_train, y_train)}')
print(f'Score on test  {stclf1.score(X_test, y_test)}')

Score on train  0.9704049844236761
Score on test  0.9094202898550725


In [109]:
classification_report(y_test, stclf1.predict(X_test), target_names = target_names)

'              precision    recall  f1-score   support\n\n     class 0       0.88      0.90      0.89       112\n     class 1       0.93      0.91      0.92       164\n\n    accuracy                           0.91       276\n   macro avg       0.90      0.91      0.91       276\nweighted avg       0.91      0.91      0.91       276\n'

recall class 1 91%. Коллеги это успех!!!!