1. Загрузка библиотек и предобработка данных

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel

In [8]:
# Load the dataset; the relative path is resolved against the notebook's working directory.
DATA_PATH = 'heart_disease.csv'
df = pd.read_csv(DATA_PATH)

In [49]:
# Preview the loaded frame (the rendered table is truncated to the first and last rows).
df

Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral in mg/dl,fasting blood sugar > 120 mg/dl,resting electrocardiographic results,maximum heart rate achieved,exercise induced angina,oldpeak,slope of peak,number of major vessels,thal,Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,1
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,0
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,1
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,0
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,1,3,172,199,1,0,162,0,0.5,1,0,7,0
266,44,1,2,120,263,0,0,173,0,0.0,1,0,7,0
267,56,0,2,140,294,0,2,153,0,1.3,2,0,3,0
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6,0


In [51]:
# Remove leading/trailing whitespace from every column label so later lookups by name match.
df.columns = df.columns.map(str.strip)


In [52]:
# Columns holding integer codes rather than measurements (the 'Disease' target included);
# each one is re-typed to pandas' `category` dtype.
categorical_columns = (
    'sex',
    'chest pain type',
    'fasting blood sugar > 120 mg/dl',
    'resting electrocardiographic results',
    'exercise induced angina',
    'slope of peak',
    'thal',
    'Disease',
)
for column_name in categorical_columns:
    df[column_name] = df[column_name].astype('category')

In [53]:
# Verify the dtypes and non-null counts after the category casts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   age                                   270 non-null    int64   
 1   sex                                   270 non-null    category
 2   chest pain type                       270 non-null    category
 3   resting blood pressure                270 non-null    int64   
 4   serum cholestoral in mg/dl            270 non-null    int64   
 5   fasting blood sugar > 120 mg/dl       270 non-null    category
 6   resting electrocardiographic results  270 non-null    category
 7   maximum heart rate achieved           270 non-null    int64   
 8   exercise induced angina               270 non-null    category
 9   oldpeak                               270 non-null    float64 
 10  slope of peak                         270 non-null    category
 11  number

In [54]:
# Separate the predictors (X) from the target label (Y); `df` itself is left unchanged.
X = df.drop(columns=['Disease'])
Y = df.loc[:, 'Disease']


2. Кодирование категориальных признаков и масштабирование числовых


In [55]:
# Work on a copy so the original feature frame `X` is not modified.
X_processed = X.copy()
# Collect the labels of the columns typed as `category`.
# `.columns` yields a pandas Index, not a list, so the annotation says so.
category_columns: pd.Index = X_processed.select_dtypes(include=['category']).columns
# One-hot encode those columns; drop_first=True omits the first level of each
# column so the resulting dummy columns are not perfectly collinear.
X_processed = pd.get_dummies(X_processed, columns=category_columns, drop_first=True)

In [56]:
# Inspect the feature matrix after one-hot encoding.
X_processed

Unnamed: 0,age,resting blood pressure,serum cholestoral in mg/dl,maximum heart rate achieved,oldpeak,number of major vessels,sex_1,chest pain type_2,chest pain type_3,chest pain type_4,fasting blood sugar > 120 mg/dl_1,resting electrocardiographic results_1,resting electrocardiographic results_2,exercise induced angina_1,slope of peak_2,slope of peak_3,thal_6,thal_7
0,70,130,322,109,2.4,3,True,False,False,True,False,False,True,False,True,False,False,False
1,67,115,564,160,1.6,0,False,False,True,False,False,False,True,False,True,False,False,True
2,57,124,261,141,0.3,0,True,True,False,False,False,False,False,False,False,False,False,True
3,64,128,263,105,0.2,1,True,False,False,True,False,False,False,True,True,False,False,True
4,74,120,269,121,0.2,1,False,True,False,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,172,199,162,0.5,0,True,False,True,False,True,False,False,False,False,False,False,True
266,44,120,263,173,0.0,0,True,True,False,False,False,False,False,False,False,False,False,True
267,56,140,294,153,1.3,0,False,True,False,False,False,False,True,False,True,False,False,False
268,57,140,192,148,0.4,0,True,False,False,True,False,False,False,False,True,False,True,False


In [60]:
# Only the int64/float64 columns are rescaled; the boolean dummy columns are skipped.
numeric_features = list(X_processed.select_dtypes(include=['int64', 'float64']).columns)
# Learn each column's min and max, then map the values onto the [0, 1] range.
# NOTE(review): the scaler is fitted on the whole frame; if a train/test split
# happens later, fit on the training part only to avoid leakage - confirm.
scaler = MinMaxScaler().fit(X_processed[numeric_features])
X_processed[numeric_features] = scaler.transform(X_processed[numeric_features])

In [64]:
# Confirm that the numeric columns are now float64 and the dummy columns are bool.
X_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 18 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   age                                     270 non-null    float64
 1   resting blood pressure                  270 non-null    float64
 2   serum cholestoral in mg/dl              270 non-null    float64
 3   maximum heart rate achieved             270 non-null    float64
 4   oldpeak                                 270 non-null    float64
 5   number of major vessels                 270 non-null    float64
 6   sex_1                                   270 non-null    bool   
 7   chest pain type_2                       270 non-null    bool   
 8   chest pain type_3                       270 non-null    bool   
 9   chest pain type_4                       270 non-null    bool   
 10  fasting blood sugar > 120 mg/dl_1       270 non-null    bool  

In [21]:
# NOTE(review): the saved output of this cell comes from an earlier execution (In [21]):
# it shows the un-encoded columns with only `oldpeak` scaled, which contradicts the
# info() output above. Restart the kernel and run all cells to refresh it.
X_processed

Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral in mg/dl,fasting blood sugar > 120 mg/dl,resting electrocardiographic results,maximum heart rate achieved,exercise induced angina,oldpeak,slope of peak,number of major vessels,thal
0,70,1,4,130,322,0,2,109,0,0.387097,2,3,3
1,67,0,3,115,564,0,2,160,0,0.258065,2,0,7
2,57,1,2,124,261,0,0,141,0,0.048387,1,0,7
3,64,1,4,128,263,0,0,105,1,0.032258,2,1,7
4,74,0,2,120,269,0,2,121,1,0.032258,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,1,3,172,199,1,0,162,0,0.080645,1,0,7
266,44,1,2,120,263,0,0,173,0,0.000000,1,0,7
267,56,0,2,140,294,0,2,153,0,0.209677,2,0,3
268,57,1,4,140,192,0,0,148,0,0.064516,2,0,6
