1. Загрузка библиотек и предобработка данных

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel

In [5]:
df = pd.read_csv('heart_disease.csv')

In [6]:
df

Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral in mg/dl,fasting blood sugar > 120 mg/dl,resting electrocardiographic results,maximum heart rate achieved,exercise induced angina,oldpeak,slope of peak,number of major vessels,thal,Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,1
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,0
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,1
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,0
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,1,3,172,199,1,0,162,0,0.5,1,0,7,0
266,44,1,2,120,263,0,0,173,0,0.0,1,0,7,0
267,56,0,2,140,294,0,2,153,0,1.3,2,0,3,0
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6,0


In [7]:
df.columns = df.columns.str.strip()

In [8]:
df['sex'] = df['sex'].astype("category")
df['chest pain type'] = df['chest pain type'].astype("category")
df['fasting blood sugar > 120 mg/dl'] = df['fasting blood sugar > 120 mg/dl'].astype("category")
df['resting electrocardiographic results'] = df['resting electrocardiographic results'].astype('category')
df['exercise induced angina'] = df['exercise induced angina'].astype('category')
df['slope of peak'] = df['slope of peak'].astype('category')
df['thal'] = df['thal'].astype('category')
df['Disease'] = df['Disease'].astype('category')

In [9]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   age                                   270 non-null    int64   
 1   sex                                   270 non-null    category
 2   chest pain type                       270 non-null    category
 3   resting blood pressure                270 non-null    int64   
 4   serum cholestoral in mg/dl            270 non-null    int64   
 5   fasting blood sugar > 120 mg/dl       270 non-null    category
 6   resting electrocardiographic results  270 non-null    category
 7   maximum heart rate achieved           270 non-null    int64   
 8   exercise induced angina               270 non-null    category
 9   oldpeak                               270 non-null    float64 
 10  slope of peak                         270 non-null    category
 11  number

In [10]:
Y = df['Disease']
X = df.drop('Disease', axis=1)


2. Масштабирование признаков


In [11]:
X_processed = X.copy()
category_columns: list[str] = X_processed.select_dtypes(include=['category']).columns # собираем колонки помеченные как category
X_processed = pd.get_dummies(X_processed, columns=category_columns,drop_first=True)

In [12]:
X_processed

Unnamed: 0,age,resting blood pressure,serum cholestoral in mg/dl,maximum heart rate achieved,oldpeak,number of major vessels,sex_1,chest pain type_2,chest pain type_3,chest pain type_4,fasting blood sugar > 120 mg/dl_1,resting electrocardiographic results_1,resting electrocardiographic results_2,exercise induced angina_1,slope of peak_2,slope of peak_3,thal_6,thal_7
0,70,130,322,109,2.4,3,True,False,False,True,False,False,True,False,True,False,False,False
1,67,115,564,160,1.6,0,False,False,True,False,False,False,True,False,True,False,False,True
2,57,124,261,141,0.3,0,True,True,False,False,False,False,False,False,False,False,False,True
3,64,128,263,105,0.2,1,True,False,False,True,False,False,False,True,True,False,False,True
4,74,120,269,121,0.2,1,False,True,False,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,172,199,162,0.5,0,True,False,True,False,True,False,False,False,False,False,False,True
266,44,120,263,173,0.0,0,True,True,False,False,False,False,False,False,False,False,False,True
267,56,140,294,153,1.3,0,False,True,False,False,False,False,True,False,True,False,False,False
268,57,140,192,148,0.4,0,True,False,False,True,False,False,False,False,True,False,True,False


In [13]:
numeric_features = X_processed.select_dtypes(include=['int64', 'float64']).columns.tolist()
scaler = MinMaxScaler()
X_processed[numeric_features] = scaler.fit_transform(X_processed[numeric_features])

In [14]:
X_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 18 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   age                                     270 non-null    float64
 1   resting blood pressure                  270 non-null    float64
 2   serum cholestoral in mg/dl              270 non-null    float64
 3   maximum heart rate achieved             270 non-null    float64
 4   oldpeak                                 270 non-null    float64
 5   number of major vessels                 270 non-null    float64
 6   sex_1                                   270 non-null    bool   
 7   chest pain type_2                       270 non-null    bool   
 8   chest pain type_3                       270 non-null    bool   
 9   chest pain type_4                       270 non-null    bool   
 10  fasting blood sugar > 120 mg/dl_1       270 non-null    bool  

In [15]:
X_processed

Unnamed: 0,age,resting blood pressure,serum cholestoral in mg/dl,maximum heart rate achieved,oldpeak,number of major vessels,sex_1,chest pain type_2,chest pain type_3,chest pain type_4,fasting blood sugar > 120 mg/dl_1,resting electrocardiographic results_1,resting electrocardiographic results_2,exercise induced angina_1,slope of peak_2,slope of peak_3,thal_6,thal_7
0,0.854167,0.339623,0.447489,0.290076,0.387097,1.000000,True,False,False,True,False,False,True,False,True,False,False,False
1,0.791667,0.198113,1.000000,0.679389,0.258065,0.000000,False,False,True,False,False,False,True,False,True,False,False,True
2,0.583333,0.283019,0.308219,0.534351,0.048387,0.000000,True,True,False,False,False,False,False,False,False,False,False,True
3,0.729167,0.320755,0.312785,0.259542,0.032258,0.333333,True,False,False,True,False,False,False,True,True,False,False,True
4,0.937500,0.245283,0.326484,0.381679,0.032258,0.333333,False,True,False,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,0.479167,0.735849,0.166667,0.694656,0.080645,0.000000,True,False,True,False,True,False,False,False,False,False,False,True
266,0.312500,0.245283,0.312785,0.778626,0.000000,0.000000,True,True,False,False,False,False,False,False,False,False,False,True
267,0.562500,0.433962,0.383562,0.625954,0.209677,0.000000,False,True,False,False,False,False,True,False,True,False,False,False
268,0.583333,0.433962,0.150685,0.587786,0.064516,0.000000,True,False,False,True,False,False,False,False,True,False,True,False


3. Очистка данных и удаление выбросов

In [16]:
from sklearn.model_selection import cross_val_score

def train_and_evaluate(X, Y):
    # Разделение данных на обучающую и тестовую выборки
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42, stratify=Y)

    # Инициализация модели
    model = LogisticRegression(max_iter=1000)

    # Обучение модели
    model.fit(X_train, Y_train)

    # Предсказания на обучающей выборке
    Y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(Y_train, Y_train_pred)

    # Предсказания на тестовой выборке
    Y_test_pred = model.predict(X_test)
    test_accuracy = accuracy_score(Y_test, Y_test_pred)

    # Вывод результатов
    print(f"Точность на обучающей выборке: {train_accuracy:.4f}")
    print(f"Точность на тестовой выборке: {test_accuracy:.4f}")

    # Классификационный отчет
    print("\nКлассификационный отчет на тестовой выборке:")
    print(classification_report(Y_test, Y_test_pred))

    return model

In [17]:
train_and_evaluate(X_processed, Y)

Точность на обучающей выборке: 0.8657
Точность на тестовой выборке: 0.8519

Классификационный отчет на тестовой выборке:
              precision    recall  f1-score   support

           0       0.92      0.80      0.86        30
           1       0.79      0.92      0.85        24

    accuracy                           0.85        54
   macro avg       0.85      0.86      0.85        54
weighted avg       0.86      0.85      0.85        54



Метод 1: удаление на основе медианы

In [23]:
def remove_outliers_based_on_iqr(X, y, iqr_multiplier=1.5):
    # Убедимся, что работаем только с числовыми данными
    X_numeric = X.select_dtypes(include=['number'])

    # Рассчитываем первый и третий квартили для каждой числовой переменной
    Q1 = X_numeric.quantile(0.25)
    Q3 = X_numeric.quantile(0.75)

    # Вычисляем межквартильный размах (IQR)
    IQR = Q3 - Q1

    # Оставляем только те значения, которые находятся в диапазоне [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    X_filtered = X_numeric[~((X_numeric < (Q1 - iqr_multiplier * IQR)) | (X_numeric > (Q3 + iqr_multiplier * IQR))).any(axis=1)]

    # Синхронизируем y с отфильтрованными X
    y_filtered = y.loc[X_filtered.index]

    return X_filtered, y_filtered


In [24]:
X_iqr, y_iqr = remove_outliers_based_on_iqr(X_processed, Y, iqr_multiplier=3)

In [25]:
X_processed.describe()

Unnamed: 0,age,resting blood pressure,serum cholestoral in mg/dl,maximum heart rate achieved,oldpeak,number of major vessels
count,270.0,270.0,270.0,270.0,270.0,270.0
mean,0.529861,0.352306,0.282327,0.600594,0.169355,0.223457
std,0.189772,0.168506,0.118005,0.176838,0.184711,0.314632
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.395833,0.245283,0.19863,0.473282,0.0,0.0
50%,0.541667,0.339623,0.271689,0.629771,0.129032,0.0
75%,0.666667,0.433962,0.351598,0.725191,0.258065,0.333333
max,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
X_iqr.describe()

Unnamed: 0,age,resting blood pressure,serum cholestoral in mg/dl,maximum heart rate achieved,oldpeak,number of major vessels
count,268.0,268.0,268.0,268.0,268.0,268.0
mean,0.528762,0.350465,0.279323,0.600775,0.167248,0.222637
std,0.189793,0.164144,0.10983,0.177262,0.183008,0.314343
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.390625,0.245283,0.198059,0.471374,0.0,0.0
50%,0.541667,0.339623,0.270548,0.629771,0.129032,0.0
75%,0.666667,0.433962,0.344749,0.727099,0.258065,0.333333
max,1.0,0.924528,0.664384,1.0,1.0,1.0


In [27]:
train_and_evaluate(X_iqr, y_iqr)

Точность на обучающей выборке: 0.7710
Точность на тестовой выборке: 0.8333

Классификационный отчет на тестовой выборке:
              precision    recall  f1-score   support

           0       0.86      0.83      0.85        30
           1       0.80      0.83      0.82        24

    accuracy                           0.83        54
   macro avg       0.83      0.83      0.83        54
weighted avg       0.83      0.83      0.83        54



Анализ

Метод 2: удаление на основе среднего

In [28]:
def remove_outliers_based_on_mean(X, y, mean_multiplier=2):
    # Убедимся, что работаем только с числовыми данными
    X_numeric = X.select_dtypes(include=['number'])

    # Рассчитываем среднее значение и стандартное отклонение для каждой переменной
    mean_values = X_numeric.mean()
    std_values = X_numeric.std()

    # Устанавливаем верхний и нижний пороги на основе среднего значения и множителя стандартного отклонения
    lower_threshold = mean_values - mean_multiplier * std_values
    upper_threshold = mean_values + mean_multiplier * std_values

    # Фильтруем наблюдения, которые находятся в пределах этих порогов по всем признакам
    X_filtered = X_numeric[~((X_numeric < lower_threshold) | (X_numeric > upper_threshold)).any(axis=1)]

    # Синхронизируем y с отфильтрованными X
    y_filtered = y.loc[X_filtered.index]

    return X_filtered, y_filtered

In [29]:
X_mean, y_mean = remove_outliers_based_on_mean(X_processed.copy(), Y.copy())

In [30]:
X_processed.describe()

Unnamed: 0,age,resting blood pressure,serum cholestoral in mg/dl,maximum heart rate achieved,oldpeak,number of major vessels
count,270.0,270.0,270.0,270.0,270.0,270.0
mean,0.529861,0.352306,0.282327,0.600594,0.169355,0.223457
std,0.189772,0.168506,0.118005,0.176838,0.184711,0.314632
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.395833,0.245283,0.19863,0.473282,0.0,0.0
50%,0.541667,0.339623,0.271689,0.629771,0.129032,0.0
75%,0.666667,0.433962,0.351598,0.725191,0.258065,0.333333
max,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
X_mean.describe()

Unnamed: 0,age,resting blood pressure,serum cholestoral in mg/dl,maximum heart rate achieved,oldpeak,number of major vessels
count,206.0,206.0,206.0,206.0,206.0,206.0
mean,0.517193,0.328036,0.272133,0.621841,0.140855,0.16343
std,0.174254,0.139578,0.094421,0.154076,0.151473,0.239105
min,0.166667,0.056604,0.052511,0.259542,0.0,0.0
25%,0.375,0.245283,0.196918,0.526718,0.0,0.0
50%,0.520833,0.339623,0.267123,0.648855,0.096774,0.0
75%,0.645833,0.433962,0.33105,0.740458,0.241935,0.333333
max,0.875,0.622642,0.518265,0.938931,0.516129,0.666667


In [32]:
train_and_evaluate(X_mean, y_mean)

Точность на обучающей выборке: 0.7439
Точность на тестовой выборке: 0.7619

Классификационный отчет на тестовой выборке:
              precision    recall  f1-score   support

           0       0.75      0.92      0.83        26
           1       0.80      0.50      0.62        16

    accuracy                           0.76        42
   macro avg       0.78      0.71      0.72        42
weighted avg       0.77      0.76      0.75        42

