In [36]:
# Импорт библиотек
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Загрузка данных
file_path = 'brain_stroke.csv'
data = pd.read_csv(file_path)

In [37]:
# Исследование данных
print(data.info())
print(data.describe())
print(data['stroke'].value_counts())
data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB
None
               age  hypertension  heart_disease  avg_glucose_level  \
count  4981.000000   4981.000000    4981.000000        4981.000000   
mean     43.419859      0.096165       0.055210         105.

Unnamed: 0,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,0
smoking_status,0


In [38]:
# Кодирование категориальных переменных
data_encoded = pd.get_dummies(data, drop_first=True)

# Разделение данных на признаки и целевой признак
X = data_encoded.drop('stroke', axis=1)
y = data_encoded['stroke']

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [39]:
print(data_encoded.columns)

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'stroke', 'gender_Male', 'ever_married_Yes', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Urban',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes'],
      dtype='object')


In [40]:
print(y_train.value_counts())

stroke
0    3316
1     170
Name: count, dtype: int64


In [41]:
# Балансировка данных: undersampling и SMOTE
under = RandomUnderSampler(sampling_strategy=1.0, random_state=42)
smote = SMOTE(sampling_strategy=0.8, random_state=42)

#pipeline = Pipeline(steps=[('under', under), ('smote', smote)])
#X_train_balanced, y_train_balanced = pipeline.fit_resample(X_train, y_train)

In [42]:
pipeline = Pipeline(steps=[('under', under)])
X_train_balanced, y_train_balanced = pipeline.fit_resample(X_train, y_train)

In [43]:

class_distribution = pd.Series(y_train_balanced).value_counts()
print("После балансировки:")
print(class_distribution)

После балансировки:
stroke
0    170
1    170
Name: count, dtype: int64


In [44]:
# Масштабирование признаков
scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test)

In [45]:
# Функция для обучения моделей и оценки их качества
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    return y_pred

In [46]:
# 1. Логистическая регрессия
print("\n___ Логистическая регрессия ___")
logistic_model = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
evaluate_model(logistic_model, X_train_balanced, y_train_balanced, X_test, y_test)


___ Логистическая регрессия ___
Accuracy: 0.7518394648829432
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.75      0.85      1417
           1       0.15      0.79      0.25        78

    accuracy                           0.75      1495
   macro avg       0.57      0.77      0.55      1495
weighted avg       0.94      0.75      0.82      1495

Confusion Matrix:
 [[1062  355]
 [  16   62]]


array([1, 0, 0, ..., 0, 0, 1])

In [47]:
from sklearn.tree import DecisionTreeClassifier
# 2. Дерево решений
print("\n___ Дерево решений ___")
decision_tree_model = DecisionTreeClassifier(random_state=42)
evaluate_model(decision_tree_model, X_train_balanced, y_train_balanced, X_test, y_test)


___ Дерево решений ___
Accuracy: 0.6742474916387959
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.68      0.80      1417
           1       0.09      0.60      0.16        78

    accuracy                           0.67      1495
   macro avg       0.53      0.64      0.48      1495
weighted avg       0.92      0.67      0.76      1495

Confusion Matrix:
 [[961 456]
 [ 31  47]]


array([1, 0, 0, ..., 0, 0, 0])

In [48]:
# 3. KNN
print("\n___ KNN ___")
knn_model = KNeighborsClassifier(n_neighbors=5)
evaluate_model(knn_model, X_train_balanced, y_train_balanced, X_test, y_test)


___ KNN ___
Accuracy: 0.676923076923077
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.67      0.80      1417
           1       0.11      0.74      0.19        78

    accuracy                           0.68      1495
   macro avg       0.55      0.71      0.50      1495
weighted avg       0.93      0.68      0.77      1495

Confusion Matrix:
 [[954 463]
 [ 20  58]]


array([0, 0, 0, ..., 0, 0, 1])

In [49]:

param_grid = {'n_neighbors': range(1, 20), 'metric': ['euclidean', 'manhattan', 'minkowski']}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid)
grid_search.fit(X_train_balanced, y_train_balanced)


In [50]:
best_knn = grid_search.best_estimator_
print("Лучшие параметры:", grid_search.best_params_)


Лучшие параметры: {'metric': 'euclidean', 'n_neighbors': 19}


In [51]:
# Оценка модели с лучшими параметрами
print("\n___ Лучшие KNN ___")
evaluate_model(best_knn, X_train_balanced, y_train_balanced, X_test, y_test)


___ Лучшие KNN ___
Accuracy: 0.6762541806020067
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.67      0.80      1417
           1       0.11      0.77      0.20        78

    accuracy                           0.68      1495
   macro avg       0.55      0.72      0.50      1495
weighted avg       0.94      0.68      0.77      1495

Confusion Matrix:
 [[951 466]
 [ 18  60]]


array([1, 0, 0, ..., 0, 0, 1])

In [52]:
# Настройка гиперпараметров для дерева решений

param_grid_tree = {
    'max_depth': range(3, 15),
    'min_samples_split': range(2, 10),
    'min_samples_leaf': range(1, 5),
    'criterion': ['gini', 'entropy']
}

grid_search_tree = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_tree)
grid_search_tree.fit(X_train_balanced, y_train_balanced)

best_tree = grid_search_tree.best_estimator_
print("Лучшие параметры:", grid_search_tree.best_params_)

# Оценка модели с лучшими параметрами
print("\n___ Лучшее дерево ___")
evaluate_model(best_tree, X_train_balanced, y_train_balanced, X_test, y_test)

Лучшие параметры: {'criterion': 'gini', 'max_depth': 11, 'min_samples_leaf': 1, 'min_samples_split': 2}

___ Лучшее дерево ___
Accuracy: 0.6301003344481605
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.63      0.76      1417
           1       0.09      0.67      0.16        78

    accuracy                           0.63      1495
   macro avg       0.53      0.65      0.46      1495
weighted avg       0.93      0.63      0.73      1495

Confusion Matrix:
 [[890 527]
 [ 26  52]]


  _data = np.array(data, dtype=dtype, copy=copy,


array([1, 0, 0, ..., 0, 0, 0])