In [105]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#### Поосмотрим на данные

In [106]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Id,income,age,experience,married,house_ownership,car_ownership,profession,city,state,current_job_years,current_house_years,risk_flag
0,1,1303835,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [107]:
df.shape

(252000, 13)

In [108]:
df.nunique()

Id                     252000
income                  41917
age                        59
experience                 21
married                     2
house_ownership             3
car_ownership               2
profession                 51
city                      317
state                      29
current_job_years          15
current_house_years         5
risk_flag                   2
dtype: int64

In [109]:
df.isna().sum()

Id                     0
income                 0
age                    0
experience             0
married                0
house_ownership        0
car_ownership          0
profession             0
city                   0
state                  0
current_job_years      0
current_house_years    0
risk_flag              0
dtype: int64

In [110]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

#### Присутствует дизбаланс классов

In [111]:
df['risk_flag'].value_counts()

risk_flag
0    221004
1     30996
Name: count, dtype: int64

In [112]:
X = df.drop('risk_flag', axis = 1)
Y = df['risk_flag']

In [113]:
# Закодируем колонки с категориальными признаками и используем модель
catcols = X.select_dtypes(include = ['object']).columns
OHEcols = catcols[X[catcols].nunique() <= 10]
MTEcols = catcols[X[catcols].nunique() > 10]

# Standard Scaler не нужен, так как данные одного порядка
preprocessing = ColumnTransformer([('ohe', OneHotEncoder(), OHEcols),
                                  ('mte', TargetEncoder(), MTEcols)])

#### Попробуем случайный лес

In [142]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [115]:
preprocessing = ColumnTransformer([('ohe', OneHotEncoder(), OHEcols)])
model = RandomForestClassifier()

param_grid = {
    "random_forest__max_depth": [4, 5, 10],
    "random_forest__min_samples_split": [2],
    "random_forest__min_samples_leaf": [1]
}

pipe = Pipeline([('preprocessing', preprocessing),
                ('random_forest', model)])

search = GridSearchCV(pipe,
                     param_grid,
                     cv =  4)

search.fit(X_train, Y_train)
print(search.best_params_)

{'random_forest__max_depth': 4, 'random_forest__min_samples_leaf': 1, 'random_forest__min_samples_split': 2}


In [116]:
search.best_params_.items()

dict_items([('random_forest__max_depth', 4), ('random_forest__min_samples_leaf', 1), ('random_forest__min_samples_split', 2)])

In [117]:
bp = search.best_params_
ubp = {key.replace('random_forest__', ''): value for key, value in search.best_params_.items()}
model2 = RandomForestClassifier(**ubp)

pipe = Pipeline([('preprocessing', preprocessing),
                ('Scaler', StandardScaler()),
                ('random_forest', model2)]) 

In [118]:
pipe.fit(X_train, Y_train)

In [119]:
from sklearn.metrics import accuracy_score

accuracy_score(pipe.predict(X_test), Y_test)

0.8750793650793651

In [120]:
accuracy_score(pipe.predict(X_train), Y_train)

0.8776402116402117

In [121]:
Y_test.value_counts()

risk_flag
0    55130
1     7870
Name: count, dtype: int64

In [122]:
from sklearn.metrics import recall_score

recall = recall_score(Y_test, pipe.predict(X_test))

In [123]:
recall

0.0

#### Случайный лес не смог найти зависимости

#### Попробуем градиентный бустинг, применив веса для балансировки классов

In [124]:
from sklearn.ensemble import GradientBoostingClassifier

modelGB = GradientBoostingClassifier(learning_rate= 0.3,
                                  max_depth = 5,
                                  n_estimators = 7)

In [125]:
pipeGB = Pipeline([('preprocessing', preprocessing),
                ('gb', modelGB)])

In [126]:
sample_weight = [10 if y == 1 else 1 for y in Y_train]

In [127]:
pipeGB.fit(X_train, Y_train, gb__sample_weight = sample_weight)

In [128]:
res = pipeGB.predict(X_test)

In [129]:
recall_score(Y_test, res)

0.9899618805590852

In [130]:
from sklearn.metrics import precision_score

precision_score(Y_test, res)

0.12591718654038853

In [131]:
accuracy_score(pipeGB.predict(X_test), Y_test)

0.1402857142857143

#### Он не справился с точным выявлением

In [132]:
from sklearn.neighbors import KNeighborsClassifier

mknn = KNeighborsClassifier()

In [133]:
pknn =Pipeline([('preprocessing', preprocessing),
                ('knn', mknn)])

In [134]:
pknn.fit(X_test, Y_test)
res = pknn.predict(X_test)

In [135]:
precision_score(Y_test, res)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.0

In [136]:
recall_score(Y_test, res)

0.0

In [137]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

plr =Pipeline([('preprocessing', preprocessing),
                ('lr', lr)])

plr.fit(X_test, Y_test)

In [138]:
res = pknn.predict(X_test)

In [139]:
precision_score(Y_test, res)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.0

In [140]:
recall_score(Y_test, res)

0.0

In [141]:
Y_train.value_counts()

risk_flag
0    165874
1     23126
Name: count, dtype: int64

#### KNN и логистическая регрессия тоже не справились

#### Можно сделать вывод, что зависимости данных сложные и модели не справляются с эффективыным выявлением людей с риском для сердца

#### Вероятно, для определния риска для сердца нужно использовать другие исходные данные, например, сон, давление, вес и тд.