In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Загружаем данные

In [2]:
dataset = pd.read_csv("heart.csv")

In [3]:
dataset.shape

(303, 14)

In [4]:
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


#### Делаем разметку

In [5]:
X = dataset.drop('target', axis=1)
y = dataset['target']

#### Делим выборку

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

### Создаем модель и обучаем

In [7]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

DecisionTreeClassifier()

In [8]:
y_pred = classifier.predict(X_test)

## Выводим метрику

In [9]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[27  7]
 [ 7 20]]
              precision    recall  f1-score   support

           0       0.79      0.79      0.79        34
           1       0.74      0.74      0.74        27

    accuracy                           0.77        61
   macro avg       0.77      0.77      0.77        61
weighted avg       0.77      0.77      0.77        61



### Выполняем подбор гиперпараметров

In [10]:
from sklearn.model_selection import GridSearchCV
GSCV = GridSearchCV( classifier, {'max_depth': range(3,10), 'min_samples_leaf': range(1,10), 'min_samples_split': range(2, 10)}, cv = 3, scoring = 'neg_mean_absolute_error')
GSCV.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(3, 10),
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10)},
             scoring='neg_mean_absolute_error')

#### Определяем лучшие параметры

In [11]:
GSCV.best_estimator_

DecisionTreeClassifier(max_depth=3, min_samples_leaf=3)

In [12]:
classifier_1 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=3, min_samples_split=6)
classifier_1.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, min_samples_leaf=3, min_samples_split=6)

In [19]:
y_pred_1 = classifier_1.predict(X_test)

In [14]:
import warnings
warnings.filterwarnings("ignore")

In [15]:
print(classification_report(y_test, y_pred_1))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83        34
           1       0.80      0.74      0.77        27

    accuracy                           0.80        61
   macro avg       0.80      0.80      0.80        61
weighted avg       0.80      0.80      0.80        61



## Процедура бэггинга

In [16]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

In [22]:
bdt = BaggingRegressor(DecisionTreeClassifier()).fit(X_train, y_train)
bdt_predict = bdt.predict(X_test)

In [23]:
X = dataset.iloc[:, 0:4].values
y = dataset.iloc[:, 4].values

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [25]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Ансамбль решающих деревьев 

In [26]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, 
                               bootstrap = True,
                               max_features = 'sqrt')
model.fit(X_train, y_train)

RandomForestClassifier(max_features='sqrt')

In [27]:
display(model.score(X_train, y_train))

0.9628099173553719