# Практическая работа № 7

In [1]:
!pip3 install catboost



#### Импортирование необходимых библиотек

In [2]:
import numpy as np
import warnings

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier

import catboost as cb


warnings.filterwarnings('ignore')

In [3]:
mnist = fetch_openml('mnist_784', version=1, cache=True)

# Разделение на признаки и метки
X = mnist.data
y = mnist.target

# Преобразование меток в числовой формат
y = y.astype(int)

# Разделение на обучающую и тестовую выборки

sample_size = 1000  # Желаемый размер выборки (ограничиваем для ускорения расчетов)
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=sample_size, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)


x_datasets = np.array_split(X_train, 10)
y_datasets = np.array_split(y_train, 10)



### Bagging

In [4]:
#обучим на каждой подвыборке дерево решений
models = []
for i in range(10):
    model_tree = tree.DecisionTreeRegressor(random_state=0)
    model_tree.fit(x_datasets[i], y_datasets[i])
    models.append(model_tree)

In [5]:
# находим прогноз каждого дерева
y_pred = []
for i in range(len(models)):
    y_pred.append(models[i].predict(X_test))

mean_pred = np.array(y_pred).mean(axis=0)

In [6]:
model_tree = tree.DecisionTreeRegressor(random_state=0)
one_model = model_tree.fit(X_train, y_train)

one_pred = one_model.predict(X_test)

print("R2 для случайного леса", r2_score(mean_pred, y_test))

print("R2 для одного дерева решений", r2_score(one_pred, y_test))

R2 для случайного леса -0.36370170351456665
R2 для одного дерева решений 0.2879861546007879


### Boosting

In [7]:
A_train, A_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.8)

print(A_train.shape)
print(A_test.shape)


(200, 784)
(800, 784)


In [8]:
random_forest = RandomForestClassifier(max_depth=15, min_samples_split=10).fit(A_train, y_train)
y_preds_d = random_forest.predict(A_train)

print("F1 мера для тренировочных данных:", f1_score(y_preds_d, y_train,  average='macro'))
y_pred = random_forest.predict(A_test)
print("F1 мера для тестовых данных:", f1_score(y_pred, y_test, average='macro'))

F1 мера для тренировочных данных: 0.9953082378614294
F1 мера для тестовых данных: 0.8118004825908706


In [9]:
random_forest = RandomForestClassifier()
params_grid = {
    'max_depth': [12, 18],
    'min_samples_leaf': [3, 10],
    'min_samples_split': [6, 12]
}

gcv = GridSearchCV(estimator=random_forest, param_grid=params_grid, scoring='f1_macro', cv = 4)

In [10]:
gcv.fit(A_train, y_train)

In [11]:
best_model = gcv.best_estimator_

y_preds_d = best_model.predict(A_train)
print('F1 мера для тренировочных данных', f1_score(y_preds_d, y_train, average= "macro"))

y_pred = best_model.predict(A_test)
print('F1 мера для тестовых данных', f1_score(y_pred, y_test, average= "macro"))

F1 мера для тренировочных данных 1.0
F1 мера для тестовых данных 0.7994652497846662


In [12]:
model_catboost_clf = cb.CatBoostClassifier(iterations=3000, task_type="GPU", devices='0')
model_catboost_clf.fit(A_train, y_train)

Learning rate set to 0.018718
0:	learn: 2.2851569	total: 25.3ms	remaining: 1m 15s
1:	learn: 2.2685727	total: 47.1ms	remaining: 1m 10s
2:	learn: 2.2471634	total: 67.9ms	remaining: 1m 7s
3:	learn: 2.2277838	total: 88.7ms	remaining: 1m 6s
4:	learn: 2.2111133	total: 109ms	remaining: 1m 5s
5:	learn: 2.1938376	total: 129ms	remaining: 1m 4s
6:	learn: 2.1769537	total: 148ms	remaining: 1m 3s
7:	learn: 2.1597021	total: 168ms	remaining: 1m 2s
8:	learn: 2.1438513	total: 188ms	remaining: 1m 2s
9:	learn: 2.1271234	total: 207ms	remaining: 1m 1s
10:	learn: 2.1095419	total: 225ms	remaining: 1m 1s
11:	learn: 2.0940536	total: 246ms	remaining: 1m 1s
12:	learn: 2.0768465	total: 264ms	remaining: 1m
13:	learn: 2.0589233	total: 283ms	remaining: 1m
14:	learn: 2.0413443	total: 302ms	remaining: 1m
15:	learn: 2.0272807	total: 320ms	remaining: 59.8s
16:	learn: 2.0099693	total: 337ms	remaining: 59.2s
17:	learn: 1.9939317	total: 358ms	remaining: 59.3s
18:	learn: 1.9777315	total: 375ms	remaining: 58.9s
19:	learn: 1.9

<catboost.core.CatBoostClassifier at 0x21703fbc750>

In [13]:
y_preds_t = model_catboost_clf.predict(A_train, task_type='CPU')
print('F1 мера для тренировочных данных', f1_score(y_preds_t, y_train, average='macro'))


y_preds = model_catboost_clf.predict(A_test, task_type='CPU')
print('F1 мера для тестовых данных', f1_score(y_preds, y_test, average='macro'))

F1 мера для тренировочных данных 1.0
F1 мера для тестовых данных 0.8127022438335161
