In [4]:
!pip3 install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [37]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn import ensemble
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import fetch_covtype
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import catboost as cb


warnings.filterwarnings('ignore')

In [6]:
df = pd.read_csv('CrabAgePrediction.csv')

In [7]:
df.isna().sum()

Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
Age               0
dtype: int64

In [8]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,F,1.4375,1.175,0.4125,24.635715,12.332033,5.584852,6.747181,9
1,M,0.8875,0.65,0.2125,5.40058,2.29631,1.374951,1.559222,6
2,I,1.0375,0.775,0.25,7.952035,3.231843,1.601747,2.764076,6
3,F,1.175,0.8875,0.25,13.480187,4.748541,2.282135,5.244657,10
4,I,0.8875,0.6625,0.2125,6.903103,3.458639,1.488349,1.70097,6


In [49]:
encoder = preprocessing.LabelEncoder()
df['Sex'] = encoder.fit_transform(df['Sex'])

In [50]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,1.4375,1.175,0.4125,24.635715,12.332033,5.584852,6.747181,9
1,2,0.8875,0.65,0.2125,5.40058,2.29631,1.374951,1.559222,6
2,1,1.0375,0.775,0.25,7.952035,3.231843,1.601747,2.764076,6
3,0,1.175,0.8875,0.25,13.480187,4.748541,2.282135,5.244657,10
4,1,0.8875,0.6625,0.2125,6.903103,3.458639,1.488349,1.70097,6


In [51]:
X = df.drop(columns=['Age'])
y = df['Age']

In [52]:
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=271)

In [53]:
x_datasets = np.array_split(x_train, 10)

In [54]:
y_datasets = np.array_split(y_train, 10)

In [55]:
#обучим на каждой подвыборке дерево решений
models = []
for i in range(10):
    model_tree = tree.DecisionTreeRegressor(random_state=0)
    model_tree.fit(x_datasets[i], y_datasets[i])
    models.append(model_tree)

In [56]:
#находим прогноз каждого дерева
y_pred = []
for i in range(len(models)):
    y_pred.append(models[i].predict(x_test))

In [57]:
mean_pred = np.array(y_pred).mean(axis=0)

In [58]:
print(*mean_pred[:10], sep='\t')
print(*y_test[:10], sep='\t')

9.9	9.7	13.8	9.8	11.9	17.6	10.1	10.6	9.3	9.8
10	7	13	10	9	17	10	13	10	10


In [59]:
model_tree = tree.DecisionTreeRegressor(random_state=0)
one_model = model_tree.fit(x_train, y_train)

In [60]:
one_pred = one_model.predict(x_test)

In [61]:
r2_score(mean_pred, y_test)

0.08858525275507723

In [66]:
r2_score(one_pred, y_test)

-0.8656522475385418

Boosting

In [67]:
A_train, A_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [68]:
print(A_train.shape)
print(A_test.shape)

(3114, 8)
(779, 8)


In [74]:
random_forest = RandomForestRegressor(max_depth=15,min_samples_split=10).fit(A_train, y_train)
y_preds_d = random_forest.predict(A_train)
print('R2 мера для тренировочных данных',r2_score(y_preds_d,y_train))
y_pred = random_forest.predict(A_test)
print('R2 мера для тестовых данных',r2_score(y_pred,y_test))

R2 мера для тренировочных данных 0.7547492848920812
R2 мера для тестовых данных 0.31698244903423567


In [75]:
random_forest = RandomForestClassifier()
params_grid = {
    'max_depth': [3, 20],
    'min_samples_leaf': [3, 10],
    'min_samples_split': [6, 12]
}

gcv = GridSearchCV(estimator=random_forest, param_grid=params_grid, scoring='r2', cv = 4)

In [81]:
gcv.fit(A_train, y_train)

GridSearchCV(cv=4, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [3, 20], 'min_samples_leaf': [3, 10],
                         'min_samples_split': [6, 12]},
             scoring='r2')

In [82]:
best_model = gcv.best_estimator_

In [83]:
y_preds_d = best_model.predict(A_train)
print('R2 мера для тренировочных данных', r2_score(y_preds_d, y_train))

R2 мера для тренировочных данных 0.830669759223861


In [84]:
y_pred = best_model.predict(A_test)

In [85]:
print('R2 мера для тестовых данных', r2_score(y_pred, y_test))

R2 мера для тестовых данных -0.31987545272154017


In [86]:
model_catboost_clf = cb.CatBoostRegressor(iterations=3000, task_type='GPU', devices='0')
model_catboost_clf.fit(A_train, y_train)

Learning rate set to 0.024327
0:	learn: 3.1987918	total: 15ms	remaining: 45.1s
1:	learn: 3.1673572	total: 28.9ms	remaining: 43.4s
2:	learn: 3.1380018	total: 55.3ms	remaining: 55.3s
3:	learn: 3.1094100	total: 70.2ms	remaining: 52.6s
4:	learn: 3.0795806	total: 83.9ms	remaining: 50.3s
5:	learn: 3.0531701	total: 97.8ms	remaining: 48.8s
6:	learn: 3.0248103	total: 115ms	remaining: 49.2s
7:	learn: 2.9976124	total: 129ms	remaining: 48.1s
8:	learn: 2.9722609	total: 142ms	remaining: 47.3s
9:	learn: 2.9491512	total: 156ms	remaining: 46.7s
10:	learn: 2.9272121	total: 169ms	remaining: 45.9s
11:	learn: 2.9054842	total: 181ms	remaining: 45.2s
12:	learn: 2.8848558	total: 204ms	remaining: 46.8s
13:	learn: 2.8631916	total: 218ms	remaining: 46.4s
14:	learn: 2.8450115	total: 228ms	remaining: 45.4s
15:	learn: 2.8270446	total: 241ms	remaining: 44.9s
16:	learn: 2.8081873	total: 255ms	remaining: 44.7s
17:	learn: 2.7894955	total: 267ms	remaining: 44.3s
18:	learn: 2.7701419	total: 280ms	remaining: 43.9s
19:	lea

<catboost.core.CatBoostRegressor at 0x7f3a20defd10>

In [87]:
y_preds_t = model_catboost_clf.predict(A_train, task_type='CPU')
print('R2 мера для тренировочных данных', r2_score(y_preds_t, y_train))

R2 мера для тренировочных данных 0.5317965663583599


In [88]:
y_preds_t = model_catboost_clf.predict(A_test, task_type='CPU')
print('R2 мера для тестовых данных', r2_score(y_preds_t, y_test))

R2 мера для тестовых данных 0.2614668733173453
