In [14]:
# Execute this block to test that CatBoost works
import numpy
from catboost import CatBoostRegressor

# Toy data: four samples with four numeric features each, plus one
# regression target per sample. Only meant to confirm CatBoost can train.
dataset = numpy.array(
    [
        [1, 4, 5, 6],
        [4, 5, 6, 7],
        [30, 40, 50, 60],
        [20, 15, 85, 60],
    ]
)
train_labels = [1.2, 3.4, 9.5, 24.5]

model = CatBoostRegressor(
    learning_rate=1,
    depth=6,
    loss_function='RMSE',
)
# Keep the value returned by fit() under its own name; the parameter dump
# below is read from it.
fit_model = model.fit(dataset, train_labels)

# Print the parameters the fitted model reports.
print(fit_model.get_params())

0:	learn: 6.8953900	total: 397us	remaining: 397ms
1:	learn: 4.8590818	total: 955us	remaining: 477ms
2:	learn: 3.6271477	total: 1.04ms	remaining: 345ms
3:	learn: 2.7203608	total: 1.45ms	remaining: 362ms
4:	learn: 2.0402706	total: 1.64ms	remaining: 327ms
5:	learn: 1.5302029	total: 1.77ms	remaining: 293ms
6:	learn: 1.1476522	total: 1.93ms	remaining: 273ms
7:	learn: 0.8607391	total: 2.09ms	remaining: 259ms
8:	learn: 0.6455544	total: 2.18ms	remaining: 240ms
9:	learn: 0.4841658	total: 2.3ms	remaining: 228ms
10:	learn: 0.3631243	total: 2.4ms	remaining: 216ms
11:	learn: 0.2723432	total: 2.52ms	remaining: 208ms
12:	learn: 0.2042574	total: 2.65ms	remaining: 201ms
13:	learn: 0.1531931	total: 2.78ms	remaining: 196ms
14:	learn: 0.1148948	total: 2.88ms	remaining: 189ms
15:	learn: 0.0861711	total: 3ms	remaining: 184ms
16:	learn: 0.0646283	total: 3.1ms	remaining: 179ms
17:	learn: 0.0484712	total: 3.18ms	remaining: 173ms
18:	learn: 0.0363534	total: 3.3ms	remaining: 170ms
19:	learn: 0.0272651	total: 3.4

## Load Data

In [21]:
import pandas as pd
from sklearn.datasets import load_iris

# Load the iris dataset bundle and pull out the pieces used by later cells.
data = load_iris()
t_names = data.target_names
targets = data.target

# Translate each integer class code into its species name. Kept as a plain
# list, which is the type the split cell reports for y_train.
named_targets = list(t_names[targets])

# Feature matrix only: one column per measurement, named by the dataset.
df = pd.DataFrame(data=data.data, columns=data.feature_names)

# Label columns are intentionally left off `df`; uncomment to attach them
# for inspection.
# df['species'] = named_targets
# df['species code'] = targets

In [23]:
import numpy as np
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    named_targets,
    test_size=0.33,
    random_state=20,
)

# Sanity check on the container types returned for features and labels.
print(*(type(part) for part in (X_train, y_train)))

<class 'pandas.core.frame.DataFrame'> <class 'list'>


## Fit Training Model

In [31]:
from catboost import CatBoostClassifier
# Multi-class classifier trained on the training split only.
# NOTE(review): this rebinds `model`, which the smoke-test cell also uses
# for its regressor; later cells rely on `model` being this classifier.
model = CatBoostClassifier(
    iterations=500,
    depth=4,
    learning_rate=0.1,
    loss_function='MultiClass',
)

# Left as the cell's last expression so the fitted estimator's repr is shown.
model.fit(X_train, y_train)

0:	learn: 0.9887484	total: 1.52ms	remaining: 759ms
1:	learn: 0.8825537	total: 2.53ms	remaining: 631ms
2:	learn: 0.8000880	total: 2.86ms	remaining: 474ms
3:	learn: 0.7364433	total: 3.19ms	remaining: 395ms
4:	learn: 0.6639156	total: 3.51ms	remaining: 347ms
5:	learn: 0.6098835	total: 3.82ms	remaining: 314ms
6:	learn: 0.5703381	total: 4.13ms	remaining: 291ms
7:	learn: 0.5236019	total: 4.47ms	remaining: 275ms
8:	learn: 0.4903583	total: 4.78ms	remaining: 261ms
9:	learn: 0.4584067	total: 5.08ms	remaining: 249ms
10:	learn: 0.4231930	total: 5.34ms	remaining: 237ms
11:	learn: 0.3933996	total: 5.54ms	remaining: 225ms
12:	learn: 0.3656138	total: 5.84ms	remaining: 219ms
13:	learn: 0.3396485	total: 6.1ms	remaining: 212ms
14:	learn: 0.3149863	total: 6.4ms	remaining: 207ms
15:	learn: 0.2957912	total: 6.66ms	remaining: 202ms
16:	learn: 0.2788481	total: 6.98ms	remaining: 198ms
17:	learn: 0.2616562	total: 7.23ms	remaining: 194ms
18:	learn: 0.2472772	total: 7.49ms	remaining: 190ms
19:	learn: 0.2374106	tot

<catboost.core.CatBoostClassifier at 0x7f735570fb80>

## Predict on Held-out test set

In [32]:
# Predicted label for every held-out row. The original code indexed `x[0]`
# on each prediction row, i.e. it expected one-element rows; ravel() flattens
# those rows to a flat sequence of labels in a single step.
predicted_classes = model.predict(X_test).ravel().tolist()

# Confidence of each prediction: the largest class probability in each row.
predicted_probability = model.predict_proba(X_test).max(axis=1).tolist()

# Side-by-side comparison of truth and prediction. Stored under its own name
# instead of `df`: `df` is the feature frame consumed by the train/test split
# cell, and overwriting it here would break re-running that cell.
predictions_df = pd.DataFrame(
    {
        'truth': y_test,
        'pred_class': predicted_classes,
        'pred_prob': predicted_probability,
    }
)

# Score on the held-out split, shown as the cell output.
model.score(X_test, y_test)

0.9

## Train final model

In [34]:
import pandas as pd
from sklearn.datasets import load_iris

from catboost import CatBoostClassifier

# Reload the full dataset so this cell does not depend on variables created
# by the exploratory cells. Every name below is derived from `data_f`.
data_f = load_iris()
df_f = pd.DataFrame(data=data_f.data, columns=data_f.feature_names)
t_names_f = data_f['target_names']
targets_f = data_f['target']

# Species name for each row, built from this cell's own dataset copy.
named_targets_f = [t_names_f[t] for t in targets_f]

# Final model: trained on all rows (no held-out split).
# NOTE(review): the hyperparameters differ from the evaluated model
# (iterations=500, depth=4, learning_rate=0.1), so the held-out score from
# that model does not directly describe this one — confirm this is intended.
model_f = CatBoostClassifier(iterations=200,
                             depth=3,
                             learning_rate=1,
                             loss_function='MultiClass')

model_f.fit(df_f, named_targets_f)

# Persist the trained model to the file "final_model" in the working directory.
model_f.save_model("final_model")

0:	learn: 0.2247331	total: 495us	remaining: 98.6ms
1:	learn: 0.1487134	total: 1.31ms	remaining: 130ms
2:	learn: 0.1180019	total: 1.63ms	remaining: 107ms
3:	learn: 0.0970691	total: 2.17ms	remaining: 106ms
4:	learn: 0.0829679	total: 2.47ms	remaining: 96.2ms
5:	learn: 0.0774980	total: 2.81ms	remaining: 90.9ms
6:	learn: 0.0690239	total: 3.09ms	remaining: 85.3ms
7:	learn: 0.0632670	total: 3.36ms	remaining: 80.6ms
8:	learn: 0.0585371	total: 3.65ms	remaining: 77.4ms
9:	learn: 0.0568231	total: 3.91ms	remaining: 74.4ms
10:	learn: 0.0523305	total: 4.19ms	remaining: 72.1ms
11:	learn: 0.0494892	total: 4.43ms	remaining: 69.4ms
12:	learn: 0.0477487	total: 4.65ms	remaining: 66.9ms
13:	learn: 0.0446590	total: 4.89ms	remaining: 65ms
14:	learn: 0.0405126	total: 5.13ms	remaining: 63.2ms
15:	learn: 0.0381232	total: 5.37ms	remaining: 61.8ms
16:	learn: 0.0369903	total: 5.64ms	remaining: 60.8ms
17:	learn: 0.0348588	total: 5.9ms	remaining: 59.7ms
18:	learn: 0.0312570	total: 6.14ms	remaining: 58.5ms
19:	learn: