In [1]:
# Smoke test: run this cell to confirm that CatBoost is installed and can train.
import numpy
from catboost import CatBoostRegressor

# Toy training data: four rows of four numeric features, one label per row.
dataset = numpy.array([
    [1, 4, 5, 6],
    [4, 5, 6, 7],
    [30, 40, 50, 60],
    [20, 15, 85, 60],
])
train_labels = [1.2, 3.4, 9.5, 24.5]

model = CatBoostRegressor(
    learning_rate=1,
    depth=6,
    loss_function='RMSE',
)

# Keep whatever fit() hands back so its parameters can be printed below.
fit_model = model.fit(dataset, train_labels)

print(fit_model.get_params())

0:	learn: 6.8953900	total: 53.1ms	remaining: 53s
1:	learn: 4.8590818	total: 53.8ms	remaining: 26.8s
2:	learn: 3.6271477	total: 54.3ms	remaining: 18s
3:	learn: 2.7203608	total: 54.7ms	remaining: 13.6s
4:	learn: 2.0402706	total: 55.1ms	remaining: 11s
5:	learn: 1.5302029	total: 55.3ms	remaining: 9.17s
6:	learn: 1.1476522	total: 55.5ms	remaining: 7.87s
7:	learn: 0.8607391	total: 55.8ms	remaining: 6.92s
8:	learn: 0.6455544	total: 56.1ms	remaining: 6.17s
9:	learn: 0.4841658	total: 56.2ms	remaining: 5.56s
10:	learn: 0.3631243	total: 56.3ms	remaining: 5.06s
11:	learn: 0.2723432	total: 56.6ms	remaining: 4.66s
12:	learn: 0.2042574	total: 57ms	remaining: 4.32s
13:	learn: 0.1531931	total: 57.1ms	remaining: 4.02s
14:	learn: 0.1148948	total: 57.3ms	remaining: 3.76s
15:	learn: 0.0861711	total: 57.4ms	remaining: 3.53s
16:	learn: 0.0646283	total: 57.6ms	remaining: 3.33s
17:	learn: 0.0484712	total: 57.7ms	remaining: 3.15s
18:	learn: 0.0363534	total: 57.8ms	remaining: 2.98s
19:	learn: 0.0272651	total: 58

## Load Data

In [2]:
import pandas as pd
from sklearn.datasets import load_iris

# Load the iris dataset once and pull out the pieces used below.
data = load_iris()
targets = data['target']
t_names = data['target_names']

# Feature table: the raw measurements, with one column per feature name.
df = pd.DataFrame(data=data.data, columns=data.feature_names)

# Look up each entry of `targets` in `t_names` to get a plain list of names.
named_targets = [t_names[code] for code in targets]

# Left disabled: enabling these would add the label columns to `df`.
# df['species'] = named_targets
# df['species code'] = targets

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split features and named labels; test_size and random_state are fixed
# so the same split is requested on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    named_targets,
    test_size=0.33,
    random_state=20,
)

# Quick sanity check on the container types produced by the split.
print(type(X_train), type(y_train))

<class 'pandas.core.frame.DataFrame'> <class 'list'>


## Fit Training Model

In [4]:
from catboost import CatBoostClassifier

# Classifier configured with the MultiClass loss.
model = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=500,
    learning_rate=0.1,
    depth=4,
)

# Train on the training split; as the last expression, the value returned
# by fit() is what the cell displays.
model.fit(X_train, y_train)

0:	learn: 0.9887484	total: 2.58ms	remaining: 1.29s
1:	learn: 0.8825537	total: 3.85ms	remaining: 958ms
2:	learn: 0.8000880	total: 4.29ms	remaining: 711ms
3:	learn: 0.7364433	total: 4.71ms	remaining: 584ms
4:	learn: 0.6639156	total: 4.96ms	remaining: 491ms
5:	learn: 0.6098835	total: 5.34ms	remaining: 440ms
6:	learn: 0.5703381	total: 5.57ms	remaining: 393ms
7:	learn: 0.5236019	total: 5.89ms	remaining: 363ms
8:	learn: 0.4903583	total: 6.19ms	remaining: 338ms
9:	learn: 0.4584067	total: 6.5ms	remaining: 318ms
10:	learn: 0.4231930	total: 6.74ms	remaining: 300ms
11:	learn: 0.3933996	total: 6.98ms	remaining: 284ms
12:	learn: 0.3656138	total: 7.27ms	remaining: 272ms
13:	learn: 0.3396485	total: 7.46ms	remaining: 259ms
14:	learn: 0.3149863	total: 7.71ms	remaining: 249ms
15:	learn: 0.2957912	total: 7.94ms	remaining: 240ms
16:	learn: 0.2788481	total: 8.2ms	remaining: 233ms
17:	learn: 0.2616562	total: 8.43ms	remaining: 226ms
18:	learn: 0.2472772	total: 8.67ms	remaining: 220ms
19:	learn: 0.2374106	tot

<catboost.core.CatBoostClassifier at 0x7f1292e6fe20>

## Predict on Held-Out Test Set

In [5]:
# Evaluate the trained classifier on the held-out split.

# Each row returned by predict() is indexed with [0] to get the label itself
# (presumably predict() yields one single-element row per sample for the
# MultiClass loss -- confirm against the CatBoost documentation).
predicted_classes = [row[0] for row in model.predict(X_test)]

# Confidence of each prediction: the largest value in that sample's
# predict_proba() row.
predicted_probability = [max(row) for row in model.predict_proba(X_test)]

# Side-by-side table of truth vs. prediction. It gets its own name on purpose:
# rebinding `df` here would replace the feature frame that the train/test
# split cell reads, so re-running that cell afterwards would break.
predictions_df = pd.DataFrame({
    'truth': y_test,
    'pred_class': predicted_classes,
    'pred_prob': predicted_probability,
})

# Last expression of the cell: the score on the held-out data is displayed.
model.score(X_test, y_test)

0.9

## Train Final Model

In [7]:
import pandas as pd
from sklearn.datasets import load_iris

# Reload the dataset for the final model, which is fit on every row
# (no train/test split is made in this cell).
data_f = load_iris()
df_f = pd.DataFrame(data=data_f.data, columns=data_f.feature_names)

# Derive the labels from `data_f` itself. Reading `data`, `t_names` and
# `targets` here would make this cell depend on variables left behind by the
# data-loading cell and fail on a fresh kernel if that cell had not run.
t_names_f = data_f['target_names']
targets_f = data_f['target']
named_targets_f = [t_names_f[t] for t in targets_f]

from catboost import CatBoostClassifier
model_f = CatBoostClassifier(iterations=200,
                             depth=3,
                             learning_rate=1,
                             loss_function='MultiClass')

model_f.fit(df_f, named_targets_f)

# The model is written to "<path><name>_<version>".
version = "1.0"
path = "../backend/app/"
name = "final_model"
model_f.save_model(f"{path}{name}_{version}")

0:	learn: 0.2247331	total: 464us	remaining: 92.4ms
1:	learn: 0.1487134	total: 1.3ms	remaining: 129ms
2:	learn: 0.1180019	total: 1.92ms	remaining: 126ms
3:	learn: 0.0970691	total: 2.56ms	remaining: 125ms
4:	learn: 0.0829679	total: 2.97ms	remaining: 116ms
5:	learn: 0.0774980	total: 3.27ms	remaining: 106ms
6:	learn: 0.0690239	total: 3.58ms	remaining: 98.7ms
7:	learn: 0.0632670	total: 3.85ms	remaining: 92.4ms
8:	learn: 0.0585371	total: 4.1ms	remaining: 86.9ms
9:	learn: 0.0568231	total: 4.37ms	remaining: 83ms
10:	learn: 0.0523305	total: 4.61ms	remaining: 79.1ms
11:	learn: 0.0494892	total: 4.87ms	remaining: 76.4ms
12:	learn: 0.0477487	total: 5.12ms	remaining: 73.7ms
13:	learn: 0.0446590	total: 5.61ms	remaining: 74.5ms
14:	learn: 0.0405126	total: 5.89ms	remaining: 72.6ms
15:	learn: 0.0381232	total: 6.17ms	remaining: 71ms
16:	learn: 0.0369903	total: 6.43ms	remaining: 69.3ms
17:	learn: 0.0348588	total: 6.69ms	remaining: 67.7ms
18:	learn: 0.0312570	total: 6.93ms	remaining: 66ms
19:	learn: 0.0299