In [148]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

In [149]:
evaluation_folder = '/Users/glebmarin/projects/intellij-evaluation/2022-05-10_19-28-03/'

features_all = pd.read_csv(
    f'{evaluation_folder}/completion-ml-performance/features.csv', sep=',', header=1
).to_numpy()
labels_all = pd.read_csv(
    f'{evaluation_folder}/completion-ml-performance/labels.csv', sep=',', header=1
).to_numpy()

features_train, features_test, labels_train, labels_test = train_test_split(features_all, labels_all, test_size=0.1)

N, n_categories = labels_train.shape
extended_labels = np.reshape(labels_train.T, (N * n_categories, 1))

def with_single_one(pos: int):
    return np.concatenate([
        np.zeros(pos),
        np.ones(1),
        np.zeros(n_categories - pos - 1)
    ])

features_is_category = np.concatenate([
    np.repeat(np.reshape(with_single_one(i), (1, n_categories)), N, axis=0)
    for i in range(n_categories)
])

extended_features = np.append(np.concatenate([features_train] * n_categories), features_is_category, axis=1)
train_X, valid_X, train_y, valid_y = train_test_split(extended_features, extended_labels, test_size=0.33)

print(f'Dataset loaded! train={len(train_X)}, validation={len(valid_X)}, test={len(labels_train)}')

Dataset loaded! train=4878, validation=2403, test=2427


In [150]:
train_pool = Pool(train_X, train_y)
valid_pool = Pool(valid_X, valid_y)

In [151]:
model = CatBoostRegressor(iterations=300)

In [152]:
model.fit(
    train_pool,
    eval_set=valid_pool,
    plot=True, 
    use_best_model=True
)
model.save_model('test')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.136115
0:	learn: 0.4396533	test: 0.4367420	best: 0.4367420 (0)	total: 1.64ms	remaining: 490ms
1:	learn: 0.4146407	test: 0.4117538	best: 0.4117538 (1)	total: 2.65ms	remaining: 394ms
2:	learn: 0.3964141	test: 0.3930564	best: 0.3930564 (2)	total: 7.86ms	remaining: 778ms
3:	learn: 0.3806675	test: 0.3773169	best: 0.3773169 (3)	total: 9.13ms	remaining: 675ms
4:	learn: 0.3696963	test: 0.3661521	best: 0.3661521 (4)	total: 10.1ms	remaining: 597ms
5:	learn: 0.3576105	test: 0.3537722	best: 0.3537722 (5)	total: 11.1ms	remaining: 543ms
6:	learn: 0.3483431	test: 0.3446016	best: 0.3446016 (6)	total: 12.1ms	remaining: 506ms
7:	learn: 0.3439894	test: 0.3403449	best: 0.3403449 (7)	total: 14.2ms	remaining: 517ms
8:	learn: 0.3380416	test: 0.3341606	best: 0.3341606 (8)	total: 15.1ms	remaining: 488ms
9:	learn: 0.3329146	test: 0.3294015	best: 0.3294015 (9)	total: 16ms	remaining: 465ms
10:	learn: 0.3283986	test: 0.3250134	best: 0.3250134 (10)	total: 17.2ms	remaining: 451ms
11:	learn: 0.

In [153]:
def predict_class(context_features: np.ndarray) -> int:
    def extended_features(for_category: int) -> np.ndarray:
        return np.concatenate([
            context_features,
            with_single_one(for_category)
        ])
    probs = model.predict([extended_features(i) for i in range(n_categories)])
    return list(probs).index(max(probs))


n_correct_predictions = 0
for context_features, label in zip(features_test, labels_test):
    actual_class = list(label).index(max(label))
    n_correct_predictions += predict_class(context_features) == actual_class
print(f'accuracy={100 * n_correct_predictions / len(labels_test):.2f}%, {n_correct_predictions=}, total={len(labels_test)}')

accuracy=85.93%, n_correct_predictions=232, total=270
