In [136]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

In [137]:
evaluation_folder = '/Users/glebmarin/projects/intellij-evaluation/2022-05-10_19-28-03/'

features_all = pd.read_csv(
    f'{evaluation_folder}/completion-ml-performance/features.csv', sep=',', header=1
).to_numpy()
labels_all = pd.read_csv(
    f'{evaluation_folder}/completion-ml-performance/labels.csv', sep=',', header=1
).to_numpy()

features_train, features_test, labels_train, labels_test = train_test_split(features_all, labels_all, test_size=0.1)

N, n_categories = labels_train.shape
extended_labels = np.reshape(labels_train.T, (N * n_categories, 1))

def with_single_one(pos: int):
    return np.concatenate([
        np.zeros(pos),
        np.ones(1),
        np.zeros(n_categories - pos - 1)
    ])

features_is_category = np.concatenate([
    np.repeat(np.reshape(with_single_one(i), (1, n_categories)), N, axis=0)
    for i in range(n_categories)
])

extended_features = np.append(np.concatenate([features_train] * n_categories), features_is_category, axis=1)
train_X, valid_X, train_y, valid_y = train_test_split(extended_features, extended_labels, test_size=0.33)

print(f'Dataset loaded! train={len(train_X)}, validation={len(valid_X)}, test={len(labels_train)}')

Dataset loaded! train=4878, validation=2403, test=2427


In [138]:
train_pool = Pool(train_X, train_y)
valid_pool = Pool(valid_X, valid_y)

In [139]:
model = CatBoostRegressor(iterations=100)

In [140]:
model.fit(
    train_pool,
    eval_set=valid_pool,
    plot=True, 
    use_best_model=True
)
model.save_model('test')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.266042
0:	learn: 0.4093868	test: 0.4119103	best: 0.4119103 (0)	total: 1.03ms	remaining: 102ms
1:	learn: 0.3718747	test: 0.3709325	best: 0.3709325 (1)	total: 2.02ms	remaining: 99.1ms
2:	learn: 0.3528302	test: 0.3491394	best: 0.3491394 (2)	total: 3.73ms	remaining: 121ms
3:	learn: 0.3374494	test: 0.3325644	best: 0.3325644 (3)	total: 4.89ms	remaining: 117ms
4:	learn: 0.3310441	test: 0.3259803	best: 0.3259803 (4)	total: 6.24ms	remaining: 119ms
5:	learn: 0.3231430	test: 0.3171014	best: 0.3171014 (5)	total: 7.47ms	remaining: 117ms
6:	learn: 0.3182030	test: 0.3114204	best: 0.3114204 (6)	total: 8.54ms	remaining: 113ms
7:	learn: 0.3134513	test: 0.3070500	best: 0.3070500 (7)	total: 9.76ms	remaining: 112ms
8:	learn: 0.3118405	test: 0.3054040	best: 0.3054040 (8)	total: 10.8ms	remaining: 109ms
9:	learn: 0.3079851	test: 0.3017377	best: 0.3017377 (9)	total: 11.9ms	remaining: 107ms
10:	learn: 0.3072672	test: 0.3012407	best: 0.3012407 (10)	total: 13.1ms	remaining: 106ms
11:	learn:

In [141]:
def predict_class(context_features: np.ndarray) -> int:
    def extended_features(for_category: int) -> np.ndarray:
        return np.concatenate([
            context_features,
            with_single_one(for_category)
        ])
    probs = model.predict([extended_features(i) for i in range(n_categories)])
    return list(probs).index(max(probs))


n_correct_predictions = 0
for context_features, label in zip(features_test, labels_test):
    actual_class = list(label).index(max(label))
    n_correct_predictions += predict_class(context_features) == actual_class
print(f'accuracy={100 * n_correct_predictions / len(labels_test):.2f}%, {n_correct_predictions=}, total={len(labels_test)}')

accuracy=84.07%, n_correct_predictions=227, total=270
