In [128]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

In [129]:
evaluation_folder = '/Users/glebmarin/projects/intellij-evaluation/2022-05-10_19-28-03/'

features_all = pd.read_csv(
    f'{evaluation_folder}/completion-ml-performance/features.csv', sep=',', header=1
).to_numpy()
labels_all = pd.read_csv(
    f'{evaluation_folder}/completion-ml-performance/labels.csv', sep=',', header=1
).to_numpy()

features_train, features_test, labels_train, labels_test = train_test_split(features_all, labels_all, test_size=0.1)

N, n_categories = labels_train.shape
extended_labels = np.reshape(labels_train.T, (N * n_categories, 1))

def with_single_one(pos: int):
    return np.concatenate([
        np.zeros(pos),
        np.ones(1),
        np.zeros(n_categories - pos - 1)
    ])

features_is_category = np.concatenate([
    np.repeat(np.reshape(with_single_one(i), (1, n_categories)), N, axis=0)
    for i in range(n_categories)
])

extended_features = np.append(np.concatenate([features_train] * n_categories), features_is_category, axis=1)
train_X, valid_X, train_y, valid_y = train_test_split(extended_features, extended_labels, test_size=0.33)

print(f'Dataset loaded! train={len(train_X)}, validation={len(valid_X)}, test={len(labels_train)}')

Dataset loaded! train=4878, validation=2403, test=2427


In [130]:
train_pool = Pool(train_X, train_y)
valid_pool = Pool(valid_X, valid_y)

In [131]:
model = CatBoostRegressor(iterations=10000)

In [132]:
model.fit(
    train_pool,
    eval_set=valid_pool,
    plot=True, 
    use_best_model=True
)
model.save_model('test')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.016031
0:	learn: 0.4659370	test: 0.4707934	best: 0.4707934 (0)	total: 1.15ms	remaining: 11.5s
1:	learn: 0.4620544	test: 0.4667749	best: 0.4667749 (1)	total: 2.48ms	remaining: 12.4s
2:	learn: 0.4589228	test: 0.4635875	best: 0.4635875 (2)	total: 3.65ms	remaining: 12.2s
3:	learn: 0.4557022	test: 0.4602232	best: 0.4602232 (3)	total: 4.97ms	remaining: 12.4s
4:	learn: 0.4522834	test: 0.4566680	best: 0.4566680 (4)	total: 5.9ms	remaining: 11.8s
5:	learn: 0.4489163	test: 0.4531485	best: 0.4531485 (5)	total: 6.98ms	remaining: 11.6s
6:	learn: 0.4459743	test: 0.4500986	best: 0.4500986 (6)	total: 7.95ms	remaining: 11.3s
7:	learn: 0.4427566	test: 0.4467315	best: 0.4467315 (7)	total: 9.22ms	remaining: 11.5s
8:	learn: 0.4397321	test: 0.4435713	best: 0.4435713 (8)	total: 10.2ms	remaining: 11.3s
9:	learn: 0.4363998	test: 0.4401070	best: 0.4401070 (9)	total: 11.3ms	remaining: 11.3s
10:	learn: 0.4334784	test: 0.4370467	best: 0.4370467 (10)	total: 12.8ms	remaining: 11.6s
11:	learn: 0

In [135]:
def predict_class(context_features: np.ndarray) -> int:
    def extended_features(for_category: int) -> np.ndarray:
        return np.concatenate([
            context_features,
            with_single_one(for_category)
        ])
    probs = model.predict([extended_features(i) for i in range(n_categories)])
    return list(probs).index(max(probs))


n_correct_predictions = 0
for context_features, label in zip(features_test, labels_test):
    actual_class = list(label).index(max(label))
    n_correct_predictions += predict_class(context_features) == actual_class
print(f'accuracy={100 * n_correct_predictions / len(labels_test):.2f}%, {n_correct_predictions=}, total={len(labels_test)}')

accuracy=82.96%, n_correct_predictions=224, total=270
