In [160]:
import csv
import importlib
import json
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

import catboost_to_java

importlib.reload(catboost_to_java)

<module 'catboost_to_java' from '/Users/glebmarin/projects/ml-completion-performance/catboost_to_java.py'>

In [161]:
evaluation_folder = '/Users/glebmarin/projects/intellij-evaluation/for-model/'

features_filename = f'{evaluation_folder}/completion-ml-performance/features.csv'
labels_filename = f'{evaluation_folder}/completion-ml-performance/labels.csv'

features_all = pd.read_csv(features_filename, sep=',', header=1).to_numpy()
labels_all = pd.read_csv(labels_filename, sep=',', header=1).to_numpy()

features_train, features_test, labels_train, labels_test = train_test_split(features_all, labels_all, test_size=0.1)

N, n_categories = labels_train.shape
extended_labels = np.reshape(labels_train.T, (N * n_categories, 1))

def with_single_one(pos: int):
    return np.concatenate([
        np.zeros(pos),
        np.ones(1),
        np.zeros(n_categories - pos - 1)
    ])

features_is_category = np.concatenate([
    np.repeat(np.reshape(with_single_one(i), (1, n_categories)), N, axis=0)
    for i in range(n_categories)
])

extended_features = np.append(np.concatenate([features_train] * n_categories), features_is_category, axis=1)
train_X, valid_X, train_y, valid_y = train_test_split(extended_features, extended_labels, test_size=0.33)


def read_header(path: str) -> list[str]:
    return list(map(
        lambda f: f.replace('"', ''), 
        Path(path).read_text().splitlines()[0].split(',')
    ))

extended_features_names = read_header(features_filename) + list(map(lambda l: f'is_{l}', read_header(labels_filename)))

print(f'Dataset loaded! train={len(train_X)}, validation={len(valid_X)}, test={len(labels_train)}')
print(f'Train dataset shape={features_all.shape}')
print(f'{extended_features_names=}')

Dataset loaded! train=4878, validation=2403, test=2427
Train dataset shape=(2697, 109)
extended_features_names=['AnnotationPositionMatcher', 'ExceptionPositionMatcher', 'ExtendsDeclarationPositionMatcher', 'ImplementsDeclarationPositionMatcher', 'TryWithResourcesPositionMatcher', 'TypeParameterPositionMatcher', 'case_sensitivity_ALL', 'case_sensitivity_FIRST_LETTER', 'case_sensitivity_NONE', 'indent_level', 'is_keyword_ABSTRACT', 'is_keyword_ANOTHER', 'is_keyword_BOOLEAN', 'is_keyword_BREAK', 'is_keyword_CASE', 'is_keyword_CATCH', 'is_keyword_CHAR', 'is_keyword_CLASS', 'is_keyword_CONST', 'is_keyword_CONTINUE', 'is_keyword_DOUBLE', 'is_keyword_ELSE', 'is_keyword_EXTENDS', 'is_keyword_FALSE', 'is_keyword_FINAL', 'is_keyword_FINALLY', 'is_keyword_FLOAT', 'is_keyword_FOR', 'is_keyword_IF', 'is_keyword_IMPLEMENTS', 'is_keyword_IMPORT', 'is_keyword_INSTANCEOF', 'is_keyword_INT', 'is_keyword_INTERFACE', 'is_keyword_LONG', 'is_keyword_NEW', 'is_keyword_NULL', 'is_keyword_PRIVATE', 'is_keyword

In [162]:
train_pool = Pool(train_X, train_y)
valid_pool = Pool(valid_X, valid_y)
train_pool.set_feature_names(extended_features_names)

<catboost.core.Pool at 0x282e6c1c0>

In [163]:
model = CatBoostRegressor(iterations=300)

In [164]:
model.fit(
    train_pool,
    eval_set=valid_pool,
    plot=True, 
    use_best_model=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.136115
0:	learn: 0.4371972	test: 0.4412983	best: 0.4412983 (0)	total: 2.24ms	remaining: 671ms
1:	learn: 0.4120707	test: 0.4162384	best: 0.4162384 (1)	total: 3.32ms	remaining: 495ms
2:	learn: 0.3943034	test: 0.3983417	best: 0.3983417 (2)	total: 4.38ms	remaining: 433ms
3:	learn: 0.3788219	test: 0.3830866	best: 0.3830866 (3)	total: 5.68ms	remaining: 420ms
4:	learn: 0.3649395	test: 0.3692187	best: 0.3692187 (4)	total: 6.99ms	remaining: 412ms
5:	learn: 0.3540338	test: 0.3582033	best: 0.3582033 (5)	total: 8.38ms	remaining: 411ms
6:	learn: 0.3458191	test: 0.3498250	best: 0.3498250 (6)	total: 9.53ms	remaining: 399ms
7:	learn: 0.3391169	test: 0.3429928	best: 0.3429928 (7)	total: 11ms	remaining: 402ms
8:	learn: 0.3303113	test: 0.3343451	best: 0.3343451 (8)	total: 12.2ms	remaining: 395ms
9:	learn: 0.3252860	test: 0.3295891	best: 0.3295891 (9)	total: 13ms	remaining: 378ms
10:	learn: 0.3214759	test: 0.3257270	best: 0.3257270 (10)	total: 14ms	remaining: 367ms
11:	learn: 0.3193

<catboost.core.CatBoostRegressor at 0x16b6e5430>

In [165]:
converter = catboost_to_java.CatboostToJavaConverter()

MODEL_JAVA = Path('model-java')
MODEL_PYTHON = Path('model-python')
FEATURES_DIR = MODEL_JAVA / 'all_features'
F_ALL        = FEATURES_DIR / 'all_features.json'
F_BINARY     = FEATURES_DIR / 'binary.json'
F_CATEGORIAL = FEATURES_DIR / 'categorical.json'
F_ORDER      = FEATURES_DIR / 'features_order.txt'
F_FLOAT      = FEATURES_DIR / 'float.json'


MODEL_JAVA.mkdir(exist_ok=True)
FEATURES_DIR.mkdir(exist_ok=True)
(MODEL_JAVA / 'model.bin').touch()
model.save_model(MODEL_JAVA / 'model', pool=train_pool, format='json')
converter.convert_to_bin_model(
    catboost_to_java.CatboostToJavaConverter.create_py_model_instance(model, MODEL_PYTHON),
    MODEL_JAVA
)

with F_ALL.open('w') as f_all:
    def quoted(s): return f'"{s}"'
    def sq_braces(s): return f'[\n{s}\n]'
    f_all.write(sq_braces(',\n'.join(map(quoted, extended_features_names))))


def dump_empty_json_to(p: Path):
    with p.open('w') as f:
        f.write('{}')

dump_empty_json_to(F_BINARY)
dump_empty_json_to(F_CATEGORIAL)

with F_ORDER.open('w') as f_order:
    f_order.write('\n'.join(extended_features_names))

with F_FLOAT.open('w') as f_float:
    def feature_record(f_name):
        return f"""
    "{f_name}": {{
        "default": 0.0,
        "use_undefined": false
    }}"""

    def in_curly(s): return f'{{{s}}}'

    f_float.write(in_curly(',\n'.join(map(feature_record, extended_features_names))))

converter.zip_local_model(MODEL_JAVA, 'all')

model-java/all_features


In [166]:
def predict_class(context_features: np.ndarray) -> int:
    def extended_features(for_category: int) -> np.ndarray:
        return np.concatenate([
            context_features,
            with_single_one(for_category)
        ])
    probs = model.predict([extended_features(i) for i in range(n_categories)])
    return list(probs).index(max(probs))


n_correct_predictions = 0
predicted = []
actual = []
for context_features, label in zip(features_test, labels_test):
    actual_class = list(label).index(max(label))
    pred = predict_class(context_features)
    n_correct_predictions += pred == actual_class
    predicted.append(pred)
    actual.append(actual_class)
recall = recall_score(actual, predicted, average='weighted')
precision = precision_score(actual, predicted, average='weighted')

def percent(p: float): return f'{100 * p:.2f} %'

print(f'''
accuracy  = {percent(n_correct_predictions / len(labels_test))}
recall    = {percent(recall)}
precision = {percent(precision)}

{n_correct_predictions=}, total={len(labels_test)}
''')

print(f'''
predictions:

zeros: {model.predict(np.zeros(N))}
ones:  {model.predict(np.ones(N))}
''')



accuracy  = 85.19 %
recall    = 85.19 %
precision = 84.97 %

n_correct_predictions=230, total=270


predictions:

zeros: 0.30499683978074726
ones:  0.4139912333952897

