In [11]:
from sklearn.model_selection import train_test_split
from metric.calculate import calculate, calculate_and_print
import pickle
from sklearn.linear_model import LogisticRegression
from models.linear import SVM, LinearSVM, LogisticReg
from models.tree import CatBoost, RandomForest, GradientBoosting
from models.baseline import Baseline
from models.binary.tree import CatBoostBinary, RandomForestBinary, GradientBoostingBinary
import matplotlib.pyplot as plt
from collections import Counter
import warnings
from utils.usage_loader import initial_feature_names
from utils.encoder import encode_column
from sklearn.utils import shuffle
import numpy as np
import random
from utils.usage_loader import UsagesLoader

In [2]:
# Restore the previously pickled loader object from disk.
# NOTE(review): unpickling executes arbitrary code from the file, so this is
# only safe while 'data/usage_loader.pickle' comes from a trusted source.
with open('data/usage_loader.pickle', 'rb') as pickle_file:
    usage_loader = pickle.load(pickle_file)

In [35]:
# Build a UsagesLoader over a single directory on the local disk.
# NOTE(review): the path is absolute and user-specific, so this cell only runs
# on the machine it was written on.
spring_usage_dirs = ['/Users/danilbk/Desktop/0.0.0']
spring_loader = UsagesLoader(spring_usage_dirs)

In [36]:
# Build a UsagesLoader over the processing results of a local jadx checkout.
# NOTE(review): the path is absolute and user-specific, so this cell only runs
# on the machine it was written on.
jadx_usage_dirs = [
    '/Users/danilbk/Programming/Java/test/jadx/jadx/project-processing-results/processing/java/annotations/processing/0.0.0',
]
jadx_loader = UsagesLoader(jadx_usage_dirs)

In [32]:
# Collect the annotation name of every loaded usage whose annotation name
# starts with the 'org.junit.jupiter' prefix.
s = [
    usage.annotation_name
    for usage in usage_loader.load_all()
    if usage.annotation_name.startswith('org.junit.jupiter')
]
# Display the number of matches: the recorded output of this cell is a single
# count, which the cell can only reproduce if it ends with an expression.
len(s)

19439

In [5]:
# Pull a fixed-size sample out of the loader; the call returns the feature
# rows, the labels and a third value bound to `names`.
# NOTE(review): `state` is presumably the sampling seed — confirm in UsagesLoader.load.
X, y, names = usage_loader.load(
    size=10000,
    state=42,
    train_fraction=0.6,
)

In [6]:
# 80/20 hold-out split. shuffle=False keeps the original row order, so the
# split is a plain cut: the leading 80% of rows train, the trailing 20% test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=False,
)

In [7]:
# Evaluate the multi-class models on the same split, one after another.
# Each model is constructed right before its own evaluation.
# The binary variants (CatBoostBinary, RandomForestBinary) are deliberately
# left out of this comparison.
for model_cls in (Baseline, CatBoost, RandomForest):
    calculate_and_print(X_train, X_test, y_train, y_test, model_cls())


Baseline
Count: 2000
Top 1: 0.4645
Top 2: 0.7265
Top 3: 0.832
Top 4: 0.8895
Top 5: 0.903
Top1 1: 0.7355
Mean: 4.0535
CatBoost
Count: 2000
Top 1: 0.8215
Top 2: 0.927
Top 3: 0.9485
Top 4: 0.956
Top 5: 0.9605
Top1 1: 0.85
Mean: 2.481
Random forest
Count: 2000
Top 1: 0.8255
Top 2: 0.922
Top 3: 0.9445
Top 4: 0.9545
Top 5: 0.963
Top1 1: 0.852
Mean: 2.651


In [68]:
from catboost import CatBoostClassifier, Pool, cv

In [58]:
# Wrap the training split in a CatBoost Pool so it can be passed to cv().
pool = Pool(
    data=X_train,
    label=y_train,
)

In [99]:
# CatBoost settings used for the cross-validation run below:
# 1000 boosting iterations, trees of depth 6, multi-class loss, no per-iteration log.
params = dict(
    iterations=1000,
    depth=6,
    loss_function="MultiClass",
    verbose=False,
)

In [100]:
# Cross-validate CatBoost on the training pool (with the live metric plot),
# then locate the iteration with the lowest mean test loss.
cv_data = cv(pool, params=params, plot=True)
test_loss_mean = cv_data['test-MultiClass-mean']
best_iter = np.argmin(test_loss_mean)
# Last expression of the cell: display the index of the best iteration.
best_iter

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



999

In [101]:
# Display the per-iteration cross-validation metrics table held in cv_data.
cv_data

Unnamed: 0,iterations,test-MultiClass-mean,test-MultiClass-std,train-MultiClass-mean,train-MultiClass-std
0,0,3.537994,0.033881,3.534442,0.050813
1,1,3.209796,0.016207,3.206667,0.049306
2,2,2.994074,0.008086,2.990311,0.053129
3,3,2.821829,0.010394,2.818621,0.053923
4,4,2.681456,0.014723,2.677982,0.049678
...,...,...,...,...,...
995,995,0.798515,0.220257,0.493800,0.069924
996,996,0.798455,0.220367,0.493611,0.069901
997,997,0.798418,0.220389,0.493346,0.069914
998,998,0.798410,0.220380,0.493214,0.069998


In [134]:
# Multi-class CatBoost classifier: at most 1000 iterations, with
# early_stopping_rounds=20 configured for use with an eval_set during fit.
model = CatBoostClassifier(
    iterations=1000,
    loss_function='MultiClass',
    early_stopping_rounds=20,
)

In [149]:
# Pools for fitting with a separate validation set.
# NOTE(review): X_validate / y_validate are not defined in any visible cell of
# this notebook — they must come from kernel state; confirm where the
# validation split is created before relying on Restart & Run All.
train_pool = Pool(data=X_train, label=y_train)
validate_pool = Pool(data=X_validate, label=y_validate)

In [147]:
# Labels that occur in the validation split but never in the training split;
# an empty set means every validation label was seen during training.
set(y_validate).difference(set(y_train))

set()

In [150]:
# Train on the training pool while tracking the validation pool (with the live
# metric plot), then display the model's score on the held-out test split.
model.fit(
    train_pool,
    eval_set=validate_pool,
    plot=True,
)
model.score(X_test, y_test)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.111916
0:	learn: 1.7702613	test: 1.7203284	best: 1.7203284 (0)	total: 246ms	remaining: 4m 6s
1:	learn: 1.4452155	test: 1.4197730	best: 1.4197730 (1)	total: 406ms	remaining: 3m 22s
2:	learn: 1.3229678	test: 1.3051459	best: 1.3051459 (2)	total: 571ms	remaining: 3m 9s
3:	learn: 1.2256539	test: 1.2092752	best: 1.2092752 (3)	total: 725ms	remaining: 3m
4:	learn: 1.1704371	test: 1.1563472	best: 1.1563472 (4)	total: 927ms	remaining: 3m 4s
5:	learn: 1.1236253	test: 1.1124478	best: 1.1124478 (5)	total: 1.08s	remaining: 2m 59s
6:	learn: 1.0915321	test: 1.0804660	best: 1.0804660 (6)	total: 1.25s	remaining: 2m 57s
7:	learn: 1.0595161	test: 1.0492349	best: 1.0492349 (7)	total: 1.43s	remaining: 2m 57s
8:	learn: 1.0370630	test: 1.0274253	best: 1.0274253 (8)	total: 1.61s	remaining: 2m 57s
9:	learn: 1.0135506	test: 1.0064138	best: 1.0064138 (9)	total: 1.77s	remaining: 2m 55s
10:	learn: 0.9960663	test: 0.9908941	best: 0.9908941 (10)	total: 2.02s	remaining: 3m 1s
11:	learn: 0.974851

0.8225

1000

In [5]:
# Repeated hold-out evaluation of CatBoost: draw `iterations` samples from the
# loader, evaluate a fresh model on each, and report the averaged metrics.

# Dedicated, seeded generator so the sequence of sampling states (and therefore
# the reported averages) can be reproduced on a re-run.
rnd = random.Random(42)
top1 = 0
top5 = 0
mean = 0
iterations = 10
for i in range(iterations):
    print(i)
    X, y, feature_names = usage_loader.load(size=3000, state=rnd.randint(1, 10000), train_fraction=0.8)
    # Length of the first row of this sample (it varies between samples).
    print(len(X[0]))
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False)
    # The model is passed to `calculate` unfitted. It must NOT be fitted on the
    # full (X, y) beforehand: that data includes the test rows, so such a fit
    # is either wasted work or a test-set leak.
    # NOTE(review): this relies on `calculate` training the model itself, as
    # `calculate_and_print` evidently does for the freshly constructed models
    # passed to it elsewhere in this notebook — confirm.
    model = CatBoost()
    metric = calculate(X_train, X_test, y_train, y_test, model)
    top1 += metric.top_i(1)
    top5 += metric.top_i(5)
    mean += metric.mean
# Averages over all runs.
print(f'Top 1: {top1 / iterations}')
print(f'Top 5: {top5 / iterations}')
print(f'Mean: {mean / iterations}')

0
843


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

1
868
2
882
3
851
4
874
5
844
6
865
7
865
8
858
9
862
Top 1: 0.8011666666666667
Top 5: 0.9523333333333334
Mean: 2.6445


In [6]:
# Draw one more randomly-seeded sample and display the third returned value,
# the list bound to `feature_names`.
X, y, feature_names = usage_loader.load(
    size=3000,
    state=random.randint(1, 10000),
    train_fraction=0.8,
)
feature_names

['targetName_action',
 'targetName_actions',
 'targetName_add',
 'targetName_all',
 'targetName_annotation',
 'targetName_build',
 'targetName_by',
 'targetName_call',
 'targetName_change',
 'targetName_changed',
 'targetName_check',
 'targetName_child',
 'targetName_class',
 'targetName_code',
 'targetName_com',
 'targetName_comment',
 'targetName_component',
 'targetName_constructor',
 'targetName_content',
 'targetName_context',
 'targetName_create',
 'targetName_data',
 'targetName_default',
 'targetName_description',
 'targetName_do',
 'targetName_editor',
 'targetName_element',
 'targetName_error',
 'targetName_event',
 'targetName_expression',
 'targetName_family',
 'targetName_field',
 'targetName_file',
 'targetName_find',
 'targetName_for',
 'targetName_from',
 'targetName_get',
 'targetName_group',
 'targetName_icon',
 'targetName_id',
 'targetName_in',
 'targetName_info',
 'targetName_inspection',
 'targetName_intellij',
 'targetName_is',
 'targetName_java',
 'targetName_ke

In [None]:
# Large-sample run: load 100000 items with a fixed state, cut an ordered 80/20
# split, then evaluate the baseline and a GPU-backed CatBoost on the same split.
X, y, feature_names = usage_loader.load(
    size=100000,
    state=123,
    train_fraction=0.8,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=False,
)
split = (X_train, X_test, y_train, y_test)
calculate_and_print(*split, Baseline())
calculate_and_print(*split, CatBoost(task_type='GPU', verbose=True))