In [2]:
from models.tree import CatBoost
from models.baseline import Baseline
from utils.usage_loader import initial_feature_names
from utils.encoder import encode_column
from sklearn.utils import shuffle
import numpy as np
import random
from utils.usage_loader import UsagesLoader
from metric.calculate import calculate_and_print
from pathlib import PurePath
from metric.metric import Metric

In [3]:
# Build the loader that supplies every usage record consumed below.
# NOTE(review): 'community/0.0.0' is presumably a dataset/project identifier
# understood by UsagesLoader -- confirm against utils.usage_loader.
usage_loader = UsagesLoader(['community/0.0.0'])

In [4]:
# --- Load usages and build a module-level train/test split -------------------
# Usages whose annotation name contains 'java.lang.Override' are dropped up front.
usages = [usage for usage in usage_loader.load_all()
          if 'java.lang.Override' not in usage.annotation_name]
train_usages = usages
test_usages = []

# Dedicated, seeded generator: the choice of held-out modules is now
# reproducible (same seed as the shuffle below) and does not depend on, or
# disturb, the global `random` state.
split_rng = random.Random(123)

# Move whole modules (first three path components) from train to test until at
# most 100000 training usages remain, so no module appears on both sides.
while len(train_usages) > 100000:
    random_usage = split_rng.choice(train_usages)
    module = '/'.join(PurePath(random_usage.file_path).parts[:3])
    print(module)
    # Single partition pass instead of two separate filters over the same list.
    # NOTE(review): this is a plain string-prefix match, so a sibling directory
    # whose name merely starts with `module` is moved to test as well; it also
    # assumes file_path is relative and '/'-separated -- confirm.
    remaining_usages = []
    for usage in train_usages:
        if usage.file_path.startswith(module):
            test_usages.append(usage)
        else:
            remaining_usages.append(usage)
    train_usages = remaining_usages
    print(len(train_usages))
print(len(train_usages))
print(len(test_usages))

# Cap the test set at 200000 usages, chosen by a seeded shuffle.
test_usages = shuffle(test_usages, random_state=123)[:200000]

# Train rows first, test rows last: the slicing at the bottom relies on this order.
usages = train_usages + test_usages
raw_X = np.array([np.array(usage.features_list, dtype=object) for usage in usages])

# Encode each raw feature column; columns for which encode_column returns
# None are skipped.
# NOTE(review): the second argument is presumably the number of leading rows
# the encoder may fit on (train only) and 100 a cardinality/frequency limit --
# confirm against utils.encoder.encode_column.
encoded_blocks = []
all_new_names = []
for col in range(raw_X.shape[1]):
    new_columns, new_names = encode_column(raw_X[:, col], len(train_usages),
                                           initial_feature_names[col], 100)
    if new_columns is None:
        continue
    all_new_names += new_names
    encoded_blocks.append(new_columns)

if not encoded_blocks:
    raise ValueError('encode_column produced no feature columns')
# Concatenate once instead of re-copying the growing matrix on every column.
X = np.concatenate(encoded_blocks, axis=1)
y = np.array([usage.annotation_name for usage in usages])
assert len(X) == len(y), 'feature matrix and labels must have the same number of rows'

actual_train_size = len(X) - len(test_usages)
X_train = X[:actual_train_size]
y_train = y[:actual_train_size]
X_test = X[actual_train_size:]
y_test = y[actual_train_size:]

plugins/tasks/tasks-core
419516
plugins/lombok/src
417375
android/android-adb/src
417221
platform/lang-impl/src
388037
platform/vcs-log/impl
384362
platform/external-system-impl/src
382082
java/java-psi-api/src
378522
platform/platform-impl/src
355737
java/idea-ui/src
352581
plugins/maven/src
347612
java/java-impl/src
335656
android/gradle-dsl/src
332085
plugins/kotlin/refIndex
330266
plugins/kotlin/idea
308031
jps/jps-builders/src
306574
android/android/src
282822
plugins/stream-debugger/src
282261
platform/core-impl/src
276565
java/java-impl-inspections/src
273368
platform/vcs-impl/src
264289
platform/editor-ui-api/src
262565
platform/xdebugger-impl/src
259932
java/java-indexing-api/src
259650
plugins/javaFX/src
258739
java/java-psi-impl/src
252399
java/java-analysis-impl/src
245583
platform/util/src
235923
android/profilers-android/src
235474
plugins/svn4idea/src
231135
platform/diff-impl/src
226010
platform/util/ui
223488
plugins/kotlin/analysis-api-providers-ide-impl
223426
platfo

In [5]:
# Display the number of rows in the training split.
len(X_train)

99272

In [6]:
# Hyperparameters for the gradient-boosted model, gathered in one place.
catboost_params = dict(
    task_type='GPU',
    early_stopping_rounds=20,
    verbose=True,
    iterations=500,
    learning_rate=0.05,
)
model = CatBoost(**catboost_params)
baseline = Baseline()

# Evaluate both predictors on the same split: the baseline first, then the model.
for estimator in (baseline, model):
    calculate_and_print(X_train, X_test, y_train, y_test, estimator)

Baseline
Count: 200000
Top 1: 0.61997
Top 2: 0.764235
Top 3: 0.78384
Top 4: 0.79652
Top 5: 0.80636
Top1 1: 0.63647
Mean: 5.839775
CatBoost
0:	learn: 2.1171583	test: 1.6048590	best: 1.6048590 (0)	total: 1.01s	remaining: 8m 24s
1:	learn: 1.7725536	test: 1.4503720	best: 1.4503720 (1)	total: 1.91s	remaining: 7m 55s
2:	learn: 1.6324830	test: 1.3288597	best: 1.3288597 (2)	total: 2.84s	remaining: 7m 49s
3:	learn: 1.5352651	test: 1.2118342	best: 1.2118342 (3)	total: 3.74s	remaining: 7m 43s
4:	learn: 1.4787783	test: 1.1862038	best: 1.1862038 (4)	total: 4.6s	remaining: 7m 35s
5:	learn: 1.4318220	test: 1.1512910	best: 1.1512910 (5)	total: 5.52s	remaining: 7m 34s
6:	learn: 1.3943146	test: 1.1345995	best: 1.1345995 (6)	total: 6.38s	remaining: 7m 29s
7:	learn: 1.3617711	test: 1.1162525	best: 1.1162525 (7)	total: 7.24s	remaining: 7m 25s
8:	learn: 1.3359176	test: 1.1062280	best: 1.1062280 (8)	total: 8.2s	remaining: 7m 27s
9:	learn: 1.3122179	test: 1.0873904	best: 1.0873904 (9)	total: 9.09s	remaining: 

In [9]:
# Bootstrap the model's test metrics: 100 resamples of 20000 test rows each,
# drawn with replacement (the default for `choice`).
top1 = []
top5 = []
mean = []

# Seeded, private generator so the resamples (and therefore the reported
# min/max/mean) are reproducible across kernel restarts.
bootstrap_rng = np.random.RandomState(123)

for _ in range(100):
    test_indexes = bootstrap_rng.choice(len(X_test), 20000)
    test_X = X_test[test_indexes]
    test_y = y_test[test_indexes]
    predicted = model.predict(test_X)
    metric = Metric(predicted, test_y)
    # Compute top-1 once and reuse it for both the progress print and the record.
    top1_score = metric.top_i(1)
    print(top1_score)
    top1.append(top1_score)
    top5.append(metric.top_i(5))
    mean.append(metric.mean)
    

0.76935
0.7693
0.77245
0.77275
0.77005
0.7706
0.7725
0.7738
0.778
0.77655
0.77125
0.7741
0.7774
0.77285
0.77715
0.7718
0.7707
0.77025
0.7736
0.7731
0.7765
0.7753
0.7724
0.77505
0.77205
0.7753
0.7696
0.77165
0.77345
0.7725
0.7732
0.7736
0.77525
0.77265
0.775
0.7727
0.77015
0.76945
0.77075
0.77405
0.768
0.76985
0.7728
0.7689
0.76825
0.7718
0.77285
0.77415
0.76935
0.7685
0.7707
0.77155
0.77455
0.7717
0.77085
0.774
0.77025
0.7699
0.7714
0.77405
0.7786
0.7728
0.7737
0.77035
0.76625
0.76615
0.7729
0.77065
0.77355
0.7718
0.77645
0.77125
0.76925
0.7743
0.77225
0.7748
0.77385
0.7733
0.77575
0.7689
0.7711
0.7664
0.7757
0.7726
0.77585
0.77495
0.7689
0.77165
0.76635
0.77235
0.77115
0.77425
0.77475
0.7685
0.7706
0.7703
0.7693
0.76875
0.7725
0.77055


In [12]:
# For each collected series print its min, max and mean, then a blank line.
for scores in (top1, top5, mean):
    for summarize in (np.min, np.max, np.mean):
        print(summarize(scores))
    print()

0.76615
0.7786
0.772149

0.9496
0.95675
0.9532459999999999

3.7667
4.55485
4.077229



In [13]:
# Create a fresh Baseline predictor and run it through the same
# fit/evaluate/print helper, on the same split, as the model above.
baseline = Baseline()
calculate_and_print(X_train, X_test, y_train, y_test, baseline)

Baseline
Count: 200000
Top 1: 0.63683
Top 2: 0.78531
Top 3: 0.856845
Top 4: 0.86928
Top 5: 0.889555
Top1 1: 0.653615
Mean: 6.20635


In [7]:
# Bootstrap test metrics for the model and the baseline on identical resamples:
# 100 draws of 20000 test rows each, with replacement (the default for `choice`).
top1_baseline = []
top5_baseline = []
mean_baseline = []
top1 = []
top5 = []
mean = []

# Seeded, private generator so the resamples are reproducible across runs.
bootstrap_rng = np.random.RandomState(123)

# Each predictor is paired with the lists that collect its scores; the model
# is scored first and the baseline second, so the printed means alternate
# model / baseline.
scoreboards = (
    (model, top1, top5, mean),
    (baseline, top1_baseline, top5_baseline, mean_baseline),
)

for _ in range(100):
    test_indexes = bootstrap_rng.choice(len(X_test), 20000)
    test_X = X_test[test_indexes]
    test_y = y_test[test_indexes]
    for predictor, top1_scores, top5_scores, mean_scores in scoreboards:
        predicted = predictor.predict(test_X)
        metric = Metric(predicted, test_y)
        print(metric.mean)
        top1_scores.append(metric.top_i(1))
        top5_scores.append(metric.top_i(5))
        mean_scores.append(metric.mean)
    

3.3829
5.55945
3.7648
5.9068
3.5985
5.8749
3.6255
5.83575
3.7837
6.06735
3.73115
5.85795
3.88775
5.89795
3.6371
5.6898
3.7215
5.9152
3.483
5.7621
3.7399
5.80455
3.62365
5.8675
3.71185
5.93185
3.83585
5.88895
3.7591
5.9235
3.65545
5.86305
3.86985
5.9291
3.65935
5.78605
3.526
5.70625
3.79325
5.88505
3.6475
5.80625
3.5905
5.795
3.7049
5.7995
3.7015
6.08215
3.7081
5.90395
3.6609
5.77195
3.56685
5.79885
3.6507
5.8057
3.74585
5.85585
3.9991
6.10025
3.7478
5.98295
3.61575
5.7564
3.61625
5.8378
3.59665
5.85155
3.9469
5.98805
3.62795
5.8766
3.53655
5.8032
3.48815
5.6283
3.5851
5.73245
3.8029
5.96765
3.5964
5.72085
3.7038
5.75705
3.46445
5.7726
3.7272
5.78785
3.7117
5.73465
3.74065
5.8923
3.9311
6.15395
3.71235
5.94205
3.78575
6.0679
3.3908
5.55325
3.5126
5.6097
3.66875
5.7649
3.85195
5.99445
3.6169
5.8383
3.5379
5.7118
3.53605
5.80515
3.525
5.70125
3.5969
5.7241
3.69625
5.93235
3.7129
5.8992
3.64055
6.023
3.67895
5.9852
3.56585
5.71505
3.8244
5.9664
3.7661
5.89205
3.78385
5.8507
3.4511
5.67865


In [8]:
# Print min, max and mean of every series, model before baseline for each
# metric, with a blank line after each series.
for scores in (top1, top1_baseline,
               top5, top5_baseline,
               mean, mean_baseline):
    for summarize in (np.min, np.max, np.mean):
        print(summarize(scores))
    print()

0.7679
0.78045
0.7740835000000003

0.61145
0.6271
0.619912

0.95225
0.95915
0.9551505

0.80015
0.81205
0.806124

3.3829
3.9991
3.665250999999999

5.55325
6.15395
5.840501000000001



In [13]:
def interval(l, percent):
    """Return the sorted values of `l` with `percent` % trimmed from each end.

    Args:
        l: sequence of comparable values (e.g. bootstrap scores).
        percent: share of the values, in percent, to drop from EACH tail.
            Negative values are treated as 0.

    Returns:
        A new sorted list without the `round(len(l) * percent / 100)` smallest
        and largest values. If nothing is to be trimmed, all values are
        returned; if the two tails cover the whole input, the list is empty.
    """
    x = max(int(round(len(l) / 100 * percent)), 0)
    # Use an explicit end index: `[x:-x]` collapses to an empty slice when x == 0.
    return sorted(l)[x:len(l) - x]

# Apply interval(..., 5) to every bootstrap series, model before baseline for
# each metric.
(top1_interval, top1_baseline_interval,
 top5_interval, top5_baseline_interval,
 mean_interval, mean_baseline_interval) = (
    interval(samples, 5)
    for samples in (top1, top1_baseline,
                    top5, top5_baseline,
                    mean, mean_baseline)
)

In [14]:
# Print min, max and mean of every interval series, with a blank line after
# each series.
for trimmed in (top1_interval, top1_baseline_interval,
                top5_interval, top5_baseline_interval,
                mean_interval, mean_baseline_interval):
    for summarize in (np.min, np.max, np.mean):
        print(summarize(trimmed))
    print()

0.76975
0.77895
0.7740783333333334

0.61445
0.6256
0.619965

0.95325
0.9578
0.9550966666666667

0.80165
0.81045
0.8061355555555555

3.4638
3.85195
3.6638205555555556

5.6261
6.0679
5.840757222222222

