In [2]:
from models.tree import CatBoost, RandomForest
from models.linear import LinearSVM, LogisticReg
from models.baseline import Baseline
from utils.usage_loader import initial_feature_names
from utils.encoder import encode_column
from sklearn.utils import shuffle
import numpy as np
from utils.usage_loader import UsagesLoader
from metric.calculate import calculate_and_print

In [5]:
jadx_loader = UsagesLoader(['./jadx/0.0.0'])
spring_loader = UsagesLoader(['./community/0.0.0'])

In [6]:
train_usages = list(filter(lambda x: 'java.lang.Override' not in x.annotation_name, spring_loader.load_all()))
test_usages = list(filter(lambda x: 'java.lang.Override' not in x.annotation_name, jadx_loader.load_all()))
train_classes = set(x.annotation_name for x in train_usages)
test_classes = set(x.annotation_name for x in test_usages)
classes = train_classes & test_classes
train_usages = list(filter(lambda x: x.annotation_name in classes, train_usages))
test_usages = list(filter(lambda x: x.annotation_name in classes, test_usages))
train_size = 100000
train_usages = shuffle(train_usages, random_state=123)[:train_size]
usages = train_usages + test_usages
raw_X = np.array([np.array(usage.features_list, dtype=object) for usage in usages])
X = None
all_new_names = []
for col in range(raw_X.shape[1]):
    new_columns, new_names = encode_column(raw_X[:, col], len(train_usages),
                                           initial_feature_names[col], 100)
    if new_columns is None:
        continue
    all_new_names += new_names
    if X is None:
        X = new_columns
    else:
        X = np.concatenate((X, new_columns), axis=1)
y = np.array([usage.annotation_name for usage in usages])

actual_train_size = len(X) - len(test_usages)
X_train = X[:actual_train_size]
y_train = y[:actual_train_size]
X_test = X[actual_train_size:]
y_test = y[actual_train_size:]

In [11]:
len(X_train[0])

922

In [14]:
len(X_train)

100000

In [13]:
len(X_test)

1470

In [9]:
model = CatBoost(task_type='GPU', early_stopping_rounds=20, verbose=True, iterations=500, learning_rate=0.05, depth=6)

calculate_and_print(X_train, X_test, y_train, y_test, Baseline())
# calculate_and_print(X_train, X_test, y_train, y_test, LinearSVM())
calculate_and_print(X_train, X_test, y_train, y_test, model)

Baseline
Count: 1470
Top 1: 0.07891156462585033
Top 2: 0.39591836734693875
Top 3: 0.4816326530612245
Top 4: 0.4857142857142857
Top 5: 0.49183673469387756
Top1 1: 0.16938775510204082
Mean: 4.7870748299319725
CatBoost
0:	learn: 2.3189525	test: 2.3181654	best: 2.3181654 (0)	total: 45.1ms	remaining: 22.5s
1:	learn: 2.0523154	test: 2.0508624	best: 2.0508624 (1)	total: 103ms	remaining: 25.7s
2:	learn: 1.8651133	test: 1.8632118	best: 1.8632118 (2)	total: 148ms	remaining: 24.5s
3:	learn: 1.7191796	test: 1.7168757	best: 1.7168757 (3)	total: 194ms	remaining: 24s
4:	learn: 1.6017635	test: 1.5992888	best: 1.5992888 (4)	total: 246ms	remaining: 24.3s
5:	learn: 1.5037090	test: 1.5009322	best: 1.5009322 (5)	total: 295ms	remaining: 24.3s
6:	learn: 1.4198698	test: 1.4169355	best: 1.4169355 (6)	total: 356ms	remaining: 25.1s
7:	learn: 1.3475767	test: 1.3444473	best: 1.3444473 (7)	total: 401ms	remaining: 24.6s
8:	learn: 1.2838502	test: 1.2805057	best: 1.2805057 (8)	total: 447ms	remaining: 24.4s
9:	learn: 1

In [9]:
top1 = []
top5 = []
mean = []

for _ in range(100):
    test_indexes = np.random.choice(len(X_test), 20000)
    test_X = X_test[test_indexes]
    test_y = y_test[test_indexes]
    predicted = model.predict(test_X)
    metric = Metric(predicted, test_y)
    print(metric.top_i(1))
    top1.append(metric.top_i(1))
    top5.append(metric.top_i(5))
    mean.append(metric.mean)
    

0.76935
0.7693
0.77245
0.77275
0.77005
0.7706
0.7725
0.7738
0.778
0.77655
0.77125
0.7741
0.7774
0.77285
0.77715
0.7718
0.7707
0.77025
0.7736
0.7731
0.7765
0.7753
0.7724
0.77505
0.77205
0.7753
0.7696
0.77165
0.77345
0.7725
0.7732
0.7736
0.77525
0.77265
0.775
0.7727
0.77015
0.76945
0.77075
0.77405
0.768
0.76985
0.7728
0.7689
0.76825
0.7718
0.77285
0.77415
0.76935
0.7685
0.7707
0.77155
0.77455
0.7717
0.77085
0.774
0.77025
0.7699
0.7714
0.77405
0.7786
0.7728
0.7737
0.77035
0.76625
0.76615
0.7729
0.77065
0.77355
0.7718
0.77645
0.77125
0.76925
0.7743
0.77225
0.7748
0.77385
0.7733
0.77575
0.7689
0.7711
0.7664
0.7757
0.7726
0.77585
0.77495
0.7689
0.77165
0.76635
0.77235
0.77115
0.77425
0.77475
0.7685
0.7706
0.7703
0.7693
0.76875
0.7725
0.77055


In [12]:
print(np.min(top1))
print(np.max(top1))
print(np.mean(top1))
print()

print(np.min(top5))
print(np.max(top5))
print(np.mean(top5))
print()

print(np.min(mean))
print(np.max(mean))
print(np.mean(mean))
print()

0.76615
0.7786
0.772149

0.9496
0.95675
0.9532459999999999

3.7667
4.55485
4.077229



In [13]:
baseline = Baseline()
calculate_and_print(X_train, X_test, y_train, y_test, baseline)

Baseline
Count: 200000
Top 1: 0.63683
Top 2: 0.78531
Top 3: 0.856845
Top 4: 0.86928
Top 5: 0.889555
Top1 1: 0.653615
Mean: 6.20635


In [7]:
top1_baseline = []
top5_baseline = []
mean_baseline = []
top1 = []
top5 = []
mean = []

for _ in range(100):
    test_indexes = np.random.choice(len(X_test), 20000)
    test_X = X_test[test_indexes]
    test_y = y_test[test_indexes]
    predicted = model.predict(test_X)
    metric = Metric(predicted, test_y)
    print(metric.mean)
    top1.append(metric.top_i(1))
    top5.append(metric.top_i(5))
    mean.append(metric.mean)
    predicted = baseline.predict(test_X)
    metric = Metric(predicted, test_y)
    print(metric.mean)
    top1_baseline.append(metric.top_i(1))
    top5_baseline.append(metric.top_i(5))
    mean_baseline.append(metric.mean)
    

3.3829
5.55945
3.7648
5.9068
3.5985
5.8749
3.6255
5.83575
3.7837
6.06735
3.73115
5.85795
3.88775
5.89795
3.6371
5.6898
3.7215
5.9152
3.483
5.7621
3.7399
5.80455
3.62365
5.8675
3.71185
5.93185
3.83585
5.88895
3.7591
5.9235
3.65545
5.86305
3.86985
5.9291
3.65935
5.78605
3.526
5.70625
3.79325
5.88505
3.6475
5.80625
3.5905
5.795
3.7049
5.7995
3.7015
6.08215
3.7081
5.90395
3.6609
5.77195
3.56685
5.79885
3.6507
5.8057
3.74585
5.85585
3.9991
6.10025
3.7478
5.98295
3.61575
5.7564
3.61625
5.8378
3.59665
5.85155
3.9469
5.98805
3.62795
5.8766
3.53655
5.8032
3.48815
5.6283
3.5851
5.73245
3.8029
5.96765
3.5964
5.72085
3.7038
5.75705
3.46445
5.7726
3.7272
5.78785
3.7117
5.73465
3.74065
5.8923
3.9311
6.15395
3.71235
5.94205
3.78575
6.0679
3.3908
5.55325
3.5126
5.6097
3.66875
5.7649
3.85195
5.99445
3.6169
5.8383
3.5379
5.7118
3.53605
5.80515
3.525
5.70125
3.5969
5.7241
3.69625
5.93235
3.7129
5.8992
3.64055
6.023
3.67895
5.9852
3.56585
5.71505
3.8244
5.9664
3.7661
5.89205
3.78385
5.8507
3.4511
5.67865


In [8]:
print(np.min(top1))
print(np.max(top1))
print(np.mean(top1))
print()

print(np.min(top1_baseline))
print(np.max(top1_baseline))
print(np.mean(top1_baseline))
print()

print(np.min(top5))
print(np.max(top5))
print(np.mean(top5))
print()

print(np.min(top5_baseline))
print(np.max(top5_baseline))
print(np.mean(top5_baseline))
print()

print(np.min(mean))
print(np.max(mean))
print(np.mean(mean))
print()

print(np.min(mean_baseline))
print(np.max(mean_baseline))
print(np.mean(mean_baseline))
print()

0.7679
0.78045
0.7740835000000003

0.61145
0.6271
0.619912

0.95225
0.95915
0.9551505

0.80015
0.81205
0.806124

3.3829
3.9991
3.665250999999999

5.55325
6.15395
5.840501000000001



In [13]:
def interval(l, percent):
    x = int(round(len(l) / 100 * 5))
    return sorted(l)[x:-x]

top1_interval = interval(top1, 5)
top1_baseline_interval = interval(top1_baseline, 5)
top5_interval = interval(top5, 5)
top5_baseline_interval = interval(top5_baseline, 5)
mean_interval = interval(mean, 5)
mean_baseline_interval = interval(mean_baseline, 5)

In [14]:
print(np.min(top1_interval))
print(np.max(top1_interval))
print(np.mean(top1_interval))
print()

print(np.min(top1_baseline_interval))
print(np.max(top1_baseline_interval))
print(np.mean(top1_baseline_interval))
print()

print(np.min(top5_interval))
print(np.max(top5_interval))
print(np.mean(top5_interval))
print()

print(np.min(top5_baseline_interval))
print(np.max(top5_baseline_interval))
print(np.mean(top5_baseline_interval))
print()

print(np.min(mean_interval))
print(np.max(mean_interval))
print(np.mean(mean_interval))
print()

print(np.min(mean_baseline_interval))
print(np.max(mean_baseline_interval))
print(np.mean(mean_baseline_interval))
print()

0.76975
0.77895
0.7740783333333334

0.61445
0.6256
0.619965

0.95325
0.9578
0.9550966666666667

0.80165
0.81045
0.8061355555555555

3.4638
3.85195
3.6638205555555556

5.6261
6.0679
5.840757222222222

