In [109]:
import os
import json
from sklearn.model_selection import train_test_split
from collections import Counter

In [110]:
class AnnotationUsage:
    """One annotation usage taken from a single entry of a processing-result file.

    Only the annotation name is extracted for now; it is stored in
    ``annotation_name``.
    """

    def __init__(self, usage_json):
        """Build the usage from ``usage_json``, reading its ``"name"`` entry."""
        annotation_name = usage_json["name"]
        self.annotation_name = annotation_name
        # TODO other features extraction

    def __str__(self):
        """Return the annotation name formatted as a string."""
        return "{}".format(self.annotation_name)

class UsagesLoader:
    """Loads annotation usages from a directory tree of JSON processing results.

    Each ``*.json`` file found under ``processing_result_path`` is read as an
    object providing ``["keyInfo"]["name"]`` (used as the target type) and
    ``["usages"]`` (an iterable of usage objects).
    """

    def __init__(self, processing_result_path):
        """Store the root directory; nothing is read until ``load`` is called."""
        self.processing_result_path = processing_result_path

    def load(self):
        """Walk the result directory and group every usage by its target type.

        Returns:
            A dict mapping each target type name to the list of
            ``AnnotationUsage`` objects collected from all files that declare
            that target type. Files are visited in sorted order, so the order
            of the usages is the same on every run.

        Raises:
            json.JSONDecodeError: if a ``*.json`` file is not valid JSON.
            KeyError: if a parsed object lacks ``keyInfo``/``name``/``usages``.
        """
        usages_by_target = {}
        for root, dirs, files in os.walk(self.processing_result_path):
            # os.walk yields entries in filesystem order; sorting `dirs` in
            # place (and the file names below) makes the traversal, and hence
            # the order of the loaded samples, deterministic.
            dirs.sort()
            for file in sorted(files):
                # Require the real ".json" extension: a bare "json" suffix
                # would also match unrelated names such as "notes_json".
                if not file.endswith(".json"):
                    continue
                # JSON text is UTF-8; do not depend on the platform default.
                with open(os.path.join(root, file), "r", encoding="utf-8") as read_file:
                    data = json.load(read_file)
                target_type = data["keyInfo"]["name"]
                new_usages = [AnnotationUsage(usage_json) for usage_json in data["usages"]]
                # Extend in place instead of rebuilding the whole list per file.
                usages_by_target.setdefault(target_type, []).extend(new_usages)
        return usages_by_target

In [111]:
class Baseline:
    """Frequency baseline: always predicts the training labels ranked by count."""

    def __init__(self):
        """Start with an empty ranking; ``fit`` fills it in."""
        self.ordered_by_quantity = []

    def fit(self, X, y):
        """Rank the labels in ``y`` from most to least frequent.

        ``X`` is accepted for interface compatibility and is not used.
        """
        label_counts = Counter(y).most_common()
        self.ordered_by_quantity = [label for label, _count in label_counts]

    def predict(self, X):
        """Return the fitted ranking once per element of ``X``.

        Every entry of the result is the same list object held in
        ``ordered_by_quantity``.
        """
        ranking = self.ordered_by_quantity
        return [ranking for _sample in X]


In [112]:
class Metric:
    """Top-k accuracy over ranked predictions.

    ``orders[n]`` is the 1-based position of ``expected_y[n]`` inside the
    ranking ``predicted_y[n]``, or ``float("inf")`` when the expected label
    does not occur in that ranking at all.
    """

    def __init__(self, predicted_y, expected_y):
        """Compute the rank of every expected label in its predicted ranking.

        Args:
            predicted_y: indexable collection of rankings (any iterable of
                labels, best guess first), one per sample.
            expected_y: the true label of each sample.

        Raises:
            IndexError: if ``predicted_y`` has fewer entries than ``expected_y``.
        """
        self.orders = [
            self._rank(predicted_y[position], expected)
            for position, expected in enumerate(expected_y)
        ]

    @staticmethod
    def _rank(ranking, expected):
        """Return the 1-based index of ``expected`` in ``ranking``, or infinity."""
        try:
            return list(ranking).index(expected) + 1
        except ValueError:
            # A label that was never predicted must not count as a hit for any
            # k. The previous sentinel, len(ranking) + 1, was counted as a hit
            # whenever k exceeded the ranking length.
            return float("inf")

    def top_i(self, i):
        """Return the fraction of samples whose expected label is in the top ``i``.

        Returns ``0.0`` when there are no samples to score.
        """
        if not self.orders:
            return 0.0
        hits = sum(1 for rank in self.orders if rank <= i)
        return hits / len(self.orders)

In [113]:
# Root directory of the processing results. The default is a machine-specific
# absolute path; set the ANNOTATION_PROCESSING_RESULTS environment variable to
# point at another location without editing this cell.
processing_results_dir = os.environ.get(
    'ANNOTATION_PROCESSING_RESULTS',
    '/Users/danilbk/Programming/Kotlin/test/intellij-community/project-processing-results/processing/java/annotations/processing/0.0.0',
)
usages_loader = UsagesLoader(processing_results_dir)
# All usages grouped by target type; load_for_target reads this mapping.
usages_by_target_type = usages_loader.load()
def load_for_target(target_type, ignored_annotation=()):
    """Build the samples and labels for one target type.

    Looks up ``target_type`` in the module-level ``usages_by_target_type``
    mapping (an unknown target type yields empty results) and drops every
    usage whose ``annotation_name`` is contained in ``ignored_annotation``.

    Returns:
        ``(X, y)`` where ``X`` is the list of remaining usage objects and
        ``y`` is the list of their annotation names, in the same order.
    """
    X = [
        usage
        for usage in usages_by_target_type.get(target_type, [])
        if usage.annotation_name not in ignored_annotation
    ]
    y = [usage.annotation_name for usage in X]
    return X, y

In [119]:
def calculate(X, y, model, random_state=None, max_top=5):
    """Fit ``model`` on an 80/20 shuffled split and print its top-k accuracy.

    Args:
        X: samples.
        y: labels, aligned with ``X``.
        model: object exposing ``fit(X, y)`` and ``predict(X)``; ``predict``
            must return one ranking of labels per sample.
        random_state: forwarded to ``train_test_split``. The default ``None``
            keeps the split random on every call; pass an int to make the
            printed numbers reproducible.
        max_top: largest k to report; ``Top 1`` through ``Top max_top`` are
            printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, shuffle=True, random_state=random_state
    )

    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    metric = Metric(predicted, y_test)
    for i in range(1, max_top + 1):
        print(f'Top {i}: {metric.top_i(i)}')

In [115]:
# Target type names to evaluate. Each entry is passed to load_for_target,
# which uses it as a key into the loaded usages; names with no loaded usages
# are skipped by the evaluation loop.
# NOTE(review): these look like the java.lang.annotation.ElementType constants
# written in CamelCase — confirm against the processing-result format.
target_types = [
    'AnnotationType',
    'Constructor',
    'Field',
    'LocalVariable',
    'Method',
    'Module',
    'Package',
    'Parameter',
    'RecordComponent',
    'Type',
    'TypeParameter',
    'TypeUse'
]

In [120]:
# Evaluate the frequency baseline separately for every target type that has
# at least one loaded usage, printing a titled block of top-k scores for each.
for target_type in target_types:
    X, y = load_for_target(target_type)
    if X:
        print(target_type)
        calculate(X, y, Baseline())
        print()

AnnotationType
Top 1: 0.3670886075949367
Top 2: 0.759493670886076
Top 3: 0.8481012658227848
Top 4: 0.9240506329113924
Top 5: 0.9620253164556962

Constructor
Top 1: 0.35766423357664234
Top 2: 0.6277372262773723
Top 3: 0.781021897810219
Top 4: 0.8248175182481752
Top 5: 0.8832116788321168

Field
Top 1: 0.5415821501014199
Top 2: 0.7164300202839756
Top 3: 0.8679513184584179
Top 4: 0.9028397565922921
Top 5: 0.9148073022312373

LocalVariable
Top 1: 0.37472283813747226
Top 2: 0.5764966740576497
Top 3: 0.7760532150776053
Top 4: 0.8425720620842572
Top 5: 0.917960088691796

Method
Top 1: 0.5189996643168848
Top 2: 0.7306982208794898
Top 3: 0.8318563276267203
Top 4: 0.9288016112789527
Top 5: 0.9530547163477677

Package
Top 1: 0.4375
Top 2: 0.875
Top 3: 0.9375
Top 4: 0.9375
Top 5: 0.9375

Parameter
Top 1: 0.8380986249407302
Top 2: 0.9664058795637743
Top 3: 0.9826220957799905
Top 4: 0.9887624466571835
Top 5: 0.9928876244665719

Type
Top 1: 0.34265734265734266
Top 2: 0.6584185045723507
Top 3: 0.772458