In [1]:
import os
import json
from sklearn.model_selection import train_test_split
from collections import Counter
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import re

In [2]:
class AnnotationUsage:
    def __init__(self, usage_json):
        self.annotation_name = usage_json['name']
        features_json = usage_json['features']
        self.features_list = [
            features_json.get('targetName', '')
        ]

    def __str__(self):
        return f'{self.annotation_name}'


# "features": {
#     "targetName": "getExtensions",
#     "className": "ExtensionsSupport",
#     "otherMethodsNames": [
#         "ExtensionsSupport"
#     ],
#     "otherMethodsAnnotations": [],
#     "returnType": "PsiType:Collection\u003cT\u003e",
#     "methodModifiers": [
#         "public"
#     ],
#     "otherAnnotations": []
# },
# "filePath": "jps/jps-builders-6/src/org/jetbrains/jps/ExtensionsSupport.java",
# "textOffset": 584

class UsagesLoader:
    def __init__(self, processing_result_path):
        self.processing_result_path = processing_result_path

    def load(self):
        usages_by_target = defaultdict(list)
        for root, dirs, files in os.walk(self.processing_result_path):
            for file in files:
                if not file.endswith('json'):
                    continue
                with open(os.path.join(root, file), 'r') as read_file:
                    data = json.load(read_file)
                    target_type = data['keyInfo']['name']
                    new_usages = [AnnotationUsage(usage_json) for usage_json in data["usages"]]
                    usages_by_target[target_type] = usages_by_target[target_type] + new_usages
        return usages_by_target

In [3]:
class Baseline:
    def __init__(self):
        self.ordered_by_quantity = np.array([])

    def fit(self, X, y):
        self.ordered_by_quantity = np.array([x[0] for x in Counter(y).most_common()])

    def predict(self, X):
        return np.array([self.ordered_by_quantity for _ in X])


In [4]:
class Metric:
    def __init__(self, predicted_y, expected_y):
        self.orders = [(list(predicted_y[i]) + [expected_y[i]]).index(expected_y[i]) + 1 for i in range(len(expected_y))]

    def top_i(self, i):
        return sum(map(lambda x: x <= i, self.orders)) / len(self.orders)

In [5]:
def encode_names(column):
    """
    Converts column of camelCase names to n (number of unique words in names) columns
    with 1 (if name contains word) and 0 (otherwise)
    """

    def split_camel_case(x):
        words = [word.lower() for word in re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)', x)]
        return ' '.join(words)

    vectorizer = CountVectorizer(preprocessor=split_camel_case)
    new_columns = vectorizer.fit_transform(column).toarray()
    selections = np.count_nonzero(new_columns, axis=0) > 100
    if not any(selections):
        return new_columns
    else:
        return new_columns[:, selections]


encode_names(np.array(['getSomethingGood', 'returnSomething']))

array([[1, 1, 0, 1],
       [0, 0, 1, 1]])

In [6]:
usages_loader = UsagesLoader(
    '/Users/danilbk/Programming/Kotlin/test/intellij-community/project-processing-results/processing/java/annotations/processing/0.0.0')
usages_by_target_type = usages_loader.load()

In [7]:
def load_for_target(target_type, ignored_annotation=()):
    method_usages = list(
        filter(lambda x: x.annotation_name not in ignored_annotation, usages_by_target_type[target_type]))
    raw_X = np.array([np.array(usage.features_list) for usage in method_usages])
    X = None
    if len(raw_X) == 0:
        X = []
    else:
        for col in range(raw_X.shape[1]):
            new_columns = encode_names(raw_X[:, col])
            if X is None:
                X = new_columns
            else:
                X = np.concatenate(X, new_columns, axis=1)
    y = [usage.annotation_name for usage in method_usages]
    return X, y

In [8]:
def calculate(X, y, model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.05, shuffle=True, random_state=42)

    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    metric = Metric(predicted, y_test)
    print(f'Count: {len(y_test)}')
    for i in range(1, 6):
        print(f'Top {i}: {metric.top_i(i)}')

In [9]:
target_types = [
    'AnnotationType',
    'Constructor',
    'Field',
    'LocalVariable',
    'Method',
    'Module',
    'Package',
    'Parameter',
    'RecordComponent',
    'Type',
    'TypeParameter',
    'TypeUse'
]

In [10]:
from sklearn.svm import SVC


class SVM:
    def __init__(self):
        self.model = SVC(probability=True, kernel='linear')

    def fit(self, X, y):
        self.model.fit(X, y)
        print(X.shape)

    def predict(self, X):
        def transform(x):
            indexes = np.argsort(x)
            return self.model.classes_[indexes]
        return np.array([transform(x) for x in self.model.predict_proba(X)])


In [11]:
for target_type in ['Method']:
    X, y = load_for_target(target_type)
    if len(X) == 0:
        continue
    print(target_type)
    calculate(X, y, Baseline())
    calculate(X, y, SVM())
    print()

Method
Count: 283004
Top 1: 0.5166923435711156
Top 2: 0.7280639142909641
Top 3: 0.830323953018332
Top 4: 0.9284709756752555
Top 5: 0.9523646308886093
(14894, 938)
Count: 283004
Top 1: 6.713686025639214e-05
Top 2: 0.00012013964466933329
Top 3: 0.00023674577037780385
Top 4: 0.00024381280829952932
Top 5: 0.0002579468841429803

