In [1]:
import numpy as np

import matplotlib as mpl
mpl.rcParams['savefig.dpi'] = 100
mpl.rcParams['figure.dpi'] = 100

import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import string
import mplcursors
import collections
import sklearn.cluster
import os
from tqdm import tqdm_notebook
#%matplotlib inline
%matplotlib notebook
#%matplotlib notebook

In [2]:
# NEW IMPLEMENTATION

import math

def get_intersections(hist1, hist2):
    len1 = len(hist1)
    len2 = len(hist2)
    intersections = []
    i = 0
    j = 0

    while i < len1 and j < len2:
        if hist1[i][0] < hist2[j][0]:
            i += 1
        elif hist1[i][0] == hist2[j][0]:
            intersections.append((hist1[i][0], hist1[i][1], hist2[j][1]))
            i += 1
            j += 1
        elif hist1[i][0] > hist2[j][0]:
            j += 1
            
    return intersections

def get_union_inds(hist1, hist2):
    len1 = len(hist1)
    len2 = len(hist2)
    union = []
    i = 0
    j = 0

    while i < len1 or j < len2:
        if i >= len1:
            union.append(hist2[j][0])
            j += 1
        elif j >= len2:
            union.append(hist1[i][0])
            i += 1
        elif hist1[i][0] < hist2[j][0]:
            union.append(hist1[i][0])
            i += 1
        elif hist1[i][0] == hist2[j][0]:
            union.append(hist1[i][0])
            i += 1
            j += 1
        elif hist1[i][0] > hist2[j][0]:
            union.append(hist2[j][0])
            j += 1
            
    return union


def jaccard_metric(hist1, hist2, hist_len=None):
    intersections = get_intersections(hist1, hist2)

    metric = 0

    for ind, v1, v2 in intersections:
        mx = max(v1, v2)
        mn = min(v1, v2)
        metric += mn / mx

    len_intersections = len(intersections)

    if (len_intersections != 0):
        metric = metric / len(intersections)
    else:
        metric = 0

    return 1 - metric

def canberra_metric(hist1, hist2, hist_len=None):
    metric = 0
    i = 0
    j = 0
    n = len(hist1)
    m = len(hist2)
    
    union_len = 0
    
    while i < n or j < m:
        if i >= n:
            metric += abs(hist2[j][1]) / abs(hist2[j][1]) # 1
            j += 1
        elif j >= m:
            metric += abs(hist1[i][1]) / abs(hist1[i][1]) # 1
            i += 1
        elif hist1[i][0] == hist2[j][0]:
            metric += abs(hist1[i][1] - hist2[j][1]) / (abs(hist1[i][1]) + abs(hist2[j][1]))
            i += 1
            j += 1
        elif hist1[i][0] < hist2[j][0]:
            metric += abs(hist1[i][1]) / abs(hist1[i][1]) # 1
            i += 1
        elif hist1[i][0] > hist2[j][0]:
            metric += abs(hist2[j][1]) / abs(hist2[j][1]) # 1
            j += 1
        
        #union_len += 1
            
    union_len = len(get_union_inds(hist1, hist2))
    if union_len == 0:
        return 1
    
    return metric / union_len


def canberra_metric_optimized(hist1, hist2, hist_len=None):
    metric = 0
    i = 0
    j = 0
    n = len(hist1)
    m = len(hist2)
    
    union_len = 0
    
    while i < n or j < m:
        if i >= n:
            metric += 1.0
            j += 1
        elif j >= m:
            metric += 1.0
            i += 1
        elif hist1[i][0] == hist2[j][0]:
            metric += abs(hist1[i][1] - hist2[j][1]) / (abs(hist1[i][1]) + abs(hist2[j][1]))
            i += 1
            j += 1
        elif hist1[i][0] < hist2[j][0]:
            metric += 1.0
            i += 1
        elif hist1[i][0] > hist2[j][0]:
            metric += 1.0
            j += 1
        
        union_len += 1
            
    #union_len = len(get_union_inds(hist1, hist2))
    if union_len == 0:
        return 1
    
    return metric / union_len



'''
def pearsons_correlation(hist1, hist2, hist_len):
    #hist_len = hist_len_3gram
    union_len = len(get_union_inds(hist1, hist2))
    
    top = 0
    left = 0
    right = 0
    
    n = len(hist1)
    m = len(hist2)
    
    i = 0
    j = 0
    
    while i < n or j < m:
        if i >= n:
            top += (- 1/hist_len) * (hist2[j][1] - 1/hist_len)
            left += (- 1/hist_len) ** 2
            right += (hist2[j][1] - 1/hist_len) ** 2
            j += 1
        elif j >= m:
            top += (hist1[i][1] - 1/hist_len) * (- 1/hist_len)
            left += (hist1[i][1] - 1/hist_len)  ** 2
            right += (- 1/hist_len) ** 2
            i += 1
        elif hist1[i][0] == hist2[j][0]:
            top += (hist1[i][1] - 1/hist_len) * (hist2[j][1] - 1/hist_len)
            left += (hist1[i][1] - 1/hist_len)  ** 2
            right += (hist2[j][1] - 1/hist_len) ** 2
            i += 1
            j += 1
        elif hist1[i][0] < hist2[j][0]:
            top += (hist1[i][1] - 1/hist_len) * (- 1/hist_len)
            left += (hist1[i][1] - 1/hist_len)  ** 2
            right += (- 1/hist_len) ** 2
            i += 1
        elif hist1[i][0] > hist2[j][0]:
            top += (- 1/hist_len) * (hist2[j][1] - 1/hist_len)
            left += (- 1/hist_len) ** 2
            right += (hist2[j][1] - 1/hist_len) ** 2
            j += 1

    bottom = math.sqrt(left * right)
    return 1 - top / bottom
'''


def cos_distance(hist1, hist2, hist_len=None):
    intersections = get_intersections(hist1, hist2)

    top = 0

    for ind, v1, v2 in intersections:
        top += v1 * v2
        
    bottom1 = (sum([pair[1] ** 2 for pair in hist1]))
    bottom2 = (sum([pair[1] ** 2 for pair in hist2]))
    
    #print(top)
    #print(bottom1)
    #print(bottom2)
    
    return 1 - abs(top / np.sqrt(bottom1 * bottom2))
    

def pearsons_correlation_mean(hist1, hist2, hist_len):
    union_len = len(get_union_inds(hist1, hist2))
    
    top = 0
    left = 0
    right = 0
    
    n = len(hist1)
    m = len(hist2)
    
    mean1 = sum([pair[1] for pair in hist1]) / hist_len
    mean2 = sum([pair[1] for pair in hist2]) / hist_len
    
    i = 0
    j = 0
    
    while i < n or j < m:
        if i >= n:
            top += (- mean1) * (hist2[j][1] - mean2)
            left += (- mean1) ** 2
            right += (hist2[j][1] - mean2) ** 2
            j += 1
        elif j >= m:
            top += (hist1[i][1] - mean1) * (- mean2)
            left += (hist1[i][1] - mean1)  ** 2
            right += (- mean2) ** 2
            i += 1
        elif hist1[i][0] == hist2[j][0]:
            top += (hist1[i][1] - mean1) * (hist2[j][1] - mean2)
            left += (hist1[i][1] - mean1)  ** 2
            right += (hist2[j][1] - mean2) ** 2
            i += 1
            j += 1
        elif hist1[i][0] < hist2[j][0]:
            top += (hist1[i][1] - mean1) * (- mean2)
            left += (hist1[i][1] - mean1)  ** 2
            right += (- mean2) ** 2
            i += 1
        elif hist1[i][0] > hist2[j][0]:
            top += (- mean1) * (hist2[j][1] - mean2)
            left += (- mean1) ** 2
            right += (hist2[j][1] - mean2) ** 2
            j += 1

    bottom = math.sqrt(left * right)
    return 1 - top / bottom
    

def get_dists(hists, dist_metric, hist_len):
    #hist_len = len(hists[0])
    
    #n = len(hists)
    dists = []
    
    for hist1 in tqdm_notebook(hists):
        cur_dists = []

        for hist2 in hists:
            distance = dist_metric(hist1, hist2, hist_len)
            
            cur_dists.append(distance)

        dists.append(cur_dists)
        
    return dists


def get_dists_optimized(hists, dist_metric, hist_len):
    #hist_len = len(hists[0])
    
    n = len(hists)
    #dists = [[0.0 for i in range(n)] for j in range(n)]
    dists = np.zeros((n, n))
    
    for i in tqdm_notebook(range(n)):
        for j in range(n):
            if i <= j:
                break
                
            distance = dist_metric(hists[i], hists[j], hist_len)
            dists[i][j] = distance
            dists[j][i] = distance
        
    return dists

In [6]:
RESULTS_ALL_LABELED_TUFANO = "/Volumes/Seagate/Alina/result_for_all_labeled_tufano"

In [7]:
all_dirs = [os.fsdecode(el) for el in os.listdir(RESULTS_ALL_LABELED_TUFANO) 
               if os.path.isdir(RESULTS_ALL_LABELED_TUFANO + "/" + os.fsdecode(el))]
all_dirs = sorted(all_dirs, key=lambda s: int(s.split(" ")[0])) 
len(all_dirs)

632

In [8]:
TUFANO_DATASET_PATH = "/Users/aliscafo/Documents/ALINA/WORK/SPbAU/thesis/CodeDiffEditScripts/TufanoDataset"

unique_label_to_changes = collections.defaultdict(list)
from_change_to_unique_label = collections.defaultdict(list)

num_multiple = 0

for double_num in tqdm_notebook(all_dirs):
    change_id = double_num.split(" ")[0]
    label_file = open(TUFANO_DATASET_PATH + "/" + change_id + "/" + "label.txt")
    labels = label_file.read().split(",")
    label = ""
        
    if len(labels) > 1:
        num_multiple += 1
        final_label_file = open(TUFANO_DATASET_PATH + "/" + change_id + "/" + "final_label.txt")
        final_label = final_label_file.read()
        label = final_label
        final_label_file.close()
        #if not os.path.exists(TUFANO_DATASET_PATH + "/" + change_id + "/" + "final_label.txt"):
        #    print(double_num)
    else:
        label = labels[0]
    
    #print(double_num, label)
        
    if label != "" and label != 'unclear':
        unique_label_to_changes[label].append(change_id)
        from_change_to_unique_label[change_id].append(label)
    
    label_file.close()
    
print(num_multiple)

HBox(children=(IntProgress(value=0, max=632), HTML(value='')))


42


In [9]:
print(len(from_change_to_unique_label))
print("NUM CLASSES:", len(unique_label_to_changes))

627
NUM CLASSES: 58


In [10]:
sorted(list(unique_label_to_changes))

['Abstract an existing method using the abstract keyword',
 'Add Condition',
 'Add braces to if statement',
 'Add invoked method',
 'Add null check',
 'Add parameter & remove variable from method body',
 'Add parameter in the method/constructor invocation',
 'Add statement',
 'Add try block',
 'Add/Remove operand from condition',
 'Add/Remove parameter',
 'Add/Remove synchronized keyword from method',
 'Add/Remove synchronized keyword from variable',
 'Add/Remove this qualifier',
 'Broad method visibility',
 'Change comparison operator (e.g. >) in condition',
 'Change exception type in catch clause',
 'Change method invocation as result of a move method',
 'Change method return value',
 'Change operands order in if condition',
 'Change parameter type',
 'Change parameter value of invoked method',
 'Change parameters order in method invocation',
 'Change qualified name in response to a move class refactoring',
 'Change return type',
 'Change type of a variable',
 'Class becomes static. 

In [11]:
ids_per_unique_label = []
unique_lines_poses = []
cur_sum = 0
all_unique_labels = []

for label in unique_label_to_changes:
    all_unique_labels.append(label)
    for change in unique_label_to_changes[label]:
        ids_per_unique_label.append(change)
        cur_sum += 1
    unique_lines_poses.append(cur_sum)

In [12]:
for i in range(len(ids_per_unique_label)):
    print("On index", i, ids_per_unique_label[i])

On index 0 1
On index 1 392
On index 2 395
On index 3 409
On index 4 426
On index 5 441
On index 6 468
On index 7 2
On index 8 45
On index 9 466
On index 10 3
On index 11 9
On index 12 16
On index 13 35
On index 14 41
On index 15 53
On index 16 59
On index 17 61
On index 18 65
On index 19 69
On index 20 80
On index 21 92
On index 22 93
On index 23 97
On index 24 110
On index 25 111
On index 26 118
On index 27 121
On index 28 123
On index 29 125
On index 30 132
On index 31 133
On index 32 135
On index 33 137
On index 34 138
On index 35 149
On index 36 167
On index 37 193
On index 38 197
On index 39 200
On index 40 203
On index 41 210
On index 42 211
On index 43 214
On index 44 224
On index 45 227
On index 46 231
On index 47 234
On index 48 235
On index 49 239
On index 50 241
On index 51 249
On index 52 250
On index 53 251
On index 54 260
On index 55 261
On index 56 273
On index 57 274
On index 58 275
On index 59 289
On index 60 292
On index 61 295
On index 62 296
On index 63 305
On inde

In [13]:
for i in range(len(unique_lines_poses)):
    if i == 0:
        print("From 0 to", unique_lines_poses[i], all_unique_labels[i])
    else:
        print("From", unique_lines_poses[i - 1], "to", unique_lines_poses[i], all_unique_labels[i])

From 0 to 7 Add braces to if statement
From 7 to 10 Forbid overriding: add final to method
From 10 to 105 Remove statement
From 105 to 164 Remove parameter from the method invocation
From 164 to 208 Change parameter type
From 208 to 214 Abstract an existing method using the abstract keyword
From 214 to 215 Move synchronized keyword from method signature to code block or vice versa
From 215 to 222 Merge variable definition & initialization
From 222 to 244 Class is not static anymore. Add object instance to invoke its methods
From 244 to 265 Broad method visibility
From 265 to 286 Remove invoked method
From 286 to 292 Change method return value
From 292 to 317 Add/Remove parameter
From 317 to 324 Remove condition
From 324 to 330 Move existing statements out of try block
From 330 to 366 Add parameter in the method/constructor invocation
From 366 to 370 Remove redundant initialization
From 370 to 375 Rename variable
From 375 to 377 Add/Remove synchronized keyword from variable
From 377 to 

In [102]:
def print_clustering_results_tufano_unique(clustering, dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    without_outliers,
                                    to_print=True):
    clusters = collections.defaultdict(list)
    clusters_to_ids = collections.defaultdict(list)
    outliers = []
    num_changes = len(clustering.labels_)
    
    for i in range(len(clustering.labels_)):
        label = clustering.labels_[i]
        change = ids_per_unique_label[i]
        if label == -1:
            outliers.append(change)
        else:
            clusters[label].append(change)
            clusters_to_ids[label].append(i)
    
    # For Agglomerative
    clusters_list = list(clusters_to_ids.keys())
    for label in clusters_list:
        if len(clusters_to_ids[label]) == 1:
            outliers.append(clusters[label][0])
            clustering.labels_[clusters_to_ids[label][0]] = -1
            clusters_to_ids.pop(label, None)
            clusters.pop(label, None)
            
    num_inliers = num_changes - len(outliers)   
            
    cohesion = 0
    separation = 0
    g1 = 0
    MAX_DIST = np.array(dists).max()
    #print("MAX_DIST", MAX_DIST)
    for i in clusters_to_ids.keys():
        coef = 0
        #print("!!!", len(clusters_to_ids[i]))
        for xi in range(len(clusters_to_ids[i])):
            for yi in range(len(clusters_to_ids[i])):
                if xi < yi:
                    x = clusters_to_ids[i][xi]
                    y = clusters_to_ids[i][yi]
                    #print("dists[x][y]", dists[x][y])
                    coef += MAX_DIST - dists[x][y]
                    
        cohesion += (1 / len(clusters_to_ids[i])) * coef
        coef_for_sep = len(clusters_to_ids[i]) / np.sqrt(coef)
        coef = 1 / coef
        
        summ = 0
        for j in clusters_to_ids.keys():
            if i == j:
                continue
                
            for xi in range(len(clusters_to_ids[i])):
                for yi in range(len(clusters_to_ids[j])):
                    x = clusters_to_ids[i][xi]
                    y = clusters_to_ids[j][yi]
                    summ += MAX_DIST - dists[x][y]
        
        g1 += coef * summ
        separation += coef_for_sep * summ
        
        
    #final_init_labels = dict()
        
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    
    for i in range(len(clustering.labels_)):
        change_i = ids_per_unique_label[i]
        init_label_i = from_change_to_unique_label[change_i][0]
        after_label_i = clustering.labels_[i]
        
        for j in range(len(clustering.labels_)):
            change_j = ids_per_unique_label[j]
            init_label_j = from_change_to_unique_label[change_j][0]
            after_label_j = clustering.labels_[j]
            
            if i >= j:
                continue
                
            if (without_outliers and (after_label_i == -1 or after_label_j == -1)):
                continue
            
            if init_label_i == init_label_j:
                if after_label_i == after_label_j:
                    tp += 1
                else:
                    fp += 1
            else:
                if after_label_i == after_label_j:
                    tn += 1
                else:
                    fn += 1
                    
                    
    
    confusion_mtx = []
    for i in clusters_to_ids.keys():
        mtx_row = collections.defaultdict(int)
        
        for change_i in clusters_to_ids[i]:
            change = ids_per_unique_label[change_i]
            init_label = from_change_to_unique_label[change][0]
            mtx_row[init_label] += 1
        
        confusion_mtx.append(mtx_row)
        
    ENTROPY = 0
    PURITY = 0
    entropies = []
    purities = []
    Fs = collections.defaultdict(list)
    for i in range(len(clusters_to_ids.keys())):
        entropy = 0
        purity = 0
        
        clustering_label = list(clusters_to_ids.keys())[i]
        
        for init_lbl in unique_label_to_changes:
            pij = confusion_mtx[i][init_lbl] / len(clusters_to_ids[clustering_label])
            precision = pij
            recall = confusion_mtx[i][init_lbl] / len(unique_label_to_changes[init_lbl])
            
            if pij != 0:
                entropy += - pij * np.log2(pij)
            purity = max(purity, pij)
            
            fij = 0
            if precision != 0 or recall != 0:
                fij = (2 * precision * recall) / (precision + recall)
            Fs[init_lbl].append(fij)
        
        ENTROPY += entropy * len(clusters_to_ids[clustering_label]) / num_inliers
        entropies.append(round(entropy, 3))
        
        PURITY += purity * len(clusters_to_ids[clustering_label]) / num_inliers
        purities.append(round(purity, 3))
        
    Fmeasure = 0
    for init_lbl in unique_label_to_changes:
        Fmeasure += max(Fs[init_lbl]) * len(unique_label_to_changes[init_lbl]) / num_changes
            
    
    rand = round(100 * (tp + fn) / (tp + tn + fp + fn), 3)
    jaccard_index = round(100 * tp / (tp + tn + fp), 3)
    outl_percent = round(100 * len(outliers) / num_changes, 3)
    
    
    if (to_print):
        '''
        print("CLUSTERS:\n")

        for i in clusters.keys():
            for change in clusters[i]:
                print(change, from_change_to_unique_label[change])
            print("\n")
        '''
        
        print("Number of clusters:", len(clusters))
        print("Number of outliers:", str(len(outliers)) + "/" + str(num_changes), "(" + str(outl_percent) + "%)")
        
        print()
        print("Cohesion =", str(cohesion))
        print("Separation =", str(separation))
        print("G1 =", str(g1))
        
        print()
        print("Entropy =", str(round(ENTROPY, 6)))
        print("Purity =", str(round(PURITY, 6)))
        #pairs = [(entropies[i], purities[i]) for i in range(len(entropies))]
        #print("(entropy, purity):", pairs)
        print("F-measure =", round(Fmeasure, 6))
        
        print()
        print("Rand =", str(rand) + "%")
        print("Jaccard Index =", str(jaccard_index) + "%")       
    
    return rand, Fmeasure
    
    

In [118]:
def find_eps_with_brute_force_universal(dists, label_to_changes, from_change_to_label, 
                                        ids_per_label, agglomerative=False, linkage='single', with_step=None):
    if with_step is None:
        all_dists = np.unique(np.array(dists).flatten())
    else:
        all_dists = np.arange(0.0, max(1.1, np.array(dists).flatten().max()), with_step)
        
    if linkage == "complete" and with_step is not None:
        all_dists = np.unique(np.array(dists).flatten())
        all_dists_step = np.arange(0.0, max(1.1, np.array(dists).flatten().max()), with_step)
        print(len(all_dists), len(all_dists_step))
        
        if len(all_dists) > len(all_dists_step):
            all_dists = all_dists_step

    max_rand = 0
    best_eps = -1
    max_fm = 0

    for eps in tqdm_notebook(all_dists):
        if eps == 0.0:
            continue
            
        if agglomerative is True:
            tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=linkage, 
                                                            compute_full_tree=True,
                                                            distance_threshold=eps).fit(dists)
        else:
            tufano_clustering = sklearn.cluster.DBSCAN(eps=eps, 
                                                   min_samples=2, 
                                                   metric='precomputed').fit(dists)
        
        rand, fm = print_clustering_results_tufano_unique(tufano_clustering, 
                                               dists,
                                               label_to_changes, 
                                               from_change_to_label, 
                                               ids_per_label,
                                               to_print=False, 
                                               without_outliers=True)

        if max_fm < fm:
            max_fm = fm
            best_eps = eps

    print("Best eps:", best_eps)
    print("Best F-measure:", max_fm)
    
    return best_eps

# With UPD, concat

In [20]:
RESULTS_ALL_LABELED_TUFANO = "/Volumes/Seagate/Alina/result_for_labeled_tufano_with_UPD"

def get_hists(hists_path, gram_path):
    hists = []

    for double_num in tqdm_notebook(ids_per_unique_label):
        hist_path1 = hists_path + "/" + double_num + " " + double_num + "/" + gram_path + "/sampleChange1" + "_hist.txt"
        
        if os.path.exists(hist_path1):
            hist_file1 = open(hist_path1, "r")
            lines = hist_file1.read().split("\n")
            hist_data1 = [(int(line.split(" ")[0]), int(line.split(" ")[1])) for line in lines if line != '']
            hists.append(hist_data1)
            hist_file1.close()
        else:
            hists.append([])
            
    return hists

def get_hists_len(hists_path, gram_path):
    hists_len = None
    
    es_path = hists_path + "/" + "edit_scripts_" + gram_path + "s_mapped.txt"
    es_file = open(es_path, "r")
    es_lines = es_file.read().split("\n")
    hists_len = int(es_lines[-2].split(" ")[0][:-1]) + 1
    es_file.close()

    print(hists_len)
    
    return hists_len

In [21]:
hists_tufano_1gram = get_hists(RESULTS_ALL_LABELED_TUFANO, "1gram")
hists_tufano_2gram = get_hists(RESULTS_ALL_LABELED_TUFANO, "2gram")
hists_tufano_3gram = get_hists(RESULTS_ALL_LABELED_TUFANO, "3gram")
hists_tufano_4gram = get_hists(RESULTS_ALL_LABELED_TUFANO, "4gram")
hists_tufano_5gram = get_hists(RESULTS_ALL_LABELED_TUFANO, "5gram")

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [22]:
hist_len_1gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "1gram")
hist_len_2gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "2gram")
hist_len_3gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "3gram")
hist_len_4gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "4gram")
hist_len_5gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "5gram")

356
638
751
743
675


In [24]:
concat_hists_upd_to_2gram_len = hist_len_1gram_tufano + hist_len_2gram_tufano

concat_hists_upd_to_3gram_len = hist_len_1gram_tufano + hist_len_2gram_tufano + hist_len_3gram_tufano

concat_hists_upd_to_4gram_len = hist_len_1gram_tufano + hist_len_2gram_tufano + hist_len_3gram_tufano
concat_hists_upd_to_4gram_len += hist_len_4gram_tufano

concat_hists_upd_to_5gram_len = hist_len_1gram_tufano + hist_len_2gram_tufano + hist_len_3gram_tufano
concat_hists_upd_to_5gram_len += hist_len_4gram_tufano + hist_len_5gram_tufano

In [33]:
cur_dists = get_dists(hists_tufano_1gram, jaccard_metric, hist_len_1gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=118), HTML(value='')))


Best eps: 0.25
Best F-measure: 0.4637536283943697
0.25
CLUSTERS:

1 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']
123 ['Remove statement']
541 ['Class is not static anymore. Add object instance to invoke its methods']
119 ['Broad method visibility']
286 ['Broad method visibility']
57 ['Replace fully qualified name with import or vice versa']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
389 ['Replace fully qualified name with import or vice versa']
420 ['Replace fully qualified name with import or vice versa']
194 ['Narrow method visibility']
464 ['Class becomes static. Delete object instance to invoke its methods']


392 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426

(92.553, 0.4637536283943697)

In [34]:
cur_dists = get_dists(hists_tufano_1gram, canberra_metric, hist_len_1gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=734), HTML(value='')))


Best eps: 0.7789115646258503
Best F-measure: 0.5543294968987176
0.7789115646258503
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']
541 ['Class is not static anymore. Add object instance to invoke its methods']
119 ['Broad method visibility']
286 ['Broad method visibility']
57 ['Replace fully qualified name with import or vice versa']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
389 ['Replace fully qualified name with import or vice versa']
420 ['Replace fully qualified name with import or vice versa']
68 ['Change return type']
194 ['Narrow method visibility']
464 ['Class beco

(94.501, 0.5543294968987176)

In [40]:
cur_dists = get_dists(hists_tufano_1gram, canberra_metric, hist_len_1gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.78
Best F-measure: 0.5543294968987176
0.78
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']
541 ['Class is not static anymore. Add object instance to invoke its methods']
119 ['Broad method visibility']
286 ['Broad method visibility']
57 ['Replace fully qualified name with import or vice versa']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
389 ['Replace fully qualified name with import or vice versa']
420 ['Replace fully qualified name with import or vice versa']
68 ['Change return type']
194 ['Narrow method visibility']
464 ['Class becomes static. Delete object in

(94.501, 0.5543294968987176)

In [41]:
cur_dists = get_dists(hists_tufano_1gram, cos_distance, hist_len_1gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.3
Best F-measure: 0.5445556600120369
0.3
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']


16 ['Remove statement']
56 ['Remove condition']
190 ['Add parameter in the method/constructor invocation']


53 ['Remove statement']
24 ['Move existing statements out of try block']
136 ['Move existing statements out of try block']
445 ['Move existing statements out of try block']
474 ['Move existing statements out of try block']


65 ['Remove statement']
135 ['Remove statement']


69

(94.792, 0.5445556600120369)

In [42]:
cur_dists = get_dists(hists_tufano_1gram, pearsons_correlation_mean, hist_len_1gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.275
Best F-measure: 0.5445346950727553
0.275
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']


16 ['Remove statement']
56 ['Remove condition']


53 ['Remove statement']
24 ['Move existing statements out of try block']
136 ['Move existing statements out of try block']
445 ['Move existing statements out of try block']
474 ['Move existing statements out of try block']


65 ['Remove statement']
135 ['Remove statement']


69 ['Remove statement']
121 ['Remove statement']
197 ['Re

(94.894, 0.5445346950727553)

In [43]:
concat_hists_upd_2gram = []

num_changes = len(hists_tufano_1gram)

for i in tqdm_notebook(range(num_changes)):
    concat_hist = []
    
    for gram_ind, amount in hists_tufano_1gram[i]:
        concat_hist.append((gram_ind, amount))
    
    for gram_ind, amount in hists_tufano_2gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano, amount))
            
    concat_hists_upd_2gram.append(concat_hist)

    

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [44]:
cur_dists = get_dists(concat_hists_upd_2gram, jaccard_metric, concat_hists_upd_to_2gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.21
Best F-measure: 0.4790654766889852
0.21
CLUSTERS:

1 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']
123 ['Remove statement']
541 ['Class is not static anymore. Add object instance to invoke its methods']
119 ['Broad method visibility']
286 ['Broad method visibility']
196 ['Add parameter in the method/constructor invocation']
57 ['Replace fully qualified name with import or vice versa']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
389 ['Replace fully qualified name with import or vice versa']
420 ['Replace fully qualified name with import or vice versa']
194 ['Narrow method visibility']
464 ['Class becomes static. Delete object instance to invoke its methods']


392 ['Add braces to if statement']
395 ['Add brac

382 ['Add parameter in the method/constructor invocation']


96 ['Remove parameter from the method invocation']
160 ['Remove parameter from the method invocation']
391 ['Add/Remove parameter']
187 ['Add invoked method']


145 ['Remove parameter from the method invocation']
542 ['Add parameter in the method/constructor invocation']


168 ['Remove parameter from the method invocation']
380 ['Change parameters order in method invocation']


209 ['Remove parameter from the method invocation']
282 ['Remove parameter from the method invocation']
86 ['Add parameter in the method/constructor invocation']


367 ['Remove parameter from the method invocation']
493 ['Remove parameter from the method invocation']
345 ['Add parameter in the method/constructor invocation']


457 ['Remove parameter from the method invocation']
384 ['Remove condition']


459 ['Remove parameter from the method invocation']
469 ['Remove parameter from the method invocation']
506 ['Remove parameter from the method invocat

(92.839, 0.4790654766889852)

In [45]:
cur_dists = get_dists(concat_hists_upd_2gram, canberra_metric, concat_hists_upd_to_2gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.8300000000000001
Best F-measure: 0.5505461755606059
0.8300000000000001
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']
541 ['Class is not static anymore. Add object instance to invoke its methods']
119 ['Broad method visibility']
286 ['Broad method visibility']
57 ['Replace fully qualified name with import or vice versa']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
389 ['Replace fully qualified name with import or vice versa']
420 ['Replace fully qualified name with import or vice versa']
194 ['Narrow method visibility']
464 ['Class becomes static. Delete object 

(94.609, 0.5505461755606059)

In [47]:
cur_dists = get_dists(concat_hists_upd_2gram, cos_distance, concat_hists_upd_to_2gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.73
Best F-measure: 0.5341715314797305
0.73
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


392 ['Add braces to if statement']
237 ['Class is not static anymore. Add object instance to invoke its methods']
525 ['Change parameter value of invoked method']
71 ['Replace invoked method']
198 ['Replace invoked method']
213 ['Replace invoked method']
215 ['Replace invoked method']
222 ['Replace invoked method']
228 ['Replace invoked method']
232 ['Replace invoked method']
243 ['Replace invoked method']
248 ['Replace invoked method']
262 ['Replace invoked method']
270 ['Replace invoked method']
277 ['Replace invoked method']
291 ['Replace invoked method']
299 ['Replace invoked method']
302 ['Replace invoked me

142 ['Replace fully qualified name with import or vice versa']
483 ['Replace fully qualified name with import or vice versa']
411 ['Replace invoked method']
451 ['Replace invoked method']
475 ['Replace invoked method']
240 ['Change method invocation as result of a move method']
385 ['Change method invocation as result of a move method']
600 ['Change type of a variable']


400 ['Change parameter value of invoked method']
582 ['Change parameter value of invoked method']
497 ['Replace invoked method']
281 ['Rename method']
236 ['Add/Remove operand from condition']


492 ['Change return type']
519 ['Change return type']


327 ['Replace invoked method']
205 ['Rename method']
244 ['Rename method']
255 ['Rename method']
271 ['Rename method']
279 ['Rename method']
285 ['Rename method']
304 ['Rename method']
316 ['Rename method']
343 ['Rename method']
362 ['Rename method']
377 ['Rename method']
379 ['Rename method']
491 ['Rename method']
500 ['Rename method']
526 ['Rename method']
539 ['Rename 

(93.445, 0.5341715314797305)

In [48]:
cur_dists = get_dists(concat_hists_upd_2gram, pearsons_correlation_mean, concat_hists_upd_to_2gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.65
Best F-measure: 0.5420436267728801
0.65
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


441 ['Add braces to if statement']
440 ['Move existing statements out of try block']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
448 ['Remove statement']
461 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48 ['Abstract an existing meth

(94.081, 0.5420436267728801)

In [49]:
concat_hists_upd_3gram = []

num_changes = len(hists_tufano_1gram)

for i in tqdm_notebook(range(num_changes)):
    concat_hist = []
    
    for gram_ind, amount in hists_tufano_1gram[i]:
        concat_hist.append((gram_ind, amount))
    
    for gram_ind, amount in hists_tufano_2gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano, amount))
        
    for gram_ind, amount in hists_tufano_3gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano + hist_len_2gram_tufano, amount))
            
    concat_hists_upd_3gram.append(concat_hist)

    

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [50]:
cur_dists = get_dists(concat_hists_upd_3gram, jaccard_metric, concat_hists_upd_to_3gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.19
Best F-measure: 0.48202851709976174
0.19
CLUSTERS:

1 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']
123 ['Remove statement']
541 ['Class is not static anymore. Add object instance to invoke its methods']
119 ['Broad method visibility']
286 ['Broad method visibility']
196 ['Add parameter in the method/constructor invocation']
57 ['Replace fully qualified name with import or vice versa']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
389 ['Replace fully qualified name with import or vice versa']
420 ['Replace fully qualified name with import or vice versa']
194 ['Narrow method visibility']
464 ['Class becomes static. Delete object instance to invoke its methods']


392 ['Add braces to if statement']
395 ['Add bra


52 ['Add parameter in the method/constructor invocation']
87 ['Add parameter in the method/constructor invocation']
105 ['Add parameter in the method/constructor invocation']
311 ['Add parameter in the method/constructor invocation']


104 ['Add parameter in the method/constructor invocation']
127 ['Add parameter in the method/constructor invocation']
157 ['Add parameter in the method/constructor invocation']
204 ['Add parameter in the method/constructor invocation']
381 ['Add parameter in the method/constructor invocation']
439 ['Add parameter in the method/constructor invocation']


190 ['Add parameter in the method/constructor invocation']
582 ['Change parameter value of invoked method']
447 ['Change comparison operator (e.g. >) in condition']


201 ['Add parameter in the method/constructor invocation']
269 ['Add parameter in the method/constructor invocation']
546 ['Add parameter in the method/constructor invocation']
549 ['Add parameter in the method/constructor invocation']


42

(92.868, 0.48202851709976174)

In [51]:
cur_dists = get_dists(concat_hists_upd_3gram, canberra_metric, concat_hists_upd_to_3gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.895
Best F-measure: 0.5489584343254714
0.895
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']
541 ['Class is not static anymore. Add object instance to invoke its methods']
119 ['Broad method visibility']
286 ['Broad method visibility']
57 ['Replace fully qualified name with import or vice versa']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
389 ['Replace fully qualified name with import or vice versa']
420 ['Replace fully qualified name with import or vice versa']
194 ['Narrow method visibility']
464 ['Class becomes static. Delete object instance to invoke its met

299 ['Replace invoked method']
302 ['Replace invoked method']
307 ['Replace invoked method']
309 ['Replace invoked method']
313 ['Replace invoked method']
324 ['Replace invoked method']
331 ['Replace invoked method']
332 ['Replace invoked method']
338 ['Replace invoked method']
342 ['Replace invoked method']
423 ['Replace invoked method']
455 ['Replace invoked method']
477 ['Replace invoked method']
479 ['Replace invoked method']
496 ['Replace invoked method']
501 ['Replace invoked method']
510 ['Replace invoked method']
520 ['Replace invoked method']
537 ['Replace invoked method']
544 ['Replace invoked method']
551 ['Replace invoked method']
553 ['Replace invoked method']
566 ['Replace invoked method']
567 ['Replace invoked method']
586 ['Replace invoked method']
587 ['Replace invoked method']
592 ['Replace invoked method']
604 ['Replace invoked method']
607 ['Replace invoked method']
608 ['Replace invoked method']
616 ['Replace invoked method']
88 ['Add invoked method']
408 ['Change 

(94.482, 0.5489584343254714)

In [52]:
cur_dists = get_dists(concat_hists_upd_3gram, cos_distance, concat_hists_upd_to_3gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.685
Best F-measure: 0.5369775260635428
0.685
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


441 ['Add braces to if statement']
440 ['Move existing statements out of try block']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
448 ['Remove statement']
461 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48 ['Abstract an existing me

Number of clusters: 59
Number of outliers: 68/627 (10.845%)

Cohesion = 181.44632591015244
Separation = 8945.060265000118
G1 = 589.2656693686183

Entropy = 1.181256
Purity = 0.710197
(entropy, purity): [(0.0, 1.0), (1.0, 0.5), (0.0, 1.0), (1.268, 0.619), (1.585, 0.333), (0.722, 0.8), (1.559, 0.667), (1.842, 0.429), (1.564, 0.729), (0.918, 0.667), (0.0, 1.0), (1.477, 0.692), (1.459, 0.5), (2.369, 0.364), (1.547, 0.615), (1.929, 0.579), (1.922, 0.4), (1.0, 0.5), (2.55, 0.364), (0.918, 0.667), (0.0, 1.0), (0.0, 1.0), (0.918, 0.667), (2.815, 0.231), (1.0, 0.5), (0.503, 0.889), (0.0, 1.0), (1.0, 0.5), (0.0, 1.0), (1.0, 0.5), (0.25, 0.958), (0.0, 1.0), (2.693, 0.371), (0.0, 1.0), (1.0, 0.5), (0.744, 0.88), (1.281, 0.583), (0.414, 0.917), (1.186, 0.75), (0.826, 0.741), (0.592, 0.857), (0.0, 1.0), (0.0, 1.0), (0.811, 0.75), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.918, 0.667), (0.65, 0.833), (2.231, 0.273), (0.918, 0.667), (0.0, 1.0), (0.0, 1.0), (0.235, 0.962), (1.0, 0.5

(94.0, 0.5369775260635428)

In [53]:
cur_dists = get_dists(concat_hists_upd_3gram, pearsons_correlation_mean, concat_hists_upd_to_3gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.6900000000000001
Best F-measure: 0.5404586938585499
0.6900000000000001
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


441 ['Add braces to if statement']
440 ['Move existing statements out of try block']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
448 ['Remove statement']
461 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48

327 ['Replace invoked method']
205 ['Rename method']
244 ['Rename method']
255 ['Rename method']
271 ['Rename method']
279 ['Rename method']
285 ['Rename method']
304 ['Rename method']
316 ['Rename method']
343 ['Rename method']
362 ['Rename method']
377 ['Rename method']
379 ['Rename method']
491 ['Rename method']
500 ['Rename method']
526 ['Rename method']
539 ['Rename method']
554 ['Rename method']
556 ['Rename method']
599 ['Rename method']
601 ['Rename method']
606 ['Rename method']
609 ['Rename method']
611 ['Rename method']
617 ['Rename method']
632 ['Rename method']


497 ['Replace invoked method']
236 ['Add/Remove operand from condition']


169 ['Replace statement']
129 ['Add statement']


83 ['Add null check']
151 ['Add null check']
152 ['Add null check']
363 ['Add null check']


242 ['Change parameters order in method invocation']
516 ['Change parameters order in method invocation']
595 ['Change parameters order in method invocation']


Number of clusters: 60
Number of outli

(94.088, 0.5404586938585499)

In [54]:
concat_hists_upd_4gram = []

num_changes = len(hists_tufano_1gram)

for i in tqdm_notebook(range(num_changes)):
    concat_hist = []
    
    for gram_ind, amount in hists_tufano_1gram[i]:
        concat_hist.append((gram_ind, amount))
    
    for gram_ind, amount in hists_tufano_2gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano, amount))
        
    for gram_ind, amount in hists_tufano_3gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano + hist_len_2gram_tufano, amount))
        
    for gram_ind, amount in hists_tufano_4gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano + hist_len_2gram_tufano +
                            hist_len_3gram_tufano, amount))    
            
    concat_hists_upd_4gram.append(concat_hist)

    

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [55]:
cur_dists = get_dists(concat_hists_upd_4gram, jaccard_metric, concat_hists_upd_to_4gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.19
Best F-measure: 0.48202851709976174
0.19
CLUSTERS:

1 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']
123 ['Remove statement']
541 ['Class is not static anymore. Add object instance to invoke its methods']
119 ['Broad method visibility']
286 ['Broad method visibility']
196 ['Add parameter in the method/constructor invocation']
57 ['Replace fully qualified name with import or vice versa']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
389 ['Replace fully qualified name with import or vice versa']
420 ['Replace fully qualified name with import or vice versa']
194 ['Narrow method visibility']
464 ['Class becomes static. Delete object instance to invoke its methods']


392 ['Add braces to if statement']
395 ['Add bra

106 ['Remove parameter from the method invocation']
252 ['Class is not static anymore. Add object instance to invoke its methods']
319 ['Class is not static anymore. Add object instance to invoke its methods']
368 ['Class is not static anymore. Add object instance to invoke its methods']
602 ['Class is not static anymore. Add object instance to invoke its methods']
172 ['Add/Remove parameter']
266 ['Add/Remove parameter']
396 ['Remove condition']
631 ['Replace invoked method']
161 ['Change method invocation as result of a move method']
306 ['Change method invocation as result of a move method']
226 ['Replace statement']


275 ['Remove statement']
490 ['Remove statement']
467 ['Add try block']


372 ['Remove statement']
148 ['Use ? in generics as return type']


394 ['Remove statement']
481 ['Remove statement']
21 ['Remove condition']
414 ['Simplify if condition']


529 ['Remove statement']
158 ['Remove parameter from the method invocation']
186 ['Remove parameter from the method invoca

(92.868, 0.48202851709976174)

In [56]:
cur_dists = get_dists(concat_hists_upd_4gram, canberra_metric, concat_hists_upd_to_4gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.915
Best F-measure: 0.5460392726898203
0.915
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']
541 ['Class is not static anymore. Add object instance to invoke its methods']
119 ['Broad method visibility']
286 ['Broad method visibility']
57 ['Replace fully qualified name with import or vice versa']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
389 ['Replace fully qualified name with import or vice versa']
420 ['Replace fully qualified name with import or vice versa']
194 ['Narrow method visibility']
464 ['Class becomes static. Delete object instance to invoke its met

606 ['Rename method']
609 ['Rename method']
611 ['Rename method']
617 ['Rename method']
632 ['Rename method']


507 ['Replace invoked method']
588 ['Replace invoked method']


83 ['Add null check']
139 ['Add null check']
151 ['Add null check']
152 ['Add null check']
184 ['Add null check']
363 ['Add null check']


109 ['Add invoked method']
183 ['Add invoked method']


187 ['Add invoked method']
417 ['Add statement']


108 ['Remove try/catch']
129 ['Add statement']
476 ['Merge 2 catch blocks capturing both exceptions in 1 catch expression']


242 ['Change parameters order in method invocation']
516 ['Change parameters order in method invocation']
595 ['Change parameters order in method invocation']


450 ['Add try block']
467 ['Add try block']
470 ['Add try block']


Number of clusters: 63
Number of outliers: 47/627 (7.496%)

Cohesion = 139.56995014713726
Separation = 3500.743988572346
G1 = 519.5791824550316

Entropy = 1.18322
Purity = 0.687931
(entropy, purity): [(2.465, 0.316), (1.0, 

(94.418, 0.5460392726898203)

In [57]:
cur_dists = get_dists(concat_hists_upd_4gram, cos_distance, concat_hists_upd_to_4gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.65
Best F-measure: 0.5336536564266062
0.65
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


441 ['Add braces to if statement']
440 ['Move existing statements out of try block']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48 ['Abstract an existing method using the abstract keyword']
90 ['Abstract an e

(94.362, 0.5336536564266062)

In [58]:
cur_dists = get_dists(concat_hists_upd_4gram, pearsons_correlation_mean, concat_hists_upd_to_4gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.72
Best F-measure: 0.5352591379267239
0.72
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


441 ['Add braces to if statement']
440 ['Move existing statements out of try block']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48 ['Abstract an existing method using the abstract keyword']
90 ['Abstract an e

(93.922, 0.5352591379267239)

In [59]:
concat_hists_upd_5gram = []

num_changes = len(hists_tufano_1gram)

for i in tqdm_notebook(range(num_changes)):
    concat_hist = []
    
    for gram_ind, amount in hists_tufano_1gram[i]:
        concat_hist.append((gram_ind, amount))
    
    for gram_ind, amount in hists_tufano_2gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano, amount))
        
    for gram_ind, amount in hists_tufano_3gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano + hist_len_2gram_tufano, amount))
        
    for gram_ind, amount in hists_tufano_4gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano + hist_len_2gram_tufano +
                            hist_len_3gram_tufano, amount))    
        
    for gram_ind, amount in hists_tufano_5gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano + hist_len_2gram_tufano +
                            hist_len_3gram_tufano + hist_len_4gram_tufano, amount))  
            
    concat_hists_upd_5gram.append(concat_hist)

    

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [60]:
cur_dists = get_dists(concat_hists_upd_5gram, jaccard_metric, concat_hists_upd_to_5gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.19
Best F-measure: 0.48202851709976174
0.19
CLUSTERS:

1 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']
123 ['Remove statement']
541 ['Class is not static anymore. Add object instance to invoke its methods']
119 ['Broad method visibility']
286 ['Broad method visibility']
196 ['Add parameter in the method/constructor invocation']
57 ['Replace fully qualified name with import or vice versa']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
389 ['Replace fully qualified name with import or vice versa']
420 ['Replace fully qualified name with import or vice versa']
194 ['Narrow method visibility']
464 ['Class becomes static. Delete object instance to invoke its methods']


392 ['Add braces to if statement']
395 ['Add bra

391 ['Add/Remove parameter']
187 ['Add invoked method']


145 ['Remove parameter from the method invocation']
542 ['Add parameter in the method/constructor invocation']


168 ['Remove parameter from the method invocation']
380 ['Change parameters order in method invocation']


209 ['Remove parameter from the method invocation']
282 ['Remove parameter from the method invocation']
86 ['Add parameter in the method/constructor invocation']


367 ['Remove parameter from the method invocation']
493 ['Remove parameter from the method invocation']
345 ['Add parameter in the method/constructor invocation']


457 ['Remove parameter from the method invocation']
384 ['Remove condition']


459 ['Remove parameter from the method invocation']
469 ['Remove parameter from the method invocation']
506 ['Remove parameter from the method invocation']
574 ['Remove parameter from the method invocation']


575 ['Remove parameter from the method invocation']
166 ['Add parameter in the method/constructor invoca

(92.868, 0.48202851709976174)

In [61]:
cur_dists = get_dists(concat_hists_upd_5gram, canberra_metric, concat_hists_upd_to_5gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.935
Best F-measure: 0.5424272750796627
0.935
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']
541 ['Class is not static anymore. Add object instance to invoke its methods']
119 ['Broad method visibility']
286 ['Broad method visibility']
57 ['Replace fully qualified name with import or vice versa']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
389 ['Replace fully qualified name with import or vice versa']
420 ['Replace fully qualified name with import or vice versa']
68 ['Change return type']
194 ['Narrow method visibility']
464 ['Class becomes static. Delete object 

(94.249, 0.5424272750796627)

In [62]:
cur_dists = get_dists(concat_hists_upd_5gram, cos_distance, concat_hists_upd_to_5gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.71
Best F-measure: 0.5376332764790721
0.71
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


441 ['Add braces to if statement']
440 ['Move existing statements out of try block']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48 ['Abstract an existing method using the abstract keyword']
90 ['Abstract an e

(94.063, 0.5376332764790721)

In [63]:
cur_dists = get_dists(concat_hists_upd_5gram, pearsons_correlation_mean, concat_hists_upd_to_5gram_len)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.715
Best F-measure: 0.5376332764790721
0.715
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


441 ['Add braces to if statement']
440 ['Move existing statements out of try block']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48 ['Abstract an existing method using the abstract keyword']
90 ['Abstract an

(94.059, 0.5376332764790721)

In [64]:
TUFANO_ACTIONS_PATH = "/Users/aliscafo/Documents/ALINA/WORK/SPbAU/thesis/CodeDiffEditScripts/TufanoActionsWithUPD"

edit_scripts = []

for double_num in tqdm_notebook(ids_per_unique_label):
    actions_file = open(TUFANO_ACTIONS_PATH + "/" + double_num + "/" + double_num + "/" + "sampleChange1")
    edit_script = actions_file.read().split("\n")
    edit_script = [elem for elem in edit_script if elem != '']
    
    edit_scripts.append(edit_script)
    actions_file.close()


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [65]:
def lcs(X, Y): 
    # find the length of the strings 
    m = len(X) 
    n = len(Y) 
  
    # declaring the array for storing the dp values 
    L = [[None]*(n + 1) for i in range(m + 1)] 
  
    """Following steps build L[m + 1][n + 1] in bottom up fashion 
    Note: L[i][j] contains length of LCS of X[0..i-1] 
    and Y[0..j-1]"""
    for i in range(m + 1): 
        for j in range(n + 1): 
            if i == 0 or j == 0 : 
                L[i][j] = 0
            elif X[i-1] == Y[j-1]: 
                L[i][j] = L[i-1][j-1]+1
            else: 
                L[i][j] = max(L[i-1][j], L[i][j-1]) 
  
    # L[m][n] contains the length of LCS of X[0..n-1] & Y[0..m-1] 
    return L[m][n] 

In [66]:
dists_for_lcs = []

for es1 in tqdm_notebook(edit_scripts):
    cur_dists = []
    for es2 in edit_scripts:
        cur_lcs = lcs(es1, es2)
        dist = 1 - cur_lcs / max(len(es1), len(es2))
        cur_dists.append(dist)
    dists_for_lcs.append(cur_dists)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [67]:
cur_dists = dists_for_lcs
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.725
Best F-measure: 0.5623453883198025
0.725
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']
2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']
541 ['Class is not static anymore. Add object instance to invoke its methods']
119 ['Broad method visibility']
286 ['Broad method visibility']
57 ['Replace fully qualified name with import or vice versa']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
389 ['Replace fully qualified name with import or vice versa']
420 ['Replace fully qualified name with import or vice versa']
194 ['Narrow method visibility']
464 ['Class becomes static. Delete object instance to invoke its met

385 ['Change method invocation as result of a move method']
600 ['Change type of a variable']


233 ['Change parameter value of invoked method']
387 ['Change parameter value of invoked method']
397 ['Remove type casting in method body']


63 ['Replace fully qualified name with import or vice versa']
171 ['Replace fully qualified name with import or vice versa']
432 ['Replace fully qualified name with import or vice versa']
435 ['Replace fully qualified name with import or vice versa']
452 ['Replace fully qualified name with import or vice versa']
453 ['Replace fully qualified name with import or vice versa']
219 ['Change method invocation as result of a move method']
456 ['Class becomes static. Delete object instance to invoke its methods']


150 ['Replace invoked method']
155 ['Replace invoked method']
159 ['Replace invoked method']
163 ['Replace invoked method']
174 ['Replace invoked method']
175 ['Replace invoked method']
180 ['Replace invoked method']


327 ['Replace invoked method

(94.614, 0.5623453883198025)

In [104]:
#RESULTS_ALL_LABELED_TUFANO = "/Volumes/Seagate/Alina/result_for_labeled_tufano_with_context_treat-true"
RESULTS_ALL_LABELED_TUFANO = "/Volumes/Seagate/Alina/result_for_all_labeled_tufano"

def get_hists(hists_path, gram_path):
    hists = []

    for double_num in tqdm_notebook(ids_per_unique_label):
        hist_path1 = hists_path + "/" + double_num + " " + double_num + "/" + gram_path + "/sampleChange1" + "_hist.txt"
        
        if os.path.exists(hist_path1):
            hist_file1 = open(hist_path1, "r")
            lines = hist_file1.read().split("\n")
            hist_data1 = [(int(line.split(" ")[0]), int(line.split(" ")[1])) for line in lines if line != '']
            hists.append(hist_data1)
            hist_file1.close()
        else:
            hists.append([])
            
    return hists

def get_hists_len(hists_path, gram_path):
    hists_len = None
    
    es_path = hists_path + "/" + "edit_scripts_" + gram_path + "s_mapped.txt"
    es_file = open(es_path, "r")
    es_lines = es_file.read().split("\n")
    hists_len = int(es_lines[-2].split(" ")[0][:-1]) + 1
    es_file.close()

    print(hists_len)
    
    return hists_len

In [105]:
hists_tufano_1gram = get_hists(RESULTS_ALL_LABELED_TUFANO, "1gram")
hists_tufano_2gram = get_hists(RESULTS_ALL_LABELED_TUFANO, "2gram")
hists_tufano_3gram = get_hists(RESULTS_ALL_LABELED_TUFANO, "3gram")
hists_tufano_4gram = get_hists(RESULTS_ALL_LABELED_TUFANO, "4gram")
hists_tufano_5gram = get_hists(RESULTS_ALL_LABELED_TUFANO, "5gram")

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [93]:
# with context
hist_len_1gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "1gram")
hist_len_2gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "2gram")
hist_len_3gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "3gram")
hist_len_4gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "4gram")
hist_len_5gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "5gram")

356
692
829
848
817


In [106]:
# without context
hist_len_1gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "1gram")
hist_len_2gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "2gram")
hist_len_3gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "3gram")
hist_len_4gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "4gram")
hist_len_5gram_tufano = get_hists_len(RESULTS_ALL_LABELED_TUFANO, "5gram")

244
523
666
741
748


In [107]:
hists_tufano = []

hists_tufano.append([]) # zero array
hists_tufano.append(hists_tufano_1gram)
hists_tufano.append(hists_tufano_2gram)
hists_tufano.append(hists_tufano_3gram)
hists_tufano.append(hists_tufano_4gram)
hists_tufano.append(hists_tufano_5gram)


In [108]:
hists_tufano_len = []

hists_tufano_len.append([]) # zero array
hists_tufano_len.append(hist_len_1gram_tufano)
hists_tufano_len.append(hist_len_2gram_tufano)
hists_tufano_len.append(hist_len_3gram_tufano)
hists_tufano_len.append(hist_len_4gram_tufano)
hists_tufano_len.append(hist_len_5gram_tufano)



In [97]:
distances = [jaccard_metric, canberra_metric, cos_distance, pearsons_correlation_mean]
distances_names = ["jaccard_metric", "canberra_metric", "cos_distance", "pearsons_correlation_mean"]

In [73]:
cur_dists = get_dists(hists_tufano_2gram, jaccard_metric, hist_len_2gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.505
Best F-measure: 0.5388937631516525
0.505
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']
123 ['Remove statement']
541 ['Class is not static anymore. Add object instance to invoke its methods']
73 ['Replace fully qualified name with import or vice versa']
182 ['Replace fully qualified name with import or vice versa']
195 ['Replace fully qualified name with import or vice versa']
464 ['Class becomes static. Delete object instance to invoke its methods']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
16 ['Remove statement']
35 ['Remove statement']
41 ['Remove statement']
59 ['Remove statement']
65 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remov

(94.643, 0.5388937631516525)

In [74]:
cur_dists = get_dists(hists_tufano_2gram, canberra_metric, hist_len_2gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.86
Best F-measure: 0.5367174222484791
0.86
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
448 ['Remove statement']
461 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48 ['Abstract an existing method using the abstract keyword']
90 ['Abstract an existing method using the abstract key

(94.637, 0.5367174222484791)

In [75]:
cur_dists = get_dists(hists_tufano_2gram, cos_distance, hist_len_2gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.76
Best F-measure: 0.5263813462447333
0.76
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
448 ['Remove statement']
461 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48 ['Abstract an existing method using the abstract keyword']
90 ['Abstract an existing method using the abstract key

(94.493, 0.5263813462447333)

In [76]:
cur_dists = get_dists(hists_tufano_2gram, pearsons_correlation_mean, hist_len_2gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.755
Best F-measure: 0.5234532073156191
0.755
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
448 ['Remove statement']
461 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48 ['Abstract an existing method using the abstract keyword']
90 ['Abstract an existing method using the abstract k

(94.441, 0.5234532073156191)

In [77]:
cur_dists = get_dists(hists_tufano_3gram, jaccard_metric, hist_len_3gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.605
Best F-measure: 0.488492277344056
0.605
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
16 ['Remove statement']
35 ['Remove statement']
41 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
193 ['Remove statement']
448 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48 ['Abstract an existing method using the abstract keyword']
90 ['A

(94.123, 0.488492277344056)

In [78]:
cur_dists = get_dists(hists_tufano_3gram, canberra_metric, hist_len_3gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.87
Best F-measure: 0.499092204722086
0.87
CLUSTERS:

1 ['Add braces to if statement']
395 ['Add braces to if statement']
409 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48 ['Abstract an existing method using the abstract keyword']
90 ['Abstract an existing method using the abstract keyword']
95 ['Abstract an existing method using the a

(94.451, 0.499092204722086)

In [79]:
cur_dists = get_dists(hists_tufano_3gram, cos_distance, hist_len_3gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.365
Best F-measure: 0.4968324492498922
0.365
CLUSTERS:

395 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']


16 ['Remove statement']
56 ['Remove condition']


53 ['Remove statement']
24 ['Move existing statements out of try block']
136 ['Move existing statements out of try block']
445 ['Move existing statements out of try block']
474 ['Move existing statements out of try block']


65 ['Remove statement']
135 ['Remove statement']


69 ['Remove statement']
337 ['Remove statement']


111 ['Remove statement']
132 ['Remove s

(94.943, 0.4968324492498922)

In [80]:
cur_dists = get_dists(hists_tufano_3gram, pearsons_correlation_mean, hist_len_3gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.365
Best F-measure: 0.4968324492498922
0.365
CLUSTERS:

395 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']


16 ['Remove statement']
56 ['Remove condition']


53 ['Remove statement']
24 ['Move existing statements out of try block']
136 ['Move existing statements out of try block']
445 ['Move existing statements out of try block']
474 ['Move existing statements out of try block']


65 ['Remove statement']
135 ['Remove statement']


69 ['Remove statement']
337 ['Remove statement']


111 ['Remove statement']
132 ['Remove s

(94.943, 0.4968324492498922)

In [81]:
cur_dists = get_dists(hists_tufano_4gram, jaccard_metric, hist_len_4gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.005
Best F-measure: 0.4715230330818618
0.005
CLUSTERS:

395 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']
6 ['Abstract an existing method using the abstract keyword']
30 ['Abstract an existing method using the abstract keyword']
38 ['Abstract an existing method using the abstract keyword']
48 ['Abstract an existing method using the abstract keyword']
90 ['Abstract an existing method using the abstract keyword']
95 ['Abstract an existing method using the abstract keyword']


16 ['Remove statement']
41 ['Remove statement

(94.324, 0.4715230330818618)

In [82]:
cur_dists = get_dists(hists_tufano_4gram, canberra_metric, hist_len_4gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.675
Best F-measure: 0.477062003794903
0.675
CLUSTERS:

395 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']


16 ['Remove statement']
56 ['Remove condition']


53 ['Remove statement']
24 ['Move existing statements out of try block']
136 ['Move existing statements out of try block']
445 ['Move existing statements out of try block']
474 ['Move existing statements out of try block']


65 ['Remove statement']
135 ['Remove statement']


69 ['Remove statement']
337 ['Remove statement']


111 ['Remove statement']
132 ['Remove st

(94.813, 0.477062003794903)

In [83]:
cur_dists = get_dists(hists_tufano_4gram, cos_distance, hist_len_4gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.44
Best F-measure: 0.47769996232759837
0.44
CLUSTERS:

395 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']


16 ['Remove statement']
56 ['Remove condition']


53 ['Remove statement']
24 ['Move existing statements out of try block']
136 ['Move existing statements out of try block']
445 ['Move existing statements out of try block']
474 ['Move existing statements out of try block']


65 ['Remove statement']
135 ['Remove statement']


69 ['Remove statement']
337 ['Remove statement']


111 ['Remove statement']
132 ['Remove st



51 ['Change parameter value of invoked method']
75 ['Change parameter value of invoked method']
413 ['Change parameter value of invoked method']
142 ['Replace fully qualified name with import or vice versa']
483 ['Replace fully qualified name with import or vice versa']
411 ['Replace invoked method']
451 ['Replace invoked method']
475 ['Replace invoked method']
240 ['Change method invocation as result of a move method']
385 ['Change method invocation as result of a move method']
600 ['Change type of a variable']


202 ['Change parameter value of invoked method']
321 ['Change parameter value of invoked method']
552 ['Change parameter value of invoked method']


233 ['Change parameter value of invoked method']
387 ['Change parameter value of invoked method']


525 ['Change parameter value of invoked method']
71 ['Replace invoked method']
198 ['Replace invoked method']
213 ['Replace invoked method']
215 ['Replace invoked method']
222 ['Replace invoked method']
228 ['Replace invoked meth

(94.777, 0.47769996232759837)

In [84]:
cur_dists = get_dists(hists_tufano_4gram, pearsons_correlation_mean, hist_len_4gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.44
Best F-measure: 0.47769996232759837
0.44
CLUSTERS:

395 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']


16 ['Remove statement']
56 ['Remove condition']


53 ['Remove statement']
24 ['Move existing statements out of try block']
136 ['Move existing statements out of try block']
445 ['Move existing statements out of try block']
474 ['Move existing statements out of try block']


65 ['Remove statement']
135 ['Remove statement']


69 ['Remove statement']
337 ['Remove statement']


111 ['Remove statement']
132 ['Remove st

626 ['Change type of a variable']


361 ['Add parameter & remove variable from method body']
543 ['Add parameter & remove variable from method body']


410 ['Replace if statement with assert statement']
449 ['Replace if statement with assert statement']


450 ['Add try block']
470 ['Add try block']


Number of clusters: 91
Number of outliers: 164/627 (26.156%)

Cohesion = 188.6185565246384
Separation = 17228.42480010001
G1 = 2716.3381317420653

Entropy = 0.525658
Purity = 0.842333
(entropy, purity): [(0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (1.0, 0.5), (0.722, 0.8), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.817, 0.833), (1.0, 0.5), (0.887, 0.781), (0.922, 0.8), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (1.0, 0.5), (0.0, 1.0), (0.0, 1.0), (1.0, 0.5), (1.0, 0.5), (0.0, 1.0), (0.918, 0.667), (0.0, 1.0), (0.544, 0.875), (0.918, 0.667), (1.0, 0.5), (1.5, 0.5), (1.0, 0.5), (0.503, 0.889), (0.0, 1.0), (1.041, 0.75), (0.918, 0.667), (0.0, 1.0), (0.0, 1.0), (1.0, 0.5), (1.0, 0.5), (0.0, 1.0), (0.0, 1.0), (1.

(94.777, 0.47769996232759837)

In [85]:
cur_dists = get_dists(hists_tufano_5gram, jaccard_metric, hist_len_5gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.67
Best F-measure: 0.46418703837818315
0.67
CLUSTERS:

395 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']


16 ['Remove statement']
56 ['Remove condition']
190 ['Add parameter in the method/constructor invocation']


53 ['Remove statement']
24 ['Move existing statements out of try block']
136 ['Move existing statements out of try block']
445 ['Move existing statements out of try block']
474 ['Move existing statements out of try block']


65 ['Remove statement']
135 ['Remove statement']


111 ['Remove statement']
132 ['R

(94.758, 0.46418703837818315)

In [86]:
cur_dists = get_dists(hists_tufano_5gram, canberra_metric, hist_len_5gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.99
Best F-measure: 0.46418703837818315
0.99
CLUSTERS:

395 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']


16 ['Remove statement']
56 ['Remove condition']
190 ['Add parameter in the method/constructor invocation']


53 ['Remove statement']
24 ['Move existing statements out of try block']
136 ['Move existing statements out of try block']
445 ['Move existing statements out of try block']
474 ['Move existing statements out of try block']


65 ['Remove statement']
135 ['Remove statement']


111 ['Remove statement']
132 ['R

606 ['Rename method']
609 ['Rename method']
611 ['Rename method']
617 ['Rename method']
632 ['Rename method']


507 ['Replace invoked method']
588 ['Replace invoked method']


169 ['Replace statement']
108 ['Remove try/catch']
129 ['Add statement']


83 ['Add null check']
151 ['Add null check']
152 ['Add null check']


122 ['Rename parameter']
128 ['Rename parameter']
143 ['Rename parameter']
165 ['Rename parameter']
336 ['Rename parameter']


207 ['Replace generic specification with diamond operator']
287 ['Replace generic specification with diamond operator']
364 ['Replace generic specification with diamond operator']
508 ['Replace generic specification with diamond operator']
528 ['Replace generic specification with diamond operator']
564 ['Replace generic specification with diamond operator']


221 ['Replace generic specification with diamond operator']
341 ['Replace generic specification with diamond operator']


205 ['Rename method']
556 ['Rename method']


596 ['Rename method']


(94.758, 0.46418703837818315)

In [87]:
cur_dists = get_dists(hists_tufano_5gram, cos_distance, hist_len_5gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.975
Best F-measure: 0.46418703837818315
0.975
CLUSTERS:

395 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']


16 ['Remove statement']
56 ['Remove condition']
190 ['Add parameter in the method/constructor invocation']


53 ['Remove statement']
24 ['Move existing statements out of try block']
136 ['Move existing statements out of try block']
445 ['Move existing statements out of try block']
474 ['Move existing statements out of try block']


65 ['Remove statement']
135 ['Remove statement']


111 ['Remove statement']
132 [

(94.758, 0.46418703837818315)

In [88]:
cur_dists = get_dists(hists_tufano_5gram, pearsons_correlation_mean, hist_len_5gram_tufano)
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.9550000000000001
Best F-measure: 0.46401235925613565
0.9550000000000001
CLUSTERS:

395 ['Add braces to if statement']
426 ['Add braces to if statement']
468 ['Add braces to if statement']


2 ['Forbid overriding: add final to method']
45 ['Forbid overriding: add final to method']
466 ['Forbid overriding: add final to method']


3 ['Remove statement']
9 ['Remove statement']
35 ['Remove statement']
59 ['Remove statement']
92 ['Remove statement']
93 ['Remove statement']
97 ['Remove statement']
110 ['Remove statement']
133 ['Remove statement']
137 ['Remove statement']
138 ['Remove statement']


16 ['Remove statement']
56 ['Remove condition']
190 ['Add parameter in the method/constructor invocation']


53 ['Remove statement']
24 ['Move existing statements out of try block']
136 ['Move existing statements out of try block']
445 ['Move existing statements out of try block']
474 ['Move existing statements out of try block']


65 ['Remove statement']
135 ['Remove statement']


111 

(94.737, 0.46401235925613565)

In [90]:
TUFANO_ACTIONS_PATH = "/Users/aliscafo/Documents/ALINA/WORK/SPbAU/thesis/CodeDiffEditScripts/TufanoActions"

edit_scripts = []

for double_num in tqdm_notebook(ids_per_unique_label):
    actions_file = open(TUFANO_ACTIONS_PATH + "/" + double_num + "/" + double_num + "/" + "sampleChange1")
    edit_script = actions_file.read().split("\n")
    edit_script = [elem for elem in edit_script if elem != '']
    
    edit_scripts.append(edit_script)
    
    if len(edit_script) == 5:
        print(double_num)
        print("\n".join(edit_script))
        print("\n")
    
    actions_file.close()


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))

392
INS 8@@ 25@@ at 1
INS 21@@ 8@@ at 0
MOV 32@@ 21@@ at 0
UPD 42@@
DEL 21@@


409
INS 78@@ 31@@ at 0
INS 40@@ 78@@ at 0
INS 8@@ 25@@ at 1
INS 41@@ 8@@ at 0
DEL 41@@


121
DEL 40@@
DEL 42@@
DEL 52@@
DEL 32@@
DEL 21@@


123
INS 78@@ 31@@ at 0
INS 40@@ 78@@ at 0
DEL 42@@
DEL 48@@
DEL 21@@


125
DEL 40@@
DEL 43@@
DEL 42@@
DEL 44@@
DEL 42@@


200
DEL 42@@
DEL 33@@
DEL 32@@
DEL 21@@
DEL 8@@


211
DEL 42@@
DEL 42@@
DEL 42@@
DEL 32@@
DEL 21@@


261
DEL 42@@
DEL 42@@
DEL 42@@
DEL 32@@
DEL 21@@


289
DEL 42@@
DEL 42@@
DEL 42@@
DEL 32@@
DEL 21@@


295
DEL 40@@
DEL 42@@
DEL 42@@
DEL 32@@
DEL 21@@


335
DEL 42@@
DEL 42@@
DEL 42@@
DEL 32@@
DEL 21@@


337
DEL 40@@
DEL 42@@
DEL 45@@
DEL 32@@
DEL 21@@


348
DEL 42@@
DEL 42@@
DEL 42@@
DEL 32@@
DEL 21@@


430
DEL 40@@
DEL 42@@
DEL 42@@
DEL 32@@
DEL 21@@


518
DEL 40@@
DEL 42@@
DEL 34@@
DEL 32@@
DEL 21@@


590
DEL 42@@
DEL 42@@
DEL 42@@
DEL 32@@
DEL 21@@


489
UPD 43@@
UPD 40@@
MOV 42@@ 32@@ at 2
DEL 42@@
DEL 32@@


7
INS 83@@ 31@@ at 2
MOV 8@@ 31@@ at 6

# Fixed n, without context

In [None]:
for i in range(1, 6):
    hists = hists_tufano[i]
    hists_len = hists_tufano_len[i]
    
    for j in range(len(distances)):
        dist = distances[j]
        dist_name = distances_names[j]
        
        cur_dists = get_dists(hists, dist, hists_len)
        link = 'average'
        best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

        print(best_eps)

        print("_______________________________")
        print(i, dist_name)
        print("_______________________________")
        
        tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

        print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)
        
        

In [122]:
for i in range(1, 6):
    hists = hists_tufano[i]
    hists_len = hists_tufano_len[i]
    
    for j in range(len(distances)):
        dist = distances[j]
        dist_name = distances_names[j]
        
        cur_dists = get_dists(hists, dist, hists_len)
        link = 'complete'
        best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.003)

        print(best_eps)

        print("_______________________________")
        print(i, dist_name)
        print("_______________________________")
        
        tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

        print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)
        
        

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


249 367


HBox(children=(IntProgress(value=0, max=249), HTML(value='')))


Best eps: 0.6722222222222223
Best F-measure: 0.4014052951767639
0.6722222222222223
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 40
Number of outliers: 15/627 (2.392%)

Cohesion = 260.854323206227
Separation = 56501.08034032663
G1 = 1091.9514078881075

Entropy = 2.264713
Purity = 0.473856
F-measure = 0.401405

Rand = 89.868%
Jaccard Index = 16.399%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


1164 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.627
Best F-measure: 0.4963336464770641
0.627
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 73
Number of outliers: 63/627 (10.048%)

Cohesion = 201.91674258467847
Separation = 19754.856917293855
G1 = 2146.922971392345

Entropy = 1.104558
Purity = 0.693262
F-measure = 0.496334

Rand = 93.392%
Jaccard Index = 23.68%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


1809 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.438
Best F-measure: 0.5007743360640162
0.438
_______________________________
1 cos_distance
_______________________________
Number of clusters: 63
Number of outliers: 46/627 (7.337%)

Cohesion = 233.2737421999571
Separation = 42488.90791696328
G1 = 2695.6927535343734

Entropy = 1.277213
Purity = 0.657487
F-measure = 0.500774

Rand = 93.276%
Jaccard Index = 23.975%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


5802 369


HBox(children=(IntProgress(value=0, max=369), HTML(value='')))


Best eps: 0.342
Best F-measure: 0.48553571034697335
0.342
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 72
Number of outliers: 61/627 (9.729%)

Cohesion = 256.08928752269946
Separation = 80075.05803123493
G1 = 5827.961616278907

Entropy = 1.126467
Purity = 0.690813
F-measure = 0.485536

Rand = 93.209%
Jaccard Index = 22.13%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


58 367


HBox(children=(IntProgress(value=0, max=58), HTML(value='')))


Best eps: 0.4
Best F-measure: 0.43008190779303934
0.4
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 68
Number of outliers: 47/627 (7.496%)

Cohesion = 254.95878900995424
Separation = 25124.382881612233
G1 = 927.3540037609831

Entropy = 1.448741
Purity = 0.601724
F-measure = 0.430082

Rand = 92.587%
Jaccard Index = 17.454%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


466 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.936
Best F-measure: 0.4892148523776107
0.936
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 70
Number of outliers: 43/627 (6.858%)

Cohesion = 176.52456948921287
Separation = 6570.655771763861
G1 = 545.7214841900781

Entropy = 1.267943
Purity = 0.650685
F-measure = 0.489215

Rand = 93.442%
Jaccard Index = 22.121%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


771 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.669
Best F-measure: 0.471458424294801
0.669
_______________________________
2 cos_distance
_______________________________
Number of clusters: 77
Number of outliers: 63/627 (10.048%)

Cohesion = 208.8783629503748
Separation = 12296.173789067014
G1 = 986.9434763121434

Entropy = 1.154534
Purity = 0.677305
F-measure = 0.471458

Rand = 93.337%
Jaccard Index = 21.101%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


2884 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.675
Best F-measure: 0.471458424294801
0.675
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 77
Number of outliers: 65/627 (10.367%)

Cohesion = 222.9979514936678
Separation = 37753.459908688805
G1 = 3431.3109803900365

Entropy = 1.152869
Purity = 0.677936
F-measure = 0.471458

Rand = 93.314%
Jaccard Index = 21.143%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


19 367


HBox(children=(IntProgress(value=0, max=19), HTML(value='')))


Best eps: 0.5833333333333334
Best F-measure: 0.4297236785911863
0.5833333333333334
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 86
Number of outliers: 67/627 (10.686%)

Cohesion = 235.7254761904762
Separation = 7250.2272948677255
G1 = 624.8984860856101

Entropy = 1.125639
Purity = 0.682143
F-measure = 0.429724

Rand = 93.119%
Jaccard Index = 17.968%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


194 367


HBox(children=(IntProgress(value=0, max=194), HTML(value='')))


Best eps: 0.8717948717948717
Best F-measure: 0.45943470609450293
0.8717948717948717
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 87
Number of outliers: 87/627 (13.876%)

Cohesion = 186.5763640775475
Separation = 1189.3335494881044
G1 = 305.9627891043375

Entropy = 0.981107
Purity = 0.709259
F-measure = 0.459435

Rand = 93.539%
Jaccard Index = 21.235%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


313 367


HBox(children=(IntProgress(value=0, max=313), HTML(value='')))


Best eps: 0.42264973081037427
Best F-measure: 0.45373295912662925
0.42264973081037427
_______________________________
3 cos_distance
_______________________________
Number of clusters: 90
Number of outliers: 124/627 (19.777%)

Cohesion = 195.7505484760398
Separation = 2142.829964784615
G1 = 553.6170303169647

Entropy = 0.866996
Purity = 0.745527
F-measure = 0.453733

Rand = 93.514%
Jaccard Index = 22.847%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


1107 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.873
Best F-measure: 0.45034035490903745
0.873
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 82
Number of outliers: 75/627 (11.962%)

Cohesion = 211.25152812737915
Separation = 29813.55480826695
G1 = 3385.2060243866604

Entropy = 1.099347
Purity = 0.686594
F-measure = 0.45034

Rand = 93.263%
Jaccard Index = 20.32%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


8 367


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))


Best eps: 0.16666666666666663
Best F-measure: 0.44084138294422825
0.16666666666666663
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 93
Number of outliers: 103/627 (16.427%)

Cohesion = 215.5
Separation = 1634.3650707702782
G1 = 229.75885962696307

Entropy = 0.901319
Purity = 0.734733
F-measure = 0.440841

Rand = 93.317%
Jaccard Index = 19.753%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


96 367


HBox(children=(IntProgress(value=0, max=96), HTML(value='')))


Best eps: 0.7
Best F-measure: 0.4417876158026513
0.7
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 86
Number of outliers: 141/627 (22.488%)

Cohesion = 185.8150068201791
Separation = 193.49005511963992
G1 = 46.553147078731634

Entropy = 0.845381
Purity = 0.748971
F-measure = 0.441788

Rand = 93.593%
Jaccard Index = 23.004%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


152 367


HBox(children=(IntProgress(value=0, max=152), HTML(value='')))


Best eps: 0.5188747756753118
Best F-measure: 0.44077639948043024
0.5188747756753118
_______________________________
4 cos_distance
_______________________________
Number of clusters: 87
Number of outliers: 137/627 (21.85%)

Cohesion = 192.1967798475936
Separation = 430.9434597843887
G1 = 103.53471612633841

Entropy = 0.853062
Purity = 0.746939
F-measure = 0.440776

Rand = 93.52%
Jaccard Index = 22.525%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


643 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.519
Best F-measure: 0.44077639948043024
0.519
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 87
Number of outliers: 137/627 (21.85%)

Cohesion = 204.20565129733228
Separation = 20859.025187634146
G1 = 2885.9946543528567

Entropy = 0.853062
Purity = 0.746939
F-measure = 0.440776

Rand = 93.52%
Jaccard Index = 22.525%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


6 367


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))


Best eps: 0.0625
Best F-measure: 0.41839400550073763
0.0625
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 87
Number of outliers: 131/627 (20.893%)

Cohesion = 204.5
Separation = 125.79812255643374
G1 = 30.7828373015873

Entropy = 0.910007
Purity = 0.721774
F-measure = 0.418394

Rand = 93.417%
Jaccard Index = 20.728%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


49 367


HBox(children=(IntProgress(value=0, max=49), HTML(value='')))


Best eps: 0.7647058823529411
Best F-measure: 0.4252654811885229
0.7647058823529411
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 87
Number of outliers: 148/627 (23.604%)

Cohesion = 186.2216227889757
Separation = 72.8995322617345
G1 = 12.196117057548502

Entropy = 0.820584
Purity = 0.755741
F-measure = 0.425265

Rand = 93.54%
Jaccard Index = 22.125%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


74 367


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


Best eps: 0.5799159747915971
Best F-measure: 0.4252654811885229
0.5799159747915971
_______________________________
5 cos_distance
_______________________________
Number of clusters: 88
Number of outliers: 145/627 (23.126%)

Cohesion = 189.816317016435
Separation = 119.16970070551402
G1 = 17.32570650626595

Entropy = 0.815477
Purity = 0.757261
F-measure = 0.425265

Rand = 93.45%
Jaccard Index = 21.722%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


393 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.588
Best F-measure: 0.4252654811885229
0.588
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 88
Number of outliers: 145/627 (23.126%)

Cohesion = 201.36316995491768
Separation = 19737.008493396286
G1 = 2830.390069548673

Entropy = 0.815477
Purity = 0.757261
F-measure = 0.425265

Rand = 93.45%
Jaccard Index = 21.722%


In [125]:
LASE_ACTIONS_PATH = "/Users/aliscafo/Documents/ALINA/WORK/SPbAU/thesis/CodeDiffEditScripts/TufanoActions"

edit_scripts_lase = []

for double_num in tqdm_notebook(ids_per_unique_label):
    actions_file = open(LASE_ACTIONS_PATH + "/" + double_num + "/" + double_num + "/" + "sampleChange1")
    edit_script = actions_file.read().split("\n")
    edit_script = [elem for elem in edit_script if elem != '']
    
    print(len(edit_script), from_change_to_unique_label[double_num])
    
    edit_scripts_lase.append(edit_script)
    actions_file.close()
    

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))

6 ['Add braces to if statement']
5 ['Add braces to if statement']
4 ['Add braces to if statement']
5 ['Add braces to if statement']
4 ['Add braces to if statement']
3 ['Add braces to if statement']
4 ['Add braces to if statement']
3 ['Forbid overriding: add final to method']
3 ['Forbid overriding: add final to method']
3 ['Forbid overriding: add final to method']
10 ['Remove statement']
7 ['Remove statement']
18 ['Remove statement']
7 ['Remove statement']
13 ['Remove statement']
8 ['Remove statement']
7 ['Remove statement']
10 ['Remove statement']
9 ['Remove statement']
4 ['Remove statement']
9 ['Remove statement']
7 ['Remove statement']
10 ['Remove statement']
7 ['Remove statement']
10 ['Remove statement']
7 ['Remove statement']
4 ['Remove statement']
5 ['Remove statement']
5 ['Remove statement']
5 ['Remove statement']
7 ['Remove statement']
10 ['Remove statement']
9 ['Remove statement']
7 ['Remove statement']
7 ['Remove statement']
7 ['Remove statement']
14 ['Remove statement']
9 ['R

1 ['Change parameter value of invoked method']
1 ['Change parameter value of invoked method']
1 ['Change parameter value of invoked method']
2 ['Change parameter value of invoked method']
4 ['Change parameter value of invoked method']
2 ['Change parameter value of invoked method']
4 ['Change parameter value of invoked method']
8 ['Change parameter value of invoked method']
1 ['Change parameter value of invoked method']
1 ['Change parameter value of invoked method']
1 ['Change parameter value of invoked method']
2 ['Change parameter value of invoked method']
1 ['Change parameter value of invoked method']
2 ['Change parameter value of invoked method']
6 ['Change parameter value of invoked method']
3 ['Change parameter value of invoked method']
3 ['Replace fully qualified name with import or vice versa']
1 ['Replace fully qualified name with import or vice versa']
3 ['Replace fully qualified name with import or vice versa']
1 ['Replace fully qualified name with import or vice versa']
1 ['

In [126]:
dists_for_lcs_lase = []

for es1 in tqdm_notebook(edit_scripts_lase):
    cur_dists = []
    for es2 in edit_scripts_lase:
        cur_lcs = lcs(es1, es2)
        dist = 1 - cur_lcs / max(len(es1), len(es2))
        cur_dists.append(dist)
    dists_for_lcs_lase.append(cur_dists)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [128]:
cur_dists = dists_for_lcs_lase
link = 'average'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.62
Best F-measure: 0.48855668584218703
0.62
Number of clusters: 58
Number of outliers: 39/627 (6.22%)

Cohesion = 191.74560920990658
Separation = 22554.256642361768
G1 = 1587.3188918243009

Entropy = 1.465264
Purity = 0.610544
F-measure = 0.488557

Rand = 92.705%
Jaccard Index = 24.012%


(92.705, 0.48855668584218703)

In [130]:
cur_dists = dists_for_lcs_lase
link = 'complete'
best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link)

print(best_eps)

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=118), HTML(value='')))


Best eps: 0.76
Best F-measure: 0.4994008568424561
0.76
Number of clusters: 57
Number of outliers: 24/627 (3.828%)

Cohesion = 192.2861686399549
Separation = 27482.872863932236
G1 = 1543.4644093459096

Entropy = 1.474871
Purity = 0.615257
F-measure = 0.499401

Rand = 93.229%
Jaccard Index = 22.628%


(93.229, 0.4994008568424561)

In [143]:
dists_for_equals_lase = []

for es1 in tqdm_notebook(edit_scripts_lase):
    cur_dists = []
    for es2 in edit_scripts_lase:
        if es1 == es2:
            dist = 0.0
        else:
            dist = 1.0
        cur_dists.append(dist)
    dists_for_lcs_lase.append(cur_dists)

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [147]:
cur_dists = dists_for_equals_lase
link = 'average'

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=0.1).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
cur_dists = dists_for_equals_lase
link = 'complete'

tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=0.1).fit(cur_dists)

print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)

# Concat, without context

In [131]:
RESULTS_BRUTE_TUFANO_1GRAM = "/Volumes/Seagate/Alina/result_for_tufano-1gram"
RESULTS_BRUTE_TUFANO_2GRAM = "/Volumes/Seagate/Alina/result_for_tufano-2gram"
RESULTS_BRUTE_TUFANO_3GRAM = "/Volumes/Seagate/Alina/result_for_tufano-3gram"
RESULTS_BRUTE_TUFANO_4GRAM = "/Volumes/Seagate/Alina/result_for_tufano-4gram"
RESULTS_BRUTE_TUFANO_5GRAM = "/Volumes/Seagate/Alina/result_for_tufano-5gram"

In [132]:
hists_tufano_1gram = get_hists(RESULTS_BRUTE_TUFANO_1GRAM, "1gram")
hists_tufano_2gram = get_hists(RESULTS_BRUTE_TUFANO_2GRAM, "2gram")
hists_tufano_3gram = get_hists(RESULTS_BRUTE_TUFANO_3GRAM, "3gram")
hists_tufano_4gram = get_hists(RESULTS_BRUTE_TUFANO_4GRAM, "4gram")
hists_tufano_5gram = get_hists(RESULTS_BRUTE_TUFANO_5GRAM, "5gram")

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [133]:
hist_len_1gram_tufano = get_hists_len(RESULTS_BRUTE_TUFANO_1GRAM, "1gram")
hist_len_2gram_tufano = get_hists_len(RESULTS_BRUTE_TUFANO_2GRAM, "2gram")
hist_len_3gram_tufano = get_hists_len(RESULTS_BRUTE_TUFANO_3GRAM, "3gram")
hist_len_4gram_tufano = get_hists_len(RESULTS_BRUTE_TUFANO_4GRAM, "4gram")
hist_len_5gram_tufano = get_hists_len(RESULTS_BRUTE_TUFANO_5GRAM, "5gram")

244
482
607
656
629


In [134]:
concat_hists_upd_to_2gram_len = hist_len_1gram_tufano + hist_len_2gram_tufano

concat_hists_upd_to_3gram_len = hist_len_1gram_tufano + hist_len_2gram_tufano + hist_len_3gram_tufano

concat_hists_upd_to_4gram_len = hist_len_1gram_tufano + hist_len_2gram_tufano + hist_len_3gram_tufano
concat_hists_upd_to_4gram_len += hist_len_4gram_tufano

concat_hists_upd_to_5gram_len = hist_len_1gram_tufano + hist_len_2gram_tufano + hist_len_3gram_tufano
concat_hists_upd_to_5gram_len += hist_len_4gram_tufano + hist_len_5gram_tufano

In [135]:
concat_hists_upd_2gram = []

num_changes = len(hists_tufano_1gram)

for i in tqdm_notebook(range(num_changes)):
    concat_hist = []
    
    for gram_ind, amount in hists_tufano_1gram[i]:
        concat_hist.append((gram_ind, amount))
    
    for gram_ind, amount in hists_tufano_2gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano, amount))
            
    concat_hists_upd_2gram.append(concat_hist)

    

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [136]:
concat_hists_upd_3gram = []

num_changes = len(hists_tufano_1gram)

for i in tqdm_notebook(range(num_changes)):
    concat_hist = []
    
    for gram_ind, amount in hists_tufano_1gram[i]:
        concat_hist.append((gram_ind, amount))
    
    for gram_ind, amount in hists_tufano_2gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano, amount))
        
    for gram_ind, amount in hists_tufano_3gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano + hist_len_2gram_tufano, amount))
            
    concat_hists_upd_3gram.append(concat_hist)

    

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [137]:
concat_hists_upd_4gram = []

num_changes = len(hists_tufano_1gram)

for i in tqdm_notebook(range(num_changes)):
    concat_hist = []
    
    for gram_ind, amount in hists_tufano_1gram[i]:
        concat_hist.append((gram_ind, amount))
    
    for gram_ind, amount in hists_tufano_2gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano, amount))
        
    for gram_ind, amount in hists_tufano_3gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano + hist_len_2gram_tufano, amount))
        
    for gram_ind, amount in hists_tufano_4gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano + hist_len_2gram_tufano +
                            hist_len_3gram_tufano, amount))    
            
    concat_hists_upd_4gram.append(concat_hist)

    

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [138]:
concat_hists_upd_5gram = []

num_changes = len(hists_tufano_1gram)

for i in tqdm_notebook(range(num_changes)):
    concat_hist = []
    
    for gram_ind, amount in hists_tufano_1gram[i]:
        concat_hist.append((gram_ind, amount))
    
    for gram_ind, amount in hists_tufano_2gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano, amount))
        
    for gram_ind, amount in hists_tufano_3gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano + hist_len_2gram_tufano, amount))
        
    for gram_ind, amount in hists_tufano_4gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano + hist_len_2gram_tufano +
                            hist_len_3gram_tufano, amount))    
        
    for gram_ind, amount in hists_tufano_5gram[i]:
        concat_hist.append((gram_ind + hist_len_1gram_tufano + hist_len_2gram_tufano +
                            hist_len_3gram_tufano + hist_len_4gram_tufano, amount))  
            
    concat_hists_upd_5gram.append(concat_hist)

    

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




In [139]:
concat_hists_tufano = []

concat_hists_tufano.append([]) # zero array
concat_hists_tufano.append(hists_tufano_1gram)
concat_hists_tufano.append(concat_hists_upd_2gram)
concat_hists_tufano.append(concat_hists_upd_3gram)
concat_hists_tufano.append(concat_hists_upd_4gram)
concat_hists_tufano.append(concat_hists_upd_5gram)


In [140]:
concat_hists_tufano_len = []

concat_hists_tufano_len.append(0) # zero
concat_hists_tufano_len.append(hist_len_1gram_tufano)
concat_hists_tufano_len.append(concat_hists_upd_to_2gram_len)
concat_hists_tufano_len.append(concat_hists_upd_to_3gram_len)
concat_hists_tufano_len.append(concat_hists_upd_to_4gram_len)
concat_hists_tufano_len.append(concat_hists_upd_to_5gram_len)


In [141]:
for i in range(1, 6):
    hists = concat_hists_tufano[i]
    hists_len = concat_hists_tufano_len[i]
    
    for j in range(len(distances)):
        dist = distances[j]
        dist_name = distances_names[j]
        
        cur_dists = get_dists(hists, dist, hists_len)
        link = 'average'
        best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

        print(best_eps)

        print("_______________________________")
        print(i, dist_name)
        print("_______________________________")
        
        tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

        print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)
        
        

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.04
Best F-measure: 0.42564245806908346
0.04
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 55
Number of outliers: 21/627 (3.349%)

Cohesion = 274.41488337163685
Separation = 66455.8330343099
G1 = 2838.8180289859124

Entropy = 1.878762
Purity = 0.542904
F-measure = 0.425642

Rand = 92.139%
Jaccard Index = 18.268%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.625
Best F-measure: 0.5074974224189802
0.625
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 60
Number of outliers: 57/627 (9.091%)

Cohesion = 190.92460444374964
Separation = 18485.590108184817
G1 = 1402.3506694114346

Entropy = 1.254324
Purity = 0.675439
F-measure = 0.507497

Rand = 93.016%
Jaccard Index = 25.405%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.255
Best F-measure: 0.49897725764851786
0.255
_______________________________
1 cos_distance
_______________________________
Number of clusters: 71
Number of outliers: 80/627 (12.759%)

Cohesion = 226.0580182504343
Separation = 35745.32067583748
G1 = 3380.438376892599

Entropy = 1.070603
Purity = 0.709324
F-measure = 0.498977

Rand = 93.374%
Jaccard Index = 26.78%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=222), HTML(value='')))


Best eps: 0.23
Best F-measure: 0.5005914860570045
0.23
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 74
Number of outliers: 82/627 (13.078%)

Cohesion = 249.29234045866298
Separation = 70642.6704831098
G1 = 6545.095568385952

Entropy = 1.030649
Purity = 0.717431
F-measure = 0.500591

Rand = 93.359%
Jaccard Index = 26.859%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.185
Best F-measure: 0.4076660219743155
0.185
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 44
Number of outliers: 22/627 (3.509%)

Cohesion = 274.54692921346384
Separation = 57687.205097914644
G1 = 1227.9891652951865

Entropy = 2.165251
Purity = 0.502479
F-measure = 0.407666

Rand = 90.605%
Jaccard Index = 17.524%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.6900000000000001
Best F-measure: 0.5083857975516979
0.6900000000000001
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 69
Number of outliers: 68/627 (10.845%)

Cohesion = 181.72954768862851
Separation = 12314.995940362654
G1 = 1396.7317063219366

Entropy = 1.161737
Purity = 0.694097
F-measure = 0.508386

Rand = 93.44%
Jaccard Index = 25.555%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.445
Best F-measure: 0.4911284971385502
0.445
_______________________________
2 cos_distance
_______________________________
Number of clusters: 62
Number of outliers: 60/627 (9.569%)

Cohesion = 215.71327319017217
Separation = 27958.95214697576
G1 = 2169.0611919386356

Entropy = 1.344928
Purity = 0.650794
F-measure = 0.491128

Rand = 92.542%
Jaccard Index = 23.587%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.43
Best F-measure: 0.49786556430785117
0.43
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 66
Number of outliers: 62/627 (9.888%)

Cohesion = 234.31917809800248
Separation = 55449.06769168832
G1 = 4285.784121882865

Entropy = 1.229827
Purity = 0.677876
F-measure = 0.497866

Rand = 93.224%
Jaccard Index = 25.704%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.18
Best F-measure: 0.4028724907441601
0.18
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 44
Number of outliers: 22/627 (3.509%)

Cohesion = 274.48355067203056
Separation = 58514.196914719156
G1 = 1174.4277276944601

Entropy = 2.153049
Purity = 0.500826
F-measure = 0.402872

Rand = 90.593%
Jaccard Index = 17.162%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.765
Best F-measure: 0.502540138258273
0.765
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 67
Number of outliers: 66/627 (10.526%)

Cohesion = 170.8178431032783
Separation = 9526.600977765786
G1 = 1040.846476944222

Entropy = 1.237275
Purity = 0.677362
F-measure = 0.50254

Rand = 93.279%
Jaccard Index = 24.409%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.505
Best F-measure: 0.48215122981481534
0.505
_______________________________
3 cos_distance
_______________________________
Number of clusters: 65
Number of outliers: 59/627 (9.41%)

Cohesion = 208.4262100436822
Separation = 24322.169757170646
G1 = 1834.3855027980585

Entropy = 1.317396
Purity = 0.658451
F-measure = 0.482151

Rand = 92.691%
Jaccard Index = 22.703%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.505
Best F-measure: 0.48138736780154323
0.505
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 65
Number of outliers: 63/627 (10.048%)

Cohesion = 221.12164519974104
Separation = 47236.95408723809
G1 = 3647.7006298629503

Entropy = 1.311102
Purity = 0.656028
F-measure = 0.481387

Rand = 92.684%
Jaccard Index = 22.863%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.17500000000000002
Best F-measure: 0.4011688035370513
0.17500000000000002
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 44
Number of outliers: 22/627 (3.509%)

Cohesion = 274.3395737072228
Separation = 58948.48541542995
G1 = 1159.0136716569364

Entropy = 2.163643
Purity = 0.499174
F-measure = 0.401169

Rand = 90.56%
Jaccard Index = 16.754%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.765
Best F-measure: 0.4963539384526771
0.765
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 77
Number of outliers: 71/627 (11.324%)

Cohesion = 171.99520845010278
Separation = 8130.6694865531335
G1 = 1328.2620924486566

Entropy = 1.094838
Purity = 0.701439
F-measure = 0.496354

Rand = 93.285%
Jaccard Index = 23.475%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.55
Best F-measure: 0.4772337025724002
0.55
_______________________________
4 cos_distance
_______________________________
Number of clusters: 64
Number of outliers: 63/627 (10.048%)

Cohesion = 202.75166259384324
Separation = 21665.116915869203
G1 = 1614.7719362024252

Entropy = 1.350008
Purity = 0.652482
F-measure = 0.477234

Rand = 92.578%
Jaccard Index = 22.56%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.555
Best F-measure: 0.47830013093737916
0.555
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 64
Number of outliers: 65/627 (10.367%)

Cohesion = 214.79095467313002
Separation = 43722.156083182985
G1 = 3182.325866662627

Entropy = 1.339691
Purity = 0.654804
F-measure = 0.4783

Rand = 92.562%
Jaccard Index = 22.907%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.17500000000000002
Best F-measure: 0.4011688035370513
0.17500000000000002
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 44
Number of outliers: 22/627 (3.509%)

Cohesion = 274.34447803593025
Separation = 58952.95696251678
G1 = 1159.4934347055284

Entropy = 2.163643
Purity = 0.499174
F-measure = 0.401169

Rand = 90.56%
Jaccard Index = 16.754%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.78
Best F-measure: 0.49302500203143784
0.78
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 75
Number of outliers: 77/627 (12.281%)

Cohesion = 170.58282060245142
Separation = 7146.092795385471
G1 = 1071.540596913852

Entropy = 1.13419
Purity = 0.692727
F-measure = 0.493025

Rand = 93.372%
Jaccard Index = 23.677%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.54
Best F-measure: 0.4751951712770497
0.54
_______________________________
5 cos_distance
_______________________________
Number of clusters: 69
Number of outliers: 72/627 (11.483%)

Cohesion = 201.3009154488032
Separation = 20286.931007701398
G1 = 2017.9349276434116

Entropy = 1.270197
Purity = 0.659459
F-measure = 0.475195

Rand = 92.764%
Jaccard Index = 22.42%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.51
Best F-measure: 0.4747044053338121
0.51
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 69
Number of outliers: 82/627 (13.078%)

Cohesion = 212.7041372245011
Separation = 40000.09468234771
G1 = 3715.975269414795

Entropy = 1.239309
Purity = 0.666055
F-measure = 0.474704

Rand = 92.832%
Jaccard Index = 23.161%


In [142]:
for i in range(1, 6):
    hists = concat_hists_tufano[i]
    hists_len = concat_hists_tufano_len[i]
    
    for j in range(len(distances)):
        dist = distances[j]
        dist_name = distances_names[j]
        
        cur_dists = get_dists(hists, dist, hists_len)
        link = 'complete'
        best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    agglomerative=True, linkage=link, with_step=0.003)

        print(best_eps)

        print("_______________________________")
        print(i, dist_name)
        print("_______________________________")
        
        tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

        print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                      without_outliers=True)
        
        

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


249 367


HBox(children=(IntProgress(value=0, max=249), HTML(value='')))


Best eps: 0.6722222222222223
Best F-measure: 0.4014052951767639
0.6722222222222223
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 40
Number of outliers: 15/627 (2.392%)

Cohesion = 260.854323206227
Separation = 56501.08034032663
G1 = 1091.9514078881075

Entropy = 2.264713
Purity = 0.473856
F-measure = 0.401405

Rand = 89.868%
Jaccard Index = 16.399%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


1164 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.627
Best F-measure: 0.4963336464770641
0.627
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 73
Number of outliers: 63/627 (10.048%)

Cohesion = 201.91674258467847
Separation = 19754.856917293855
G1 = 2146.922971392345

Entropy = 1.104558
Purity = 0.693262
F-measure = 0.496334

Rand = 93.392%
Jaccard Index = 23.68%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


1809 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.438
Best F-measure: 0.5007743360640162
0.438
_______________________________
1 cos_distance
_______________________________
Number of clusters: 63
Number of outliers: 46/627 (7.337%)

Cohesion = 233.2737421999571
Separation = 42488.90791696328
G1 = 2695.6927535343734

Entropy = 1.277213
Purity = 0.657487
F-measure = 0.500774

Rand = 93.276%
Jaccard Index = 23.975%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


5802 369


HBox(children=(IntProgress(value=0, max=369), HTML(value='')))


Best eps: 0.342
Best F-measure: 0.48553571034697335
0.342
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 72
Number of outliers: 61/627 (9.729%)

Cohesion = 256.08928752269946
Separation = 80075.05803123493
G1 = 5827.961616278907

Entropy = 1.126467
Purity = 0.690813
F-measure = 0.485536

Rand = 93.209%
Jaccard Index = 22.13%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


445 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.669
Best F-measure: 0.41019528095852453
0.669
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 39
Number of outliers: 14/627 (2.233%)

Cohesion = 253.05389413876557
Separation = 50208.54059999524
G1 = 1112.620163256819

Entropy = 2.284457
Purity = 0.479608
F-measure = 0.410195

Rand = 88.422%
Jaccard Index = 17.717%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


2030 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.837
Best F-measure: 0.5085693297128325
0.837
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 75
Number of outliers: 32/627 (5.104%)

Cohesion = 179.93282228110036
Separation = 14814.315519084086
G1 = 1780.8748386430293

Entropy = 1.23042
Purity = 0.662185
F-measure = 0.508569

Rand = 93.766%
Jaccard Index = 24.458%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


3051 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.405
Best F-measure: 0.48212805764446587
0.405
_______________________________
2 cos_distance
_______________________________
Number of clusters: 78
Number of outliers: 71/627 (11.324%)

Cohesion = 218.27327084479998
Separation = 31569.750185385907
G1 = 3607.0207060639304

Entropy = 1.059261
Purity = 0.703237
F-measure = 0.482128

Rand = 93.226%
Jaccard Index = 22.048%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


7528 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.41100000000000003
Best F-measure: 0.48212805764446587
0.41100000000000003
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 78
Number of outliers: 71/627 (11.324%)

Cohesion = 234.17934694892054
Separation = 57248.13365253277
G1 = 5793.886953907067

Entropy = 1.059261
Purity = 0.703237
F-measure = 0.482128

Rand = 93.226%
Jaccard Index = 22.048%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


506 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.669
Best F-measure: 0.40547900674974535
0.669
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 38
Number of outliers: 14/627 (2.233%)

Cohesion = 253.11849993835057
Separation = 48410.014558028655
G1 = 831.8005915034913

Entropy = 2.336131
Purity = 0.479608
F-measure = 0.405479

Rand = 87.996%
Jaccard Index = 17.156%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


2457 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.834
Best F-measure: 0.4944964276904922
0.834
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 79
Number of outliers: 49/627 (7.815%)

Cohesion = 178.9484035989162
Separation = 11262.41604244207
G1 = 1646.3001074514457

Entropy = 1.148925
Purity = 0.678201
F-measure = 0.494496

Rand = 93.502%
Jaccard Index = 22.268%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


3408 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.51
Best F-measure: 0.4757512468080085
0.51
_______________________________
3 cos_distance
_______________________________
Number of clusters: 78
Number of outliers: 65/627 (10.367%)

Cohesion = 213.17901534483377
Separation = 27457.34088072664
G1 = 3377.1724564771066

Entropy = 1.113056
Purity = 0.683274
F-measure = 0.475751

Rand = 93.195%
Jaccard Index = 21.314%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


7521 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.594
Best F-measure: 0.47774884158636677
0.594
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 72
Number of outliers: 57/627 (9.091%)

Cohesion = 225.59440280673664
Separation = 52084.640497460576
G1 = 4630.497788569756

Entropy = 1.18862
Purity = 0.675439
F-measure = 0.477749

Rand = 93.148%
Jaccard Index = 21.349%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


516 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.669
Best F-measure: 0.40547900674974535
0.669
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 38
Number of outliers: 14/627 (2.233%)

Cohesion = 253.14681113039416
Separation = 48434.65920764657
G1 = 832.4665447701313

Entropy = 2.336131
Purity = 0.479608
F-measure = 0.405479

Rand = 87.996%
Jaccard Index = 17.156%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


2694 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.867
Best F-measure: 0.49485001939145107
0.867
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 80
Number of outliers: 48/627 (7.656%)

Cohesion = 170.49699361262952
Separation = 9516.491729231302
G1 = 1696.6478054304885

Entropy = 1.172044
Purity = 0.673575
F-measure = 0.49485

Rand = 93.449%
Jaccard Index = 22.343%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


3052 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.531
Best F-measure: 0.4931648201267822
0.531
_______________________________
4 cos_distance
_______________________________
Number of clusters: 81
Number of outliers: 69/627 (11.005%)

Cohesion = 209.7731142613657
Separation = 24868.065775127976
G1 = 3720.2094216396067

Entropy = 1.04703
Purity = 0.700717
F-measure = 0.493165

Rand = 93.438%
Jaccard Index = 23.429%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


7453 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.8160000000000001
Best F-measure: 0.4946333630416104
0.8160000000000001
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 56
Number of outliers: 30/627 (4.785%)

Cohesion = 206.100927641061
Separation = 52664.67841503166
G1 = 3970.9736830788397

Entropy = 1.546207
Purity = 0.59464
F-measure = 0.494633

Rand = 92.756%
Jaccard Index = 22.531%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


513 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.669
Best F-measure: 0.40547900674974535
0.669
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 38
Number of outliers: 14/627 (2.233%)

Cohesion = 253.15427953962137
Separation = 48437.77709454721
G1 = 832.5515415835889

Entropy = 2.336131
Purity = 0.479608
F-measure = 0.405479

Rand = 87.996%
Jaccard Index = 17.156%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


2880 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.873
Best F-measure: 0.4990773812365414
0.873
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 84
Number of outliers: 51/627 (8.134%)

Cohesion = 169.45391658719555
Separation = 8667.651326928612
G1 = 1912.2406700098131

Entropy = 1.128013
Purity = 0.689236
F-measure = 0.499077

Rand = 93.484%
Jaccard Index = 22.556%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


3987 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.801
Best F-measure: 0.4964601274467097
0.801
_______________________________
5 cos_distance
_______________________________
Number of clusters: 58
Number of outliers: 33/627 (5.263%)

Cohesion = 194.90423042917547
Separation = 25809.328483585705
G1 = 1762.3890725876183

Entropy = 1.456527
Purity = 0.614478
F-measure = 0.49646

Rand = 93.096%
Jaccard Index = 22.784%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))


7426 367


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Best eps: 0.639
Best F-measure: 0.49676902541406887
0.639
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 74
Number of outliers: 62/627 (9.888%)

Cohesion = 215.91346284027642
Separation = 47036.162832884205
G1 = 4818.945919193867

Entropy = 1.154307
Purity = 0.681416
F-measure = 0.496769

Rand = 93.2%
Jaccard Index = 22.714%


In [146]:
hists = concat_hists_tufano[4]
hists_len = concat_hists_tufano_len[4]

for j in range(len(distances)):
    dist = distances[j]
    dist_name = distances_names[j]

    cur_dists = get_dists(hists, dist, hists_len)
    link = 'complete'
    best_eps = find_eps_with_brute_force_universal(cur_dists, unique_label_to_changes, 
                                from_change_to_unique_label, ids_per_unique_label, 
                                agglomerative=True, linkage=link)

    print(best_eps)

    print("_______________________________")
    print(4, dist_name)
    print("_______________________________")

    tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                        affinity='precomputed', 
                                                        linkage=link, 
                                                        compute_full_tree=True,
                                                        distance_threshold=best_eps).fit(cur_dists)

    print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                            unique_label_to_changes, from_change_to_unique_label, ids_per_unique_label,
                                  without_outliers=True)


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=516), HTML(value='')))


Best eps: 0.675
Best F-measure: 0.40547900674974535
0.675
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 38
Number of outliers: 14/627 (2.233%)

Cohesion = 253.14681113039416
Separation = 48434.65920764657
G1 = 832.4665447701313

Entropy = 2.336131
Purity = 0.479608
F-measure = 0.405479

Rand = 87.996%
Jaccard Index = 17.156%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2694), HTML(value='')))


Best eps: 0.8642857142857143
Best F-measure: 0.49673760548956347
0.8642857142857143
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 81
Number of outliers: 48/627 (7.656%)

Cohesion = 170.65765716390345
Separation = 9510.717794680697
G1 = 1733.6674382444373

Entropy = 1.161681
Purity = 0.675302
F-measure = 0.496738

Rand = 93.454%
Jaccard Index = 22.358%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3052), HTML(value='')))


Best eps: 0.5309029062834291
Best F-measure: 0.4931648201267822
0.5309029062834291
_______________________________
5 cos_distance
_______________________________
Number of clusters: 81
Number of outliers: 69/627 (11.005%)

Cohesion = 209.7731142613657
Separation = 24868.065775127976
G1 = 3720.2094216396067

Entropy = 1.04703
Purity = 0.700717
F-measure = 0.493165

Rand = 93.438%
Jaccard Index = 23.429%


HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7453), HTML(value='')))


Best eps: 0.8078843745641707
Best F-measure: 0.4960692378690646
0.8078843745641707
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 56
Number of outliers: 34/627 (5.423%)

Cohesion = 209.85561651405277
Separation = 51271.906319479654
G1 = 3397.78228669994

Entropy = 1.485602
Purity = 0.613828
F-measure = 0.496069

Rand = 92.834%
Jaccard Index = 22.946%
