In [1]:
import numpy as np

import matplotlib as mpl
mpl.rcParams['savefig.dpi'] = 100
mpl.rcParams['figure.dpi'] = 100

import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import string
import mplcursors
import collections
import sklearn.cluster
import os
from tqdm import tqdm_notebook
#%matplotlib inline
%matplotlib notebook
#%matplotlib notebook

In [39]:
# NEW IMPLEMENTATION

import math

def get_intersections(hist1, hist2):
    """Collect the bins that are present in both sparse histograms.

    Both arguments are sequences of ``(index, value)`` pairs. The walk below
    only ever advances past the smaller index, so every shared index is found
    only when both sequences are ordered by increasing index.

    Returns a list of ``(index, value_in_hist1, value_in_hist2)`` tuples, in
    the order the shared indices are met.
    """
    shared = []
    pos1, pos2 = 0, 0
    size1, size2 = len(hist1), len(hist2)

    while pos1 < size1 and pos2 < size2:
        entry1 = hist1[pos1]
        entry2 = hist2[pos2]

        if entry1[0] == entry2[0]:
            # Same bin on both sides: record it and move both cursors.
            shared.append((entry1[0], entry1[1], entry2[1]))
            pos1 += 1
            pos2 += 1
        elif entry1[0] < entry2[0]:
            pos1 += 1
        elif entry1[0] > entry2[0]:
            pos2 += 1

    return shared

def get_union_inds(hist1, hist2):
    """List every bin index that occurs in at least one of two histograms.

    Both arguments are sequences of ``(index, value)`` pairs; only the index
    part is read. The result is a merge of the two index sequences in which an
    index present at the current position of both inputs is emitted once.
    """
    merged = []
    pos1 = 0
    pos2 = 0
    size1 = len(hist1)
    size2 = len(hist2)

    # Phase 1: both inputs still have entries, emit the smaller index.
    while pos1 < size1 and pos2 < size2:
        ind1 = hist1[pos1][0]
        ind2 = hist2[pos2][0]

        if ind1 < ind2:
            merged.append(ind1)
            pos1 += 1
        elif ind1 == ind2:
            merged.append(ind1)
            pos1 += 1
            pos2 += 1
        elif ind1 > ind2:
            merged.append(ind2)
            pos2 += 1

    # Phase 2: at most one of the inputs has a tail left; copy it over.
    for rest in range(pos1, size1):
        merged.append(hist1[rest][0])
    for rest in range(pos2, size2):
        merged.append(hist2[rest][0])

    return merged


def jaccard_metric(hist1, hist2, hist_len=None):
    """Distance based on the mean min/max ratio over the shared bins.

    For every bin present in both histograms the ratio ``min(v1, v2) /
    max(v1, v2)`` is computed; the distance is one minus the average of those
    ratios. Bins that occur in only one histogram are not taken into account.
    When the histograms share no bin the distance is 1.

    ``hist_len`` is accepted so that all metrics share one call signature; it
    is not used here.
    """
    shared = get_intersections(hist1, hist2)
    shared_count = len(shared)

    if shared_count == 0:
        return 1

    ratio_sum = 0
    for _, value1, value2 in shared:
        ratio_sum += min(value1, value2) / max(value1, value2)

    return 1 - ratio_sum / shared_count

def canberra_metric(hist1, hist2, hist_len=None):
    """Canberra distance between two sparse histograms, averaged over the union.

    Both histograms are sequences of ``(index, value)`` pairs ordered by
    increasing index. Every index of the union contributes one term:

    * index present in only one histogram -> ``1``
      (``|v - 0| / (|v| + |0|)``);
    * index present in both -> ``|v1 - v2| / (|v1| + |v2|)``, where a term
      with a zero denominator (both values zero) counts as ``0``, following
      the usual 0/0 = 0 convention for the Canberra distance.

    The sum is divided by the number of union indices. Two empty histograms
    yield 1.

    ``hist_len`` is accepted so that all metrics share one call signature; it
    is not used here.
    """
    total = 0
    union_len = 0
    i = 0
    j = 0
    n = len(hist1)
    m = len(hist2)

    while i < n or j < m:
        if i >= n:
            # Only hist2 has entries left.
            total += 1.0
            j += 1
        elif j >= m:
            # Only hist1 has entries left.
            total += 1.0
            i += 1
        elif hist1[i][0] == hist2[j][0]:
            value1 = hist1[i][1]
            value2 = hist2[j][1]
            denominator = abs(value1) + abs(value2)
            # Both values zero: the bins are identical, the term is 0.
            if denominator != 0:
                total += abs(value1 - value2) / denominator
            i += 1
            j += 1
        elif hist1[i][0] < hist2[j][0]:
            total += 1.0
            i += 1
        else:
            total += 1.0
            j += 1

        # Each pass of the loop consumes exactly one union index, so the
        # union size is counted here instead of in a second merge pass.
        union_len += 1

    if union_len == 0:
        return 1

    return total / union_len


def canberra_metric_optimized(hist1, hist2, hist_len=None):
    """Single-pass Canberra distance between two sparse histograms.

    Both histograms are sequences of ``(index, value)`` pairs ordered by
    increasing index. The two sequences are merged once; each union index
    adds one term to the sum:

    * ``1.0`` when the index occurs in only one histogram;
    * ``|v1 - v2| / (|v1| + |v2|)`` when it occurs in both. If both values are
      zero the denominator is zero and the term is taken as ``0`` (the usual
      0/0 = 0 convention for the Canberra distance) instead of raising
      ``ZeroDivisionError``.

    The result is the sum divided by the number of union indices, or 1 when
    both histograms are empty.

    ``hist_len`` is accepted so that all metrics share one call signature; it
    is not used here.
    """
    metric = 0
    union_len = 0
    i = 0
    j = 0
    n = len(hist1)
    m = len(hist2)

    while i < n or j < m:
        both_available = i < n and j < m

        if both_available and hist1[i][0] == hist2[j][0]:
            value1 = hist1[i][1]
            value2 = hist2[j][1]
            denominator = abs(value1) + abs(value2)
            if denominator != 0:
                metric += abs(value1 - value2) / denominator
            i += 1
            j += 1
        else:
            # The index belongs to exactly one histogram: full penalty.
            metric += 1.0
            if j >= m or (both_available and hist1[i][0] < hist2[j][0]):
                i += 1
            else:
                j += 1

        union_len += 1

    if union_len == 0:
        return 1

    return metric / union_len



'''
def pearsons_correlation(hist1, hist2, hist_len):
    #hist_len = hist_len_3gram
    union_len = len(get_union_inds(hist1, hist2))
    
    top = 0
    left = 0
    right = 0
    
    n = len(hist1)
    m = len(hist2)
    
    i = 0
    j = 0
    
    while i < n or j < m:
        if i >= n:
            top += (- 1/hist_len) * (hist2[j][1] - 1/hist_len)
            left += (- 1/hist_len) ** 2
            right += (hist2[j][1] - 1/hist_len) ** 2
            j += 1
        elif j >= m:
            top += (hist1[i][1] - 1/hist_len) * (- 1/hist_len)
            left += (hist1[i][1] - 1/hist_len)  ** 2
            right += (- 1/hist_len) ** 2
            i += 1
        elif hist1[i][0] == hist2[j][0]:
            top += (hist1[i][1] - 1/hist_len) * (hist2[j][1] - 1/hist_len)
            left += (hist1[i][1] - 1/hist_len)  ** 2
            right += (hist2[j][1] - 1/hist_len) ** 2
            i += 1
            j += 1
        elif hist1[i][0] < hist2[j][0]:
            top += (hist1[i][1] - 1/hist_len) * (- 1/hist_len)
            left += (hist1[i][1] - 1/hist_len)  ** 2
            right += (- 1/hist_len) ** 2
            i += 1
        elif hist1[i][0] > hist2[j][0]:
            top += (- 1/hist_len) * (hist2[j][1] - 1/hist_len)
            left += (- 1/hist_len) ** 2
            right += (hist2[j][1] - 1/hist_len) ** 2
            j += 1

    bottom = math.sqrt(left * right)
    return 1 - top / bottom
'''


def cos_distance(hist1, hist2, hist_len=None):
    """Cosine distance ``1 - |cos|`` between two sparse histograms.

    Both histograms are sequences of ``(index, value)`` pairs. The dot product
    is taken over the bins returned by ``get_intersections``; the norms are
    taken over all entries of each histogram.

    If either histogram has zero norm (it is empty or holds only zeros) the
    cosine is undefined. The function then returns 1.0 rather than the NaN
    that dividing by a zero norm would produce.

    ``hist_len`` is accepted so that all metrics share one call signature; it
    is not used here.
    """
    dot = 0
    for _, value1, value2 in get_intersections(hist1, hist2):
        dot += value1 * value2

    squared_norm1 = sum(pair[1] ** 2 for pair in hist1)
    squared_norm2 = sum(pair[1] ** 2 for pair in hist2)
    norm_product = squared_norm1 * squared_norm2

    if norm_product == 0:
        # Nothing to compare against: treat as maximally distant.
        return 1.0

    return 1 - abs(dot / np.sqrt(norm_product))
    

def pearsons_correlation_mean(hist1, hist2, hist_len):
    """Pearson correlation distance ``1 - r`` between two sparse histograms.

    Both histograms are sequences of ``(index, value)`` pairs ordered by
    increasing index and are interpreted as vectors of length ``hist_len``
    whose unlisted bins are zero. The means are therefore taken over
    ``hist_len``, and every bin of that vector contributes a deviation term,
    including the bins that appear in neither histogram (their deviation is
    ``-mean`` on both sides).

    Parameters:
        hist1, hist2: sparse histograms as described above.
        hist_len: total number of bins of the dense vectors; must be non-zero.

    Returns:
        ``1 - r``. If either vector has zero variance the correlation is
        undefined and 1.0 is returned instead of dividing by zero.
    """
    n = len(hist1)
    m = len(hist2)

    mean1 = sum(pair[1] for pair in hist1) / hist_len
    mean2 = sum(pair[1] for pair in hist2) / hist_len

    top = 0
    left = 0
    right = 0
    union_count = 0

    i = 0
    j = 0

    while i < n or j < m:
        # Decide which histogram(s) own the next union index.
        if i < n and j < m:
            key1 = hist1[i][0]
            key2 = hist2[j][0]
            take1 = key1 <= key2
            take2 = key2 <= key1
        else:
            take1 = i < n
            take2 = j < m

        # A histogram that does not list the index has value 0 there.
        dev1 = (hist1[i][1] - mean1) if take1 else -mean1
        dev2 = (hist2[j][1] - mean2) if take2 else -mean2

        top += dev1 * dev2
        left += dev1 ** 2
        right += dev2 ** 2

        if take1:
            i += 1
        if take2:
            j += 1
        union_count += 1

    # Bins listed in neither histogram are zero on both sides. Clamp in case
    # the caller passed a hist_len smaller than the number of listed bins.
    absent = max(hist_len - union_count, 0)
    top += absent * mean1 * mean2
    left += absent * mean1 ** 2
    right += absent * mean2 ** 2

    bottom = math.sqrt(left * right)
    if bottom == 0:
        return 1.0

    return 1 - top / bottom
    

def get_dists(hists, dist_metric, hist_len):
    """Evaluate ``dist_metric`` on every ordered pair of histograms.

    ``dist_metric(hist_a, hist_b, hist_len)`` is called for all pairs,
    including each histogram with itself and both orders of every pair.
    Returns a list of rows; ``result[a][b]`` is the distance from ``hists[a]``
    to ``hists[b]``. A notebook progress bar tracks the outer loop.
    """
    return [
        [dist_metric(row_hist, col_hist, hist_len) for col_hist in hists]
        for row_hist in tqdm_notebook(hists)
    ]


def get_dists_optimized(hists, dist_metric, hist_len):
    """Build a symmetric distance matrix, evaluating each pair only once.

    ``dist_metric(hists[row], hists[col], hist_len)`` is called for every
    ``col < row``; the value is stored at both ``[row, col]`` and
    ``[col, row]``. The diagonal is never computed and stays 0. Returns a
    ``numpy`` array of shape ``(len(hists), len(hists))``.
    """
    count = len(hists)
    dists = np.zeros((count, count))

    for row in tqdm_notebook(range(count)):
        # Strictly lower triangle only; the mirror entry is filled alongside.
        for col in range(row):
            value = dist_metric(hists[row], hists[col], hist_len)
            dists[row, col] = value
            dists[col, row] = value

    return dists

In [138]:
def print_clustering_results_tufano_unique(clustering, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    without_outliers, only_fm=False,
                                    to_print=True):
    """Score a clustering against ground-truth classes and print a summary.

    Parameters:
        clustering: object with a ``labels_`` sequence holding one cluster
            label per sample, ``-1`` meaning outlier. NOTE: ``labels_`` is
            modified in place - samples that sit alone in a cluster are
            relabelled ``-1``.
        unique_label_to_changes: mapping ground-truth class label -> collection
            of its changes (the keys and the collection sizes are used).
        from_change_to_unique_label: mapping change -> sequence whose first
            element is the ground-truth class label of that change.
        ids_per_unique_label: sequence; element ``i`` is the change that
            sample ``i`` of ``clustering.labels_`` stands for.
        without_outliers: when true, pairs containing an outlier are ignored
            in the pair counting (Rand / Jaccard).
        only_fm: when true, the quadratic pair counting is skipped and Rand /
            Jaccard stay 0.
        to_print: when true, the final summary is printed. Progress messages
            and progress bars are shown regardless.

    Returns:
        ``(rand, Fmeasure)`` - the Rand index as a percentage rounded to three
        decimals (0 when skipped or when no pair was counted) and the
        class-size-weighted F-measure.
    """
    labels = clustering.labels_
    num_changes = len(labels)

    clusters = collections.defaultdict(list)          # cluster label -> member changes
    clusters_to_ids = collections.defaultdict(list)   # cluster label -> member sample indices
    outliers = []

    print("Start evaluate...")

    print("Save clusters...")

    for i in tqdm_notebook(range(num_changes)):
        label = labels[i]
        change = ids_per_unique_label[i]
        if label == -1:
            outliers.append(change)
        else:
            clusters[label].append(change)
            clusters_to_ids[label].append(i)

    # For Agglomerative: it never emits -1, so one-element clusters are
    # turned into outliers here.
    print("Delete outliers...")
    for label in tqdm_notebook(list(clusters_to_ids.keys())):
        member_ids = clusters_to_ids[label]
        if len(member_ids) == 1:
            outliers.append(clusters[label][0])
            labels[member_ids[0]] = -1
            del clusters_to_ids[label]
            del clusters[label]

    num_inliers = num_changes - len(outliers)

    # Pair counts, named after what they actually count.
    same_class_same_cluster = 0
    same_class_diff_cluster = 0
    diff_class_same_cluster = 0
    diff_class_diff_cluster = 0
    rand = 0
    jaccard_index = 0

    if not only_fm:
        print("Calculate rand...")

        # Resolve every sample's ground-truth class once instead of inside
        # the quadratic loop.
        true_labels = [from_change_to_unique_label[ids_per_unique_label[i]][0]
                       for i in range(num_changes)]

        for i in tqdm_notebook(range(num_changes)):
            cluster_i = labels[i]
            if without_outliers and cluster_i == -1:
                continue
            class_i = true_labels[i]

            for j in range(i + 1, num_changes):
                cluster_j = labels[j]
                if without_outliers and cluster_j == -1:
                    continue

                if class_i == true_labels[j]:
                    if cluster_i == cluster_j:
                        same_class_same_cluster += 1
                    else:
                        same_class_diff_cluster += 1
                else:
                    if cluster_i == cluster_j:
                        diff_class_same_cluster += 1
                    else:
                        diff_class_diff_cluster += 1

        total_pairs = (same_class_same_cluster + same_class_diff_cluster
                       + diff_class_same_cluster + diff_class_diff_cluster)
        # Rand: share of pairs on which clustering and ground truth agree.
        if total_pairs > 0:
            rand = round(100 * (same_class_same_cluster + diff_class_diff_cluster) / total_pairs, 3)

        # Jaccard: like Rand but ignoring pairs separated in both partitions.
        joined_pairs = (same_class_same_cluster + same_class_diff_cluster
                        + diff_class_same_cluster)
        if joined_pairs > 0:
            jaccard_index = round(100 * same_class_same_cluster / joined_pairs, 3)

    print("Calculate confusion...")
    cluster_labels = list(clusters_to_ids.keys())
    confusion_mtx = []      # one {class label: count} row per cluster, in cluster_labels order
    pure_classes = set()
    for cluster_label in tqdm_notebook(cluster_labels):
        mtx_row = collections.defaultdict(int)

        for sample_id in clusters_to_ids[cluster_label]:
            change = ids_per_unique_label[sample_id]
            mtx_row[from_change_to_unique_label[change][0]] += 1

        # A cluster whose members all share one class marks that class pure.
        if len(mtx_row) == 1:
            pure_classes.update(mtx_row)

        confusion_mtx.append(mtx_row)

    ENTROPY = 0
    PURITY = 0
    Fs = collections.defaultdict(list)   # class label -> F-score against every cluster
    print("Calculate entropy & purity...")
    for idx in tqdm_notebook(range(len(cluster_labels))):
        cluster_size = len(clusters_to_ids[cluster_labels[idx]])
        mtx_row = confusion_mtx[idx]
        entropy = 0
        purity = 0

        for init_lbl in unique_label_to_changes:
            count = mtx_row.get(init_lbl, 0)
            pij = count / cluster_size
            precision = pij
            recall = count / len(unique_label_to_changes[init_lbl])

            if pij != 0:
                entropy += - pij * np.log2(pij)
            purity = max(purity, pij)

            fij = 0
            if precision != 0 or recall != 0:
                fij = (2 * precision * recall) / (precision + recall)
            Fs[init_lbl].append(fij)

        # Weight each cluster by its share of the clustered samples.
        ENTROPY += entropy * cluster_size / num_inliers
        PURITY += purity * cluster_size / num_inliers

    Fmeasure = 0
    print("Calculate F-measure...")
    for init_lbl in tqdm_notebook(unique_label_to_changes):
        # With no surviving cluster there are no scores; the best score is 0.
        best_f = max(Fs[init_lbl], default=0)
        if num_changes > 0:
            Fmeasure += best_f * len(unique_label_to_changes[init_lbl]) / num_changes

    outl_percent = round(100 * len(outliers) / num_changes, 3) if num_changes > 0 else 0.0

    if (to_print):
        print("Number of clusters:", len(clusters))
        print("Number of outliers:", str(len(outliers)) + "/" + str(num_changes), "(" + str(outl_percent) + "%)")

        print()
        print("Entropy =", str(round(ENTROPY, 6)))
        print("Purity =", str(round(PURITY, 6)))
        print("F-measure =", round(Fmeasure, 6))

        print()
        print("Rand =", str(rand) + "%")
        print("Jaccard Index =", str(jaccard_index) + "%")       

        print("Pure classes:", len(pure_classes))

    return rand, Fmeasure
    
    

In [155]:
# CHECKING PATTERNS WITH DIFFERENT PROJECTS

# We consider only changes whose edit script is not empty

patterns_path = "/Users/aliscafo/Downloads/CPatMiner-master 2/SemanticChangeGraphMiner/output/patterns/repos-hybrid/1"
actions_path = "/Users/aliscafo/Documents/ALINA/WORK/SPbAU/thesis/CodeDiffEditScripts/CPatMinerActions"

patterns_dir = os.fsencode(patterns_path)
sorted_size_dirs = [int(os.fsdecode(el)) for el in os.listdir(patterns_dir) if os.fsdecode(el).isnumeric()]
sorted_size_dirs.sort()

set_of_patterns_with_different_methods = set()
changes_with_different_methods = []
lens = []

total_num_of_possible_changes = 0
# Number of accepted patterns. Initialised here so the cell runs on a fresh
# kernel; it was previously incremented without ever being defined.
final_ok_num = 0

for size_dir in sorted_size_dirs:
    # Pattern sizes 3..5 are excluded; skip before listing their directories.
    if 3 <= size_dir and size_dir <= 5:
        continue

    size_dir_str = str(size_dir)
    sorted_id_dirs = [int(os.fsdecode(el)) for el in os.listdir(patterns_path + os.sep + size_dir_str) if os.fsdecode(el).isnumeric()]
    sorted_id_dirs.sort()

    for id_dir in sorted_id_dirs:
        path = patterns_path + os.sep + size_dir_str + os.sep + str(id_dir) + os.sep + "details.html"

        if not os.path.exists(path):
            continue

        with open(path, 'r', encoding="latin-1") as file:
            content = file.read()

        print(size_dir_str, str(id_dir))

        set_of_methods = set()

        last_ind = 0
        num_change = 1

        # Walk over every "<div id='method'>...</div><BR>" entry of the page.
        while True:
            try:
                start = content.index("<div id='method'>", last_ind) + len("<div id='method'>")
                end = content.index("</div><BR>", start)
            except ValueError:
                # str.index found no further entry.
                break

            last_ind = end
            total_num_of_possible_changes += 1

            name = content[start:end]
            parts = name.split(',')
            project_name = parts[0]
            file_name = parts[2]
            nickname = project_name.split('/')[0]

            # NOTE(review): num_change only advances for entries whose edit
            # script exists and is non-empty, so after one missing or empty
            # script the same sampleChange file is probed for every later
            # entry of this pattern - confirm this is intended.
            es_path = actions_path + "/" + str(size_dir) + "/" + str(id_dir) + "/sampleChange" + str(num_change)
            if not os.path.exists(es_path):
                continue
            # Context manager so the handle is released on the empty-script
            # path as well.
            with open(es_path) as es_file:
                es = es_file.read()
            if es == "":
                continue

            num_change += 1

            # NOTE(review): the tuple carries num_change *after* the
            # increment, i.e. one more than the sampleChange file just read -
            # verify against the numbering of the *_hist.txt files.
            change_tuple = (size_dir, id_dir, num_change)

            if name not in set_of_methods:
                #print("Added", change_tuple)
                set_of_methods.add(name)
                changes_with_different_methods.append((size_dir, id_dir, num_change))

        print(len(set_of_methods))
        if len(set_of_methods) > 1:
            final_ok_num += 1
            lens.append(len(set_of_methods))
            set_of_patterns_with_different_methods.add((size_dir, id_dir))
            print("OK")
        elif len(set_of_methods) == 1:
            # A pattern seen in a single method is rejected: drop the one
            # change that was appended for it above.
            changes_with_different_methods.pop()


print("TOTAL NUMBER OF CONSIDERED CHANGES:", total_num_of_possible_changes)

print("CHOSEN:", len(changes_with_different_methods))

# and "src/test/java" not in file_name

6 48
4
OK
6 50
3
OK
6 139
3
OK
6 148
5
OK
6 151
0
6 170
3
OK
6 191
3
OK
6 193
4
OK
6 240
4
OK
6 276
2
OK
6 290
1
6 294
3
OK
6 330
3
OK
6 354
4
OK
6 360
4
OK
6 377
2
OK
6 380
6
OK
6 392
3
OK
6 399
1
6 408
3
OK
6 418
4
OK
6 424
3
OK
6 427
3
OK
6 430
4
OK
6 456
3
OK
6 466
4
OK
6 474
2
OK
6 485
2
OK
6 513
3
OK
6 520
4
OK
6 543
6
OK
6 568
3
OK
6 569
1
6 589
4
OK
6 614
2
OK
6 623
8
OK
6 628
32
OK
6 662
5
OK
6 665
4
OK
6 674
1
6 683
2
OK
6 692
8
OK
6 729
14
OK
6 736
3
OK
6 744
2
OK
6 746
1
6 748
1
6 784
6
OK
6 792
3
OK
6 797
1
6 813
3
OK
6 815
8
OK
6 817
3
OK
6 830
3
OK
6 838
5
OK
6 845
3
OK
6 878
2
OK
6 889
1
6 904
2
OK
6 941
2
OK
6 970
3
OK
6 993
2
OK
6 999
2
OK
6 1006
0
6 1043
2
OK
6 1045
4
OK
6 1074
3
OK
6 1081
3
OK
6 1107
4
OK
6 1141
2
OK
6 1159
2
OK
6 1160
2
OK
6 1170
1
6 1193
5
OK
6 1285
1
6 1303
4
OK
6 1323
2
OK
6 1351
2
OK
6 1389
4
OK
6 1406
1
6 1412
3
OK
6 1418
0
6 1431
1
6 1433
16
OK
6 1446
6
OK
6 1448
6
OK
6 1497
2
OK
6 1526
0
6 1549
3
OK
6 1560
0
6 1587
3
OK
6 1591
3
OK
6 1626
15

6 14339
0
6 14370
6
OK
6 14388
7
OK
6 14393
6
OK
6 14402
1
6 14447
6
OK
6 14461
1
6 14479
2
OK
6 14486
3
OK
6 14491
3
OK
6 14512
6
OK
6 14519
3
OK
6 14540
34
OK
6 14560
3
OK
6 14585
4
OK
6 14604
3
OK
6 14610
2
OK
6 14648
8
OK
6 14708
0
6 14751
3
OK
6 14767
3
OK
6 14788
4
OK
6 14804
1
6 14809
3
OK
6 14813
0
6 14829
3
OK
6 14848
9
OK
6 14860
0
6 14868
6
OK
6 14882
3
OK
6 14900
4
OK
6 14930
1
6 14947
0
6 14951
1
6 14959
3
OK
6 14968
5
OK
6 14971
5
OK
6 14973
2
OK
6 14980
3
OK
6 14981
53
OK
6 14996
6
OK
6 15001
3
OK
6 15012
3
OK
6 15017
4
OK
6 15035
3
OK
6 15046
1
6 15059
5
OK
6 15126
3
OK
6 15128
2
OK
6 15152
3
OK
6 15158
1
6 15165
0
6 15171
1
6 15176
3
OK
6 15181
1
6 15185
4
OK
6 15190
3
OK
6 15203
3
OK
6 15227
3
OK
6 15239
4
OK
6 15244
3
OK
6 15280
0
6 15328
4
OK
6 15332
2
OK
7 2
3
OK
7 9
1
7 68
2
OK
7 102
1
7 106
0
7 126
1
7 138
1
7 161
3
OK
7 178
3
OK
7 323
7
OK
7 334
3
OK
7 425
1
7 435
3
OK
7 495
3
OK
7 501
3
OK
7 515
5
OK
7 597
2
OK
7 601
3
OK
7 615
1
7 697
5
OK
7 738
3
OK
7 789
3
O

0
8 6564
1
8 6600
0
8 6643
3
OK
8 6653
8
OK
8 6656
4
OK
8 6734
2
OK
8 6810
2
OK
8 6819
2
OK
8 6892
3
OK
8 6898
1
8 6924
0
8 6932
2
OK
8 6946
2
OK
8 6989
3
OK
8 7042
1
8 7084
4
OK
8 7086
6
OK
8 7148
0
8 7172
3
OK
8 7196
6
OK
8 7230
1
8 7236
2
OK
8 7337
3
OK
8 7364
0
8 7390
2
OK
8 7474
2
OK
8 7528
2
OK
8 7586
0
8 7646
3
OK
8 7671
1
8 7772
0
8 7815
0
8 7826
0
8 7940
2
OK
8 7994
4
OK
8 7997
3
OK
8 8069
4
OK
8 8138
5
OK
8 8156
6
OK
8 8165
3
OK
8 8184
3
OK
8 8187
12
OK
8 8213
1
8 8258
2
OK
8 8270
1
8 8304
4
OK
8 8319
1
8 8343
3
OK
8 8352
3
OK
8 8471
3
OK
8 8477
3
OK
8 8529
1
8 8549
2
OK
8 8604
1
8 8656
2
OK
8 8667
30
OK
8 8685
3
OK
8 8719
1
8 8720
3
OK
8 8757
4
OK
8 8839
4
OK
8 8912
4
OK
8 8944
7
OK
8 8975
1
8 9011
5
OK
8 9151
0
8 9164
4
OK
8 9189
2
OK
8 9228
3
OK
8 9240
3
OK
8 9322
3
OK
8 9349
0
8 9361
1
8 9371
4
OK
8 9375
2
OK
8 9459
3
OK
8 9467
1
8 9485
3
OK
8 9499
2
OK
8 9514
9
OK
8 9535
1
8 9740
3
OK
8 9757
1
8 9779
2
OK
8 9798
3
OK
8 9815
2
OK
8 9830
1
8 9849
3
OK
8 9909
3
OK
8 9958
4


4
OK
10 6264
4
OK
10 6322
2
OK
10 6390
2
OK
10 6394
3
OK
10 6441
3
OK
10 6489
0
10 6515
3
OK
10 6538
1
10 6646
4
OK
10 6660
5
OK
10 6750
1
10 6754
2
OK
10 6764
2
OK
10 6843
2
OK
10 6881
0
10 6901
4
OK
10 6929
3
OK
10 6991
3
OK
10 7002
0
10 7034
3
OK
10 7176
3
OK
10 7326
3
OK
10 7385
10
OK
10 7387
3
OK
10 7419
1
10 7443
1
10 7444
5
OK
10 7452
2
OK
10 7461
3
OK
10 7466
3
OK
10 7481
3
OK
10 7491
2
OK
10 7529
5
OK
10 7639
3
OK
10 7640
4
OK
10 7672
1
10 7831
3
OK
10 7889
3
OK
10 7914
0
10 7936
4
OK
10 7945
1
10 7979
3
OK
10 8052
2
OK
10 8053
3
OK
10 8372
6
OK
10 8461
3
OK
10 8601
2
OK
10 8705
1
10 8744
3
OK
10 8798
1
10 8864
4
OK
10 8886
9
OK
10 8898
2
OK
10 8969
4
OK
10 9028
1
10 9064
1
10 9081
3
OK
10 9091
3
OK
10 9171
2
OK
10 9197
2
OK
10 9221
4
OK
10 9279
3
OK
10 9353
6
OK
10 9409
3
OK
10 9441
1
10 9492
3
OK
10 9627
3
OK
10 9651
3
OK
10 9663
4
OK
10 9769
4
OK
10 9796
1
10 9807
7
OK
10 9865
2
OK
10 10005
1
10 10065
4
OK
10 10079
4
OK
10 10108
2
OK
10 10252
3
OK
10 10278
0
10 10288
1
10 1

2
OK
13 8824
4
OK
13 8860
1
13 8897
8
OK
13 8959
6
OK
13 9029
5
OK
13 9096
4
OK
13 9186
3
OK
13 9227
1
13 9273
1
13 9295
3
OK
13 9311
1
13 9363
3
OK
13 9380
3
OK
13 9391
1
13 9586
3
OK
13 9683
1
13 9940
2
OK
13 10017
0
13 10061
3
OK
13 10092
3
OK
13 10164
1
13 10226
2
OK
13 10333
5
OK
13 10541
4
OK
13 10866
1
13 10912
3
OK
13 11008
2
OK
13 11147
0
13 11191
1
13 11290
0
13 11362
1
13 11498
3
OK
13 11521
0
13 11822
4
OK
13 11919
1
13 11921
3
OK
13 11929
1
13 11933
3
OK
13 12130
5
OK
13 12259
3
OK
13 12290
0
13 12364
6
OK
13 12372
2
OK
13 12390
0
13 12528
3
OK
13 12585
6
OK
13 12723
2
OK
13 12730
3
OK
13 13138
2
OK
13 13142
2
OK
13 13164
1
13 13228
6
OK
13 13377
3
OK
13 13574
3
OK
13 13656
1
13 13791
3
OK
13 13812
2
OK
13 13843
3
OK
13 13891
3
OK
13 14377
1
13 14676
4
OK
13 14737
3
OK
13 14758
0
13 14853
3
OK
13 14926
4
OK
13 15085
1
13 15105
0
13 15209
8
OK
13 15282
3
OK
13 15302
0
13 15310
0
14 110
2
OK
14 239
2
OK
14 407
0
14 415
1
14 561
2
OK
14 574
1
14 607
0
14 1012
3
OK
14 1319
5
O

1
20 4236
3
OK
20 4493
1
20 4797
0
20 4923
0
20 4982
4
OK
20 5060
1
20 5092
3
OK
20 5444
3
OK
20 6254
1
20 6292
4
OK
20 6374
4
OK
20 6931
1
20 6998
3
OK
20 7722
0
20 7776
0
20 8227
7
OK
20 8266
4
OK
20 9100
2
OK
20 9148
0
20 9431
4
OK
20 9507
1
20 9604
2
OK
20 10085
4
OK
20 10183
0
20 10501
3
OK
20 11171
0
20 11524
3
OK
20 11532
0
20 12124
3
OK
20 12317
2
OK
20 13502
0
20 14283
3
OK
20 14419
2
OK
20 14632
0
20 14705
2
OK
20 14956
1
20 15318
3
OK
21 186
1
21 203
4
OK
21 440
2
OK
21 1238
1
21 1616
1
21 2114
3
OK
21 2271
2
OK
21 2683
4
OK
21 3149
6
OK
21 3713
1
21 3749
4
OK
21 4247
3
OK
21 4331
2
OK
21 4827
6
OK
21 5158
3
OK
21 5166
0
21 6027
4
OK
21 6054
1
21 6298
2
OK
21 6598
3
OK
21 6927
1
21 7210
5
OK
21 7221
3
OK
21 7511
2
OK
21 8125
4
OK
21 8521
1
21 8730
3
OK
21 8753
0
21 8888
13
OK
21 9272
4
OK
21 10116
2
OK
21 10204
3
OK
21 10456
4
OK
21 10521
0
21 10944
1
21 11205
0
21 11269
3
OK
21 11292
4
OK
21 11485
4
OK
21 11656
0
21 12176
4
OK
21 12418
9
OK
21 12495
3
OK
21 12671
1
21 12920

In [160]:
# Distribution of pattern sizes: for every value stored in `lens`, count how
# many times it occurs.
distr = collections.defaultdict(int)

for elem in lens:
    distr[elem] += 1
    
distr

defaultdict(int,
            {4: 505,
             3: 1374,
             5: 144,
             2: 611,
             6: 121,
             8: 54,
             32: 1,
             14: 7,
             16: 5,
             15: 3,
             7: 62,
             21: 4,
             30: 2,
             10: 20,
             24: 4,
             13: 8,
             9: 27,
             26: 5,
             25: 1,
             11: 20,
             18: 3,
             34: 1,
             53: 1,
             20: 1,
             39: 1,
             48: 1,
             69: 1,
             17: 2,
             12: 5,
             19: 1,
             87: 1,
             22: 3,
             38: 1,
             23: 1})

In [156]:
# Sum of all values in `lens`.
np.array(lens).sum()

11737

In [161]:
# Number of entries in `lens`.
len(lens)

3001

In [157]:
# The 3000 largest values of `lens` in ascending order (this displays up to
# 3000 numbers in the cell output).
sorted(lens)[-3000:]

[2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,


In [82]:
RESULTS_CPATMINER = "/Volumes/Seagate/Alina/result_for_cpatminer"

def get_hists(hists_path, gram_path, changes):
    """Load one sparse histogram per change from its ``*_hist.txt`` file.

    For a change ``elem`` the file is
    ``<hists_path>/<elem[0]> <elem[1]>/<gram_path>/sampleChange<elem[2]>_hist.txt``.
    Every non-empty line holds two space-separated integers, which are
    returned as an ``(int, int)`` pair.

    Returns a list aligned with ``changes``; a change whose file does not
    exist gets an empty histogram ``[]``.
    """
    hists = []

    for elem in tqdm_notebook(changes):
        hist_path1 = hists_path + "/" + str(elem[0]) + " " + str(elem[1]) + "/" + gram_path + "/sampleChange" + str(elem[2]) + "_hist.txt"

        if not os.path.exists(hist_path1):
            hists.append([])
            continue

        # Context manager: the handle is released even if parsing fails below.
        with open(hist_path1, "r") as hist_file1:
            lines = hist_file1.read().split("\n")

        hist_data1 = []
        for line in lines:
            if line == '':
                continue
            fields = line.split(" ")
            hist_data1.append((int(fields[0]), int(fields[1])))

        hists.append(hist_data1)

    return hists

def get_hists_len(hists_path, gram_path):
    """Return the number of histogram bins for one n-gram order.

    Reads ``<hists_path>/edit_scripts_<gram_path>s_mapped.txt`` and takes the
    last non-blank line. Its first space-separated token, with the final
    character dropped, is parsed as an integer; the result is that integer
    plus one. The value is also printed.

    Raises:
        ValueError: if the file has no non-blank line, or the token cannot be
            parsed as an integer.
    """
    es_path = hists_path + "/" + "edit_scripts_" + gram_path + "s_mapped.txt"
    with open(es_path, "r") as es_file:
        es_lines = es_file.read().split("\n")

    # Search from the end for the last real line, so the result does not
    # depend on how many newline characters terminate the file.
    last_line = None
    for line in reversed(es_lines):
        if line.strip() != "":
            last_line = line
            break

    if last_line is None:
        raise ValueError("no entries found in " + es_path)

    hists_len = int(last_line.split(" ")[0][:-1]) + 1

    print(hists_len)

    return hists_len

In [35]:
# get_hists(hists_path, gram_path, changes) requires the list of changes to
# load; the selected changes are the ones collected in
# `changes_with_different_methods`.
hists_1gram = get_hists(RESULTS_CPATMINER, "1gram", changes_with_different_methods)
hists_2gram = get_hists(RESULTS_CPATMINER, "2gram", changes_with_different_methods)
hists_3gram = get_hists(RESULTS_CPATMINER, "3gram", changes_with_different_methods)
hists_4gram = get_hists(RESULTS_CPATMINER, "4gram", changes_with_different_methods)
hists_5gram = get_hists(RESULTS_CPATMINER, "5gram", changes_with_different_methods)

HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))




In [36]:
# Number of bins per n-gram order (each call also prints the value).
hist_len_1gram = get_hists_len(RESULTS_CPATMINER, "1gram")
hist_len_2gram = get_hists_len(RESULTS_CPATMINER, "2gram")
hist_len_3gram = get_hists_len(RESULTS_CPATMINER, "3gram")
hist_len_4gram = get_hists_len(RESULTS_CPATMINER, "4gram")
hist_len_5gram = get_hists_len(RESULTS_CPATMINER, "5gram")

3254
41535
108151
178826
237211


In [37]:
# Join the 1- to 5-gram histograms of every change into one sparse histogram.
# The bin indices of each n-gram order are shifted past the index ranges of
# all lower orders so that the orders do not collide.
num_changes = len(hists_1gram)

offset_2gram = hist_len_1gram
offset_3gram = offset_2gram + hist_len_2gram
offset_4gram = offset_3gram + hist_len_3gram
offset_5gram = offset_4gram + hist_len_4gram

shifted_sources = [
    (hists_1gram, 0),
    (hists_2gram, offset_2gram),
    (hists_3gram, offset_3gram),
    (hists_4gram, offset_4gram),
    (hists_5gram, offset_5gram),
]

concat_hists_to_5gram = []

for i in tqdm_notebook(range(num_changes)):
    concat_hist = []

    for gram_hists, offset in shifted_sources:
        for gram_ind, amount in gram_hists[i]:
            concat_hist.append((gram_ind + offset, amount))

    concat_hists_to_5gram.append(concat_hist)

# Total number of bins of the concatenated histogram.
concat_hists_len_to_5gram = offset_5gram + hist_len_5gram


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))




In [40]:
# Store the histograms in a 1-D object array, one Python list per element.
# The lists have different lengths, so np.array(...) on the nested list is
# rejected as a ragged array by recent NumPy versions (and would build a 3-D
# numeric array if all lengths happened to match). Filling element by element
# gives the same 1-D layout in every case and is safe to re-run.
_hists_as_objects = np.empty(len(concat_hists_to_5gram), dtype=object)
for _hist_pos, _hist in enumerate(concat_hists_to_5gram):
    _hists_as_objects[_hist_pos] = _hist
concat_hists_to_5gram = _hists_as_objects

In [41]:
# Pairwise distance matrix over the concatenated histograms. One metric call
# is made per unordered pair, so the cost grows quadratically with the number
# of changes.
dists_to_5gram = get_dists_optimized(concat_hists_to_5gram, 
                                         canberra_metric_optimized, concat_hists_len_to_5gram)

HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))




In [None]:
# NOTE(review): this cell holds only a signature -- there is no function body,
# so it cannot be executed as written. The calls elsewhere in this notebook pass
# four positional arguments (no `dists`) plus an `only_fm` keyword that is not
# listed here, so this signature appears to be stale -- TODO confirm against the
# cell that actually defines the function.
def print_clustering_results_tufano_unique(clustering, dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    without_outliers,
                                    to_print=True):

In [125]:
# Ground-truth grouping: index 1 of every change record is used as its label.
unique_label_to_changes = collections.defaultdict(list)      # label  -> changes carrying it
from_change_to_unique_label = collections.defaultdict(list)  # change -> list of its labels

for elem in changes_with_different_methods:
    elem_label = elem[1]
    unique_label_to_changes[elem_label].append(elem)
    from_change_to_unique_label[elem].append(elem_label)

print("Classes:", len(unique_label_to_changes))

Classes: 3001


In [45]:
# DBSCAN on the precomputed distance matrix (no feature vectors are passed).
tufano_clustering = sklearn.cluster.DBSCAN(
    eps=0.7,
    min_samples=2,
    metric='precomputed',
).fit(dists_to_5gram)

In [61]:
# Evaluate the current `tufano_clustering` against the label maps; the call is
# the cell's last expression, so its return value is shown as the cell output.
print_clustering_results_tufano_unique(
    tufano_clustering,
    unique_label_to_changes,
    from_change_to_unique_label,
    changes_with_different_methods,
    without_outliers=True,
    to_print=True,
    only_fm=True,
)

Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1935), HTML(value='')))


Calculate rand...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1935), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1935), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Number of clusters: 1935
Number of outliers: 3503/11737 (29.846%)

Entropy = 0.910256
Purity = 0.795725
F-measure = 0.614316

Rand = 99.697%
Jaccard Index = 19.903%


(99.697, 0.6143162409144306)

In [75]:
# Coarse sweep of the DBSCAN radius over [0.1, 1.1) in steps of 0.05: remember
# the eps whose clustering reaches the highest F-measure, then re-run the
# evaluation for that clustering with to_print=True.
max_fm, best_eps, best_clustering = 0, 0, None

for eps in tqdm_notebook(np.arange(0.1, 1.1, 0.05)):
    tufano_clustering = sklearn.cluster.DBSCAN(
        eps=eps,
        min_samples=2,
        metric='precomputed',
    ).fit(dists_to_5gram)

    rand, fm = print_clustering_results_tufano_unique(
        tufano_clustering,
        unique_label_to_changes,
        from_change_to_unique_label,
        changes_with_different_methods,
        without_outliers=True,
        to_print=False,
        only_fm=True,
    )
    print("Eps =", eps)
    print("F-measure =", fm)

    # Strict comparison: on a tie the earlier (smaller) eps stays the best.
    if fm > max_fm:
        max_fm, best_eps, best_clustering = fm, eps, tufano_clustering

print(best_eps, max_fm)

rand, fm = print_clustering_results_tufano_unique(
    best_clustering,
    unique_label_to_changes,
    from_change_to_unique_label,
    changes_with_different_methods,
    without_outliers=True,
    to_print=True,
)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1861), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1861), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1861), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.1
F-measure = 0.5459820228318738
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1891), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1891), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1891), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.15000000000000002
F-measure = 0.5592119821282668
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1933), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1933), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1933), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.20000000000000004
F-measure = 0.5753849690298074
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1954), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.25000000000000006
F-measure = 0.5881104254179511
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1981), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1981), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1981), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.30000000000000004
F-measure = 0.5979133000928881
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1995), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1995), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1995), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.3500000000000001
F-measure = 0.6070316466883383
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2022), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2022), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2022), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.40000000000000013
F-measure = 0.6183527119069784
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2046), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2046), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2046), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.45000000000000007
F-measure = 0.6269067425022009
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2044), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2044), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2044), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.5000000000000001
F-measure = 0.6309012207889209
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2049), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2049), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2049), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.5500000000000002
F-measure = 0.6386721846775143
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2045), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2045), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2045), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.6000000000000002
F-measure = 0.6407215588115838
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2021), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2021), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2021), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.6500000000000001
F-measure = 0.6401483907430576
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1935), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1935), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1935), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.7000000000000002
F-measure = 0.6143162409144306
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1788), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1788), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1788), HTML(value='')))

Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1505), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1505), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1505), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.8000000000000002
F-measure = 0.4596412888221975
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1035), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1035), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1035), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.8500000000000002
F-measure = 0.29404161237528387
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=469), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=469), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=469), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.9000000000000002
F-measure = 0.12450626992052646
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=29), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=29), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=29), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.9500000000000003
F-measure = 0.008297038496324698
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 1.0000000000000004
F-measure = 0.0012474516447588515
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 1.0500000000000003
F-measure = 0.0012474516447588515

0.6000000000000002 0.6407215588115838
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2045), HTML(value='')))


Calculate rand...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2045), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2045), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Number of clusters: 2045
Number of outliers: 3849/11737 (32.794%)

Entropy = 0.477972
Purity = 0.863337
F-measure = 0.640722

Rand = 99.918%
Jaccard Index = 48.838%


In [100]:
# Fine sweep of the DBSCAN radius over [0.55, 0.65) in steps of 0.005: remember
# the eps whose clustering reaches the highest F-measure, then re-run the
# evaluation for that clustering with to_print=True.
max_fm, best_eps, best_clustering = 0, 0, None

for eps in tqdm_notebook(np.arange(0.55, 0.65, 0.005)):
    tufano_clustering = sklearn.cluster.DBSCAN(
        eps=eps,
        min_samples=2,
        metric='precomputed',
    ).fit(dists_to_5gram)

    rand, fm = print_clustering_results_tufano_unique(
        tufano_clustering,
        unique_label_to_changes,
        from_change_to_unique_label,
        changes_with_different_methods,
        without_outliers=True,
        to_print=False,
        only_fm=True,
    )
    print("Eps =", eps)
    print("F-measure =", fm)

    # Strict comparison: on a tie the earlier (smaller) eps stays the best.
    if fm > max_fm:
        max_fm, best_eps, best_clustering = fm, eps, tufano_clustering

print(best_eps, max_fm)

rand, fm = print_clustering_results_tufano_unique(
    best_clustering,
    unique_label_to_changes,
    from_change_to_unique_label,
    changes_with_different_methods,
    without_outliers=True,
    to_print=True,
)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2049), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2049), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2049), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.55
F-measure = 0.6386721846775143
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2048), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2048), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2048), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.555
F-measure = 0.6385041838163905
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2055), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2055), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2055), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.56
F-measure = 0.6400534832101138
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2058), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2058), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2058), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.5650000000000001
F-measure = 0.6407976909778359
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2057), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2057), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2057), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.5700000000000001
F-measure = 0.6413990597487441
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2047), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2047), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2047), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.5750000000000001
F-measure = 0.6391676850617208
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2049), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2049), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2049), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.5800000000000001
F-measure = 0.6395466283201146
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2048), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2048), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2048), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.5850000000000001
F-measure = 0.6406414858444618
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2048), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2048), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2048), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.5900000000000001
F-measure = 0.6401914440641706
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2049), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2049), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2049), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.5950000000000001
F-measure = 0.6416660850638897
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2045), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2045), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2045), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.6000000000000001
F-measure = 0.6407215588115838
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2047), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2047), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2047), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.6050000000000001
F-measure = 0.6415942360666418
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2041), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2041), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2041), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.6100000000000001
F-measure = 0.6401806269778821
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2038), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2038), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2038), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.6150000000000001
F-measure = 0.6411787229785999
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2038), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2038), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2038), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.6200000000000001
F-measure = 0.6418865848651008
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2029), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2029), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2029), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.6250000000000001
F-measure = 0.6394991329743259
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2032), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2032), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2032), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.6300000000000001
F-measure = 0.6409494687296496
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2032), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2032), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2032), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.6350000000000001
F-measure = 0.6411536680180757
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2031), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2031), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2031), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.6400000000000001
F-measure = 0.6404903921165993
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2029), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2029), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2029), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Eps = 0.6450000000000001
F-measure = 0.6413018443838469

0.6200000000000001 0.6418865848651008
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2038), HTML(value='')))


Calculate rand...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2038), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2038), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Number of clusters: 2038
Number of outliers: 3786/11737 (32.257%)

Entropy = 0.509308
Purity = 0.857754
F-measure = 0.641887

Rand = 99.913%
Jaccard Index = 47.134%


In [139]:
# Refit DBSCAN at eps=0.62 on the precomputed distances and evaluate the result.
tufano_clustering = sklearn.cluster.DBSCAN(
    eps=0.62,
    min_samples=2,
    metric='precomputed',
).fit(dists_to_5gram)

rand, fm = print_clustering_results_tufano_unique(
    tufano_clustering,
    unique_label_to_changes,
    from_change_to_unique_label,
    changes_with_different_methods,
    without_outliers=True,
    to_print=True,
    only_fm=True,
)

Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=11737), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=2038), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=2038), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=2038), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=3001), HTML(value='')))


Number of clusters: 2038
Number of outliers: 3786/11737 (32.257%)

Entropy = 0.509308
Purity = 0.857754
F-measure = 0.641887

Rand = 0%
Jaccard Index = 0%
Pure classes: 1771


In [151]:
# Keep only the changes whose label (index 1 of the record) is shared by more
# than two changes; the final expression displays how many remain.
changes_with_different_methods_filtered = [
    elem
    for elem in changes_with_different_methods
    if len(unique_label_to_changes[elem[1]]) > 2
]
len(changes_with_different_methods_filtered)

10515

In [83]:
# Histograms of every n-gram order (n = 1..5) for the filtered changes only,
# loaded in order from the CPATMiner results.
(hists_1gram_filtered,
 hists_2gram_filtered,
 hists_3gram_filtered,
 hists_4gram_filtered,
 hists_5gram_filtered) = [
    get_hists(RESULTS_CPATMINER, gram_name, changes_with_different_methods_filtered)
    for gram_name in ("1gram", "2gram", "3gram", "4gram", "5gram")
]

HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))




In [85]:
# Each filtered n-gram order paired with the index at which it starts inside
# the concatenated histogram: an order is shifted past the combined lengths of
# all lower orders, so indices of different orders never collide.
ngram_sources_filtered_to_5gram = [
    (hists_1gram_filtered, 0),
    (hists_2gram_filtered, hist_len_1gram),
    (hists_3gram_filtered, hist_len_1gram + hist_len_2gram),
    (hists_4gram_filtered, hist_len_1gram + hist_len_2gram + hist_len_3gram),
    (hists_5gram_filtered, hist_len_1gram + hist_len_2gram + hist_len_3gram + hist_len_4gram),
]

num_changes = len(hists_1gram_filtered)
concat_hists_filtered_to_5gram = []

for i in tqdm_notebook(range(num_changes)):
    # One sparse histogram per change: (shifted index, amount) pairs, with the
    # 1-gram entries first and the 5-gram entries last.
    concat_hist = [
        (gram_ind + offset, amount)
        for hists, offset in ngram_sources_filtered_to_5gram
        for gram_ind, amount in hists[i]
    ]
    concat_hists_filtered_to_5gram.append(concat_hist)

# Total size of the concatenated index space (sum of all five lengths).
concat_hists_len_to_5gram = (hist_len_1gram + hist_len_2gram + hist_len_3gram
                             + hist_len_4gram + hist_len_5gram)


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))




In [86]:
# Every entry is a variable-length list of (index, amount) pairs, so the
# container must be an object array. Asking NumPy to infer that from a ragged
# nested list was deprecated in 1.19 and raises ValueError since 1.24;
# dtype=object requests the same 1-D array of lists explicitly.
concat_hists_filtered_to_5gram = np.array(concat_hists_filtered_to_5gram, dtype=object)

In [87]:
# Distances between the filtered concatenated 1..5-gram histograms, computed
# with the optimized Canberra metric; the third argument is the total
# histogram length.
dists_filtered_to_5gram = get_dists_optimized(
    concat_hists_filtered_to_5gram,
    canberra_metric_optimized,
    concat_hists_len_to_5gram,
)

HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))




In [154]:
# Ground-truth grouping for the filtered changes: index 1 of every change
# record is used as its label.
unique_label_to_changes_filtered = collections.defaultdict(list)      # label  -> changes carrying it
from_change_to_unique_label_filtered = collections.defaultdict(list)  # change -> list of its labels

for elem in changes_with_different_methods_filtered:
    elem_label = elem[1]
    unique_label_to_changes_filtered[elem_label].append(elem)
    from_change_to_unique_label_filtered[elem].append(elem_label)

print(len(unique_label_to_changes_filtered))

2390


In [89]:
# Coarse sweep of the DBSCAN radius over [0.1, 1.1) in steps of 0.05 on the
# filtered data: remember the eps whose clustering reaches the highest
# F-measure, then re-run the evaluation for that clustering with to_print=True.
max_fm, best_eps, best_clustering = 0, 0, None

for eps in tqdm_notebook(np.arange(0.1, 1.1, 0.05)):
    tufano_clustering = sklearn.cluster.DBSCAN(
        eps=eps,
        min_samples=2,
        metric='precomputed',
    ).fit(dists_filtered_to_5gram)

    rand, fm = print_clustering_results_tufano_unique(
        tufano_clustering,
        unique_label_to_changes_filtered,
        from_change_to_unique_label_filtered,
        changes_with_different_methods_filtered,
        without_outliers=True,
        to_print=False,
        only_fm=True,
    )
    print("Eps =", eps)
    print("F-measure =", fm)

    # Strict comparison: on a tie the earlier (smaller) eps stays the best.
    if fm > max_fm:
        max_fm, best_eps, best_clustering = fm, eps, tufano_clustering

print(best_eps, max_fm)

rand, fm = print_clustering_results_tufano_unique(
    best_clustering,
    unique_label_to_changes_filtered,
    from_change_to_unique_label_filtered,
    changes_with_different_methods_filtered,
    without_outliers=True,
    to_print=True,
)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=596), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=596), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=596), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.1
F-measure = 0.7222121051166008
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=588), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=588), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=588), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.15000000000000002
F-measure = 0.7357756463000191
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=581), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=581), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=581), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.20000000000000004
F-measure = 0.749242365996106
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=568), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=568), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=568), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.25000000000000006
F-measure = 0.7603836800189181
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=565), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=565), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=565), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.30000000000000004
F-measure = 0.7666050504203188
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=557), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=557), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=557), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.3500000000000001
F-measure = 0.7739594466880033
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=551), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=551), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=551), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.40000000000000013
F-measure = 0.7786575235324729
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=547), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=547), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=547), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.45000000000000007
F-measure = 0.7825089686151363
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=532), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=532), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=532), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.5000000000000001
F-measure = 0.7838755835161242
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=522), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=522), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=522), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.5500000000000002
F-measure = 0.788242565701633
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=507), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=507), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=507), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.6000000000000002
F-measure = 0.7911760735911368
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=489), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=489), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=489), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.6500000000000001
F-measure = 0.7939555764012072
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=463), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=463), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=463), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.7000000000000002
F-measure = 0.7777966439152346
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=425), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=425), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=425), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.7500000000000002
F-measure = 0.7524062276389183
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=360), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=360), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=360), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.8000000000000002
F-measure = 0.6663686219837499
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=257), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=257), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=257), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.8500000000000002
F-measure = 0.43518337880113434
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=126), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=126), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=126), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.9000000000000002
F-measure = 0.2072115825585001
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=16), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=16), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=16), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.9500000000000003
F-measure = 0.028137267587380728
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 1.0000000000000004
F-measure = 0.006563398606578413
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 1.0500000000000003
F-measure = 0.006563398606578413

0.6500000000000001 0.7939555764012072
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=489), HTML(value='')))


Calculate rand...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=489), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=489), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Number of clusters: 489
Number of outliers: 646/4373 (14.772%)

Entropy = 0.351802
Purity = 0.885967
F-measure = 0.793956

Rand = 99.822%
Jaccard Index = 64.014%


In [90]:
# Fine DBSCAN sweep: try eps = 0.6, 0.605, ..., 0.695 (np.arange excludes the
# stop value) on the precomputed distance matrix and remember the clustering
# with the highest F-measure.
max_fm = 0
best_eps = 0
best_clustering = None

# DBSCAN yields one label per row of the precomputed matrix, while the
# evaluation below is handed the change list. Both come from other cells that
# can be re-run on their own, so fail early if they no longer have the same
# number of entries instead of evaluating a stale matrix.
if np.shape(dists_filtered_to_5gram)[0] != len(changes_with_different_methods_filtered):
    raise ValueError(
        "dists_filtered_to_5gram has %d rows but there are %d filtered changes; "
        "recompute the distance matrix before clustering."
        % (np.shape(dists_filtered_to_5gram)[0],
           len(changes_with_different_methods_filtered)))

for eps in tqdm_notebook(np.arange(0.6, 0.7, 0.005)):
    tufano_clustering = sklearn.cluster.DBSCAN(
        eps=eps, min_samples=2, metric='precomputed').fit(dists_filtered_to_5gram)

    # to_print=False / only_fm=True: presumably the reduced evaluation that
    # skips the final summary (the helper is defined in another cell).
    rand, fm = print_clustering_results_tufano_unique(
        tufano_clustering, unique_label_to_changes_filtered,
        from_change_to_unique_label_filtered, changes_with_different_methods_filtered,
        without_outliers=True, to_print=False, only_fm=True)
    print("Eps =", eps)
    print("F-measure =", fm)

    # Strict ">" keeps the first (smallest) eps among equal scores.
    if fm > max_fm:
        max_fm = fm
        best_eps = eps
        best_clustering = tufano_clustering

print(best_eps, max_fm)

# max_fm starts at 0, so when no eps scores above 0 nothing was selected;
# stop here rather than passing None to the evaluation.
if best_clustering is None:
    raise RuntimeError(
        "No eps value produced an F-measure above 0; there is no clustering to report.")

# Full report for the selected clustering.
rand, fm = print_clustering_results_tufano_unique(
    best_clustering, unique_label_to_changes_filtered,
    from_change_to_unique_label_filtered, changes_with_different_methods_filtered,
    without_outliers=True, to_print=True)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=507), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=507), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=507), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.6
F-measure = 0.7911760735911368
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=507), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=507), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=507), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.605
F-measure = 0.7916229654825202
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=505), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=505), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=505), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.61
F-measure = 0.7926137347559328
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=501), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=501), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=501), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.615
F-measure = 0.7944561450232472
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=499), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=499), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=499), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.62
F-measure = 0.7951304073230985
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=495), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.625
F-measure = 0.7928026855052837
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=494), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=494), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=494), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.63
F-measure = 0.7931675776673253
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=493), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=493), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=493), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.635
F-measure = 0.7926158007612469
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=493), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=493), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=493), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.64
F-measure = 0.7931723433369984
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=492), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=492), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=492), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.645
F-measure = 0.7946995010935878
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=489), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=489), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=489), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.65
F-measure = 0.7939555764012072
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=488), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=488), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=488), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.655
F-measure = 0.7950316929328101
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=487), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=487), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=487), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.66
F-measure = 0.7946581545876275
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=487), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=487), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=487), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.665
F-measure = 0.7946581545876275
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=482), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=482), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=482), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.67
F-measure = 0.7922838417666309
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=482), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=482), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=482), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.675
F-measure = 0.7926234928340095
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=481), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=481), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=481), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.68
F-measure = 0.7916999001901899
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=477), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=477), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=477), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.685
F-measure = 0.7881306619344848
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=474), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=474), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=474), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.6900000000000001
F-measure = 0.7851178115840209
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=467), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=467), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=467), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Eps = 0.6950000000000001
F-measure = 0.7801349963648627

0.62 0.7951304073230985
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=499), HTML(value='')))


Calculate rand...


HBox(children=(IntProgress(value=0, max=4373), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=499), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=499), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))


Number of clusters: 499
Number of outliers: 671/4373 (15.344%)

Entropy = 0.31561
Purity = 0.893301
F-measure = 0.79513

Rand = 99.839%
Jaccard Index = 66.341%


In [147]:
# Keep only the changes whose label (position 1 of the record) is attached to
# at least 10 changes in unique_label_to_changes.
# NOTE(review): the "filtered5" suffix does not match this cutoff of 10 —
# confirm which threshold the downstream cells are meant to use.
changes_with_different_methods_filtered5 = list(filter(
    lambda change: len(unique_label_to_changes[change[1]]) >= 10,
    changes_with_different_methods,
))
len(changes_with_different_methods_filtered5)

1818

In [110]:
# Load the 1- to 5-gram histograms for the filtered changes, one get_hists
# call per n-gram order, in ascending order.
# NOTE(review): the recorded progress bars for this cell total 6393 items while
# the filtering cell reports 1818 changes, so the saved output presumably comes
# from a run with a different filter — confirm before relying on it.
(hists_1gram_filtered5,
 hists_2gram_filtered5,
 hists_3gram_filtered5,
 hists_4gram_filtered5,
 hists_5gram_filtered5) = [
    get_hists(RESULTS_CPATMINER, gram_name, changes_with_different_methods_filtered5)
    for gram_name in ("1gram", "2gram", "3gram", "4gram", "5gram")
]

HBox(children=(IntProgress(value=0, max=6393), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6393), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6393), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6393), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6393), HTML(value='')))




In [111]:
# Build, for every change, one sparse histogram that concatenates its 1- to
# 5-gram histograms. Each histogram is a sequence of (gram_ind, amount) pairs;
# the indices of every order are shifted by the total length of the lower
# orders (presumably so that the orders occupy separate index ranges — this
# relies on gram_ind being smaller than the matching hist_len_*; confirm).
num_changes = len(hists_1gram_filtered5)

# Per-order histograms paired with the shift applied to their gram indices.
ngram_hists_and_offsets = (
    (hists_1gram_filtered5, 0),
    (hists_2gram_filtered5, hist_len_1gram),
    (hists_3gram_filtered5, hist_len_1gram + hist_len_2gram),
    (hists_4gram_filtered5, hist_len_1gram + hist_len_2gram + hist_len_3gram),
    (hists_5gram_filtered5,
     hist_len_1gram + hist_len_2gram + hist_len_3gram + hist_len_4gram),
)

# Total length of the concatenated index space.
concat_hists_len_to_5gram = hist_len_1gram + hist_len_2gram + hist_len_3gram
concat_hists_len_to_5gram += hist_len_4gram + hist_len_5gram

# The per-change histograms have different lengths. np.array() on such a
# ragged list raises ValueError on NumPy >= 1.24, and would silently produce a
# 3-D numeric array if all lengths happened to match. Allocating an object
# array and filling it slot by slot always gives a 1-D array of lists.
concat_hists_filtered5_to_5gram = np.empty(num_changes, dtype=object)

for i in tqdm_notebook(range(num_changes)):
    concat_hists_filtered5_to_5gram[i] = [
        (gram_ind + offset, amount)
        for hists, offset in ngram_hists_and_offsets
        for gram_ind, amount in hists[i]
    ]


HBox(children=(IntProgress(value=0, max=6393), HTML(value='')))




In [112]:
# Pairwise distances between the concatenated histograms, computed with
# canberra_metric_optimized.
# NOTE(review): the result is named "filtered3" but is computed from
# concat_hists_filtered5_to_5gram, which also feeds dists_filtered10_to_5gram
# and dists_filtered5_to_5gram in the following cells. The recorded progress
# total here is 6393, so the input was presumably rebuilt with a different
# filter between runs; on a fresh top-to-bottom run all three matrices would
# be computed from the same data — confirm.
dists_filtered3_to_5gram = get_dists_optimized(
    concat_hists_filtered5_to_5gram,
    canberra_metric_optimized,
    concat_hists_len_to_5gram,
)

HBox(children=(IntProgress(value=0, max=6393), HTML(value='')))




In [104]:
# Pairwise distances between the concatenated histograms, computed with
# canberra_metric_optimized.
# NOTE(review): the result is named "filtered10" but is computed from
# concat_hists_filtered5_to_5gram, the same variable the neighbouring
# dists_filtered3_to_5gram / dists_filtered5_to_5gram cells read. The recorded
# progress total here is 1818, so the input presumably held the "> 9" subset
# when this cell last ran — confirm.
dists_filtered10_to_5gram = get_dists_optimized(
    concat_hists_filtered5_to_5gram,
    canberra_metric_optimized,
    concat_hists_len_to_5gram,
)

HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))




In [95]:
# Pairwise distances between the concatenated histograms, computed with
# canberra_metric_optimized.
# NOTE(review): the recorded progress total here is 3653, which differs from
# the 1818 changes reported by the filtering cell, so this output presumably
# comes from a run where concat_hists_filtered5_to_5gram held a different
# subset — confirm before reusing the matrix.
dists_filtered5_to_5gram = get_dists_optimized(
    concat_hists_filtered5_to_5gram,
    canberra_metric_optimized,
    concat_hists_len_to_5gram,
)

HBox(children=(IntProgress(value=0, max=3653), HTML(value='')))




In [148]:
# Two lookup tables over the changes kept by the threshold filter:
#   label  -> every change carrying that label
#   change -> the label(s) recorded for it
# The label is read from position 1 of each change record, and the record
# itself is used as a dictionary key.
unique_label_to_changes_filtered5, from_change_to_unique_label_filtered5 = (
    collections.defaultdict(list),
    collections.defaultdict(list),
)

for elem in changes_with_different_methods_filtered5:
    elem_label = elem[1]
    unique_label_to_changes_filtered5[elem_label].append(elem)
    from_change_to_unique_label_filtered5[elem].append(elem_label)

# Number of distinct labels (classes) that survive the filter.
print("Classes:", len(unique_label_to_changes_filtered5))
    

Classes: 103


In [123]:
# Coarse DBSCAN sweep: try eps = 0.1, 0.15, ..., 1.05 (np.arange excludes the
# stop value) on the precomputed distance matrix and remember the clustering
# with the highest F-measure.
# NOTE(review): the matrix is named "filtered10" while the label maps and the
# change list are named "filtered5" — confirm they describe the same subset.
max_fm = 0
best_eps = 0
best_clustering = None

# DBSCAN yields one label per row of the precomputed matrix, while the
# evaluation below is handed the change list. Both come from other cells that
# can be re-run on their own, so fail early if they no longer have the same
# number of entries instead of evaluating a stale matrix.
if np.shape(dists_filtered10_to_5gram)[0] != len(changes_with_different_methods_filtered5):
    raise ValueError(
        "dists_filtered10_to_5gram has %d rows but there are %d filtered changes; "
        "recompute the distance matrix before clustering."
        % (np.shape(dists_filtered10_to_5gram)[0],
           len(changes_with_different_methods_filtered5)))

for eps in tqdm_notebook(np.arange(0.1, 1.1, 0.05)):
    tufano_clustering = sklearn.cluster.DBSCAN(
        eps=eps, min_samples=2, metric='precomputed').fit(dists_filtered10_to_5gram)

    # to_print=False / only_fm=True: presumably the reduced evaluation that
    # skips the final summary (the helper is defined in another cell).
    rand, fm = print_clustering_results_tufano_unique(
        tufano_clustering, unique_label_to_changes_filtered5,
        from_change_to_unique_label_filtered5, changes_with_different_methods_filtered5,
        without_outliers=True, to_print=False, only_fm=True)
    print("Eps =", eps)
    print("F-measure =", fm)

    # Strict ">" keeps the first (smallest) eps among equal scores.
    if fm > max_fm:
        max_fm = fm
        best_eps = eps
        best_clustering = tufano_clustering

print(best_eps, max_fm)

# max_fm starts at 0, so when no eps scores above 0 nothing was selected;
# stop here rather than passing None to the evaluation.
if best_clustering is None:
    raise RuntimeError(
        "No eps value produced an F-measure above 0; there is no clustering to report.")

# Full report for the selected clustering.
rand, fm = print_clustering_results_tufano_unique(
    best_clustering, unique_label_to_changes_filtered5,
    from_change_to_unique_label_filtered5, changes_with_different_methods_filtered5,
    without_outliers=True, to_print=True)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=159), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=159), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=159), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.1
F-measure = 0.7810342248069188
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=153), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=153), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=153), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.15000000000000002
F-measure = 0.78817340091294
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.20000000000000004
F-measure = 0.7951013922289897
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=139), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=139), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=139), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.25000000000000006
F-measure = 0.810613902951956
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=134), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=134), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=134), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.30000000000000004
F-measure = 0.8147578835113205
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=129), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=129), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=129), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.3500000000000001
F-measure = 0.8184591175686287
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=125), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=125), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=125), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.40000000000000013
F-measure = 0.8246486404403358
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=125), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=125), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=125), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.45000000000000007
F-measure = 0.826312747816717
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=121), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=121), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=121), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.5000000000000001
F-measure = 0.8335244642717126
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=116), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=116), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=116), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.5500000000000002
F-measure = 0.8394861909117939
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=113), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=113), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=113), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.6000000000000002
F-measure = 0.8478958941976507
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=109), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=109), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=109), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.6500000000000001
F-measure = 0.8510846385792896
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.7000000000000002
F-measure = 0.8519394758748977
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=96), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=96), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=96), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.7500000000000002
F-measure = 0.8563249196886672
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.8000000000000002
F-measure = 0.8322864734766499
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=71), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=71), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=71), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.8500000000000002
F-measure = 0.7226095177618288
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=42), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=42), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=42), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.9000000000000002
F-measure = 0.3724377784605341
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.9500000000000003
F-measure = 0.08597551572402211
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 1.0000000000000004
F-measure = 0.02757143590527473
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 1.0500000000000003
F-measure = 0.02757143590527473

0.7500000000000002 0.8563249196886672
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=96), HTML(value='')))


Calculate rand...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=96), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=96), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Number of clusters: 96
Number of outliers: 149/1818 (8.196%)

Entropy = 0.243253
Purity = 0.901738
F-measure = 0.856325

Rand = 99.567%
Jaccard Index = 75.2%


In [124]:
# Fine-grained sweep of the DBSCAN radius over the precomputed distance matrix
# `dists_filtered10_to_5gram`; the clustering with the highest F-measure is kept.
# NOTE(review): np.arange with a float step does not guarantee whether the stop
# value is produced. The recorded run iterated 21 values and did evaluate
# eps = 0.8. np.linspace(0.7, 0.8, 21) would make that explicit, but it may
# change eps in the last bit -- confirm before switching.
max_fm, best_eps, best_clustering = 0, 0, None

for eps in tqdm_notebook(np.arange(0.7, 0.8, 0.005)):
    tufano_clustering = sklearn.cluster.DBSCAN(
        eps=eps,
        min_samples=2,
        metric='precomputed',
    ).fit(dists_filtered10_to_5gram)

    # Quiet evaluation used only to rank this eps by F-measure.
    # Presumably only_fm=True skips the Rand computation (the recorded output
    # shows no "Calculate rand..." step for these calls) -- verify in the helper.
    rand, fm = print_clustering_results_tufano_unique(
        tufano_clustering,
        unique_label_to_changes_filtered5,
        from_change_to_unique_label_filtered5,
        changes_with_different_methods_filtered5,
        without_outliers=True,
        to_print=False,
        only_fm=True,
    )
    print("Eps =", eps)
    print("F-measure =", fm)

    # Strict '>' means that among equal scores the first (smallest) eps wins.
    if fm > max_fm:
        max_fm, best_eps, best_clustering = fm, eps, tufano_clustering

print(best_eps, max_fm)


# Evaluate the winning clustering once more, this time with the full report.
# best_clustering is still None at this point if no eps scored fm > 0.
rand, fm = print_clustering_results_tufano_unique(
    best_clustering,
    unique_label_to_changes_filtered5,
    from_change_to_unique_label_filtered5,
    changes_with_different_methods_filtered5,
    without_outliers=True,
    to_print=True,
)

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))

Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.7
F-measure = 0.8519394758748977
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.705
F-measure = 0.8519394758748977
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=101), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=101), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=101), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.71
F-measure = 0.8528506088736961
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=101), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=101), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=101), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.715
F-measure = 0.8528506088736961
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.72
F-measure = 0.8537179647391568
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.725
F-measure = 0.8537179647391568
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.73
F-measure = 0.8537179647391568
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.735
F-measure = 0.8537179647391568
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.74
F-measure = 0.8537179647391568
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=99), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=99), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=99), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.745
F-measure = 0.8537179647391568
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=96), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=96), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=96), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.75
F-measure = 0.8563249196886672
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.755
F-measure = 0.8660516974819721
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=93), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=93), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=93), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.76
F-measure = 0.8655377417482452
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=91), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=91), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=91), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.765
F-measure = 0.8549614231113905
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=91), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=91), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=91), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.77
F-measure = 0.8549614231113905
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=91), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=91), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=91), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.775
F-measure = 0.8552830927052505
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=90), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=90), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=90), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.78
F-measure = 0.8498848041831895
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=89), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=89), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=89), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.785
F-measure = 0.8444225659997382
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=89), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=89), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=89), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.79
F-measure = 0.8441620136287116
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.795
F-measure = 0.8322864734766499
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Eps = 0.8
F-measure = 0.8322864734766499

0.755 0.8660516974819721
Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


Calculate rand...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Number of clusters: 94
Number of outliers: 149/1818 (8.196%)

Entropy = 0.243369
Purity = 0.901138
F-measure = 0.866052

Rand = 99.682%
Jaccard Index = 81.781%


In [142]:
# Cluster the precomputed `dists_filtered3_to_5gram` distances with a fixed
# DBSCAN radius, then print the evaluation report for that clustering.
# NOTE(review): the label maps passed below are named *_filtered5 although the
# distance matrix is the filtered3 one -- presumably those names were rebound
# for this dataset before the cell ran; confirm on a fresh kernel.
tufano_clustering = sklearn.cluster.DBSCAN(
    eps=0.605,
    min_samples=2,
    metric='precomputed',
).fit(dists_filtered3_to_5gram)

# With only_fm=True the recorded report prints "Rand = 0%" and
# "Jaccard Index = 0%"; presumably those two figures are not computed in this
# mode and should not be read as measurements -- verify in the helper.
rand, fm = print_clustering_results_tufano_unique(
    tufano_clustering,
    unique_label_to_changes_filtered5,
    from_change_to_unique_label_filtered5,
    changes_with_different_methods_filtered5,
    without_outliers=True,
    to_print=True,
    only_fm=True,
)

Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=6393), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=932), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=932), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=932), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=1016), HTML(value='')))


Number of clusters: 932
Number of outliers: 1322/6393 (20.679%)

Entropy = 0.344238
Purity = 0.888977
F-measure = 0.76018

Rand = 0%
Jaccard Index = 0%
Pure classes: 792


In [146]:
# Cluster the precomputed `dists_filtered5_to_5gram` distances with a fixed
# DBSCAN radius, then print the evaluation report for that clustering.
tufano_clustering = sklearn.cluster.DBSCAN(
    eps=0.66,
    min_samples=2,
    metric='precomputed',
).fit(dists_filtered5_to_5gram)

# With only_fm=True the recorded report prints "Rand = 0%" and
# "Jaccard Index = 0%"; presumably those two figures are not computed in this
# mode and should not be read as measurements -- verify in the helper.
rand, fm = print_clustering_results_tufano_unique(
    tufano_clustering,
    unique_label_to_changes_filtered5,
    from_change_to_unique_label_filtered5,
    changes_with_different_methods_filtered5,
    without_outliers=True,
    to_print=True,
    only_fm=True,
)

Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=3653), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=362), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=362), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=362), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=367), HTML(value='')))


Number of clusters: 362
Number of outliers: 470/3653 (12.866%)

Entropy = 0.318361
Purity = 0.890983
F-measure = 0.807519

Rand = 0%
Jaccard Index = 0%
Pure classes: 291


In [149]:
# Cluster the precomputed `dists_filtered10_to_5gram` distances with a fixed
# DBSCAN radius, then print the evaluation report for that clustering.
# NOTE(review): the label maps passed below are named *_filtered5 although the
# distance matrix is the filtered10 one -- presumably those names were rebound
# for this dataset before the cell ran; confirm on a fresh kernel.
tufano_clustering = sklearn.cluster.DBSCAN(
    eps=0.755,
    min_samples=2,
    metric='precomputed',
).fit(dists_filtered10_to_5gram)

# With only_fm=True the recorded report prints "Rand = 0%" and
# "Jaccard Index = 0%"; presumably those two figures are not computed in this
# mode and should not be read as measurements -- verify in the helper.
rand, fm = print_clustering_results_tufano_unique(
    tufano_clustering,
    unique_label_to_changes_filtered5,
    from_change_to_unique_label_filtered5,
    changes_with_different_methods_filtered5,
    without_outliers=True,
    to_print=True,
    only_fm=True,
)

Start evaluate...
Save clusters...


HBox(children=(IntProgress(value=0, max=1818), HTML(value='')))


Delete outliers...


HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


Calculate confusion...


HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


Calculate entropy & purity...


HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


Calculate F-measure...


HBox(children=(IntProgress(value=0, max=103), HTML(value='')))


Number of clusters: 94
Number of outliers: 149/1818 (8.196%)

Entropy = 0.243369
Purity = 0.901138
F-measure = 0.866052

Rand = 0%
Jaccard Index = 0%
Pure classes: 79
