In [1]:
import numpy as np

import matplotlib as mpl
mpl.rcParams['savefig.dpi'] = 100
mpl.rcParams['figure.dpi'] = 100

import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import string
import mplcursors
import collections
import sklearn.cluster
import os
from tqdm import tqdm_notebook
#%matplotlib inline
%matplotlib notebook
#%matplotlib notebook

In [141]:
# NEW IMPLEMENTATION

import math

def get_intersections(hist1, hist2):
    """Return ``(index, value1, value2)`` for every index present in both histograms.

    Both arguments are sequences of ``(index, value)`` pairs. They are walked with a
    two-pointer merge, so the result is only complete when each sequence is ordered
    by ascending index.
    """
    shared = []
    pos1, pos2 = 0, 0
    end1, end2 = len(hist1), len(hist2)

    while pos1 < end1 and pos2 < end2:
        idx1 = hist1[pos1][0]
        idx2 = hist2[pos2][0]

        if idx1 == idx2:
            shared.append((idx1, hist1[pos1][1], hist2[pos2][1]))
            pos1 += 1
            pos2 += 1
        elif idx1 < idx2:
            # hist1 is behind: advance it towards idx2.
            pos1 += 1
        else:
            pos2 += 1

    return shared

def get_union_inds(hist1, hist2):
    """Return the merged list of indices that occur in either histogram.

    Both arguments are sequences of ``(index, value)`` pairs walked with a two-pointer
    merge; an index present in both inputs is emitted once.
    """
    merged = []
    pos1, pos2 = 0, 0
    end1, end2 = len(hist1), len(hist2)

    # Interleave while both histograms still have entries.
    while pos1 < end1 and pos2 < end2:
        idx1 = hist1[pos1][0]
        idx2 = hist2[pos2][0]

        if idx1 < idx2:
            merged.append(idx1)
            pos1 += 1
        elif idx1 > idx2:
            merged.append(idx2)
            pos2 += 1
        else:
            merged.append(idx1)
            pos1 += 1
            pos2 += 1

    # At most one of the two tails is non-empty at this point.
    for k in range(pos1, end1):
        merged.append(hist1[k][0])
    for k in range(pos2, end2):
        merged.append(hist2[k][0])

    return merged


def jaccard_metric(hist1, hist2, hist_len=None):
    """Distance built from the mean min/max ratio over the indices shared by both histograms.

    Only indices present in both inputs take part. When nothing is shared the
    distance is 1. ``hist_len`` is accepted for signature compatibility with the
    other metrics and is not used.
    """
    shared = get_intersections(hist1, hist2)

    if len(shared) == 0:
        return 1

    # Per-bin similarity is min(v1, v2) / max(v1, v2), averaged over the shared bins.
    ratio_sum = 0
    for _ind, v1, v2 in shared:
        larger = max(v1, v2)
        smaller = min(v1, v2)
        ratio_sum += smaller / larger

    return 1 - ratio_sum / len(shared)

def canberra_metric(hist1, hist2, hist_len=None):
    """Canberra distance between two sparse histograms, averaged over the union of their indices.

    Parameters
    ----------
    hist1, hist2 : sequences of ``(index, value)`` pairs, walked with a two-pointer
        merge (so they are expected in ascending index order).
    hist_len : unused; kept so every metric shares the same call signature.

    Returns
    -------
    ``1`` when both histograms are empty, otherwise the mean per-bin term:
    ``|a - b| / (|a| + |b|)`` for an index present in both, ``1.0`` for an index
    present in only one of them.

    A matched bin whose two values are both zero contributes 0 (the usual 0/0 := 0
    Canberra convention) instead of raising ZeroDivisionError.
    """
    metric = 0
    i = 0
    j = 0
    n = len(hist1)
    m = len(hist2)

    # Every loop iteration consumes exactly one distinct index, so counting the
    # iterations gives the union size without a second merge pass.
    union_len = 0

    while i < n or j < m:
        if i >= n:
            # Index only in hist2: |0 - v| / (0 + |v|) == 1.
            metric += 1.0
            j += 1
        elif j >= m:
            metric += 1.0
            i += 1
        elif hist1[i][0] == hist2[j][0]:
            denominator = abs(hist1[i][1]) + abs(hist2[j][1])
            if denominator != 0:
                metric += abs(hist1[i][1] - hist2[j][1]) / denominator
            i += 1
            j += 1
        elif hist1[i][0] < hist2[j][0]:
            metric += 1.0
            i += 1
        else:
            metric += 1.0
            j += 1

        union_len += 1

    if union_len == 0:
        return 1

    return metric / union_len


def canberra_metric_optimized(hist1, hist2, hist_len=None):
    """Single-pass Canberra distance between two sparse histograms.

    Parameters
    ----------
    hist1, hist2 : sequences of ``(index, value)`` pairs, walked with a two-pointer
        merge (so they are expected in ascending index order).
    hist_len : unused; kept so every metric shares the same call signature.

    Returns
    -------
    ``1`` when both histograms are empty, otherwise the mean over the union of
    indices of ``|a - b| / (|a| + |b|)`` (matched index) or ``1.0`` (index present
    in only one histogram).

    A matched bin whose two values are both zero contributes 0 (the usual 0/0 := 0
    Canberra convention) instead of raising ZeroDivisionError.
    """
    metric = 0
    i = 0
    j = 0
    n = len(hist1)
    m = len(hist2)

    # One distinct index is consumed per iteration, so this ends up as the union size.
    union_len = 0

    while i < n or j < m:
        if i >= n:
            metric += 1.0
            j += 1
        elif j >= m:
            metric += 1.0
            i += 1
        elif hist1[i][0] == hist2[j][0]:
            denominator = abs(hist1[i][1]) + abs(hist2[j][1])
            if denominator != 0:
                metric += abs(hist1[i][1] - hist2[j][1]) / denominator
            i += 1
            j += 1
        elif hist1[i][0] < hist2[j][0]:
            metric += 1.0
            i += 1
        elif hist1[i][0] > hist2[j][0]:
            metric += 1.0
            j += 1

        union_len += 1

    if union_len == 0:
        return 1

    return metric / union_len



'''
def pearsons_correlation(hist1, hist2, hist_len):
    #hist_len = hist_len_3gram
    union_len = len(get_union_inds(hist1, hist2))
    
    top = 0
    left = 0
    right = 0
    
    n = len(hist1)
    m = len(hist2)
    
    i = 0
    j = 0
    
    while i < n or j < m:
        if i >= n:
            top += (- 1/hist_len) * (hist2[j][1] - 1/hist_len)
            left += (- 1/hist_len) ** 2
            right += (hist2[j][1] - 1/hist_len) ** 2
            j += 1
        elif j >= m:
            top += (hist1[i][1] - 1/hist_len) * (- 1/hist_len)
            left += (hist1[i][1] - 1/hist_len)  ** 2
            right += (- 1/hist_len) ** 2
            i += 1
        elif hist1[i][0] == hist2[j][0]:
            top += (hist1[i][1] - 1/hist_len) * (hist2[j][1] - 1/hist_len)
            left += (hist1[i][1] - 1/hist_len)  ** 2
            right += (hist2[j][1] - 1/hist_len) ** 2
            i += 1
            j += 1
        elif hist1[i][0] < hist2[j][0]:
            top += (hist1[i][1] - 1/hist_len) * (- 1/hist_len)
            left += (hist1[i][1] - 1/hist_len)  ** 2
            right += (- 1/hist_len) ** 2
            i += 1
        elif hist1[i][0] > hist2[j][0]:
            top += (- 1/hist_len) * (hist2[j][1] - 1/hist_len)
            left += (- 1/hist_len) ** 2
            right += (hist2[j][1] - 1/hist_len) ** 2
            j += 1

    bottom = math.sqrt(left * right)
    return 1 - top / bottom
'''


def cos_distance(hist1, hist2, hist_len=None):
    """Cosine distance ``1 - |cos|`` between two sparse histograms.

    Parameters
    ----------
    hist1, hist2 : sequences of ``(index, value)`` pairs; the dot product is taken
        over the indices returned by ``get_intersections``.
    hist_len : unused; kept so every metric shares the same call signature.

    Returns
    -------
    ``1 - abs(dot / (norm1 * norm2))``. When either histogram has zero norm (for
    example an empty histogram) the cosine is undefined; ``1.0`` is returned in
    that case instead of the NaN that a 0/0 division would produce, matching the
    "nothing in common" value the other metrics in this notebook return.
    """
    top = 0
    for _ind, v1, v2 in get_intersections(hist1, hist2):
        top += v1 * v2

    bottom1 = sum(pair[1] ** 2 for pair in hist1)
    bottom2 = sum(pair[1] ** 2 for pair in hist2)

    norm_product = bottom1 * bottom2
    if norm_product == 0:
        # A NaN here would poison the whole precomputed distance matrix.
        return 1.0

    return 1 - abs(top / np.sqrt(norm_product))
    

def pearsons_correlation_mean(hist1, hist2, hist_len):
    """Correlation distance ``1 - r`` between two sparse histograms.

    Parameters
    ----------
    hist1, hist2 : sequences of ``(index, value)`` pairs, walked with a two-pointer
        merge (so they are expected in ascending index order).
    hist_len : divisor used for both means (``sum(values) / hist_len``).

    Returns
    -------
    ``round(1 - covariance / sqrt(var1 * var2), 15)``, where the sums run over the
    union of indices of the two histograms and a missing value counts as 0.
    Returns ``1.0`` when either variance term is zero (e.g. an empty histogram),
    where the original expression raised ZeroDivisionError.

    NOTE(review): indices absent from *both* histograms contribute nothing to the
    sums, although the means are taken over ``hist_len`` bins - confirm that
    restricting the sums to the union is intended.
    """
    mean1 = sum(pair[1] for pair in hist1) / hist_len
    mean2 = sum(pair[1] for pair in hist2) / hist_len

    top = 0
    left = 0
    right = 0

    n = len(hist1)
    m = len(hist2)
    i = 0
    j = 0

    while i < n or j < m:
        if j >= m or (i < n and hist1[i][0] < hist2[j][0]):
            # Index only in hist1: hist2 contributes an implicit 0.
            dev1 = hist1[i][1] - mean1
            dev2 = -mean2
            i += 1
        elif i >= n or hist1[i][0] > hist2[j][0]:
            # Index only in hist2: hist1 contributes an implicit 0.
            dev1 = -mean1
            dev2 = hist2[j][1] - mean2
            j += 1
        else:
            dev1 = hist1[i][1] - mean1
            dev2 = hist2[j][1] - mean2
            i += 1
            j += 1

        top += dev1 * dev2
        left += dev1 ** 2
        right += dev2 ** 2

    bottom = math.sqrt(left * right)
    if bottom == 0:
        # Correlation is undefined for a constant vector; report "uncorrelated".
        return 1.0

    return round(1 - top / bottom, 15)
    

def get_dists(hists, dist_metric, hist_len):
    """Build the full pairwise distance table as a list of rows.

    ``dist_metric(hist_a, hist_b, hist_len)`` is evaluated for every ordered pair,
    including each histogram against itself; a progress bar tracks the outer loop.
    """
    return [
        [dist_metric(row_hist, col_hist, hist_len) for col_hist in hists]
        for row_hist in tqdm_notebook(hists)
    ]


def get_dists_optimized(hists, dist_metric, hist_len):
    """Build a symmetric ``n x n`` distance array, evaluating each unordered pair once.

    Only pairs with ``row > col`` are passed to ``dist_metric`` and the result is
    mirrored; the diagonal is left at 0. A progress bar tracks the outer loop.
    """
    n = len(hists)
    dists = np.zeros((n, n))

    for row in tqdm_notebook(range(n)):
        # Strictly-lower triangle only: col runs over 0 .. row - 1.
        for col in range(row):
            distance = dist_metric(hists[row], hists[col], hist_len)
            dists[row, col] = distance
            dists[col, row] = distance

    return dists

In [3]:
# Root directory holding one result sub-directory per change. This is a hard-coded
# absolute path on an external volume; a later cell reassigns RESULTS_LASE.
RESULTS_LASE = "/Volumes/Seagate/Alina/result_for_lase_treat-false"

In [4]:
# Names of the sub-directories of RESULTS_LASE, ordered by the integer that
# precedes the first space in each name.
all_dirs = sorted(
    (
        os.fsdecode(entry)
        for entry in os.listdir(RESULTS_LASE)
        if os.path.isdir(RESULTS_LASE + "/" + os.fsdecode(entry))
    ),
    key=lambda dir_name: int(dir_name.split(" ")[0]),
)
len(all_dirs)

149

In [5]:
LASE_DATASET_PATH = "/Users/aliscafo/Documents/ALINA/WORK/SPbAU/thesis/CodeDiffEditScripts/LaseDataset"

# Ground-truth bookkeeping in both directions:
#   label_to_changes:      label     -> list of change ids carrying that label
#   from_change_to_label:  change id -> list holding its label
label_to_changes = collections.defaultdict(list)
from_change_to_label = collections.defaultdict(list)

for double_num in tqdm_notebook(all_dirs):
    # The change id is the part of the directory name before the first space.
    change_id = double_num.split(" ")[0]

    # `with` guarantees the handle is released even if the label is not an integer.
    with open(LASE_DATASET_PATH + "/" + change_id + "/" + "label") as label_file:
        label = int(label_file.read())

    label_to_changes[label].append(change_id)
    from_change_to_label[change_id].append(label)
    

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




In [6]:
# Sanity check: number of changes that received a label, then number of distinct labels.
print(len(from_change_to_label))
print("NUM CLASSES:", len(label_to_changes))

149
NUM CLASSES: 24


In [7]:
# Flatten the label -> changes mapping into parallel bookkeeping lists:
#   ids_per_label: change ids grouped label by label
#   all_labels:    labels in iteration order
#   lines_poses:   running count of changes after each label (group end offsets)
ids_per_label = []
all_labels = []
lines_poses = []
cur_sum = 0

for label, changes in label_to_changes.items():
    all_labels.append(label)
    ids_per_label.extend(changes)
    cur_sum += len(changes)
    lines_poses.append(cur_sum)

In [8]:
# Show which change id sits at each position of the flattened list.
for i, change_id in enumerate(ids_per_label):
    print("On index", i, change_id)

On index 0 1
On index 1 2
On index 2 3
On index 3 4
On index 4 5
On index 5 6
On index 6 7
On index 7 8
On index 8 9
On index 9 10
On index 10 11
On index 11 12
On index 12 13
On index 13 14
On index 14 15
On index 15 16
On index 16 17
On index 17 18
On index 18 19
On index 19 20
On index 20 21
On index 21 22
On index 22 23
On index 23 24
On index 24 25
On index 25 26
On index 26 27
On index 27 28
On index 28 29
On index 29 30
On index 30 31
On index 31 32
On index 32 33
On index 33 34
On index 34 35
On index 35 36
On index 36 37
On index 37 38
On index 38 39
On index 39 40
On index 40 41
On index 41 42
On index 42 43
On index 43 44
On index 44 45
On index 45 46
On index 46 47
On index 47 48
On index 48 49
On index 49 50
On index 50 51
On index 51 52
On index 52 53
On index 53 54
On index 54 55
On index 55 56
On index 56 57
On index 57 58
On index 58 59
On index 59 60
On index 60 61
On index 61 62
On index 62 63
On index 63 64
On index 64 65
On index 65 66
On index 66 67
On index 67 68

In [9]:
# Print the [start, end) position range that each label occupies in ids_per_label.
for i, range_end in enumerate(lines_poses):
    label = all_labels[i]
    if i > 0:
        print("From", lines_poses[i - 1], "to", range_end, label)
        continue
    print("From 0 to", range_end, label)

From 0 to 4 1
From 4 to 20 2
From 20 to 24 3
From 24 to 30 4
From 30 to 42 5
From 42 to 45 6
From 45 to 52 7
From 52 to 56 8
From 56 to 60 9
From 60 to 63 10
From 63 to 66 11
From 66 to 75 12
From 75 to 81 13
From 81 to 84 14
From 84 to 87 15
From 87 to 96 16
From 96 to 100 17
From 100 to 105 18
From 105 to 111 19
From 111 to 116 20
From 116 to 126 21
From 126 to 129 22
From 129 to 134 23
From 134 to 149 24


In [27]:
def print_clustering_results_tufano_unique(clustering, dists, unique_label_to_changes, 
                                    from_change_to_unique_label, ids_per_unique_label, 
                                    without_outliers,
                                    to_print=True):
    """Score a fitted clustering against ground-truth labels and optionally print a report.

    Parameters
    ----------
    clustering : object exposing ``labels_`` (one cluster id per sample, -1 = outlier).
        ``labels_`` is modified in place: members of single-element clusters are
        relabelled -1 and treated as outliers.
    dists : square pairwise-distance table indexable as ``dists[x][y]``.
    unique_label_to_changes : mapping ground-truth label -> list of change ids.
    from_change_to_unique_label : mapping change id -> sequence whose first item is
        the ground-truth label of that change.
    ids_per_unique_label : change id of each sample, in the same order as ``labels_``.
    without_outliers : when true, sample pairs involving an outlier are left out of
        the pair-counting scores (Rand, Jaccard).
    to_print : print the report when true.

    Returns
    -------
    ``(rand, Fmeasure)`` - the Rand index in percent rounded to 3 decimals, and the
    class-size-weighted best-match F-measure.

    When no cluster survives (every sample is an outlier) the F-measure and the
    pair-counting scores are reported as 0 instead of raising.
    """
    clusters = collections.defaultdict(list)         # cluster id -> change ids
    clusters_to_ids = collections.defaultdict(list)  # cluster id -> sample positions
    outliers = []
    labels = clustering.labels_
    num_changes = len(labels)

    for i in range(num_changes):
        label = labels[i]
        change = ids_per_unique_label[i]
        if label == -1:
            outliers.append(change)
        else:
            clusters[label].append(change)
            clusters_to_ids[label].append(i)

    # Single-element clusters are demoted to outliers (the original comment marks
    # this as being for agglomerative results).
    for label in list(clusters_to_ids.keys()):
        if len(clusters_to_ids[label]) == 1:
            outliers.append(clusters[label][0])
            labels[clusters_to_ids[label][0]] = -1
            clusters_to_ids.pop(label, None)
            clusters.pop(label, None)

    num_inliers = num_changes - len(outliers)

    # ---- Internal indices, based on the similarity MAX_DIST - distance ----
    cohesion = 0
    separation = 0
    g1 = 0
    MAX_DIST = np.array(dists).max()
    for i in clusters_to_ids.keys():
        members = clusters_to_ids[i]

        # Sum of similarities over unordered pairs inside cluster i.
        coef = 0
        for xi in range(len(members)):
            for yi in range(xi + 1, len(members)):
                coef += MAX_DIST - dists[members[xi]][members[yi]]

        cohesion += (1 / len(members)) * coef
        # NOTE(review): coef is 0 when every intra-cluster distance equals MAX_DIST,
        # which makes the two divisions below divide by zero.
        coef_for_sep = len(members) / np.sqrt(coef)
        coef = 1 / coef

        # Sum of similarities between cluster i and every other cluster.
        summ = 0
        for j in clusters_to_ids.keys():
            if i == j:
                continue
            for x in members:
                for y in clusters_to_ids[j]:
                    summ += MAX_DIST - dists[x][y]

        g1 += coef * summ
        separation += coef_for_sep * summ

    # ---- Pair counting against the ground truth ----
    init_labels = [from_change_to_unique_label[ids_per_unique_label[i]][0]
                   for i in range(num_changes)]

    same_class_same_cluster = 0   # agreeing pairs (true positives)
    same_class_diff_cluster = 0   # class split across clusters
    diff_class_same_cluster = 0   # different classes merged into one cluster
    diff_class_diff_cluster = 0   # agreeing pairs (true negatives)

    for i in range(num_changes):
        for j in range(i + 1, num_changes):
            if without_outliers and (labels[i] == -1 or labels[j] == -1):
                continue

            same_class = init_labels[i] == init_labels[j]
            same_cluster = labels[i] == labels[j]

            if same_class and same_cluster:
                same_class_same_cluster += 1
            elif same_class:
                same_class_diff_cluster += 1
            elif same_cluster:
                diff_class_same_cluster += 1
            else:
                diff_class_diff_cluster += 1

    # ---- Entropy, purity and per-class F scores from the cluster/class counts ----
    ENTROPY = 0
    PURITY = 0
    Fs = collections.defaultdict(list)
    for clustering_label, members in clusters_to_ids.items():
        cluster_size = len(members)
        class_counts = collections.Counter(init_labels[sample] for sample in members)

        entropy = 0
        purity = 0
        for init_lbl in unique_label_to_changes:
            pij = class_counts[init_lbl] / cluster_size
            precision = pij
            recall = class_counts[init_lbl] / len(unique_label_to_changes[init_lbl])

            if pij != 0:
                entropy += - pij * np.log2(pij)
            purity = max(purity, pij)

            fij = 0
            if precision != 0 or recall != 0:
                fij = (2 * precision * recall) / (precision + recall)
            Fs[init_lbl].append(fij)

        ENTROPY += entropy * cluster_size / num_inliers
        PURITY += purity * cluster_size / num_inliers

    # Each class is credited with its best-matching cluster; `default=0` covers the
    # case where no cluster exists at all.
    Fmeasure = 0
    for init_lbl in unique_label_to_changes:
        Fmeasure += max(Fs[init_lbl], default=0) * len(unique_label_to_changes[init_lbl]) / num_changes

    agreeing_pairs = same_class_same_cluster + diff_class_diff_cluster
    total_pairs = agreeing_pairs + same_class_diff_cluster + diff_class_same_cluster
    jaccard_pairs = same_class_same_cluster + same_class_diff_cluster + diff_class_same_cluster

    # No countable pairs (e.g. everything is an outlier) -> report 0 rather than divide by zero.
    rand = round(100 * agreeing_pairs / total_pairs, 3) if total_pairs else 0.0
    jaccard_index = round(100 * same_class_same_cluster / jaccard_pairs, 3) if jaccard_pairs else 0.0
    outl_percent = round(100 * len(outliers) / num_changes, 3)

    if (to_print):
        # Disabled per-cluster listing, kept for reference:
        # print("CLUSTERS:\n")
        # for i in clusters.keys():
        #     for change in clusters[i]:
        #         print(change, from_change_to_unique_label[change])
        #     print("\n")

        print("Number of clusters:", len(clusters))
        print("Number of outliers:", str(len(outliers)) + "/" + str(num_changes), "(" + str(outl_percent) + "%)")

        print()
        print("Cohesion =", str(cohesion))
        print("Separation =", str(separation))
        print("G1 =", str(g1))

        print()
        print("Entropy =", str(round(ENTROPY, 6)))
        print("Purity =", str(round(PURITY, 6)))
        print("F-measure =", round(Fmeasure, 6))

        print()
        print("Rand =", str(rand) + "%")
        print("Jaccard Index =", str(jaccard_index) + "%")

    return rand, Fmeasure
    
    

In [208]:
def find_eps_with_brute_force_universal(dists, label_to_changes, from_change_to_label, 
                                        ids_per_label, agglomerative=False, linkage='single', with_step=None):
    """Try a range of distance thresholds and return the one with the highest F-measure.

    Candidate thresholds are the distinct values found in ``dists`` or, when
    ``with_step`` is given, a regular grid from 0 up to ``max(1.1, dists.max())``.
    Non-positive candidates are skipped. Each candidate is used either as the DBSCAN
    ``eps`` (``min_samples=2``) or, when ``agglomerative`` is ``True``, as the
    ``distance_threshold`` of an agglomerative clustering with the given ``linkage``.
    Every clustering is scored with ``print_clustering_results_tufano_unique``
    (outlier pairs excluded, printing disabled).

    Prints the algorithm name, the best threshold and its F-measure, and returns the
    best threshold (``-1`` if no candidate scored above 0).
    """
    flat_dists = np.array(dists).flatten()
    if with_step is not None:
        all_dists = np.arange(0.0, max(1.1, flat_dists.max()), with_step)
    else:
        all_dists = np.unique(flat_dists)

    print("Agglomerative Clustering" if agglomerative is True else "DBSCAN")

    best_eps = -1
    max_fm = 0

    for eps in tqdm_notebook(all_dists):
        if eps <= 0.0:
            continue

        if agglomerative is True:
            model = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed',
                                                            linkage=linkage,
                                                            compute_full_tree=True,
                                                            distance_threshold=eps)
        else:
            model = sklearn.cluster.DBSCAN(eps=eps,
                                           min_samples=2,
                                           metric='precomputed')
        tufano_clustering = model.fit(dists)

        # Only the F-measure drives the selection; the Rand value is not used here.
        rand, fm = print_clustering_results_tufano_unique(tufano_clustering,
                                                          dists,
                                                          label_to_changes,
                                                          from_change_to_label,
                                                          ids_per_label,
                                                          to_print=False,
                                                          without_outliers=True)

        if fm > max_fm:
            max_fm = fm
            best_eps = eps

    print("Best eps:", best_eps)
    print("Best F-measure:", max_fm)

    return best_eps

In [12]:
def get_lase_hists(hists_path, gram_path, change_ids=None):
    """Load one sparse histogram per change from ``*_hist.txt`` files.

    Parameters
    ----------
    hists_path : root directory of the results.
    gram_path : sub-directory name of the n-gram size (e.g. ``"3gram"``).
    change_ids : iterable of change ids (strings) to load, in output order.
        Defaults to the notebook-level ``ids_per_label`` list, which is what the
        function always used before this parameter existed.

    Returns
    -------
    A list with one entry per change id: the ``(index, count)`` pairs parsed from
    ``<hists_path>/<id> <id>/<gram_path>/sampleChange1_hist.txt``, or ``[]`` when
    that file does not exist.
    """
    if change_ids is None:
        change_ids = ids_per_label

    hists = []

    for double_num in tqdm_notebook(change_ids):
        hist_path1 = hists_path + "/" + double_num + " " + double_num + "/" + gram_path + "/sampleChange1" + "_hist.txt"

        if not os.path.exists(hist_path1):
            # Keep positions aligned with change_ids: a missing file is an empty histogram.
            hists.append([])
            continue

        # `with` releases the handle even when a line fails to parse.
        with open(hist_path1, "r") as hist_file1:
            lines = hist_file1.read().split("\n")

        hist_data1 = []
        for line in lines:
            if line != '':
                fields = line.split(" ")
                hist_data1.append((int(fields[0]), int(fields[1])))
        hists.append(hist_data1)

    return hists

def get_lase_hists_len(hists_path, gram_path):
    """Return (and print) the histogram length recorded in the n-gram mapping file.

    Reads ``<hists_path>/edit_scripts_<gram_path>s_mapped.txt``, takes the first
    space-separated token of the last non-empty line, drops its final character and
    returns that integer plus one.

    The last *non-empty* line is used so the result is the same whether or not the
    file ends with a newline (indexing ``[-2]`` silently picked the line before the
    last record when the trailing newline was missing).
    """
    es_path = hists_path + "/" + "edit_scripts_" + gram_path + "s_mapped.txt"
    with open(es_path, "r") as es_file:
        es_lines = es_file.read().split("\n")

    last_record = [line for line in es_lines if line != ''][-1]
    hists_len = int(last_record.split(" ")[0][:-1]) + 1

    print(hists_len)

    return hists_len

In [86]:
# Select which result set the following cells load; exactly one assignment is active.
# The commented-out lines are the alternative result directories.
#RESULTS_LASE = "/Volumes/Seagate/Alina/result_for_lase_treat-false"

#RESULTS_LASE = "/Volumes/Seagate/Alina/result_for_lase_with_context_concat"

RESULTS_LASE = "/Volumes/Seagate/Alina/result_for_lase_wo_context_concat"

In [87]:
# Load the per-change sparse histograms for every n-gram size from 1 to 10.
# Each call walks the change ids and yields one histogram (possibly empty) per change.
hists_lase_1gram = get_lase_hists(RESULTS_LASE, "1gram")
hists_lase_2gram = get_lase_hists(RESULTS_LASE, "2gram")
hists_lase_3gram = get_lase_hists(RESULTS_LASE, "3gram")
hists_lase_4gram = get_lase_hists(RESULTS_LASE, "4gram")
hists_lase_5gram = get_lase_hists(RESULTS_LASE, "5gram")
hists_lase_6gram = get_lase_hists(RESULTS_LASE, "6gram")
hists_lase_7gram = get_lase_hists(RESULTS_LASE, "7gram")
hists_lase_8gram = get_lase_hists(RESULTS_LASE, "8gram")
hists_lase_9gram = get_lase_hists(RESULTS_LASE, "9gram")
hists_lase_10gram = get_lase_hists(RESULTS_LASE, "10gram")

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




In [14]:
'''
hist_len_1gram_lase = get_lase_hists_len(RESULTS_LASE, "1gram")
hist_len_2gram_lase = get_lase_hists_len(RESULTS_LASE, "2gram")
hist_len_3gram_lase = get_lase_hists_len(RESULTS_LASE, "3gram")
hist_len_4gram_lase = get_lase_hists_len(RESULTS_LASE, "4gram")
hist_len_5gram_lase = get_lase_hists_len(RESULTS_LASE, "5gram")
hist_len_6gram_lase = get_lase_hists_len(RESULTS_LASE, "6gram")
hist_len_7gram_lase = get_lase_hists_len(RESULTS_LASE, "7gram")
hist_len_8gram_lase = get_lase_hists_len(RESULTS_LASE, "8gram")
hist_len_9gram_lase = get_lase_hists_len(RESULTS_LASE, "9gram")
hist_len_10gram_lase = get_lase_hists_len(RESULTS_LASE, "10gram")
'''

409
1199
1726
2009
2167
2258
2313
2342
2358
2370


In [62]:
'''
hist_len_1gram_lase = get_lase_hists_len(RESULTS_LASE, "1gram")
hist_len_2gram_lase = get_lase_hists_len(RESULTS_LASE, "2gram")
hist_len_3gram_lase = get_lase_hists_len(RESULTS_LASE, "3gram")
hist_len_4gram_lase = get_lase_hists_len(RESULTS_LASE, "4gram")
hist_len_5gram_lase = get_lase_hists_len(RESULTS_LASE, "5gram")
hist_len_6gram_lase = get_lase_hists_len(RESULTS_LASE, "6gram")
hist_len_7gram_lase = get_lase_hists_len(RESULTS_LASE, "7gram")
hist_len_8gram_lase = get_lase_hists_len(RESULTS_LASE, "8gram")
hist_len_9gram_lase = get_lase_hists_len(RESULTS_LASE, "9gram")
hist_len_10gram_lase = get_lase_hists_len(RESULTS_LASE, "10gram")
'''

423
1221
1735
2016
2170
2259
2313
2342
2358
2370


In [88]:
# Histogram length for every n-gram size, read from the matching mapping file of the
# currently selected RESULTS_LASE directory (each call also prints the value).
hist_len_1gram_lase = get_lase_hists_len(RESULTS_LASE, "1gram")
hist_len_2gram_lase = get_lase_hists_len(RESULTS_LASE, "2gram")
hist_len_3gram_lase = get_lase_hists_len(RESULTS_LASE, "3gram")
hist_len_4gram_lase = get_lase_hists_len(RESULTS_LASE, "4gram")
hist_len_5gram_lase = get_lase_hists_len(RESULTS_LASE, "5gram")
hist_len_6gram_lase = get_lase_hists_len(RESULTS_LASE, "6gram")
hist_len_7gram_lase = get_lase_hists_len(RESULTS_LASE, "7gram")
hist_len_8gram_lase = get_lase_hists_len(RESULTS_LASE, "8gram")
hist_len_9gram_lase = get_lase_hists_len(RESULTS_LASE, "9gram")
hist_len_10gram_lase = get_lase_hists_len(RESULTS_LASE, "10gram")

325
1058
1607
1933
2126
2236
2299
2332
2348
2361


In [89]:
# Stack the 1- and 2-gram histograms of each change into one index space:
# every n-gram family is shifted by the combined length of the families before it.
gram_families = [
    (hists_lase_1gram, hist_len_1gram_lase),
    (hists_lase_2gram, hist_len_2gram_lase),
]

num_changes = len(hists_lase_1gram)

concat_hists_to_2gram_lase = []
for i in range(num_changes):
    concat_hist = []
    offset = 0
    for family_hists, family_len in gram_families:
        for gram_ind, amount in family_hists[i]:
            concat_hist.append((gram_ind + offset, amount))
        offset += family_len
    concat_hists_to_2gram_lase.append(concat_hist)

concat_hists_len_to_2gram_lase = sum(length for hists, length in gram_families)

In [90]:
# Stack the 1- to 3-gram histograms of each change into one index space:
# every n-gram family is shifted by the combined length of the families before it.
gram_families = [
    (hists_lase_1gram, hist_len_1gram_lase),
    (hists_lase_2gram, hist_len_2gram_lase),
    (hists_lase_3gram, hist_len_3gram_lase),
]

num_changes = len(hists_lase_1gram)

concat_hists_to_3gram_lase = []
for i in range(num_changes):
    concat_hist = []
    offset = 0
    for family_hists, family_len in gram_families:
        for gram_ind, amount in family_hists[i]:
            concat_hist.append((gram_ind + offset, amount))
        offset += family_len
    concat_hists_to_3gram_lase.append(concat_hist)

concat_hists_len_to_3gram_lase = sum(length for hists, length in gram_families)

In [91]:
# Stack the 1- to 4-gram histograms of each change into one index space:
# every n-gram family is shifted by the combined length of the families before it.
gram_families = [
    (hists_lase_1gram, hist_len_1gram_lase),
    (hists_lase_2gram, hist_len_2gram_lase),
    (hists_lase_3gram, hist_len_3gram_lase),
    (hists_lase_4gram, hist_len_4gram_lase),
]

num_changes = len(hists_lase_1gram)

concat_hists_to_4gram_lase = []
for i in range(num_changes):
    concat_hist = []
    offset = 0
    for family_hists, family_len in gram_families:
        for gram_ind, amount in family_hists[i]:
            concat_hist.append((gram_ind + offset, amount))
        offset += family_len
    concat_hists_to_4gram_lase.append(concat_hist)

concat_hists_len_to_4gram_lase = sum(length for hists, length in gram_families)

In [92]:
# Stack the 1- to 5-gram histograms of each change into one index space:
# every n-gram family is shifted by the combined length of the families before it.
gram_families = [
    (hists_lase_1gram, hist_len_1gram_lase),
    (hists_lase_2gram, hist_len_2gram_lase),
    (hists_lase_3gram, hist_len_3gram_lase),
    (hists_lase_4gram, hist_len_4gram_lase),
    (hists_lase_5gram, hist_len_5gram_lase),
]

num_changes = len(hists_lase_1gram)

concat_hists_to_5gram_lase = []
for i in range(num_changes):
    concat_hist = []
    offset = 0
    for family_hists, family_len in gram_families:
        for gram_ind, amount in family_hists[i]:
            concat_hist.append((gram_ind + offset, amount))
        offset += family_len
    concat_hists_to_5gram_lase.append(concat_hist)

concat_hists_len_to_5gram_lase = sum(length for hists, length in gram_families)


In [93]:
# Stack the 1- to 6-gram histograms of each change into one index space:
# every n-gram family is shifted by the combined length of the families before it.
gram_families = [
    (hists_lase_1gram, hist_len_1gram_lase),
    (hists_lase_2gram, hist_len_2gram_lase),
    (hists_lase_3gram, hist_len_3gram_lase),
    (hists_lase_4gram, hist_len_4gram_lase),
    (hists_lase_5gram, hist_len_5gram_lase),
    (hists_lase_6gram, hist_len_6gram_lase),
]

num_changes = len(hists_lase_1gram)

concat_hists_to_6gram_lase = []
for i in range(num_changes):
    concat_hist = []
    offset = 0
    for family_hists, family_len in gram_families:
        for gram_ind, amount in family_hists[i]:
            concat_hist.append((gram_ind + offset, amount))
        offset += family_len
    concat_hists_to_6gram_lase.append(concat_hist)

concat_hists_len_to_6gram_lase = sum(length for hists, length in gram_families)


In [94]:
# Stack the 1- to 7-gram histograms of each change into one index space:
# every n-gram family is shifted by the combined length of the families before it.
gram_families = [
    (hists_lase_1gram, hist_len_1gram_lase),
    (hists_lase_2gram, hist_len_2gram_lase),
    (hists_lase_3gram, hist_len_3gram_lase),
    (hists_lase_4gram, hist_len_4gram_lase),
    (hists_lase_5gram, hist_len_5gram_lase),
    (hists_lase_6gram, hist_len_6gram_lase),
    (hists_lase_7gram, hist_len_7gram_lase),
]

num_changes = len(hists_lase_1gram)

concat_hists_to_7gram_lase = []
for i in range(num_changes):
    concat_hist = []
    offset = 0
    for family_hists, family_len in gram_families:
        for gram_ind, amount in family_hists[i]:
            concat_hist.append((gram_ind + offset, amount))
        offset += family_len
    concat_hists_to_7gram_lase.append(concat_hist)

concat_hists_len_to_7gram_lase = sum(length for hists, length in gram_families)


In [95]:
# Stack the 1- to 8-gram histograms of each change into one index space:
# every n-gram family is shifted by the combined length of the families before it.
gram_families = [
    (hists_lase_1gram, hist_len_1gram_lase),
    (hists_lase_2gram, hist_len_2gram_lase),
    (hists_lase_3gram, hist_len_3gram_lase),
    (hists_lase_4gram, hist_len_4gram_lase),
    (hists_lase_5gram, hist_len_5gram_lase),
    (hists_lase_6gram, hist_len_6gram_lase),
    (hists_lase_7gram, hist_len_7gram_lase),
    (hists_lase_8gram, hist_len_8gram_lase),
]

num_changes = len(hists_lase_1gram)

concat_hists_to_8gram_lase = []
for i in range(num_changes):
    concat_hist = []
    offset = 0
    for family_hists, family_len in gram_families:
        for gram_ind, amount in family_hists[i]:
            concat_hist.append((gram_ind + offset, amount))
        offset += family_len
    concat_hists_to_8gram_lase.append(concat_hist)

concat_hists_len_to_8gram_lase = sum(length for hists, length in gram_families)


In [96]:
# Concatenate the 1- to 9-gram histograms of every change into one histogram.
#
# Each histogram is a list of (gram_ind, amount) pairs. The indices of an
# n-gram level are shifted by the combined `hist_len_*` of all lower levels;
# a running offset replaces the hand-expanded sums that were previously
# repeated for every level.
# NOTE(review): assumes every hist_len_*gram_lase is an integer and that
# gram_ind < hist_len for its level, so shifted ranges do not collide -- confirm.
lase_levels_to_9gram = [
    (hists_lase_1gram, hist_len_1gram_lase),
    (hists_lase_2gram, hist_len_2gram_lase),
    (hists_lase_3gram, hist_len_3gram_lase),
    (hists_lase_4gram, hist_len_4gram_lase),
    (hists_lase_5gram, hist_len_5gram_lase),
    (hists_lase_6gram, hist_len_6gram_lase),
    (hists_lase_7gram, hist_len_7gram_lase),
    (hists_lase_8gram, hist_len_8gram_lase),
    (hists_lase_9gram, hist_len_9gram_lase),
]

# The 1-gram list defines how many changes are processed.
num_changes = len(hists_lase_1gram)

concat_hists_to_9gram_lase = []

for change_ind in range(num_changes):
    concat_hist = []
    level_offset = 0  # combined length of the levels appended so far (0 for 1-grams)

    for level_hists, level_len in lase_levels_to_9gram:
        concat_hist.extend((gram_ind + level_offset, amount)
                           for gram_ind, amount in level_hists[change_ind])
        level_offset += level_len

    concat_hists_to_9gram_lase.append(concat_hist)

# Length of the concatenated index space: the sum of all per-level lengths.
concat_hists_len_to_9gram_lase = sum(level_len for _, level_len in lase_levels_to_9gram)


In [97]:
# Concatenate the 1- to 10-gram histograms of every change into one histogram.
#
# Each histogram is a list of (gram_ind, amount) pairs. The indices of an
# n-gram level are shifted by the combined `hist_len_*` of all lower levels;
# a running offset replaces the hand-expanded sums that were previously
# repeated for every level.
# NOTE(review): assumes every hist_len_*gram_lase is an integer and that
# gram_ind < hist_len for its level, so shifted ranges do not collide -- confirm.
lase_levels_to_10gram = [
    (hists_lase_1gram, hist_len_1gram_lase),
    (hists_lase_2gram, hist_len_2gram_lase),
    (hists_lase_3gram, hist_len_3gram_lase),
    (hists_lase_4gram, hist_len_4gram_lase),
    (hists_lase_5gram, hist_len_5gram_lase),
    (hists_lase_6gram, hist_len_6gram_lase),
    (hists_lase_7gram, hist_len_7gram_lase),
    (hists_lase_8gram, hist_len_8gram_lase),
    (hists_lase_9gram, hist_len_9gram_lase),
    (hists_lase_10gram, hist_len_10gram_lase),
]

# The 1-gram list defines how many changes are processed.
num_changes = len(hists_lase_1gram)

concat_hists_to_10gram_lase = []

for change_ind in range(num_changes):
    concat_hist = []
    level_offset = 0  # combined length of the levels appended so far (0 for 1-grams)

    for level_hists, level_len in lase_levels_to_10gram:
        concat_hist.extend((gram_ind + level_offset, amount)
                           for gram_ind, amount in level_hists[change_ind])
        level_offset += level_len

    concat_hists_to_10gram_lase.append(concat_hist)

# Length of the concatenated index space: the sum of all per-level lengths.
concat_hists_len_to_10gram_lase = sum(level_len for _, level_len in lase_levels_to_10gram)


In [98]:
# Lookup by maximum n-gram size: concat_hists_lase[n] holds the histograms that
# concatenate the 1- to n-gram levels. Slot 0 is an empty placeholder so that
# the list index equals n.
concat_hists_lase = [
    [],  # zero array
    hists_lase_1gram,
    concat_hists_to_2gram_lase,
    concat_hists_to_3gram_lase,
    concat_hists_to_4gram_lase,
    concat_hists_to_5gram_lase,
    concat_hists_to_6gram_lase,
    concat_hists_to_7gram_lase,
    concat_hists_to_8gram_lase,
    concat_hists_to_9gram_lase,
    concat_hists_to_10gram_lase,
]


In [99]:
# Companion to concat_hists_lase: concat_hists_lase_len[n] is the histogram
# length that goes with the 1- to n-gram concatenation. Slot 0 is an empty
# list used purely as a placeholder (it is not a length).
concat_hists_lase_len = [
    [],  # zero array
    hist_len_1gram_lase,
    concat_hists_len_to_2gram_lase,
    concat_hists_len_to_3gram_lase,
    concat_hists_len_to_4gram_lase,
    concat_hists_len_to_5gram_lase,
    concat_hists_len_to_6gram_lase,
    concat_hists_len_to_7gram_lase,
    concat_hists_len_to_8gram_lase,
    concat_hists_len_to_9gram_lase,
    concat_hists_len_to_10gram_lase,
]


In [201]:
# Distance functions to evaluate, each paired with the label printed next to
# its results. `distances` and `distances_names` are index-aligned.
named_distances = [
    ("jaccard_metric", jaccard_metric),
    ("canberra_metric", canberra_metric),
    ("cos_distance", cos_distance),
    ("pearsons_correlation_mean", pearsons_correlation_mean),
]
distances = [metric for _, metric in named_distances]
distances_names = [name for name, _ in named_distances]

# DBSCAN

In [51]:
# Sweep every maximum n-gram size (1..10) against every distance function:
# compute the distances, let find_eps_with_brute_force_universal choose eps,
# fit DBSCAN on the precomputed distances and print the clustering report.
for max_gram in range(1, 11):
    hists = concat_hists_lase[max_gram]
    hists_len = concat_hists_lase_len[max_gram]

    for metric_ind, dist in enumerate(distances):
        dist_name = distances_names[metric_ind]

        cur_dists = get_dists(hists, dist, hists_len)
        best_eps = find_eps_with_brute_force_universal(
            cur_dists, label_to_changes, from_change_to_label, ids_per_label,
            agglomerative=False)

        print(best_eps)

        # cur_dists is passed as a precomputed distance matrix.
        dbscan = sklearn.cluster.DBSCAN(eps=best_eps, min_samples=2, metric='precomputed')
        tufano_clustering = dbscan.fit(cur_dists)

        separator = "_______________________________"
        print(separator)
        print(max_gram, dist_name)
        print(separator)
        print_clustering_results_tufano_unique(
            tufano_clustering, cur_dists,
            label_to_changes, from_change_to_label, ids_per_label,
            without_outliers=True)
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=571), HTML(value='')))


Best eps: 0.07758620689655171
Best F-measure: 0.19721821123429376
0.07758620689655171
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 0/149 (0.0%)

Cohesion = 38.8698625099818
Separation = 2737.827542290662
G1 = 277.9643636188017

Entropy = 3.860773
Purity = 0.187919
F-measure = 0.197218

Rand = 20.198%
Jaccard Index = 5.792%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1092), HTML(value='')))


Best eps: 0.6005464480874317
Best F-measure: 0.9328815852305783
0.6005464480874317
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 4/149 (2.685%)

Cohesion = 49.528446538230305
Separation = 1769.7129054837073
G1 = 236.58235786365387

Entropy = 0.0
Purity = 1.0
F-measure = 0.932882

Rand = 99.282%
Jaccard Index = 85.902%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1266), HTML(value='')))


Best eps: 0.3154295187513232
Best F-measure: 0.8821368042844551
0.3154295187513232
_______________________________
1 cos_distance
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 54.41851279708291
Separation = 4298.2565752047585
G1 = 471.8045496593741

Entropy = 0.142666
Purity = 0.951389
F-measure = 0.882137

Rand = 98.902%
Jaccard Index = 80.0%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1559), HTML(value='')))


Best eps: 0.2323085875061277
Best F-measure: 0.8787642790115411
0.2323085875061277
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 8/149 (5.369%)

Cohesion = 63.896908599195925
Separation = 6892.30854723801
G1 = 767.3302620099142

Entropy = 0.042553
Purity = 0.978723
F-measure = 0.878764

Rand = 99.139%
Jaccard Index = 83.527%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=621), HTML(value='')))


Best eps: 0.06559999999999999
Best F-measure: 0.19721821123429376
0.06559999999999999
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 0/149 (0.0%)

Cohesion = 39.44579675938123
Separation = 2852.3289431026474
G1 = 290.53843861692417

Entropy = 3.860773
Purity = 0.187919
F-measure = 0.197218

Rand = 20.198%
Jaccard Index = 5.792%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1235), HTML(value='')))


Best eps: 0.7260536398467433
Best F-measure: 0.94496212214333
0.7260536398467433
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 47.52638931981136
Separation = 1003.1303454288761
G1 = 121.13832149893827

Entropy = 0.0
Purity = 1.0
F-measure = 0.944962

Rand = 99.454%
Jaccard Index = 89.286%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1299), HTML(value='')))


Best eps: 0.23529411764705888
Best F-measure: 0.8921870978034875
0.23529411764705888
_______________________________
2 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 53.29017626008794
Separation = 2839.615864812512
G1 = 408.1777234988491

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.23%
Jaccard Index = 85.01%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1569), HTML(value='')))


Best eps: 0.24034280907601624
Best F-measure: 0.8921870978034875
0.24034280907601624
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 59.91527634900284
Separation = 5008.717300319084
G1 = 625.3580793510646

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.23%
Jaccard Index = 85.01%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=633), HTML(value='')))


Best eps: 0.049735449735449744
Best F-measure: 0.19721821123429376
0.049735449735449744
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 0/149 (0.0%)

Cohesion = 39.633566438204284
Separation = 2876.715739185247
G1 = 293.42834414052174

Entropy = 3.860773
Purity = 0.187919
F-measure = 0.197218

Rand = 20.198%
Jaccard Index = 5.792%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1258), HTML(value='')))


Best eps: 0.8196920781666545
Best F-measure: 0.9551374847759344
0.8196920781666545
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 3/149 (2.013%)

Cohesion = 45.487719048097716
Separation = 683.4854683736842
G1 = 63.347840454111456

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.955137

Rand = 99.395%
Jaccard Index = 88.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1303), HTML(value='')))


Best eps: 0.2536095087475332
Best F-measure: 0.8957194185382102
0.2536095087475332
_______________________________
3 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 53.22553578204034
Separation = 2229.756646650507
G1 = 321.5496223269984

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1584), HTML(value='')))


Best eps: 0.26708787499826425
Best F-measure: 0.8957194185382102
0.26708787499826425
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 59.09389786517994
Separation = 4323.526615661782
G1 = 539.4816355737873

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.04040816326530616
Best F-measure: 0.19721821123429376
0.04040816326530616
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 0/149 (0.0%)

Cohesion = 39.70121808037168
Separation = 2882.9604877408965
G1 = 294.4827047484322

Entropy = 3.860773
Purity = 0.187919
F-measure = 0.197218

Rand = 20.198%
Jaccard Index = 5.792%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1267), HTML(value='')))


Best eps: 0.8485817036400695
Best F-measure: 0.9551374847759344
0.8485817036400695
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 44.92348958528238
Separation = 514.2759188356086
G1 = 62.451652763159636

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.955137

Rand = 99.358%
Jaccard Index = 87.681%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1297), HTML(value='')))


Best eps: 0.2666666666666667
Best F-measure: 0.8957194185382102
0.2666666666666667
_______________________________
4 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 52.8997787669844
Separation = 1851.3860946378297
G1 = 269.9965977483535

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1571), HTML(value='')))


Best eps: 0.2828926923518049
Best F-measure: 0.8957194185382102
0.2828926923518049
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 58.72865770944297
Separation = 4073.4298692588645
G1 = 507.78772695552243

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.03402061855670091
Best F-measure: 0.19721821123429376
0.03402061855670091
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 0/149 (0.0%)

Cohesion = 39.730313152013025
Separation = 2883.580341731306
G1 = 294.5320173353277

Entropy = 3.860773
Purity = 0.187919
F-measure = 0.197218

Rand = 20.198%
Jaccard Index = 5.792%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1283), HTML(value='')))


Best eps: 0.8913857677902621
Best F-measure: 0.9748161874392403
0.8913857677902621
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 3/149 (2.013%)

Cohesion = 43.77190906784348
Separation = 423.8769733319344
G1 = 32.826400557488505

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.974816

Rand = 99.547%
Jaccard Index = 91.304%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1316), HTML(value='')))


Best eps: 0.2978746600641785
Best F-measure: 0.8957194185382102
0.2978746600641785
_______________________________
5 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 52.57811716109924
Separation = 1608.309445457926
G1 = 237.0093763962565

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1582), HTML(value='')))


Best eps: 0.3203946824329561
Best F-measure: 0.8957194185382102
0.3203946824329561
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 58.49490147448503
Separation = 3969.370346087171
G1 = 494.3943140283952

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.03027522935779814
Best F-measure: 0.19721821123429376
0.03027522935779814
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 0/149 (0.0%)

Cohesion = 39.74297548278694
Separation = 2883.7575791356735
G1 = 294.48652753142585

Entropy = 3.860773
Purity = 0.187919
F-measure = 0.197218

Rand = 20.198%
Jaccard Index = 5.792%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1290), HTML(value='')))


Best eps: 0.9069182389937107
Best F-measure: 0.9783692865312261
0.9069182389937107
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 2/149 (1.342%)

Cohesion = 43.22808334666974
Separation = 374.5769030623859
G1 = 28.5059054168642

Entropy = 0.036712
Purity = 0.993197
F-measure = 0.978369

Rand = 99.553%
Jaccard Index = 91.429%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1312), HTML(value='')))


Best eps: 0.3333713021894099
Best F-measure: 0.8957194185382102
0.3333713021894099
_______________________________
6 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 52.25822381329784
Separation = 1437.9894747479523
G1 = 213.95277252006693

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1579), HTML(value='')))


Best eps: 0.5643356149958736
Best F-measure: 0.9042524961988047
0.5643356149958736
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 57.85069373187709
Separation = 4009.85801636396
G1 = 430.1730536611916

Entropy = 0.041958
Purity = 0.979021
F-measure = 0.904252

Rand = 99.251%
Jaccard Index = 85.606%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.027966101694915202
Best F-measure: 0.19721821123429376
0.027966101694915202
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 0/149 (0.0%)

Cohesion = 39.750937186513845
Separation = 2883.70766140943
G1 = 294.40869197429856

Entropy = 3.860773
Purity = 0.187919
F-measure = 0.197218

Rand = 20.198%
Jaccard Index = 5.792%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1292), HTML(value='')))


Best eps: 0.9210300429184549
Best F-measure: 0.9783692865312261
0.9210300429184549
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 2/149 (1.342%)

Cohesion = 42.7684917805107
Separation = 328.62915372438755
G1 = 25.183376258585398

Entropy = 0.036712
Purity = 0.993197
F-measure = 0.978369

Rand = 99.553%
Jaccard Index = 91.429%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1307), HTML(value='')))


Best eps: 0.37802222749799774
Best F-measure: 0.8957194185382102
0.37802222749799774
_______________________________
7 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 51.94167321509548
Separation = 1312.545193479303
G1 = 197.04848679325542

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1587), HTML(value='')))


Best eps: 0.5957489596197095
Best F-measure: 0.9042524961988047
0.5957489596197095
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 57.58299002702614
Separation = 4021.540866687036
G1 = 432.51080709105

Entropy = 0.041958
Purity = 0.979021
F-measure = 0.904252

Rand = 99.251%
Jaccard Index = 85.606%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.026470588235294024
Best F-measure: 0.19721821123429376
0.026470588235294024
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 0/149 (0.0%)

Cohesion = 39.75609938317119
Separation = 2883.422784142497
G1 = 294.30113224743525

Entropy = 3.860773
Purity = 0.187919
F-measure = 0.197218

Rand = 20.198%
Jaccard Index = 5.792%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1295), HTML(value='')))


Best eps: 0.9294117647058824
Best F-measure: 0.9708189509607563
0.9294117647058824
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 2/149 (1.342%)

Cohesion = 42.49175953478039
Separation = 292.0961940245821
G1 = 32.24309225813787

Entropy = 0.036712
Purity = 0.993197
F-measure = 0.970819

Rand = 99.422%
Jaccard Index = 88.929%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1309), HTML(value='')))


Best eps: 0.41888554545301937
Best F-measure: 0.8957194185382102
0.41888554545301937
_______________________________
8 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 51.62520258046753
Separation = 1216.5556511072048
G1 = 184.0956047529648

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1577), HTML(value='')))


Best eps: 0.625038717789127
Best F-measure: 0.9042524961988047
0.625038717789127
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 57.33388225851276
Separation = 4054.910676255649
G1 = 437.23942641733373

Entropy = 0.041958
Purity = 0.979021
F-measure = 0.904252

Rand = 99.251%
Jaccard Index = 85.606%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.025515463917525683
Best F-measure: 0.19721821123429376
0.025515463917525683
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 0/149 (0.0%)

Cohesion = 39.75962674689001
Separation = 2883.1371813444416
G1 = 294.21511710044973

Entropy = 3.860773
Purity = 0.187919
F-measure = 0.197218

Rand = 20.198%
Jaccard Index = 5.792%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1297), HTML(value='')))


Best eps: 0.9367088607594937
Best F-measure: 0.9708189509607563
0.9367088607594937
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 2/149 (1.342%)

Cohesion = 42.169524065004445
Separation = 266.57368863950114
G1 = 29.90883033709974

Entropy = 0.036712
Purity = 0.993197
F-measure = 0.970819

Rand = 99.422%
Jaccard Index = 88.929%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1308), HTML(value='')))


Best eps: 0.6060107225308131
Best F-measure: 0.8995545096216235
0.6060107225308131
_______________________________
9 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.425445769632496
Separation = 1170.381695480484
G1 = 175.25273402718247

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1575), HTML(value='')))


Best eps: 0.6512449560753417
Best F-measure: 0.9042524961988047
0.6512449560753417
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 57.16952938803938
Separation = 4100.285188379873
G1 = 443.4652544306994

Entropy = 0.041958
Purity = 0.979021
F-measure = 0.904252

Rand = 99.251%
Jaccard Index = 85.606%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.024874371859296418
Best F-measure: 0.19721821123429376
0.024874371859296418
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 0/149 (0.0%)

Cohesion = 39.76238198292665
Separation = 2882.9095247976766
G1 = 294.1488209363981

Entropy = 3.860773
Purity = 0.187919
F-measure = 0.197218

Rand = 20.198%
Jaccard Index = 5.792%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1294), HTML(value='')))


Best eps: 0.9368169398907104
Best F-measure: 0.9551374847759344
0.9368169398907104
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 42.36340246565068
Separation = 231.6332537404911
G1 = 31.472214607267585

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.955137

Rand = 99.358%
Jaccard Index = 87.681%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1316), HTML(value='')))


Best eps: 0.6261725205296185
Best F-measure: 0.8995545096216235
0.6261725205296185
_______________________________
10 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.22623928855977
Separation = 1108.9900737668695
G1 = 167.2538433857298

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1573), HTML(value='')))


Best eps: 0.6752802803214588
Best F-measure: 0.9176753149907511
0.6752802803214588
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 57.44166126228457
Separation = 4142.324994781629
G1 = 482.42906127222955

Entropy = 0.0
Purity = 1.0
F-measure = 0.917675

Rand = 99.34%
Jaccard Index = 87.091%


# Agglomerative clustering (average linkage)

In [24]:
# Average-linkage agglomerative clustering of the 1-gram histograms using
# jaccard_metric distances; the distance threshold is the value returned by
# find_eps_with_brute_force_universal (searched with step 0.005).
link = 'average'
cur_dists = get_dists(hists_lase_1gram, jaccard_metric, hist_len_1gram_lase)
best_eps = find_eps_with_brute_force_universal(
    cur_dists, label_to_changes, from_change_to_label, ids_per_label,
    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

# NOTE(review): `affinity` was renamed to `metric` in newer scikit-learn
# releases; this call matches the version the notebook was run with.
agglomerative_params = dict(
    n_clusters=None,
    affinity='precomputed',
    linkage=link,
    compute_full_tree=True,
    distance_threshold=best_eps,
)
tufano_clustering = sklearn.cluster.AgglomerativeClustering(**agglomerative_params).fit(cur_dists)

# Kept as the last expression so the cell also displays the returned value.
print_clustering_results_tufano_unique(
    tufano_clustering, cur_dists,
    label_to_changes, from_change_to_label, ids_per_label,
    without_outliers=True)

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.045
Best F-measure: 0.5312122913532581
0.045
CLUSTERS:

1 [1]
2 [1]
3 [1]
4 [1]
31 [5]
32 [5]
34 [5]
35 [5]
42 [5]
64 [11]
65 [11]
66 [11]
114 [20]


5 [2]
7 [2]
8 [2]
13 [2]
19 [2]
20 [2]
94 [16]
96 [16]


6 [2]
12 [2]
14 [2]
18 [2]
21 [3]
22 [3]
23 [3]
24 [3]


9 [2]
15 [2]
43 [6]
44 [6]
45 [6]
61 [10]


10 [2]
11 [2]
16 [2]
17 [2]
33 [5]
36 [5]
37 [5]
40 [5]
116 [20]


25 [4]
27 [4]
30 [4]


26 [4]
28 [4]
29 [4]
113 [20]
115 [20]
122 [21]
144 [24]
145 [24]
146 [24]
148 [24]
149 [24]


38 [5]
82 [14]
83 [14]
84 [14]
88 [16]


39 [5]
41 [5]
92 [16]
93 [16]


46 [7]
47 [7]
48 [7]
49 [7]
50 [7]
51 [7]
52 [7]
53 [8]
54 [8]
55 [8]
56 [8]


58 [9]
59 [9]


62 [10]
63 [10]
67 [12]
68 [12]
69 [12]
73 [12]
74 [12]
75 [12]
95 [16]
112 [20]
117 [21]
118 [21]
119 [21]
120 [21]
121 [21]
123 [21]
124 [21]
125 [21]
126 [21]


70 [12]
71 [12]
72 [12]
77 [13]
78 [13]
79 [13]
80 [13]
81 [13]


76 [13]
89 [16]
91 [16]
97 [17]
98 [17]
99 [17]
100 [17]
130 [23]
131 [23]
132 [23]
133 [23]
134

(92.519, 0.5312122913532581)

In [16]:
# Average-linkage agglomerative clustering on Canberra distances between the
# 1-gram LASE histograms.
link = 'average'
cur_dists = get_dists(hists_lase_1gram, canberra_metric, hist_len_1gram_lase)

# Presumably sweeps distance thresholds in steps of 0.005 and keeps the one
# with the best F-measure (the cell output prints "Best eps" and
# "Best F-measure") -- confirm against the helper's definition.
best_eps = find_eps_with_brute_force_universal(
    cur_dists, label_to_changes, from_change_to_label, ids_per_label,
    agglomerative=True, linkage=link, with_step=0.005,
)
print(best_eps)

# n_clusters=None together with distance_threshold lets the threshold decide
# how many clusters are produced.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# confirm the installed version before upgrading.
tufano_clustering = sklearn.cluster.AgglomerativeClustering(
    n_clusters=None,
    affinity='precomputed',
    linkage=link,
    compute_full_tree=True,
    distance_threshold=best_eps,
)
tufano_clustering.fit(cur_dists)  # fit() returns the estimator itself

# Kept as the last expression so its return value is shown as the cell output.
print_clustering_results_tufano_unique(
    tufano_clustering, cur_dists,
    label_to_changes, from_change_to_label, ids_per_label,
    without_outliers=True,
)

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.685
Best F-measure: 0.9218450453349781
0.685
CLUSTERS:

1 [1]
2 [1]
3 [1]
4 [1]


5 [2]
6 [2]
8 [2]
9 [2]
10 [2]
11 [2]
12 [2]
13 [2]
14 [2]
15 [2]
16 [2]
17 [2]
18 [2]
20 [2]


7 [2]
19 [2]


21 [3]
22 [3]
23 [3]
24 [3]


25 [4]
27 [4]
30 [4]


26 [4]
28 [4]
29 [4]


31 [5]
32 [5]
33 [5]
34 [5]
35 [5]
36 [5]
37 [5]
38 [5]
39 [5]
40 [5]
41 [5]
42 [5]


43 [6]
44 [6]
45 [6]


46 [7]
47 [7]
48 [7]
49 [7]
50 [7]
51 [7]
52 [7]


53 [8]
54 [8]
56 [8]


55 [8]
127 [22]
128 [22]
129 [22]


57 [9]
58 [9]
59 [9]
60 [9]


61 [10]
62 [10]
63 [10]


64 [11]
65 [11]
66 [11]


67 [12]
68 [12]
69 [12]
70 [12]
71 [12]
72 [12]
73 [12]
74 [12]
75 [12]


77 [13]
78 [13]
79 [13]
80 [13]
81 [13]


82 [14]
83 [14]
84 [14]


85 [15]
86 [15]
87 [15]


88 [16]
93 [16]


89 [16]
91 [16]


90 [16]
92 [16]
94 [16]
96 [16]


97 [17]
98 [17]
99 [17]
100 [17]


101 [18]
102 [18]
103 [18]
104 [18]
105 [18]


106 [19]
107 [19]
108 [19]
109 [19]
110 [19]
111 [19]


112 [20]
113 [20]
115 [20]
116 [20]
130 [

(99.216, 0.9218450453349781)

In [52]:
# Run average-linkage agglomerative clustering for every entry 1..10 of
# `concat_hists_lase` combined with every distance function in `distances`.
# NOTE(review): `i` presumably is the n-gram size up to which histograms were
# concatenated -- confirm where `concat_hists_lase` is built.
for i in range(1, 11):
    hists = concat_hists_lase[i]
    hists_len = concat_hists_lase_len[i]

    # `distances` and `distances_names` are parallel lists: the function and
    # the label printed for it.
    for j, dist in enumerate(distances):
        dist_name = distances_names[j]

        link = 'average'
        cur_dists = get_dists(hists, dist, hists_len)

        # Threshold search; prints its own "Best eps" / "Best F-measure" lines.
        best_eps = find_eps_with_brute_force_universal(
            cur_dists, label_to_changes, from_change_to_label, ids_per_label,
            agglomerative=True, linkage=link, with_step=0.005,
        )
        print(best_eps)

        # Banner identifying which (i, distance) pair the report below is for.
        print("_______________________________")
        print(i, dist_name)
        print("_______________________________")

        # NOTE(review): newer scikit-learn releases rename `affinity` to
        # `metric`; confirm the installed version before upgrading.
        tufano_clustering = sklearn.cluster.AgglomerativeClustering(
            n_clusters=None,
            affinity='precomputed',
            linkage=link,
            compute_full_tree=True,
            distance_threshold=best_eps,
        )
        tufano_clustering.fit(cur_dists)  # fit() returns the estimator itself

        print_clustering_results_tufano_unique(
            tufano_clustering, cur_dists,
            label_to_changes, from_change_to_label, ids_per_label,
            without_outliers=True,
        )
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.045
Best F-measure: 0.5312122913532581
0.045
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 18
Number of outliers: 4/149 (2.685%)

Cohesion = 63.0310581932956
Separation = 13956.058710356667
G1 = 665.1170988068042

Entropy = 1.337041
Purity = 0.537931
F-measure = 0.531212

Rand = 92.519%
Jaccard Index = 23.656%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.685
Best F-measure: 0.9218450453349781
0.685
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 48.283696940439334
Separation = 1814.184271443892
G1 = 215.20768927083802

Entropy = 0.083321
Purity = 0.965753
F-measure = 0.921845

Rand = 99.216%
Jaccard Index = 85.125%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.375
Best F-measure: 0.8847806928343839
0.375
_______________________________
1 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 54.94427551042648
Separation = 4418.873545099504
G1 = 492.6291039861108

Entropy = 0.111322
Purity = 0.951724
F-measure = 0.884781

Rand = 98.975%
Jaccard Index = 80.616%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.405
Best F-measure: 0.8847806928343839
0.405
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 65.24743352150026
Separation = 7371.577630349504
G1 = 709.1212239367347

Entropy = 0.111322
Purity = 0.951724
F-measure = 0.884781

Rand = 98.975%
Jaccard Index = 80.616%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.05
Best F-measure: 0.5501314773776894
0.05
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 3/149 (2.013%)

Cohesion = 63.110038299416104
Separation = 14425.712028650141
G1 = 768.9408601621839

Entropy = 1.255685
Purity = 0.547945
F-measure = 0.550131

Rand = 92.603%
Jaccard Index = 24.566%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.77
Best F-measure: 0.94496212214333
0.77
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 47.381188502731284
Separation = 967.8709189404767
G1 = 95.51375229928365

Entropy = 0.0
Purity = 1.0
F-measure = 0.944962

Rand = 99.524%
Jaccard Index = 90.667%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.47000000000000003
Best F-measure: 0.9015679324404154
0.47000000000000003
_______________________________
2 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 53.98912370034842
Separation = 2999.772168726614
G1 = 351.30298740383085

Entropy = 0.070429
Purity = 0.972222
F-measure = 0.901568

Rand = 99.174%
Jaccard Index = 84.171%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.245
Best F-measure: 0.8921870978034875
0.245
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 59.91527634900284
Separation = 5008.717300319084
G1 = 625.3580793510646

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.23%
Jaccard Index = 85.01%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.055
Best F-measure: 0.5554601043055543
0.055
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 1/149 (0.671%)

Cohesion = 64.08104975869547
Separation = 14954.916876689245
G1 = 703.8792799138995

Entropy = 1.241385
Purity = 0.547297
F-measure = 0.55546

Rand = 92.811%
Jaccard Index = 24.59%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.8250000000000001
Best F-measure: 0.9583849409352763
0.8250000000000001
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 5/149 (3.356%)

Cohesion = 45.73999303575975
Separation = 649.4858703124526
G1 = 61.74865845909621

Entropy = 0.0
Purity = 1.0
F-measure = 0.958385

Rand = 99.611%
Jaccard Index = 92.381%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.28
Best F-measure: 0.8957194185382102
0.28
_______________________________
3 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 53.22553578204034
Separation = 2229.756646650507
G1 = 321.5496223269984

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.595
Best F-measure: 0.9098046621536552
0.595
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 59.451536367992034
Separation = 4556.7767596260455
G1 = 465.4402602435834

Entropy = 0.069943
Purity = 0.972414
F-measure = 0.909805

Rand = 99.148%
Jaccard Index = 83.61%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.045
Best F-measure: 0.5554601043055543
0.045
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 1/149 (0.671%)

Cohesion = 64.1145439370312
Separation = 14978.011092587407
G1 = 705.4450091140963

Entropy = 1.241385
Purity = 0.547297
F-measure = 0.55546

Rand = 92.811%
Jaccard Index = 24.59%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.865
Best F-measure: 0.9583849409352763
0.865
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 5/149 (3.356%)

Cohesion = 44.98986659027305
Separation = 488.7460534820417
G1 = 47.36966231954129

Entropy = 0.0
Purity = 1.0
F-measure = 0.958385

Rand = 99.611%
Jaccard Index = 92.381%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.27
Best F-measure: 0.8957194185382102
0.27
_______________________________
4 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 52.8997787669844
Separation = 1851.3860946378297
G1 = 269.9965977483535

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.305
Best F-measure: 0.8957194185382102
0.305
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 58.72865770944297
Separation = 4073.4298692588645
G1 = 507.78772695552243

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.04
Best F-measure: 0.5554601043055543
0.04
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 1/149 (0.671%)

Cohesion = 64.14165459224311
Separation = 14983.56453740409
G1 = 705.5199542094877

Entropy = 1.241385
Purity = 0.547297
F-measure = 0.55546

Rand = 92.811%
Jaccard Index = 24.59%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.895
Best F-measure: 0.9583849409352763
0.895
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 5/149 (3.356%)

Cohesion = 44.38365620643486
Separation = 394.15526212617067
G1 = 38.854919937662146

Entropy = 0.0
Purity = 1.0
F-measure = 0.958385

Rand = 99.611%
Jaccard Index = 92.381%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.49
Best F-measure: 0.8995545096216235
0.49
_______________________________
5 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 52.710700643619965
Separation = 1649.2944622329321
G1 = 238.14152375879956

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.53
Best F-measure: 0.8995545096216235
0.53
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.65174810201428
Separation = 4039.3587578817264
G1 = 495.3984895684954

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.06
Best F-measure: 0.5594869499431382
0.06
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.62219714591593
Separation = 15118.930407572894
G1 = 661.4244467919137

Entropy = 1.233054
Purity = 0.550336
F-measure = 0.559487

Rand = 92.908%
Jaccard Index = 24.735%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.915
Best F-measure: 0.9481679442493469
0.915
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 43.75773221914541
Separation = 347.58025679520455
G1 = 44.57011799904089

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.948168

Rand = 99.348%
Jaccard Index = 87.5%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.525
Best F-measure: 0.8995545096216235
0.525
_______________________________
6 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 52.365015597292185
Separation = 1474.8687300622205
G1 = 215.17000128367206

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.5650000000000001
Best F-measure: 0.8995545096216235
0.5650000000000001
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.44575500341881
Separation = 4006.492859824121
G1 = 491.5318985003589

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.05
Best F-measure: 0.5594869499431382
0.05
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.64230915498011
Separation = 15118.427751744617
G1 = 660.9741155685471

Entropy = 1.233054
Purity = 0.550336
F-measure = 0.559487

Rand = 92.908%
Jaccard Index = 24.735%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.925
Best F-measure: 0.9481679442493469
0.925
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 43.294218891648775
Separation = 303.74420905580075
G1 = 39.79402615784436

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.948168

Rand = 99.348%
Jaccard Index = 87.5%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.555
Best F-measure: 0.8995545096216235
0.555
_______________________________
7 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 52.02565073470804
Separation = 1346.282060196684
G1 = 198.31287788297956

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.6
Best F-measure: 0.8995545096216235
0.6
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.25738247839585
Separation = 4011.1122849156714
G1 = 492.5760302917782

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.045
Best F-measure: 0.5594869499431382
0.045
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.65683092374893
Separation = 15117.61681360721
G1 = 660.6113864009599

Entropy = 1.233054
Purity = 0.550336
F-measure = 0.559487

Rand = 92.908%
Jaccard Index = 24.735%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.935
Best F-measure: 0.9481679442493469
0.935
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 42.872559420397245
Separation = 271.90696041399576
G1 = 36.26662805755835

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.948168

Rand = 99.348%
Jaccard Index = 87.5%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.585
Best F-measure: 0.8995545096216235
0.585
_______________________________
8 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.68824615654077
Separation = 1247.818232222818
G1 = 185.38849369338357

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.625
Best F-measure: 0.8995545096216235
0.625
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.07263173024626
Separation = 4038.403659410798
G1 = 496.49926526195765

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.04
Best F-measure: 0.5594869499431382
0.04
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.66780737496524
Separation = 15116.81877535718
G1 = 660.3347976625985

Entropy = 1.233054
Purity = 0.550336
F-measure = 0.559487

Rand = 92.908%
Jaccard Index = 24.735%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.9550000000000001
Best F-measure: 0.9376491416500075
0.9550000000000001
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 2/149 (1.342%)

Cohesion = 40.48203837960834
Separation = 257.460640145644
G1 = 23.59325876363728

Entropy = 0.13199
Purity = 0.959184
F-measure = 0.937649

Rand = 99.077%
Jaccard Index = 83.163%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.61
Best F-measure: 0.8995545096216235
0.61
_______________________________
9 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.425445769632496
Separation = 1170.381695480484
G1 = 175.25273402718247

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.65
Best F-measure: 0.8995545096216235
0.65
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 57.960367268854775
Separation = 4078.5169877465714
G1 = 502.2797201702773

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.04
Best F-measure: 0.5594869499431382
0.04
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.67682459517269
Separation = 15116.143610417714
G1 = 660.1167316266583

Entropy = 1.233054
Purity = 0.550336
F-measure = 0.559487

Rand = 92.908%
Jaccard Index = 24.735%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.935
Best F-measure: 0.9360874073365952
0.935
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 43.164971237682
Separation = 220.9661676350944
G1 = 31.987432704472404

Entropy = 0.037218
Purity = 0.993103
F-measure = 0.936087

Rand = 99.215%
Jaccard Index = 84.954%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.63
Best F-measure: 0.8995545096216235
0.63
_______________________________
10 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.22623928855977
Separation = 1108.9900737668695
G1 = 167.2538433857298

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.67
Best F-measure: 0.8995545096216235
0.67
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 57.90618750503389
Separation = 4126.827846571128
G1 = 509.3592726478762

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


In [54]:
# Root directory that holds one sub-directory of LASE edit actions per change.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a config cell or reading it from an environment variable.
LASE_ACTIONS_PATH = "/Users/aliscafo/Documents/ALINA/WORK/SPbAU/thesis/CodeDiffEditScripts/LaseActions"

# One edit script (list of non-empty action lines) per change, in the same
# order as `ids_per_label`.
edit_scripts_lase = []

for double_num in tqdm_notebook(ids_per_label):
    # On-disk layout: <root>/<id>/<id>/sampleChange1
    actions_path = os.path.join(LASE_ACTIONS_PATH, double_num, double_num, "sampleChange1")

    # The context manager closes the file even if reading fails.
    with open(actions_path) as actions_file:
        # Split on "\n" only (not splitlines()) and drop empty lines.
        edit_script = [line for line in actions_file.read().split("\n") if line != '']

    # Progress log: script length and the label(s) recorded for this change.
    print(len(edit_script), from_change_to_label[double_num])

    edit_scripts_lase.append(edit_script)
    

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))

54 [1]
54 [1]
101 [1]
54 [1]
198 [2]
206 [2]
4 [2]
165 [2]
200 [2]
185 [2]
185 [2]
220 [2]
198 [2]
206 [2]
200 [2]
185 [2]
185 [2]
220 [2]
4 [2]
165 [2]
15 [3]
15 [3]
15 [3]
15 [3]
87 [4]
22 [4]
85 [4]
22 [4]
22 [4]
85 [4]
4 [5]
4 [5]
4 [5]
4 [5]
4 [5]
4 [5]
4 [5]
4 [5]
4 [5]
4 [5]
4 [5]
4 [5]
43 [6]
43 [6]
43 [6]
212 [7]
211 [7]
211 [7]
211 [7]
211 [7]
211 [7]
212 [7]
3 [8]
3 [8]
28 [8]
3 [8]
180 [9]
134 [9]
134 [9]
164 [9]
12 [10]
12 [10]
12 [10]
40 [11]
40 [11]
51 [11]
71 [12]
39 [12]
39 [12]
72 [12]
56 [12]
56 [12]
22 [12]
22 [12]
22 [12]
7 [13]
22 [13]
22 [13]
22 [13]
22 [13]
22 [13]
113 [14]
113 [14]
137 [14]
140 [15]
140 [15]
246 [15]
21 [16]
14 [16]
81 [16]
14 [16]
80 [16]
41 [16]
44 [16]
11 [16]
44 [16]
10 [17]
10 [17]
10 [17]
10 [17]
49 [18]
49 [18]
54 [18]
49 [18]
49 [18]
11 [19]
11 [19]
11 [19]
11 [19]
11 [19]
11 [19]
12 [20]
12 [20]
4 [20]
12 [20]
12 [20]
28 [21]
23 [21]
28 [21]
28 [21]
28 [21]
37 [21]
28 [21]
23 [21]
23 [21]
23 [21]
44 [22]
44 [22]
47 [22]
13 [23]
13 [23]

In [19]:
def lcs(X, Y):
    """Return the length of the longest common subsequence of X and Y.

    Bottom-up dynamic programming over prefixes: after processing the first
    i elements of X, ``prev[j]`` holds the LCS length of X[0..i-1] and
    Y[0..j-1]. Only the previous row is ever read, so two rows of size
    len(Y) + 1 are kept instead of the full (len(X) + 1) x (len(Y) + 1) table.

    Parameters:
        X: first sequence; elements are compared with ``==``.
        Y: second sequence; must support ``len()`` and integer indexing.

    Returns:
        int: length of the LCS (0 when either sequence is empty).
    """
    n = len(Y)

    # Row for the empty prefix of X: the LCS with any prefix of Y is 0.
    prev = [0] * (n + 1)

    for x in X:
        # cur[0] stays 0: the LCS with the empty prefix of Y is 0.
        cur = [0] * (n + 1)
        for j in range(1, n + 1):
            if x == Y[j - 1]:
                cur[j] = prev[j - 1] + 1
            else:
                cur[j] = max(prev[j], cur[j - 1])
        prev = cur

    # prev[n] is the LCS length of the whole of X and the whole of Y.
    return prev[n]

In [20]:
# Pairwise distance matrix between LASE edit scripts:
#   dist(a, b) = 1 - LCS(a, b) / max(len(a), len(b))
# so identical scripts get 0 and scripts with no common subsequence get 1.
dists_for_lcs_lase = []

for es1 in tqdm_notebook(edit_scripts_lase):
    cur_dists = []
    for es2 in edit_scripts_lase:
        longest = max(len(es1), len(es2))
        if longest == 0:
            # Both scripts are empty: treat them as identical instead of dividing by zero.
            cur_lcs = 0
            dist = 0.0
        else:
            cur_lcs = lcs(es1, es2)
            dist = 1 - cur_lcs / longest
        cur_dists.append(dist)
    dists_for_lcs_lase.append(cur_dists)

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




In [21]:
# Agglomerative clustering (average linkage) on the precomputed LCS distance matrix.
cur_dists = dists_for_lcs_lase
link = 'average'
# Pick the distance threshold with the search helper defined elsewhere in the notebook.
# Presumably it maximizes F-measure against the labelled changes (its output prints
# "Best F-measure") and with_step is the scan step — TODO confirm against the helper.
best_eps = find_eps_with_brute_force_universal(cur_dists, label_to_changes, 
                                    from_change_to_label, ids_per_label, 
                                    agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

# n_clusters=None together with distance_threshold lets the threshold decide the
# number of clusters.
# NOTE(review): `affinity` was deprecated in scikit-learn 1.2 in favour of `metric`;
# rename it when upgrading scikit-learn.
tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

# Report the clustering through the notebook's helper (defined elsewhere).
print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                label_to_changes, from_change_to_label, ids_per_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.525
Best F-measure: 0.9147607798614508
0.525
CLUSTERS:

1 [1]
2 [1]
3 [1]
4 [1]


5 [2]
6 [2]
8 [2]
9 [2]
10 [2]
11 [2]
12 [2]
13 [2]
14 [2]
15 [2]
16 [2]
17 [2]
18 [2]
20 [2]


7 [2]
19 [2]


21 [3]
22 [3]
23 [3]
24 [3]


25 [4]
27 [4]
30 [4]


26 [4]
28 [4]
29 [4]


31 [5]
32 [5]
33 [5]
34 [5]
35 [5]
36 [5]
37 [5]
38 [5]
39 [5]
40 [5]
41 [5]
42 [5]


43 [6]
44 [6]
45 [6]


46 [7]
47 [7]
48 [7]
49 [7]
50 [7]
51 [7]
52 [7]


53 [8]
54 [8]
56 [8]


57 [9]
58 [9]
59 [9]
60 [9]


61 [10]
62 [10]
63 [10]


64 [11]
65 [11]
66 [11]


67 [12]
70 [12]
71 [12]
72 [12]


68 [12]
69 [12]


73 [12]
74 [12]
75 [12]


77 [13]
78 [13]
79 [13]
80 [13]
81 [13]


82 [14]
83 [14]
84 [14]


85 [15]
86 [15]
87 [15]


88 [16]
89 [16]
91 [16]


90 [16]
92 [16]


94 [16]
96 [16]


97 [17]
98 [17]
99 [17]
100 [17]


101 [18]
102 [18]
103 [18]
104 [18]
105 [18]


106 [19]
107 [19]
108 [19]
109 [19]
110 [19]
111 [19]


112 [20]
113 [20]
115 [20]
116 [20]


117 [21]
118 [21]
119 [21]
120 [21]
121 [21

(99.233, 0.9147607798614508)

# Complete

In [22]:
# Agglomerative clustering (complete linkage) on Canberra distances between the
# 1-gram LASE histograms; get_dists is defined elsewhere in the notebook.
cur_dists = get_dists(hists_lase_1gram, canberra_metric, hist_len_1gram_lase)
link = 'complete'
# Pick the distance threshold with the search helper defined elsewhere in the notebook.
# Presumably it maximizes F-measure against the labelled changes (its output prints
# "Best F-measure") — TODO confirm against the helper.
best_eps = find_eps_with_brute_force_universal(cur_dists, label_to_changes, 
                                    from_change_to_label, ids_per_label, 
                                    agglomerative=True, linkage=link)

print(best_eps)

# n_clusters=None together with distance_threshold lets the threshold decide the
# number of clusters.
# NOTE(review): `affinity` was deprecated in scikit-learn 1.2 in favour of `metric`;
# rename it when upgrading scikit-learn.
tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

# Report the clustering through the notebook's helper (defined elsewhere).
print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                label_to_changes, from_change_to_label, ids_per_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1092), HTML(value='')))


Best eps: 0.6538517398907009
Best F-measure: 0.9328815852305783
0.6538517398907009
CLUSTERS:

1 [1]
2 [1]
3 [1]
4 [1]


5 [2]
6 [2]
8 [2]
9 [2]
10 [2]
11 [2]
12 [2]
13 [2]
14 [2]
15 [2]
16 [2]
17 [2]
18 [2]
20 [2]


7 [2]
19 [2]


21 [3]
22 [3]
23 [3]
24 [3]


25 [4]
27 [4]
30 [4]


26 [4]
28 [4]
29 [4]


31 [5]
32 [5]
33 [5]
34 [5]
35 [5]
36 [5]
37 [5]
38 [5]
39 [5]
40 [5]
41 [5]
42 [5]


43 [6]
44 [6]
45 [6]


46 [7]
47 [7]
48 [7]
49 [7]
50 [7]
51 [7]
52 [7]


53 [8]
54 [8]
56 [8]


57 [9]
58 [9]
59 [9]
60 [9]


61 [10]
62 [10]
63 [10]


64 [11]
65 [11]
66 [11]


67 [12]
68 [12]
69 [12]
70 [12]
71 [12]
72 [12]


73 [12]
74 [12]
75 [12]


77 [13]
78 [13]
79 [13]
80 [13]
81 [13]


82 [14]
83 [14]
84 [14]


85 [15]
86 [15]
87 [15]


88 [16]
93 [16]


89 [16]
91 [16]


90 [16]
92 [16]
94 [16]
96 [16]


97 [17]
98 [17]
99 [17]
100 [17]


101 [18]
102 [18]
103 [18]
104 [18]
105 [18]


106 [19]
107 [19]
108 [19]
109 [19]
110 [19]
111 [19]


112 [20]
113 [20]
115 [20]
116 [20]


117 [21]
11

(99.282, 0.9328815852305783)

In [23]:
# Agglomerative clustering (complete linkage) on the precomputed LCS distance matrix.
cur_dists = dists_for_lcs_lase
link = 'complete'
# Pick the distance threshold with the search helper defined elsewhere in the notebook.
# Presumably it maximizes F-measure against the labelled changes (its output prints
# "Best F-measure") — TODO confirm against the helper.
best_eps = find_eps_with_brute_force_universal(cur_dists, label_to_changes, 
                                    from_change_to_label, ids_per_label, 
                                    agglomerative=True, linkage=link)

print(best_eps)

# n_clusters=None together with distance_threshold lets the threshold decide the
# number of clusters.
# NOTE(review): `affinity` was deprecated in scikit-learn 1.2 in favour of `metric`;
# rename it when upgrading scikit-learn.
tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                            affinity='precomputed', 
                                                            linkage=link, 
                                                            compute_full_tree=True,
                                                            distance_threshold=best_eps).fit(cur_dists)

# Report the clustering through the notebook's helper (defined elsewhere).
print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                                label_to_changes, from_change_to_label, ids_per_label,
                                      without_outliers=True)

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))


Best eps: 0.5384615384615384
Best F-measure: 0.9147607798614508
0.5384615384615384
CLUSTERS:

1 [1]
2 [1]
3 [1]
4 [1]


5 [2]
6 [2]
8 [2]
9 [2]
10 [2]
11 [2]
12 [2]
13 [2]
14 [2]
15 [2]
16 [2]
17 [2]
18 [2]
20 [2]


7 [2]
19 [2]


21 [3]
22 [3]
23 [3]
24 [3]


25 [4]
27 [4]
30 [4]


26 [4]
28 [4]
29 [4]


31 [5]
32 [5]
33 [5]
34 [5]
35 [5]
36 [5]
37 [5]
38 [5]
39 [5]
40 [5]
41 [5]
42 [5]


43 [6]
44 [6]
45 [6]


46 [7]
47 [7]
48 [7]
49 [7]
50 [7]
51 [7]
52 [7]


53 [8]
54 [8]
56 [8]


57 [9]
58 [9]
59 [9]
60 [9]


61 [10]
62 [10]
63 [10]


64 [11]
65 [11]
66 [11]


67 [12]
70 [12]
71 [12]
72 [12]


68 [12]
69 [12]


73 [12]
74 [12]
75 [12]


77 [13]
78 [13]
79 [13]
80 [13]
81 [13]


82 [14]
83 [14]
84 [14]


85 [15]
86 [15]
87 [15]


88 [16]
89 [16]
91 [16]


90 [16]
92 [16]


94 [16]
96 [16]


97 [17]
98 [17]
99 [17]
100 [17]


101 [18]
102 [18]
103 [18]
104 [18]
105 [18]


106 [19]
107 [19]
108 [19]
109 [19]
110 [19]
111 [19]


112 [20]
113 [20]
115 [20]
116 [20]


117 [21]
118 [21]

(99.233, 0.9147607798614508)

In [53]:
# Complete-linkage agglomerative clustering for every (i, distance function) pair.
# `i` selects an entry of concat_hists_lase / concat_hists_lase_len (keys 1..10) —
# presumably the largest n-gram size in the concatenated histograms; TODO confirm.
link = 'complete'

for i in range(1, 11):
    hists = concat_hists_lase[i]
    hists_len = concat_hists_lase_len[i]

    # Walk the distance functions and their display names in lockstep.
    for dist, dist_name in zip(distances, distances_names):
        cur_dists = get_dists(hists, dist, hists_len)

        # Threshold search helper is defined elsewhere in the notebook.
        best_eps = find_eps_with_brute_force_universal(
            cur_dists, label_to_changes, from_change_to_label, ids_per_label,
            agglomerative=True, linkage=link)

        print(best_eps)

        print("_______________________________")
        print(i, dist_name)
        print("_______________________________")

        # n_clusters=None + distance_threshold: the threshold decides the cluster count.
        # NOTE(review): `affinity` was deprecated in scikit-learn 1.2 in favour of
        # `metric`; rename it when upgrading scikit-learn.
        tufano_clustering = sklearn.cluster.AgglomerativeClustering(
            n_clusters=None,
            affinity='precomputed',
            linkage=link,
            compute_full_tree=True,
            distance_threshold=best_eps).fit(cur_dists)

        print_clustering_results_tufano_unique(
            tufano_clustering, cur_dists,
            label_to_changes, from_change_to_label, ids_per_label,
            without_outliers=True)
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=571), HTML(value='')))


Best eps: 0.29166666666666663
Best F-measure: 0.5425752440596499
0.29166666666666663
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.07092200549252
Separation = 14415.480193668323
G1 = 784.3392634900117

Entropy = 1.419877
Purity = 0.557047
F-measure = 0.542575

Rand = 91.983%
Jaccard Index = 22.997%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1092), HTML(value='')))


Best eps: 0.6538517398907009
Best F-measure: 0.9328815852305783
0.6538517398907009
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 4/149 (2.685%)

Cohesion = 49.528446538230305
Separation = 1769.7129054837073
G1 = 236.58235786365387

Entropy = 0.0
Purity = 1.0
F-measure = 0.932882

Rand = 99.282%
Jaccard Index = 85.902%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1266), HTML(value='')))


Best eps: 0.3800636909429168
Best F-measure: 0.8943684205429169
0.3800636909429168
_______________________________
1 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 54.67407110267244
Separation = 4335.667391863981
G1 = 512.7536694633058

Entropy = 0.041667
Purity = 0.979167
F-measure = 0.894368

Rand = 99.136%
Jaccard Index = 83.333%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1559), HTML(value='')))


Best eps: 0.4207075849233962
Best F-measure: 0.8874652565927731
0.4207075849233962
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 64.89828359095914
Separation = 7225.219426177708
G1 = 704.1262024491477

Entropy = 0.083333
Purity = 0.958333
F-measure = 0.887465

Rand = 99.048%
Jaccard Index = 81.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=621), HTML(value='')))


Best eps: 0.24242424242424232
Best F-measure: 0.5382378407219364
0.24242424242424232
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 20
Number of outliers: 0/149 (0.0%)

Cohesion = 63.790638018461316
Separation = 14951.107576177767
G1 = 747.669824212636

Entropy = 1.393075
Purity = 0.557047
F-measure = 0.538238

Rand = 92.037%
Jaccard Index = 21.326%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1235), HTML(value='')))


Best eps: 0.7865947838170061
Best F-measure: 0.9463044040225246
0.7865947838170061
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 47.29795290130209
Separation = 974.075162399964
G1 = 97.1395899078105

Entropy = 0.0
Purity = 1.0
F-measure = 0.946304

Rand = 99.437%
Jaccard Index = 88.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1299), HTML(value='')))


Best eps: 0.24341866641997645
Best F-measure: 0.8921870978034875
0.24341866641997645
_______________________________
2 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 53.29017626008794
Separation = 2839.615864812512
G1 = 408.1777234988491

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.23%
Jaccard Index = 85.01%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1569), HTML(value='')))


Best eps: 0.26164183574142563
Best F-measure: 0.8921870978034875
0.26164183574142563
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 59.91527634900284
Separation = 5008.717300319084
G1 = 625.3580793510646

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.23%
Jaccard Index = 85.01%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=633), HTML(value='')))


Best eps: 0.2783482142857142
Best F-measure: 0.5454340165614788
0.2783482142857142
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.23944262566921
Separation = 14821.914621602325
G1 = 789.6548830608518

Entropy = 1.340533
Purity = 0.536913
F-measure = 0.545434

Rand = 92.264%
Jaccard Index = 24.713%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1258), HTML(value='')))


Best eps: 0.8403361344537815
Best F-measure: 0.9463044040225246
0.8403361344537815
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 46.33024888257056
Separation = 654.0362424122803
G1 = 67.05809932890782

Entropy = 0.0
Purity = 1.0
F-measure = 0.946304

Rand = 99.437%
Jaccard Index = 88.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1303), HTML(value='')))


Best eps: 0.3579804249171822
Best F-measure: 0.8957194185382102
0.3579804249171822
_______________________________
3 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 53.22553578204034
Separation = 2229.756646650507
G1 = 321.5496223269984

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1584), HTML(value='')))


Best eps: 0.3569645297393841
Best F-measure: 0.8957194185382102
0.3569645297393841
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 59.09389786517994
Separation = 4323.526615661782
G1 = 539.4816355737873

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.2783482142857142
Best F-measure: 0.5454340165614788
0.2783482142857142
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.28625288670109
Separation = 14845.035062476049
G1 = 790.1898985669953

Entropy = 1.340533
Purity = 0.536913
F-measure = 0.545434

Rand = 92.264%
Jaccard Index = 24.713%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1267), HTML(value='')))


Best eps: 0.8703703703703703
Best F-measure: 0.9463044040225246
0.8703703703703703
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 45.6293250199147
Separation = 491.51101577489123
G1 = 51.34791090942411

Entropy = 0.0
Purity = 1.0
F-measure = 0.946304

Rand = 99.437%
Jaccard Index = 88.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1297), HTML(value='')))


Best eps: 0.4585071906246122
Best F-measure: 0.8995545096216235
0.4585071906246122
_______________________________
4 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 53.06180086960258
Separation = 1898.048506562795
G1 = 270.9796211124021

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1571), HTML(value='')))


Best eps: 0.4355272963259006
Best F-measure: 0.8957194185382102
0.4355272963259006
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 58.72865770944297
Separation = 4073.4298692588645
G1 = 507.78772695552243

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.28644307081807063
Best F-measure: 0.5454340165614788
0.28644307081807063
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.32290660209404
Separation = 14849.740132272396
G1 = 789.5463676185227

Entropy = 1.340533
Purity = 0.536913
F-measure = 0.545434

Rand = 92.264%
Jaccard Index = 24.713%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1283), HTML(value='')))


Best eps: 0.9024390243902439
Best F-measure: 0.9393348634959371
0.9024390243902439
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 45.2020951882602
Separation = 406.9277309755107
G1 = 54.42310038114645

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1316), HTML(value='')))


Best eps: 0.49624760214689845
Best F-measure: 0.8995545096216235
0.49624760214689845
_______________________________
5 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 52.710700643619965
Separation = 1649.2944622329321
G1 = 238.14152375879956

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1582), HTML(value='')))


Best eps: 0.5322652267737382
Best F-measure: 0.8995545096216235
0.5322652267737382
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.65174810201428
Separation = 4039.3587578817264
G1 = 495.3984895684954

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.2929595494111623
Best F-measure: 0.5454340165614788
0.2929595494111623
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.34692701770537
Separation = 14850.035701828932
G1 = 788.9958668728302

Entropy = 1.340533
Purity = 0.536913
F-measure = 0.545434

Rand = 92.264%
Jaccard Index = 24.713%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1290), HTML(value='')))


Best eps: 0.9202279202279202
Best F-measure: 0.9393348634959371
0.9202279202279202
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 44.705538616758446
Separation = 343.3544110108448
G1 = 47.05556675951653

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1312), HTML(value='')))


Best eps: 0.5337558349923058
Best F-measure: 0.8995545096216235
0.5337558349923058
_______________________________
6 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 52.365015597292185
Separation = 1474.8687300622205
G1 = 215.17000128367206

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1579), HTML(value='')))


Best eps: 0.5677200009492205
Best F-measure: 0.8995545096216235
0.5677200009492205
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.44575500341881
Separation = 4006.492859824121
G1 = 491.5318985003589

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.2977389219576718
Best F-measure: 0.5454340165614788
0.2977389219576718
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.36338331441264
Separation = 14849.841050095754
G1 = 788.5792829861075

Entropy = 1.340533
Purity = 0.536913
F-measure = 0.545434

Rand = 92.264%
Jaccard Index = 24.713%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1292), HTML(value='')))


Best eps: 0.9323671497584541
Best F-measure: 0.9393348634959371
0.9323671497584541
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 44.2544867848159
Separation = 299.4960987650304
G1 = 41.922791110064814

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1307), HTML(value='')))


Best eps: 0.5618611987168809
Best F-measure: 0.8995545096216235
0.5618611987168809
_______________________________
7 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 52.02565073470804
Separation = 1346.282060196684
G1 = 198.31287788297956

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1587), HTML(value='')))


Best eps: 0.5957489596197095
Best F-measure: 0.8995545096216235
0.5957489596197095
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.25738247839585
Separation = 4011.1122849156714
G1 = 492.5760302917782

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.29999999999999993
Best F-measure: 0.5454340165614788
0.29999999999999993
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.37545782008377
Separation = 14849.2146964451
G1 = 788.236537581748

Entropy = 1.340533
Purity = 0.536913
F-measure = 0.545434

Rand = 92.264%
Jaccard Index = 24.713%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1295), HTML(value='')))


Best eps: 0.9432048681541582
Best F-measure: 0.9360874073365952
0.9432048681541582
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 3/149 (2.013%)

Cohesion = 43.59268772244132
Separation = 271.515658239059
G1 = 38.30373045812406

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.936087

Rand = 99.178%
Jaccard Index = 84.239%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1309), HTML(value='')))


Best eps: 0.585410615544946
Best F-measure: 0.8995545096216235
0.585410615544946
_______________________________
8 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.68824615654077
Separation = 1247.818232222818
G1 = 185.38849369338357

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1577), HTML(value='')))


Best eps: 0.625038717789127
Best F-measure: 0.8995545096216235
0.625038717789127
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.07263173024626
Separation = 4038.403659410798
G1 = 496.49926526195765

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.296875
Best F-measure: 0.5454340165614788
0.296875
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.38485217183066
Separation = 14848.513204569867
G1 = 787.9759198220868

Entropy = 1.340533
Purity = 0.536913
F-measure = 0.545434

Rand = 92.264%
Jaccard Index = 24.713%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1297), HTML(value='')))


Best eps: 0.927699530516432
Best F-measure: 0.9212140581268096
0.927699530516432
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 44.207202707792824
Separation = 235.85612898675282
G1 = 36.199397185984786

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1308), HTML(value='')))


Best eps: 0.6063142322150428
Best F-measure: 0.8995545096216235
0.6063142322150428
_______________________________
9 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.425445769632496
Separation = 1170.381695480484
G1 = 175.25273402718247

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1575), HTML(value='')))


Best eps: 0.6506308762686406
Best F-measure: 0.8995545096216235
0.6506308762686406
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 57.960367268854775
Separation = 4078.5169877465714
G1 = 502.2797201702773

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=634), HTML(value='')))


Best eps: 0.29166666666666663
Best F-measure: 0.5454340165614788
0.29166666666666663
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.3930939571011
Separation = 14847.829237710575
G1 = 787.7466492701105

Entropy = 1.340533
Purity = 0.536913
F-measure = 0.545434

Rand = 92.264%
Jaccard Index = 24.713%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1294), HTML(value='')))


Best eps: 0.9343283582089552
Best F-measure: 0.9212140581268096
0.9343283582089552
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 43.97713374933263
Separation = 217.79345922932353
G1 = 33.779128282288454

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1316), HTML(value='')))


Best eps: 0.6293820663876157
Best F-measure: 0.8995545096216235
0.6293820663876157
_______________________________
10 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.22623928855977
Separation = 1108.9900737668695
G1 = 167.2538433857298

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1573), HTML(value='')))


Best eps: 0.8267994728456971
Best F-measure: 0.9008880753847194
0.8267994728456971
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 56.57278691952854
Separation = 4359.12341034228
G1 = 429.0075755366955

Entropy = 0.082759
Purity = 0.958621
F-measure = 0.900888

Rand = 99.1%
Jaccard Index = 82.909%


# LASE with UPD

### DBSCAN

In [81]:
# DBSCAN clustering for every (i, distance function) pair.
# `i` selects an entry of concat_hists_lase / concat_hists_lase_len (keys 1..10) —
# presumably the largest n-gram size in the concatenated histograms; TODO confirm.
for i in range(1, 11):
    hists = concat_hists_lase[i]
    hists_len = concat_hists_lase_len[i]

    # Walk the distance functions and their display names in lockstep.
    for dist, dist_name in zip(distances, distances_names):
        cur_dists = get_dists(hists, dist, hists_len)

        # eps search helper is defined elsewhere in the notebook; agglomerative=False
        # asks it for the DBSCAN variant (its output is headed "DBSCAN").
        best_eps = find_eps_with_brute_force_universal(
            cur_dists, label_to_changes, from_change_to_label, ids_per_label,
            agglomerative=False)

        print(best_eps)

        # min_samples=2: a point needs only one other point within eps (the point
        # itself is counted) to be a core point; cur_dists is used as a precomputed
        # distance matrix.
        tufano_clustering = sklearn.cluster.DBSCAN(
            eps=best_eps,
            min_samples=2,
            metric='precomputed').fit(cur_dists)

        print("_______________________________")
        print(i, dist_name)
        print("_______________________________")

        print_clustering_results_tufano_unique(
            tufano_clustering, cur_dists,
            label_to_changes, from_change_to_label, ids_per_label,
            without_outliers=True)
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=544), HTML(value='')))


Best eps: 0.06149425287356314
Best F-measure: 0.18754906921092465
0.06149425287356314
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 1/149 (0.671%)

Cohesion = 37.89767092344934
Separation = 2367.1412008881625
G1 = 362.0127455303709

Entropy = 3.944602
Purity = 0.175676
F-measure = 0.187549

Rand = 17.834%
Jaccard Index = 5.658%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1076), HTML(value='')))


Best eps: 0.6005464480874317
Best F-measure: 0.9328815852305783
0.6005464480874317
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 4/149 (2.685%)

Cohesion = 49.49948347777997
Separation = 1730.2662643789113
G1 = 230.7760864177212

Entropy = 0.0
Purity = 1.0
F-measure = 0.932882

Rand = 99.282%
Jaccard Index = 85.902%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1253), HTML(value='')))


Best eps: 0.3205501306462569
Best F-measure: 0.8821368042844551
0.3205501306462569
_______________________________
1 cos_distance
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 54.46126388428483
Separation = 4126.099638440328
G1 = 443.48639167880657

Entropy = 0.142666
Purity = 0.951389
F-measure = 0.882137

Rand = 98.902%
Jaccard Index = 80.0%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1566), HTML(value='')))


Best eps: 0.3406681271408002
Best F-measure: 0.8783017132010418
0.3406681271408002
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 27
Number of outliers: 6/149 (4.027%)

Cohesion = 64.18946056544598
Separation = 6858.277699557816
G1 = 667.4800889603648

Entropy = 0.143664
Purity = 0.951049
F-measure = 0.878302

Rand = 98.887%
Jaccard Index = 79.893%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=600), HTML(value='')))


Best eps: 0.0026315789473684292
Best F-measure: 0.18352222357334078
0.0026315789473684292
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 2/149 (1.342%)

Cohesion = 37.98446291230488
Separation = 2224.904239175631
G1 = 437.92838654245884

Entropy = 3.971436
Purity = 0.170068
F-measure = 0.183522

Rand = 16.709%
Jaccard Index = 5.638%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1222), HTML(value='')))


Best eps: 0.7507374631268438
Best F-measure: 0.9551374847759344
0.7507374631268438
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 3/149 (2.013%)

Cohesion = 46.57977701425086
Separation = 997.4212838044502
G1 = 89.2475345766416

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.955137

Rand = 99.395%
Jaccard Index = 88.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1291), HTML(value='')))


Best eps: 0.23529411764705888
Best F-measure: 0.8921870978034875
0.23529411764705888
_______________________________
2 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 52.88477519813346
Separation = 2684.1181567044505
G1 = 387.65136385239214

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.281%
Jaccard Index = 85.972%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1568), HTML(value='')))


Best eps: 0.24023065995955017
Best F-measure: 0.8921870978034875
0.24023065995955017
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 59.45256062027303
Separation = 4821.637555405764
G1 = 610.320958969969

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.281%
Jaccard Index = 85.972%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=608), HTML(value='')))


Best eps: 0.0014492753623188692
Best F-measure: 0.18352222357334078
0.0014492753623188692
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 2/149 (1.342%)

Cohesion = 38.151689302548974
Separation = 2247.5609364441116
G1 = 443.46463914079357

Entropy = 3.971436
Purity = 0.170068
F-measure = 0.183522

Rand = 16.709%
Jaccard Index = 5.638%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1242), HTML(value='')))


Best eps: 0.8033509700176368
Best F-measure: 0.9551374847759344
0.8033509700176368
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 45.692200832501
Separation = 671.7351294559131
G1 = 78.96840366145886

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.955137

Rand = 99.358%
Jaccard Index = 87.681%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1294), HTML(value='')))


Best eps: 0.2536095087475332
Best F-measure: 0.8957194185382102
0.2536095087475332
_______________________________
3 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 52.84497853348076
Separation = 2099.4468236246703
G1 = 303.9783289226338

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1570), HTML(value='')))


Best eps: 0.26690196140357003
Best F-measure: 0.8957194185382102
0.26690196140357003
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 58.778371120546915
Separation = 4216.690890647615
G1 = 533.2396060194985

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.0009746588693957392
Best F-measure: 0.18352222357334078
0.0009746588693957392
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 2/149 (1.342%)

Cohesion = 38.202595560604735
Separation = 2255.4342883712307
G1 = 445.5737059246043

Entropy = 3.971436
Purity = 0.170068
F-measure = 0.183522

Rand = 16.709%
Jaccard Index = 5.638%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1262), HTML(value='')))


Best eps: 0.847008547008547
Best F-measure: 0.9551374847759344
0.847008547008547
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 44.92777440945913
Separation = 506.9774825216793
G1 = 61.326638883817985

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.955137

Rand = 99.358%
Jaccard Index = 87.681%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1282), HTML(value='')))


Best eps: 0.2669968403373204
Best F-measure: 0.8957194185382102
0.2669968403373204
_______________________________
4 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 52.542244935618626
Separation = 1735.2366801229034
G1 = 253.70628280120908

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1565), HTML(value='')))


Best eps: 0.2858688856618372
Best F-measure: 0.8957194185382102
0.2858688856618372
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 58.27094480012774
Separation = 3896.2028503358742
G1 = 494.0119822393469

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.021157684630738527
Best F-measure: 0.18754906921092465
0.021157684630738527
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 1/149 (0.671%)

Cohesion = 38.706744249787704
Separation = 2472.8789930492526
G1 = 383.48679229989614

Entropy = 3.944602
Purity = 0.175676
F-measure = 0.187549

Rand = 17.834%
Jaccard Index = 5.658%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1269), HTML(value='')))


Best eps: 0.8913857677902621
Best F-measure: 0.9748161874392403
0.8913857677902621
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 3/149 (2.013%)

Cohesion = 43.77755612646844
Separation = 417.99709426254583
G1 = 32.21452630330608

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.974816

Rand = 99.547%
Jaccard Index = 91.304%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1302), HTML(value='')))


Best eps: 0.3041315803239616
Best F-measure: 0.8957194185382102
0.3041315803239616
_______________________________
5 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 52.24338846441498
Separation = 1501.7831908060984
G1 = 221.48585808855967

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1589), HTML(value='')))


Best eps: 0.3424147643206643
Best F-measure: 0.8957194185382102
0.3424147643206643
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 58.06499091658465
Separation = 3802.826706358697
G1 = 482.05191395653947

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.017935702199661563
Best F-measure: 0.18754906921092465
0.017935702199661563
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 1/149 (0.671%)

Cohesion = 38.71498153430268
Separation = 2473.6611927656395
G1 = 383.61297322429823

Entropy = 3.944602
Purity = 0.175676
F-measure = 0.187549

Rand = 17.834%
Jaccard Index = 5.658%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1281), HTML(value='')))


Best eps: 0.9069182389937107
Best F-measure: 0.9783692865312261
0.9069182389937107
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 2/149 (1.342%)

Cohesion = 43.2346043715098
Separation = 369.6918566283684
G1 = 27.999276564937396

Entropy = 0.036712
Purity = 0.993197
F-measure = 0.978369

Rand = 99.553%
Jaccard Index = 91.429%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1302), HTML(value='')))


Best eps: 0.3519281461785321
Best F-measure: 0.8957194185382102
0.3519281461785321
_______________________________
6 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 51.943958060133546
Separation = 1338.637192612185
G1 = 198.9985341442775

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1568), HTML(value='')))


Best eps: 0.3955496009800811
Best F-measure: 0.8957194185382102
0.3955496009800811
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 57.91119840714525
Separation = 3780.5427677132784
G1 = 479.28441915719594

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.01563421828908551
Best F-measure: 0.18754906921092465
0.01563421828908551
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 1/149 (0.671%)

Cohesion = 38.720181960626334
Separation = 2473.982523213689
G1 = 383.62978448869046

Entropy = 3.944602
Purity = 0.175676
F-measure = 0.187549

Rand = 17.834%
Jaccard Index = 5.658%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1283), HTML(value='')))


Best eps: 0.9210300429184549
Best F-measure: 0.9783692865312261
0.9210300429184549
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 2/149 (1.342%)

Cohesion = 42.77551567241165
Separation = 324.5051242894779
G1 = 24.747644236722298

Entropy = 0.036712
Purity = 0.993197
F-measure = 0.978369

Rand = 99.553%
Jaccard Index = 91.429%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1298), HTML(value='')))


Best eps: 0.3971452720451838
Best F-measure: 0.8957194185382102
0.3971452720451838
_______________________________
7 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 51.64422626330322
Separation = 1218.9057757921712
G1 = 182.5836148403781

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1566), HTML(value='')))


Best eps: 0.4459021312545969
Best F-measure: 0.8957194185382102
0.4459021312545969
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 58.035862088350726
Separation = 3849.198678756131
G1 = 475.09058191673375

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.013910761154855678
Best F-measure: 0.18754906921092465
0.013910761154855678
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 1/149 (0.671%)

Cohesion = 38.72349322195036
Separation = 2473.9620307148157
G1 = 383.58091663204226

Entropy = 3.944602
Purity = 0.175676
F-measure = 0.187549

Rand = 17.834%
Jaccard Index = 5.658%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1282), HTML(value='')))


Best eps: 0.9294117647058824
Best F-measure: 0.9708189509607563
0.9294117647058824
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 2/149 (1.342%)

Cohesion = 42.4988809569135
Separation = 288.6093795999956
G1 = 31.712828526599186

Entropy = 0.036712
Purity = 0.993197
F-measure = 0.970819

Rand = 99.422%
Jaccard Index = 88.929%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1304), HTML(value='')))


Best eps: 0.43829382034711095
Best F-measure: 0.8957194185382102
0.43829382034711095
_______________________________
8 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 51.55615820883729
Separation = 1148.5748472117484
G1 = 170.080683865551

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1590), HTML(value='')))


Best eps: 0.49173254815293577
Best F-measure: 0.8957194185382102
0.49173254815293577
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 57.87702911840255
Separation = 3882.2733449921116
G1 = 479.8307001256309

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.0125741399762751
Best F-measure: 0.18754906921092465
0.0125741399762751
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 1/149 (0.671%)

Cohesion = 38.725785417690915
Separation = 2473.843786948245
G1 = 383.532938175693

Entropy = 3.944602
Purity = 0.175676
F-measure = 0.187549

Rand = 17.834%
Jaccard Index = 5.658%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1282), HTML(value='')))


Best eps: 0.9367088607594937
Best F-measure: 0.9708189509607563
0.9367088607594937
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 2/149 (1.342%)

Cohesion = 42.17656887018823
Separation = 263.47234880532216
G1 = 29.42328882427309

Entropy = 0.036712
Purity = 0.993197
F-measure = 0.970819

Rand = 99.422%
Jaccard Index = 88.929%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1301), HTML(value='')))


Best eps: 0.4758693656376294
Best F-measure: 0.8957194185382102
0.4758693656376294
_______________________________
9 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 51.312824554476194
Separation = 1075.7573041767853
G1 = 160.4488068982839

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1581), HTML(value='')))


Best eps: 0.6482559678456777
Best F-measure: 0.8995545096216235
0.6482559678456777
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 57.85972702049615
Separation = 3995.4130887393785
G1 = 488.2375742557847

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.011509229098805585
Best F-measure: 0.18754906921092465
0.011509229098805585
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 1/149 (0.671%)

Cohesion = 38.72767538987514
Separation = 2473.740945133642
G1 = 383.4938342936393

Entropy = 3.944602
Purity = 0.175676
F-measure = 0.187549

Rand = 17.834%
Jaccard Index = 5.658%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1284), HTML(value='')))


Best eps: 0.9368169398907104
Best F-measure: 0.9551374847759344
0.9368169398907104
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 42.37029467536296
Separation = 228.97948296516387
G1 = 30.97477199244008

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.955137

Rand = 99.358%
Jaccard Index = 87.681%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1306), HTML(value='')))


Best eps: 0.5096685771120242
Best F-measure: 0.8957194185382102
0.5096685771120242
_______________________________
10 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 51.13059732945803
Separation = 1018.2671954142439
G1 = 152.8816512443778

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1588), HTML(value='')))


Best eps: 0.6924951006225576
Best F-measure: 0.9042524961988047
0.6924951006225576
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 56.96370406987147
Separation = 4074.3500116170258
G1 = 439.41404740743087

Entropy = 0.041958
Purity = 0.979021
F-measure = 0.904252

Rand = 99.251%
Jaccard Index = 85.606%


### HAC average

In [84]:
# Average-linkage agglomerative sweep over the LASE histogram sets (keys 1..10)
# and every configured distance function: tune the distance threshold, fit the
# clustering on the precomputed distances, and print the evaluation report.
for i in range(1, 11):
    hists = concat_hists_lase[i]
    hists_len = concat_hists_lase_len[i]

    # `distances` and `distances_names` are parallel sequences, paired by index.
    for j, dist in enumerate(distances):
        dist_name = distances_names[j]

        # Result is handed to the clustering as a precomputed metric below.
        # NOTE(review): presumably a square pairwise-distance matrix -- confirm
        # against get_dists.
        cur_dists = get_dists(hists, dist, hists_len)

        # The same linkage is used for the threshold search and the final fit.
        link = 'average'
        best_eps = find_eps_with_brute_force_universal(
            cur_dists,
            label_to_changes,
            from_change_to_label,
            ids_per_label,
            agglomerative=True,
            linkage=link,
            with_step=0.005,
        )
        print(best_eps)

        # Banner identifying which configuration the following report belongs to.
        print("_______________________________")
        print(i, dist_name)
        print("_______________________________")

        # n_clusters=None together with distance_threshold lets the threshold
        # found above decide where the tree is cut.
        # NOTE(review): scikit-learn 1.2 deprecated `affinity` in favour of
        # `metric` (removed in 1.4); kept unchanged so the cell still runs on
        # the version this notebook was executed with.
        tufano_clustering = sklearn.cluster.AgglomerativeClustering(
            n_clusters=None,
            affinity='precomputed',
            linkage=link,
            compute_full_tree=True,
            distance_threshold=best_eps,
        ).fit(cur_dists)

        print_clustering_results_tufano_unique(
            tufano_clustering,
            cur_dists,
            label_to_changes,
            from_change_to_label,
            ids_per_label,
            without_outliers=True,
        )
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.03
Best F-measure: 0.5703295859730917
0.03
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 23
Number of outliers: 5/149 (3.356%)

Cohesion = 60.31539412669273
Separation = 14276.9871916422
G1 = 1477.6441713998092

Entropy = 1.18215
Purity = 0.590278
F-measure = 0.57033

Rand = 93.056%
Jaccard Index = 26.213%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.705
Best F-measure: 0.9352678641269244
0.705
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 3/149 (2.013%)

Cohesion = 47.73324417810008
Separation = 1773.5763188071564
G1 = 199.68717794822118

Entropy = 0.083321
Purity = 0.965753
F-measure = 0.935268

Rand = 99.301%
Jaccard Index = 86.738%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.39
Best F-measure: 0.8903735339976948
0.39
_______________________________
1 cos_distance
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 54.71162604573232
Separation = 4212.690570034269
G1 = 429.43434157181025

Entropy = 0.141683
Purity = 0.951724
F-measure = 0.890374

Rand = 98.879%
Jaccard Index = 79.51%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.405
Best F-measure: 0.8916838567845277
0.405
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 4/149 (2.685%)

Cohesion = 64.94678548549366
Separation = 7177.122233137656
G1 = 715.585251931048

Entropy = 0.069943
Purity = 0.972414
F-measure = 0.891684

Rand = 99.061%
Jaccard Index = 81.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.13
Best F-measure: 0.5545372264313475
0.13
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 16
Number of outliers: 1/149 (0.671%)

Cohesion = 64.8364926731208
Separation = 13867.447650605649
G1 = 523.5072604046633

Entropy = 1.461462
Purity = 0.540541
F-measure = 0.554537

Rand = 91.267%
Jaccard Index = 24.663%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.76
Best F-measure: 0.9463044040225246
0.76
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 47.30800251271975
Separation = 956.3238069196735
G1 = 95.13183653775727

Entropy = 0.0
Purity = 1.0
F-measure = 0.946304

Rand = 99.437%
Jaccard Index = 88.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.24
Best F-measure: 0.8921870978034875
0.24
_______________________________
2 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 52.88477519813346
Separation = 2684.1181567044505
G1 = 387.65136385239214

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.281%
Jaccard Index = 85.972%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.245
Best F-measure: 0.8921870978034875
0.245
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 59.45256062027303
Separation = 4821.637555405764
G1 = 610.320958969969

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.281%
Jaccard Index = 85.972%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.13
Best F-measure: 0.5585640720689314
0.13
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 16
Number of outliers: 0/149 (0.0%)

Cohesion = 65.3127661883162
Separation = 14088.301827177527
G1 = 486.3644481429114

Entropy = 1.451653
Purity = 0.543624
F-measure = 0.558564

Rand = 91.384%
Jaccard Index = 24.782%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.8250000000000001
Best F-measure: 0.9583849409352763
0.8250000000000001
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 5/149 (3.356%)

Cohesion = 45.74435022425518
Separation = 639.5588211602358
G1 = 60.61813780183028

Entropy = 0.0
Purity = 1.0
F-measure = 0.958385

Rand = 99.611%
Jaccard Index = 92.381%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.28
Best F-measure: 0.8957194185382102
0.28
_______________________________
3 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 52.84497853348076
Separation = 2099.4468236246703
G1 = 303.9783289226338

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.295
Best F-measure: 0.8957194185382102
0.295
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 58.778371120546915
Separation = 4216.690890647615
G1 = 533.2396060194985

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.09
Best F-measure: 0.5546225901512185
0.09
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 17
Number of outliers: 0/149 (0.0%)

Cohesion = 64.97932623249457
Separation = 14239.366174991977
G1 = 674.188517412546

Entropy = 1.444267
Purity = 0.543624
F-measure = 0.554623

Rand = 91.52%
Jaccard Index = 24.9%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.87
Best F-measure: 0.9583849409352763
0.87
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 5/149 (3.356%)

Cohesion = 44.99415141444981
Separation = 481.8494750532139
G1 = 46.55909113766663

Entropy = 0.0
Purity = 1.0
F-measure = 0.958385

Rand = 99.611%
Jaccard Index = 92.381%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.295
Best F-measure: 0.8957194185382102
0.295
_______________________________
4 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 52.542244935618626
Separation = 1735.2366801229034
G1 = 253.70628280120908

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.33
Best F-measure: 0.8957194185382102
0.33
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 58.27094480012774
Separation = 3896.2028503358742
G1 = 494.0119822393469

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.075
Best F-measure: 0.5574901441821258
0.075
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 18
Number of outliers: 0/149 (0.0%)

Cohesion = 64.72799109766285
Separation = 14517.245285996352
G1 = 726.0557138513566

Entropy = 1.39019
Purity = 0.543624
F-measure = 0.55749

Rand = 91.919%
Jaccard Index = 24.555%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.895
Best F-measure: 0.9583849409352763
0.895
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 5/149 (3.356%)

Cohesion = 44.389303265059816
Separation = 388.8864242332735
G1 = 38.21657862867195

Entropy = 0.0
Purity = 1.0
F-measure = 0.958385

Rand = 99.611%
Jaccard Index = 92.381%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.495
Best F-measure: 0.8995545096216235
0.495
_______________________________
5 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 52.63512324200671
Separation = 1568.6748327785901
G1 = 221.963234673532

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.535
Best F-measure: 0.8995545096216235
0.535
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.521698214451376
Separation = 3926.6482068593396
G1 = 476.58288699595

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.06
Best F-measure: 0.5574901441821258
0.06
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 18
Number of outliers: 0/149 (0.0%)

Cohesion = 64.75472791053855
Separation = 14517.25237515108
G1 = 725.4075569052325

Entropy = 1.39019
Purity = 0.543624
F-measure = 0.55749

Rand = 91.919%
Jaccard Index = 24.555%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.915
Best F-measure: 0.9481679442493469
0.915
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 43.76425324398547
Separation = 343.3035493970312
G1 = 43.87970050839465

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.948168

Rand = 99.348%
Jaccard Index = 87.5%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.53
Best F-measure: 0.8995545096216235
0.53
_______________________________
6 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 52.29119043206248
Separation = 1398.9013941787284
G1 = 199.92389961948905

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.5700000000000001
Best F-measure: 0.8995545096216235
0.5700000000000001
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.32594591857045
Separation = 3904.089472901011
G1 = 474.36225588850306

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.05
Best F-measure: 0.5574901441821258
0.05
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 18
Number of outliers: 0/149 (0.0%)

Cohesion = 64.77449793351803
Separation = 14516.733203584925
G1 = 724.9065603696258

Entropy = 1.39019
Purity = 0.543624
F-measure = 0.55749

Rand = 91.919%
Jaccard Index = 24.555%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.93
Best F-measure: 0.9481679442493469
0.93
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 43.30124278354973
Separation = 300.15552149276016
G1 = 39.19205611838029

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.948168

Rand = 99.348%
Jaccard Index = 87.5%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.56
Best F-measure: 0.8995545096216235
0.56
_______________________________
7 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.953740912455046
Separation = 1274.0234738324866
G1 = 183.79014066719893

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.6
Best F-measure: 0.8995545096216235
0.6
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.14543956692635
Separation = 3916.652269301281
G1 = 476.6803596283959

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.05
Best F-measure: 0.5574901441821258
0.05
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 18
Number of outliers: 0/149 (0.0%)

Cohesion = 64.78863174465744
Separation = 14515.933085787301
G1 = 724.5167301543734

Entropy = 1.39019
Purity = 0.543624
F-measure = 0.55749

Rand = 91.919%
Jaccard Index = 24.555%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.915
Best F-measure: 0.9393348634959371
0.915
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 43.96467272346395
Separation = 255.83953183208942
G1 = 36.34910106483262

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.585
Best F-measure: 0.8995545096216235
0.585
_______________________________
8 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.618345499147665
Separation = 1178.666539537413
G1 = 171.4600089212902

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.625
Best F-measure: 0.8995545096216235
0.625
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 57.96688404247513
Separation = 3950.2079766733877
G1 = 481.6214060763636

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.04
Best F-measure: 0.559226282371814
0.04
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 20
Number of outliers: 0/149 (0.0%)

Cohesion = 64.1179715564506
Separation = 15018.920761185202
G1 = 920.3369882476181

Entropy = 1.293854
Purity = 0.550336
F-measure = 0.559226

Rand = 92.663%
Jaccard Index = 24.463%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.9550000000000001
Best F-measure: 0.9376491416500075
0.9550000000000001
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 2/149 (1.342%)

Cohesion = 40.489083184792136
Separation = 254.62743198530137
G1 = 23.236353340638924

Entropy = 0.13199
Purity = 0.959184
F-measure = 0.937649

Rand = 99.077%
Jaccard Index = 83.163%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.61
Best F-measure: 0.8995545096216235
0.61
_______________________________
9 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.357525226294634
Separation = 1103.8467481733364
G1 = 161.81784033404733

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.65
Best F-measure: 0.8995545096216235
0.65
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 57.85972702049615
Separation = 3995.4130887393785
G1 = 488.2375742557847

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.04
Best F-measure: 0.559226282371814
0.04
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 20
Number of outliers: 0/149 (0.0%)

Cohesion = 64.12642181412627
Separation = 15018.32007839404
G1 = 920.1027720813186

Entropy = 1.293854
Purity = 0.550336
F-measure = 0.559226

Rand = 92.663%
Jaccard Index = 24.463%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.935
Best F-measure: 0.9360874073365952
0.935
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 43.1728549425342
Separation = 218.52842007798483
G1 = 31.509227849269895

Entropy = 0.037218
Purity = 0.993103
F-measure = 0.936087

Rand = 99.215%
Jaccard Index = 84.954%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.63
Best F-measure: 0.8995545096216235
0.63
_______________________________
10 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.16023109982057
Separation = 1044.6961859193798
G1 = 154.23358723521727

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.8250000000000001
Best F-measure: 0.9098046621536552
0.8250000000000001
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 3/149 (2.013%)

Cohesion = 57.03581140720807
Separation = 4354.353043094204
G1 = 447.7274630596303

Entropy = 0.069464
Purity = 0.972603
F-measure = 0.909805

Rand = 99.112%
Jaccard Index = 82.909%


### HAC with complete linkage

In [85]:
# Complete-linkage HAC sweep: for each index 1..10 of `concat_hists_lase` and
# each entry of `distances`, build a distance matrix, search for the best
# threshold, print it, then cluster with that threshold and report the metrics.
# NOTE(review): every name assigned here is a notebook global that later cells
# may read, so the original variable names are kept on purpose.
for i in range(1, 11):
    hists = concat_hists_lase[i]
    hists_len = concat_hists_lase_len[i]

    for j, dist in enumerate(distances):
        # `distances_names` is indexed in parallel with `distances`.
        dist_name = distances_names[j]

        cur_dists = get_dists(hists, dist, hists_len)
        link = 'complete'

        # Threshold search via the helper defined elsewhere in this notebook.
        best_eps = find_eps_with_brute_force_universal(
            cur_dists, label_to_changes, from_change_to_label, ids_per_label,
            agglomerative=True, linkage=link,
        )
        print(best_eps)

        # Banner identifying which (index, distance) pair the report below is for.
        print("_______________________________",
              "{} {}".format(i, dist_name),
              "_______________________________",
              sep="\n")

        # n_clusters=None + distance_threshold lets the threshold decide the
        # number of clusters; the input is treated as a precomputed matrix.
        # NOTE(review): newer scikit-learn releases rename `affinity` to
        # `metric` -- confirm against the installed version before upgrading.
        tufano_clustering = sklearn.cluster.AgglomerativeClustering(
            n_clusters=None,
            affinity='precomputed',
            linkage=link,
            compute_full_tree=True,
            distance_threshold=best_eps,
        )
        tufano_clustering.fit(cur_dists)  # fit() returns the estimator itself

        print_clustering_results_tufano_unique(
            tufano_clustering, cur_dists,
            label_to_changes, from_change_to_label, ids_per_label,
            without_outliers=True,
        )
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=544), HTML(value='')))


Best eps: 0.25396825396825395
Best F-measure: 0.5510879027963193
0.25396825396825395
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.26128686741748
Separation = 14489.14902811083
G1 = 711.9665533803262

Entropy = 1.321847
Purity = 0.543624
F-measure = 0.551088

Rand = 92.708%
Jaccard Index = 25.07%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1076), HTML(value='')))


Best eps: 0.6553987678987678
Best F-measure: 0.9328815852305783
0.6553987678987678
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 4/149 (2.685%)

Cohesion = 49.49948347777997
Separation = 1730.2662643789113
G1 = 230.7760864177212

Entropy = 0.0
Purity = 1.0
F-measure = 0.932882

Rand = 99.282%
Jaccard Index = 85.902%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1253), HTML(value='')))


Best eps: 0.37911933371067863
Best F-measure: 0.8943684205429169
0.37911933371067863
_______________________________
1 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 54.59327485687898
Separation = 4184.165814038469
G1 = 479.32481860912384

Entropy = 0.041667
Purity = 0.979167
F-measure = 0.894368

Rand = 99.136%
Jaccard Index = 83.333%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1566), HTML(value='')))


Best eps: 0.4164981672426019
Best F-measure: 0.8943684205429169
0.4164981672426019
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 64.5979538864644
Separation = 7038.212157088337
G1 = 710.7880860062428

Entropy = 0.041667
Purity = 0.979167
F-measure = 0.894368

Rand = 99.136%
Jaccard Index = 83.333%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=600), HTML(value='')))


Best eps: 0.06024096385542166
Best F-measure: 0.5273783525758801
0.06024096385542166
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 27
Number of outliers: 1/149 (0.671%)

Cohesion = 60.37628845568109
Separation = 15988.689151655235
G1 = 1751.3128075224092

Entropy = 1.266454
Purity = 0.581081
F-measure = 0.527378

Rand = 93.326%
Jaccard Index = 20.132%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1222), HTML(value='')))


Best eps: 0.7865947838170061
Best F-measure: 0.9463044040225246
0.7865947838170061
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 47.30800251271975
Separation = 956.3238069196735
G1 = 95.13183653775727

Entropy = 0.0
Purity = 1.0
F-measure = 0.946304

Rand = 99.437%
Jaccard Index = 88.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1291), HTML(value='')))


Best eps: 0.24341866641997645
Best F-measure: 0.8921870978034875
0.24341866641997645
_______________________________
2 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 52.88477519813346
Separation = 2684.1181567044505
G1 = 387.65136385239214

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.281%
Jaccard Index = 85.972%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1568), HTML(value='')))


Best eps: 0.2612477821278276
Best F-measure: 0.8921870978034875
0.2612477821278276
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 59.45256062027303
Separation = 4821.637555405764
G1 = 610.320958969969

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.281%
Jaccard Index = 85.972%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=608), HTML(value='')))


Best eps: 0.27812499999999996
Best F-measure: 0.5514528420364487
0.27812499999999996
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.23563195299094
Separation = 14801.42375196157
G1 = 779.0909134968637

Entropy = 1.384288
Purity = 0.536913
F-measure = 0.551453

Rand = 92.227%
Jaccard Index = 23.619%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1242), HTML(value='')))


Best eps: 0.8403361344537815
Best F-measure: 0.9463044040225246
0.8403361344537815
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 46.33912493127367
Separation = 643.837879536105
G1 = 65.84338522422578

Entropy = 0.0
Purity = 1.0
F-measure = 0.946304

Rand = 99.437%
Jaccard Index = 88.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1294), HTML(value='')))


Best eps: 0.33871942009359113
Best F-measure: 0.8957194185382102
0.33871942009359113
_______________________________
3 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 53.15190609606708
Separation = 2134.8919113455604
G1 = 301.6862084575988

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1570), HTML(value='')))


Best eps: 0.3566753414823205
Best F-measure: 0.8957194185382102
0.3566753414823205
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 59.127167658466966
Separation = 4275.4405530519225
G1 = 525.1667430465795

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.2783482142857142
Best F-measure: 0.5514528420364487
0.2783482142857142
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.28064430018695
Separation = 14824.846849512642
G1 = 779.6936009101823

Entropy = 1.384288
Purity = 0.536913
F-measure = 0.551453

Rand = 92.227%
Jaccard Index = 23.619%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1262), HTML(value='')))


Best eps: 0.8726415094339622
Best F-measure: 0.9463044040225246
0.8726415094339622
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 45.63720696562865
Separation = 484.44917607531596
G1 = 50.475967235284436

Entropy = 0.0
Purity = 1.0
F-measure = 0.946304

Rand = 99.437%
Jaccard Index = 88.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1282), HTML(value='')))


Best eps: 0.45752356415366235
Best F-measure: 0.8995545096216235
0.45752356415366235
_______________________________
4 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 52.98569262437661
Separation = 1811.1777768361242
G1 = 253.4999270209529

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1565), HTML(value='')))


Best eps: 0.4352834532954858
Best F-measure: 0.8957194185382102
0.4352834532954858
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 58.593985087496854
Separation = 3952.063987340652
G1 = 486.87573928451644

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.06060606060606055
Best F-measure: 0.5547950448463674
0.06060606060606055
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 24
Number of outliers: 1/149 (0.671%)

Cohesion = 61.89311395511068
Separation = 15616.095086083205
G1 = 1429.733330066474

Entropy = 1.241611
Purity = 0.567568
F-measure = 0.554795

Rand = 93.216%
Jaccard Index = 23.364%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1269), HTML(value='')))


Best eps: 0.9027777777777778
Best F-measure: 0.9393348634959371
0.9027777777777778
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 45.21025362297533
Separation = 401.55393123637197
G1 = 53.55731060477954

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1302), HTML(value='')))


Best eps: 0.5012947490725999
Best F-measure: 0.8995545096216235
0.5012947490725999
_______________________________
5 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 52.63512324200671
Separation = 1568.6748327785901
G1 = 221.963234673532

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1589), HTML(value='')))


Best eps: 0.5320418819306685
Best F-measure: 0.8995545096216235
0.5320418819306685
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.521698214451376
Separation = 3926.6482068593396
G1 = 476.58288699595

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.06060606060606055
Best F-measure: 0.5588218904839514
0.06060606060606055
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 24
Number of outliers: 0/149 (0.0%)

Cohesion = 62.36958185656476
Separation = 15753.504638872337
G1 = 1388.6424877251575

Entropy = 1.233278
Purity = 0.57047
F-measure = 0.558822

Rand = 93.307%
Jaccard Index = 23.523%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1281), HTML(value='')))


Best eps: 0.9212121212121211
Best F-measure: 0.9393348634959371
0.9212121212121211
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 44.713957115516976
Separation = 339.0410841028536
G1 = 46.330789929102615

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1302), HTML(value='')))


Best eps: 0.5451872995419955
Best F-measure: 0.8995545096216235
0.5451872995419955
_______________________________
6 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 52.29119043206248
Separation = 1398.9013941787284
G1 = 199.92389961948905

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1568), HTML(value='')))


Best eps: 0.5763261323076144
Best F-measure: 0.8995545096216235
0.5763261323076144
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.32594591857045
Separation = 3904.089472901011
G1 = 474.36225588850306

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.050000000000000044
Best F-measure: 0.5588218904839514
0.050000000000000044
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 24
Number of outliers: 0/149 (0.0%)

Cohesion = 62.3852937768449
Separation = 15753.426263767158
G1 = 1388.2766839525252

Entropy = 1.233278
Purity = 0.57047
F-measure = 0.558822

Rand = 93.307%
Jaccard Index = 23.523%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1283), HTML(value='')))


Best eps: 0.9344262295081968
Best F-measure: 0.9393348634959371
0.9344262295081968
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 44.263042192029374
Separation = 295.8927308664021
G1 = 41.293062661540986

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1298), HTML(value='')))


Best eps: 0.5795012242781475
Best F-measure: 0.8995545096216235
0.5795012242781475
_______________________________
7 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.953740912455046
Separation = 1274.0234738324866
G1 = 183.79014066719893

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1566), HTML(value='')))


Best eps: 0.5993660727669665
Best F-measure: 0.8995545096216235
0.5993660727669665
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.14543956692635
Separation = 3916.652269301281
G1 = 476.6803596283959

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.050000000000000044
Best F-measure: 0.5588218904839514
0.050000000000000044
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 24
Number of outliers: 0/149 (0.0%)

Cohesion = 62.3963883018308
Separation = 15752.947481823096
G1 = 1387.9609377329307

Entropy = 1.233278
Purity = 0.57047
F-measure = 0.558822

Rand = 93.307%
Jaccard Index = 23.523%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1282), HTML(value='')))


Best eps: 0.9432098765432099
Best F-measure: 0.9360874073365952
0.9432098765432099
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 3/149 (2.013%)

Cohesion = 43.601097659465815
Separation = 268.4011487985355
G1 = 37.7410809675504

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.936087

Rand = 99.178%
Jaccard Index = 84.239%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1304), HTML(value='')))


Best eps: 0.585410615544946
Best F-measure: 0.8995545096216235
0.585410615544946
_______________________________
8 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.618345499147665
Separation = 1178.666539537413
G1 = 171.4600089212902

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1590), HTML(value='')))


Best eps: 0.6354311965033888
Best F-measure: 0.8995545096216235
0.6354311965033888
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 57.96688404247513
Separation = 3950.2079766733877
G1 = 481.6214060763636

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.03888888888888886
Best F-measure: 0.5588218904839514
0.03888888888888886
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 24
Number of outliers: 0/149 (0.0%)

Cohesion = 62.40452369469826
Separation = 15752.43927410845
G1 = 1387.7154067395695

Entropy = 1.233278
Purity = 0.57047
F-measure = 0.558822

Rand = 93.307%
Jaccard Index = 23.523%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1282), HTML(value='')))


Best eps: 0.9287037037037037
Best F-measure: 0.9212140581268096
0.9287037037037037
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 44.21957895019309
Separation = 233.17042194376455
G1 = 35.605804226685635

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1301), HTML(value='')))


Best eps: 0.6132687993912562
Best F-measure: 0.8995545096216235
0.6132687993912562
_______________________________
9 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.357525226294634
Separation = 1103.8467481733364
G1 = 161.81784033404733

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1581), HTML(value='')))


Best eps: 0.6505066856231865
Best F-measure: 0.8995545096216235
0.6505066856231865
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 57.85972702049615
Separation = 3995.4130887393785
G1 = 488.2375742557847

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=610), HTML(value='')))


Best eps: 0.03888888888888886
Best F-measure: 0.5588218904839514
0.03888888888888886
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 24
Number of outliers: 0/149 (0.0%)

Cohesion = 62.41122167701864
Separation = 15752.021714276323
G1 = 1387.5096669218237

Entropy = 1.233278
Purity = 0.57047
F-measure = 0.558822

Rand = 93.307%
Jaccard Index = 23.523%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1284), HTML(value='')))


Best eps: 0.9351578947368421
Best F-measure: 0.9212140581268096
0.9351578947368421
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 43.98871766838173
Separation = 215.37588351638337
G1 = 33.23429039851587

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1306), HTML(value='')))


Best eps: 0.6371145751670915
Best F-measure: 0.8995545096216235
0.6371145751670915
_______________________________
10 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 51.16023109982057
Separation = 1044.6961859193798
G1 = 154.23358723521727

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1588), HTML(value='')))


Best eps: 0.6759849436351233
Best F-measure: 0.8995545096216235
0.6759849436351233
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 57.80991851779305
Separation = 4047.979664766859
G1 = 496.01846631750806

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


In [183]:
# Root directory holding one sub-directory of LASE actions per change id.
LASE_ACTIONS_PATH = "/Users/aliscafo/Documents/ALINA/WORK/SPbAU/thesis/CodeDiffEditScripts/LaseActionsWithUPD"

# One edit script (list of non-empty action lines) per id, in the iteration
# order of `ids_per_label`.
edit_scripts_lase = []

for double_num in tqdm_notebook(ids_per_label):
    # The actions live at <root>/<id>/<id>/sampleChange1.
    actions_path = os.path.join(LASE_ACTIONS_PATH, double_num, double_num, "sampleChange1")

    # `with` guarantees the handle is closed even if reading (or the label
    # lookup below) raises part-way through the loop.
    with open(actions_path) as actions_file:
        edit_script = actions_file.read().split("\n")

    # Drop blank lines (e.g. the one produced by a trailing newline).
    edit_script = [elem for elem in edit_script if elem != '']

    print(len(edit_script), from_change_to_label[double_num])
    print(edit_script)

    edit_scripts_lase.append(edit_script)
    

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))

54 [1]
['UPD 27@@ 25@@', 'INS 34@@ 27@@ at 1', 'INS 60@@ 8@@ at 0', 'INS 24@@ 8@@ at 1', 'UPD 42@@ 22@@', 'MOV 5@@ 60@@ at 0', 'INS 59@@ 60@@ at 1', 'INS 58@@ 24@@ at 0', 'INS 27@@ 24@@ at 1', 'INS 37@@ 24@@ at 2', 'INS 8@@ 24@@ at 3', 'MOV 42@@ 59@@ at 0', 'INS 33@@ 59@@ at 1', 'INS 39@@ 58@@ at 0', 'INS 59@@ 58@@ at 1', 'INS 27@@ 27@@ at 0', 'INS 27@@ 27@@ at 1', 'INS 42@@ 37@@ at 0', 'INS 25@@ 8@@ at 0', 'INS 42@@ 59@@ at 0', 'INS 34@@ 59@@ at 1', 'INS 42@@ 27@@ at 0', 'INS 33@@ 27@@ at 1', 'INS 42@@ 27@@ at 0', 'INS 22@@ 27@@ at 1', 'INS 27@@ 25@@ at 0', 'INS 21@@ 25@@ at 1', 'INS 52@@ 22@@ at 0', 'INS 42@@ 22@@ at 1', 'INS 2@@ 27@@ at 0', 'INS 42@@ 27@@ at 1', 'INS 7@@ 21@@ at 0', 'INS 22@@ 2@@ at 0', 'INS 42@@ 2@@ at 1', 'INS 42@@ 7@@ at 0', 'INS 2@@ 7@@ at 1', 'MOV 52@@ 22@@ at 0', 'UPD 42@@ 22@@', 'MOV 42@@ 22@@ at 1', 'INS 22@@ 2@@ at 0', 'INS 42@@ 2@@ at 1', 'INS 52@@ 22@@ at 0', 'INS 42@@ 22@@ at 1', 'DEL 33@@ 27@@', 'DEL 39@@ 5@@', 'DEL 85@@ 5@@', 'DEL 5@@ 11@@', 'DEL 22@@ 

In [184]:
# Pairwise distance matrix between LASE edit scripts:
#     dist = 1 - lcs(es1, es2) / max(len(es1), len(es2))
# `lcs` is defined elsewhere in the notebook; presumably it returns the length
# of the longest common subsequence of the two action lists -- TODO confirm.
dists_for_lcs_lase = []

for es1 in tqdm_notebook(edit_scripts_lase):
    cur_dists = []
    for es2 in edit_scripts_lase:
        longest = max(len(es1), len(es2))
        if longest == 0:
            # Both scripts are empty (possible once blank lines are filtered
            # out): treat them as identical instead of dividing by zero.
            dist = 0.0
        else:
            cur_lcs = lcs(es1, es2)
            dist = 1 - cur_lcs / longest
        cur_dists.append(dist)
    dists_for_lcs_lase.append(cur_dists)

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




In [185]:
# DBSCAN over the precomputed LCS distance matrix of the LASE edit scripts.
cur_dists = dists_for_lcs_lase

# Search for the eps value to cluster with (the helper reports the best
# F-measure it found; it is defined elsewhere in the notebook).
best_eps = find_eps_with_brute_force_universal(cur_dists, label_to_changes, 
                            from_change_to_label, ids_per_label, 
                            agglomerative=False)

print(best_eps)

tufano_clustering = sklearn.cluster.DBSCAN(eps=best_eps, 
                                           min_samples=2, 
                                           metric='precomputed').fit(cur_dists)
print("_______________________________")
# Label the run explicitly. The previous `print(i, dist_name)` reused loop
# variables leaked from an unrelated histogram cell, so the header named the
# wrong metric (and failed on a kernel where that cell had not been run).
print("LASE edit scripts", "lcs_distance")
print("_______________________________")
# Kept as the last expression so the returned value is shown as the cell output.
print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                        label_to_changes, from_change_to_label, ids_per_label,
                              without_outliers=True)

DBSCAN


HBox(children=(IntProgress(value=0, max=483), HTML(value='')))


Best eps: 0.5238095238095238
Best F-measure: 0.9259120447039908
0.5238095238095238
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 50.998737399493564
Separation = 1769.862618338729
G1 = 215.56693366636847

Entropy = 0.0
Purity = 1.0
F-measure = 0.925912

Rand = 99.31%
Jaccard Index = 86.476%


(99.31, 0.9259120447039908)

In [186]:
# Agglomerative clustering (average linkage) over the precomputed LCS distance
# matrix of the LASE edit scripts.
cur_dists = dists_for_lcs_lase
link = 'average'

# Search for the distance threshold to cut the tree at, using a 0.005 step.
best_eps = find_eps_with_brute_force_universal(cur_dists, label_to_changes, 
                            from_change_to_label, ids_per_label, 
                            agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

print("_______________________________")
# Label the run explicitly. The previous `print(i, dist_name)` reused loop
# variables leaked from an unrelated histogram cell, so the header named the
# wrong metric (and failed on a kernel where that cell had not been run).
print("LASE edit scripts", "lcs_distance", link)
print("_______________________________")

# n_clusters=None together with distance_threshold lets the threshold, not a
# fixed cluster count, decide where merging stops.
tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                    affinity='precomputed', 
                                                    linkage=link, 
                                                    compute_full_tree=True,
                                                    distance_threshold=best_eps).fit(cur_dists)

# Kept as the last expression so the returned value is shown as the cell output.
print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                        label_to_changes, from_change_to_label, ids_per_label,
                              without_outliers=True)


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.525
Best F-measure: 0.9147607798614508
0.525
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 5/149 (3.356%)

Cohesion = 51.14172925375302
Separation = 1778.4948417276805
G1 = 233.51329149912507

Entropy = 0.0
Purity = 1.0
F-measure = 0.914761

Rand = 99.233%
Jaccard Index = 84.952%


(99.233, 0.9147607798614508)

In [187]:
# Agglomerative clustering (complete linkage) over the precomputed LCS distance
# matrix of the LASE edit scripts.
cur_dists = dists_for_lcs_lase
link = 'complete'

# Search for the distance threshold to cut the tree at (helper's default step).
best_eps = find_eps_with_brute_force_universal(cur_dists, label_to_changes, 
                            from_change_to_label, ids_per_label, 
                            agglomerative=True, linkage=link)

print(best_eps)

print("_______________________________")
# Label the run explicitly. The previous `print(i, dist_name)` reused loop
# variables leaked from an unrelated histogram cell, so the header named the
# wrong metric (and failed on a kernel where that cell had not been run).
print("LASE edit scripts", "lcs_distance", link)
print("_______________________________")

# n_clusters=None together with distance_threshold lets the threshold, not a
# fixed cluster count, decide where merging stops.
tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                    affinity='precomputed', 
                                                    linkage=link, 
                                                    compute_full_tree=True,
                                                    distance_threshold=best_eps).fit(cur_dists)

# Kept as the last expression so the returned value is shown as the cell output.
print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                        label_to_changes, from_change_to_label, ids_per_label,
                              without_outliers=True)


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=483), HTML(value='')))


Best eps: 0.5384615384615384
Best F-measure: 0.9147607798614508
0.5384615384615384
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 5/149 (3.356%)

Cohesion = 51.14172925375302
Separation = 1778.4948417276805
G1 = 233.51329149912507

Entropy = 0.0
Purity = 1.0
F-measure = 0.914761

Rand = 99.233%
Jaccard Index = 84.952%


(99.233, 0.9147607798614508)

In [118]:
# Exact-match distance matrix between LASE edit scripts:
# 0.0 when two scripts are equal lists of actions, 1.0 otherwise.
dists_for_equals_lase = []

for es1 in tqdm_notebook(edit_scripts_lase):
    # One matrix row: es1 compared against every script (itself included).
    cur_dists = [0.0 if es1 == es2 else 1.0 for es2 in edit_scripts_lase]
    dists_for_equals_lase.append(cur_dists)

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




In [124]:
# DBSCAN over the exact-match distance matrix of the LASE edit scripts.
cur_dists = dists_for_equals_lase

# Fixed radius instead of a brute-force search: the matrix built above only
# contains 0.0 and 1.0, so 0.1 separates "equal" from "different".
best_eps = 0.1

tufano_clustering = sklearn.cluster.DBSCAN(
    eps=best_eps,
    min_samples=2,
    metric='precomputed',
).fit(cur_dists)

# Last expression of the cell: its return value is displayed as the output.
print_clustering_results_tufano_unique(
    tufano_clustering,
    cur_dists,
    label_to_changes,
    from_change_to_label,
    ids_per_label,
    without_outliers=True,
)

Number of clusters: 33
Number of outliers: 33/149 (22.148%)

Cohesion = 41.5
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.696348

Rand = 97.166%
Jaccard Index = 54.126%


(97.166, 0.696348059099737)

In [122]:
# Agglomerative clustering (average linkage) over the exact-match distance
# matrix of the LASE edit scripts.
cur_dists = dists_for_equals_lase
link = 'average'

# Fixed threshold instead of a brute-force search: the matrix only contains
# 0.0 and 1.0, so 0.1 separates "equal" from "different".
best_eps = 0.1

tufano_clustering = sklearn.cluster.AgglomerativeClustering(
    n_clusters=None,
    affinity='precomputed',
    linkage=link,
    compute_full_tree=True,
    distance_threshold=best_eps,
).fit(cur_dists)

# Last expression of the cell: its return value is displayed as the output.
print_clustering_results_tufano_unique(
    tufano_clustering,
    cur_dists,
    label_to_changes,
    from_change_to_label,
    ids_per_label,
    without_outliers=True,
)


Number of clusters: 33
Number of outliers: 33/149 (22.148%)

Cohesion = 41.5
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.696348

Rand = 97.166%
Jaccard Index = 54.126%


(97.166, 0.696348059099737)

In [123]:
# Agglomerative clustering (complete linkage) over the exact-match distance
# matrix of the LASE edit scripts.
cur_dists = dists_for_equals_lase
link = 'complete'

# Fixed threshold instead of a brute-force search: the matrix only contains
# 0.0 and 1.0, so 0.1 separates "equal" from "different".
best_eps = 0.1

tufano_clustering = sklearn.cluster.AgglomerativeClustering(
    n_clusters=None,
    affinity='precomputed',
    linkage=link,
    compute_full_tree=True,
    distance_threshold=best_eps,
).fit(cur_dists)

# Last expression of the cell: its return value is displayed as the output.
print_clustering_results_tufano_unique(
    tufano_clustering,
    cur_dists,
    label_to_changes,
    from_change_to_label,
    ids_per_label,
    without_outliers=True,
)


Number of clusters: 33
Number of outliers: 33/149 (22.148%)

Cohesion = 41.5
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.696348

Rand = 97.166%
Jaccard Index = 54.126%


(97.166, 0.696348059099737)

# LASE without context

### DBSCAN

In [102]:
# Sweep every key 1..10 of the LASE histogram dicts and every distance
# function: tune eps, run DBSCAN on the precomputed distances, report metrics.
for i in range(1, 11):
    hists = concat_hists_lase[i]
    hists_len = concat_hists_lase_len[i]

    # `distances` and `distances_names` are parallel lists indexed by j.
    for j, dist in enumerate(distances):
        dist_name = distances_names[j]

        cur_dists = get_dists(hists, dist, hists_len)
        best_eps = find_eps_with_brute_force_universal(
            cur_dists,
            label_to_changes,
            from_change_to_label,
            ids_per_label,
            agglomerative=False,
        )

        print(best_eps)

        tufano_clustering = sklearn.cluster.DBSCAN(
            eps=best_eps,
            min_samples=2,
            metric='precomputed',
        ).fit(cur_dists)

        # Header identifying which (i, metric) pair the results belong to.
        print("_______________________________")
        print(i, dist_name)
        print("_______________________________")
        print_clustering_results_tufano_unique(
            tufano_clustering,
            cur_dists,
            label_to_changes,
            from_change_to_label,
            ids_per_label,
            without_outliers=True,
        )
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=701), HTML(value='')))


Best eps: 0.08593073593073586
Best F-measure: 0.2216340614071988
0.08593073593073586
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 6
Number of outliers: 0/149 (0.0%)

Cohesion = 39.60702510537168
Separation = 2836.0744018749474
G1 = 397.64631655220364

Entropy = 3.753627
Purity = 0.208054
F-measure = 0.221634

Rand = 23.753%
Jaccard Index = 6.004%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1170), HTML(value='')))


Best eps: 0.6005464480874317
Best F-measure: 0.94496212214333
0.6005464480874317
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 48.83444955205416
Separation = 1997.2847510863821
G1 = 250.72515737475507

Entropy = 0.0
Purity = 1.0
F-measure = 0.944962

Rand = 99.454%
Jaccard Index = 89.286%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1315), HTML(value='')))


Best eps: 0.10850397289695746
Best F-measure: 0.8670193125685882
0.10850397289695746
_______________________________
1 cos_distance
_______________________________
Number of clusters: 31
Number of outliers: 9/149 (6.04%)

Cohesion = 53.68811698798146
Separation = 5457.574106769652
G1 = 759.2591867930533

Entropy = 0.0
Purity = 1.0
F-measure = 0.867019

Rand = 98.726%
Jaccard Index = 75.15%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1560), HTML(value='')))


Best eps: 0.10923985328796071
Best F-measure: 0.8629924669310044
0.10923985328796071
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 10/149 (6.711%)

Cohesion = 61.02892908535854
Separation = 7257.290427315227
G1 = 941.4275234599688

Entropy = 0.0
Purity = 1.0
F-measure = 0.862992

Rand = 98.707%
Jaccard Index = 75.05%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=742), HTML(value='')))


Best eps: 0.060052910052910025
Best F-measure: 0.2216340614071988
0.060052910052910025
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 6
Number of outliers: 0/149 (0.0%)

Cohesion = 40.2047587832276
Separation = 2967.2820210232658
G1 = 416.5772749386866

Entropy = 3.753627
Purity = 0.208054
F-measure = 0.221634

Rand = 23.753%
Jaccard Index = 6.004%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1280), HTML(value='')))


Best eps: 0.711764705882353
Best F-measure: 0.94496212214333
0.711764705882353
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 47.378778301026905
Separation = 1145.3171051960817
G1 = 133.88369043245356

Entropy = 0.0
Purity = 1.0
F-measure = 0.944962

Rand = 99.454%
Jaccard Index = 89.286%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1340), HTML(value='')))


Best eps: 0.23529411764705888
Best F-measure: 0.8841271730211407
0.23529411764705888
_______________________________
2 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 8/149 (5.369%)

Cohesion = 53.336010913610856
Separation = 3829.474374794565
G1 = 471.9463838505069

Entropy = 0.083393
Purity = 0.964539
F-measure = 0.884127

Rand = 98.967%
Jaccard Index = 81.181%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1565), HTML(value='')))


Best eps: 0.24117828633942295
Best F-measure: 0.8841271730211407
0.24117828633942295
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 8/149 (5.369%)

Cohesion = 57.727730081797816
Separation = 4949.633366785077
G1 = 559.5804532173453

Entropy = 0.083393
Purity = 0.964539
F-measure = 0.884127

Rand = 98.967%
Jaccard Index = 81.181%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=740), HTML(value='')))


Best eps: 0.042409240924092484
Best F-measure: 0.2216340614071988
0.042409240924092484
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 6
Number of outliers: 0/149 (0.0%)

Cohesion = 40.401236676729
Separation = 3026.489984316264
G1 = 420.82704981238226

Entropy = 3.753627
Purity = 0.208054
F-measure = 0.221634

Rand = 23.753%
Jaccard Index = 6.004%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1321), HTML(value='')))


Best eps: 0.7949640287769785
Best F-measure: 0.94496212214333
0.7949640287769785
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 46.45435230519475
Separation = 767.5957084039803
G1 = 92.61167435153861

Entropy = 0.0
Purity = 1.0
F-measure = 0.944962

Rand = 99.454%
Jaccard Index = 89.286%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1350), HTML(value='')))


Best eps: 0.25342835404914965
Best F-measure: 0.9138402239073378
0.25342835404914965
_______________________________
3 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 7/149 (4.698%)

Cohesion = 53.351723692018844
Separation = 3190.917230834855
G1 = 392.75554750465705

Entropy = 0.0
Purity = 1.0
F-measure = 0.91384

Rand = 99.331%
Jaccard Index = 87.016%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1587), HTML(value='')))


Best eps: 0.26504267744230925
Best F-measure: 0.9138402239073378
0.26504267744230925
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 7/149 (4.698%)

Cohesion = 57.1154897152228
Separation = 4297.473007077799
G1 = 490.1032018380012

Entropy = 0.0
Purity = 1.0
F-measure = 0.91384

Rand = 99.331%
Jaccard Index = 87.016%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=741), HTML(value='')))


Best eps: 0.035377809660449566
Best F-measure: 0.2216340614071988
0.035377809660449566
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 6
Number of outliers: 0/149 (0.0%)

Cohesion = 40.47782411940478
Separation = 3051.802679181875
G1 = 421.63814238295276

Entropy = 3.753627
Purity = 0.208054
F-measure = 0.221634

Rand = 23.753%
Jaccard Index = 6.004%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1323), HTML(value='')))


Best eps: 0.8419689119170984
Best F-measure: 0.94496212214333
0.8419689119170984
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 45.795257513313985
Separation = 573.8584735373337
G1 = 71.2763820774386

Entropy = 0.0
Purity = 1.0
F-measure = 0.944962

Rand = 99.454%
Jaccard Index = 89.286%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1336), HTML(value='')))


Best eps: 0.29841910708963404
Best F-measure: 0.9138402239073378
0.29841910708963404
_______________________________
4 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 7/149 (4.698%)

Cohesion = 52.975334621878275
Separation = 2724.6890269690894
G1 = 334.0209159588543

Entropy = 0.0
Purity = 1.0
F-measure = 0.91384

Rand = 99.331%
Jaccard Index = 87.016%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1577), HTML(value='')))


Best eps: 0.31117094456200867
Best F-measure: 0.9138402239073378
0.31117094456200867
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 7/149 (4.698%)

Cohesion = 56.55676664957678
Separation = 3877.7906298179714
G1 = 441.33320460347034

Entropy = 0.0
Purity = 1.0
F-measure = 0.91384

Rand = 99.331%
Jaccard Index = 87.016%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=740), HTML(value='')))


Best eps: 0.03243315508021383
Best F-measure: 0.2216340614071988
0.03243315508021383
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 6
Number of outliers: 0/149 (0.0%)

Cohesion = 40.51654165413352
Separation = 3065.089576361136
G1 = 421.52607049178647

Entropy = 3.753627
Purity = 0.208054
F-measure = 0.221634

Rand = 23.753%
Jaccard Index = 6.004%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1334), HTML(value='')))


Best eps: 0.8854122621564482
Best F-measure: 0.9439518024493124
0.8854122621564482
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 3/149 (2.013%)

Cohesion = 42.81434968248844
Separation = 463.818346647389
G1 = 36.842992639767644

Entropy = 0.119728
Purity = 0.965753
F-measure = 0.943952

Rand = 99.017%
Jaccard Index = 82.432%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1355), HTML(value='')))


Best eps: 0.3406586192409933
Best F-measure: 0.9138402239073378
0.3406586192409933
_______________________________
5 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 7/149 (4.698%)

Cohesion = 52.61213572275444
Separation = 2420.1538173491913
G1 = 296.18588938602045

Entropy = 0.0
Purity = 1.0
F-measure = 0.91384

Rand = 99.331%
Jaccard Index = 87.016%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Best eps: 0.3546988346982779
Best F-measure: 0.9138402239073378
0.3546988346982779
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 7/149 (4.698%)

Cohesion = 56.310407006917984
Separation = 3719.616854485834
G1 = 422.33219514326856

Entropy = 0.0
Purity = 1.0
F-measure = 0.91384

Rand = 99.331%
Jaccard Index = 87.016%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=740), HTML(value='')))


Best eps: 0.02826797385620894
Best F-measure: 0.2216340614071988
0.02826797385620894
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 6
Number of outliers: 0/149 (0.0%)

Cohesion = 40.53557906537821
Separation = 3068.4202352447605
G1 = 421.13578540521274

Entropy = 3.753627
Purity = 0.208054
F-measure = 0.221634

Rand = 23.753%
Jaccard Index = 6.004%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1338), HTML(value='')))


Best eps: 0.8942953020134228
Best F-measure: 0.9551374847759344
0.8942953020134228
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 43.74078842262151
Separation = 391.6355182265738
G1 = 48.24515991830026

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.955137

Rand = 99.358%
Jaccard Index = 87.681%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1353), HTML(value='')))


Best eps: 0.37789027339397807
Best F-measure: 0.9138402239073378
0.37789027339397807
_______________________________
6 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 7/149 (4.698%)

Cohesion = 52.2605691733328
Separation = 2202.150677010787
G1 = 269.5388083946854

Entropy = 0.0
Purity = 1.0
F-measure = 0.91384

Rand = 99.331%
Jaccard Index = 87.016%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1583), HTML(value='')))


Best eps: 0.3930160531403494
Best F-measure: 0.9138402239073378
0.3930160531403494
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 7/149 (4.698%)

Cohesion = 56.10895566611452
Separation = 3639.326816936055
G1 = 413.03686305983774

Entropy = 0.0
Purity = 1.0
F-measure = 0.91384

Rand = 99.331%
Jaccard Index = 87.016%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=739), HTML(value='')))


Best eps: 0.023499881880462925
Best F-measure: 0.2216340614071988
0.023499881880462925
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 6
Number of outliers: 0/149 (0.0%)

Cohesion = 40.54679569412786
Separation = 3068.3900488362406
G1 = 420.74443621015024

Entropy = 3.753627
Purity = 0.208054
F-measure = 0.221634

Rand = 23.753%
Jaccard Index = 6.004%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1340), HTML(value='')))


Best eps: 0.9094827586206896
Best F-measure: 0.9551374847759344
0.9094827586206896
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 43.29140745351786
Separation = 340.03703988760276
G1 = 42.88163794757979

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.955137

Rand = 99.358%
Jaccard Index = 87.681%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1347), HTML(value='')))


Best eps: 0.3787831400137367
Best F-measure: 0.8957194185382102
0.3787831400137367
_______________________________
7 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 52.195873527212896
Separation = 2038.943674006842
G1 = 273.4266951252593

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1585), HTML(value='')))


Best eps: 0.42721752445771344
Best F-measure: 0.8876594937558635
0.42721752445771344
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 7/149 (4.698%)

Cohesion = 55.161002528553944
Separation = 3571.2364814860766
G1 = 394.8747927565538

Entropy = 0.082806
Purity = 0.964789
F-measure = 0.887659

Rand = 98.981%
Jaccard Index = 81.488%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=739), HTML(value='')))


Best eps: 0.020108146351323963
Best F-measure: 0.2216340614071988
0.020108146351323963
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 6
Number of outliers: 0/149 (0.0%)

Cohesion = 40.554250716478094
Separation = 3068.0237005199865
G1 = 420.4005655197142

Entropy = 3.753627
Purity = 0.208054
F-measure = 0.221634

Rand = 23.753%
Jaccard Index = 6.004%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1341), HTML(value='')))


Best eps: 0.9204545454545454
Best F-measure: 0.9551374847759344
0.9204545454545454
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 42.87939363296188
Separation = 302.8933655262791
G1 = 38.94034523001293

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.955137

Rand = 99.358%
Jaccard Index = 87.681%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1351), HTML(value='')))


Best eps: 0.41937269561916046
Best F-measure: 0.8957194185382102
0.41937269561916046
_______________________________
8 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 51.88835590080194
Separation = 1909.689957499617
G1 = 256.747178332623

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1571), HTML(value='')))


Best eps: 0.46981187090032395
Best F-measure: 0.8876594937558635
0.46981187090032395
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 7/149 (4.698%)

Cohesion = 54.92257556509343
Separation = 3571.266166975599
G1 = 395.4355723203089

Entropy = 0.082806
Purity = 0.964789
F-measure = 0.887659

Rand = 98.981%
Jaccard Index = 81.488%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=739), HTML(value='')))


Best eps: 0.01765149498713503
Best F-measure: 0.2216340614071988
0.01765149498713503
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 6
Number of outliers: 0/149 (0.0%)

Cohesion = 40.55948784904098
Separation = 3067.6950727776893
G1 = 420.1383909265171

Entropy = 3.753627
Purity = 0.208054
F-measure = 0.221634

Rand = 23.753%
Jaccard Index = 6.004%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1343), HTML(value='')))


Best eps: 0.9287330316742082
Best F-measure: 0.9551374847759344
0.9287330316742082
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 42.56546535532433
Separation = 274.9999810207081
G1 = 35.93607944910134

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.955137

Rand = 99.358%
Jaccard Index = 87.681%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1348), HTML(value='')))


Best eps: 0.45663997956855396
Best F-measure: 0.8957194185382102
0.45663997956855396
_______________________________
9 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 51.65087070360033
Separation = 1806.3601315077692
G1 = 243.5463508747896

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1571), HTML(value='')))


Best eps: 0.5112067852861952
Best F-measure: 0.8876594937558635
0.5112067852861952
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 7/149 (4.698%)

Cohesion = 54.76640921901225
Separation = 3589.7736487557327
G1 = 398.43117256727663

Entropy = 0.082806
Purity = 0.964789
F-measure = 0.887659

Rand = 98.981%
Jaccard Index = 81.488%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=739), HTML(value='')))


Best eps: 0.01579344288322604
Best F-measure: 0.2216340614071988
0.01579344288322604
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 6
Number of outliers: 0/149 (0.0%)

Cohesion = 40.56357717260999
Separation = 3067.362214904125
G1 = 419.93149169267934

Entropy = 3.753627
Purity = 0.208054
F-measure = 0.221634

Rand = 23.753%
Jaccard Index = 6.004%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1345), HTML(value='')))


Best eps: 0.9351851851851852
Best F-measure: 0.9551374847759344
0.9351851851851852
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 42.327850824548875
Separation = 253.5276520504006
G1 = 33.601532897768024

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.955137

Rand = 99.358%
Jaccard Index = 87.681%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1358), HTML(value='')))


Best eps: 0.4943876986268312
Best F-measure: 0.8876594937558635
0.4943876986268312
_______________________________
10 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 7/149 (4.698%)

Cohesion = 50.18563787927453
Separation = 1699.418782424467
G1 = 203.55362719238772

Entropy = 0.082806
Purity = 0.964789
F-measure = 0.887659

Rand = 98.981%
Jaccard Index = 81.488%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1576), HTML(value='')))


Best eps: 0.5486453034983856
Best F-measure: 0.8876594937558635
0.5486453034983856
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 7/149 (4.698%)

Cohesion = 54.67664583373232
Separation = 3621.01318040767
G1 = 403.18375524033803

Entropy = 0.082806
Purity = 0.964789
F-measure = 0.887659

Rand = 98.981%
Jaccard Index = 81.488%


### HAC average

In [103]:
# HAC, average linkage: for each histogram set i in 1..10 and each distance
# function, obtain best_eps from find_eps_with_brute_force_universal, print it,
# then cluster with that value as the distance threshold and print the report.
for i in range(1, 11):
    hists = concat_hists_lase[i]
    hists_len = concat_hists_lase_len[i]

    # distances and distances_names are indexed in parallel
    # (assumes both are equally long sequences -- TODO confirm).
    for j, dist in enumerate(distances):
        dist_name = distances_names[j]

        # cur_dists is handed to the estimator as a precomputed affinity below.
        cur_dists = get_dists(hists, dist, hists_len)
        link = 'average'

        best_eps = find_eps_with_brute_force_universal(
            cur_dists,
            label_to_changes,
            from_change_to_label,
            ids_per_label,
            agglomerative=True,
            linkage=link,
            with_step=0.005,
        )

        print(best_eps)
        print("_______________________________")
        print(i, dist_name)
        print("_______________________________")

        # n_clusters=None lets distance_threshold decide where the tree is cut.
        # NOTE(review): newer scikit-learn releases rename `affinity` to
        # `metric` -- confirm against the installed version before upgrading.
        tufano_clustering = sklearn.cluster.AgglomerativeClustering(
            n_clusters=None,
            affinity='precomputed',
            linkage=link,
            compute_full_tree=True,
            distance_threshold=best_eps,
        ).fit(cur_dists)

        print_clustering_results_tufano_unique(
            tufano_clustering,
            cur_dists,
            label_to_changes,
            from_change_to_label,
            ids_per_label,
            without_outliers=True,
        )
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.035
Best F-measure: 0.5526319451806876
0.035
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 58.699828929147195
Separation = 14613.14806804667
G1 = 1591.6382427545284

Entropy = 1.13823
Purity = 0.613793
F-measure = 0.552632

Rand = 93.161%
Jaccard Index = 22.644%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.635
Best F-measure: 0.924721363647538
0.635
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 48.00401363599733
Separation = 1999.1812008376771
G1 = 239.07032011993098

Entropy = 0.061515
Purity = 0.972414
F-measure = 0.924721

Rand = 99.262%
Jaccard Index = 86.051%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.12
Best F-measure: 0.8670193125685882
0.12
_______________________________
1 cos_distance
_______________________________
Number of clusters: 31
Number of outliers: 9/149 (6.04%)

Cohesion = 53.68811698798146
Separation = 5457.574106769652
G1 = 759.2591867930533

Entropy = 0.0
Purity = 1.0
F-measure = 0.867019

Rand = 98.726%
Jaccard Index = 75.15%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.13
Best F-measure: 0.8540439210697067
0.13
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 32
Number of outliers: 10/149 (6.711%)

Cohesion = 60.585733732486915
Separation = 7321.616316815078
G1 = 1072.6507757431737

Entropy = 0.0
Purity = 1.0
F-measure = 0.854044

Rand = 98.665%
Jaccard Index = 74.245%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.03
Best F-measure: 0.5699910179630052
0.03
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 58.26111238025284
Separation = 14835.20684253884
G1 = 1605.5021165150233

Entropy = 1.090007
Purity = 0.631944
F-measure = 0.569991

Rand = 93.172%
Jaccard Index = 23.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.76
Best F-measure: 0.94496212214333
0.76
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 47.225539564181226
Separation = 1107.1339551945582
G1 = 107.77035445915203

Entropy = 0.0
Purity = 1.0
F-measure = 0.944962

Rand = 99.524%
Jaccard Index = 90.667%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.17
Best F-measure: 0.8670193125685882
0.17
_______________________________
2 cos_distance
_______________________________
Number of clusters: 31
Number of outliers: 8/149 (5.369%)

Cohesion = 53.80480766936275
Separation = 4050.2994225166512
G1 = 539.5863071193071

Entropy = 0.0
Purity = 1.0
F-measure = 0.867019

Rand = 98.683%
Jaccard Index = 74.359%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.2
Best F-measure: 0.8670193125685882
0.2
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 8/149 (5.369%)

Cohesion = 58.13822225101212
Separation = 5182.675856489391
G1 = 638.3859616624419

Entropy = 0.0
Purity = 1.0
F-measure = 0.867019

Rand = 98.683%
Jaccard Index = 74.359%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.11
Best F-measure: 0.580443542223999
0.11
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 21
Number of outliers: 0/149 (0.0%)

Cohesion = 63.198827610509326
Separation = 14720.757897129606
G1 = 806.0472268833892

Entropy = 1.175593
Purity = 0.610738
F-measure = 0.580444

Rand = 92.808%
Jaccard Index = 27.114%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.84
Best F-measure: 0.9583849409352763
0.84
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 5/149 (3.356%)

Cohesion = 45.59497310980398
Separation = 743.5190034294253
G1 = 68.8899852407282

Entropy = 0.0
Purity = 1.0
F-measure = 0.958385

Rand = 99.611%
Jaccard Index = 92.381%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.28
Best F-measure: 0.8957194185382102
0.28
_______________________________
3 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 53.41787418584624
Separation = 3205.4988548836127
G1 = 428.0883721616952

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.295
Best F-measure: 0.8957194185382102
0.295
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 57.170152549806865
Separation = 4313.283400725992
G1 = 531.2352743489857

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.105
Best F-measure: 0.5929389662630471
0.105
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 20
Number of outliers: 0/149 (0.0%)

Cohesion = 63.54041869914197
Separation = 14671.03118597369
G1 = 724.4043553316203

Entropy = 1.194082
Purity = 0.610738
F-measure = 0.592939

Rand = 92.79%
Jaccard Index = 27.596%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.865
Best F-measure: 0.9583849409352763
0.865
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 5/149 (3.356%)

Cohesion = 44.89932448682054
Separation = 554.5682363666389
G1 = 52.56989091868434

Entropy = 0.0
Purity = 1.0
F-measure = 0.958385

Rand = 99.611%
Jaccard Index = 92.381%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.275
Best F-measure: 0.8957194185382102
0.275
_______________________________
4 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 53.11319016005382
Separation = 2734.015447948901
G1 = 364.90743279571245

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.31
Best F-measure: 0.8957194185382102
0.31
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 56.68534809174813
Separation = 3887.4211342478616
G1 = 478.4189337447135

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.1
Best F-measure: 0.5929389662630471
0.1
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 20
Number of outliers: 0/149 (0.0%)

Cohesion = 63.57954447974224
Separation = 14690.975403719007
G1 = 723.9790559315427

Entropy = 1.194082
Purity = 0.610738
F-measure = 0.592939

Rand = 92.79%
Jaccard Index = 27.596%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.895
Best F-measure: 0.9583849409352763
0.895
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 5/149 (3.356%)

Cohesion = 44.31389185455178
Separation = 443.800812510779
G1 = 42.90526910719758

Entropy = 0.0
Purity = 1.0
F-measure = 0.958385

Rand = 99.611%
Jaccard Index = 92.381%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.325
Best F-measure: 0.8957194185382102
0.325
_______________________________
5 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 52.808781622434545
Separation = 2425.5347401038007
G1 = 323.9336432887134

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.365
Best F-measure: 0.8957194185382102
0.365
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 56.49751667155819
Separation = 3724.4553919787013
G1 = 457.26790506740895

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.1
Best F-measure: 0.5929389662630471
0.1
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 20
Number of outliers: 0/149 (0.0%)

Cohesion = 63.607920570707876
Separation = 14695.692592656269
G1 = 723.2364422971467

Entropy = 1.194082
Purity = 0.610738
F-measure = 0.592939

Rand = 92.79%
Jaccard Index = 27.596%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.91
Best F-measure: 0.9514154004086888
0.91
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 43.94869258660669
Separation = 382.2483049099684
G1 = 47.87385226417505

Entropy = 0.0
Purity = 1.0
F-measure = 0.951415

Rand = 99.531%
Jaccard Index = 90.789%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.37
Best F-measure: 0.8957194185382102
0.37
_______________________________
6 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 52.50191695712007
Separation = 2204.75887217029
G1 = 294.9531608053245

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.41500000000000004
Best F-measure: 0.8957194185382102
0.41500000000000004
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 56.340045093371195
Separation = 3640.5526348599146
G1 = 446.5798281919265

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.095
Best F-measure: 0.5929389662630471
0.095
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 20
Number of outliers: 0/149 (0.0%)

Cohesion = 63.6294657994726
Separation = 14695.31314068502
G1 = 722.5305145273438

Entropy = 1.194082
Purity = 0.610738
F-measure = 0.592939

Rand = 92.79%
Jaccard Index = 27.596%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.925
Best F-measure: 0.9481679442493469
0.925
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 43.243098549450494
Separation = 335.8342262047129
G1 = 42.768215354177876

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.948168

Rand = 99.348%
Jaccard Index = 87.5%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.41000000000000003
Best F-measure: 0.8957194185382102
0.41000000000000003
_______________________________
7 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 52.195873527212896
Separation = 2038.943674006842
G1 = 273.4266951252593

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.36
Best F-measure: 0.8867708726769126
0.36
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 55.698261530452484
Separation = 3513.2788971790446
G1 = 461.51232825749645

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.26%
Jaccard Index = 85.799%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.095
Best F-measure: 0.5929389662630471
0.095
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 20
Number of outliers: 0/149 (0.0%)

Cohesion = 63.645335214664286
Separation = 14694.409889939481
G1 = 721.9714305998451

Entropy = 1.194082
Purity = 0.610738
F-measure = 0.592939

Rand = 92.79%
Jaccard Index = 27.596%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.935
Best F-measure: 0.9481679442493469
0.935
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 42.82723728631994
Separation = 299.1475103286293
G1 = 38.87184449889349

Entropy = 0.036963
Purity = 0.993151
F-measure = 0.948168

Rand = 99.348%
Jaccard Index = 87.5%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.45
Best F-measure: 0.8957194185382102
0.45
_______________________________
8 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 51.88835590080194
Separation = 1909.689957499617
G1 = 256.747178332623

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.38
Best F-measure: 0.8867708726769126
0.38
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 55.60531788010041
Separation = 3503.2557448809935
G1 = 458.215595534148

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.26%
Jaccard Index = 85.799%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.095
Best F-measure: 0.5929389662630471
0.095
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 20
Number of outliers: 0/149 (0.0%)

Cohesion = 63.6573051142419
Separation = 14693.53818052176
G1 = 721.5476304787093

Entropy = 1.194082
Purity = 0.610738
F-measure = 0.592939

Rand = 92.79%
Jaccard Index = 27.596%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.925
Best F-measure: 0.9360874073365952
0.925
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 43.36380786240735
Separation = 262.68584263146874
G1 = 36.643653416472645

Entropy = 0.037218
Purity = 0.993103
F-measure = 0.936087

Rand = 99.215%
Jaccard Index = 84.954%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.485
Best F-measure: 0.8957194185382102
0.485
_______________________________
9 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 51.65087070360033
Separation = 1806.3601315077692
G1 = 243.5463508747896

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.4
Best F-measure: 0.8867708726769126
0.4
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 55.575230004812035
Separation = 3512.1879590874432
G1 = 458.0032641047995

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.26%
Jaccard Index = 85.799%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.09
Best F-measure: 0.5929389662630471
0.09
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 20
Number of outliers: 0/149 (0.0%)

Cohesion = 63.666943576912786
Separation = 14692.768645338703
G1 = 721.2135348943291

Entropy = 1.194082
Purity = 0.610738
F-measure = 0.592939

Rand = 92.79%
Jaccard Index = 27.596%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.93
Best F-measure: 0.9360874073365952
0.93
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 43.13164387527553
Separation = 241.925139932531
G1 = 34.2006740611084

Entropy = 0.037218
Purity = 0.993103
F-measure = 0.936087

Rand = 99.215%
Jaccard Index = 84.954%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.4
Best F-measure: 0.8867708726769126
0.4
_______________________________
10 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 51.13475115849401
Separation = 1674.9959690577577
G1 = 250.06929671522315

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.26%
Jaccard Index = 85.799%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.42
Best F-measure: 0.8867708726769126
0.42
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 55.59499138801701
Separation = 3534.315316801324
G1 = 460.0667905453221

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.26%
Jaccard Index = 85.799%


### HAC complete

In [104]:
# HAC, complete linkage: for each histogram set i in 1..10 and each distance
# function, obtain best_eps from find_eps_with_brute_force_universal, print it,
# then cluster with that value as the distance threshold and print the report.
for i in range(1, 11):
    hists = concat_hists_lase[i]
    hists_len = concat_hists_lase_len[i]

    # distances and distances_names are indexed in parallel
    # (assumes both are equally long sequences -- TODO confirm).
    for j, dist in enumerate(distances):
        dist_name = distances_names[j]

        # cur_dists is handed to the estimator as a precomputed affinity below.
        cur_dists = get_dists(hists, dist, hists_len)
        link = 'complete'

        # No with_step is passed here, so the search uses the helper's default.
        best_eps = find_eps_with_brute_force_universal(
            cur_dists,
            label_to_changes,
            from_change_to_label,
            ids_per_label,
            agglomerative=True,
            linkage=link,
        )

        print(best_eps)
        print("_______________________________")
        print(i, dist_name)
        print("_______________________________")

        # n_clusters=None lets distance_threshold decide where the tree is cut.
        # NOTE(review): newer scikit-learn releases rename `affinity` to
        # `metric` -- confirm against the installed version before upgrading.
        tufano_clustering = sklearn.cluster.AgglomerativeClustering(
            n_clusters=None,
            affinity='precomputed',
            linkage=link,
            compute_full_tree=True,
            distance_threshold=best_eps,
        ).fit(cur_dists)

        print_clustering_results_tufano_unique(
            tufano_clustering,
            cur_dists,
            label_to_changes,
            from_change_to_label,
            ids_per_label,
            without_outliers=True,
        )
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=701), HTML(value='')))


Best eps: 0.25555555555555554
Best F-measure: 0.5773679592767631
0.25555555555555554
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 23
Number of outliers: 0/149 (0.0%)

Cohesion = 62.321974252005546
Separation = 15066.229912630779
G1 = 881.8877402018372

Entropy = 1.087539
Purity = 0.610738
F-measure = 0.577368

Rand = 93.978%
Jaccard Index = 25.893%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1170), HTML(value='')))


Best eps: 0.6851851851851851
Best F-measure: 0.9300052669180184
0.6851851851851851
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 3/149 (2.013%)

Cohesion = 49.20290456166456
Separation = 2065.7927233867895
G1 = 266.8154263400863

Entropy = 0.022227
Purity = 0.993151
F-measure = 0.930005

Rand = 99.235%
Jaccard Index = 84.944%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1315), HTML(value='')))


Best eps: 0.22285218942346485
Best F-measure: 0.8493505258392111
0.22285218942346485
_______________________________
1 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 7/149 (4.698%)

Cohesion = 54.51637203834549
Separation = 5410.136056507714
G1 = 662.6469344711356

Entropy = 0.162386
Purity = 0.93662
F-measure = 0.849351

Rand = 98.632%
Jaccard Index = 75.88%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1560), HTML(value='')))


Best eps: 0.23699304564585433
Best F-measure: 0.8493505258392111
0.23699304564585433
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 7/149 (4.698%)

Cohesion = 62.591141189711465
Separation = 7362.742485314822
G1 = 800.0123645102535

Entropy = 0.162386
Purity = 0.93662
F-measure = 0.849351

Rand = 98.632%
Jaccard Index = 75.88%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=742), HTML(value='')))


Best eps: 0.0625
Best F-measure: 0.5429747441731361
0.0625
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 31
Number of outliers: 1/149 (0.671%)

Cohesion = 58.39178913519431
Separation = 16482.1005677688
G1 = 1986.862470175093

Entropy = 1.040346
Purity = 0.608108
F-measure = 0.542975

Rand = 94.264%
Jaccard Index = 20.102%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1280), HTML(value='')))


Best eps: 0.8532494758909854
Best F-measure: 0.9379942998475147
0.8532494758909854
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 2/149 (1.342%)

Cohesion = 45.157742031692045
Separation = 1202.845744400743
G1 = 97.33855011204727

Entropy = 0.119466
Purity = 0.959184
F-measure = 0.937994

Rand = 99.171%
Jaccard Index = 84.602%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1340), HTML(value='')))


Best eps: 0.21537673298332127
Best F-measure: 0.8670193125685882
0.21537673298332127
_______________________________
2 cos_distance
_______________________________
Number of clusters: 31
Number of outliers: 8/149 (5.369%)

Cohesion = 53.80480766936275
Separation = 4050.2994225166512
G1 = 539.5863071193071

Entropy = 0.0
Purity = 1.0
F-measure = 0.867019

Rand = 98.683%
Jaccard Index = 74.359%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1565), HTML(value='')))


Best eps: 0.26455786290882144
Best F-measure: 0.8660063676520132
0.26455786290882144
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 8/149 (5.369%)

Cohesion = 57.69382715670947
Separation = 4973.659123385777
G1 = 606.9752334456057

Entropy = 0.083393
Purity = 0.964539
F-measure = 0.866006

Rand = 98.875%
Jaccard Index = 79.52%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=740), HTML(value='')))


Best eps: 0.125
Best F-measure: 0.5723644923309352
0.125
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 27
Number of outliers: 0/149 (0.0%)

Cohesion = 60.697732002375396
Separation = 15994.026085172345
G1 = 1458.2355017517118

Entropy = 1.029095
Purity = 0.630872
F-measure = 0.572364

Rand = 94.032%
Jaccard Index = 24.714%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1321), HTML(value='')))


Best eps: 0.8722222222222221
Best F-measure: 0.9583849409352763
0.8722222222222221
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 5/149 (3.356%)

Cohesion = 45.59497310980398
Separation = 743.5190034294253
G1 = 68.8899852407282

Entropy = 0.0
Purity = 1.0
F-measure = 0.958385

Rand = 99.611%
Jaccard Index = 92.381%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1350), HTML(value='')))


Best eps: 0.26037877179989455
Best F-measure: 0.8921870978034875
0.26037877179989455
_______________________________
3 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 53.13319568370806
Separation = 3127.0129158118384
G1 = 423.4273310369901

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.23%
Jaccard Index = 85.01%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1587), HTML(value='')))


Best eps: 0.3019293997863379
Best F-measure: 0.8921870978034875
0.3019293997863379
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 56.8626464010738
Separation = 4222.536426797795
G1 = 526.2753966244405

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.23%
Jaccard Index = 85.01%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=741), HTML(value='')))


Best eps: 0.13157894736842102
Best F-measure: 0.5723644923309352
0.13157894736842102
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 26
Number of outliers: 0/149 (0.0%)

Cohesion = 61.166310540716026
Separation = 15912.50060899465
G1 = 1166.256756948241

Entropy = 1.042518
Purity = 0.630872
F-measure = 0.572364

Rand = 94.014%
Jaccard Index = 24.743%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1323), HTML(value='')))


Best eps: 0.9029535864978903
Best F-measure: 0.9583849409352763
0.9029535864978903
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 5/149 (3.356%)

Cohesion = 44.89932448682054
Separation = 554.5682363666389
G1 = 52.56989091868434

Entropy = 0.0
Purity = 1.0
F-measure = 0.958385

Rand = 99.611%
Jaccard Index = 92.381%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1336), HTML(value='')))


Best eps: 0.3511468428224539
Best F-measure: 0.8957194185382102
0.3511468428224539
_______________________________
4 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 53.11319016005382
Separation = 2734.015447948901
G1 = 364.90743279571245

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1577), HTML(value='')))


Best eps: 0.3597209695054614
Best F-measure: 0.8921870978034875
0.3597209695054614
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 56.36796129445808
Separation = 3808.014670307803
G1 = 474.07502042816253

Entropy = 0.0
Purity = 1.0
F-measure = 0.892187

Rand = 99.23%
Jaccard Index = 85.01%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=740), HTML(value='')))


Best eps: 0.13461538461538458
Best F-measure: 0.5723644923309352
0.13461538461538458
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 26
Number of outliers: 0/149 (0.0%)

Cohesion = 61.20053146408148
Separation = 15933.393606623911
G1 = 1166.1431413323903

Entropy = 1.042518
Purity = 0.630872
F-measure = 0.572364

Rand = 94.014%
Jaccard Index = 24.743%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1334), HTML(value='')))


Best eps: 0.895910780669145
Best F-measure: 0.9393348634959371
0.895910780669145
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 45.12992019623631
Separation = 456.6556305918066
G1 = 58.906083917449905

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1355), HTML(value='')))


Best eps: 0.392462020905777
Best F-measure: 0.8957194185382102
0.392462020905777
_______________________________
5 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 52.808781622434545
Separation = 2425.5347401038007
G1 = 323.9336432887134

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1582), HTML(value='')))


Best eps: 0.4021271818340597
Best F-measure: 0.8957194185382102
0.4021271818340597
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 56.49751667155819
Separation = 3724.4553919787013
G1 = 457.26790506740895

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=740), HTML(value='')))


Best eps: 0.13636363636363635
Best F-measure: 0.5723644923309352
0.13636363636363635
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 26
Number of outliers: 0/149 (0.0%)

Cohesion = 61.22661017378726
Separation = 15938.446743050763
G1 = 1165.4510127739047

Entropy = 1.042518
Purity = 0.630872
F-measure = 0.572364

Rand = 94.014%
Jaccard Index = 24.743%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1338), HTML(value='')))


Best eps: 0.9165099268547546
Best F-measure: 0.9393348634959371
0.9165099268547546
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 44.43997613267035
Separation = 385.75895658693645
G1 = 51.50787012926589

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1353), HTML(value='')))


Best eps: 0.42553482941075405
Best F-measure: 0.8957194185382102
0.42553482941075405
_______________________________
6 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 52.50191695712007
Separation = 2204.75887217029
G1 = 294.9531608053245

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1583), HTML(value='')))


Best eps: 0.3757053753254612
Best F-measure: 0.8867708726769126
0.3757053753254612
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 7/149 (4.698%)

Cohesion = 56.158403767608156
Separation = 3664.9285508324233
G1 = 514.5520237837019

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.201%
Jaccard Index = 84.496%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=739), HTML(value='')))


Best eps: 0.13636363636363635
Best F-measure: 0.5723644923309352
0.13636363636363635
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 26
Number of outliers: 0/149 (0.0%)

Cohesion = 61.24690889386373
Separation = 15938.173392725324
G1 = 1164.684579853231

Entropy = 1.042518
Purity = 0.630872
F-measure = 0.572364

Rand = 94.014%
Jaccard Index = 24.743%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1340), HTML(value='')))


Best eps: 0.9309857837635614
Best F-measure: 0.9393348634959371
0.9309857837635614
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 43.997917778536085
Separation = 334.3530486794017
G1 = 45.731481649751885

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1347), HTML(value='')))


Best eps: 0.4519994339936173
Best F-measure: 0.8957194185382102
0.4519994339936173
_______________________________
7 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 52.195873527212896
Separation = 2038.943674006842
G1 = 273.4266951252593

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.241%
Jaccard Index = 85.271%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1585), HTML(value='')))


Best eps: 0.3710103471793894
Best F-measure: 0.8867708726769126
0.3710103471793894
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 55.698261530452484
Separation = 3513.2788971790446
G1 = 461.51232825749645

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.26%
Jaccard Index = 85.799%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=739), HTML(value='')))


Best eps: 0.13636363636363635
Best F-measure: 0.5723644923309352
0.13636363636363635
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 26
Number of outliers: 0/149 (0.0%)

Cohesion = 61.2617477434396
Separation = 15937.351459458288
G1 = 1164.0713994839232

Entropy = 1.042518
Purity = 0.630872
F-measure = 0.572364

Rand = 94.014%
Jaccard Index = 24.743%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1341), HTML(value='')))


Best eps: 0.9409491037131883
Best F-measure: 0.9393348634959371
0.9409491037131883
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 43.58679997134369
Separation = 297.39518951084625
G1 = 41.482874244043245

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1351), HTML(value='')))


Best eps: 0.383675499655976
Best F-measure: 0.8867708726769126
0.383675499655976
_______________________________
8 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 51.46349411799298
Separation = 1858.4867000203878
G1 = 277.0101135426546

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.26%
Jaccard Index = 85.799%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1571), HTML(value='')))


Best eps: 0.41130818747973463
Best F-measure: 0.8867708726769126
0.41130818747973463
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 55.60531788010041
Separation = 3503.2557448809935
G1 = 458.215595534148

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.26%
Jaccard Index = 85.799%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=739), HTML(value='')))


Best eps: 0.1333333333333332
Best F-measure: 0.5723644923309352
0.1333333333333332
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 26
Number of outliers: 0/149 (0.0%)

Cohesion = 61.273206719841255
Separation = 15936.483851023047
G1 = 1163.5926266238007

Entropy = 1.042518
Purity = 0.630872
F-measure = 0.572364

Rand = 94.014%
Jaccard Index = 24.743%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1343), HTML(value='')))


Best eps: 0.9609375
Best F-measure: 0.9247057092434878
0.9609375
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 2/149 (1.342%)

Cohesion = 40.61704468570511
Separation = 279.18258087373914
G1 = 34.51807771266215

Entropy = 0.13199
Purity = 0.959184
F-measure = 0.924706

Rand = 99.022%
Jaccard Index = 82.143%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1348), HTML(value='')))


Best eps: 0.41705078443710486
Best F-measure: 0.8867708726769126
0.41705078443710486
_______________________________
9 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 51.27151527686411
Separation = 1756.9539990663752
G1 = 262.0441027546658

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.26%
Jaccard Index = 85.799%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1571), HTML(value='')))


Best eps: 0.44641742807554385
Best F-measure: 0.8867708726769126
0.44641742807554385
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 55.575230004812035
Separation = 3512.1879590874432
G1 = 458.0032641047995

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.26%
Jaccard Index = 85.799%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=739), HTML(value='')))


Best eps: 0.12962962962962965
Best F-measure: 0.5723644923309352
0.12962962962962965
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 26
Number of outliers: 0/149 (0.0%)

Cohesion = 61.28288959706881
Separation = 15935.625520563217
G1 = 1163.1820864186757

Entropy = 1.042518
Purity = 0.630872
F-measure = 0.572364

Rand = 94.014%
Jaccard Index = 24.743%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1345), HTML(value='')))


Best eps: 0.9651741293532337
Best F-measure: 0.9247057092434878
0.9651741293532337
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 2/149 (1.342%)

Cohesion = 40.374851234369956
Separation = 257.6225521271974
G1 = 32.29429581115038

Entropy = 0.13199
Purity = 0.959184
F-measure = 0.924706

Rand = 99.022%
Jaccard Index = 82.143%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1358), HTML(value='')))


Best eps: 0.4467854078516117
Best F-measure: 0.8867708726769126
0.4467854078516117
_______________________________
10 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 51.13475115849401
Separation = 1674.9959690577577
G1 = 250.06929671522315

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.26%
Jaccard Index = 85.799%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1576), HTML(value='')))


Best eps: 0.47766243452861745
Best F-measure: 0.8867708726769126
0.47766243452861745
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 9/149 (6.04%)

Cohesion = 55.59499138801701
Separation = 3534.315316801324
G1 = 460.0667905453221

Entropy = 0.0
Purity = 1.0
F-measure = 0.886771

Rand = 99.26%
Jaccard Index = 85.799%


In [182]:
# Load the LASE edit script of every change id: one list of action strings per change,
# appended to `edit_scripts_lase` in the iteration order of `ids_per_label`.
LASE_ACTIONS_PATH = "/Users/aliscafo/Documents/ALINA/WORK/SPbAU/thesis/CodeDiffEditScripts/LaseActionsWoContext"

edit_scripts_lase = []

for double_num in tqdm_notebook(ids_per_label):
    # Each script is stored at <root>/<id>/<id>/sampleChange1.
    actions_path = os.path.join(LASE_ACTIONS_PATH, double_num, double_num, "sampleChange1")
    # The context manager closes the file even if reading raises.
    with open(actions_path) as actions_file:
        edit_script = actions_file.read().split("\n")
    # Drop the empty strings left by blank lines / the trailing newline.
    edit_script = [elem for elem in edit_script if elem != '']

    print(len(edit_script), from_change_to_label[double_num])
    print(edit_script)

    edit_scripts_lase.append(edit_script)
    

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))

54 [1]
['UPD 27@@', 'INS 34@@ 27@@ at 1', 'INS 60@@ 8@@ at 0', 'INS 24@@ 8@@ at 1', 'UPD 42@@', 'MOV 5@@ 60@@ at 0', 'INS 59@@ 60@@ at 1', 'INS 58@@ 24@@ at 0', 'INS 27@@ 24@@ at 1', 'INS 37@@ 24@@ at 2', 'INS 8@@ 24@@ at 3', 'MOV 42@@ 59@@ at 0', 'INS 33@@ 59@@ at 1', 'INS 39@@ 58@@ at 0', 'INS 59@@ 58@@ at 1', 'INS 27@@ 27@@ at 0', 'INS 27@@ 27@@ at 1', 'INS 42@@ 37@@ at 0', 'INS 25@@ 8@@ at 0', 'INS 42@@ 59@@ at 0', 'INS 34@@ 59@@ at 1', 'INS 42@@ 27@@ at 0', 'INS 33@@ 27@@ at 1', 'INS 42@@ 27@@ at 0', 'INS 22@@ 27@@ at 1', 'INS 27@@ 25@@ at 0', 'INS 21@@ 25@@ at 1', 'INS 52@@ 22@@ at 0', 'INS 42@@ 22@@ at 1', 'INS 2@@ 27@@ at 0', 'INS 42@@ 27@@ at 1', 'INS 7@@ 21@@ at 0', 'INS 22@@ 2@@ at 0', 'INS 42@@ 2@@ at 1', 'INS 42@@ 7@@ at 0', 'INS 2@@ 7@@ at 1', 'MOV 52@@ 22@@ at 0', 'UPD 42@@', 'MOV 42@@ 22@@ at 1', 'INS 22@@ 2@@ at 0', 'INS 42@@ 2@@ at 1', 'INS 52@@ 22@@ at 0', 'INS 42@@ 22@@ at 1', 'DEL 33@@', 'DEL 39@@', 'DEL 85@@', 'DEL 5@@', 'DEL 22@@', 'DEL 42@@', 'DEL 42@@', 'DEL 32

In [170]:
# Pairwise distance matrix over the LASE edit scripts:
#     dist(es1, es2) = 1 - lcs(es1, es2) / max(len(es1), len(es2))
# `lcs` is defined elsewhere in the notebook; it is used here as a number
# (presumably the length of the longest common subsequence -- verify).
dists_for_lcs_lase = []

for es1 in tqdm_notebook(edit_scripts_lase):
    cur_dists = []
    for es2 in edit_scripts_lase:
        longest = max(len(es1), len(es2))
        if longest == 0:
            # Both scripts are empty: treat them as identical rather than
            # dividing by zero.
            dist = 0.0
        else:
            cur_lcs = lcs(es1, es2)
            dist = 1 - cur_lcs / longest
        cur_dists.append(dist)
    dists_for_lcs_lase.append(cur_dists)

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




In [172]:
# DBSCAN over the precomputed LCS distances between LASE edit scripts.
cur_dists = dists_for_lcs_lase
# eps is chosen by the notebook's brute-force search helper (defined elsewhere).
best_eps = find_eps_with_brute_force_universal(cur_dists, label_to_changes, 
                            from_change_to_label, ids_per_label, 
                            agglomerative=False)

print(best_eps)

tufano_clustering = sklearn.cluster.DBSCAN(eps=best_eps, 
                                           min_samples=2, 
                                           metric='precomputed').fit(cur_dists)
print("_______________________________")
# Label the report with what is actually clustered here. The previous
# `print(i, dist_name)` reused variables left over from an earlier loop and
# therefore printed an unrelated n-gram size / metric name.
print("LCS distance over LASE edit scripts (DBSCAN)")
print("_______________________________")
# Kept as the last expression so the cell still displays the returned value.
print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                        label_to_changes, from_change_to_label, ids_per_label,
                              without_outliers=True)

DBSCAN


HBox(children=(IntProgress(value=0, max=514), HTML(value='')))


Best eps: 0.5238095238095238
Best F-measure: 0.9259120447039908
0.5238095238095238
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 51.096407171642966
Separation = 2103.30578719415
G1 = 249.5272577600066

Entropy = 0.0
Purity = 1.0
F-measure = 0.925912

Rand = 99.31%
Jaccard Index = 86.476%


(99.31, 0.9259120447039908)

In [173]:
# Agglomerative clustering (average linkage) over the precomputed LCS
# distances between LASE edit scripts.
cur_dists = dists_for_lcs_lase
link = 'average'
# The distance threshold is chosen by the notebook's brute-force search helper
# (defined elsewhere), here with an explicit step of 0.005.
best_eps = find_eps_with_brute_force_universal(cur_dists, label_to_changes, 
                            from_change_to_label, ids_per_label, 
                            agglomerative=True, linkage=link, with_step=0.005)

print(best_eps)

print("_______________________________")
# Label the report with what is actually clustered here. The previous
# `print(i, dist_name)` reused variables left over from an earlier loop and
# therefore printed an unrelated n-gram size / metric name.
print("LCS distance over LASE edit scripts (agglomerative, linkage=" + link + ")")
print("_______________________________")

# n_clusters=None + distance_threshold: the number of clusters is determined by
# the threshold instead of being fixed in advance.
tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                    affinity='precomputed', 
                                                    linkage=link, 
                                                    compute_full_tree=True,
                                                    distance_threshold=best_eps).fit(cur_dists)

# Kept as the last expression so the cell still displays the returned value.
print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                        label_to_changes, from_change_to_label, ids_per_label,
                              without_outliers=True)


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Best eps: 0.525
Best F-measure: 0.9147607798614508
0.525
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 5/149 (3.356%)

Cohesion = 51.23827422152058
Separation = 2109.7280393234605
G1 = 275.49397659773774

Entropy = 0.0
Purity = 1.0
F-measure = 0.914761

Rand = 99.233%
Jaccard Index = 84.952%


(99.233, 0.9147607798614508)

In [174]:
# Agglomerative clustering (complete linkage) over the precomputed LCS
# distances between LASE edit scripts.
cur_dists = dists_for_lcs_lase
link = 'complete'
# The distance threshold is chosen by the notebook's brute-force search helper
# (defined elsewhere).
best_eps = find_eps_with_brute_force_universal(cur_dists, label_to_changes, 
                            from_change_to_label, ids_per_label, 
                            agglomerative=True, linkage=link)

print(best_eps)

print("_______________________________")
# Label the report with what is actually clustered here. The previous
# `print(i, dist_name)` reused variables left over from an earlier loop and
# therefore printed an unrelated n-gram size / metric name.
print("LCS distance over LASE edit scripts (agglomerative, linkage=" + link + ")")
print("_______________________________")

# n_clusters=None + distance_threshold: the number of clusters is determined by
# the threshold instead of being fixed in advance.
tufano_clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=None,
                                                    affinity='precomputed', 
                                                    linkage=link, 
                                                    compute_full_tree=True,
                                                    distance_threshold=best_eps).fit(cur_dists)

# Kept as the last expression so the cell still displays the returned value.
print_clustering_results_tufano_unique(tufano_clustering, cur_dists,
                        label_to_changes, from_change_to_label, ids_per_label,
                              without_outliers=True)


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=514), HTML(value='')))


Best eps: 0.5384615384615384
Best F-measure: 0.9147607798614508
0.5384615384615384
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 5/149 (3.356%)

Cohesion = 51.23827422152058
Separation = 2109.7280393234605
G1 = 275.49397659773774

Entropy = 0.0
Purity = 1.0
F-measure = 0.914761

Rand = 99.233%
Jaccard Index = 84.952%


(99.233, 0.9147607798614508)

In [178]:
# Exact-match distance matrix over the LASE edit scripts:
# 0.0 when two scripts are equal, 1.0 otherwise.
dists_for_equals_lase = []

for es1 in tqdm_notebook(edit_scripts_lase):
    # One matrix row: es1 compared against every script (itself included).
    cur_dists = [0.0 if es1 == other else 1.0 for other in edit_scripts_lase]
    dists_for_equals_lase.append(cur_dists)

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




In [179]:
# DBSCAN over the exact-match distance matrix.
cur_dists = dists_for_equals_lase

# Fixed radius instead of a searched one; the matrix is built from the two
# values 0.0 and 1.0, and 0.1 lies between them.
best_eps = 0.1

dbscan = sklearn.cluster.DBSCAN(metric='precomputed', min_samples=2, eps=best_eps)
tufano_clustering = dbscan.fit(cur_dists)

# Last expression of the cell, so its return value is displayed.
print_clustering_results_tufano_unique(
    tufano_clustering, cur_dists,
    label_to_changes, from_change_to_label, ids_per_label,
    without_outliers=True)

Number of clusters: 33
Number of outliers: 33/149 (22.148%)

Cohesion = 41.5
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.696348

Rand = 97.166%
Jaccard Index = 54.126%


(97.166, 0.696348059099737)

In [180]:
# Agglomerative clustering (average linkage) over the exact-match distance matrix.
cur_dists = dists_for_equals_lase
link = 'average'

# Fixed threshold instead of a searched one; the matrix is built from the two
# values 0.0 and 1.0, and 0.1 lies between them.
best_eps = 0.1

# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# confirm against the installed version before upgrading.
agglomerative = sklearn.cluster.AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=best_eps,
    linkage=link,
    affinity='precomputed',
    compute_full_tree=True,
)
tufano_clustering = agglomerative.fit(cur_dists)

# Last expression of the cell, so its return value is displayed.
print_clustering_results_tufano_unique(
    tufano_clustering, cur_dists,
    label_to_changes, from_change_to_label, ids_per_label,
    without_outliers=True)


Number of clusters: 33
Number of outliers: 33/149 (22.148%)

Cohesion = 41.5
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.696348

Rand = 97.166%
Jaccard Index = 54.126%


(97.166, 0.696348059099737)

In [181]:
# Agglomerative clustering (complete linkage) over the exact-match distance matrix.
cur_dists = dists_for_equals_lase
link = 'complete'

# Fixed threshold instead of a searched one; the matrix is built from the two
# values 0.0 and 1.0, and 0.1 lies between them.
best_eps = 0.1

# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# confirm against the installed version before upgrading.
agglomerative = sklearn.cluster.AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=best_eps,
    linkage=link,
    affinity='precomputed',
    compute_full_tree=True,
)
tufano_clustering = agglomerative.fit(cur_dists)

# Last expression of the cell, so its return value is displayed.
print_clustering_results_tufano_unique(
    tufano_clustering, cur_dists,
    label_to_changes, from_change_to_label, ids_per_label,
    without_outliers=True)


Number of clusters: 33
Number of outliers: 33/149 (22.148%)

Cohesion = 41.5
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.696348

Rand = 97.166%
Jaccard Index = 54.126%


(97.166, 0.696348059099737)

# LASE for fixed n (with context)

In [202]:
# Path handed to get_lase_hists / get_lase_hists_len in the cells below.
# Two result sets exist; exactly one assignment should be active, and the
# dependent cells must be re-run after switching between them.
# NOTE(review): absolute, machine-specific path - adjust for your environment.

# Variant "with context" (currently disabled):
#RESULTS_LASE = "/Volumes/Seagate/Alina/result_for_lase_with_context_nonconcat"

# Variant "without context" (currently active):
RESULTS_LASE = "/Volumes/Seagate/Alina/result_for_lase_wo_context_nonconcat"

In [203]:
# Fetch the LASE histograms for every n-gram order from 1 to 10 out of
# RESULTS_LASE. The calls run in ascending order of n and each result is bound
# to its own hists_lase_<n>gram name, which later cells refer to.
(
    hists_lase_1gram,
    hists_lase_2gram,
    hists_lase_3gram,
    hists_lase_4gram,
    hists_lase_5gram,
    hists_lase_6gram,
    hists_lase_7gram,
    hists_lase_8gram,
    hists_lase_9gram,
    hists_lase_10gram,
) = [get_lase_hists(RESULTS_LASE, str(order) + "gram") for order in range(1, 11)]

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




In [194]:
# with context
# Call get_lase_hists_len for every n-gram order from 1 to 10 (ascending) and
# bind each result to its own hist_len_<n>gram_lase name.
# NOTE(review): the values come from whichever RESULTS_LASE is active when the
# cell runs - confirm it points at the "with context" results first.
(
    hist_len_1gram_lase,
    hist_len_2gram_lase,
    hist_len_3gram_lase,
    hist_len_4gram_lase,
    hist_len_5gram_lase,
    hist_len_6gram_lase,
    hist_len_7gram_lase,
    hist_len_8gram_lase,
    hist_len_9gram_lase,
    hist_len_10gram_lase,
) = [get_lase_hists_len(RESULTS_LASE, str(order) + "gram") for order in range(1, 11)]

423
1221
1735
2017
2174
2263
2317
2347
2363
2377


In [204]:
# without context
# Call get_lase_hists_len for every n-gram order from 1 to 10 (ascending) and
# bind each result to its own hist_len_<n>gram_lase name.
# NOTE(review): the values come from whichever RESULTS_LASE is active when the
# cell runs - confirm it points at the "without context" results first.
(
    hist_len_1gram_lase,
    hist_len_2gram_lase,
    hist_len_3gram_lase,
    hist_len_4gram_lase,
    hist_len_5gram_lase,
    hist_len_6gram_lase,
    hist_len_7gram_lase,
    hist_len_8gram_lase,
    hist_len_9gram_lase,
    hist_len_10gram_lase,
) = [get_lase_hists_len(RESULTS_LASE, str(order) + "gram") for order in range(1, 11)]

325
1058
1607
1934
2130
2240
2303
2337
2353
2368


In [205]:
# Histograms indexed by n-gram order: hists_lase[n] holds the n-gram histograms.
hists_lase = [
    [],  # zero array: placeholder so that the index equals the n-gram order
    hists_lase_1gram,
    hists_lase_2gram,
    hists_lase_3gram,
    hists_lase_4gram,
    hists_lase_5gram,
    hists_lase_6gram,
    hists_lase_7gram,
    hists_lase_8gram,
    hists_lase_9gram,
    hists_lase_10gram,
]


In [206]:
# Results of get_lase_hists_len indexed by n-gram order, parallel to hists_lase:
# hists_lase_len[n] belongs to the n-gram histograms.
hists_lase_len = [
    [],  # zero array: placeholder so that the index equals the n-gram order
    hist_len_1gram_lase,
    hist_len_2gram_lase,
    hist_len_3gram_lase,
    hist_len_4gram_lase,
    hist_len_5gram_lase,
    hist_len_6gram_lase,
    hist_len_7gram_lase,
    hist_len_8gram_lase,
    hist_len_9gram_lase,
    hist_len_10gram_lase,
]


### DBSCAN

In [148]:
# DBSCAN over the full grid: every n-gram order (1..10) combined with every
# distance function in `distances`. For each combination the pairwise distance
# matrix is computed, eps is chosen by find_eps_with_brute_force_universal, and
# the resulting clustering is reported.
for i in range(1, 11):
    hists, hists_len = hists_lase[i], hists_lase_len[i]

    for j, dist in enumerate(distances):
        dist_name = distances_names[j]  # display name paired with `dist` by index

        cur_dists = get_dists(hists, dist, hists_len)

        # agglomerative=False - presumably selects the DBSCAN variant of the
        # eps search (TODO confirm against the helper's definition).
        best_eps = find_eps_with_brute_force_universal(
            cur_dists, label_to_changes, from_change_to_label, ids_per_label,
            agglomerative=False)
        print(best_eps)

        # The matrix already contains distances, hence metric='precomputed'.
        tufano_clustering = sklearn.cluster.DBSCAN(
            metric='precomputed',
            min_samples=2,
            eps=best_eps,
        )
        tufano_clustering.fit(cur_dists)  # fit() returns the same estimator object

        # Banner identifying the (n-gram order, distance) combination.
        print("_______________________________",
              "{} {}".format(i, dist_name),
              "_______________________________",
              sep="\n")
        print_clustering_results_tufano_unique(
            tufano_clustering, cur_dists,
            label_to_changes, from_change_to_label, ids_per_label,
            without_outliers=True)
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=544), HTML(value='')))


Best eps: 0.06149425287356314
Best F-measure: 0.18754906921092465
0.06149425287356314
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 5
Number of outliers: 1/149 (0.671%)

Cohesion = 37.89767092344934
Separation = 2367.1412008881625
G1 = 362.0127455303709

Entropy = 3.944602
Purity = 0.175676
F-measure = 0.187549

Rand = 17.834%
Jaccard Index = 5.658%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1076), HTML(value='')))


Best eps: 0.6005464480874317
Best F-measure: 0.9328815852305783
0.6005464480874317
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 4/149 (2.685%)

Cohesion = 49.49948347777997
Separation = 1730.2662643789113
G1 = 230.7760864177212

Entropy = 0.0
Purity = 1.0
F-measure = 0.932882

Rand = 99.282%
Jaccard Index = 85.902%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1253), HTML(value='')))


Best eps: 0.3205501306462569
Best F-measure: 0.8821368042844551
0.3205501306462569
_______________________________
1 cos_distance
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 54.46126388428483
Separation = 4126.099638440328
G1 = 443.48639167880657

Entropy = 0.142666
Purity = 0.951389
F-measure = 0.882137

Rand = 98.902%
Jaccard Index = 80.0%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1558), HTML(value='')))


Best eps: 0.3406681271408
Best F-measure: 0.8783017132010418
0.3406681271408
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 27
Number of outliers: 6/149 (4.027%)

Cohesion = 64.18946056544598
Separation = 6858.277699557816
G1 = 667.4800889603648

Entropy = 0.143664
Purity = 0.951049
F-measure = 0.878302

Rand = 98.887%
Jaccard Index = 79.893%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=203), HTML(value='')))


Best eps: 0.015625
Best F-measure: 0.11057332077788631
0.015625
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 3
Number of outliers: 0/149 (0.0%)

Cohesion = 22.862402628813992
Separation = 323.68951069206486
G1 = 70.76882076780313

Entropy = 4.272667
Purity = 0.127517
F-measure = 0.110573

Rand = 9.36%
Jaccard Index = 4.765%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=686), HTML(value='')))


Best eps: 0.8390151515151515
Best F-measure: 0.9551374847759344
0.8390151515151515
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 4/149 (2.685%)

Cohesion = 45.35907616356663
Separation = 371.3886722462594
G1 = 34.65925140085188

Entropy = 0.037218
Purity = 0.993103
F-measure = 0.955137

Rand = 99.425%
Jaccard Index = 88.991%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=786), HTML(value='')))


Best eps: 0.26720927372085956
Best F-measure: 0.8957194185382102
0.26720927372085956
_______________________________
2 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 52.70058172491153
Separation = 1029.8172844707979
G1 = 162.39488323205785

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1405), HTML(value='')))


Best eps: 0.286793389660616
Best F-measure: 0.8957194185382102
0.286793389660616
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 8/149 (5.369%)

Cohesion = 61.106194920761595
Separation = 4378.783354773734
G1 = 536.4625804548726

Entropy = 0.0
Purity = 1.0
F-measure = 0.895719

Rand = 99.291%
Jaccard Index = 86.22%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Best eps: 0.00877192982456143
Best F-measure: 0.33475698397610937
0.00877192982456143
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 7
Number of outliers: 1/149 (0.671%)

Cohesion = 26.837739110625414
Separation = 150.73596872798373
G1 = 29.224280628526092

Entropy = 3.096442
Purity = 0.351351
F-measure = 0.334757

Rand = 48.419%
Jaccard Index = 8.332%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=373), HTML(value='')))


Best eps: 0.8913043478260869
Best F-measure: 0.9514154004086888
0.8913043478260869
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 44.44295008104857
Separation = 81.12025092175278
G1 = 11.119286767376304

Entropy = 0.0
Purity = 1.0
F-measure = 0.951415

Rand = 99.573%
Jaccard Index = 91.619%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=397), HTML(value='')))


Best eps: 0.5913731760376033
Best F-measure: 0.9176753149907511
0.5913731760376033
_______________________________
3 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 51.6609795895483
Separation = 230.96575134037224
G1 = 35.895708945637466

Entropy = 0.0
Purity = 1.0
F-measure = 0.917675

Rand = 99.34%
Jaccard Index = 87.091%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1287), HTML(value='')))


Best eps: 0.626619061320404
Best F-measure: 0.9176753149907511
0.626619061320404
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 60.71099660333396
Separation = 4228.335445823746
G1 = 456.4386894274224

Entropy = 0.0
Purity = 1.0
F-measure = 0.917675

Rand = 99.34%
Jaccard Index = 87.091%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=39), HTML(value='')))


Best eps: 0.006249999999999978
Best F-measure: 0.5993923291968911
0.006249999999999978
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 15
Number of outliers: 2/149 (1.342%)

Cohesion = 39.91828022066075
Separation = 125.39397959028473
G1 = 4.0217443887813165

Entropy = 1.62672
Purity = 0.605442
F-measure = 0.599392

Rand = 78.315%
Jaccard Index = 18.035%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=195), HTML(value='')))


Best eps: 0.9259259259259259
Best F-measure: 0.9393348634959371
0.9259259259259259
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 44.194655387832604
Separation = 30.300063839308958
G1 = 4.524799351058689

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=201), HTML(value='')))


Best eps: 0.8425275552669533
Best F-measure: 0.9393348634959371
0.8425275552669533
_______________________________
4 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 49.86818632372269
Separation = 75.26103150206298
G1 = 9.87697989351624

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1145), HTML(value='')))


Best eps: 0.871305279372233
Best F-measure: 0.9393348634959371
0.871305279372233
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 60.143062549753054
Separation = 5038.362253578839
G1 = 499.35951546775857

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=19), HTML(value='')))


Best eps: 0.14015151515151525
Best F-measure: 0.6583734476646241
0.14015151515151525
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 16
Number of outliers: 3/149 (2.013%)

Cohesion = 39.22782537796395
Separation = 96.73456691771534
G1 = 1.4510608180543536

Entropy = 1.31906
Purity = 0.671233
F-measure = 0.658373

Rand = 83.987%
Jaccard Index = 23.199%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=119), HTML(value='')))


Best eps: 0.9659090909090909
Best F-measure: 0.9357817644039512
0.9659090909090909
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 4/149 (2.685%)

Cohesion = 40.444891835032095
Separation = 7.791489604153405
G1 = 0.8176395182346307

Entropy = 0.094996
Purity = 0.965517
F-measure = 0.935782

Rand = 99.08%
Jaccard Index = 83.505%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=117), HTML(value='')))


Best eps: 0.8804771390665607
Best F-measure: 0.9393348634959371
0.8804771390665607
_______________________________
5 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 48.91924004501355
Separation = 27.63604335879712
G1 = 3.5232467414405138

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1131), HTML(value='')))


Best eps: 0.89341911822236
Best F-measure: 0.9393348634959371
0.89341911822236
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 59.5030735840899
Separation = 5180.599488370867
G1 = 535.6192254563935

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


Best eps: 0.25
Best F-measure: 0.8752902523628935
0.25
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 23
Number of outliers: 4/149 (2.685%)

Cohesion = 53.70788399784878
Separation = 64.61108997283318
G1 = 2.4

Entropy = 0.27321
Purity = 0.903448
F-measure = 0.87529

Rand = 98.362%
Jaccard Index = 73.488%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=77), HTML(value='')))


Best eps: 0.9431818181818182
Best F-measure: 0.9310981337826973
0.9431818181818182
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 42.65248908890591
Separation = 3.281066545310244
G1 = 0.5476101126079217

Entropy = 0.0
Purity = 1.0
F-measure = 0.931098

Rand = 99.429%
Jaccard Index = 88.825%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=80), HTML(value='')))


Best eps: 0.8707280775012451
Best F-measure: 0.9310981337826973
0.8707280775012451
_______________________________
6 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 48.12780676128298
Separation = 13.89679352908097
G1 = 1.284130611712762

Entropy = 0.0
Purity = 1.0
F-measure = 0.931098

Rand = 99.429%
Jaccard Index = 88.825%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1135), HTML(value='')))


Best eps: 0.946965820739672
Best F-measure: 0.9393348634959371
0.946965820739672
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 58.76361807512263
Separation = 5371.728605538601
G1 = 570.8407795328625

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))


Best eps: 0.25
Best F-measure: 0.8759452329922127
0.25
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 25
Number of outliers: 5/149 (3.356%)

Cohesion = 54.10335820895523
Separation = 64.61108997283318
G1 = 2.4

Entropy = 0.20475
Purity = 0.923611
F-measure = 0.875945

Rand = 98.592%
Jaccard Index = 76.151%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=72), HTML(value='')))


Best eps: 0.9545454545454546
Best F-measure: 0.9310981337826973
0.9545454545454546
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 41.7441074428466
Separation = 1.6554801762908646
G1 = 0.15780018525017916

Entropy = 0.0
Purity = 1.0
F-measure = 0.931098

Rand = 99.429%
Jaccard Index = 88.825%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=72), HTML(value='')))


Best eps: 0.8915347710906719
Best F-measure: 0.9310981337826973
0.8915347710906719
_______________________________
7 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 47.06830206051663
Separation = 8.718942567161607
G1 = 0.5738746989503202

Entropy = 0.0
Purity = 1.0
F-measure = 0.931098

Rand = 99.429%
Jaccard Index = 88.825%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1083), HTML(value='')))


Best eps: 0.917406103951721
Best F-measure: 0.9310981337826973
0.917406103951721
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 57.624734569722634
Separation = 5304.867886904268
G1 = 586.3546421609873

Entropy = 0.0
Purity = 1.0
F-measure = 0.931098

Rand = 99.429%
Jaccard Index = 88.825%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))


Best eps: 0.19999999999999996
Best F-measure: 0.9059303485477981
0.19999999999999996
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 55.833345358345355
Separation = 64.61108997283318
G1 = 2.4

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=57), HTML(value='')))


Best eps: 0.9659090909090909
Best F-measure: 0.9059303485477981
0.9659090909090909
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 43.758888805089235
Separation = 0.49832898957876376
G1 = 0.0278696818198479

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=57), HTML(value='')))


Best eps: 0.9144601077231699
Best F-measure: 0.9059303485477981
0.9144601077231699
_______________________________
8 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 48.57939177035384
Separation = 3.9562195402873916
G1 = 0.2075871620266279

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1072), HTML(value='')))


Best eps: 0.968300842229172
Best F-measure: 0.9059303485477981
0.968300842229172
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 58.971210853210906
Separation = 5299.696445572124
G1 = 613.2361491651543

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))


Best eps: 0.125
Best F-measure: 0.9059303485477981
0.125
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 55.87903041716601
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=49), HTML(value='')))


Best eps: 0.9772727272727273
Best F-measure: 0.9059303485477981
0.9772727272727273
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 43.41332165837684
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Best eps: 0.9398615220675737
Best F-measure: 0.9059303485477981
0.9398615220675737
_______________________________
9 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 48.21060525890796
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1026), HTML(value='')))


Best eps: 0.965498308740846
Best F-measure: 0.8969818026865004
0.965498308740846
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.86224191708861
Separation = 5354.156671053316
G1 = 712.2117886852354

Entropy = 0.0
Purity = 1.0
F-measure = 0.896982

Rand = 98.857%
Jaccard Index = 77.649%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


Best eps: 0.10227272727272729
Best F-measure: 0.9059303485477981
0.10227272727272729
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 55.96590909090909
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))


Best eps: 0.9886363636363636
Best F-measure: 0.9059303485477981
0.9886363636363636
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 43.06110817398274
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))


Best eps: 0.9681857618512111
Best F-measure: 0.9059303485477981
0.9681857618512111
_______________________________
10 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 47.82635208852346
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=975), HTML(value='')))


Best eps: 0.993917725454889
Best F-measure: 0.8969818026865004
0.993917725454889
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.50957612773061
Separation = 5429.1372065148125
G1 = 740.7917342391871

Entropy = 0.0
Purity = 1.0
F-measure = 0.896982

Rand = 98.857%
Jaccard Index = 77.649%


In [143]:
hists = hists_lase[4]
hists_len = hists_lase_len[4]

cur_dists = np.array(get_dists(hists, pearsons_correlation_mean, hists_len))

min(cur_dists.flatten())
cur_dists

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))




array([[0.        , 0.        , 0.65792061, ..., 1.01759491, 1.01759491,
        1.01759491],
       [0.        , 0.        , 0.65792061, ..., 1.01759491, 1.01759491,
        1.01759491],
       [0.65792061, 0.65792061, 0.        , ..., 1.0239148 , 1.0239148 ,
        1.0239148 ],
       ...,
       [1.01759491, 1.01759491, 1.0239148 , ..., 0.        , 0.        ,
        0.        ],
       [1.01759491, 1.01759491, 1.0239148 , ..., 0.        , 0.        ,
        0.        ],
       [1.01759491, 1.01759491, 1.0239148 , ..., 0.        , 0.        ,
        0.        ]])

### HAC average

In [200]:
# Average-linkage agglomerative clustering over the full grid: every n-gram
# order (1..10) combined with every distance function in `distances`. For each
# combination the distance threshold is chosen by
# find_eps_with_brute_force_universal and the resulting clustering is reported.
for i in range(1, 11):
    hists, hists_len = hists_lase[i], hists_lase_len[i]

    for j, dist in enumerate(distances):
        dist_name = distances_names[j]  # display name paired with `dist` by index

        cur_dists = get_dists(hists, dist, hists_len)
        link = 'average'

        # with_step=0.005 - presumably the step of the threshold search
        # (TODO confirm against the helper's definition).
        best_eps = find_eps_with_brute_force_universal(
            cur_dists, label_to_changes, from_change_to_label, ids_per_label,
            agglomerative=True, linkage=link, with_step=0.005)
        print(best_eps)

        # Banner identifying the (n-gram order, distance) combination.
        print("_______________________________",
              "{} {}".format(i, dist_name),
              "_______________________________",
              sep="\n")

        # With a distance threshold scikit-learn requires n_clusters=None and
        # the full merge tree.
        tufano_clustering = sklearn.cluster.AgglomerativeClustering(
            distance_threshold=best_eps,
            n_clusters=None,
            compute_full_tree=True,
            linkage=link,
            affinity='precomputed',
        )
        tufano_clustering.fit(cur_dists)  # fit() returns the same estimator object

        print_clustering_results_tufano_unique(
            tufano_clustering, cur_dists,
            label_to_changes, from_change_to_label, ids_per_label,
            without_outliers=True)
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.03
Best F-measure: 0.5703295859730917
0.03
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 23
Number of outliers: 5/149 (3.356%)

Cohesion = 60.31539412669273
Separation = 14276.9871916422
G1 = 1477.6441713998092

Entropy = 1.18215
Purity = 0.590278
F-measure = 0.57033

Rand = 93.056%
Jaccard Index = 26.213%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.705
Best F-measure: 0.9352678641269244
0.705
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 26
Number of outliers: 3/149 (2.013%)

Cohesion = 47.73324417810008
Separation = 1773.5763188071564
G1 = 199.68717794822118

Entropy = 0.083321
Purity = 0.965753
F-measure = 0.935268

Rand = 99.301%
Jaccard Index = 86.738%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.39
Best F-measure: 0.8903735339976948
0.39
_______________________________
1 cos_distance
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 54.71162604573232
Separation = 4212.690570034269
G1 = 429.43434157181025

Entropy = 0.141683
Purity = 0.951724
F-measure = 0.890374

Rand = 98.879%
Jaccard Index = 79.51%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=236), HTML(value='')))


Best eps: 0.405
Best F-measure: 0.8916838567845277
0.405
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 4/149 (2.685%)

Cohesion = 64.94678548549366
Separation = 7177.122233137655
G1 = 715.5852519310482

Entropy = 0.069943
Purity = 0.972414
F-measure = 0.891684

Rand = 99.061%
Jaccard Index = 81.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.28500000000000003
Best F-measure: 0.6163715115426913
0.28500000000000003
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 15
Number of outliers: 1/149 (0.671%)

Cohesion = 63.00693115876287
Separation = 7296.1167008906905
G1 = 187.18555864820834

Entropy = 1.355165
Purity = 0.621622
F-measure = 0.616372

Rand = 93.519%
Jaccard Index = 33.927%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.835
Best F-measure: 0.9514154004086888
0.835
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 45.82744853918298
Separation = 369.6091484652511
G1 = 43.6196397263168

Entropy = 0.0
Purity = 1.0
F-measure = 0.951415

Rand = 99.573%
Jaccard Index = 91.619%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.42
Best F-measure: 0.8995545096216235
0.42
_______________________________
2 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 53.17406295837094
Separation = 1083.0549639119888
G1 = 161.00553316475575

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=231), HTML(value='')))


Best eps: 0.46
Best F-measure: 0.8995545096216235
0.46
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 61.68742077641115
Separation = 4515.652553922231
G1 = 526.0009573462897

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.19
Best F-measure: 0.7397881994602065
0.19
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 22
Number of outliers: 1/149 (0.671%)

Cohesion = 61.70118829511707
Separation = 2523.9365511186807
G1 = 178.30110201625007

Entropy = 0.748219
Purity = 0.75
F-measure = 0.739788

Rand = 96.406%
Jaccard Index = 49.613%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.905
Best F-measure: 0.9514154004086888
0.905
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 44.34771198581047
Separation = 84.93630696385783
G1 = 11.894689688249107

Entropy = 0.0
Purity = 1.0
F-measure = 0.951415

Rand = 99.531%
Jaccard Index = 90.789%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.86
Best F-measure: 0.9311705445933633
0.86
_______________________________
3 cos_distance
_______________________________
Number of clusters: 24
Number of outliers: 4/149 (2.685%)

Cohesion = 48.862625686513816
Separation = 224.82882550252623
G1 = 16.569813575206442

Entropy = 0.108341
Purity = 0.958621
F-measure = 0.931171

Rand = 99.272%
Jaccard Index = 86.549%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=233), HTML(value='')))


Best eps: 0.9
Best F-measure: 0.9406930394691939
0.9
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 24
Number of outliers: 4/149 (2.685%)

Cohesion = 57.89377301125349
Separation = 4493.24789822155
G1 = 300.84974437209115

Entropy = 0.108341
Purity = 0.958621
F-measure = 0.940693

Rand = 99.272%
Jaccard Index = 86.549%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.12
Best F-measure: 0.8387903505511513
0.12
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 24
Number of outliers: 2/149 (1.342%)

Cohesion = 60.89582749898396
Separation = 954.3702234095006
G1 = 88.39982313648959

Entropy = 0.41967
Purity = 0.857143
F-measure = 0.83879

Rand = 97.465%
Jaccard Index = 63.243%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.97
Best F-measure: 0.9489836040171609
0.97
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 4/149 (2.685%)

Cohesion = 42.540230340695125
Separation = 26.030178188521838
G1 = 2.1084303618013163

Entropy = 0.047563
Purity = 0.97931
F-measure = 0.948984

Rand = 99.473%
Jaccard Index = 89.89%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.855
Best F-measure: 0.9393348634959371
0.855
_______________________________
4 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 49.86818632372269
Separation = 75.26103150206298
G1 = 9.87697989351624

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=237), HTML(value='')))


Best eps: 0.975
Best F-measure: 0.9560682682672415
0.975
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 25
Number of outliers: 4/149 (2.685%)

Cohesion = 58.18892041753636
Separation = 5109.066181694589
G1 = 370.17665271651566

Entropy = 0.060779
Purity = 0.97931
F-measure = 0.956068

Rand = 99.387%
Jaccard Index = 88.427%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.335
Best F-measure: 0.865671738402222
0.335
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 24
Number of outliers: 4/149 (2.685%)

Cohesion = 59.19882174357967
Separation = 368.44344279299696
G1 = 17.718638316450992

Entropy = 0.283163
Purity = 0.896552
F-measure = 0.865672

Rand = 98.017%
Jaccard Index = 68.919%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.98
Best F-measure: 0.9393348634959371
0.98
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 43.20726635167965
Separation = 9.660791158286045
G1 = 1.7282053568116604

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.885
Best F-measure: 0.9212140581268096
0.885
_______________________________
5 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 49.59233344399091
Separation = 30.209138451874924
G1 = 4.215485522599901

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.965
Best F-measure: 0.9393348634959371
0.965
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 59.47521037357898
Separation = 5299.4092403292925
G1 = 541.2179088764564

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.01
Best F-measure: 0.9094910094027012
0.01
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 27
Number of outliers: 6/149 (4.027%)

Cohesion = 57.988353733512184
Separation = 166.1301899090104
G1 = 11.429408586672544

Entropy = 0.086465
Purity = 0.972028
F-measure = 0.909491

Rand = 99.163%
Jaccard Index = 84.023%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.985
Best F-measure: 0.9393348634959371
0.985
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 42.48243818113299
Separation = 3.702935534361583
G1 = 0.6992556247272101

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.985
Best F-measure: 0.9151031432557392
0.985
_______________________________
6 cos_distance
_______________________________
Number of clusters: 25
Number of outliers: 4/149 (2.685%)

Cohesion = 45.224804538321564
Separation = 2.0528996425902433
G1 = 0.18907929172969526

Entropy = 0.130651
Purity = 0.937931
F-measure = 0.915103

Rand = 98.956%
Jaccard Index = 81.304%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.975
Best F-measure: 0.9393348634959371
0.975
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 58.826172421762074
Separation = 5499.853576627753
G1 = 561.1596022553347

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.01
Best F-measure: 0.8992558679595061
0.01
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 57.495024875621894
Separation = 126.98809058352062
G1 = 8.678571428571427

Entropy = 0.075502
Purity = 0.972028
F-measure = 0.899256

Rand = 99.104%
Jaccard Index = 82.959%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.99
Best F-measure: 0.9393348634959371
0.99
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 41.72429688998654
Separation = 1.6554801762908646
G1 = 0.15780018525017916

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.895
Best F-measure: 0.9129773284135698
0.895
_______________________________
7 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 47.76100379760361
Separation = 10.467965844260224
G1 = 0.9708257391386355

Entropy = 0.0
Purity = 1.0
F-measure = 0.912977

Rand = 99.34%
Jaccard Index = 87.091%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 1.0
Best F-measure: 0.9393348634959371
1.0
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 57.68287558997319
Separation = 5437.332717968311
G1 = 572.2827004788942

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.01
Best F-measure: 0.8878095431786706
0.01
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 55.99470899470899
Separation = 70.58237126494339
G1 = 4.533333333333333

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.851%
Jaccard Index = 77.495%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.99
Best F-measure: 0.9059303485477981
0.99
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 43.758888805089235
Separation = 0.49832898957876376
G1 = 0.0278696818198479

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.965
Best F-measure: 0.9059303485477981
0.965
_______________________________
8 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 48.57939177035384
Separation = 3.9562195402873916
G1 = 0.2075871620266279

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 1.0
Best F-measure: 0.9059303485477981
1.0
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 58.971210853210906
Separation = 5299.696445572124
G1 = 613.2361491651543

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.72
Best F-measure: 0.9059303485477981
0.72
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 55.87903041716601
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.99
Best F-measure: 0.9059303485477981
0.99
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 43.41332165837684
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.975
Best F-measure: 0.9059303485477981
0.975
_______________________________
9 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 48.21060525890796
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.97
Best F-measure: 0.8788609973173729
0.97
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 6/149 (4.027%)

Cohesion = 59.51793788581148
Separation = 5305.168798665568
G1 = 744.6856529342381

Entropy = 0.0
Purity = 1.0
F-measure = 0.878861

Rand = 98.769%
Jaccard Index = 75.915%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.67
Best F-measure: 0.9059303485477981
0.67
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 55.96590909090909
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.995
Best F-measure: 0.9059303485477981
0.995
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 43.06110817398274
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.985
Best F-measure: 0.9059303485477981
0.985
_______________________________
10 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 47.82635208852346
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.995
Best F-measure: 0.8788609973173729
0.995
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 6/149 (4.027%)

Cohesion = 59.17319489948217
Separation = 5377.3618767029075
G1 = 773.5526816999444

Entropy = 0.0
Purity = 1.0
F-measure = 0.878861

Rand = 98.769%
Jaccard Index = 75.915%


In [198]:
# Average-linkage agglomerative clustering for histogram sets 1..10, using
# only the distance registered at index 3 of `distances` / `distances_names`.
# All names are kept at cell level on purpose: they remain kernel globals.
for i in range(1, 11):
    link = 'average'
    hists, hists_len = hists_lase[i], hists_lase_len[i]
    dist, dist_name = distances[3], distances_names[3]

    # Distances for the current histogram set; the result is fed to the
    # estimator below as a precomputed matrix.
    cur_dists = get_dists(hists, dist, hists_len)

    # Brute-force search for the threshold (explicit step of 0.005); the
    # returned value is used as `distance_threshold` further down.
    best_eps = find_eps_with_brute_force_universal(
        cur_dists,
        label_to_changes,
        from_change_to_label,
        ids_per_label,
        agglomerative=True,
        linkage=link,
        with_step=0.005,
    )
    print(best_eps)

    # Banner identifying the run: "<set index> <distance name>".
    print(
        "_______________________________",
        "{} {}".format(i, dist_name),
        "_______________________________",
        sep="\n",
    )

    # n_clusters=None together with distance_threshold lets the threshold
    # decide where the tree is cut.
    # NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
    # confirm against the installed version before upgrading.
    tufano_clustering = sklearn.cluster.AgglomerativeClustering(
        n_clusters=None,
        affinity='precomputed',
        linkage=link,
        compute_full_tree=True,
        distance_threshold=best_eps,
    ).fit(cur_dists)

    # Report the quality metrics for this clustering.
    print_clustering_results_tufano_unique(
        tufano_clustering,
        cur_dists,
        label_to_changes,
        from_change_to_label,
        ids_per_label,
        without_outliers=True,
    )



HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=236), HTML(value='')))


Best eps: 0.405
Best F-measure: 0.8916838567845277
0.405
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 4/149 (2.685%)

Cohesion = 64.94678548549366
Separation = 7177.122233137655
G1 = 715.5852519310482

Entropy = 0.069943
Purity = 0.972414
F-measure = 0.891684

Rand = 99.061%
Jaccard Index = 81.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=231), HTML(value='')))


Best eps: 0.46
Best F-measure: 0.8995545096216235
0.46
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 61.68742077641115
Separation = 4515.652553922231
G1 = 526.0009573462897

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=233), HTML(value='')))


Best eps: 0.9
Best F-measure: 0.9406930394691939
0.9
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 24
Number of outliers: 4/149 (2.685%)

Cohesion = 57.89377301125349
Separation = 4493.24789822155
G1 = 300.84974437209115

Entropy = 0.108341
Purity = 0.958621
F-measure = 0.940693

Rand = 99.272%
Jaccard Index = 86.549%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=237), HTML(value='')))


Best eps: 0.975
Best F-measure: 0.9560682682672415
0.975
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 25
Number of outliers: 4/149 (2.685%)

Cohesion = 58.18892041753636
Separation = 5109.066181694589
G1 = 370.17665271651566

Entropy = 0.060779
Purity = 0.97931
F-measure = 0.956068

Rand = 99.387%
Jaccard Index = 88.427%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.965
Best F-measure: 0.9393348634959371
0.965
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 59.47521037357898
Separation = 5299.4092403292925
G1 = 541.2179088764564

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.975
Best F-measure: 0.9393348634959371
0.975
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 58.826172421762074
Separation = 5499.853576627753
G1 = 561.1596022553347

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 1.0
Best F-measure: 0.9393348634959371
1.0
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 57.68287558997319
Separation = 5437.332717968311
G1 = 572.2827004788942

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 1.0
Best F-measure: 0.9059303485477981
1.0
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 58.971210853210906
Separation = 5299.696445572124
G1 = 613.2361491651543

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.97
Best F-measure: 0.8788609973173729
0.97
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 6/149 (4.027%)

Cohesion = 59.51793788581148
Separation = 5305.168798665568
G1 = 744.6856529342381

Entropy = 0.0
Purity = 1.0
F-measure = 0.878861

Rand = 98.769%
Jaccard Index = 75.915%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.995
Best F-measure: 0.8788609973173729
0.995
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 6/149 (4.027%)

Cohesion = 59.17319489948217
Separation = 5377.3618767029075
G1 = 773.5526816999444

Entropy = 0.0
Purity = 1.0
F-measure = 0.878861

Rand = 98.769%
Jaccard Index = 75.915%


### HAC complete

In [149]:
# Complete-linkage agglomerative clustering for every histogram set (1..10)
# combined with every distance listed in `distances`.
# All names are kept at cell level on purpose: they remain kernel globals.
for i in range(1, 11):
    hists, hists_len = hists_lase[i], hists_lase_len[i]

    for j in range(len(distances)):
        link = 'complete'
        dist, dist_name = distances[j], distances_names[j]

        # Distances for the current histogram set and distance function; the
        # result is fed to the estimator below as a precomputed matrix.
        cur_dists = get_dists(hists, dist, hists_len)

        # Brute-force search for the threshold (default step of the helper);
        # the returned value is used as `distance_threshold` further down.
        best_eps = find_eps_with_brute_force_universal(
            cur_dists,
            label_to_changes,
            from_change_to_label,
            ids_per_label,
            agglomerative=True,
            linkage=link,
        )
        print(best_eps)

        # Banner identifying the run: "<set index> <distance name>".
        print(
            "_______________________________",
            "{} {}".format(i, dist_name),
            "_______________________________",
            sep="\n",
        )

        # n_clusters=None together with distance_threshold lets the threshold
        # decide where the tree is cut.
        # NOTE(review): newer scikit-learn releases rename `affinity` to
        # `metric`; confirm against the installed version before upgrading.
        tufano_clustering = sklearn.cluster.AgglomerativeClustering(
            n_clusters=None,
            affinity='precomputed',
            linkage=link,
            compute_full_tree=True,
            distance_threshold=best_eps,
        ).fit(cur_dists)

        # Report the quality metrics for this clustering.
        print_clustering_results_tufano_unique(
            tufano_clustering,
            cur_dists,
            label_to_changes,
            from_change_to_label,
            ids_per_label,
            without_outliers=True,
        )
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=544), HTML(value='')))


Best eps: 0.25396825396825395
Best F-measure: 0.5510879027963193
0.25396825396825395
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 19
Number of outliers: 0/149 (0.0%)

Cohesion = 64.26128686741748
Separation = 14489.14902811083
G1 = 711.9665533803262

Entropy = 1.321847
Purity = 0.543624
F-measure = 0.551088

Rand = 92.708%
Jaccard Index = 25.07%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1076), HTML(value='')))


Best eps: 0.6553987678987678
Best F-measure: 0.9328815852305783
0.6553987678987678
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 4/149 (2.685%)

Cohesion = 49.49948347777997
Separation = 1730.2662643789113
G1 = 230.7760864177212

Entropy = 0.0
Purity = 1.0
F-measure = 0.932882

Rand = 99.282%
Jaccard Index = 85.902%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1253), HTML(value='')))


Best eps: 0.37911933371067863
Best F-measure: 0.8943684205429169
0.37911933371067863
_______________________________
1 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 54.59327485687898
Separation = 4184.165814038469
G1 = 479.32481860912384

Entropy = 0.041667
Purity = 0.979167
F-measure = 0.894368

Rand = 99.136%
Jaccard Index = 83.333%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1558), HTML(value='')))


Best eps: 0.416498167242602
Best F-measure: 0.8943684205429169
0.416498167242602
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 64.5979538864644
Separation = 7038.212157088338
G1 = 710.7880860062429

Entropy = 0.041667
Purity = 0.979167
F-measure = 0.894368

Rand = 99.136%
Jaccard Index = 83.333%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=203), HTML(value='')))


Best eps: 0.25
Best F-measure: 0.603175248171386
0.25
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 24
Number of outliers: 1/149 (0.671%)

Cohesion = 61.8017717576541
Separation = 8426.705919195749
G1 = 611.7566005118257

Entropy = 1.067253
Purity = 0.668919
F-measure = 0.603175

Rand = 94.972%
Jaccard Index = 31.88%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=686), HTML(value='')))


Best eps: 0.8407960199004975
Best F-measure: 0.9393348634959371
0.8407960199004975
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 46.2371645175318
Separation = 380.56747984879735
G1 = 48.42343676761458

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=786), HTML(value='')))


Best eps: 0.7228630022631534
Best F-measure: 0.9098046621536552
0.7228630022631534
_______________________________
2 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 3/149 (2.013%)

Cohesion = 53.0240713476664
Separation = 1151.8194043177052
G1 = 132.44872499991854

Entropy = 0.069464
Purity = 0.972603
F-measure = 0.909805

Rand = 99.112%
Jaccard Index = 82.909%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1405), HTML(value='')))


Best eps: 0.482708956973963
Best F-measure: 0.8995545096216235
0.482708956973963
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 61.68742077641115
Separation = 4515.652553922231
G1 = 526.0009573462897

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Best eps: 0.28409090909090906
Best F-measure: 0.7474476567429587
0.28409090909090906
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 24
Number of outliers: 2/149 (1.342%)

Cohesion = 61.34161807066921
Separation = 2697.1556423418438
G1 = 193.0535431343122

Entropy = 0.647249
Purity = 0.782313
F-measure = 0.747448

Rand = 96.366%
Jaccard Index = 46.502%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=373), HTML(value='')))


Best eps: 0.9390243902439024
Best F-measure: 0.9514154004086888
0.9390243902439024
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 44.34771198581047
Separation = 84.93630696385783
G1 = 11.894689688249107

Entropy = 0.0
Purity = 1.0
F-measure = 0.951415

Rand = 99.531%
Jaccard Index = 90.789%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=397), HTML(value='')))


Best eps: 0.8678247357136426
Best F-measure: 0.9388554771105103
0.8678247357136426
_______________________________
3 cos_distance
_______________________________
Number of clusters: 26
Number of outliers: 4/149 (2.685%)

Cohesion = 50.14274977799927
Separation = 243.87992741946297
G1 = 24.27114013892265

Entropy = 0.041379
Purity = 0.97931
F-measure = 0.938855

Rand = 99.33%
Jaccard Index = 87.061%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1287), HTML(value='')))


Best eps: 0.931367498647144
Best F-measure: 0.950936014023262
0.931367498647144
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 25
Number of outliers: 4/149 (2.685%)

Cohesion = 58.764301410040375
Separation = 4507.921493061128
G1 = 333.63236744340423

Entropy = 0.041379
Purity = 0.97931
F-measure = 0.950936

Rand = 99.502%
Jaccard Index = 90.388%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=39), HTML(value='')))


Best eps: 0.44444444444444453
Best F-measure: 0.8054137670728615
0.44444444444444453
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 25
Number of outliers: 2/149 (1.342%)

Cohesion = 60.4815019527462
Separation = 1178.9762254615075
G1 = 95.16952686169947

Entropy = 0.457266
Purity = 0.843537
F-measure = 0.805414

Rand = 97.083%
Jaccard Index = 56.588%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=195), HTML(value='')))


Best eps: 0.9767195767195768
Best F-measure: 0.9393348634959371
0.9767195767195768
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 3/149 (2.013%)

Cohesion = 44.019628778901094
Separation = 32.458407906889086
G1 = 4.878723976528333

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.31%
Jaccard Index = 86.481%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=201), HTML(value='')))


Best eps: 0.9410744349011211
Best F-measure: 0.9259120447039907
0.9410744349011211
_______________________________
4 cos_distance
_______________________________
Number of clusters: 27
Number of outliers: 3/149 (2.013%)

Cohesion = 49.05977567201703
Separation = 80.31629324704139
G1 = 9.33235950115091

Entropy = 0.041096
Purity = 0.979452
F-measure = 0.925912

Rand = 99.225%
Jaccard Index = 85.064%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1145), HTML(value='')))


Best eps: 0.977927217615159
Best F-measure: 0.9393348634959371
0.977927217615159
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 60.143062549753054
Separation = 5038.362253578839
G1 = 499.35951546775857

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=19), HTML(value='')))


Best eps: 0.125
Best F-measure: 0.8648825946600581
0.125
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 58.95966787332358
Separation = 441.00181943223856
G1 = 45.032552281178106

Entropy = 0.226165
Purity = 0.917241
F-measure = 0.864883

Rand = 98.075%
Jaccard Index = 68.981%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=119), HTML(value='')))


Best eps: 0.9577464788732394
Best F-measure: 0.9212140581268096
0.9577464788732394
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 43.983608977554994
Separation = 9.574566695151779
G1 = 1.5579071220024885

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=117), HTML(value='')))


Best eps: 0.8846020473578892
Best F-measure: 0.9212140581268096
0.8846020473578892
_______________________________
5 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 49.59233344399091
Separation = 30.209138451874924
G1 = 4.215485522599901

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1131), HTML(value='')))


Best eps: 0.92588986113108
Best F-measure: 0.9212140581268096
0.92588986113108
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 60.127209598294336
Separation = 5141.112420018805
G1 = 566.841662082576

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


Best eps: 0.13257575757575768
Best F-measure: 0.9094910094027012
0.13257575757575768
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 27
Number of outliers: 6/149 (4.027%)

Cohesion = 57.988353733512184
Separation = 166.1301899090104
G1 = 11.429408586672544

Entropy = 0.086465
Purity = 0.972028
F-measure = 0.909491

Rand = 99.163%
Jaccard Index = 84.023%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=77), HTML(value='')))


Best eps: 0.9775280898876404
Best F-measure: 0.9212140581268096
0.9775280898876404
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 43.13213770211
Separation = 4.072851677914067
G1 = 0.734540956805673

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=80), HTML(value='')))


Best eps: 0.9430197117701811
Best F-measure: 0.9212140581268096
0.9430197117701811
_______________________________
6 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 48.68374329752536
Separation = 16.209716527720744
G1 = 1.784115760702595

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1135), HTML(value='')))


Best eps: 0.949242285149092
Best F-measure: 0.9212140581268096
0.949242285149092
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 59.39050219844166
Separation = 5329.630999064596
G1 = 603.2880555079472

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))


Best eps: 0.125
Best F-measure: 0.8992558679595061
0.125
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 57.495024875621894
Separation = 126.98809058352062
G1 = 8.678571428571427

Entropy = 0.075502
Purity = 0.972028
F-measure = 0.899256

Rand = 99.104%
Jaccard Index = 82.959%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=72), HTML(value='')))


Best eps: 0.9933665008291873
Best F-measure: 0.9212140581268096
0.9933665008291873
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 42.35639926847644
Separation = 2.2111232011419752
G1 = 0.29672856258926505

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=72), HTML(value='')))


Best eps: 0.9720525616255893
Best F-measure: 0.9212140581268096
0.9720525616255893
_______________________________
7 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 47.733965957391746
Separation = 10.467965844260224
G1 = 0.9708257391386355

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1083), HTML(value='')))


Best eps: 1.000863557697381
Best F-measure: 0.9212140581268096
1.000863557697381
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 58.323944196483936
Separation = 5392.482868639872
G1 = 604.6140948476142

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))


Best eps: 0.11742424242424254
Best F-measure: 0.8878095431786706
0.11742424242424254
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 55.99470899470899
Separation = 70.58237126494339
G1 = 4.533333333333333

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.851%
Jaccard Index = 77.495%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=57), HTML(value='')))


Best eps: 0.9732142857142857
Best F-measure: 0.8878095431786706
0.9732142857142857
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 44.38701369120941
Separation = 0.980387032337839
G1 = 0.14918456111380285

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.808%
Jaccard Index = 76.686%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=57), HTML(value='')))


Best eps: 0.9662420210972111
Best F-measure: 0.8878095431786706
0.9662420210972111
_______________________________
8 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 49.27888698139381
Separation = 5.348468260093446
G1 = 0.5255571675716854

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.808%
Jaccard Index = 76.686%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1072), HTML(value='')))


Best eps: 0.968300842229172
Best F-measure: 0.8788609973173729
0.968300842229172
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 6/149 (4.027%)

Cohesion = 59.85019257164373
Separation = 5238.238265014146
G1 = 719.86534700888

Entropy = 0.0
Purity = 1.0
F-measure = 0.878861

Rand = 98.769%
Jaccard Index = 75.915%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))


Best eps: 0.10984848484848486
Best F-measure: 0.8878095431786706
0.10984848484848486
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 55.994350282485875
Separation = 6.531088913245536
G1 = 2.333333333333333

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.851%
Jaccard Index = 77.495%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=49), HTML(value='')))


Best eps: 0.9778761061946902
Best F-measure: 0.8878095431786706
0.9778761061946902
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 44.03747082753665
Separation = 0.4080005868268912
G1 = 0.10336868156247409

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.808%
Jaccard Index = 76.686%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Best eps: 1.0
Best F-measure: 0.8878095431786706
1.0
_______________________________
9 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 48.917361668380224
Separation = 1.0023205842790772
G1 = 0.2304770160807708

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.808%
Jaccard Index = 76.686%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1026), HTML(value='')))


Best eps: 0.976521932162783
Best F-measure: 0.8788609973173729
0.976521932162783
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 6/149 (4.027%)

Cohesion = 59.51793788581148
Separation = 5305.168798665568
G1 = 744.6856529342381

Entropy = 0.0
Purity = 1.0
F-measure = 0.878861

Rand = 98.769%
Jaccard Index = 75.915%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


Best eps: 0.10227272727272729
Best F-measure: 0.8878095431786706
0.10227272727272729
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 56.0
Separation = 7.464101615137755
G1 = 2.6666666666666665

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.851%
Jaccard Index = 77.495%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))


Best eps: 1.0
Best F-measure: 0.8878095431786706
1.0
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 43.68127240073953
Separation = 0.33343200763025743
G1 = 0.08506538335581805

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.808%
Jaccard Index = 76.686%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))


Best eps: 0.9717777495555132
Best F-measure: 0.8878095431786706
0.9717777495555132
_______________________________
10 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 48.540892566888026
Separation = 0.5734805019096936
G1 = 0.13284611306520083

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.808%
Jaccard Index = 76.686%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=975), HTML(value='')))


Best eps: 1.000841750692646
Best F-measure: 0.8788609973173729
1.000841750692646
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 6/149 (4.027%)

Cohesion = 59.17319489948217
Separation = 5377.3618767029075
G1 = 773.5526816999444

Entropy = 0.0
Purity = 1.0
F-measure = 0.878861

Rand = 98.769%
Jaccard Index = 75.915%


# LASE for fixed n (without context)

### DBSCAN

In [155]:
# DBSCAN sweep: for every index 1..10 of the LASE histogram collections and every
# distance function, search for the best eps, fit DBSCAN on the precomputed
# distances, and print the clustering report.
# NOTE(review): `i` presumably corresponds to the "n" in the section title -- confirm.
for i in range(1, 11):
    hists = hists_lase[i]
    hists_len = hists_lase_len[i]

    # `distances` and `distances_names` are parallel lists; `j` indexes both.
    for j, dist in enumerate(distances):
        dist_name = distances_names[j]

        # `cur_dists` is handed to DBSCAN below as a precomputed distance matrix.
        cur_dists = get_dists(hists, dist, hists_len)

        # agglomerative=False: presumably selects the DBSCAN variant of the
        # brute-force eps search -- verify against the helper's definition.
        best_eps = find_eps_with_brute_force_universal(
            cur_dists,
            label_to_changes,
            from_change_to_label,
            ids_per_label,
            agglomerative=False,
        )

        print(best_eps)

        # Build the estimator first, then fit it in place (fit() returns the
        # estimator itself, so the bound object is the same as with chaining).
        tufano_clustering = sklearn.cluster.DBSCAN(
            eps=best_eps,
            min_samples=2,
            metric='precomputed',
        )
        tufano_clustering.fit(cur_dists)

        # Banner identifying which (index, distance) combination the report is for.
        print("_______________________________")
        print(i, dist_name)
        print("_______________________________")
        print_clustering_results_tufano_unique(
            tufano_clustering,
            cur_dists,
            label_to_changes,
            from_change_to_label,
            ids_per_label,
            without_outliers=True,
        )
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=701), HTML(value='')))


Best eps: 0.08593073593073586
Best F-measure: 0.2216340614071988
0.08593073593073586
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 6
Number of outliers: 0/149 (0.0%)

Cohesion = 39.60702510537168
Separation = 2836.0744018749474
G1 = 397.64631655220364

Entropy = 3.753627
Purity = 0.208054
F-measure = 0.221634

Rand = 23.753%
Jaccard Index = 6.004%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1170), HTML(value='')))


Best eps: 0.6005464480874317
Best F-measure: 0.94496212214333
0.6005464480874317
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 48.83444955205416
Separation = 1997.2847510863821
G1 = 250.72515737475507

Entropy = 0.0
Purity = 1.0
F-measure = 0.944962

Rand = 99.454%
Jaccard Index = 89.286%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1315), HTML(value='')))


Best eps: 0.10850397289695746
Best F-measure: 0.8670193125685882
0.10850397289695746
_______________________________
1 cos_distance
_______________________________
Number of clusters: 31
Number of outliers: 9/149 (6.04%)

Cohesion = 53.68811698798146
Separation = 5457.574106769652
G1 = 759.2591867930533

Entropy = 0.0
Purity = 1.0
F-measure = 0.867019

Rand = 98.726%
Jaccard Index = 75.15%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1555), HTML(value='')))


Best eps: 0.109239853287961
Best F-measure: 0.8629924669310044
0.109239853287961
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 10/149 (6.711%)

Cohesion = 61.02892908535854
Separation = 7257.290427315222
G1 = 941.4275234599687

Entropy = 0.0
Purity = 1.0
F-measure = 0.862992

Rand = 98.707%
Jaccard Index = 75.05%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=277), HTML(value='')))


Best eps: 0.04814814814814816
Best F-measure: 0.14356765092683738
0.04814814814814816
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 3
Number of outliers: 0/149 (0.0%)

Cohesion = 25.555393075328272
Separation = 519.8077073429998
G1 = 13.04135686419999

Entropy = 4.108261
Purity = 0.147651
F-measure = 0.143568

Rand = 13.604%
Jaccard Index = 5.214%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=755), HTML(value='')))


Best eps: 0.8228410008071025
Best F-measure: 0.9583849409352763
0.8228410008071025
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 45.74820339078674
Separation = 500.1018763744511
G1 = 53.24681749084797

Entropy = 0.0
Purity = 1.0
F-measure = 0.958385

Rand = 99.54%
Jaccard Index = 90.977%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=879), HTML(value='')))


Best eps: 0.29429121698042493
Best F-measure: 0.9138402239073378
0.29429121698042493
_______________________________
2 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 7/149 (4.698%)

Cohesion = 52.997506003850454
Separation = 1483.7834730350041
G1 = 191.49103149634243

Entropy = 0.0
Purity = 1.0
F-measure = 0.91384

Rand = 99.331%
Jaccard Index = 87.016%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1417), HTML(value='')))


Best eps: 0.308126859903751
Best F-measure: 0.9138402239073378
0.308126859903751
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 7/149 (4.698%)

Cohesion = 59.53827250049026
Separation = 3882.289048257089
G1 = 429.6994733664708

Entropy = 0.0
Purity = 1.0
F-measure = 0.91384

Rand = 99.331%
Jaccard Index = 87.016%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=114), HTML(value='')))


Best eps: 0.006790123456790087
Best F-measure: 0.3074532554909333
0.006790123456790087
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 6
Number of outliers: 1/149 (0.671%)

Cohesion = 26.08800041106997
Separation = 149.33740616580695
G1 = 29.22188404861194

Entropy = 3.265011
Purity = 0.324324
F-measure = 0.307453

Rand = 44.411%
Jaccard Index = 7.778%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=448), HTML(value='')))


Best eps: 0.7888697647733793
Best F-measure: 0.9176753149907511
0.7888697647733793
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 45.96859672278488
Separation = 132.94292355878352
G1 = 16.85115941347638

Entropy = 0.0
Purity = 1.0
F-measure = 0.917675

Rand = 99.34%
Jaccard Index = 87.091%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=481), HTML(value='')))


Best eps: 0.5248514928836625
Best F-measure: 0.9176753149907511
0.5248514928836625
_______________________________
3 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 51.93556234675277
Separation = 350.185214575616
G1 = 44.10991777706904

Entropy = 0.0
Purity = 1.0
F-measure = 0.917675

Rand = 99.34%
Jaccard Index = 87.091%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1293), HTML(value='')))


Best eps: 0.568011412665776
Best F-measure: 0.9176753149907511
0.568011412665776
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 60.47467673174542
Separation = 4088.68071825729
G1 = 435.7692559206747

Entropy = 0.0
Purity = 1.0
F-measure = 0.917675

Rand = 99.34%
Jaccard Index = 87.091%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Best eps: 0.025252525252525304
Best F-measure: 0.5580539068580133
0.025252525252525304
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 14
Number of outliers: 2/149 (1.342%)

Cohesion = 37.76480545556389
Separation = 35.27281395227163
G1 = 2.589524662129713

Entropy = 1.834319
Purity = 0.564626
F-measure = 0.558054

Rand = 74.289%
Jaccard Index = 15.653%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=221), HTML(value='')))


Best eps: 0.8297101449275364
Best F-measure: 0.9176753149907511
0.8297101449275364
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 45.191361938317044
Separation = 49.77352923567463
G1 = 5.8529310758099475

Entropy = 0.0
Purity = 1.0
F-measure = 0.917675

Rand = 99.34%
Jaccard Index = 87.091%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=226), HTML(value='')))


Best eps: 0.6191797702920826
Best F-measure: 0.9176753149907511
0.6191797702920826
_______________________________
4 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 50.80187977928149
Separation = 109.79670277330557
G1 = 13.401900373812795

Entropy = 0.0
Purity = 1.0
F-measure = 0.917675

Rand = 99.34%
Jaccard Index = 87.091%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1191), HTML(value='')))


Best eps: 0.849057994058598
Best F-measure: 0.9259120447039908
0.849057994058598
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 60.82276816628247
Separation = 4853.843469923264
G1 = 516.5251498359498

Entropy = 0.0
Purity = 1.0
F-measure = 0.925912

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


Best eps: 0.1477272727272727
Best F-measure: 0.6170665553325695
0.1477272727272727
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 15
Number of outliers: 3/149 (2.013%)

Cohesion = 38.24299779812452
Separation = 0.0
G1 = 0.0

Entropy = 1.519591
Purity = 0.630137
F-measure = 0.617067

Rand = 80.472%
Jaccard Index = 19.853%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=139), HTML(value='')))


Best eps: 0.9659090909090909
Best F-measure: 0.9357817644039512
0.9659090909090909
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 25
Number of outliers: 4/149 (2.685%)

Cohesion = 40.50633720845684
Separation = 18.748124266922527
G1 = 1.081522565166555

Entropy = 0.094996
Purity = 0.965517
F-measure = 0.935782

Rand = 99.08%
Jaccard Index = 83.505%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=145), HTML(value='')))


Best eps: 0.8804771390665607
Best F-measure: 0.9393348634959371
0.8804771390665607
_______________________________
5 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 48.99727890814356
Separation = 44.2750144259347
G1 = 4.5312629350606

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1174), HTML(value='')))


Best eps: 0.897394438195134
Best F-measure: 0.9393348634959371
0.897394438195134
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 59.69751829895287
Separation = 5244.785762583886
G1 = 540.5803596845963

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=17), HTML(value='')))


Best eps: 0.2272727272727273
Best F-measure: 0.7680987042526358
0.2272727272727273
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 21
Number of outliers: 4/149 (2.685%)

Cohesion = 49.66037436928562
Separation = 92.37126972162942
G1 = 2.352251609789077

Entropy = 0.649483
Purity = 0.786207
F-measure = 0.768099

Rand = 94.406%
Jaccard Index = 44.802%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Best eps: 0.9425287356321839
Best F-measure: 0.9310981337826973
0.9425287356321839
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 42.69487879604354
Separation = 6.654112129401733
G1 = 0.6636933601798862

Entropy = 0.0
Purity = 1.0
F-measure = 0.931098

Rand = 99.429%
Jaccard Index = 88.825%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Best eps: 0.8777210029888703
Best F-measure: 0.9310981337826973
0.8777210029888703
_______________________________
6 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 48.20546042739281
Separation = 21.271041635634376
G1 = 1.57762481386443

Entropy = 0.0
Purity = 1.0
F-measure = 0.931098

Rand = 99.429%
Jaccard Index = 88.825%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1177), HTML(value='')))


Best eps: 0.9470937472433
Best F-measure: 0.9393348634959371
0.9470937472433
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 58.873451406062415
Separation = 5388.997631722499
G1 = 571.8304147082322

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


Best eps: 0.25
Best F-measure: 0.8529772986148824
0.25
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 24
Number of outliers: 5/149 (3.356%)

Cohesion = 52.68860122794824
Separation = 72.14923327606184
G1 = 2.0176470588235293

Entropy = 0.276988
Purity = 0.895833
F-measure = 0.852977

Rand = 98.32%
Jaccard Index = 72.799%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=72), HTML(value='')))


Best eps: 0.9545454545454546
Best F-measure: 0.9310981337826973
0.9545454545454546
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 41.79028101934889
Separation = 1.9768396921721967
G1 = 0.17898663849199836

Entropy = 0.0
Purity = 1.0
F-measure = 0.931098

Rand = 99.429%
Jaccard Index = 88.825%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


Best eps: 0.8915347710906719
Best F-measure: 0.9310981337826973
0.8915347710906719
_______________________________
7 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 47.133450942397715
Separation = 9.972852343511187
G1 = 0.6446922700444949

Entropy = 0.0
Purity = 1.0
F-measure = 0.931098

Rand = 99.429%
Jaccard Index = 88.825%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1074), HTML(value='')))


Best eps: 0.917566306834597
Best F-measure: 0.9310981337826973
0.917566306834597
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 57.714056920247174
Separation = 5313.2029874382015
G1 = 586.9824752063346

Entropy = 0.0
Purity = 1.0
F-measure = 0.931098

Rand = 99.429%
Jaccard Index = 88.825%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))


Best eps: 0.25
Best F-measure: 0.9059303485477981
0.25
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 55.80026455026455
Separation = 64.61108997283318
G1 = 2.4

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=57), HTML(value='')))


Best eps: 0.9659090909090909
Best F-measure: 0.9059303485477981
0.9659090909090909
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 43.79466584961404
Separation = 0.7193845115923752
G1 = 0.04023263360935653

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=57), HTML(value='')))


Best eps: 0.9144601077231699
Best F-measure: 0.9059303485477981
0.9144601077231699
_______________________________
8 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 48.64477072990128
Separation = 5.116188705549113
G1 = 0.2684510316002302

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1072), HTML(value='')))


Best eps: 0.96880042434256
Best F-measure: 0.9059303485477981
0.96880042434256
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 59.03891202550659
Separation = 5296.027037430262
G1 = 612.6923134882737

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))


Best eps: 0.19999999999999996
Best F-measure: 0.9059303485477981
0.19999999999999996
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 55.827431090566684
Separation = 32.30554498641659
G1 = 1.2

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=53), HTML(value='')))


Best eps: 0.9772727272727273
Best F-measure: 0.9059303485477981
0.9772727272727273
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 43.44656155584778
Separation = 0.21722299637219933
G1 = 0.012294569594597616

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))


Best eps: 0.9398615220675737
Best F-measure: 0.9059303485477981
0.9398615220675737
_______________________________
9 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 48.281211479417735
Separation = 1.2331351658446128
G1 = 0.06601255089138836

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=1026), HTML(value='')))


Best eps: 0.965609295403329
Best F-measure: 0.8969818026865004
0.965609295403329
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.93390372279772
Separation = 5349.19506179078
G1 = 711.4984628911428

Entropy = 0.0
Purity = 1.0
F-measure = 0.896982

Rand = 98.857%
Jaccard Index = 77.649%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Best eps: 0.125
Best F-measure: 0.9059303485477981
0.125
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 55.87310606060606
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Best eps: 0.9886363636363636
Best F-measure: 0.9059303485477981
0.9886363636363636
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 43.09197905368005
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))


Best eps: 0.9681857618512111
Best F-measure: 0.9059303485477981
0.9681857618512111
_______________________________
10 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 47.90305922999988
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


DBSCAN


HBox(children=(IntProgress(value=0, max=985), HTML(value='')))


Best eps: 0.994017375492741
Best F-measure: 0.8969818026865004
0.994017375492741
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.58092141048598
Separation = 5419.291742417873
G1 = 739.6172380578655

Entropy = 0.0
Purity = 1.0
F-measure = 0.896982

Rand = 98.857%
Jaccard Index = 77.649%


### HAC average

In [209]:
# Average-linkage HAC sweep: for each histogram set stored under keys 1..10 and
# for each distance function, pick a distance threshold, fit the clustering and
# print its quality report.
link = 'average'
separator = "_______________________________"

for i in range(1, 11):
    hists, hists_len = hists_lase[i], hists_lase_len[i]

    for j, dist in enumerate(distances):
        dist_name = distances_names[j]

        # Used as the precomputed distance input of the clustering below.
        cur_dists = get_dists(hists, dist, hists_len)

        # Presumably a brute-force search over thresholds with step 0.005,
        # scored against the reference labelling -- TODO confirm in the helper.
        best_eps = find_eps_with_brute_force_universal(
            cur_dists, label_to_changes, from_change_to_label, ids_per_label,
            agglomerative=True, linkage=link, with_step=0.005,
        )
        print(best_eps)

        print(separator)
        print(i, dist_name)
        print(separator)

        # n_clusters=None together with distance_threshold lets the threshold,
        # not a fixed cluster count, decide where the tree is cut.
        # NOTE(review): newer scikit-learn releases renamed `affinity` to
        # `metric`; check the installed version before upgrading.
        hac_model = sklearn.cluster.AgglomerativeClustering(
            n_clusters=None,
            affinity='precomputed',
            linkage=link,
            compute_full_tree=True,
            distance_threshold=best_eps,
        )
        tufano_clustering = hac_model.fit(cur_dists)

        print_clustering_results_tufano_unique(
            tufano_clustering, cur_dists,
            label_to_changes, from_change_to_label, ids_per_label,
            without_outliers=True,
        )
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.035
Best F-measure: 0.5526319451806876
0.035
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 58.699828929147195
Separation = 14613.14806804667
G1 = 1591.6382427545284

Entropy = 1.13823
Purity = 0.613793
F-measure = 0.552632

Rand = 93.161%
Jaccard Index = 22.644%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.635
Best F-measure: 0.924721363647538
0.635
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 48.00401363599733
Separation = 1999.1812008376771
G1 = 239.07032011993098

Entropy = 0.061515
Purity = 0.972414
F-measure = 0.924721

Rand = 99.262%
Jaccard Index = 86.051%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.12
Best F-measure: 0.8670193125685882
0.12
_______________________________
1 cos_distance
_______________________________
Number of clusters: 31
Number of outliers: 9/149 (6.04%)

Cohesion = 53.68811698798146
Separation = 5457.574106769652
G1 = 759.2591867930533

Entropy = 0.0
Purity = 1.0
F-measure = 0.867019

Rand = 98.726%
Jaccard Index = 75.15%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=230), HTML(value='')))


Best eps: 0.13
Best F-measure: 0.8540439210697067
0.13
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 32
Number of outliers: 10/149 (6.711%)

Cohesion = 60.585733732486915
Separation = 7321.616316815072
G1 = 1072.6507757431739

Entropy = 0.0
Purity = 1.0
F-measure = 0.854044

Rand = 98.665%
Jaccard Index = 74.245%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.16
Best F-measure: 0.60651179749568
0.16
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 18
Number of outliers: 0/149 (0.0%)

Cohesion = 63.44300237247917
Separation = 8234.469208537012
G1 = 330.4226889760971

Entropy = 1.238761
Purity = 0.604027
F-measure = 0.606512

Rand = 94.332%
Jaccard Index = 36.159%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.8250000000000001
Best F-measure: 0.9514154004086888
0.8250000000000001
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 45.792961516259155
Separation = 485.3686932243874
G1 = 52.47410976281326

Entropy = 0.0
Purity = 1.0
F-measure = 0.951415

Rand = 99.573%
Jaccard Index = 91.619%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.425
Best F-measure: 0.8995545096216235
0.425
_______________________________
2 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 53.32754534946019
Separation = 1553.8540356859446
G1 = 211.38461868700819

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=224), HTML(value='')))


Best eps: 0.47500000000000003
Best F-measure: 0.8995545096216235
0.47500000000000003
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 59.86554020248841
Separation = 3979.9167478409795
G1 = 463.4845700283022

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.405
Best F-measure: 0.7531908482655397
0.405
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 21
Number of outliers: 1/149 (0.671%)

Cohesion = 59.20199274214076
Separation = 3145.0583445019925
G1 = 105.89919704213706

Entropy = 0.794074
Purity = 0.77027
F-measure = 0.753191

Rand = 96.507%
Jaccard Index = 50.066%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.905
Best F-measure: 0.9514154004086888
0.905
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 44.349735293935844
Separation = 132.73122572447946
G1 = 14.99302065398214

Entropy = 0.0
Purity = 1.0
F-measure = 0.951415

Rand = 99.531%
Jaccard Index = 90.789%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.71
Best F-measure: 0.9176753149907511
0.71
_______________________________
3 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 51.93556234675277
Separation = 350.185214575616
G1 = 44.10991777706904

Entropy = 0.0
Purity = 1.0
F-measure = 0.917675

Rand = 99.34%
Jaccard Index = 87.091%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=232), HTML(value='')))


Best eps: 0.76
Best F-measure: 0.9259120447039908
0.76
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 60.54615099541708
Separation = 4178.1059650527595
G1 = 438.190896836107

Entropy = 0.0
Purity = 1.0
F-measure = 0.925912

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.03
Best F-measure: 0.8232897207630723
0.03
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 27
Number of outliers: 4/149 (2.685%)

Cohesion = 58.907020545591976
Separation = 1630.4848251872597
G1 = 150.9293867728018

Entropy = 0.334833
Purity = 0.882759
F-measure = 0.82329

Rand = 97.701%
Jaccard Index = 61.6%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.935
Best F-measure: 0.9393348634959371
0.935
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 44.16983794083069
Separation = 50.19539534461844
G1 = 5.823046338675601

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.88
Best F-measure: 0.9334623802744606
0.88
_______________________________
4 cos_distance
_______________________________
Number of clusters: 27
Number of outliers: 5/149 (3.356%)

Cohesion = 49.1500039818074
Separation = 104.5194357352364
G1 = 9.188664859735256

Entropy = 0.047763
Purity = 0.986111
F-measure = 0.933462

Rand = 99.262%
Jaccard Index = 85.9%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=237), HTML(value='')))


Best eps: 0.905
Best F-measure: 0.9393348634959371
0.905
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 60.112469905935086
Separation = 4998.324013052287
G1 = 493.55308806780477

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.335
Best F-measure: 0.8357264407582884
0.335
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 23
Number of outliers: 4/149 (2.685%)

Cohesion = 59.82893666406873
Separation = 381.4751610876888
G1 = 23.676426030522382

Entropy = 0.402774
Purity = 0.862069
F-measure = 0.835726

Rand = 97.079%
Jaccard Index = 60.183%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.965
Best F-measure: 0.9393348634959371
0.965
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 43.27687921288748
Separation = 19.81325977656921
G1 = 2.204864701601495

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.9
Best F-measure: 0.9393348634959371
0.9
_______________________________
5 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 48.99727890814356
Separation = 44.2750144259347
G1 = 4.5312629350606

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.935
Best F-measure: 0.9393348634959371
0.935
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 59.670455234427436
Separation = 5364.642004729999
G1 = 546.0601786956814

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.01
Best F-measure: 0.8670398333944783
0.01
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 26
Number of outliers: 6/149 (4.027%)

Cohesion = 58.48103770153295
Separation = 390.9416410503686
G1 = 16.50521220666413

Entropy = 0.22132
Purity = 0.923077
F-measure = 0.86704

Rand = 98.198%
Jaccard Index = 70.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.98
Best F-measure: 0.9393348634959371
0.98
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 42.52482788827062
Separation = 7.075981118453072
G1 = 0.8153388722991746

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.92
Best F-measure: 0.9310981337826973
0.92
_______________________________
6 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 48.20546042739281
Separation = 21.271041635634376
G1 = 1.57762481386443

Entropy = 0.0
Purity = 1.0
F-measure = 0.931098

Rand = 99.429%
Jaccard Index = 88.825%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.9550000000000001
Best F-measure: 0.9393348634959371
0.9550000000000001
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 58.873451406062415
Separation = 5388.997631722499
G1 = 571.8304147082322

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.01
Best F-measure: 0.8992558679595061
0.01
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 57.49118961178662
Separation = 149.11171933807242
G1 = 10.178571428571427

Entropy = 0.075502
Purity = 0.972028
F-measure = 0.899256

Rand = 99.104%
Jaccard Index = 82.959%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.99
Best F-measure: 0.9393348634959371
0.99
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 41.77047046648883
Separation = 1.9768396921721967
G1 = 0.17898663849199836

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.935
Best F-measure: 0.9310981337826973
0.935
_______________________________
7 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 47.133450942397715
Separation = 9.972852343511187
G1 = 0.6446922700444949

Entropy = 0.0
Purity = 1.0
F-measure = 0.931098

Rand = 99.429%
Jaccard Index = 88.825%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 1.0
Best F-measure: 0.9393348634959371
1.0
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 57.77232603527595
Separation = 5445.826223566007
G1 = 572.8466717628856

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.01
Best F-measure: 0.8878095431786706
0.01
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 55.99470899470899
Separation = 70.2091661841865
G1 = 4.4

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.851%
Jaccard Index = 77.495%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.99
Best F-measure: 0.9059303485477981
0.99
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 43.79466584961404
Separation = 0.7193845115923752
G1 = 0.04023263360935653

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.9500000000000001
Best F-measure: 0.9059303485477981
0.9500000000000001
_______________________________
8 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 48.64477072990128
Separation = 5.116188705549113
G1 = 0.2684510316002302

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.995
Best F-measure: 0.9059303485477981
0.995
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 59.03891202550659
Separation = 5296.027037430262
G1 = 612.6923134882737

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.01
Best F-measure: 0.8878095431786706
0.01
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 55.994350282485875
Separation = 38.2768262785268
G1 = 3.3333333333333335

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.851%
Jaccard Index = 77.495%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.99
Best F-measure: 0.9059303485477981
0.99
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 43.44656155584778
Separation = 0.21722299637219933
G1 = 0.012294569594597616

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.96
Best F-measure: 0.9059303485477981
0.96
_______________________________
9 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 48.281211479417735
Separation = 1.2331351658446128
G1 = 0.06601255089138836

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.995
Best F-measure: 0.8969818026865004
0.995
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 58.93390372279772
Separation = 5349.19506179078
G1 = 711.4984628911428

Entropy = 0.0
Purity = 1.0
F-measure = 0.896982

Rand = 98.857%
Jaccard Index = 77.649%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.73
Best F-measure: 0.9059303485477981
0.73
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 55.87310606060606
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.99
Best F-measure: 0.9059303485477981
0.99
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 43.09197905368005
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=220), HTML(value='')))


Best eps: 0.97
Best F-measure: 0.9059303485477981
0.97
_______________________________
10 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 47.90305922999988
Separation = 0.0
G1 = 0.0

Entropy = 0.0
Purity = 1.0
F-measure = 0.90593

Rand = 98.897%
Jaccard Index = 78.42%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


Best eps: 0.995
Best F-measure: 0.8788609973173729
0.995
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 6/149 (4.027%)

Cohesion = 59.229678896738776
Separation = 5371.354963246561
G1 = 772.6792512924173

Entropy = 0.0
Purity = 1.0
F-measure = 0.878861

Rand = 98.769%
Jaccard Index = 75.915%


### HAC complete

In [157]:
# Complete-linkage HAC sweep: for each histogram set stored under keys 1..10 and
# for each distance function, pick a distance threshold, fit the clustering and
# print its quality report.
link = 'complete'
separator = "_______________________________"

for i in range(1, 11):
    hists, hists_len = hists_lase[i], hists_lase_len[i]

    for j, dist in enumerate(distances):
        dist_name = distances_names[j]

        # Used as the precomputed distance input of the clustering below.
        cur_dists = get_dists(hists, dist, hists_len)

        # Presumably a brute-force threshold search scored against the
        # reference labelling; the helper's default step is used here.
        best_eps = find_eps_with_brute_force_universal(
            cur_dists, label_to_changes, from_change_to_label, ids_per_label,
            agglomerative=True, linkage=link,
        )
        print(best_eps)

        print(separator)
        print(i, dist_name)
        print(separator)

        # n_clusters=None together with distance_threshold lets the threshold,
        # not a fixed cluster count, decide where the tree is cut.
        # NOTE(review): newer scikit-learn releases renamed `affinity` to
        # `metric`; check the installed version before upgrading.
        hac_model = sklearn.cluster.AgglomerativeClustering(
            n_clusters=None,
            affinity='precomputed',
            linkage=link,
            compute_full_tree=True,
            distance_threshold=best_eps,
        )
        tufano_clustering = hac_model.fit(cur_dists)

        print_clustering_results_tufano_unique(
            tufano_clustering, cur_dists,
            label_to_changes, from_change_to_label, ids_per_label,
            without_outliers=True,
        )
        
        

HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=701), HTML(value='')))


Best eps: 0.25555555555555554
Best F-measure: 0.5773679592767631
0.25555555555555554
_______________________________
1 jaccard_metric
_______________________________
Number of clusters: 23
Number of outliers: 0/149 (0.0%)

Cohesion = 62.321974252005546
Separation = 15066.229912630779
G1 = 881.8877402018372

Entropy = 1.087539
Purity = 0.610738
F-measure = 0.577368

Rand = 93.978%
Jaccard Index = 25.893%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1170), HTML(value='')))


Best eps: 0.6851851851851851
Best F-measure: 0.9300052669180184
0.6851851851851851
_______________________________
1 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 3/149 (2.013%)

Cohesion = 49.20290456166456
Separation = 2065.7927233867895
G1 = 266.8154263400863

Entropy = 0.022227
Purity = 0.993151
F-measure = 0.930005

Rand = 99.235%
Jaccard Index = 84.944%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1315), HTML(value='')))


Best eps: 0.22285218942346485
Best F-measure: 0.8493505258392111
0.22285218942346485
_______________________________
1 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 7/149 (4.698%)

Cohesion = 54.51637203834549
Separation = 5410.136056507714
G1 = 662.6469344711356

Entropy = 0.162386
Purity = 0.93662
F-measure = 0.849351

Rand = 98.632%
Jaccard Index = 75.88%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1555), HTML(value='')))


Best eps: 0.236993045645854
Best F-measure: 0.8493505258392111
0.236993045645854
_______________________________
1 pearsons_correlation_mean
_______________________________
Number of clusters: 28
Number of outliers: 7/149 (4.698%)

Cohesion = 62.591141189711465
Separation = 7362.7424853148195
G1 = 800.0123645102535

Entropy = 0.162386
Purity = 0.93662
F-measure = 0.849351

Rand = 98.632%
Jaccard Index = 75.88%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=277), HTML(value='')))


Best eps: 0.13333333333333341
Best F-measure: 0.5887067322797787
0.13333333333333341
_______________________________
2 jaccard_metric
_______________________________
Number of clusters: 23
Number of outliers: 0/149 (0.0%)

Cohesion = 62.716225822970486
Separation = 8931.601971722368
G1 = 531.9287604945982

Entropy = 1.11398
Purity = 0.637584
F-measure = 0.588707

Rand = 94.749%
Jaccard Index = 30.989%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=755), HTML(value='')))


Best eps: 0.8333333333333334
Best F-measure: 0.9393348634959371
0.8333333333333334
_______________________________
2 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 46.39331696242628
Separation = 492.24806624560796
G1 = 56.8713459399947

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.398%
Jaccard Index = 88.19%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=879), HTML(value='')))


Best eps: 0.42759421967354716
Best F-measure: 0.8995545096216235
0.42759421967354716
_______________________________
2 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 53.32754534946019
Separation = 1553.8540356859446
G1 = 211.38461868700819

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1417), HTML(value='')))


Best eps: 0.482493676038919
Best F-measure: 0.8995545096216235
0.482493676038919
_______________________________
2 pearsons_correlation_mean
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 59.86554020248841
Separation = 3979.9167478409795
G1 = 463.4845700283022

Entropy = 0.0
Purity = 1.0
F-measure = 0.899555

Rand = 99.251%
Jaccard Index = 85.356%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=114), HTML(value='')))


Best eps: 0.026666666666666616
Best F-measure: 0.7231794344828607
0.026666666666666616
_______________________________
3 jaccard_metric
_______________________________
Number of clusters: 30
Number of outliers: 1/149 (0.671%)

Cohesion = 58.983739837398375
Separation = 3645.8493458951634
G1 = 505.46434221293833

Entropy = 0.638335
Purity = 0.783784
F-measure = 0.723179

Rand = 96.424%
Jaccard Index = 42.285%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=448), HTML(value='')))


Best eps: 0.9150326797385621
Best F-measure: 0.9393348634959371
0.9150326797385621
_______________________________
3 canberra_metric
_______________________________
Number of clusters: 28
Number of outliers: 4/149 (2.685%)

Cohesion = 44.8481735459196
Separation = 140.24262075751042
G1 = 17.353659291057408

Entropy = 0.0
Purity = 1.0
F-measure = 0.939335

Rand = 99.358%
Jaccard Index = 87.406%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=481), HTML(value='')))


Best eps: 0.872619425310593
Best F-measure: 0.9126747469590363
0.872619425310593
_______________________________
3 cos_distance
_______________________________
Number of clusters: 25
Number of outliers: 4/149 (2.685%)

Cohesion = 48.501402930402655
Separation = 366.6954641939955
G1 = 28.623385492098162

Entropy = 0.122472
Purity = 0.944828
F-measure = 0.912675

Rand = 98.994%
Jaccard Index = 81.771%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1293), HTML(value='')))


Best eps: 0.943186929735781
Best F-measure: 0.924755283871788
0.943186929735781
_______________________________
3 pearsons_correlation_mean
_______________________________
Number of clusters: 24
Number of outliers: 4/149 (2.685%)

Cohesion = 56.47129430079545
Separation = 4408.528873455554
G1 = 310.33275280813456

Entropy = 0.122472
Purity = 0.944828
F-measure = 0.924755

Rand = 99.167%
Jaccard Index = 84.896%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Best eps: 0.38888888888888895
Best F-measure: 0.8270807605648574
0.38888888888888895
_______________________________
4 jaccard_metric
_______________________________
Number of clusters: 26
Number of outliers: 2/149 (1.342%)

Cohesion = 59.82770342142989
Separation = 1526.0118667008805
G1 = 126.38768392483976

Entropy = 0.366393
Purity = 0.863946
F-measure = 0.827081

Rand = 97.586%
Jaccard Index = 61.111%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=221), HTML(value='')))


Best eps: 0.9285714285714286
Best F-measure: 0.9212140581268096
0.9285714285714286
_______________________________
4 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 44.894774954939955
Separation = 51.79812301187348
G1 = 6.6087835509937545

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=226), HTML(value='')))


Best eps: 0.8852778646747624
Best F-measure: 0.9153415749053331
0.8852778646747624
_______________________________
4 cos_distance
_______________________________
Number of clusters: 28
Number of outliers: 5/149 (3.356%)

Cohesion = 49.728484696980495
Separation = 109.68393661377398
G1 = 10.896195179026158

Entropy = 0.047763
Purity = 0.986111
F-measure = 0.915342

Rand = 99.174%
Jaccard Index = 84.23%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1191), HTML(value='')))


Best eps: 0.911940202702676
Best F-measure: 0.9212140581268096
0.911940202702676
_______________________________
4 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 4/149 (2.685%)

Cohesion = 60.644500766740165
Separation = 4974.167223402757
G1 = 523.3994017929418

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.272%
Jaccard Index = 85.714%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


Best eps: 0.125
Best F-measure: 0.8424697190663848
0.125
_______________________________
5 jaccard_metric
_______________________________
Number of clusters: 25
Number of outliers: 5/149 (3.356%)

Cohesion = 59.45962116517657
Separation = 412.91134600477517
G1 = 37.66782380630974

Entropy = 0.328173
Purity = 0.888889
F-measure = 0.84247

Rand = 97.261%
Jaccard Index = 61.892%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=139), HTML(value='')))


Best eps: 0.9615384615384616
Best F-measure: 0.9212140581268096
0.9615384615384616
_______________________________
5 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 44.051380813579556
Separation = 20.184868631620358
G1 = 2.168563822616151

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=145), HTML(value='')))


Best eps: 0.9011667577785198
Best F-measure: 0.9212140581268096
0.9011667577785198
_______________________________
5 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 49.650559925615454
Separation = 47.71199012905671
G1 = 5.400396144830824

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1174), HTML(value='')))


Best eps: 0.934647708058415
Best F-measure: 0.9212140581268096
0.934647708058415
_______________________________
5 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 60.2995164931182
Separation = 5209.369478779331
G1 = 571.9653483989282

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=17), HTML(value='')))


Best eps: 0.14015151515151525
Best F-measure: 0.8670398333944783
0.14015151515151525
_______________________________
6 jaccard_metric
_______________________________
Number of clusters: 26
Number of outliers: 6/149 (4.027%)

Cohesion = 58.48103770153295
Separation = 390.9416410503686
G1 = 16.50521220666413

Entropy = 0.22132
Purity = 0.923077
F-measure = 0.86704

Rand = 98.198%
Jaccard Index = 70.952%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Best eps: 0.9775280898876404
Best F-measure: 0.9212140581268096
0.9775280898876404
_______________________________
6 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 43.17315541704822
Separation = 7.838311657607845
G1 = 0.9467709762504068

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))


Best eps: 0.9430197117701811
Best F-measure: 0.9212140581268096
0.9430197117701811
_______________________________
6 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 48.73582503794306
Separation = 24.358168230531504
G1 = 2.2506091654926443

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1177), HTML(value='')))


Best eps: 0.952908688920998
Best F-measure: 0.9212140581268096
0.952908688920998
_______________________________
6 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 59.47260751678248
Separation = 5351.401569979766
G1 = 604.8246558281658

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


Best eps: 0.13257575757575768
Best F-measure: 0.8992558679595061
0.13257575757575768
_______________________________
7 jaccard_metric
_______________________________
Number of clusters: 28
Number of outliers: 6/149 (4.027%)

Cohesion = 57.49118961178662
Separation = 149.11171933807242
G1 = 10.178571428571427

Entropy = 0.075502
Purity = 0.972028
F-measure = 0.899256

Rand = 99.104%
Jaccard Index = 82.959%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=72), HTML(value='')))


Best eps: 0.9922779922779923
Best F-measure: 0.9212140581268096
0.9922779922779923
_______________________________
7 canberra_metric
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 42.4064289263833
Separation = 2.778822853635961
G1 = 0.3785895531861027

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


Best eps: 0.9358499700900416
Best F-measure: 0.9129773284135698
0.9358499700900416
_______________________________
7 cos_distance
_______________________________
Number of clusters: 29
Number of outliers: 6/149 (4.027%)

Cohesion = 47.80696915440383
Separation = 12.360363873075
G1 = 1.18510803240608

Entropy = 0.0
Purity = 1.0
F-measure = 0.912977

Rand = 99.34%
Jaccard Index = 87.091%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1074), HTML(value='')))


Best eps: 1.000868809566718
Best F-measure: 0.9212140581268096
1.000868809566718
_______________________________
7 pearsons_correlation_mean
_______________________________
Number of clusters: 29
Number of outliers: 5/149 (3.356%)

Cohesion = 58.392203622490655
Separation = 5404.8477316777935
G1 = 605.6267422087986

Entropy = 0.0
Purity = 1.0
F-measure = 0.921214

Rand = 99.31%
Jaccard Index = 86.476%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))


Best eps: 0.125
Best F-measure: 0.8878095431786706
0.125
_______________________________
8 jaccard_metric
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 55.99470899470899
Separation = 70.2091661841865
G1 = 4.4

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.851%
Jaccard Index = 77.495%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=57), HTML(value='')))


Best eps: 0.9949109414758268
Best F-measure: 0.8878095431786706
0.9949109414758268
_______________________________
8 canberra_metric
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 44.431339926565904
Separation = 1.3138146976400122
G1 = 0.1891645447753122

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.808%
Jaccard Index = 76.686%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=57), HTML(value='')))


Best eps: 0.9663853677273593
Best F-measure: 0.8878095431786706
0.9663853677273593
_______________________________
8 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 49.33208478913021
Separation = 6.996796918085872
G1 = 0.6967146483969548

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.808%
Jaccard Index = 76.686%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1072), HTML(value='')))


Best eps: 0.96880042434256
Best F-measure: 0.8788609973173729
0.96880042434256
_______________________________
8 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 6/149 (4.027%)

Cohesion = 59.904533891641094
Separation = 5237.638715136842
G1 = 719.5459563081921

Entropy = 0.0
Purity = 1.0
F-measure = 0.878861

Rand = 98.769%
Jaccard Index = 75.915%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))


Best eps: 0.11742424242424254
Best F-measure: 0.8878095431786706
0.11742424242424254
_______________________________
9 jaccard_metric
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 55.994350282485875
Separation = 38.2768262785268
G1 = 3.3333333333333335

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.851%
Jaccard Index = 77.495%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=53), HTML(value='')))


Best eps: 0.9974842767295596
Best F-measure: 0.8878095431786706
0.9974842767295596
_______________________________
9 canberra_metric
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 44.07892179521317
Separation = 0.7365751727443705
G1 = 0.14331139001815044

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.808%
Jaccard Index = 76.686%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=54), HTML(value='')))


Best eps: 0.9851304197584222
Best F-measure: 0.8878095431786706
0.9851304197584222
_______________________________
9 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 48.975338181687576
Separation = 2.7569141857968313
G1 = 0.41529002359871764

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.808%
Jaccard Index = 76.686%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=1026), HTML(value='')))


Best eps: 1.000850339982339
Best F-measure: 0.8788609973173729
1.000850339982339
_______________________________
9 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 6/149 (4.027%)

Cohesion = 59.575289679661566
Separation = 5303.652278356562
G1 = 744.2588818763454

Entropy = 0.0
Purity = 1.0
F-measure = 0.878861

Rand = 98.769%
Jaccard Index = 75.915%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Best eps: 0.10984848484848486
Best F-measure: 0.8878095431786706
0.10984848484848486
_______________________________
10 jaccard_metric
_______________________________
Number of clusters: 30
Number of outliers: 7/149 (4.698%)

Cohesion = 56.0
Separation = 6.531088913245536
G1 = 2.333333333333333

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.851%
Jaccard Index = 77.495%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Best eps: 1.0
Best F-measure: 0.8878095431786706
1.0
_______________________________
10 canberra_metric
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 43.72003687273956
Separation = 0.44376737044213965
G1 = 0.11274616818272076

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.808%
Jaccard Index = 76.686%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))


Best eps: 1.0
Best F-measure: 0.8878095431786706
1.0
_______________________________
10 cos_distance
_______________________________
Number of clusters: 30
Number of outliers: 6/149 (4.027%)

Cohesion = 48.604484709421094
Separation = 1.1333683674879762
G1 = 0.26163099291742953

Entropy = 0.0
Purity = 1.0
F-measure = 0.88781

Rand = 98.808%
Jaccard Index = 76.686%


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))


Agglomerative Clustering


HBox(children=(IntProgress(value=0, max=985), HTML(value='')))


Best eps: 1.000844951264482
Best F-measure: 0.8788609973173729
1.000844951264482
_______________________________
10 pearsons_correlation_mean
_______________________________
Number of clusters: 31
Number of outliers: 6/149 (4.027%)

Cohesion = 59.229678896738776
Separation = 5371.354963246561
G1 = 772.6792512924173

Entropy = 0.0
Purity = 1.0
F-measure = 0.878861

Rand = 98.769%
Jaccard Index = 75.915%
