# Clustering Evaluation Functions

This section implements the metric functions used for clustering evaluation.

Author : Maxime Fontana

### Symmetric Distance Error 

This function measures the error as the percentage of misclassified vertices w.r.t. the ground truth. It is used for the study on synthetic data (Stochastic Block Model) in the paper that is presented in the file "Reproduce Figure 2".

This error can be represented by the symmetric difference between a group (a cluster) in an 'unfair' clustered dataset and the corresponding group in a so-called fair clustered one.

This sub-set can be represented as below.

<div>
<img src="attachment:symmetric-56a8fa9f5f9b58b7d0f6ea14.jpg" width="500"/>
</div>

In [1]:
# This function measures the error as a percentage of misclassified vertices w.r.t. the
# ground truth

# !!! NEEDS OPTIMISATION FOR SCALABILITY

def error_sym(labels, fair_labels):
    """
    Smallest symmetric difference between a ground-truth group and a fair
    cluster, expressed as a percentage of the number of vertices.

    Parameters
    ----------
    labels : ARRAY
        'UNFAIR LABELS'. Only its length and its maximum value are used here.
    fair_labels : ARRAY
        'FAIR LABELS'. Cluster index assigned to each vertex.

    Returns
    -------
    FLOAT
        min over every (ground-truth group, fair cluster) pair of the size of
        their symmetric difference, times 100, divided by len(labels).
    """
    vertex_ids = np.arange(0, len(labels))

    # One set of vertex indices per fair cluster 0..max(fair_labels).
    fair_clusters = [
        {idx for idx, lab in enumerate(fair_labels) if lab == cluster}
        for cluster in range(max(fair_labels) + 1)
    ]

    # Ground truth in the same format (sets of indices).
    # NOTE(review): get_ground_truth is defined elsewhere; the two leading 5s
    # are hard-coded here -- presumably the group/cluster counts, to confirm.
    truth_groups = [
        set(get_ground_truth(5, 5, vertex_ids, group).astype(int))
        for group in range(max(labels) + 1)
    ]

    # Size of the symmetric difference for every pair of sets.
    distances = [
        len(truth.symmetric_difference(cluster))
        for truth in truth_groups
        for cluster in fair_clusters
    ]

    return (min(distances) * 100) / len(labels)

In [2]:
def error_sym35(labels, fair_labels):
    """
    Smallest symmetric difference between a pre-computed ground-truth group
    and a fair cluster, expressed as a percentage of the number of vertices.

    Parameters
    ----------
    labels : ARRAY
        'UNFAIR LABELS'. Only its length is used here.
    fair_labels : ARRAY
        'FAIR LABELS'. Cluster index assigned to each vertex.

    Returns
    -------
    FLOAT
        min over every (ground-truth group, fair cluster) pair of the size of
        their symmetric difference, times 100, divided by len(labels).
    """
    vertex_ids = np.arange(0, len(labels))

    # One set of vertex indices per fair cluster 0..max(fair_labels).
    fair_clusters = [
        {idx for idx, lab in enumerate(fair_labels) if lab == cluster}
        for cluster in range(max(fair_labels) + 1)
    ]

    # NOTE(review): get_ground_truth_35 is defined elsewhere; it is assumed to
    # return an indexable collection of sets -- confirm against its definition.
    truth_groups = get_ground_truth_35(vertex_ids)

    # Size of the symmetric difference for every pair of sets.
    distances = [
        len(truth_groups[pos].symmetric_difference(cluster))
        for pos in range(len(truth_groups))
        for cluster in fair_clusters
    ]

    return (min(distances) * 100) / len(labels)

In [4]:
def error_sym22(labels, fair_labels):
    """
    Smallest symmetric difference between a pre-computed ground-truth group
    and a fair cluster, expressed as a percentage of the number of vertices.

    Parameters
    ----------
    labels : ARRAY
        'UNFAIR LABELS'. Only its length is used here.
    fair_labels : ARRAY
        'FAIR LABELS'. Cluster index assigned to each vertex.

    Returns
    -------
    FLOAT
        min over every (ground-truth group, fair cluster) pair of the size of
        their symmetric difference, times 100, divided by len(labels).
    """
    vertex_ids = np.arange(0, len(labels))

    # One set of vertex indices per fair cluster 0..max(fair_labels).
    fair_clusters = []
    for cluster in range(max(fair_labels) + 1):
        members = {idx for idx, lab in enumerate(fair_labels) if lab == cluster}
        fair_clusters.append(members)

    # NOTE(review): get_ground_truth_22 is defined elsewhere; it is assumed to
    # return an indexable collection of sets -- confirm against its definition.
    truth_groups = get_ground_truth_22(vertex_ids)

    # Size of the symmetric difference for every pair of sets.
    distances = []
    for pos in range(len(truth_groups)):
        distances.extend(
            len(truth_groups[pos].symmetric_difference(members))
            for members in fair_clusters
        )

    return (min(distances) * 100) / len(labels)

In [5]:
def error_sym53(labels, fair_labels):
    """
    Smallest symmetric difference between a pre-computed ground-truth group
    and a fair cluster, expressed as a percentage of the number of vertices.

    Parameters
    ----------
    labels : ARRAY
        'UNFAIR LABELS'. Only its length is used here.
    fair_labels : ARRAY
        'FAIR LABELS'. Cluster index assigned to each vertex.

    Returns
    -------
    FLOAT
        min over every (ground-truth group, fair cluster) pair of the size of
        their symmetric difference, times 100, divided by len(labels).
    """
    n_vertices = len(labels)
    vertex_ids = np.arange(0, n_vertices)

    # One set of vertex indices per fair cluster 0..max(fair_labels).
    n_fair_clusters = max(fair_labels) + 1
    fair_clusters = [
        {idx for idx, lab in enumerate(fair_labels) if lab == cluster}
        for cluster in range(n_fair_clusters)
    ]

    # NOTE(review): get_ground_truth_53 is defined elsewhere; it is assumed to
    # return an indexable collection of sets -- confirm against its definition.
    truth_groups = get_ground_truth_53(vertex_ids)

    # Size of the symmetric difference for every pair of sets.
    distances = [
        len(truth_groups[pos].symmetric_difference(members))
        for pos in range(len(truth_groups))
        for members in fair_clusters
    ]

    return (min(distances) * 100) / len(labels)

In [6]:
# This function measures the error as a percentage of misclassified vertices w.r.t. the
# ground truth

def error_sym2(labels, fair_labels, h, k):
    """
    Smallest symmetric difference between a ground-truth group and a fair
    cluster, expressed as a percentage of the number of vertices.

    Parameters
    ----------
    labels : ARRAY
        'UNFAIR LABELS'. Only its length and its maximum value are used here.
    fair_labels : ARRAY
        'FAIR LABELS'. Cluster index assigned to each vertex.
    h : INTEGER
        Forwarded unchanged as the first argument of Opt_get_ground_truth.
    k : INTEGER
        Forwarded unchanged as the second argument of Opt_get_ground_truth.

    Returns
    -------
    FLOAT
        min over every (ground-truth group, fair cluster) pair of the size of
        their symmetric difference, times 100, divided by len(labels).
    """
    vertex_ids = np.arange(0, len(labels))

    # One set of vertex indices per fair cluster. The loop variable must not be
    # named h or k: the previous version reused `k` as the loop index while
    # filtering on the parameter `h`, so every set was the same.
    fair_clusters = [
        {idx for idx, lab in enumerate(fair_labels) if lab == cluster}
        for cluster in range(max(fair_labels) + 1)
    ]

    # Ground truth in the same format (sets of indices). `h` and `k` are the
    # caller's values; only the last argument varies with the loop.
    # NOTE(review): Opt_get_ground_truth is defined elsewhere; the argument
    # order mirrors the get_ground_truth(5, 5, my_range, k) call in error_sym.
    truth_groups = [
        set(Opt_get_ground_truth(h, k, vertex_ids, group).astype(int))
        for group in range(max(labels) + 1)
    ]

    # Size of the symmetric difference for every pair of sets.
    distances = [
        len(truth.symmetric_difference(cluster))
        for truth in truth_groups
        for cluster in fair_clusters
    ]

    return (min(distances) * 100) / len(labels)

In [7]:
# Compute the Ratio-Cut of the output

def ratio_cut(laplacian, matrix_H):
    """
    Trace of H^T L H.

    Parameters
    ----------
    laplacian : ARRAY
        Square matrix L (rows must match the rows of matrix_H).
    matrix_H : ARRAY
        Matrix H.

    Returns
    -------
    NUMBER
        np.trace(H^T @ L @ H).
    """
    h_transposed = np.transpose(matrix_H)
    quadratic_form = h_transposed @ laplacian @ matrix_H
    return np.trace(quadratic_form)

In [8]:
# Pre-Balance (not used)

def pre_balance(labels, fair_labels):
    """
    Average deviation between each group's overall share and its share inside
    every fair cluster (all shares are percentages). Prints intermediate values.

    Parameters
    ----------
    labels : ARRAY
        Group index of each vertex.
    fair_labels : ARRAY
        Fair-cluster index of each vertex.

    Returns
    -------
    FLOAT
        Mean over groups of the mean over fair clusters of
        |group share overall - share of the fair cluster taken by the group|.
    """
    per_group_scores = []

    for group in range(max(labels) + 1):
        group_members = {idx for idx, lab in enumerate(labels) if lab == group}
        group_share = (len(group_members) * 100) / len(labels)
        print("Initial Population : ", group_share)

        deviations = []
        for cluster in range(max(fair_labels) + 1):
            cluster_members = {
                idx for idx, lab in enumerate(fair_labels) if lab == cluster
            }
            overlap = len(group_members & cluster_members)
            print("proportion :", overlap, " on", len(cluster_members))

            # Percentage of this fair cluster that belongs to the current group.
            share_in_cluster = (overlap * 100) / len(cluster_members)
            print("Proportion in Fair Cluster :", share_in_cluster)

            deviation = abs(group_share - share_in_cluster)
            print("local score :", deviation)
            deviations.append(deviation)

        group_score = sum(deviations) / (max(fair_labels) + 1)
        print(group_score, " !!!")
        per_group_scores.append(group_score)

    return sum(per_group_scores) / len(per_group_scores)

In [9]:
# Balance
import numpy as np


# Returns the percentage (truncated to an integer) of each community in the partitioning
def prop(ground_truth):
    """
    Percentage of entries carrying each label 0..max(ground_truth).

    Parameters
    ----------
    ground_truth : ARRAY
        Label of each entry.

    Returns
    -------
    DICT
        Maps every label in range(max(ground_truth) + 1) to its percentage of
        the entries, truncated to an integer (labels that do not occur map to 0).
    """
    shares = {}
    for community in range(max(ground_truth) + 1):
        occurrences = sum(1 for lab in ground_truth if lab == community)
        # int() truncates, so the percentages may not add up to 100.
        shares[community] = int((occurrences * 100) / len(ground_truth))
    return shares
    

def balance(ground_truth, labels):
    """
    Negated average deviation between the overall community percentages and
    the community percentages found inside each cluster.

    Parameters
    ----------
    ground_truth : ARRAY
        Community index of each vertex.
    labels : ARRAY
        Cluster index assigned to each vertex.

    Returns
    -------
    FLOAT
        Minus the sum over clusters of the per-cluster deviation, divided by
        the number of clusters (max(labels) + 1). Values closer to 0 mean the
        clusters mirror the overall distribution more closely.
    """
    assignments = np.array(labels)
    overall_shares = prop(ground_truth)

    total = 0
    for cluster in range(max(assignments + 1)):
        # Communities of the vertices assigned to this cluster.
        member_ids = np.where(assignments == cluster)[0]
        cluster_shares = prop([ground_truth[vertex] for vertex in member_ids])

        # Only communities that actually occur in the cluster contribute.
        deviation = sum(
            abs(overall_shares[community] - share)
            for community, share in cluster_shares.items()
            if share != 0
        )

        # A larger deviation means a less balanced cluster, so it is
        # subtracted to make the result easier to read on graphs.
        total -= deviation

    return total / max(assignments + 1)