# Big Data | Final Project
### Mohsen Ebadpour | 400131080 | M.Ebadpour@aut.ac.ir 
#### Spring 2023 
##### An Adaptive Clustering Algorithm Based on Local-Density Peaks for Imbalanced Data Without Parameters

--- 
## Importing libraries and packages 

In [14]:

import numpy as np 
import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns 
import scipy.io
import pandas as pd 
from tqdm import tqdm 
import networkx as nx
from scipy import stats as st
from sklearn.metrics import recall_score,accuracy_score,normalized_mutual_info_score,adjusted_mutual_info_score,confusion_matrix
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN

import time

--- 
## Loading datasets and pre-processing 

In [53]:
def load_dataset(name="banana.mat"):
    """
    Loads and preprocesses the dataset from the given file name.

    The file is looked up under ``./dataset/`` and the parser is chosen from
    the file name:

    * ``*.mat``: MATLAB file read with ``scipy.io.loadmat``; features come
      from its ``data`` entry and labels from its ``label`` entry.
    * name containing ``thyroid``: header-less CSV, label in column 0.
    * name containing ``sensor``: header-less CSV, label in column 24
      (label-encoded), features in columns 0-23.
    * name containing ``ecoli``: header-less CSV, columns 0 and 6 are
      label-encoded, label is column 6, features are columns 0-5.
    * anything else: header-less CSV at ``<name>.arff.txt``, label in
      column 2 (shifted by +1), remaining columns are the features.

    Args:
        name (str): The file name of the dataset to be loaded. Defaults to "banana.mat".

    Returns:
        X (np.ndarray): The feature matrix, each column divided by its maximum.
        Y (np.ndarray): The target vector of the preprocessed dataset.
    """
    path = "./dataset/" + name

    # Test the extension rather than a bare substring so that names that merely
    # contain the letters "mat" are not sent to the MATLAB reader.
    if name.endswith(".mat"):
        file = scipy.io.loadmat(path)
        X, Y = file["data"], file["label"]

    elif "thyroid" in name:
        data = pd.read_csv(path, header=None).to_numpy()
        Y, X = data[:, 0], data[:, 1:]

    elif "sensor" in name:
        data_frame = pd.read_csv(path, header=None)
        data_frame[24] = preprocessing.LabelEncoder().fit_transform(data_frame[24])
        data = data_frame.to_numpy()
        Y, X = data[:, 24], data[:, :24]

    elif "ecoli" in name:
        data_frame = pd.read_csv(path, header=None)
        # Both the class column (6) and the textual first column (0) are encoded
        # so that the whole frame becomes numeric.
        data_frame[6] = preprocessing.LabelEncoder().fit_transform(data_frame[6])
        data_frame[0] = preprocessing.LabelEncoder().fit_transform(data_frame[0])
        data = data_frame.to_numpy()
        Y, X = data[:, 6], data[:, :6]

    else:
        data = pd.read_csv(path + ".arff.txt", header=None)
        Y = data[2].to_numpy() + 1
        X = data.drop(2, axis=1).to_numpy()

    # Normalize the feature matrix to eliminate the sensitivity to the range of distances.
    # A column whose maximum is 0 is left untouched instead of producing NaN/inf.
    column_max = X.max(axis=0)
    column_max = np.where(column_max == 0, 1, column_max)
    X = X / column_max
    return X, Y


# MATLAB / CSV benchmark sets.
x_banana, y_banana = load_dataset("banana.mat")
x_thyroid, y_thyroid = load_dataset("new-thyroid.data")
x_ids2, y_ids2 = load_dataset("ids2.mat")
x_lithuanian, y_lithuanian = load_dataset("lithuanian.mat")
x_sensor, y_sensor = load_dataset("sensor_readings_24.data")
x_guassian, y_guassian = load_dataset("gaussian.mat")
x_ecoli, y_ecoli = load_dataset("ecoli_new.data.csv")

# Sets stored as "<name>.arff.txt" files.
x_t5, y_t5 = load_dataset("cluto-t5-8k")
x_t7, y_t7 = load_dataset("cluto-t7-10k")
x_zel, y_zel = load_dataset("zelnik4")
x_zel2, y_zel2 = load_dataset("zelnik2")
x_wingnut, y_wingnut = load_dataset("wingnut")

# Display name paired with its (X, Y) tuple; the two parallel lists used by the
# reporting cells are derived from this single table so they cannot drift apart.
_named_datasets = [
    ("Ecoli", (x_ecoli, y_ecoli)),
    ("Thyroid", (x_thyroid, y_thyroid)),
    ("Banana", (x_banana, y_banana)),
    ("Robot navigation", (x_sensor, y_sensor)),
    ("Ids2", (x_ids2, y_ids2)),
    ("Lithuanian", (x_lithuanian, y_lithuanian)),
    ("cluto-t5-8k", (x_t5, y_t5)),
    ("cluto-t7-10k", (x_t7, y_t7)),
    ("Guassian", (x_guassian, y_guassian)),
    ("zelnik4", (x_zel, y_zel)),
    ("zelnik2", (x_zel2, y_zel2)),
    ("wingnut", (x_wingnut, y_wingnut)),
]
datasets = [pair for _, pair in _named_datasets]
dataset_names = [label for label, _ in _named_datasets]


---
## Implementing Algorithm 

In [54]:
def distance_measure(datapoint1, datapoint2, metric="euclidean"):
    """
    Computes the distance between two given data points based on the specified distance metric.

    Args:
        datapoint1 (np.ndarray): The first data point.
        datapoint2 (np.ndarray): The second data point.
        metric (str): The distance metric to be used. Only "euclidean" is
            implemented. Defaults to "euclidean".

    Returns:
        dist (float): The distance between the two data points based on the specified distance metric.

    Raises:
        ValueError: If ``metric`` is not a supported metric name.
    """
    if metric == "euclidean":
        return np.linalg.norm(datapoint1 - datapoint2)

    # An explicit exception (rather than an assert on a sentinel value) keeps the
    # check alive when Python runs with assertions disabled (-O).
    raise ValueError("Unsupported distance metric: {0!r}".format(metric))

In [55]:
def calculate_dc(dataset):
    """
    Builds the pairwise distance matrix and derives the cut-off distance dc.

    dc is the largest nearest-neighbour distance found in the dataset, i.e. the
    smallest radius for which every point has at least one other point inside it.

    Args:
        dataset (Tuple[np.ndarray, np.ndarray]): The input dataset as a tuple of feature matrix (X) and target vector (Y).

    Returns:
        dc (float): The threshold distance (dc) for the input dataset.
        distance_mat (np.ndarray): The distance matrix for the input dataset (zero diagonal).
    """
    features, _ = dataset
    n_points = features.shape[0]
    distance_mat = np.zeros((n_points, n_points))
    dc = -np.inf

    for row in tqdm(range(n_points)):
        closest = np.inf
        for col in range(n_points):
            if row != col:
                gap = distance_measure(features[row], features[col])
                distance_mat[row, col] = gap
                # Track the distance from this point to its nearest neighbour.
                if gap < closest:
                    closest = gap

        # dc is the maximum over all points of the nearest-neighbour distance.
        if closest > dc:
            dc = closest

    return dc, distance_mat



In [56]:
def calculate_density(dataset,dc,dis_mat):
    """
    Computes a Gaussian-kernel local density for every data point.

    For point i the density is the sum of exp(-(d_ij / dc)^2) over all other
    points j whose distance d_ij does not exceed dc.

    Args:
        dataset (Tuple[np.ndarray, np.ndarray]): The input dataset as a tuple of feature matrix (X) and target vector (Y).
        dc (float): The threshold distance (dc) for the input dataset.
        dis_mat (np.ndarray): The distance matrix for the input dataset.

    Returns:
        densities (List[float]): The density for each data point in the input dataset.
    """
    points, _ = dataset
    n_points = points.shape[0]
    densities = []

    for i in range(n_points):
        kernel_sum = 0
        for j in range(n_points):
            # Only other points inside the dc-ball contribute.
            if j != i and dis_mat[i, j] <= dc:
                kernel_sum += np.exp(-(dis_mat[i, j] / dc) ** 2)
        densities.append(kernel_sum)

    return densities



In [57]:
def calculate_upward_distance(densities,dis_mat):
    """
    Computes the upward distance for each data point in the input dataset.

    The upward distance of a point is the distance to its nearest neighbour of
    strictly higher density. A point that has no strictly denser neighbour
    (the density peak, or several points tied for the highest density) gets
    the largest distance to any other point instead, so the result is always
    finite for datasets with at least two points.

    Args:
        densities (List[float]): The density for each data point in the input dataset.
        dis_mat (np.ndarray): The distance matrix for the input dataset.

    Returns:
        up_dis (List[float]): The upward distance for each data point in the input dataset.
    """
    density_arr = np.asarray(densities)
    dis_mat = np.asarray(dis_mat)
    up_dis = []

    for i in range(density_arr.shape[0]):
        row = dis_mat[i]
        # Strict comparison: the point itself and equally dense points are excluded.
        denser = density_arr > density_arr[i]

        if denser.any():
            up_dis.append(row[denser].min())
        else:
            # No denser point exists. Falling back to the maximum distance (rather
            # than +inf) keeps the mean/std thresholds computed downstream finite
            # even when several points share the top density.
            others = np.delete(row, i)
            up_dis.append(others.max() if others.size else -np.inf)

    return up_dis
        

In [58]:
def calculate_neighbors(dis_mat,K=None):
    """
    Computes the K-nearest neighbors for each data point in the input dataset.

    A point is never reported as its own neighbour, including when the dataset
    contains duplicate points (zero distance to another point).

    Args:
        dis_mat (np.ndarray): The distance matrix for the input dataset.
        K (Optional[int]): The number of nearest neighbors to consider. If None, defaults to the square root of the number of data points.

    Returns:
        neighbors (List[np.ndarray]): The K-nearest neighbors for each data point in the input dataset.
        reverse_neighbors (List[List[int]]): The reverse nearest neighbors for each data point in the input dataset.
    """
    n_points = dis_mat.shape[0]
    if K is None:
        K = int(np.sqrt(n_points))

    neighbors = []
    reverse_neighbors = [[] for _ in range(n_points)]

    for i in range(n_points):
        order = np.argsort(dis_mat[i])
        # Drop the point itself explicitly instead of assuming it sorts first:
        # with duplicate points another index can take position 0.
        i_neighbors = order[order != i][:K]
        neighbors.append(i_neighbors)

        # i has j among its K nearest neighbours, so i is a reverse neighbour of j.
        for j in i_neighbors:
            reverse_neighbors[j].append(i)

    return neighbors, reverse_neighbors



In [59]:
def determine_noise_and_init(up_dis,densities,reverse_neighbors,dis_mat):
    """
    Flags noise points, picks the initial sub-cluster centers and labels every other point.

    Labels used in the returned array: 0 = noise, 1..ICC = sub-cluster ids.
    A point is noise when its upward distance is above mean+std while both its
    reverse-neighbour count and its density are below mean-std. Every remaining
    point with an upward distance above mean+std becomes a center. All other
    points inherit the label of their nearest neighbour of higher density.

    Args:
        up_dis (List[float]): The upward distances for each data point in the input dataset.
        densities (List[float]): The density for each data point in the input dataset.
        reverse_neighbors (List[List[int]]): The reverse nearest neighbors for each data point in the input dataset.
        dis_mat (np.ndarray): The distance matrix for the input dataset.

    Returns:
        clusters (np.ndarray): The cluster labels for each data point in the input dataset.
        sub_clusters_init (List[int]): The indices of the initial cluster centers.
        ICC (int): The number of initial clusters.
        noise_count (int): The number of noise points.

    Raises:
        ValueError: If a point's nearest denser neighbour has not been labelled yet.
    """
    up_arr = np.asarray(up_dis)
    density_arr = np.asarray(densities)
    rnn_arr = np.array([len(members) for members in reverse_neighbors])

    # Thresholds: one standard deviation above / below the mean.
    up_high = np.mean(up_arr) + np.std(up_arr)
    density_low = np.mean(density_arr) - np.std(density_arr)
    rnn_low = np.mean(rnn_arr) - np.std(rnn_arr)

    clusters = np.full(rnn_arr.shape[0], -1.0)

    # Noise: far from anything denser, rarely chosen as a neighbour, and sparse.
    far_away = up_arr > up_high
    noise_mask = far_away & (rnn_arr < rnn_low) & (density_arr < density_low)
    clusters[noise_mask] = 0
    noise_count = int(np.count_nonzero(noise_mask))

    # Centers: the remaining far-away points, numbered 1, 2, ... in index order.
    sub_clusters_init = np.flatnonzero(far_away & ~noise_mask).tolist()
    ICC = len(sub_clusters_init)
    clusters[sub_clusters_init] = np.arange(1, ICC + 1)

    # Visit the unlabelled points from densest to sparsest so that the denser
    # neighbour they copy from has normally been labelled already.
    for point in np.argsort(-density_arr):
        if clusters[point] != -1:
            continue

        for candidate in np.argsort(dis_mat[point])[1:]:
            if density_arr[candidate] > density_arr[point]:
                inherited = clusters[candidate]
                if inherited == -1:
                    raise ValueError("Non-Cluster label selected!")
                clusters[point] = inherited
                break

    return clusters, sub_clusters_init, ICC, noise_count



In [None]:
def update_clusters(densities,clusters,dis_mat,sub_clusters_init,dc,ICC,improve=False):
    """
    Drops false-positive sub-clusters and re-labels the points that became unassigned.

    A sub-cluster is dropped when it has fewer members than half the number of
    points lying within distance dc of its center. ``clusters`` is modified in
    place and also returned.

    Args:
        densities (List[float]): The density for each data point in the input dataset.
        clusters (np.ndarray): The cluster labels for each data point in the input dataset.
        dis_mat (np.ndarray): The distance matrix for the input dataset.
        sub_clusters_init (List[int]): The indices of the initial cluster centers.
        dc (float): The threshold distance for defining a cluster.
        ICC (int): The number of initial clusters.
        improve (bool): If True, members of a dropped sub-cluster are marked as
            noise (0) and are not re-assigned here; if False they are marked
            unassigned (-1) and re-assigned below. Defaults to False.

    Returns:
        clusters (np.ndarray): The updated cluster labels for each data point in the input dataset.
        ICC (int): The updated number of clusters.
        removed_sub_clusters (List[int]): The indices of the removed sub-clusters.
    """
    dropped_label = 0 if improve else -1
    removed_sub_clusters = []

    for center in sub_clusters_init:
        member_mask = clusters == clusters[center]
        cluster_size = np.count_nonzero(member_mask)
        points_within_dc = np.count_nonzero(dis_mat[center] <= dc)

        if cluster_size < 0.5 * points_within_dc:
            clusters[member_mask] = dropped_label
            ICC -= 1
            removed_sub_clusters.append(center)

    # Unassigned points take the label of the closest denser point that already
    # has a label; denser-but-unassigned neighbours are skipped.
    density_arr = np.array(densities)
    for point in np.argsort(density_arr * (-1)):
        if clusters[point] != -1:
            continue
        for candidate in np.argsort(dis_mat[point])[1:]:
            if density_arr[candidate] > density_arr[point] and clusters[candidate] != -1:
                clusters[point] = clusters[candidate]
                break

    return clusters, ICC, removed_sub_clusters

In [60]:
def clusters_distance(dis_mat,clusters_m_members,cluster_n_members):
    """
    Finds the closest pair of points between two clusters (single-linkage distance).

    Args:
        dis_mat (np.ndarray): The distance matrix for the input dataset.
        clusters_m_members (np.ndarray): The indices of the data points in cluster m.
        cluster_n_members (np.ndarray): The indices of the data points in cluster n.

    Returns:
        Tuple[float, int, int]: The smallest distance between the two clusters,
        followed by the dataset index of the cluster-m point and of the
        cluster-n point that realise it (first such pair in row-major order).
    """
    # Rows = members of m, columns = members of n.
    cross_distances = dis_mat[np.ix_(clusters_m_members, cluster_n_members)]
    row, col = np.unravel_index(np.argmin(cross_distances), cross_distances.shape)
    return cross_distances[row, col], clusters_m_members[row], cluster_n_members[col]


In [61]:
def get_In_set(_densities,cluster_members):
    """
    Computes the set of inner points in a cluster.

    A member is an inner point when its density is not below the mean density
    of the cluster; members below the mean are treated as boundary points.

    Args:
        _densities (np.ndarray): The density for each data point in the input dataset.
        cluster_members (np.ndarray): The indices of the data points in the cluster.

    Returns:
        np.ndarray: The indices of the inner points in the cluster, in the
        order they appear in ``cluster_members``.
    """
    _densities = np.asarray(_densities)
    cluster_members = np.asarray(cluster_members)

    member_densities = _densities[cluster_members]
    avg_den = np.mean(member_densities)

    # One boolean mask replaces the list-membership filter, which was quadratic
    # in the cluster size. "not below" is kept as the selection rule.
    return cluster_members[~(member_densities < avg_den)]




In [62]:
def update_merge_labels(pairs,i,j):
    """
    Records an undirected "merge" link between two cluster labels.

    The dictionary acts as an adjacency list: each label maps to the list of
    labels it must be merged with. It is updated in place and returned.

    Args:
        pairs (Dict[int, List[int]]): The merge labels dictionary.
        i (int): The label of the first cluster to be merged.
        j (int): The label of the second cluster to be merged.

    Returns:
        Dict[int, List[int]]: The updated merge labels dictionary.
    """
    first, second = int(i), int(j)
    # Add the link in both directions, creating the entry on first use.
    pairs.setdefault(first, []).append(second)
    pairs.setdefault(second, []).append(first)
    return pairs


In [None]:
def merging_clusters(dc,dis_mat,densities,clusters,sub_clusters_init,improve=False,dataset=None):
    """
    Merge neighbouring sub-clusters and assign the noise points.

    Two sub-clusters are linked when their closest pair of points is nearer
    than the merging radius ``r`` and either both points are inner points of
    their sub-cluster, or the sum of their densities exceeds the mean density
    of the two sub-cluster centers. Linked sub-clusters form a graph; every
    connected component becomes one cluster that carries the smallest label of
    the component.

    Label conventions on input: 0 = noise, -1 = unassigned, positive = sub-cluster.
    The returned labels are shifted by -1.

    Parameters:
    dc (float): Threshold distance for merging clusters.
    dis_mat (array): Distance matrix between points.
    densities (array): Density of each point.
    clusters (array): Cluster label of each point (modified in place while merging).
    sub_clusters_init (array): Initial centers of subclusters.
    improve (bool): If True, the noise points are clustered with DBSCAN first:
        points DBSCAN keeps in a cluster stay candidates for re-assignment,
        points DBSCAN rejects are marked -1. Requires ``dataset``.
    dataset (tuple): The (X, Y) tuple; only read when ``improve`` is True.

    Returns:
    clusters (array): Updated cluster labels for each point, shifted by -1.
    new_labels (list): List of merged subclusters (one list of labels per component).
    merge_pairs (dict): Adjacency list of the sub-cluster links that were found.

    Raises:
    ValueError: If ``improve`` is True and ``dataset`` is not given.
    """
    if improve and dataset is None:
        raise ValueError("dataset must be provided when improve=True")

    # Map each sub-cluster label to the index of its center point.
    # NOTE(review): a removed center that was re-assigned to another sub-cluster
    # can overwrite that sub-cluster's real center here - confirm whether
    # removed centers should be filtered out by the caller.
    label_centers = {}
    for center in sub_clusters_init:
        label_centers[clusters[center]] = center

    # Calculate the radius of merging: dc when there is no noise, otherwise the
    # largest nearest-neighbour distance among the non-noise points.
    noise_points = np.where(clusters == 0)[0]
    if noise_points.shape[0] == 0:
        r = dc
    else:
        _dis_mat = dis_mat.copy()
        np.fill_diagonal(_dis_mat, np.inf)
        _dis_mat = np.delete(_dis_mat, noise_points, 0)
        _dis_mat = np.delete(_dis_mat, noise_points, 1)
        r = np.max(np.min(_dis_mat, axis=0))

    # Only real sub-clusters take part in merging; the sentinels 0 (noise) and
    # -1 (unassigned) are not clusters and must not be linked to anything.
    clusters_labels = np.unique(clusters)
    clusters_labels = clusters_labels[clusters_labels > 0]
    densities = np.array(densities)

    # Members and inner points do not change during the pair loop: compute them once.
    members = [np.where(clusters == label)[0] for label in clusters_labels]
    inner_sets = [get_In_set(densities, member_idx) for member_idx in members]

    # Initialize merge_pairs dictionary
    merge_pairs = {}

    # Loop through all possible cluster pairs
    n_labels = clusters_labels.shape[0]
    for cluster_m in range(n_labels - 1):
        density_m_center = densities[label_centers[clusters_labels[cluster_m]]]

        for cluster_n in range(cluster_m + 1, n_labels):
            density_n_center = densities[label_centers[clusters_labels[cluster_n]]]
            n_m_distnace, i, j = clusters_distance(dis_mat, members[cluster_m], members[cluster_n])

            # Merge the clusters if the conditions are met
            if n_m_distnace < r:
                both_inner = i in inner_sets[cluster_m] and j in inner_sets[cluster_n]
                if both_inner or densities[i] + densities[j] > np.mean([density_m_center, density_n_center]):
                    merge_pairs = update_merge_labels(merge_pairs, clusters_labels[cluster_m], clusters_labels[cluster_n])

    # Construct a graph and get the connected components
    graph = nx.Graph(merge_pairs)
    new_labels = [list(component) for component in nx.connected_components(graph)]

    # Update cluster labels for each point. The masks are taken from a snapshot
    # of the labels and each component keeps its own smallest label, so a new
    # label can never collide with a sub-cluster that was not merged or with a
    # component that has not been processed yet.
    labels_before_merge = clusters.copy()
    for component in new_labels:
        clusters[np.isin(labels_before_merge, component)] = min(component)

    if improve:
        x, _ = dataset
        noise_index = np.where(clusters == 0)[0]
        # DBSCAN cannot be fitted on an empty sample; nothing to do without noise.
        if noise_index.shape[0] > 0:
            clusters_members_count = sorted(
                np.count_nonzero(clusters == label) for label in np.unique(clusters)
            )
            # min_samples: half the mean size of the smallest three quarters of the groups.
            _min = int(np.mean(clusters_members_count[:len(clusters_members_count)*3//4])/2)+1

            clustering_db = DBSCAN(eps=dc, min_samples=_min).fit(x[noise_index])
            noise_labels = clustering_db.labels_

            clusters[noise_index[noise_labels != -1]] = 0
            clusters[noise_index[noise_labels == -1]] = -1

    # Assign noisy points to the nearest cluster
    for i in np.argsort(-densities):
        if clusters[i] != 0:
            continue

        for neighbor in np.argsort(dis_mat[i])[1:]:
            if densities[neighbor] > densities[i]:
                clusters[i] = clusters[neighbor]

                # Without the improvement, keep searching past unassigned neighbours.
                if clusters[i] == -1 and not(improve):
                    continue
                break

    clusters = clusters - 1

    return clusters, new_labels, merge_pairs



In [None]:
def prediction_report(clusters,true_labels,improve=False,densities=None,up_dis=None):
    """
    Maps every cluster to a ground-truth class and returns the evaluation metrics.

    Without ``improve`` a cluster is mapped to the most frequent true label of
    its members. With ``improve`` the true labels of the members are weighted by
    their density (divided by the cluster's maximum density) and the cluster is
    mapped to the true label closest to the mean of those weighted values.

    Parameters:
    clusters (array): Cluster labels for each point.
    true_labels (array): Ground truth labels for each point.
    improve (bool): Use the density-weighted mapping instead of the mode.
    densities (array): Density of each point; required when ``improve`` is True.
    up_dis (array): Accepted for interface compatibility; not used by the mapping.

    Returns:
    acc (float): Accuracy score.
    recall (float): Recall score (macro average).
    nmi (float): Normalized mutual information score.
    cm (array): Confusion matrix.
    clusters (array): Predicted class label for each point.
    prediction_rule (dict): Mapping of negated cluster labels to predicted class labels.

    Raises:
    ValueError: If ``improve`` is True and ``densities`` is not given.
    """
    clusters = np.asarray(clusters)
    true_labels = np.asarray(true_labels)

    if improve:
        if densities is None:
            raise ValueError("densities must be provided when improve=True")
        densities = np.array(densities)
        orginal_labels = np.unique(true_labels)

    prediction_rule = {}
    for label in np.unique(clusters):
        member_mask = clusters == label
        if improve:
            new_den = densities[member_mask]
            peak = new_den.max(axis=0)
            # Normalise by the cluster's peak density; an all-zero cluster is
            # left un-normalised so that no division by zero happens.
            if peak != 0:
                new_den = new_den / peak

            weighted_mean = np.mean(true_labels[member_mask] * new_den)
            new_label_index = np.argmin(np.abs(orginal_labels - weighted_mean))
            prediction_rule[-label] = orginal_labels[new_label_index]

        else:
            # Find the mode of true_labels for each cluster
            _mode = st.mode(true_labels[member_mask], keepdims=False)
            prediction_rule[-label] = int(_mode[0])

    # Update cluster labels using prediction_rule. The masks are evaluated on
    # the untouched negated labels and the result is written to a separate
    # array, so a predicted class value that happens to equal another cluster
    # id cannot be remapped a second time.
    negated = clusters * (-1)
    predicted = negated.copy()
    for label in np.unique(negated):
        predicted[negated == label] = prediction_rule[label]
    clusters = predicted

    # Calculate evaluation metrics using updated labels
    acc = accuracy_score(true_labels, clusters)
    recall = recall_score(true_labels, clusters, average='macro')
    nmi = normalized_mutual_info_score(true_labels, clusters)
    cm = confusion_matrix(true_labels, clusters)
    return acc, recall, nmi, cm, clusters, prediction_rule
        


In [63]:
def report_results(name,dataset,new_clusters,densities,up_dis,reverse_neighbors,clusters,Noise,K_acc,cm,K_recall,K_nmi,K_time):
        """
        Draws a 3x3 summary figure for one dataset, saves it as ``name + ".png"`` and shows it.

        Panels, in order: original labels, decision graph (density vs. upward
        distance), initial sub-clusters, final clustering result, and five curves
        over K (noise count, run-time, accuracy, recall, NMI).

        Args:
            name (str): Dataset name; used in the titles and as the output file name.
            dataset (Tuple[np.ndarray, np.ndarray]): Feature matrix and labels. Features
                with a column count other than 2 are projected to 2 PCA components for plotting.
            new_clusters (np.ndarray): Final label per point (hue of the result panel).
            densities: Density per point.
            up_dis: Upward distance per point.
            reverse_neighbors (List[List[int]]): Reverse neighbours per point; only their counts are stored.
            clusters (np.ndarray): Sub-cluster label per point (hue of the third panel).
            Noise, K_acc, K_recall, K_nmi, K_time (list): One value per tested K.
            cm: Not used inside this function.

        NOTE(review): the K axis is read from the module-level name ``Ks``, which is
        not a parameter; the caller must define it before calling.
        """
        _name = name + " | "
        plt.rcParams["figure.figsize"] = 16,16
        sns.set_style("whitegrid")
        x,y = dataset
        f1,f2 = "Feature-1","Feature-2"
        # Scatter plots are 2-D: reduce any other dimensionality with PCA.
        if x.shape[1] != 2 :
                pca = PCA(n_components=2)
                x = pca.fit_transform(x)
                f1,f2 = "Components-1(PCA)","Components-2(PCA)"
                
        # Column-oriented table consumed by the seaborn scatter plots below.
        data = {f1:x.T[0],f2:x.T[1],"Label":y.flatten(),
                "Clustering Result":new_clusters,"Densities":densities,"upward distances":up_dis,
                "RNN":list(map(len,reverse_neighbors)), "Initialized sub-clusters":clusters}

        # H x W grid; I is the 1-based index of the next panel.
        H,W,I = 3,3,1
        plt.subplot(H,W,I)
        I+=1
        sns.scatterplot(data=data,x=f1,y=f2,hue="Label")
        plt.title(_name+"Orginal Dataset | Count of labels:{0}".format(np.unique(y.flatten()).shape[0]))

        plt.subplot(H,W,I)
        I+=1
        sns.scatterplot(data=data,x="Densities",y="upward distances",hue="Label")
        plt.title(_name+"2D Decision Graph")

        plt.subplot(H,W,I)
        I+=1
        sns.scatterplot(data=data,x=f1,y=f2,hue="Initialized sub-clusters")
        plt.title("Initialized sub-clusters after updating")

        plt.subplot(H,W,I)
        I+=1
        sns.scatterplot(data=data,x=f1,y=f2,hue="Clustering Result")
        plt.title("Clustering Result | Count of Clusters:{0}".format(np.unique(new_clusters.flatten()).shape[0]))

        # The remaining panels plot one metric each against the tested K values.
        plt.subplot(H,W,I)
        I+=1
        plt.plot(Ks,Noise,color="black")
        plt.title(_name+"Counts of noisy points in NN calculating")
        plt.xlabel("K")
        plt.ylabel("# Noisy points")
        
        plt.subplot(H,W,I)
        I+=1
        plt.plot(Ks,K_time,color="orange")
        plt.title(_name+"Run-Time in sec. in NN calculating")
        plt.xlabel("K")
        plt.ylabel("Run-Time in sec.")

        plt.subplot(H,W,I)
        I+=1
        plt.plot(Ks,K_acc,color="green")
        plt.title(_name+"Accuracy in NN calculating | max:{0}".format(round(max(K_acc)*100,2)))
        plt.xlabel("K")
        plt.ylabel("Accuracy")

        plt.subplot(H,W,I)
        I+=1
        plt.plot(Ks,K_recall,color="blue")
        plt.title(_name+"Recall in NN calculating | max:{0}".format(round(max(K_recall)*100,2)))
        plt.xlabel("K")
        plt.ylabel("Recall")


        plt.subplot(H,W,I)
        I+=1
        plt.plot(Ks,K_nmi,color="magenta")
        plt.title(_name+"NMI in NN calculating | max:{0}".format(round(max(K_nmi)*100,2)))
        plt.xlabel("K")
        plt.ylabel("NMI")


        plt.legend()
        plt.tight_layout()
        plt.savefig(name+".png")
        plt.show()
        

---
## Reporting

In [None]:
# Run the whole pipeline once on the last registered dataset (default K) and
# draw the graph of sub-clusters that were linked for merging.
dataset = datasets[-1]
name = dataset_names[-1]
K_acc,K_recall,K_nmi,Noise = [],[],[],[]
dc, dis_mat = calculate_dc(dataset)
densities = calculate_density(dataset,dc,dis_mat.copy())
up_dis = calculate_upward_distance(densities.copy(),dis_mat.copy())

plt.rcParams["figure.figsize"] = 4,4
neighbors,reverse_neighbors = calculate_neighbors(dis_mat.copy())
clusters,sub_clusters_init,ICC,noise_count = determine_noise_and_init(up_dis.copy(),densities.copy(),reverse_neighbors.copy(),dis_mat.copy())
clusters,ICC,removed_sub_clusters = update_clusters(densities.copy(),clusters.copy(),dis_mat.copy(),sub_clusters_init.copy(),dc,ICC)
new_clusters,new_labels,mp = merging_clusters(dc,dis_mat.copy(),densities.copy(),clusters.copy(),sub_clusters_init.copy())

# Nodes are sub-cluster labels, edges are the merge links returned above.
graph = nx.Graph(mp)
nx.draw_spring(graph,with_labels=True,node_size=700,node_color="cyan",node_shape="*",
               font_size=9,alpha=0.85,linewidths=0.75,edge_color="green",style="--")
# Title follows the dataset actually processed instead of a hard-coded name.
plt.title("Merging Sub-Clusters Graph | " + name)
plt.show()

In [None]:
# For every dataset: sweep K (neighbourhood size used for the reverse nearest
# neighbours), keep the metrics of each run, then print and plot the run with
# the highest accuracy.
for d_index in range(len(datasets)):
    s_time = time.time()
    dataset = datasets[d_index]
    name = dataset_names[d_index]
    K_acc,K_recall,K_nmi,Noise = [],[],[],[]
    dc, dis_mat = calculate_dc(dataset)
    densities = calculate_density(dataset,dc,dis_mat.copy())
    up_dis = calculate_upward_distance(densities.copy(),dis_mat.copy())
    Ks,K_cm = [], []
    K_time = []
    # Time of the K-independent preprocessing; added to every per-K timing.
    pure_time = time.time() - s_time
    densities_,sub_clusters_init_,up_dis_, new_clusters_,rnn_,prediction_rule_ = [],[],[],[],[],[]

    # K ranges from 0.1*sqrt(n)+1 to 1.9*sqrt(n)+1 (exclusive).
    for k in tqdm(range(int(np.sqrt(dataset[0].shape[0])*.1+1),int(np.sqrt(dataset[0].shape[0])*1.9+1))):
        s_time = time.time()
        neighbors,reverse_neighbors = calculate_neighbors(dis_mat.copy(),k)
        clusters,sub_clusters_init,ICC,noise_count = determine_noise_and_init(up_dis.copy(),densities.copy(),reverse_neighbors.copy(),dis_mat.copy())
        clusters,ICC,removed_sub_clusters = update_clusters(densities.copy(),clusters.copy(),dis_mat.copy(),sub_clusters_init.copy(),dc,ICC)
        new_clusters,new_labels,_ = merging_clusters(dc,dis_mat.copy(),densities.copy(),clusters.copy(),sub_clusters_init.copy())
        x,y = dataset
        acc,recall,nmi,cm,new_clusters,prediction_rule = prediction_report(new_clusters.copy(),y.flatten())
        e_time = time.time() - s_time

        Ks.append(k)
        K_acc.append(acc)
        K_recall.append(recall)
        K_nmi.append(nmi)
        K_cm.append(cm)
        Noise.append(noise_count)

        densities_.append(densities)
        # Stores the per-point sub-cluster labels of this run (not the center indices).
        sub_clusters_init_.append(clusters)
        up_dis_.append(up_dis)
        new_clusters_.append(new_clusters)
        rnn_.append(reverse_neighbors)
        prediction_rule_.append(prediction_rule)
        K_time.append(e_time+pure_time)

    # Select the best run first, so that both the printed cluster count and the
    # figure describe that run rather than the last K of the sweep.
    i = K_acc.index(max(K_acc))
    densities,sub_clusters_init,up_dis, new_clusters,rnn,prediction_rule,cm = densities_[i],sub_clusters_init_[i],up_dis_[i], new_clusters_[i],rnn_[i],prediction_rule_[i],K_cm[i]
    print(name,",",K_acc[i],",",K_recall[i],",",K_nmi[i],",",Noise[i],",",K_time[i],",",np.unique(new_clusters.flatten()).shape[0],",",np.unique(y.flatten()).shape[0],",\n",K_cm[i].tolist())
    report_results(name,dataset,new_clusters,densities,up_dis,rnn,sub_clusters_init,Noise,K_acc,cm,K_recall,K_nmi,K_time)
    
    

In [41]:
# Hard-coded result rows, one per dataset. Column order, as used by the
# DataFrame built from this list: dataset name, accuracy, recall, NMI,
# number of noise points, run-time in seconds, number of clusters, number of labels.
# NOTE(review): presumably copied from the printed output of the K sweep - confirm.
_data = [
    ["Thyroid" , 0.7302325581395349 , 0.41111111111111115 , 0.14035025233770546 , 2 , 0.3879883289337158 , 2 , 3 ],
    ["Guassian" , 0.9895 , 0.9925708479938904 , 0.9370663518366252 , 54 , 33.21497678756714 , 4 , 4 ],
    ["Ids2" , 0.9959375 , 0.9983000000000001 , 0.9778326514951051 , 43 , 74.87539029121399 , 5 , 5],
    ["Banana" , 1.0 , 1.0 , 1.0 , 16 , 39.93293499946594 , 2 , 2 ],
    ["Lithuanian" , 1.0 , 1.0 , 1.0 , 2 , 42.26874780654907 , 2 , 2 ]
]

In [42]:
# Tabulate the hard-coded results and round the float metrics to 5 decimals.
df = pd.DataFrame(
    _data,
    columns=["Dataset","Accuracy","Recall","NMI","# Noises","Run-Time sec.","# Clusters","# Labels"],
)
for metric_column in ("Accuracy", "Recall", "NMI", "Run-Time sec."):
    df[metric_column] = np.array(np.round(df[metric_column], decimals=5))
df

Unnamed: 0,Dataset,Accuracy,Recall,NMI,# Noises,Run-Time sec.,# Clusters,# Labels
0,Thyroid,0.73023,0.41111,0.14035,2,0.38799,2,3
1,Guassian,0.9895,0.99257,0.93707,54,33.21498,4,4
2,Ids2,0.99594,0.9983,0.97783,43,74.87539,5,5
3,Banana,1.0,1.0,1.0,16,39.93293,2,2
4,Lithuanian,1.0,1.0,1.0,2,42.26875,2,2


--- 
## Testing the improved version

In [91]:
# Compare the baseline pipeline with the "improved" variant on the selected
# datasets; each run contributes one row to data_for_tabel.
data_for_tabel = []
for d_index, (name, dataset) in enumerate(zip(dataset_names, datasets)):
    # Only datasets whose name contains "hyroid" are processed in this cell;
    # the robot-navigation set is always skipped.
    if "Robot" in name or "hyroid" not in name:
        continue
    print(name)

    # These three quantities do not depend on the variant: compute them once.
    dc, dis_mat = calculate_dc(dataset)
    densities = calculate_density(dataset, dc, dis_mat.copy())
    up_dis = calculate_upward_distance(densities.copy(), dis_mat.copy())

    for improved_v in (False, True):
        neighbors, reverse_neighbors = calculate_neighbors(dis_mat.copy())
        clusters, sub_clusters_init, ICC, noise_count = determine_noise_and_init(
            up_dis.copy(), densities.copy(), reverse_neighbors.copy(), dis_mat.copy()
        )
        clusters, ICC, removed_sub_clusters = update_clusters(
            densities.copy(), clusters.copy(), dis_mat.copy(), sub_clusters_init.copy(), dc, ICC, improved_v
        )
        new_clusters, new_labels, _ = merging_clusters(
            dc, dis_mat.copy(), densities.copy(), clusters.copy(), sub_clusters_init.copy(), improved_v, dataset
        )
        x, y = dataset
        acc, recall, nmi, cm, new_clusters, prediction_rule = prediction_report(new_clusters.copy(), y.flatten())

        n_found_clusters = np.unique(new_clusters.flatten()).shape[0]
        n_true_labels = np.unique(y.flatten()).shape[0]
        data_for_tabel.append([name, acc, recall, nmi, noise_count, n_found_clusters, n_true_labels, improved_v])

Thyroid


100%|██████████| 215/215 [00:00<00:00, 681.54it/s]


In [103]:
# Hard-coded comparison rows, two per dataset (baseline, then improved).
# Column order, as used by the DataFrame built from this list: dataset name,
# accuracy, recall, NMI, number of noise points, number of clusters,
# number of labels, improved-version flag.
# NOTE(review): presumably collected from runs of the comparison loop - confirm.
_data = [
 ['zelnik2',0.4158415841584158,0.40880503144654085,0.13052873776644158,12,2,3,False],
 ['zelnik2',0.6204620462046204,0.6037735849056604,0.5696819704451567,12,2,3,True],
 ['zelnik4', 0.6028938906752411, 0.6, 0.6264247616951141, 36, 3, 5, False],
 ['zelnik4',0.7491961414790996,0.7391304347826086,0.7981916278116126,36,4,5,True],
 ['Guassian', 0.9895, 0.9925708479938904, 0.9370663518366252, 53, 4, 4, False],
 ['Guassian', 0.9735, 0.9495750897742531, 0.8687188780003052, 53, 4, 4, True],
 ['cluto-t5-8k',0.3935,0.383812520628441,0.39979451986024084,242,4,7,False],
 ['cluto-t5-8k',0.7945,0.7841950588633025,0.828358769688396,242,6,7,True],
 ['Lithuanian', 1.0, 1.0, 1.0, 2, 2, 2, False],
 ['Lithuanian', 1.0, 1.0, 1.0, 2, 2, 2, True],
 ['Ids2', 0.9959375, 0.9983000000000001, 0.9778326514951051, 72, 5, 5, False],
 ['Ids2', 0.9346875, 0.7944000000000001, 0.8823302434257392, 72, 4, 5, True],
 ['Thyroid', 0.7302325581395349 , 0.41111111111111115 , 0.14035025233770546, 10, 2, 3, False],
 ['Thyroid', 0.7906976744186046,0.5555555555555555,0.3786734240581399,10,2,3,True],
 ['Banana', 1.0, 1.0, 1.0, 12, 2, 2, False],
 ['Banana', 0.9983333333333333, 0.999, 0.9763520059077306, 12, 2, 2, True]
 ]

In [104]:
# Tabulate the baseline-vs-improved rows and round the float metrics to 5 decimals.
df = pd.DataFrame(
    _data,
    columns=["Dataset","Accuracy","Recall","NMI","# Noises","# Clusters","# Labels","Improved Version"],
)
for metric_column in ("Accuracy", "Recall", "NMI"):
    df[metric_column] = np.array(np.round(df[metric_column], decimals=5))
df

Unnamed: 0,Dataset,Accuracy,Recall,NMI,# Noises,# Clusters,# Labels,Improved Version
0,zelnik2,0.41584,0.40881,0.13053,12,2,3,False
1,zelnik2,0.62046,0.60377,0.56968,12,2,3,True
2,zelnik4,0.60289,0.6,0.62642,36,3,5,False
3,zelnik4,0.7492,0.73913,0.79819,36,4,5,True
4,Guassian,0.9895,0.99257,0.93707,53,4,4,False
5,Guassian,0.9735,0.94958,0.86872,53,4,4,True
6,cluto-t5-8k,0.3935,0.38381,0.39979,242,4,7,False
7,cluto-t5-8k,0.7945,0.7842,0.82836,242,6,7,True
8,Lithuanian,1.0,1.0,1.0,2,2,2,False
9,Lithuanian,1.0,1.0,1.0,2,2,2,True
