In [15]:
import pandas as pd
from sklearn.cluster import KMeans, BisectingKMeans, MiniBatchKMeans
from sklearn.mixture import GaussianMixture,BayesianGaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import datetime
import os
from collections import Counter
from scipy.spatial.distance import pdist, squareform

# import warnings
# warnings.filterwarnings("ignore", message="The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning")

In [16]:
class Clustering:
    """Cluster the feature columns of a CSV file and pick a preferred algorithm.

    The CSV is read once at construction time. Columns are selected by
    position: columns 3, 4, 7, 8 and 9 are the clustering features and
    columns 0, 1 and 2 are kept aside as ``xyz_data``; ``run()`` adds a
    ``'Cluster'`` column to that frame.

    Parameters
    ----------
    filename : str
        Path of the CSV file to load.
    n_clusters : int, default 5
        Number of clusters requested from each algorithm.
    random_state : int or None, default None
        Seed forwarded to the estimators. ``None`` keeps the original
        unseeded behaviour; pass an int to make the labels reproducible.
    """

    # Number of leading entries of each score tuple that are normalised.
    _N_METRICS = 4

    def __init__(self, filename, n_clusters=5, random_state=None):
        self.filename = filename
        self.n_clusters = n_clusters
        self.random_state = random_state

        self.cluster = {}         # algorithm name -> fitted estimator
        self.cluster_center = {}  # algorithm name -> array of cluster centers
        self.labels = {}          # algorithm name -> per-row cluster labels

        self.load_data()

    def load_data(self):
        """Read the CSV and split it into feature columns and the first three columns."""
        self.data = pd.read_csv(self.filename)
        self.feature_data = self.data.iloc[:, [3, 4, 7, 8, 9]].copy()
        self.xyz_data = self.data.iloc[:, [0, 1, 2]].copy()

    def apply_clustering_algorithms(self):
        """Fit every configured estimator and record its labels and centers."""
        # Candidates: K-means, Bisecting K-means, Mini-Batch K-means,
        # Gaussian Mixture, Bayesian Gaussian Mixture (only K-means is enabled).
        self.algorithms = {
            # n_init is given explicitly so the result does not depend on the
            # library's changing default (and the FutureWarning goes away).
            'KMeans': KMeans(
                n_clusters=self.n_clusters,
                n_init=10,
                random_state=self.random_state,
            ),
        }

        print("***apply_clustering_algorithms start***\n")

        X = self.feature_data.to_numpy()

        for name, algorithm in self.algorithms.items():
            print(f"{name} algorithms is running ... ")

            fitted = algorithm.fit(X)
            self.cluster[name] = fitted

            # Estimators without `labels_` are asked to predict with the
            # model that was just fitted, rather than being fitted a second
            # time (which could yield labels inconsistent with `fitted`).
            if hasattr(fitted, 'labels_'):
                self.labels[name] = fitted.labels_
            elif hasattr(fitted, 'predict'):
                self.labels[name] = fitted.predict(X)

            # Mixture models expose their component centers as `means_`;
            # a per-sample probability matrix is not a set of centers.
            if hasattr(fitted, 'cluster_centers_'):
                self.cluster_center[name] = fitted.cluster_centers_
            elif hasattr(fitted, 'means_'):
                self.cluster_center[name] = fitted.means_

        print(self.labels)

        print("done")

        print("\n***apply_clustering_algorithms end***")

    def evaluate_clusters(self):
        """Return ``{algorithm name: score tuple}`` for usable clusterings.

        NOTE: the scores are currently the constant placeholder
        ``(1, 1, 1, 1, 1)``; no real metric is computed here, so the
        selection in ``find_best_algorithm`` is not informative yet.
        The intended layout is (silhouette, davies, calinski, dunn).
        """
        scores = {}

        for name, labels in self.labels.items():
            if len(set(labels)) > 1:  # Evaluate only when there is more than one cluster
                scores[name] = (1, 1, 1, 1, 1)

        return scores

    def find_best_algorithm(self):
        """Run all algorithms, print their scores and return the preferred name.

        The preferred algorithm is the one that wins the most of the three
        votes (silhouette, Davies-Bouldin, Calinski-Harabasz). When every
        vote names a different algorithm the silhouette winner is returned.

        Raises
        ------
        ValueError
            If no algorithm produced more than one cluster, so nothing
            could be scored.
        """
        self.apply_clustering_algorithms()
        self.scores = self.evaluate_clusters()
        print("------------Score------------")
        print(self.scores)

        if not self.scores:
            raise ValueError(
                "No clustering produced more than one cluster; nothing to rank."
            )

        print("------------Score(Norm)------------")

        # Prepare a mutable copy of the scores for normalisation.
        normalized_scores = {algo: list(scores) for algo, scores in self.scores.items()}

        # Min-max normalise each metric across algorithms.
        for i in range(self._N_METRICS):
            values = [scores[i] for scores in self.scores.values()]
            min_val, max_val = min(values), max(values)
            span = max_val - min_val

            for algo in normalized_scores:
                if span == 0:
                    # All algorithms tie on this metric.
                    normalized_scores[algo][i] = 1
                elif i == 1:  # Davies-Bouldin: lower is better, so invert
                    normalized_scores[algo][i] = (max_val - self.scores[algo][i]) / span
                else:  # the other metrics: higher is better
                    normalized_scores[algo][i] = (self.scores[algo][i] - min_val) / span

        print(normalized_scores)

        print("------------Summury------------")
        best_silhouette = max(self.scores, key=lambda x: self.scores[x][0])
        best_davies = min(self.scores, key=lambda x: self.scores[x][1])
        best_calinski = max(self.scores, key=lambda x: self.scores[x][2])
        best_dunn = max(self.scores, key=lambda x: self.scores[x][3])

        print(f"Best by Silhouette : ({best_silhouette}, {self.scores[best_silhouette]}),")
        print(f"Best by Davies-Bouldin : ({best_davies}, {self.scores[best_davies]}),")
        print(f"Best by Calinski-Harabasz : ({best_calinski}, {self.scores[best_calinski]})")
        print(f"Best by Dunn : ({best_dunn}, {self.scores[best_dunn]})")

        print("-----------------------------------")

        # Majority vote over algorithm names. Counting must be done over the
        # list of names (not over the characters of each name); ties resolve
        # to the first name encountered, i.e. the silhouette winner.
        votes = Counter([best_silhouette, best_davies, best_calinski])
        best_algorithm = votes.most_common(1)[0][0]
        return best_algorithm

    def run(self):
        """Select the best algorithm and attach its labels as ``xyz_data['Cluster']``."""
        self.best_algorithm = self.find_best_algorithm()
        self.xyz_data['Cluster'] = self.labels[self.best_algorithm]
        

In [17]:
# Cluster the data set: CSV file path and number of clusters are set here.
clustering = Clustering(
    'data_points/output_data_with_symmetry.csv',
    n_clusters=5,
)
clustering.run()

# Show the first three columns together with the assigned cluster label.
print(
    "\n\n***clustered dataframe***\n\n",
    clustering.xyz_data,
)

***apply_clustering_algorithms start***

KMeans algorithms is running ... 


  super()._check_params_vs_input(X, default_n_init=10)


{'KMeans': array([0, 1, 4, ..., 3, 1, 2], dtype=int32)}
done

***apply_clustering_algorithms end***
------------Score------------
{'KMeans': (1, 1, 1, 1, 1)}
------------Score(Norm)------------
{'KMeans': [1, 1, 1, 1, 1]}
------------Summury------------
Best by Silhouette : (KMeans, (1, 1, 1, 1, 1)),
Best by Davies-Bouldin : (KMeans, (1, 1, 1, 1, 1)),
Best by Calinski-Harabasz : (KMeans, (1, 1, 1, 1, 1))
Best by Dunn : (KMeans, (1, 1, 1, 1, 1))
-----------------------------------


***clustered dataframe***

               x         y         z  Cluster
0      0.739298 -0.067115  1.617565        0
1      0.449500 -0.260057  1.240352        1
2      0.129596 -0.864770  0.932873        4
3      0.518302  0.074387  1.702330        2
4     -0.365941  0.628319  1.328430        4
...         ...       ...       ...      ...
19749 -0.079149  0.593205 -0.073362        4
19750  0.480401 -0.367552  1.572363        2
19751  0.748189 -0.759315 -0.516321        3
19752 -0.164707 -0.282554  1.327774

In [18]:
class Defining_relative_levels:
    """Relabel clusters by the distance of their centers from the origin.

    The cluster whose center has the smallest Euclidean norm becomes
    level 0, the next one level 1, and so on.

    Parameters
    ----------
    clustering : object
        Must expose ``xyz_data`` (a DataFrame with a ``'Cluster'`` column),
        ``cluster_center`` (mapping of algorithm name to a 2-D array with
        one row per cluster) and ``best_algorithm`` (a key of that mapping).

    Notes
    -----
    ``xyz_data`` is the very same DataFrame object held by ``clustering``
    (no copy is taken) and its ``'Cluster'`` column is overwritten in place.
    Calling ``run()`` a second time therefore remaps the already-remapped
    labels.
    """

    def __init__(self, clustering):
        self.xyz_data = clustering.xyz_data
        self.cluster_center = clustering.cluster_center[clustering.best_algorithm]

    def get_sorted_cluster_centers(self):
        """Return ``{original cluster label: relative level}``.

        Levels are assigned in ascending order of each center's Euclidean
        norm (its distance from the origin of the feature space).
        """
        distances = np.linalg.norm(self.cluster_center, axis=1)
        sorted_indices = np.argsort(distances)

        # Plain ints keep the mapping's repr stable across NumPy versions.
        return {int(label): level for level, label in enumerate(sorted_indices)}

    def difine_relative_levels(self, sorted_dict):
        """Replace every value of ``xyz_data['Cluster']`` with ``sorted_dict[value]``.

        Raises ``KeyError`` if a label is missing from ``sorted_dict``.
        """
        # Iterate over the values positionally: looking rows up by label
        # (``series[i]``) only works when the index happens to be 0..n-1.
        current_labels = self.xyz_data['Cluster'].tolist()
        self.xyz_data['Cluster'] = [sorted_dict[label] for label in current_labels]

    def run(self):
        """Compute the label-to-level mapping and apply it to ``xyz_data``."""
        self.sorted_dict = self.get_sorted_cluster_centers()
        self.difine_relative_levels(self.sorted_dict)

In [19]:
# Re-rank the cluster labels by the distance of each cluster center from the origin.
defining_relative_level = Defining_relative_levels(clustering)
defining_relative_level.run()

# Report the label -> level mapping, then the relabelled frame.
print(
    "Relative level dictionary : ",
    defining_relative_level.sorted_dict,
    end="\n\n",
)
print(
    "\n\n***Data with defined relative level***\n\n",
    defining_relative_level.xyz_data,
)

Relative level dictionary :  {2: 0, 0: 1, 1: 2, 3: 3, 4: 4}



***Data with defined relative level***

               x         y         z  Cluster
0      0.739298 -0.067115  1.617565        1
1      0.449500 -0.260057  1.240352        2
2      0.129596 -0.864770  0.932873        4
3      0.518302  0.074387  1.702330        0
4     -0.365941  0.628319  1.328430        4
...         ...       ...       ...      ...
19749 -0.079149  0.593205 -0.073362        4
19750  0.480401 -0.367552  1.572363        0
19751  0.748189 -0.759315 -0.516321        3
19752 -0.164707 -0.282554  1.327774        2
19753 -0.323350 -0.031115  2.012778        0

[19754 rows x 4 columns]


In [20]:
# Default output directory for figures: "<current working directory>/Figure".
folder_path = os.path.join(os.getcwd(), "Figure")

class Visualization_UoCs:
    """3D scatter plots of clustered points.

    Parameters
    ----------
    data : object
        Must expose ``xyz_data``, a DataFrame with the columns ``'x'``,
        ``'y'``, ``'z'`` and ``'Cluster'`` (these are the columns read by
        the plotting methods).
    """

    def __init__(self, data):
        self.df = data.xyz_data

    def _clusters_with_colors(self):
        """Return ``[(cluster label, RGBA color), ...]`` in ascending label order."""
        clusters = sorted(self.df['Cluster'].unique())
        # One color per cluster, evenly spaced over the 'jet' colormap.
        colors = plt.cm.jet(np.linspace(0, 1, len(clusters)))
        return list(zip(clusters, colors))

    def figure3(self, figure_name, Iscolor=False , elev=90, azim=0):
        """Plot all clusters in one 3D figure, save it and show it.

        Parameters
        ----------
        figure_name : str
            File name of the saved figure, created inside the module-level
            ``folder_path`` directory.
        Iscolor : bool, default False
            If True each cluster gets its own color; otherwise every point
            is drawn in 'dimgray'.
        elev, azim : float
            Camera elevation and azimuth in degrees, passed to ``view_init``.
        """
        # Create a new 3D plot
        fig = plt.figure(figsize=(19, 19))  # Set the size of the entire graph
        ax = fig.add_subplot(111, projection='3d')

        # Draw the points of each cluster, one scatter call per cluster.
        for cluster, color in self._clusters_with_colors():
            # Filter only data from that cluster
            cluster_data = self.df[self.df['Cluster'] == cluster]
            if Iscolor:
                # A single RGBA value must be passed as a one-row 2-D
                # sequence; a bare length-4 vector is ambiguous with four
                # scalar values to be colormapped.
                ax.scatter(cluster_data['x'], cluster_data['y'], cluster_data['z'], c=[color], label=f'UoC {cluster+1}', alpha=0.5)
            else:
                ax.scatter(cluster_data['x'], cluster_data['y'], cluster_data['z'], c='dimgray', label=f'UoC {cluster+1}', alpha=0.1)

        # Setting axis labels
        ax.set_xlabel('X', fontsize=25, labelpad=30)
        ax.set_xlim(-0.5, 0.5)
        ax.set_ylabel('Y', fontsize=25, labelpad=30)
        ax.set_ylim(-0.5, 0.5)
        ax.set_zlabel('Z', fontsize=25, labelpad=30)
        ax.set_zlim(0.0, 0.8)

        # Increase tick label font size and tick size
        ax.tick_params(axis='both', which='major', labelsize=16, pad=8, length=8, width=1.1)
        ax.tick_params(axis='both', which='minor', length=4, width=1)

        # Increase the thickness of the axis lines
        ax.xaxis.line.set_linewidth(2)
        ax.yaxis.line.set_linewidth(2)
        ax.zaxis.line.set_linewidth(2)

        # 'elev' is the camera elevation angle, 'azim' the rotation about the z axis.
        ax.view_init(elev=elev, azim=azim)

        # savefig does not create missing directories.
        os.makedirs(folder_path, exist_ok=True)
        plt.savefig(os.path.join(folder_path, figure_name), dpi=600, bbox_inches='tight')
        plt.show()

    def plot_each_cluster(self, folder_path, figure_name_prefix, elev=90, azim=0):
        """Save one transparent 3D scatter PNG per cluster.

        Parameters
        ----------
        folder_path : str
            Directory the images are written to (created if missing).
        figure_name_prefix : str
            Each file is named ``f'{figure_name_prefix}_cluster_{label+1}.png'``.
        elev, azim : float
            Camera elevation and azimuth in degrees, passed to ``view_init``.
        """
        # savefig does not create missing directories.
        os.makedirs(folder_path, exist_ok=True)

        for cluster, color in self._clusters_with_colors():
            fig = plt.figure(figsize=(10, 10))  # a separate figure for each cluster
            ax = fig.add_subplot(111, projection='3d')  # with a single 3D axes

            cluster_data = self.df[self.df['Cluster'] == cluster]
            ax.scatter(cluster_data['x'], cluster_data['y'], cluster_data['z'], c=[color], s=20, alpha=0.5)

            # ax.set_title(f'UoC {cluster+1}', fontsize=30)

            # Setting axis labels
            ax.set_xlabel('X', fontsize=18, labelpad=30)
            ax.set_xlim(-2, 2)
            ax.set_ylabel('Y', fontsize=18, labelpad=30)
            ax.set_ylim(-2, 2)
            ax.set_zlabel('Z', fontsize=18, labelpad=30)
            ax.set_zlim(-2, 2)

            # Hide the tick labels on all three axes.
            ax.set_xticklabels([])
            ax.set_yticklabels([])
            ax.set_zticklabels([])
            # Increase tick label font size and tick size
            ax.tick_params(axis='both', which='major', labelsize=16, pad=8, length=8, width=1.1, bottom=False, top=False, left=False, right=False)
            ax.tick_params(axis='both', which='minor', length=4, width=1, bottom=False, top=False, left=False, right=False)

            # Increase the thickness of the axis lines
            ax.xaxis.line.set_linewidth(2)
            ax.yaxis.line.set_linewidth(2)
            ax.zaxis.line.set_linewidth(2)

            # 'elev' is the camera elevation angle, 'azim' the rotation about the z axis.
            ax.view_init(elev=elev, azim=azim)

            plt.subplots_adjust(left=0, right=1, top=1, bottom=0)

            # Apply tight layout
            plt.tight_layout(h_pad=0, w_pad=0, pad=0)

            # Save this cluster's figure
            plt.savefig(os.path.join(folder_path, f'{figure_name_prefix}_cluster_{cluster+1}.png'), dpi=300, transparent=True)
            plt.close(fig)  # close the figure to keep memory usage down


In [21]:
# Build the plotter on top of the relabelled (relative-level) data.
visualizer = Visualization_UoCs(defining_relative_level)

# Optional overview figures (currently disabled):
# visualizer.figure3('figure3_a.png',False, 50,20)
# visualizer.figure3('figure3_d.png', True, 50,20)
# visualizer.figure3('figure3_a1.png',False, 90,0)
# visualizer.figure3('figure3_a2.png',False, 0,90)
# visualizer.figure3('figure3_d1.png', True, 90,0)
# visualizer.figure3('figure3_d2.png', True, 0,90)

# One image per cluster for each of three camera angles.
# NOTE: the prefix already contains '.png', so the saved files are named
# like '<prefix>_cluster_<k>.png' with '.png' appearing twice.
visualizer.plot_each_cluster(folder_path, 'level별로_1.png', elev=90, azim=0)
visualizer.plot_each_cluster(folder_path, 'level별로_2.png', elev=0, azim=90)
visualizer.plot_each_cluster(folder_path, 'level별로_3.png', elev=50, azim=20)