In [9]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

In [13]:
# [Input] sample data: load the iris dataset
iris = load_iris()
# Feature matrix only; this is the array handed to the clustering routine.
D = iris.data

In [14]:
# Bare expression: the notebook renders the feature matrix as this cell's output.
D

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [15]:
# [Input] Number of clusters (how many mean vectors kmeans_custom maintains)
k = 3

In [16]:
# [Setting the distance] Euclidean Distance Function
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    a, b : array_like
        Coordinates of the two points. Any numeric sequences or arrays that
        broadcast against each other are accepted.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared coordinate differences.
    """
    # Convert to float before subtracting: plain Python lists do not support
    # `-`, and unsigned-integer arrays would wrap around instead of going
    # negative, which silently yields a wrong distance.
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

In [17]:
# [Process of K-Means Clustering]

def kmeans_custom(D, k, seed=123):
    """Cluster the rows of ``D`` into ``k`` groups with the k-means algorithm.

    Parameters
    ----------
    D : array_like, shape (m, n)
        ``m`` samples with ``n`` numeric features each. Must be finite.
    k : int
        Number of clusters; must satisfy ``1 <= k <= m``.
    seed : int, optional
        Seed for choosing the initial mean vectors (default 123). A private
        ``RandomState`` is used, so NumPy's global random state is not touched.

    Returns
    -------
    clusters : list of numpy.ndarray
        ``clusters[i]`` holds the rows of ``D`` assigned to mean vector ``i``
        (shape ``(0, n)`` if that cluster ended up empty).
    mu : numpy.ndarray, shape (k, n)
        Final mean vectors (always floating point).

    Raises
    ------
    ValueError
        If ``D`` is not 2-D, ``k`` is out of range, or ``D`` contains NaN/inf.
    """
    D = np.asarray(D)
    if D.ndim != 2:
        raise ValueError("D must be a 2-D array of shape (m, n)")
    m, n = D.shape
    if not 1 <= k <= m:
        raise ValueError("k must satisfy 1 <= k <= number of samples")
    # NaN never compares equal to itself, so a NaN mean vector would make the
    # convergence test below fail forever; reject such input up front.
    if not np.isfinite(D).all():
        raise ValueError("D must not contain NaN or infinite values")

    # 1. Randomly select k distinct samples as the initial mean vectors.
    rng = np.random.RandomState(seed)
    # Means are stored as floats even for integer data; otherwise assigning
    # an averaged row back into `mu` would silently truncate it.
    mu_dtype = D.dtype if np.issubdtype(D.dtype, np.floating) else float
    mu = D[rng.choice(m, k, replace=False)].astype(mu_dtype)

    # 2. Repeat until no mean vector changes.
    while True:
        # 3-8. Assignment step, done for all samples at once:
        # distances[j, i] is the Euclidean distance from sample j to mean i,
        # and each sample takes the label of its nearest mean vector.
        diffs = D[:, np.newaxis, :] - mu[np.newaxis, :, :]
        distances = np.sqrt(np.sum(diffs ** 2, axis=2))
        labels = np.argmin(distances, axis=1)

        # Snapshot the means so the update can be compared afterwards.
        old_mu = mu.copy()

        # 9-16. Update step: recompute each mean from its current members.
        for i in range(k):
            members = D[labels == i]
            if len(members) > 0:
                # 10-12. Replace the mean vector with the cluster average.
                mu[i] = members.mean(axis=0)
            # 13-14. An empty cluster keeps its previous mean vector.

        # 17. Stop once every mean vector is unchanged.
        if np.array_equal(old_mu, mu):
            break

    # Return the output: member rows per cluster, plus the final means.
    clusters = [D[labels == i] for i in range(k)]
    return clusters, mu

# Cluster the iris feature matrix into k groups; keep both the members and the means.
clusters, mu = kmeans_custom(D, k)

In [18]:
# Check the results: final mean vectors first, then the members of each cluster.
print("클러스터 중심:", mu, sep="\n")

print("클러스터 할당:")
for i in range(k):
    members = clusters[i]
    # Header uses 1-based cluster numbering; members follow on the next line.
    print(f"클러스터 {i + 1}:", members, sep="\n")

클러스터 중심:
[[5.006      3.428      1.462      0.246     ]
 [6.85       3.07368421 5.74210526 2.07105263]
 [5.9016129  2.7483871  4.39354839 1.43387097]]
클러스터 할당:
클러스터 1:
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0