# 계보적 군집분석

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import StandardScaler

In [None]:
# Build a synthetic dataset (the `d1` DataFrame): 12 countries, each with
# four uniform-random features. The seed is fixed for reproducibility.
np.random.seed(42)
country_names = [f'Country{idx}' for idx in range(1, 13)]
data = {'country': country_names}
# Draw the features in order (feature1 .. feature4) so the random stream is
# consumed one 12-value column at a time.
for feature_no in range(1, 5):
    data[f'feature{feature_no}'] = np.random.rand(12)
d1 = pd.DataFrame(data)

In [None]:
# Data scaling: standardize the feature columns with StandardScaler.
# The 'country' label column is not a feature, so it is dropped first.
feature_frame = d1.drop(columns='country')
scaler = StandardScaler()
scaled_data = scaler.fit(feature_frame).transform(feature_frame)

In [None]:
# Hierarchical clustering
# Condensed pairwise distances between the scaled rows
# (Euclidean is pdist's default metric; spelled out here for clarity).
distances = pdist(scaled_data, metric='euclidean')
# Build the merge tree from those distances using Ward linkage.
linkage_matrix = linkage(y=distances, method='ward')

In [None]:
# Visualization: dendrogram of the merge tree, leaves labelled by country.
plt.figure(figsize=(10, 7))
country_labels = d1['country'].values
dendrogram(linkage_matrix, labels=country_labels)
dendro_ax = plt.gca()
dendro_ax.set_title('Dendrogram')
dendro_ax.set_xlabel('Country')
dendro_ax.set_ylabel('Distance')
plt.show()

# Cluster assignment: cut the tree so that at most `num_clusters` flat
# clusters are formed.
num_clusters = 5
clusters = fcluster(linkage_matrix, t=num_clusters, criterion='maxclust')

# Attach the cluster label of each row to the DataFrame and display it.
d1['cluster'] = clusters
print("\nClustered DataFrame:", d1, sep="\n")

In [None]:
# Per-cluster feature means.
# 'country' holds strings, so it is dropped before averaging: pandas >= 2.0
# raises TypeError when mean() is applied to a non-numeric column.
cluster_summary = d1.drop(columns='country').groupby('cluster').mean()
print("\nCluster Summary:")
print(cluster_summary)

# Grouped bar chart of the means: one group per feature, one bar per cluster.
# After groupby, 'cluster' is the index (not a column), so every column of
# cluster_summary is a feature and none of them may be skipped.
colors = ['white', 'yellow', 'green', 'cyan', 'black']
feature_names = cluster_summary.columns
# 'maxclust' only caps the number of clusters, so iterate over the clusters
# that actually exist instead of over the palette length.
n_clusters = len(cluster_summary)
plt.figure(figsize=(10, 7))
# Each feature group occupies 0.8 of its unit slot so adjacent groups
# do not overlap regardless of how many clusters there are.
bar_width = 0.8 / max(n_clusters, 1)
bar_positions = np.arange(len(feature_names))

for i, (cluster_id, feature_means) in enumerate(cluster_summary.iterrows()):
    plt.bar(bar_positions + i * bar_width, feature_means.to_numpy(),
            width=bar_width,
            color=colors[i % len(colors)],  # wrap around if clusters > colors
            edgecolor='black',  # keeps the white bars visible
            label=f'Cluster {cluster_id}')

# Center each tick under its group of bars.
plt.xticks(bar_positions + (n_clusters - 1) * bar_width / 2, feature_names)
plt.xlabel('Features')
plt.ylabel('Mean Value')
plt.title('Cluster Means by Feature')
plt.legend(title='Cluster')
plt.show()