# 06차시 비계층적 군집분석(K-means Clustering)

## 01 k-means 소개

### k-means 군집분석의 특징

- 임의의 k개의 점을 기반으로 가까운 거리의 데이터를 묶는 것과 더불어 평균을 활용하는 군집분석 기법
- 군집 개수(k)를 확정하기 위해 여러 번의 시행착오 필요
- 결과 고정을 위해 seed 설정 필요

## 02 주요 함수 및 메서드 소개

### sklearn - MinMaxScaler()

- MinMax 정규화를 실시하는 sklearn의 클래스
- fit() 메서드로 변환 규칙(각 열의 최솟값·최댓값)을 학습하고 transform() 메서드로 변환을 실시

### sklearn - StandardScaler()

- 표준화를 실시하는 sklearn의 클래스
- fit() 메서드로 변환 규칙(각 열의 평균·표준편차)을 학습하고 transform() 메서드로 변환을 실시

In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the iris data from CSV; the trailing expression previews the first rows.
df = pd.read_csv("실습파일/iris.csv")
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Learn the per-column min/max from the first five rows only, leaving out the
# last column (Species). fit() returns the fitted scaler itself.
nor_minmax = MinMaxScaler().fit(df.head().iloc[:, :-1])
nor_minmax

MinMaxScaler()

In [5]:
# Rescale the same five rows with the fitted scaler; the result is a NumPy array.
nor_minmax.transform(df.head().iloc[:, :-1])

array([[1.        , 0.83333333, 0.5       , 0.        ],
       [0.6       , 0.        , 0.5       , 0.        ],
       [0.2       , 0.33333333, 0.        , 0.        ],
       [0.        , 0.16666667, 1.        , 0.        ],
       [0.8       , 1.        , 0.5       , 0.        ]])

In [6]:
# Put the scaled values back into a DataFrame so the feature names are kept.
head_rows = df.head()
df_minmax = pd.DataFrame(
    nor_minmax.transform(head_rows.iloc[:, :-1]),
    columns=head_rows.columns[:4],
)
df_minmax

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,1.0,0.833333,0.5,0.0
1,0.6,0.0,0.5,0.0
2,0.2,0.333333,0.0,0.0
3,0.0,0.166667,1.0,0.0
4,0.8,1.0,0.5,0.0


### sklearn - KMeans()

- k-means 군집분석을 실시하는 sklearn의 클래스
- n_clusters, max_iter, random_state에 각각 군집 개수, 최대 반복 횟수, 결과 고정용 시드 설정 가능
- KMeans() 객체의 fit() 메서드에 데이터를 할당하여 학습 진행
- 결과 객체의 cluster_centers_ 와 labels_ 어트리뷰트로 군집 중심과 각 행의 군집 번호 확인 가능

In [7]:
# Cluster the iris rows on every column except the last one (Species).
# n_init is pinned to 10: scikit-learn 1.4 changed its default from 10 to
# 'auto', so leaving it implicit changes the number of restarts and may change
# the fitted clusters compared with the outputs recorded in this notebook.
model = KMeans(n_clusters=3, n_init=10, random_state=123).fit(df.iloc[:, :-1])
model

KMeans(n_clusters=3, random_state=123)

In [8]:
# Cluster index assigned to each input row, in row order.
model.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2])

In [9]:
# Coordinates of the fitted cluster centres, one row per cluster.
model.cluster_centers_

array([[6.85      , 3.07368421, 5.74210526, 2.07105263],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [5.9016129 , 2.7483871 , 4.39354839, 1.43387097]])

In [10]:
# Attach each row's cluster label, then average the numeric features per cluster.
# numeric_only=True keeps the string Species column out of the mean: pandas 2.x
# raises TypeError for it instead of silently dropping the column.
df["cluster"] = model.labels_
df.groupby("cluster").mean(numeric_only=True)

Unnamed: 0_level_0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6.85,3.073684,5.742105,2.071053
1,5.006,3.428,1.462,0.246
2,5.901613,2.748387,4.393548,1.433871


## Q1 BMI가 0이 아닌 사람 데이터를 대상으로 k-means 군집 분석을 실시하는 경우 소속 데이터 개수가 가장 많은 군집의 Insulin 평균은 얼마인가?
1) 군집은 4개로 설정하고 Seed는 123으로 한다.

In [21]:
# Load the diabetes data and keep only the rows whose BMI is not 0.
# .copy() makes Q1 an independent frame, so adding a column to it later does
# not trigger pandas' SettingWithCopyWarning.
Q1 = pd.read_csv("실습파일/diabetes.csv")
Q1 = Q1.loc[Q1["BMI"] != 0].copy()
Q1.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [22]:
# Fit 4 clusters on every column of Q1 and store each row's cluster label.
# n_init is pinned to 10: scikit-learn 1.4 changed its default from 10 to
# 'auto', which may change the fitted clusters and therefore the answer.
Q1_model = KMeans(n_clusters=4, n_init=10, random_state=123).fit(Q1)
Q1["cluster"] = Q1_model.labels_
Q1.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,cluster
0,6,148,72,35,0,33.6,0.627,50,1,0
1,1,85,66,29,0,26.6,0.351,31,0,0
2,8,183,64,0,0,23.3,0.672,32,1,0
3,1,89,66,23,94,28.1,0.167,21,0,2
4,0,137,40,35,168,43.1,2.288,33,1,3


In [23]:
# Number of rows in each cluster; the first entry is the largest cluster.
Q1["cluster"].value_counts()

0    407
2    212
3    114
1     24
Name: cluster, dtype: int64

In [24]:
# Mean Insulin per cluster; the answer is the value for the largest cluster.
Q1.groupby("cluster")["Insulin"].mean()

cluster
0      4.103194
1    509.166667
2    102.674528
3    224.035088
Name: Insulin, dtype: float64

## Q2 BMI가 0이 아닌 사람 데이터를 대상으로 k-means 군집 분석을 실시하는 경우 소속 데이터 개수가 가장 많은 군집의 나이 평균은 얼마인가?
1) 군집은 4개로 설정하고 Seed는 123으로 한다.  
2) 군집 분석 이전에 Min-Max 정규화를 실시한다.  
3) 나이 계산은 정규화 실시 전의 데이터를 사용한다.

In [25]:
# Load the diabetes data and keep only the rows whose BMI is not 0.
# .copy() makes Q2 an independent frame, so adding a column to it later does
# not trigger pandas' SettingWithCopyWarning.
Q2 = pd.read_csv("실습파일/diabetes.csv")
Q2 = Q2.loc[Q2["BMI"] != 0].copy()
Q2.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [28]:
# Min-max scale every column of Q2, then wrap the scaled array in a DataFrame
# that carries the original column names.
Q2_nor_minmax = MinMaxScaler()
Q2_nor_minmax.fit(Q2)
Q2_nor = pd.DataFrame(Q2_nor_minmax.transform(Q2), columns=Q2.columns)
Q2_nor.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.0,0.314928,0.234415,0.483333,1.0
1,0.058824,0.427136,0.540984,0.292929,0.0,0.171779,0.116567,0.166667,0.0
2,0.470588,0.919598,0.52459,0.0,0.0,0.104294,0.253629,0.183333,1.0
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.202454,0.038002,0.0,0.0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.509202,0.943638,0.2,1.0


In [29]:
# Fit 4 clusters on the scaled data, but store the labels on the unscaled Q2 so
# that later averages use the original values.
# n_init is pinned to 10: scikit-learn 1.4 changed its default from 10 to
# 'auto', which may change the fitted clusters and therefore the answer.
Q2_model = KMeans(n_clusters=4, n_init=10, random_state=123).fit(Q2_nor)
Q2["cluster"] = Q2_model.labels_
Q2.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,cluster
0,6,148,72,35,0,33.6,0.627,50,1,2
1,1,85,66,29,0,26.6,0.351,31,0,0
2,8,183,64,0,0,23.3,0.672,32,1,2
3,1,89,66,23,94,28.1,0.167,21,0,0
4,0,137,40,35,168,43.1,2.288,33,1,1


In [30]:
# Number of rows in each cluster; the first entry is the largest cluster.
Q2["cluster"].value_counts()

0    361
1    135
2    131
3    130
Name: cluster, dtype: int64

In [31]:
# Mean Age per cluster, computed on the unscaled values held in Q2.
Q2.groupby("cluster")["Age"].mean()

cluster
0    25.667590
1    29.977778
2    44.297710
3    46.753846
Name: Age, dtype: float64

## Q3 BMI가 0이 아닌 사람 데이터를 대상으로 k-means 군집 분석을 실시했을 때, 군집 중심점 간 유클리드 거리가 가장 가까운 두 군집 사이의 거리는 얼마인가?
1) 군집은 3개로 설정하고 Seed는 123으로 한다.

In [32]:
# Read the diabetes data, then drop the rows whose BMI is recorded as 0.
Q3 = pd.read_csv("실습파일/diabetes.csv")
Q3 = Q3[Q3["BMI"] != 0]
Q3.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [34]:
# Fit 3 clusters on every column of Q3, then label the centre coordinates with
# the original column names (one row per cluster).
# n_init is pinned to 10: scikit-learn 1.4 changed its default from 10 to
# 'auto', which may change the fitted centres and therefore the distances.
Q3_model = KMeans(n_clusters=3, n_init=10, random_state=123).fit(Q3)
Q3_centers = pd.DataFrame(Q3_model.cluster_centers_, columns=Q3.columns)
Q3_centers

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,4.026316,158.447368,72.0,32.263158,441.289474,35.107895,0.569211,34.763158,0.578947
1,3.975258,114.237113,68.647423,15.259794,14.696907,31.440619,0.434579,33.808247,0.301031
2,3.542735,129.376068,71.478632,30.337607,159.401709,34.134615,0.535188,31.948718,0.418803


In [35]:
# Flip the table so that each cluster centre becomes one column.
Q3_centers = Q3_centers.T
Q3_centers

Unnamed: 0,0,1,2
Pregnancies,4.026316,3.975258,3.542735
Glucose,158.447368,114.237113,129.376068
BloodPressure,72.0,68.647423,71.478632
SkinThickness,32.263158,15.259794,30.337607
Insulin,441.289474,14.696907,159.401709
BMI,35.107895,31.440619,34.134615
DiabetesPedigreeFunction,0.569211,0.434579,0.535188
Age,34.763158,33.808247,31.948718
Outcome,0.578947,0.301031,0.418803


In [37]:
# Print the Euclidean distance between each pair of cluster centres
# (the columns of Q3_centers): 0-1, then 1-2, then 0-2.
for left, right in [(0, 1), (1, 2), (0, 2)]:
    gap = Q3_centers.iloc[:, left] - Q3_centers.iloc[:, right]
    print(sum(gap ** 2) ** 0.5)

429.24419310888464
146.33847909815492
283.405999774738
