# 비원형 데이터의 군집화 - DBSCAN
---
- 분할적 군집화 방식
- 밀도(데이터의 밀집) 기반 군집화 ==> 미리 군집 수 지정 필요 없음
- 다양한 형태의 데이터에서 군집화 가능
- 이상치 데이터 제거도 가능함
- 군집을 정하는 기준
    * 임의의 점(Point)에서 지정된 거리 이내의 영역 안에 지정된 수 이상의 데이터가 존재하는지 여부
    * 하이퍼파라미터 => 거리(eps), 데이터 수(min_samples)

In [1]:
from sklearn.datasets import load_iris
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import numpy as np

## [1] 데이터 로딩

In [7]:
# Load iris as pandas objects: return_X_y=True yields a (data, target)
# pair, and as_frame=True returns them as a DataFrame and a Series.
X, y = load_iris(as_frame=True, return_X_y=True)

In [8]:
# Summarize the feature frame: column names, non-null counts, and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


In [25]:
# Confirm the container types and dimensions of the features and the target.
type(X), type(y), X.shape, y.shape

(pandas.core.frame.DataFrame, pandas.core.series.Series, (150, 4), (150,))

## [2] 데이터 전처리

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Standardizer used to rescale the features before clustering, since
# DBSCAN's distance threshold is sensitive to feature scale.
scaler = StandardScaler()

In [13]:
# Learn the per-feature scaling statistics from the iris features.
scaler.fit(X)

In [14]:
# Apply the fitted scaling to the features; the result is a NumPy array.
X_scaled = scaler.transform(X)

In [17]:
# Inspect the standardized values of column index 2 (petal length).
X_scaled[:,2]

array([-1.34022653, -1.34022653, -1.39706395, -1.2833891 , -1.34022653,
       -1.16971425, -1.34022653, -1.2833891 , -1.34022653, -1.2833891 ,
       -1.2833891 , -1.22655167, -1.34022653, -1.51073881, -1.45390138,
       -1.2833891 , -1.39706395, -1.34022653, -1.16971425, -1.2833891 ,
       -1.16971425, -1.2833891 , -1.56757623, -1.16971425, -1.05603939,
       -1.22655167, -1.22655167, -1.2833891 , -1.34022653, -1.22655167,
       -1.22655167, -1.2833891 , -1.2833891 , -1.34022653, -1.2833891 ,
       -1.45390138, -1.39706395, -1.34022653, -1.39706395, -1.2833891 ,
       -1.39706395, -1.39706395, -1.39706395, -1.22655167, -1.05603939,
       -1.34022653, -1.22655167, -1.34022653, -1.2833891 , -1.34022653,
        0.53540856,  0.42173371,  0.64908342,  0.13754657,  0.47857113,
        0.42173371,  0.53540856, -0.26031542,  0.47857113,  0.08070915,
       -0.14664056,  0.25122143,  0.13754657,  0.53540856, -0.08980313,
        0.36489628,  0.42173371,  0.194384  ,  0.42173371,  0.08

## [3] 군집화

In [19]:
# Create the clustering estimator.
# eps is the neighborhood radius; min_samples is how many points must fall
# inside that radius for a point to count as a core point.
dbscan = DBSCAN(min_samples=8, eps=0.6)

In [21]:
# Cluster the standardized iris features (unsupervised; no labels used).
# fit() returns the estimator itself, so irisDBS is the same object as dbscan.
irisDBS = dbscan.fit(X_scaled)

In [37]:
# Per-sample cluster assignments and their count; a label of -1 marks a
# sample that DBSCAN treated as noise rather than assigning to a cluster.
irisDBS.labels_, irisDBS.labels_.shape

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,
         0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  1,
         1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1,
        -1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        -1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,
         1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,  1, -1, -1,
        -1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1,  1,  1,  1, -1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
       dtype=int64),
 (150,))

In [23]:
# components_ holds the core samples found during fit; the shape is
# (number of core samples, number of features).
irisDBS.components_.shape

(88, 4)

In [24]:
# fit_predict re-runs the clustering and returns the per-sample labels
# directly; the result matches labels_ from the earlier fit.
irisDBS.fit_predict(X_scaled)

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,  1, -1, -1,
       -1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
      dtype=int64)

In [50]:
# Build a frame from the scaled features so the cluster labels and the
# ground-truth labels can be attached alongside them.
import pandas as pd
irisDF = pd.DataFrame(data=X_scaled)

In [51]:
# Attach the DBSCAN assignment and the true species label as new columns
# (added in this order, in place, on the existing frame).
irisDF['dbscan_cluster'] = irisDBS.labels_
irisDF['target'] = y

In [55]:
# Display the combined frame: scaled features, cluster label, true label.
irisDF

Unnamed: 0,0,1,2,3,dbscan_cluster,target
0,-0.900681,1.019004,-1.340227,-1.315444,0,0
1,-1.143017,-0.131979,-1.340227,-1.315444,0,0
2,-1.385353,0.328414,-1.397064,-1.315444,0,0
3,-1.506521,0.098217,-1.283389,-1.315444,0,0
4,-1.021849,1.249201,-1.340227,-1.315444,0,0
...,...,...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832,1,2
146,0.553333,-1.282963,0.705921,0.922303,1,2
147,0.795669,-0.131979,0.819596,1.053935,1,2
148,0.432165,0.788808,0.933271,1.448832,1,2


In [58]:
# For each true species label, count how many samples landed in each
# DBSCAN cluster (-1 = noise) to compare the clustering with ground truth.
clusters_by_target = irisDF.groupby(['target'])['dbscan_cluster']
iris_result = clusters_by_target.value_counts()
print(iris_result)

target  dbscan_cluster
0        0                45
        -1                 5
1        1                40
        -1                10
2        1                33
        -1                17
Name: dbscan_cluster, dtype: int64
