# 예제 7-8 DBSCAN 군집 분석 

### 기본 라이브러리 불러오기

In [1]:
import pandas as pd
import folium

## Step 1 데이터 준비/기본 설정 

### 서울시내 중학교 진학률 데이터셋 

In [2]:
# Relative path of the Excel workbook that is loaded into `df` below.
# NOTE(review): 'shcool' looks like a typo for 'school', but this literal has
# to match the file name on disk — confirm before renaming either one.
file_path = './2016_middle_shcool_graduates_report.xlsx'

In [10]:
# Read the workbook (row 0 is the header) and discard the leftover
# 'Unnamed: 0' column in one chained expression.
df = pd.read_excel(file_path, header=0).drop(columns=['Unnamed: 0'])

### 열 이름 배열 출력

In [11]:
# Print the column labels as a NumPy array to check what was loaded.
print(df.columns.values)

['지역' '학교명' '코드' '유형' '주야' '남학생수' '여학생수' '일반고' '특성화고' '과학고' '외고_국제고'
 '예고_체고' '마이스터고' '자사고' '자공고' '기타진학' '취업' '미상' '위도' '경도']


## Step 2 데이터 탐색

### 데이터 살펴보기 

In [12]:
# Preview the first five rows.
df.head()

Unnamed: 0,지역,학교명,코드,유형,주야,남학생수,여학생수,일반고,특성화고,과학고,외고_국제고,예고_체고,마이스터고,자사고,자공고,기타진학,취업,미상,위도,경도
0,성북구,서울대학교사범대학부설중학교,3,국립,주간,277,0,0.585,0.148,0.018,0.007,0.0,0.011,0.227,0.0,0.004,0,0.0,37.594942,127.038909
1,종로구,서울대학교사범대학부설여자중학교,3,국립,주간,0,256,0.68,0.199,0.0,0.035,0.008,0.0,0.043,0.004,0.031,0,0.0,37.577473,127.003857
2,강남구,개원중학교,3,공립,주간,170,152,0.817,0.047,0.009,0.012,0.003,0.006,0.09,0.003,0.009,0,0.003,37.491637,127.071744
3,강남구,개포중학교,3,공립,주간,83,72,0.755,0.097,0.013,0.013,0.019,0.019,0.065,0.0,0.019,0,0.0,37.480439,127.062201
4,서초구,경원중학교,3,공립,주간,199,212,0.669,0.017,0.007,0.01,0.005,0.0,0.282,0.0,0.01,0,0.0,37.51075,127.0089


### 데이터 자료형 확인

In [13]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415 entries, 0 to 414
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   지역      415 non-null    object 
 1   학교명     415 non-null    object 
 2   코드      415 non-null    int64  
 3   유형      415 non-null    object 
 4   주야      415 non-null    object 
 5   남학생수    415 non-null    int64  
 6   여학생수    415 non-null    int64  
 7   일반고     415 non-null    float64
 8   특성화고    415 non-null    float64
 9   과학고     415 non-null    float64
 10  외고_국제고  415 non-null    float64
 11  예고_체고   415 non-null    float64
 12  마이스터고   415 non-null    float64
 13  자사고     415 non-null    float64
 14  자공고     415 non-null    float64
 15  기타진학    415 non-null    float64
 16  취업      415 non-null    int64  
 17  미상      415 non-null    float64
 18  위도      415 non-null    float64
 19  경도      415 non-null    float64
dtypes: float64(12), int64(4), object(4)
memory usage: 58.4+ KB


### 지도에 위치 표시

In [14]:
# Base map centred on Seoul (latitude 37.55, longitude 126.98).
# The 'Stamen Terrain' preset used originally is no longer shipped with
# current folium releases (the name is then treated as a custom tile URL and
# rejected for lacking an attribution), so use the built-in OpenStreetMap
# tiles, which need no attribution argument or API key.
mschool_map = folium.Map(location=[37.55, 126.98], tiles='OpenStreetMap',
                        zoom_start=12)

### 중학교 위치 정보를 CircleMarker로 표시

In [15]:
# Drop one circle marker per school onto the base map; clicking a marker
# shows the school name in a popup.
marker_style = dict(radius=5,
                    color='brown',
                    fill=True,
                    fill_color='coral',
                    fill_opacity=0.7)
for school, latitude, longitude in zip(df['학교명'], df['위도'], df['경도']):
    marker = folium.CircleMarker([latitude, longitude],
                                 popup=school,
                                 **marker_style)
    marker.add_to(mschool_map)

### 지도를 html 파일로 저장하기 

In [16]:
# Write the map to an HTML file in the current working directory.
mschool_map.save('./seoul_mschool_location.html')

## Step 3 데이터 전처리 

### 레이블 인코딩(범주형 데이터를 정수 레이블로 변환)

In [17]:
from sklearn import preprocessing

In [18]:
# LabelEncoder maps each distinct category to an integer code.
label_encoder = preprocessing.LabelEncoder()
# NOTE(review): onehot_encoder is created here but never used in the code
# visible in this notebook.
onehot_encoder = preprocessing.OneHotEncoder()

In [20]:
# Encode the four categorical columns as integer labels. Despite the
# `onehot_` prefix, these are LabelEncoder outputs (one integer per row),
# not one-hot vectors. The same encoder object is re-fitted for every column.
onehot_location = label_encoder.fit_transform(df['지역'])
onehot_code = label_encoder.fit_transform(df['코드'])
onehot_type = label_encoder.fit_transform(df['유형'])
onehot_day = label_encoder.fit_transform(df['주야'])

In [21]:
# Append the encoded values as four new columns, in this order:
# location, code, type, day.
df = df.assign(location=onehot_location,
               code=onehot_code,
               type=onehot_type,
               day=onehot_day)

In [22]:
# Preview the frame with the newly added encoded columns.
df.head()

Unnamed: 0,지역,학교명,코드,유형,주야,남학생수,여학생수,일반고,특성화고,과학고,...,자공고,기타진학,취업,미상,위도,경도,location,code,type,day
0,성북구,서울대학교사범대학부설중학교,3,국립,주간,277,0,0.585,0.148,0.018,...,0.0,0.004,0,0.0,37.594942,127.038909,16,0,1,0
1,종로구,서울대학교사범대학부설여자중학교,3,국립,주간,0,256,0.68,0.199,0.0,...,0.004,0.031,0,0.0,37.577473,127.003857,22,0,1,0
2,강남구,개원중학교,3,공립,주간,170,152,0.817,0.047,0.009,...,0.003,0.009,0,0.003,37.491637,127.071744,0,0,0,0
3,강남구,개포중학교,3,공립,주간,83,72,0.755,0.097,0.013,...,0.0,0.019,0,0.0,37.480439,127.062201,0,0,0,0
4,서초구,경원중학교,3,공립,주간,199,212,0.669,0.017,0.007,...,0.0,0.01,0,0.0,37.51075,127.0089,14,0,0,0


## Step 4 DBSCAN 군집 모형 - sklearn 사용 

### sklearn 라이브러리에서 cluster 군집 모형 가져오기

In [23]:
from sklearn import cluster

### 분석에 사용할 속성 선택(과학고, 외고국제고, 자사고 진학률)

In [24]:
# Positional indices of the feature columns used for clustering:
# 9 = 과학고, 10 = 외고_국제고, 13 = 자사고 (see the df.info() listing).
columns_list = [9, 10, 13]
# Select those columns by position.
X = df.iloc[:, columns_list]
# Display the selected features.
X

Unnamed: 0,과학고,외고_국제고,자사고
0,0.018,0.007,0.227
1,0.000,0.035,0.043
2,0.009,0.012,0.090
3,0.013,0.013,0.065
4,0.007,0.010,0.282
...,...,...,...
410,0.000,0.000,0.000
411,0.000,0.000,0.000
412,0.000,0.000,0.000
413,0.000,0.000,0.000


### 설명 변수 데이터 정규화

In [25]:
# Standardise every feature (zero mean, unit variance); X becomes a NumPy array.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)

### DBSCAN 모형 객체 생성

In [27]:
# DBSCAN model: eps is the neighbourhood radius (in standardised feature
# units, because X was scaled) and min_samples is the minimum number of points.
dbm = cluster.DBSCAN(eps=0.2, min_samples=5)

반지름 R = 0.2, 최소 포인트 개수 M = 5

### 모형 학습

In [28]:
# Fit the model on the standardised features.
dbm.fit(X)

DBSCAN(eps=0.2)

### 예측(군집)

In [31]:
# labels_ holds one cluster label per row; -1 marks noise points.
cluster_label = dbm.labels_
# Display the label array.
cluster_label

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1,
       -1, -1, -1,  2, -1,  0, -1, -1, -1, -1, -1,  0, -1, -1, -1, -1, -1,
        0,  3, -1, -1, -1, -1, -1, -1, -1,  0, -1, -1,  1,  0, -1, -1, -1,
        0, -1, -1, -1, -1,  0, -1,  0,  0, -1, -1,  0, -1, -1, -1,  0,  0,
       -1, -1,  0, -1, -1, -1,  0, -1, -1, -1,  0,  2,  0,  0,  0,  0,  0,
       -1, -1, -1,  0, -1,  0, -1, -1,  0, -1,  0, -1,  0,  0, -1, -1, -1,
       -1,  1,  0, -1,  0,  0, -1, -1, -1,  0, -1, -1, -1, -1, -1,  0,  1,
       -1, -1,  0,  2,  0, -1, -1,  1, -1, -1, -1,  0,  0,  0, -1, -1,  0,
       -1, -1, -1,  0,  0, -1, -1, -1, -1,  0, -1, -1, -1,  0, -1, -1, -1,
        0, -1,  0,  0, -1, -1, -1, -1, -1,  0, -1,  0,  0, -1, -1, -1, -1,
       -1,  0, -1, -1, -1,  1,  0,  3,  1, -1,  0,  0, -1,  0, -1, -1,  0,
        0,  2, -1, -1,  3,  0,  0, -1, -1, -1, -1,  0, -1,  0,  0, -1,  0,
        0,  0, -1, -1,  0, -1, -1, -1, -1, -1,  2,  0, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

-1, 0, 1, 2, 3의 5개 값이 확인된다. -1은 Noise를 나타내므로 모형이 구분한 클러스터는 Noise를 제외한 0, 1, 2, 3으로 모두 4개가 된다.

### 예측 결과를 데이터프레임에 추가

In [32]:
# Store the cluster labels alongside the original data.
df['Cluster'] = cluster_label
# Preview the result.
df.head()

Unnamed: 0,지역,학교명,코드,유형,주야,남학생수,여학생수,일반고,특성화고,과학고,...,기타진학,취업,미상,위도,경도,location,code,type,day,Cluster
0,성북구,서울대학교사범대학부설중학교,3,국립,주간,277,0,0.585,0.148,0.018,...,0.004,0,0.0,37.594942,127.038909,16,0,1,0,-1
1,종로구,서울대학교사범대학부설여자중학교,3,국립,주간,0,256,0.68,0.199,0.0,...,0.031,0,0.0,37.577473,127.003857,22,0,1,0,-1
2,강남구,개원중학교,3,공립,주간,170,152,0.817,0.047,0.009,...,0.009,0,0.003,37.491637,127.071744,0,0,0,0,-1
3,강남구,개포중학교,3,공립,주간,83,72,0.755,0.097,0.013,...,0.019,0,0.0,37.480439,127.062201,0,0,0,0,-1
4,서초구,경원중학교,3,공립,주간,199,212,0.669,0.017,0.007,...,0.01,0,0.0,37.51075,127.0089,14,0,0,0,-1


### 클러스터 값으로 그룹화하고 그룹별로 내용 출력

In [33]:
# Columns shown per cluster: positions 0, 1, 3 (지역, 학교명, 유형) followed by
# the clustering features.
grouped_cols = [0, 1, 3, *columns_list]

In [34]:
# Group the rows by their assigned cluster label.
grouped = df.groupby('Cluster')

In [35]:
# For each cluster, print its label, its size, and the first five member rows.
for cluster_id, members in grouped:
    preview = members.iloc[:, grouped_cols].head()
    print('* key: ', cluster_id)
    print('* number: ', len(members))
    print(preview)
    print('\n')

* key:  -1
* number:  255
    지역               학교명  유형    과학고  외고_국제고    자사고
0  성북구    서울대학교사범대학부설중학교  국립  0.018   0.007  0.227
1  종로구  서울대학교사범대학부설여자중학교  국립  0.000   0.035  0.043
2  강남구             개원중학교  공립  0.009   0.012  0.090
3  강남구             개포중학교  공립  0.013   0.013  0.065
4  서초구             경원중학교  공립  0.007   0.010  0.282


* key:  0
* number:  102
     지역      학교명  유형  과학고  외고_국제고    자사고
13  서초구  동덕여자중학교  사립  0.0   0.022  0.038
22  강남구    수서중학교  공립  0.0   0.019  0.044
28  서초구    언남중학교  공립  0.0   0.015  0.050
34  강남구    은성중학교  사립  0.0   0.016  0.065
43  송파구    거원중학교  공립  0.0   0.021  0.054


* key:  1
* number:  45
       지역      학교명  유형  과학고  외고_국제고    자사고
46    강동구    동신중학교  사립  0.0     0.0  0.044
103   양천구    신원중학교  공립  0.0     0.0  0.006
118   구로구    개봉중학교  공립  0.0     0.0  0.012
126  영등포구    대림중학교  공립  0.0     0.0  0.050
175   중랑구  혜원여자중학교  사립  0.0     0.0  0.004


* key:  2
* number:  8
      지역    학교명  유형    과학고  외고_국제고    자사고
20   서초구  서초중학교  공립  0.003   0.013  0.085
79

### 그래프로 표현 - 시각화

In [36]:
# Marker colour per DBSCAN label (-1 is noise). Every value must be a colour
# name the browser recognises; the original 'brick' is not a valid CSS colour
# name, so label 7 now uses 'firebrick'.
colors = {-1:'gray', 0:'coral', 1:'blue', 2:'green', 3:'red', 4:'purple',
         5:'orange', 6:'brown', 7:'firebrick', 8:'yellow', 9:'magenta', 10:'cyan'}

In [67]:
# Fresh base map for the first clustering result, centred on Seoul.
# The 'Stamen Terrain' preset used originally is no longer shipped with
# current folium releases (the name is then treated as a custom tile URL and
# rejected for lacking an attribution), so use the built-in OpenStreetMap tiles.
cluster_map = folium.Map(location=[37.55, 126.98], tiles='OpenStreetMap',
                        zoom_start=12)

In [68]:
# Draw every school coloured by its 'Cluster' label.
# The number of labels DBSCAN produces depends on the data and parameters, so
# a label missing from the palette falls back to black instead of raising
# KeyError part-way through the loop.
for name, lat, lng, clus in zip(df.학교명, df.위도, df.경도, df.Cluster):
    marker_color = colors.get(clus, 'black')
    folium.CircleMarker([lat, lng],
                       radius=5,
                       color=marker_color,
                       fill=True,
                       fill_color=marker_color,
                       fill_opacity=0.7,
                       popup=name).add_to(cluster_map)

### 지도를 html 파일로 저장하기 

In [69]:
# Write the cluster map to an HTML file in the current working directory.
cluster_map.save('./seoul_mschool_cluster.html')

### X2 데이터셋에 대하여 위 과정 반복(과학고, 외고_국제고, 자사고 진학률 + 유형)

앞에서 사용한 속성에 학교 설립 유형(국립, 공립, 사립)을 추가하여 분석한다.
이때 유형은 위에서 LabelEncoder로 정수 레이블로 변환해 둔 type 열의 값을 사용한다.

In [70]:
# Same three features as before plus position 22, the label-encoded 'type'
# column that was appended to df during preprocessing.
columns_list2 = [9, 10, 13, 22]
# Select those columns by position.
X2 = df.iloc[:, columns_list2]
# Display the selected features.
X2

Unnamed: 0,과학고,외고_국제고,자사고,type
0,0.018,0.007,0.227,1
1,0.000,0.035,0.043,1
2,0.009,0.012,0.090,0
3,0.013,0.013,0.065,0
4,0.007,0.010,0.282,0
...,...,...,...,...
410,0.000,0.000,0.000,1
411,0.000,0.000,0.000,1
412,0.000,0.000,0.000,2
413,0.000,0.000,0.000,2


In [71]:
# Standardise every feature (zero mean, unit variance); X2 becomes a NumPy array.
scaler2 = preprocessing.StandardScaler()
X2 = scaler2.fit_transform(X2)

In [72]:
# Second DBSCAN model with the same eps / min_samples, fitted on the
# four-feature set X2.
dbm2 = cluster.DBSCAN(eps=0.2, min_samples=5)
dbm2.fit(X2)

DBSCAN(eps=0.2)

In [73]:
# Store the second model's labels (-1 = noise) in their own column.
df['Cluster2'] = dbm2.labels_

In [74]:
# Columns shown per cluster: positions 0, 1, 3 followed by the X2 features.
grouped2_cols = [0, 1, 3, *columns_list2]
# Group the rows by the second model's cluster label.
grouped2 = df.groupby(by='Cluster2')

In [75]:
# (Stray scratch cell removed: it contained only the undefined name `dddd`,
# which raised NameError and stopped a top-to-bottom run of the notebook.)

In [76]:
# For each Cluster2 group, print its label, its size, and the first five rows.
for cluster_id, members in grouped2:
    preview = members.iloc[:, grouped2_cols].head()
    print('* key: ', cluster_id)
    print('* number: ', len(members))
    print(preview)
    print('\n')

* key:  -1
* number:  281
    지역               학교명  유형    과학고  외고_국제고    자사고  type
0  성북구    서울대학교사범대학부설중학교  국립  0.018   0.007  0.227     1
1  종로구  서울대학교사범대학부설여자중학교  국립  0.000   0.035  0.043     1
2  강남구             개원중학교  공립  0.009   0.012  0.090     0
3  강남구             개포중학교  공립  0.013   0.013  0.065     0
4  서초구             경원중학교  공립  0.007   0.010  0.282     0


* key:  0
* number:  8
      지역    학교명  유형  과학고  외고_국제고    자사고  type
22   강남구  수서중학교  공립  0.0   0.019  0.044     0
43   송파구  거원중학교  공립  0.0   0.021  0.054     0
51   송파구  방이중학교  공립  0.0   0.021  0.068     0
93   강서구  방원중학교  공립  0.0   0.019  0.057     0
164  중랑구  원묵중학교  공립  0.0   0.020  0.062     0


* key:  1
* number:  59
     지역    학교명  유형  과학고  외고_국제고    자사고  type
28  서초구  언남중학교  공립  0.0   0.015  0.050     0
47  강동구  둔촌중학교  공립  0.0   0.010  0.026     0
58  강동구  성내중학교  공립  0.0   0.013  0.026     0
62  강동구  신명중학교  공립  0.0   0.009  0.031     0
67  송파구  오금중학교  공립  0.0   0.015  0.072     0


* key:  2
* number:  6
       지역 

In [77]:
# Fresh base map for the second clustering result, centred on Seoul.
# The 'Stamen Terrain' preset used originally is no longer shipped with
# current folium releases (the name is then treated as a custom tile URL and
# rejected for lacking an attribution), so use the built-in OpenStreetMap tiles.
cluster2_map = folium.Map(location=[37.55, 126.98], tiles='OpenStreetMap',
                        zoom_start=12)

In [78]:
# Draw every school coloured by its 'Cluster2' label.
# The number of labels DBSCAN produces depends on the data and parameters, so
# a label missing from the palette falls back to black instead of raising
# KeyError part-way through the loop.
for name, lat, lng, clus in zip(df.학교명, df.위도, df.경도, df.Cluster2):
    marker_color = colors.get(clus, 'black')
    folium.CircleMarker([lat, lng],
                       radius=5,
                       color=marker_color,
                       fill=True,
                       fill_color=marker_color,
                       fill_opacity=0.7,
                       popup=name).add_to(cluster2_map)

### 지도를 html 파일로 저장하기

In [79]:
# Write the second cluster map to an HTML file in the current working directory.
cluster2_map.save('./seoul_mschool_cluster2.html')

### X3 데이터셋에 대하여 위 과정 반복(과학고, 외고_국제고)

이번에는 DBSCAN 모형에 입력하는 속성을 2개로 줄여서 예측한다. 과학고와 외고 진학률 데이터만을 사용한다.

In [80]:
# Reduced feature set by column position: 9 = 과학고, 10 = 외고_국제고.
columns_list3 = [9, 10]

In [81]:
# Select the two feature columns by position.
X3 = df.iloc[:, columns_list3]
# Display the selected features.
X3

Unnamed: 0,과학고,외고_국제고
0,0.018,0.007
1,0.000,0.035
2,0.009,0.012
3,0.013,0.013
4,0.007,0.010
...,...,...
410,0.000,0.000
411,0.000,0.000
412,0.000,0.000
413,0.000,0.000


In [82]:
# Standardise every feature (zero mean, unit variance); X3 becomes a NumPy array.
scaler3 = preprocessing.StandardScaler()
X3 = scaler3.fit_transform(X3)

In [83]:
# Third DBSCAN model with the same eps / min_samples, fitted on the
# two-feature set X3.
dbm3 = cluster.DBSCAN(eps=0.2, min_samples=5)
dbm3.fit(X3)

DBSCAN(eps=0.2)

In [84]:
# Store the third model's labels (-1 = noise) in their own column.
df['Cluster3'] = dbm3.labels_

In [85]:
# Columns shown per cluster: positions 0, 1, 3 followed by the X3 features.
grouped3_cols = [0, 1, 3, *columns_list3]
# Group the rows by the third model's cluster label.
grouped3 = df.groupby(by='Cluster3')

In [86]:
# For each Cluster3 group, print its label, its size, and the first five rows.
for cluster_id, members in grouped3:
    preview = members.iloc[:, grouped3_cols].head()
    print('* key: ', cluster_id)
    print('* number: ', len(members))
    print(preview)
    print('\n')

* key:  -1
* number:  61
    지역             학교명  유형    과학고  외고_국제고
0  성북구  서울대학교사범대학부설중학교  국립  0.018   0.007
3  강남구           개포중학교  공립  0.013   0.013
6  강남구          압구정중학교  공립  0.015   0.036
7  강남구  단국대학교사범대학부속중학교  사립  0.032   0.005
8  강남구           대명중학교  공립  0.013   0.029


* key:  0
* number:  160
     지역               학교명  유형  과학고  외고_국제고
1   종로구  서울대학교사범대학부설여자중학교  국립  0.0   0.035
13  서초구           동덕여자중학교  사립  0.0   0.022
22  강남구             수서중학교  공립  0.0   0.019
28  서초구             언남중학교  공립  0.0   0.015
29  강남구             언북중학교  공립  0.0   0.007


* key:  1
* number:  111
     지역    학교명  유형    과학고  외고_국제고
2   강남구  개원중학교  공립  0.009   0.012
4   서초구  경원중학교  공립  0.007   0.010
5   강남구  구룡중학교  공립  0.007   0.007
11  강남구  대치중학교  공립  0.007   0.024
14  서초구  반포중학교  공립  0.010   0.013


* key:  2
* number:  50
       지역    학교명  유형  과학고  외고_국제고
46    강동구  동신중학교  사립  0.0     0.0
103   양천구  신원중학교  공립  0.0     0.0
118   구로구  개봉중학교  공립  0.0     0.0
126  영등포구  대림중학교  공립  0.0     0.0
160  동대문구  

In [87]:
# Fresh base map for the third clustering result, centred on Seoul.
# The 'Stamen Terrain' preset used originally is no longer shipped with
# current folium releases (the name is then treated as a custom tile URL and
# rejected for lacking an attribution), so use the built-in OpenStreetMap tiles.
cluster3_map = folium.Map(location=[37.55, 126.98], tiles='OpenStreetMap',
                        zoom_start=12)

In [88]:
# Draw every school coloured by its 'Cluster3' label.
# The number of labels DBSCAN produces depends on the data and parameters, so
# a label missing from the palette falls back to black instead of raising
# KeyError part-way through the loop.
for name, lat, lng, clus in zip(df.학교명, df.위도, df.경도, df.Cluster3):
    marker_color = colors.get(clus, 'black')
    folium.CircleMarker([lat, lng],
                       radius=5,
                       color=marker_color,
                       fill=True,
                       fill_color=marker_color,
                       fill_opacity=0.7,
                       popup=name).add_to(cluster3_map)

### 지도를 html 파일로 저장하기 

In [89]:
# Write the third cluster map to an HTML file in the current working directory.
cluster3_map.save('./seoul_mschool_cluster3.html')

In [90]:
# Display the final frame, now carrying the Cluster, Cluster2 and Cluster3 columns.
df

Unnamed: 0,지역,학교명,코드,유형,주야,남학생수,여학생수,일반고,특성화고,과학고,...,미상,위도,경도,location,code,type,day,Cluster,Cluster2,Cluster3
0,성북구,서울대학교사범대학부설중학교,3,국립,주간,277,0,0.585,0.148,0.018,...,0.000,37.594942,127.038909,16,0,1,0,-1,-1,-1
1,종로구,서울대학교사범대학부설여자중학교,3,국립,주간,0,256,0.680,0.199,0.000,...,0.000,37.577473,127.003857,22,0,1,0,-1,-1,0
2,강남구,개원중학교,3,공립,주간,170,152,0.817,0.047,0.009,...,0.003,37.491637,127.071744,0,0,0,0,-1,-1,1
3,강남구,개포중학교,3,공립,주간,83,72,0.755,0.097,0.013,...,0.000,37.480439,127.062201,0,0,0,0,-1,-1,-1
4,서초구,경원중학교,3,공립,주간,199,212,0.669,0.017,0.007,...,0.000,37.510750,127.008900,14,0,0,0,-1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,강남구,국립국악중학교,9,국립,주간,9,91,0.060,0.000,0.000,...,0.000,37.476381,127.051355,0,2,1,0,1,10,2
411,금천구,국립전통예술중학교,9,국립,주간,14,42,0.107,0.000,0.000,...,0.018,37.446354,126.906424,7,2,1,0,1,10,2
412,광진구,선화예술학교,9,사립,주간,33,241,0.102,0.000,0.000,...,0.036,37.549642,127.087737,5,2,2,0,1,5,2
413,중구,예원학교,9,사립,주간,31,258,0.170,0.000,0.000,...,0.031,37.566951,126.971553,23,2,2,0,1,5,2
