# DBSCAN(Density-based Spatial Clustering of Applications with Noise)
- P.339~
- 데이터가 위치하고 있는 공간 밀집도를 기준으로 클러스터를 구분
- 코어 포인트(core point) : 자기(코어 포인트)를 중심으로 반지름 R의 공간에 최소 M개의 포인트가 존재하는 점
- 경계 포인트(border point) : 코어 포인트는 아니지만 반지름 R 안에 다른 코어 포인트가 있는 점
- Noise(outlier) : 코어 포인트도 아니고 경계 포인트에도 속하지 않는 점
- 하나의 클러스터는 반지름 R 안에 서로 위치하는 모든 코어 포인트를 포함하는 방식으로 구성하며 각 코어 포인트 주위에 있는 경계 포인트를 포함.
- 서로 밀접한 데이터끼리 하나의 클러스터를 구성하게 되고 어느 클러스터에도 속하지 않는 점들은 Noise로 남게 됨

In [2]:
# Remove every user-defined global (any name not starting with "_") so the
# notebook starts from a clean state.
# The helper list has a private name and is deleted afterwards, so the cleanup
# leaves nothing behind and never shadows the built-in `all`.
_user_names = [name for name in globals() if not name.startswith("_")]
while _user_names:
    del globals()[_user_names.pop()]
del _user_names

In [3]:
import pandas as pd
import folium

In [4]:
# Mount Google Drive inside the Colab runtime at /content/mnt
from google.colab import drive
drive.mount('/content/mnt')

Mounted at /content/mnt


In [5]:
# Base directory on the mounted Drive; later cells read the dataset from it
# and save the generated HTML maps into it
f = '/content/mnt/MyDrive/데이터/판다스데이터분석_실습/예제_파이썬머신러닝 판다스 데이터분석/part7'

## 고등학교 진학률 데이터를 활용하여 속성이 비슷한 중학교끼리 군집화

## 1. 데이터 준비

In [19]:
# Load the Seoul middle-school graduates (high-school advancement rate) dataset;
# the first spreadsheet row is the header and the first column becomes the index
df = pd.read_excel(f'{f}/2016_middle_shcool_graduates_report.xlsx', header=0, engine='openpyxl', index_col=0)

In [22]:
# Print the column names as an array
print(df.columns.values)   

['지역' '학교명' '코드' '유형' '주야' '남학생수' '여학생수' '일반고' '특성화고' '과학고' '외고_국제고'
 '예고_체고' '마이스터고' '자사고' '자공고' '기타진학' '취업' '미상' '위도' '경도']


## 2. 데이터 탐색

In [23]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,지역,학교명,코드,유형,주야,남학생수,여학생수,일반고,특성화고,과학고,외고_국제고,예고_체고,마이스터고,자사고,자공고,기타진학,취업,미상,위도,경도
0,성북구,서울대학교사범대학부설중학교,3,국립,주간,277,0,0.585,0.148,0.018,0.007,0.0,0.011,0.227,0.0,0.004,0,0.0,37.594942,127.038909
1,종로구,서울대학교사범대학부설여자중학교,3,국립,주간,0,256,0.68,0.199,0.0,0.035,0.008,0.0,0.043,0.004,0.031,0,0.0,37.577473,127.003857
2,강남구,개원중학교,3,공립,주간,170,152,0.817,0.047,0.009,0.012,0.003,0.006,0.09,0.003,0.009,0,0.003,37.491637,127.071744
3,강남구,개포중학교,3,공립,주간,83,72,0.755,0.097,0.013,0.013,0.019,0.019,0.065,0.0,0.019,0,0.0,37.480439,127.062201
4,서초구,경원중학교,3,공립,주간,199,212,0.669,0.017,0.007,0.01,0.005,0.0,0.282,0.0,0.01,0,0.0,37.51075,127.0089


In [24]:
df.info() # Check the dtype and non-null count of every column

<class 'pandas.core.frame.DataFrame'>
Int64Index: 415 entries, 0 to 414
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   지역      415 non-null    object 
 1   학교명     415 non-null    object 
 2   코드      415 non-null    int64  
 3   유형      415 non-null    object 
 4   주야      415 non-null    object 
 5   남학생수    415 non-null    int64  
 6   여학생수    415 non-null    int64  
 7   일반고     415 non-null    float64
 8   특성화고    415 non-null    float64
 9   과학고     415 non-null    float64
 10  외고_국제고  415 non-null    float64
 11  예고_체고   415 non-null    float64
 12  마이스터고   415 non-null    float64
 13  자사고     415 non-null    float64
 14  자공고     415 non-null    float64
 15  기타진학    415 non-null    float64
 16  취업      415 non-null    int64  
 17  미상      415 non-null    float64
 18  위도      415 non-null    float64
 19  경도      415 non-null    float64
dtypes: float64(12), int64(4), object(4)
memory usage: 68.1+ KB


In [25]:
# Summary statistics of the numeric columns
df.describe()

Unnamed: 0,코드,남학생수,여학생수,일반고,특성화고,과학고,외고_국제고,예고_체고,마이스터고,자사고,자공고,기타진학,취업,미상,위도,경도
count,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0
mean,3.19759,126.53253,116.173494,0.62308,0.149684,0.004378,0.013687,0.017393,0.005251,0.080971,0.034296,0.069571,0.0,0.00167,37.491969,127.032792
std,0.804272,79.217906,76.833082,0.211093,0.102977,0.006739,0.011548,0.092006,0.007557,0.079136,0.080302,0.23563,0.0,0.003697,0.348926,0.265245
min,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.97994,126.639561
25%,3.0,80.0,71.5,0.5665,0.0655,0.0,0.006,0.0,0.0,0.027,0.0,0.0,0.0,0.0,37.501934,126.921758
50%,3.0,129.0,118.0,0.681,0.149,0.0,0.012,0.005,0.003,0.06,0.0,0.007,0.0,0.0,37.547702,127.013579
75%,3.0,177.5,161.5,0.758,0.2245,0.006,0.019,0.011,0.007,0.1115,0.019,0.015,0.0,0.003,37.59067,127.071265
max,9.0,337.0,422.0,0.908,0.477,0.055,0.11,0.94,0.046,0.485,0.531,1.0,0.0,0.036,37.694777,129.106974


In [28]:
# Base map for plotting the school locations.
# NOTE(review): recent folium releases reportedly no longer bundle the
# 'Stamen Terrain' tiles — confirm against the installed folium version.
mschool_map = folium.Map(
    location=[37.55, 126.98],
    tiles='Stamen Terrain',
    zoom_start=12,
)

# One circle marker per middle school; the popup shows the school name.
for name, lat, lng in df[['학교명', '위도', '경도']].itertuples(index=False, name=None):
    mschool_map.add_child(
        folium.CircleMarker(
            location=[lat, lng],
            radius=5,              # circle radius
            color='brown',         # outline colour
            fill=True,
            fill_color='coral',    # fill colour
            fill_opacity=0.7,      # fill opacity
            popup=name,
        )
    )

# mschool_map

In [29]:
# Save the map as an HTML file
mschool_map.save(f'{f}/seoul_mschool_location.html')

## 3. 데이터 전처리

In [30]:
# Label-encode the categorical columns: every distinct value is mapped to an
# integer code and stored in a new column.
# NOTE: despite the `onehot_` prefix this is label encoding (one integer
# column per feature), not one-hot / dummy encoding.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# The single encoder is re-fitted for each column in turn.
onehot_location = label_encoder.fit_transform(df['지역'])
df['location'] = onehot_location

onehot_code = label_encoder.fit_transform(df['코드'])
df['code'] = onehot_code

onehot_type = label_encoder.fit_transform(df['유형'])
df['type'] = onehot_type

onehot_day = label_encoder.fit_transform(df['주야'])
df['day'] = onehot_day

df.head()

Unnamed: 0,지역,학교명,코드,유형,주야,남학생수,여학생수,일반고,특성화고,과학고,외고_국제고,예고_체고,마이스터고,자사고,자공고,기타진학,취업,미상,위도,경도,location,code,type,day
0,성북구,서울대학교사범대학부설중학교,3,국립,주간,277,0,0.585,0.148,0.018,0.007,0.0,0.011,0.227,0.0,0.004,0,0.0,37.594942,127.038909,16,0,1,0
1,종로구,서울대학교사범대학부설여자중학교,3,국립,주간,0,256,0.68,0.199,0.0,0.035,0.008,0.0,0.043,0.004,0.031,0,0.0,37.577473,127.003857,22,0,1,0
2,강남구,개원중학교,3,공립,주간,170,152,0.817,0.047,0.009,0.012,0.003,0.006,0.09,0.003,0.009,0,0.003,37.491637,127.071744,0,0,0,0
3,강남구,개포중학교,3,공립,주간,83,72,0.755,0.097,0.013,0.013,0.019,0.019,0.065,0.0,0.019,0,0.0,37.480439,127.062201,0,0,0,0
4,서초구,경원중학교,3,공립,주간,199,212,0.669,0.017,0.007,0.01,0.005,0.0,0.282,0.0,0.01,0,0.0,37.51075,127.0089,14,0,0,0


## 4. DBSCAN 군집 모형

- 정규화

In [33]:
from sklearn.preprocessing import StandardScaler

# Positional indices of the features used for clustering: the advancement
# rates to science high schools, foreign-language/international high schools
# and autonomous private high schools.
columns_list = [9, 10, 13]
X = df.iloc[:, columns_list]
print(X.iloc[:5])
print('\n')

# Standardise the selected features (fit and transform in a single call)
X = StandardScaler().fit_transform(X)

     과학고  외고_국제고    자사고
0  0.018   0.007  0.227
1  0.000   0.035  0.043
2  0.009   0.012  0.090
3  0.013   0.013  0.065
4  0.007   0.010  0.282




- 모형 학습/예측

In [36]:
from sklearn.cluster import DBSCAN

# Build the DBSCAN model and fit it on the standardised features.
# eps: neighbourhood radius (R), min_samples: minimum number of points (M)
dbm = DBSCAN(eps=0.2, min_samples=5).fit(X)

# Cluster label assigned to each row; -1 marks noise. In the recorded run
# the remaining labels are the four clusters 0, 1, 2 and 3.
cluster_label = dbm.labels_
print(cluster_label)

[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1  2 -1  0 -1
 -1 -1 -1 -1  0 -1 -1 -1 -1 -1  0  3 -1 -1 -1 -1 -1 -1 -1  0 -1 -1  1  0
 -1 -1 -1  0 -1 -1 -1 -1  0 -1  0  0 -1 -1  0 -1 -1 -1  0  0 -1 -1  0 -1
 -1 -1  0 -1 -1 -1  0  2  0  0  0  0  0 -1 -1 -1  0 -1  0 -1 -1  0 -1  0
 -1  0  0 -1 -1 -1 -1  1  0 -1  0  0 -1 -1 -1  0 -1 -1 -1 -1 -1  0  1 -1
 -1  0  2  0 -1 -1  1 -1 -1 -1  0  0  0 -1 -1  0 -1 -1 -1  0  0 -1 -1 -1
 -1  0 -1 -1 -1  0 -1 -1 -1  0 -1  0  0 -1 -1 -1 -1 -1  0 -1  0  0 -1 -1
 -1 -1 -1  0 -1 -1 -1  1  0  3  1 -1  0  0 -1  0 -1 -1  0  0  2 -1 -1  3
  0  0 -1 -1 -1 -1  0 -1  0  0 -1  0  0  0 -1 -1  0 -1 -1 -1 -1 -1  2  0
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1  0 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1  0  0 -1 -1  0 -1  3  0  2 -1 -1
 -1 -1  0 -1 -1 -1  0 -1  0  0 -1 -1 -1 -1 -1  1 -1  0  1 -1  0  0  1 -1
  2 -1  0 -1 -1 -1 -1  0 -1 -1  1  0 -1  0 -1 -1  0  3  0 -1 -1 -1  2 -1
 -1 -1 -1  0  0  0  1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1

In [37]:
# Attach the cluster labels to the DataFrame as a new 'Cluster' column
df['Cluster'] = cluster_label
df.head()

Unnamed: 0,지역,학교명,코드,유형,주야,남학생수,여학생수,일반고,특성화고,과학고,외고_국제고,예고_체고,마이스터고,자사고,자공고,기타진학,취업,미상,위도,경도,location,code,type,day,Cluster
0,성북구,서울대학교사범대학부설중학교,3,국립,주간,277,0,0.585,0.148,0.018,0.007,0.0,0.011,0.227,0.0,0.004,0,0.0,37.594942,127.038909,16,0,1,0,-1
1,종로구,서울대학교사범대학부설여자중학교,3,국립,주간,0,256,0.68,0.199,0.0,0.035,0.008,0.0,0.043,0.004,0.031,0,0.0,37.577473,127.003857,22,0,1,0,-1
2,강남구,개원중학교,3,공립,주간,170,152,0.817,0.047,0.009,0.012,0.003,0.006,0.09,0.003,0.009,0,0.003,37.491637,127.071744,0,0,0,0,-1
3,강남구,개포중학교,3,공립,주간,83,72,0.755,0.097,0.013,0.013,0.019,0.019,0.065,0.0,0.019,0,0.0,37.480439,127.062201,0,0,0,0,-1
4,서초구,경원중학교,3,공립,주간,199,212,0.669,0.017,0.007,0.01,0.005,0.0,0.282,0.0,0.01,0,0.0,37.51075,127.0089,14,0,0,0,-1


In [38]:
# Group the schools by cluster label and, for each group, print its label,
# its size and the first five rows (region, school name, school type plus the
# clustering features).
grouped_cols = [0, 1, 3, *columns_list]
grouped = df.groupby('Cluster')
for key, group in grouped:
    print('* key :', key)
    print('* number :', group.shape[0])
    print(group.iloc[:5, grouped_cols])
    print('\n')

* key : -1
* number : 255
    지역               학교명  유형    과학고  외고_국제고    자사고
0  성북구    서울대학교사범대학부설중학교  국립  0.018   0.007  0.227
1  종로구  서울대학교사범대학부설여자중학교  국립  0.000   0.035  0.043
2  강남구             개원중학교  공립  0.009   0.012  0.090
3  강남구             개포중학교  공립  0.013   0.013  0.065
4  서초구             경원중학교  공립  0.007   0.010  0.282


* key : 0
* number : 102
     지역      학교명  유형  과학고  외고_국제고    자사고
13  서초구  동덕여자중학교  사립  0.0   0.022  0.038
22  강남구    수서중학교  공립  0.0   0.019  0.044
28  서초구    언남중학교  공립  0.0   0.015  0.050
34  강남구    은성중학교  사립  0.0   0.016  0.065
43  송파구    거원중학교  공립  0.0   0.021  0.054


* key : 1
* number : 45
       지역      학교명  유형  과학고  외고_국제고    자사고
46    강동구    동신중학교  사립  0.0     0.0  0.044
103   양천구    신원중학교  공립  0.0     0.0  0.006
118   구로구    개봉중학교  공립  0.0     0.0  0.012
126  영등포구    대림중학교  공립  0.0     0.0  0.050
175   중랑구  혜원여자중학교  사립  0.0     0.0  0.004


* key : 2
* number : 8
      지역    학교명  유형    과학고  외고_국제고    자사고
20   서초구  서초중학교  공립  0.003   0.013  0.085
79

- 지도 시각화

In [40]:
# Visualisation: marker colour for each cluster label (-1 is the noise label).
# The values are used as the marker `color` / `fill_color`, so each one must be
# a valid CSS colour name. 'brick' is not one, so label 7 uses 'firebrick'.
colors = {-1:'gray', 0:'coral', 1:'blue', 2:'green', 3:'red', 4:'purple', 
          5:'orange', 6:'brown', 7:'firebrick', 8:'yellow', 9:'magenta', 10:'cyan', 11:'tan'}

# Base map for the first clustering result.
# NOTE(review): recent folium releases reportedly no longer bundle the
# 'Stamen Terrain' tiles — confirm against the installed folium version.
cluster_map = folium.Map(
    location=[37.55, 126.98],
    tiles='Stamen Terrain',
    zoom_start=12,
)

# One circle marker per school, coloured by its 'Cluster' label.
for name, lat, lng, clus in df[['학교명', '위도', '경도', 'Cluster']].itertuples(index=False, name=None):
    cluster_map.add_child(
        folium.CircleMarker(
            location=[lat, lng],
            radius=5,                   # circle radius
            color=colors[clus],         # outline colour
            fill=True,
            fill_color=colors[clus],    # fill colour
            fill_opacity=0.7,           # fill opacity
            popup=name,
        )
    )

# cluster_map

In [None]:
# Save the map as an HTML file
cluster_map.save(f'{f}/seoul_mschool_cluster.html')

- 설명 변수를 추가/축소하여 군집화

In [42]:
# Repeat the workflow on X2: the three advancement-rate features plus the
# label-encoded school type (positional column 22, printed as 'type').
columns_list2 = [9, 10, 13, 22]
X2 = df.iloc[:, columns_list2]
print(X2.iloc[:5])
print('\n')

# Standardise the features
X2 = StandardScaler().fit_transform(X2)
# Build the DBSCAN model and fit it
dbm2 = DBSCAN(eps=0.2, min_samples=5).fit(X2)
# Store the resulting cluster labels
df['Cluster2'] = dbm2.labels_

# Original note for the recorded run: 11 clusters excluding noise.
# For each cluster print its label, its size and the first five rows.
grouped2_cols = [0, 1, 3, *columns_list2]
grouped2 = df.groupby('Cluster2')
for key, group in grouped2:
    print('* key :', key)
    print('* number :', group.shape[0])
    print(group.iloc[:5, grouped2_cols])
    print('\n')

     과학고  외고_국제고    자사고  type
0  0.018   0.007  0.227     1
1  0.000   0.035  0.043     1
2  0.009   0.012  0.090     0
3  0.013   0.013  0.065     0
4  0.007   0.010  0.282     0


* key : -1
* number : 281
    지역               학교명  유형    과학고  외고_국제고    자사고  type
0  성북구    서울대학교사범대학부설중학교  국립  0.018   0.007  0.227     1
1  종로구  서울대학교사범대학부설여자중학교  국립  0.000   0.035  0.043     1
2  강남구             개원중학교  공립  0.009   0.012  0.090     0
3  강남구             개포중학교  공립  0.013   0.013  0.065     0
4  서초구             경원중학교  공립  0.007   0.010  0.282     0


* key : 0
* number : 8
      지역    학교명  유형  과학고  외고_국제고    자사고  type
22   강남구  수서중학교  공립  0.0   0.019  0.044     0
43   송파구  거원중학교  공립  0.0   0.021  0.054     0
51   송파구  방이중학교  공립  0.0   0.021  0.068     0
93   강서구  방원중학교  공립  0.0   0.019  0.057     0
164  중랑구  원묵중학교  공립  0.0   0.020  0.062     0


* key : 1
* number : 59
     지역    학교명  유형  과학고  외고_국제고    자사고  type
28  서초구  언남중학교  공립  0.0   0.015  0.050     0
47  강동구  둔촌중학교  공립  0.0   0.010  

In [43]:
# Base map for the second clustering result.
# NOTE(review): recent folium releases reportedly no longer bundle the
# 'Stamen Terrain' tiles — confirm against the installed folium version.
cluster2_map = folium.Map(
    location=[37.55, 126.98],
    tiles='Stamen Terrain',
    zoom_start=12,
)

# One circle marker per school, coloured by its 'Cluster2' label.
for name, lat, lng, clus in df[['학교명', '위도', '경도', 'Cluster2']].itertuples(index=False, name=None):
    cluster2_map.add_child(
        folium.CircleMarker(
            location=[lat, lng],
            radius=5,                   # circle radius
            color=colors[clus],         # outline colour
            fill=True,
            fill_color=colors[clus],    # fill colour
            fill_opacity=0.7,           # fill opacity
            popup=name,
        )
    )

# cluster2_map

In [None]:
# Save the map as an HTML file
cluster2_map.save(f'{f}/seoul_mschool_cluster2.html')

In [44]:
# Repeat the workflow on X3: only the science high school and
# foreign-language/international high school advancement rates.
columns_list3 = [9, 10]
X3 = df.iloc[:, columns_list3]
print(X3.iloc[:5])
print('\n')

# Standardise the features
X3 = StandardScaler().fit_transform(X3)
# Build the DBSCAN model and fit it
dbm3 = DBSCAN(eps=0.2, min_samples=5).fit(X3)
# Store the resulting cluster labels
df['Cluster3'] = dbm3.labels_

# Original note for the recorded run: 7 clusters excluding noise.
# For each cluster print its label, its size and the first five rows.
grouped3_cols = [0, 1, 3, *columns_list3]
grouped3 = df.groupby('Cluster3')
for key, group in grouped3:
    print('* key :', key)
    print('* number :', group.shape[0])
    print(group.iloc[:5, grouped3_cols])
    print('\n')

     과학고  외고_국제고
0  0.018   0.007
1  0.000   0.035
2  0.009   0.012
3  0.013   0.013
4  0.007   0.010


* key : -1
* number : 61
    지역             학교명  유형    과학고  외고_국제고
0  성북구  서울대학교사범대학부설중학교  국립  0.018   0.007
3  강남구           개포중학교  공립  0.013   0.013
6  강남구          압구정중학교  공립  0.015   0.036
7  강남구  단국대학교사범대학부속중학교  사립  0.032   0.005
8  강남구           대명중학교  공립  0.013   0.029


* key : 0
* number : 160
     지역               학교명  유형  과학고  외고_국제고
1   종로구  서울대학교사범대학부설여자중학교  국립  0.0   0.035
13  서초구           동덕여자중학교  사립  0.0   0.022
22  강남구             수서중학교  공립  0.0   0.019
28  서초구             언남중학교  공립  0.0   0.015
29  강남구             언북중학교  공립  0.0   0.007


* key : 1
* number : 111
     지역    학교명  유형    과학고  외고_국제고
2   강남구  개원중학교  공립  0.009   0.012
4   서초구  경원중학교  공립  0.007   0.010
5   강남구  구룡중학교  공립  0.007   0.007
11  강남구  대치중학교  공립  0.007   0.024
14  서초구  반포중학교  공립  0.010   0.013


* key : 2
* number : 50
       지역    학교명  유형  과학고  외고_국제고
46    강동구  동신중학교  사립  0.0     0.0
103   양천구

In [45]:
# Base map for the third clustering result.
# NOTE(review): recent folium releases reportedly no longer bundle the
# 'Stamen Terrain' tiles — confirm against the installed folium version.
cluster3_map = folium.Map(
    location=[37.55, 126.98],
    tiles='Stamen Terrain',
    zoom_start=12,
)

# One circle marker per school, coloured by its 'Cluster3' label.
for name, lat, lng, clus in df[['학교명', '위도', '경도', 'Cluster3']].itertuples(index=False, name=None):
    cluster3_map.add_child(
        folium.CircleMarker(
            location=[lat, lng],
            radius=5,                   # circle radius
            color=colors[clus],         # outline colour
            fill=True,
            fill_color=colors[clus],    # fill colour
            fill_opacity=0.7,           # fill opacity
            popup=name,
        )
    )

# cluster3_map

In [None]:
# Save the map as an HTML file
cluster3_map.save(f'{f}/seoul_mschool_cluster3.html')