In [2]:
!pip install folium

Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [3]:
import pandas as pd
import folium

In [7]:
file_path = './pandas_part7/2016_middle_shcool_graduates_report.xlsx'
df = pd.read_excel(file_path, header=0)

In [8]:
# IPython Console 디스플레이 옵션 설정하기
pd.set_option('display.width', None)        # 출력화면의 너비
pd.set_option('display.max_rows', 100)      # 출력할 행의 개수 한도
pd.set_option('display.max_columns', 10)    # 출력할 열의 개수 한도
pd.set_option('display.max_colwidth', 20)   # 출력할 열의 너비
pd.set_option('display.unicode.east_asian_width', True)   # 유니코드 사용 너비 조정

In [9]:
df.columns.values

array(['Unnamed: 0', '지역', '학교명', '코드', '유형', '주야', '남학생수', '여학생수', '일반고',
       '특성화고', '과학고', '외고_국제고', '예고_체고', '마이스터고', '자사고', '자공고', '기타진학',
       '취업', '미상', '위도', '경도'], dtype=object)

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,지역,학교명,코드,유형,...,기타진학,취업,미상,위도,경도
0,0,성북구,서울대학교사범대학부설중학교...,3,국립,...,0.004,0,0.0,37.594942,127.038909
1,1,종로구,서울대학교사범대학부설여자중학교...,3,국립,...,0.031,0,0.0,37.577473,127.003857
2,2,강남구,개원중학교,3,공립,...,0.009,0,0.003,37.491637,127.071744
3,3,강남구,개포중학교,3,공립,...,0.019,0,0.0,37.480439,127.062201
4,4,서초구,경원중학교,3,공립,...,0.01,0,0.0,37.51075,127.0089


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415 entries, 0 to 414
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  415 non-null    int64  
 1   지역          415 non-null    object 
 2   학교명         415 non-null    object 
 3   코드          415 non-null    int64  
 4   유형          415 non-null    object 
 5   주야          415 non-null    object 
 6   남학생수        415 non-null    int64  
 7   여학생수        415 non-null    int64  
 8   일반고         415 non-null    float64
 9   특성화고        415 non-null    float64
 10  과학고         415 non-null    float64
 11  외고_국제고      415 non-null    float64
 12  예고_체고       415 non-null    float64
 13  마이스터고       415 non-null    float64
 14  자사고         415 non-null    float64
 15  자공고         415 non-null    float64
 16  기타진학        415 non-null    float64
 17  취업          415 non-null    int64  
 18  미상          415 non-null    float64
 19  위도          415 non-null    f

In [12]:
df.describe()

Unnamed: 0.1,Unnamed: 0,코드,남학생수,여학생수,일반고,...,기타진학,취업,미상,위도,경도
count,415.0,415.0,415.0,415.0,415.0,...,415.0,415.0,415.0,415.0,415.0
mean,207.0,3.19759,126.53253,116.173494,0.62308,...,0.069571,0.0,0.00167,37.491969,127.032792
std,119.944432,0.804272,79.217906,76.833082,0.211093,...,0.23563,0.0,0.003697,0.348926,0.265245
min,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,34.97994,126.639561
25%,103.5,3.0,80.0,71.5,0.5665,...,0.0,0.0,0.0,37.501934,126.921758
50%,207.0,3.0,129.0,118.0,0.681,...,0.007,0.0,0.0,37.547702,127.013579
75%,310.5,3.0,177.5,161.5,0.758,...,0.015,0.0,0.003,37.59067,127.071265
max,414.0,9.0,337.0,422.0,0.908,...,1.0,0.0,0.036,37.694777,129.106974


In [15]:
# 지도에 위치 표시
mschool_map = folium.Map(location=[37.55,126.98], tiles='Stamen Terrain', 
                        zoom_start=12)
mschool_map

In [16]:
# 중학교 위치 정보를 CircleMarker로 표시
for name, lat, lng in zip(df.학교명, df.위도, df.경도):
    folium.CircleMarker([lat, lng],
                        radius=5,
                        color='brown',
                        fill=True,
                        fill_color='coral',
                        popup=name
    ).add_to(mschool_map)

In [18]:
mschool_map

In [17]:
# 지도를 html 파일로 저장하기
mschool_map.save('./seoul_location.html')

In [32]:
# 데이터 전처리
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
onehot_encoder = preprocessing.OneHotEncoder()

onehot_location = label_encoder.fit_transform(df['지역']) # string의 종류를 숫자에 대응시킨듯
onehot_code = label_encoder.fit_transform(df['코드'])
onehot_type = label_encoder.fit_transform(df['유형'])
onehot_day = label_encoder.fit_transform(df['주야'])

df['location'] = onehot_location
df['code'] = onehot_code
df['type'] = onehot_type
df['day'] = onehot_day

df.head()

Unnamed: 0.1,Unnamed: 0,지역,학교명,코드,유형,...,location,code,type,day,cluster
0,0,성북구,서울대학교사범대학부설중학교...,3,국립,...,16,0,1,0,-1
1,1,종로구,서울대학교사범대학부설여자중학교...,3,국립,...,22,0,1,0,0
2,2,강남구,개원중학교,3,공립,...,0,0,0,0,-1
3,3,강남구,개포중학교,3,공립,...,0,0,0,0,-1
4,4,서초구,경원중학교,3,공립,...,14,0,0,0,1


In [33]:
df.columns

Index(['Unnamed: 0', '지역', '학교명', '코드', '유형', '주야', '남학생수',
       '여학생수', '일반고', '특성화고', '과학고', '외고_국제고', '예고_체고',
       '마이스터고', '자사고', '자공고', '기타진학', '취업', '미상', '위도',
       '경도', 'location', 'code', 'type', 'day', 'cluster'],
      dtype='object')

In [34]:
# DBSCAN Cluster model - by sklearn
from sklearn import cluster

columns_list = [11, 12, 15] # 과고, 외고, 자사고
X = df.iloc[:, columns_list]

X = preprocessing.StandardScaler().fit(X).transform(X)

dbm = cluster.DBSCAN(eps=0.2, min_samples=5)
dbm.fit(X)

cluster_label = dbm.labels_
print(cluster_label)
print('\n')

df['cluster'] = cluster_label
print(df.head())

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0 -1  0  0  0  0  0 -1  0  0
 -1  0  0  0  1  0  0  1  0  0  0  0  0  0  0  0  0  0  0  1  0 -1  1  0
 -1 -1  0  0  0  0 -1 -1 -1  0 -1  0 -1 -1 -1 -1  0 -1  0  0 -1 -1  0 -1
 -1  0  0  0 -1 -1 -1 -1  0  0  0 -1  0 -1 -1  0 -1  0  0 -1 -1 -1  0 -1
  0  0  0  0  0 -1 -1  1  0 -1 -1 -1  0  0 -1  0  0  1  0 -1  0  0  0 -1
  0 -1  1  0  0  0 -1  1  0  0  0  0  0  0  0  0  0  1  0  0 -1 -1 -1  0
  0 -1  0  0  0  0  0  0  0 -1  0  0  0  0  0  0 -1  0  0  0  0  0 -1  0
 -1  1 -1  0  0  0  0  0  0  0 -1  0  0  0  0  1 -1  0  0 -1 -1 -1  0  1
 -1  0 -1  0  0  0  0  0  0  0  0  0  0  0 -1  1  0  0  1  0  0  0 -1  0
 -1  0 -1 -1  1 -1 -1  0 -1 -1 -1  0 -1  0 -1 -1 -1 -1  0  0  0  0  0  0
 -1  0 -1  0  0 -1  1  0 -1  0  0  1 -1  0  0  0 -1

In [35]:
# 클러스터 값으로 그룹화하기, 그룹별로 내용 출력하기
grouped_cols = [0, 1, 3] + columns_list
grouped = df.groupby('cluster')

for key, group in grouped:
    print('* key :', key)
    print('* num :', len(group))
    print(group.iloc[:, grouped_cols].head())
    print('\n')


* key : -1
* num : 97
    Unnamed: 0    지역  코드  외고_국제고  예고_체고  자공고
23          23  강남구     3        0.040      0.019   0.000
81          81  강서구     3        0.007      0.000   0.094
87          87  강서구     3        0.004      0.017   0.143
93          93  강서구     3        0.019      0.025   0.075
96          96  양천구     3        0.028      0.050   0.000


* key : 0
* num : 270
   Unnamed: 0    지역  코드  외고_국제고  예고_체고  자공고
0           0  성북구     3        0.007      0.000   0.000
1           1  종로구     3        0.035      0.008   0.004
2           2  강남구     3        0.012      0.003   0.003
3           3  강남구     3        0.013      0.019   0.000
4           4  서초구     3        0.010      0.005   0.000


* key : 1
* num : 48
     Unnamed: 0    지역  코드  외고_국제고  예고_체고  자공고
46           46  강동구     3          0.0      0.004     0.0
100         100  양천구     3          0.0      0.021     0.0
103         103  양천구     3          0.0      0.006     0.0
115         115  강서구     3          0.0     

In [36]:
# 그래프로 표현 - 시각화
colors = {-1:'gray', 0:'coral', 1:'blue', 2:'green', 3:'red', 4:'purple', 
          5:'orange', 6:'brown', 7:'brick', 8:'yellow', 9:'magenta', 10:'cyan'}

In [37]:
cluster_map = folium.Map(location=[37.55, 126.98], titles='Stamen Terrain',
                        zoom_start=12)

for name, lat, lng, clus in zip(df.학교명, df.위도, df.경도, df.cluster):  
    folium.CircleMarker([lat, lng],
                        radius=5,                   # 원의 반지름
                        color=colors[clus],         # 원의 둘레 색상
                        fill=True,
                        fill_color=colors[clus],    # 원을 채우는 색
                        fill_opacity=0.7,           # 투명도    
                        popup=name
    ).add_to(cluster_map)

cluster_map
# cluster_map.svae('file_name.html')