In [1]:
import pandas as pd
import numpy as np

In [105]:
# 티머니 시간대별 승하차 데이터
tmoney = pd.read_csv("2022_tmoney_subway.csv", low_memory=False) 
tmoney.shape

(264576, 8)

In [106]:
# 지하철 위경도 데이터
raw = pd.read_excel("지하철_위경도.xlsx")
raw.shape

(1058, 13)

In [30]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1058 entries, 0 to 1057
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1058 non-null   float64
 1   name            1058 non-null   object 
 2   address         1058 non-null   object 
 3   roadAddress     982 non-null    object 
 4   displayCode     1058 non-null   object 
 5   displayName     1058 non-null   object 
 6   city_id         1058 non-null   float64
 7   city_name       1058 non-null   object 
 8   point_x         1058 non-null   float64
 9   point_y         1058 non-null   float64
 10  routeType_id    1058 non-null   float64
 11  routeType_name  1058 non-null   object 
 12  transfers       265 non-null    float64
dtypes: float64(6), object(7)
memory usage: 107.6+ KB


In [107]:
# 도시명이 서울에 속하는 것만 가져옴
raw = raw[raw["city_name"] == "서울"]
raw.shape

(767, 13)

In [117]:
# 사용할 칼럼만 불러옴
raw = raw[["displayName", "point_x", "point_y"]]
raw

Unnamed: 0,displayName,point_x,point_y
0,소요산역,127.061049,37.948747
5,동두천역,127.054850,37.927574
10,보산역,127.057163,37.914265
15,동두천중앙역,127.056421,37.901682
20,지행역,127.055751,37.892370
...,...,...,...
1053,아산역,127.104573,36.792118
1054,탕정역,127.084638,36.788270
1055,배방역,127.052842,36.777592
1056,온양온천역,127.003178,36.780541


In [119]:
# 칼럼 이름 지정
raw.columns = ["지하철역", "경도", "위도"]
raw

Unnamed: 0,지하철역,경도,위도
0,소요산역,127.061049,37.948747
5,동두천역,127.054850,37.927574
10,보산역,127.057163,37.914265
15,동두천중앙역,127.056421,37.901682
20,지행역,127.055751,37.892370
...,...,...,...
1053,아산역,127.104573,36.792118
1054,탕정역,127.084638,36.788270
1055,배방역,127.052842,36.777592
1056,온양온천역,127.003178,36.780541


In [121]:
# 환승역 때문에 지하철역이 중복되서 들어간 것을 확인.
# 이를 처리하기 위해 위경도의 평균값을 사용
raw = raw.groupby("지하철역", as_index=False).mean()
raw

Unnamed: 0,지하철역,경도,위도
0,4.19민주묘지역,127.013669,37.649554
1,가능역,127.044272,37.748370
2,가락시장역,127.118001,37.492591
3,가산디지털단지역,126.882584,37.481007
4,가양역,126.854433,37.561469
...,...,...,...
633,회현역,126.978559,37.558776
634,효자역,127.077281,37.753999
635,효창공원앞역,126.961987,37.538941
636,흑석역,126.963395,37.509183


In [123]:
# 불러온 두 데이터셋의 역사명이 서로 다르고 중구난방으로 되어있어 통일해주기
# 함수 정의
def preprocess_subway_name(data, col):
    data[col] = data[col].replace('\([^)]*\)',"",regex=True)
    data[col] = data[col].str.replace(" ", "")
    data[col] = data[col].str.rstrip("역")
    data[col]+="역"

In [124]:
# 함수 적용
preprocess_subway_name(tmoney, "지하철역")
preprocess_subway_name(raw, "지하철역")

In [125]:
tmoney.head(2)

Unnamed: 0,사용월,호선명,역ID,지하철역,승하차시간,승하차인원,시간,승하차
0,2022-01,1호선,1.0,서울역,1시_승차,530,1시,승차
1,2022-01,1호선,10.0,동묘앞역,1시_승차,111,1시,승차


In [127]:
# 데이터 merge
df = tmoney.merge(raw, on="지하철역")
df

Unnamed: 0,사용월,호선명,역ID,지하철역,승하차시간,승하차인원,시간,승하차,경도,위도
0,2022-01,1호선,1.0,서울역,1시_승차,530,1시,승차,126.971535,37.554903
1,2022-01,4호선,112.0,서울역,1시_승차,179,1시,승차,126.971535,37.554903
2,2022-01,경부선,121.0,서울역,1시_승차,2,1시,승차,126.971535,37.554903
3,2022-01,경의선,172.0,서울역,1시_승차,0,1시,승차,126.971535,37.554903
4,2022-01,공항철도 1호선,656.0,서울역,1시_승차,159,1시,승차,126.971535,37.554903
...,...,...,...,...,...,...,...,...,...,...
264571,2022-05,신림선,4411.0,관악산역,24시_하차,0,24시,하차,126.945149,37.468935
264572,2022-06,신림선,4411.0,관악산역,24시_하차,0,24시,하차,126.945149,37.468935
264573,2022-07,신림선,4411.0,관악산역,24시_하차,0,24시,하차,126.945149,37.468935
264574,2022-08,신림선,4411.0,관악산역,24시_하차,0,24시,하차,126.945149,37.468935


In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264576 entries, 0 to 264575
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   사용월     264576 non-null  object 
 1   호선명     264576 non-null  object 
 2   역ID     264576 non-null  float64
 3   지하철역    264576 non-null  object 
 4   승하차시간   264576 non-null  object 
 5   승하차인원   264576 non-null  object 
 6   시간      264576 non-null  object 
 7   승하차     264576 non-null  object 
 8   경도      264576 non-null  float64
 9   위도      264576 non-null  float64
dtypes: float64(3), object(7)
memory usage: 22.2+ MB


In [201]:
# 승하차인원 컬럼 숫자형으로 변환
df["승하차인원"] = df["승하차인원"].str.replace(",", "").astype(int)

AttributeError: Can only use .str accessor with string values!

In [167]:
# 역사별 승하차인원에 위경도 데이터 merge
on_off = pd.pivot_table(df, index="지하철역", columns="승하차", values="승하차인원", aggfunc="sum")
on_off = on_off.merge(raw, on="지하철역")
on_off

Unnamed: 0,지하철역,승차,하차,경도,위도
0,4.19민주묘지역,801590,750725,127.013669,37.649554
1,가능역,1651520,1579217,127.044272,37.748370
2,가락시장역,3879711,4042002,127.118001,37.492591
3,가산디지털단지역,13426389,14039716,126.882584,37.481007
4,가양역,4944772,4788551,126.854433,37.561469
...,...,...,...,...,...
524,회기역,6184902,5978154,127.057682,37.589558
525,회룡역,3084746,3052498,127.047224,37.724531
526,회현역,5642642,5913445,126.978559,37.558776
527,효창공원앞역,2393674,2324262,126.961987,37.538941


In [178]:
import folium

center = [37.563, 126.986]
zoom = 12

m = folium.Map(center, zoom_start=zoom)

for i in on_off.index:
    
    m_name = on_off.loc[i, "지하철역"]
    m_long = on_off.loc[i, "경도"]
    m_lat = on_off.loc[i, "위도"]
    m_on = on_off.loc[i, "승차"]
    m_off = on_off.loc[i, "하차"]
    
    radius = np.sqrt(np.sqrt(m_on + m_off)) - 30
    tooltip = f"역사명 : {m_name}, 승차 : {m_on}, 하차 : {m_off}"
    
    folium.CircleMarker([m_lat, m_long],
                       radius=radius,
                       fill=True,
                       tooltip=tooltip).add_to(m)

m

In [181]:
# 승하차 인원을 더한 유동인구 칼럼 생성
on_off["유동인구"] = on_off["승차"] + on_off["하차"]
on_off

Unnamed: 0,지하철역,승차,하차,경도,위도,유동인구
0,4.19민주묘지역,801590,750725,127.013669,37.649554,1552315
1,가능역,1651520,1579217,127.044272,37.748370,3230737
2,가락시장역,3879711,4042002,127.118001,37.492591,7921713
3,가산디지털단지역,13426389,14039716,126.882584,37.481007,27466105
4,가양역,4944772,4788551,126.854433,37.561469,9733323
...,...,...,...,...,...,...
524,회기역,6184902,5978154,127.057682,37.589558,12163056
525,회룡역,3084746,3052498,127.047224,37.724531,6137244
526,회현역,5642642,5913445,126.978559,37.558776,11556087
527,효창공원앞역,2393674,2324262,126.961987,37.538941,4717936


In [197]:
# 유동인구 상위 30개역만 추출
top_30 = on_off.iloc[on_off["유동인구"].nlargest(30).index].reset_index(drop=True)
top_30

Unnamed: 0,지하철역,승차,하차,경도,위도,유동인구
0,잠실역,21481670,21776044,127.102419,37.514062,43257714
1,강남역,19389564,19046174,127.027988,37.497162,38435738
2,고속터미널역,19042480,18986416,127.004606,37.504793,38028896
3,홍대입구역,18278155,19305289,126.926061,37.557277,37583444
4,서울역,18747373,18775813,126.971535,37.554903,37523186
5,사당역,15429543,15424073,126.981516,37.476686,30853616
6,선릉역,15203031,15297213,127.048943,37.504931,30500244
7,신림역,15042467,14598788,126.929676,37.484484,29641255
8,가산디지털단지역,13426389,14039716,126.882584,37.481007,27466105
9,구로디지털단지역,13476638,13427002,126.901594,37.485215,26903640


In [205]:
center = [37.563, 126.986]
zoom = 12

m = folium.Map(center, zoom_start=zoom)

for i in top_30.index:
    
    m_name = top_30.loc[i, "지하철역"]
    m_long = top_30.loc[i, "경도"]
    m_lat = top_30.loc[i, "위도"]
    m_on = top_30.loc[i, "승차"]
    m_off = top_30.loc[i, "하차"]
    
    radius = np.sqrt(np.sqrt(m_on + m_off)) - 30
    tooltip = f"역사명 : {m_name}, 승차 : {m_on}, 하차 : {m_off}"
    
    folium.CircleMarker([m_lat, m_long],
                       radius=radius,
                       fill=True,
                       tooltip=tooltip).add_to(m)

m