In [128]:
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [129]:
# 사용할 노선과 합쳐야 하는 노선 목록(중복역은 합계 후 중복 삭제예정)
# 1호선:서울지하철1호선(청량리~서울역), 경원선(소요산~청량리), 경부선(서울~천안/서동탄/광명), 장항선(천안~신창), 경인선(구로~인천)
# 2호선, 5호선, 6호선, 7호선, 8호선
# 3호선: 서울지하철3호선(지축~오금), 일산선(대화~지축)
# 4호선: 서울지하철4호선(당고개~남태령), 과천선(남태령~금정), 안산선(금정~오이도)
# 9호선: 9호선, 9호선(2~3)단계
# 수인분당선: 수인선(인천~수원), 분당선()
# 공항철도
# 신림선, 우이신설선
# 신분당선 데이터 없음, 김포 골드라인 없음, 용인 에버 없음

# 호선 분리하기 ex) 1호선, 경원선, 수인선 ......

# 월별 평일 승하차 각각 합계, 월별 주말 승하차 각각 합계

# 월별로 처리한 데이터 csv로 각 년도 폴더에 저장

In [130]:
# 2020 csv 열기
df = pd.read_csv('data/2020.csv', encoding='euc-kr', sep=',')
df.head()

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수,등록일자
0,20200101,1호선,종각,20427,16301,20200104
1,20200101,1호선,시청,12126,10516,20200104
2,20200101,우이신설선,신설동,892,828,20200104
3,20200101,우이신설선,보문,917,855,20200104
4,20200101,우이신설선,성신여대입구(돈암),2010,2363,20200104


In [131]:
# 결측치 확인
df.isna().sum().sum() 

0

In [132]:
# 등록일자 삭제
df.drop(['등록일자'], axis=1, inplace=True)
df.head()

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수
0,20200101,1호선,종각,20427,16301
1,20200101,1호선,시청,12126,10516
2,20200101,우이신설선,신설동,892,828
3,20200101,우이신설선,보문,917,855
4,20200101,우이신설선,성신여대입구(돈암),2010,2363


In [133]:
df['승하차총승객수'] = df['승차총승객수'] + df['하차총승객수']
df.head()

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수,승하차총승객수
0,20200101,1호선,종각,20427,16301,36728
1,20200101,1호선,시청,12126,10516,22642
2,20200101,우이신설선,신설동,892,828,1720
3,20200101,우이신설선,보문,917,855,1772
4,20200101,우이신설선,성신여대입구(돈암),2010,2363,4373


In [134]:
# 사용일자를 datetime 타입으로 변환
df['사용일자'] = pd.to_datetime(df['사용일자'], format='%Y%m%d')

In [135]:
# 역명, 노선명 --> 지하철역, 호선명
df = df.rename(columns={'역명': '지하철역', '노선명':'호선명'})
df.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,1호선,종각,20427,16301,36728
1,2020-01-01,1호선,시청,12126,10516,22642
2,2020-01-01,우이신설선,신설동,892,828,1720
3,2020-01-01,우이신설선,보문,917,855,1772
4,2020-01-01,우이신설선,성신여대입구(돈암),2010,2363,4373


In [136]:
# ()제거
df['지하철역'] = df['지하철역'].str.replace('(', ' ',regex=False,).str.split().str[0]

In [137]:
# 노선 이름 보기
df['호선명'].unique()

array(['1호선', '우이신설선', '9호선2~3단계', '공항철도 1호선', '9호선', '8호선', '7호선', '6호선',
       '5호선', '경강선', '수인선', '경춘선', '경의선', '장항선', '중앙선', '일산선', '분당선',
       '과천선', '안산선', '경원선', '경인선', '경부선', '4호선', '3호선', '2호선'],
      dtype=object)

In [138]:
# 1호선
df_line1 = df[df['호선명'].isin(['1호선', '경원선', '경부선', '장항선', '경인선'])].copy()
df_line1.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,1호선,종각,20427,16301,36728
1,2020-01-01,1호선,시청,12126,10516,22642
291,2020-01-01,장항선,신창,792,938,1730
292,2020-01-01,장항선,온양온천,4010,3941,7951
293,2020-01-01,장항선,배방,613,789,1402


In [139]:
df_line1['호선명'] = '1호선'
df_line1.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,1호선,종각,20427,16301,36728
1,2020-01-01,1호선,시청,12126,10516,22642
291,2020-01-01,1호선,신창,792,938,1730
292,2020-01-01,1호선,온양온천,4010,3941,7951
293,2020-01-01,1호선,배방,613,789,1402


In [140]:
df_line1.set_index('사용일자', inplace=True)
df_line1.reset_index(inplace=True)
df_line1.tail(3)

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
38162,2020-12-31,1호선,소요산,1522,1401,2923
38163,2020-12-31,1호선,동두천,1379,1589,2968
38164,2020-12-31,1호선,동두천중앙,2155,2219,4374


In [141]:
df_line1.to_csv('data/호선별/1호선.csv')
df_line1 = pd.read_csv('data/호선별/1호선.csv', index_col=0)
df_line1.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,1호선,종각,20427,16301,36728
1,2020-01-01,1호선,시청,12126,10516,22642
2,2020-01-01,1호선,신창,792,938,1730
3,2020-01-01,1호선,온양온천,4010,3941,7951
4,2020-01-01,1호선,배방,613,789,1402


In [142]:
# 2호선
df_line2 = df[df['호선명'].isin(['2호선'])].copy()
df_line2['호선명'] = '2호선'
df_line2.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
532,2020-01-01,2호선,용두,1130,1156,2286
533,2020-01-01,2호선,신정네거리,4384,4549,8933
534,2020-01-01,2호선,양천구청,2766,2916,5682
535,2020-01-01,2호선,도림천,347,380,727
536,2020-01-01,2호선,신설동,2161,1988,4149


In [143]:
df_line2.set_index('사용일자', inplace=True)
df_line2.reset_index(inplace=True)
df_line2.head(3)

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,2호선,용두,1130,1156,2286
1,2020-01-01,2호선,신정네거리,4384,4549,8933
2,2020-01-01,2호선,양천구청,2766,2916,5682


In [144]:
df_line2.to_csv('data/호선별/2호선.csv')
df_line2 = pd.read_csv('data/호선별/2호선.csv', index_col=0)
df_line2.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,2호선,용두,1130,1156,2286
1,2020-01-01,2호선,신정네거리,4384,4549,8933
2,2020-01-01,2호선,양천구청,2766,2916,5682
3,2020-01-01,2호선,도림천,347,380,727
4,2020-01-01,2호선,신설동,2161,1988,4149


In [145]:
# 3호선
df_line3 = df[df['호선명'].isin(['3호선','일산선'])].copy()
df_line3['호선명'] = '3호선'
df_line3.set_index('사용일자', inplace=True)
df_line3.reset_index(inplace=True)
df_line3.to_csv('data/호선별/3호선.csv')
df_line3 = pd.read_csv('data/호선별/3호선.csv', index_col=0)
df_line3.head()


Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,3호선,대화,6456,5901,12357
1,2020-01-01,3호선,주엽,4213,4264,8477
2,2020-01-01,3호선,정발산,4468,4495,8963
3,2020-01-01,3호선,마두,3643,3685,7328
4,2020-01-01,3호선,백석,5630,5479,11109


In [146]:
# 4호선
df_line4 = df[df['호선명'].isin(['4호선', '과천선' + '안산선'])].copy()
df_line4['호선명'] = '4호선'
df_line4.set_index('사용일자', inplace=True)
df_line4.reset_index(inplace=True)
df_line4.to_csv('data/호선별/4호선.csv')
df_line4 = pd.read_csv('data/호선별/4호선.csv', index_col=0)
df_line4.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,4호선,남태령,520,439,959
1,2020-01-01,4호선,사당,13276,11928,25204
2,2020-01-01,4호선,총신대입구,9089,9895,18984
3,2020-01-01,4호선,동작,1407,1537,2944
4,2020-01-01,4호선,이촌,2894,3238,6132


In [147]:
# 5호선
df_line5 = df[df['호선명'].isin(['5호선'])].copy()
df_line5['호선명'] = '5호선'
df_line5.set_index('사용일자', inplace=True)
df_line5.reset_index(inplace=True)
df_line5.to_csv('data/호선별/5호선.csv')
df_line5 = pd.read_csv('data/호선별/5호선.csv', index_col=0)
df_line5.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,5호선,마천,2513,2886,5399
1,2020-01-01,5호선,거여,3236,3339,6575
2,2020-01-01,5호선,개롱,2618,2800,5418
3,2020-01-01,5호선,오금,1239,1267,2506
4,2020-01-01,5호선,방이,3212,3249,6461


In [148]:
# 6호선
df_line6 = df[df['호선명'].isin(['6호선'])].copy()
df_line6['호선명'] = '6호선'
df_line6.set_index('사용일자', inplace=True)
df_line6.reset_index(inplace=True)
df_line6.to_csv('data/호선별/6호선.csv')
df_line6 = pd.read_csv('data/호선별/6호선.csv', index_col=0)
df_line6.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,6호선,봉화산,3818,3883,7701
1,2020-01-01,6호선,화랑대,4859,4226,9085
2,2020-01-01,6호선,태릉입구,2554,3063,5617
3,2020-01-01,6호선,석계,6045,6302,12347
4,2020-01-01,6호선,돌곶이,3885,4020,7905


In [149]:
# 7호선
df_line7 = df[df['호선명'].isin(['7호선'])].copy()
df_line7['호선명'] = '7호선'
df_line7.set_index('사용일자', inplace=True)
df_line7.reset_index(inplace=True)
df_line7.to_csv('data/호선별/7호선.csv')
df_line7 = pd.read_csv('data/호선별/7호선.csv', index_col=0)
df_line7.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,7호선,부평구청,4647,3983,8630
1,2020-01-01,7호선,굴포천,4424,4720,9144
2,2020-01-01,7호선,삼산체육관,2094,2043,4137
3,2020-01-01,7호선,상동,7062,6914,13976
4,2020-01-01,7호선,부천시청,5832,5755,11587


In [150]:
# 8호선
df_line8 = df[df['호선명'].isin(['8호선'])].copy()
df_line8['호선명'] = '8호선'
df_line8.set_index('사용일자', inplace=True)
df_line8.reset_index(inplace=True)
df_line8.to_csv('data/호선별/8호선.csv')
df_line8 = pd.read_csv('data/호선별/8호선.csv', index_col=0)
df_line8.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,8호선,모란,2542,2103,4645
1,2020-01-01,8호선,수진,3261,3118,6379
2,2020-01-01,8호선,신흥,3062,3257,6319
3,2020-01-01,8호선,단대오거리,4779,5044,9823
4,2020-01-01,8호선,남한산성입구,5961,5950,11911


In [151]:
# 9호선
df_line9 = df[df['호선명'].isin(['9호선', '9호선2~3단계'])].copy()
df_line9['호선명'] = '9호선'
df_line9.set_index('사용일자', inplace=True)
df_line9.reset_index(inplace=True)
df_line9.to_csv('data/호선별/9호선.csv')
df_line9 = pd.read_csv('data/호선별/9호선.csv', index_col=0)
df_line9.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,9호선,중앙보훈병원,2911,2864,5775
1,2020-01-01,9호선,둔촌오륜,333,354,687
2,2020-01-01,9호선,올림픽공원,2371,2475,4846
3,2020-01-01,9호선,한성백제,1482,1185,2667
4,2020-01-01,9호선,송파나루,2060,2100,4160


In [152]:
# 수인분당선
df_line_suin_bundang = df[df['호선명'].isin(['수인선', '분당선'])].copy()
df_line_suin_bundang['호선명'] = '수인분당선'
df_line_suin_bundang.set_index('사용일자', inplace=True)
df_line_suin_bundang.reset_index(inplace=True)
df_line_suin_bundang.to_csv('data/호선별/수인분당선.csv')
df_line_suin_bundang = pd.read_csv('data/호선별/수인분당선.csv', index_col=0)
df_line_suin_bundang.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,수인분당선,인천,1109,1571,2680
1,2020-01-01,수인분당선,신포,1111,1117,2228
2,2020-01-01,수인분당선,숭의,1789,1605,3394
3,2020-01-01,수인분당선,인하대,2319,2306,4625
4,2020-01-01,수인분당선,송도,1598,1586,3184


In [153]:
# 공항철도
df_line_airport = df[df['호선명'].isin(['공항철도 1호선'])].copy()
df_line_airport['호선명'] = '공항철도'
df_line_airport.set_index('사용일자', inplace=True)
df_line_airport.reset_index(inplace=True)
df_line_airport.to_csv('data/호선별/공항철도.csv')
df_line_airport = pd.read_csv('data/호선별/공항철도.csv', index_col=0)
df_line_airport.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,공항철도,영종,2084,2010,4094
1,2020-01-01,공항철도,인천공항2터미널,5990,5801,11791
2,2020-01-01,공항철도,인천공항1터미널,16835,17571,34406
3,2020-01-01,공항철도,공항화물청사,1344,1439,2783
4,2020-01-01,공항철도,운서,5812,6205,12017


In [154]:
# 우이신설선
df_line_ui = df[df['호선명'].isin(['우이신설선'])].copy()
df_line_ui.set_index('사용일자', inplace=True)
df_line_ui.reset_index(inplace=True)
df_line_ui.to_csv('data/호선별/우이신설선.csv')
df_line_ui = pd.read_csv('data/호선별/우이신설선.csv', index_col=0)
df_line_ui.head()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
0,2020-01-01,우이신설선,신설동,892,828,1720
1,2020-01-01,우이신설선,보문,917,855,1772
2,2020-01-01,우이신설선,성신여대입구,2010,2363,4373
3,2020-01-01,우이신설선,정릉,2096,1989,4085
4,2020-01-01,우이신설선,북한산보국문,2945,2849,5794
