In [1]:
import datetime

import numpy as np
import pandas as pd

In [2]:
# Load the 2020 CSV; it is read with the EUC-KR codec.
df = pd.read_csv(
    'data/2020.csv',
    sep=',',
    encoding='euc-kr',
)
# Peek at a five-row slice from the middle of the frame.
df.iloc[500:505]

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수,등록일자
500,20200101,3호선,경찰병원,2689,2816,20200104
501,20200101,3호선,가락시장,3711,3584,20200104
502,20200101,3호선,수서,9190,9114,20200104
503,20200101,3호선,일원,3016,3423,20200104
504,20200101,3호선,대청,3906,3658,20200104


In [3]:
# Rename / drop columns.
# `df` is rebound from a method chain instead of being mutated in place, and
# the drop tolerates an already-missing column, so re-running this cell does
# not raise KeyError once '등록일자' has been removed.
df = (
    df
    .rename(columns={'역명': '지하철역', '노선명': '호선명'})
    .drop(columns=['등록일자'], errors='ignore')
)
df.iloc[500:505]

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수
500,20200101,3호선,경찰병원,2689,2816
501,20200101,3호선,가락시장,3711,3584
502,20200101,3호선,수서,9190,9114
503,20200101,3호선,일원,3016,3423
504,20200101,3호선,대청,3906,3658


In [4]:
# Line-name cleanup (e.g. '공항철도 1호선') is not applied; the draft below is
# left disabled.
# df.지하철역 = df['지하철역'].str.replace(' ',' ',regex=True).str.split().str[0]

# Station-name cleanup: 경복궁(정부서울청사) -> 경복궁.
# '(' is turned into a space, the name is split on whitespace, and only the
# first token is kept.
df['지하철역'] = (
    df['지하철역']
    .str.replace('(', ' ', regex=False)
    .str.split()
    .str.get(0)
)
df.iloc[500:505]

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수
500,20200101,3호선,경찰병원,2689,2816
501,20200101,3호선,가락시장,3711,3584
502,20200101,3호선,수서,9190,9114
503,20200101,3호선,일원,3016,3423
504,20200101,3호선,대청,3906,3658


In [5]:
# Total passengers per row = boarding count + alighting count.
df['승하차총승객수'] = df['승차총승객수'].add(df['하차총승객수'])
df.iloc[500:505]

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수
500,20200101,3호선,경찰병원,2689,2816,5505
501,20200101,3호선,가락시장,3711,3584,7295
502,20200101,3호선,수서,9190,9114,18304
503,20200101,3호선,일원,3016,3423,6439
504,20200101,3호선,대청,3906,3658,7564


In [7]:
# Label every row as weekday ('평일') or weekend ('주말') from its 사용일자.
# 사용일자 holds dates written as YYYYMMDD (e.g. 20200101); the whole column is
# parsed in one vectorised call instead of slicing digits row by row in a
# Python loop.
usage_dates = pd.to_datetime(df['사용일자'].astype(str), format='%Y%m%d')

# Fail loudly rather than silently labelling an unparsed date as a weekday.
if usage_dates.isna().any():
    raise ValueError('사용일자 contains values that are not valid YYYYMMDD dates')

# dayofweek uses the same numbering as datetime.date.weekday():
# Monday=0 ... Sunday=6, so 5 and 6 are Saturday and Sunday.
week_end = np.where(usage_dates.dt.dayofweek >= 5, '주말', '평일')

# Create the weekday/weekend column.
df['평일주말'] = week_end
df.tail()

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수,평일주말
217050,20201231,경의선,신촌,541,633,1174,평일
217051,20201231,경의선,서울역,2512,3329,5841,평일
217052,20201231,경원선,소요산,1522,1401,2923,평일
217053,20201231,경원선,동두천,1379,1589,2968,평일
217054,20201231,경원선,동두천중앙,2155,2219,4374,평일


In [10]:
# Weekday subset: keep the rows labelled '평일' and write them to their own
# CSV file (the row index is not written).
df_week_day = df.loc[df['평일주말'].eq('평일')]
df_week_day.to_csv(
    'data/평일/2020_평일.csv',
    index=False,
)

In [11]:
# Read the weekday CSV back in; with no index column given, the rows get a
# fresh 0-based index.
df_week_day = pd.read_csv(
    'data/평일/2020_평일.csv',
    sep=',',
    encoding='utf-8',
)
df_week_day

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수,평일주말
0,20200101,1호선,종각,20427,16301,36728,평일
1,20200101,1호선,시청,12126,10516,22642,평일
2,20200101,우이신설선,신설동,892,828,1720,평일
3,20200101,우이신설선,보문,917,855,1772,평일
4,20200101,우이신설선,성신여대입구,2010,2363,4373,평일
...,...,...,...,...,...,...,...
155408,20201231,경의선,신촌,541,633,1174,평일
155409,20201231,경의선,서울역,2512,3329,5841,평일
155410,20201231,경원선,소요산,1522,1401,2923,평일
155411,20201231,경원선,동두천,1379,1589,2968,평일


In [12]:
# Weekend subset: keep the rows labelled '주말' and write them to their own
# CSV file (the row index is not written).
df_week_end = df.loc[df['평일주말'].eq('주말')]
df_week_end.to_csv(
    'data/주말/2020_주말.csv',
    index=False,
)

In [13]:
# Read the weekend CSV back in; with no index column given, the rows get a
# fresh 0-based index.
df_week_end = pd.read_csv(
    'data/주말/2020_주말.csv',
    sep=',',
    encoding='utf-8',
)
df_week_end

Unnamed: 0,사용일자,호선명,지하철역,승차총승객수,하차총승객수,승하차총승객수,평일주말
0,20200104,우이신설선,북한산보국문,5117,4407,9524,주말
1,20200104,1호선,서울역,48376,46823,95199,주말
2,20200104,1호선,시청,22370,23449,45819,주말
3,20200104,1호선,종각,35345,33183,68528,주말
4,20200104,1호선,종로3가,34613,33921,68534,주말
...,...,...,...,...,...,...,...
61637,20201227,6호선,대흥,1953,2023,3976,주말
61638,20201227,6호선,광흥창,1983,1999,3982,주말
61639,20201227,6호선,상수,2312,2726,5038,주말
61640,20201227,6호선,합정,4258,3832,8090,주말


In [14]:
# Number of rows per line name (호선명), listed in index (line-name) order.
(
    df['호선명']
    .value_counts(sort=False)
    .sort_index()
)

호선명
1호선          3660
2호선         18300
3호선         12270
4호선          9516
5호선         18958
6호선         13735
7호선         18666
8호선          6222
9호선          9150
9호선2~3단계     4758
경강선          4024
경부선         14274
경원선         10715
경의선          9537
경인선          7320
경춘선          6954
공항철도 1호선     5124
과천선          2928
분당선         12557
수인선          5313
안산선          4758
우이신설선        4758
일산선          3676
장항선          2196
중앙선          7686
Name: count, dtype: int64