In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/2020.csv',encoding='euc-kr')
df.drop(columns=['등록일자'],inplace=True)
df.head()

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수
0,20200101,1호선,종각,20427,16301
1,20200101,1호선,시청,12126,10516
2,20200101,우이신설선,신설동,892,828
3,20200101,우이신설선,보문,917,855
4,20200101,우이신설선,성신여대입구(돈암),2010,2363


In [3]:
df1 = df.copy()
cols = list(df.columns)[:3]
target = list(df.columns)[3:]

In [4]:
# ()제거
df1['역명'] = df1['역명'].str.replace('(', ' ',regex=False,).str.split().str[0]

In [5]:
# '사용일' 열을 datetime 형식으로 변환
df1['사용일자'] = pd.to_datetime(df1['사용일자'], format='%Y%m%d')
# 평일과 주말 구분하는 새로운 열 생성
df1['주중/주말'] = df1['사용일자'].apply(lambda x: '주말' if x.weekday() >= 5 else '주중')
# 주말 데이터만 선택
weekday_df = df1[df1['주중/주말'] == '주말']

In [6]:
week_df = weekday_df.copy()
week_df['사용일자'] = pd.to_datetime(weekday_df['사용일자']).dt.strftime('%Y%m%d').astype(int)

In [7]:
week_df

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수,주중/주말
1774,20200104,우이신설선,북한산보국문,5117,4407,주말
1775,20200104,1호선,서울역,48376,46823,주말
1776,20200104,1호선,시청,22370,23449,주말
1777,20200104,1호선,종각,35345,33183,주말
1778,20200104,1호선,종로3가,34613,33921,주말
...,...,...,...,...,...,...
214660,20201227,6호선,대흥,1953,2023,주말
214661,20201227,6호선,광흥창,1983,1999,주말
214662,20201227,6호선,상수,2312,2726,주말
214663,20201227,6호선,합정,4258,3832,주말


In [8]:
df_list = []
for i in range(1, 13):
    start_date = 20200000 + i*100
    end_date = start_date + 100
    df_temp = week_df[(week_df['사용일자'] >= start_date) & (week_df['사용일자'] < end_date)].copy()
    df_temp['사용일자'] = 202000 + i
    df_temp = df_temp.groupby(cols)[target].agg('sum').reset_index()
    df_list.append(df_temp)
df_res = pd.concat(df_list, axis=0)

df_res.tail()

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수
596,202012,중앙선,원덕,1437,1353
597,202012,중앙선,중랑,20716,20145
598,202012,중앙선,지평,193,186
599,202012,중앙선,팔당,6286,6576
600,202012,중앙선,회기,89236,86296


In [9]:
df_res.to_csv('./res.csv', index=False)

In [10]:
df = pd.read_csv('./res.csv')
df.tail()

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수
7159,202012,중앙선,원덕,1437,1353
7160,202012,중앙선,중랑,20716,20145
7161,202012,중앙선,지평,193,186
7162,202012,중앙선,팔당,6286,6576
7163,202012,중앙선,회기,89236,86296


In [11]:
# 역명, 노선명 --> 지하철역, 호선명
df = df.rename(columns={'역명': '지하철역', '노선명':'호선명', '사용일자':'사용월'})

In [12]:
# 호선명 리스트로 추출해서 사용
lines = df.호선명.unique().tolist()
df_dict = {line: df[df['호선명'] == line].copy() for line in lines}
for line, frame in df_dict.items():
    # frame = df[df['호선명']==line].copy()
    # frame['총 승차인원'] = frame.loc[:,['승차총승객수']].sum(axis=1)
    # frame['총 하차인원'] = frame.loc[:,['하차총승객수']].sum(axis=1)
    frame.loc[(frame['호선명'] == '2호선') & (frame['지하철역'] == '신천'), '지하철역'] = '잠실새내'

    frame.to_csv(f'data/main/temp_files/주말/{line}.csv',index=False,encoding='utf-8')

In [13]:
path = 'data/main/temp_files/주말/'
line_info = [
    ([f'{path}1호선.csv', f'{path}경부선.csv', f'{path}경원선.csv', f'{path}경인선.csv', f'{path}장항선.csv'], '1호선'),
    ([f'{path}2호선.csv'], '2호선'),
    ([f'{path}3호선.csv', f'{path}일산선.csv'], '3호선'),
    ([f'{path}4호선.csv', f'{path}과천선.csv', f'{path}안산선.csv'], '4호선'),
    ([f'{path}5호선.csv'], '5호선'),
    ([f'{path}6호선.csv'], '6호선'),
    ([f'{path}7호선.csv'], '7호선'),
    ([f'{path}8호선.csv'], '8호선'),
    ([f'{path}9호선.csv', f'{path}9호선2~3단계.csv'], '9호선'),
    ([f'{path}수인선.csv', f'{path}분당선.csv'], '수인분당선'),
    ([f'{path}경의선.csv', f'{path}중앙선.csv'], '경의중앙선')

]

for df_list, line_name in line_info:
    df_copies = []
    for file in df_list:
        df = pd.read_csv(file)
        df_copies.append(df.copy())
    result = pd.concat(df_copies, axis=0)
    result = result.reset_index(drop=True)
    result.호선명 = line_name
    cols = list(result.columns)[:3]
    target = list(result.columns)[3:]
    res = result.groupby(cols)[target].agg('sum').reset_index()
    res.to_csv(f'data/main//lines/주말/{line_name}.csv', index=False)