In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/2020.csv',encoding='euc-kr')
df.drop(columns=['등록일자'],inplace=True)
df.head()

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수
0,20200101,1호선,종각,20427,16301
1,20200101,1호선,시청,12126,10516
2,20200101,우이신설선,신설동,892,828
3,20200101,우이신설선,보문,917,855
4,20200101,우이신설선,성신여대입구(돈암),2010,2363


In [3]:
df1 = df.copy()
cols = list(df.columns)[:3]
target = list(df.columns)[3:]

In [4]:
# 역명, 노선명 --> 지하철역, 호선명
# df1 = df1.rename(columns={'역명': '지하철역', '노선명':'호선명'})

In [5]:
# ()제거
df1['역명'] = df1['역명'].str.replace('(', ' ',regex=False,).str.split().str[0]

In [6]:
# '사용일' 열을 datetime 형식으로 변환
df1['사용일자'] = pd.to_datetime(df1['사용일자'], format='%Y%m%d')
# 평일과 주말 구분하는 새로운 열 생성
df1['주중/주말'] = df1['사용일자'].apply(lambda x: '주말' if x.weekday() >= 5 else '주중')
# 평일 데이터만 선택
weekday_df = df1[df1['주중/주말'] == '주중']

In [7]:
week_df = weekday_df.copy()
week_df['사용일자'] = pd.to_datetime(weekday_df['사용일자']).dt.strftime('%Y%m%d').astype(int)

In [8]:
week_df

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수,주중/주말
0,20200101,1호선,종각,20427,16301,주중
1,20200101,1호선,시청,12126,10516,주중
2,20200101,우이신설선,신설동,892,828,주중
3,20200101,우이신설선,보문,917,855,주중
4,20200101,우이신설선,성신여대입구,2010,2363,주중
...,...,...,...,...,...,...
217051,20201231,경의선,신촌,541,633,주중
217052,20201231,경의선,서울역,2512,3329,주중
217053,20201231,경원선,소요산,1522,1401,주중
217054,20201231,경원선,동두천,1379,1589,주중


In [9]:
df_list = []
for i in range(1, 13):
    start_date = 20200000 + i*100
    end_date = start_date + 100
    df_temp = week_df[(week_df['사용일자'] >= start_date) & (week_df['사용일자'] < end_date)].copy()
    df_temp['사용일자'] = 202000 + i
    df_temp = df_temp.groupby(cols)[target].agg('sum').reset_index()
    df_list.append(df_temp)
df_res = pd.concat(df_list, axis=0)

df_res.tail()

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수
598,202012,중앙선,원덕,4879,4764
599,202012,중앙선,중랑,102667,98868
600,202012,중앙선,지평,698,628
601,202012,중앙선,팔당,13923,13937
602,202012,중앙선,회기,421387,404566


In [10]:
df_res.to_csv('./res.csv', index=False)

In [11]:
df = pd.read_csv('./res.csv')
df.tail()

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수
7178,202012,중앙선,원덕,4879,4764
7179,202012,중앙선,중랑,102667,98868
7180,202012,중앙선,지평,698,628
7181,202012,중앙선,팔당,13923,13937
7182,202012,중앙선,회기,421387,404566


In [12]:
# 역명, 노선명 --> 지하철역, 호선명
df = df.rename(columns={'역명': '지하철역', '노선명':'호선명', '사용일자':'사용월'})

In [13]:
# 호선명 리스트로 추출해서 사용
lines = df.호선명.unique().tolist()
df_dict = {line: df[df['호선명'] == line].copy() for line in lines}
for line, frame in df_dict.items():
    # frame = df[df['호선명']==line].copy()
    # frame['총 승차인원'] = frame.loc[:,['승차총승객수']].sum(axis=1)
    # frame['총 하차인원'] = frame.loc[:,['하차총승객수']].sum(axis=1)
    frame.loc[(frame['호선명'] == '2호선') & (frame['지하철역'] == '신천'), '지하철역'] = '잠실새내'

    frame.to_csv(f'data/main/temp_files/주중/{line}.csv',index=False,encoding='utf-8')

In [14]:
path = 'data/main/temp_files/주중/'
line_info = [
    ([f'{path}1호선.csv', f'{path}경부선.csv', f'{path}경원선.csv', f'{path}경인선.csv', f'{path}장항선.csv'], '1호선'),
    ([f'{path}2호선.csv'], '2호선'),
    ([f'{path}3호선.csv', f'{path}일산선.csv'], '3호선'),
    ([f'{path}4호선.csv', f'{path}과천선.csv', f'{path}안산선.csv'], '4호선'),
    ([f'{path}5호선.csv'], '5호선'),
    ([f'{path}6호선.csv'], '6호선'),
    ([f'{path}7호선.csv'], '7호선'),
    ([f'{path}8호선.csv'], '8호선'),
    ([f'{path}9호선.csv', f'{path}9호선2~3단계.csv'], '9호선'),
    ([f'{path}수인선.csv', f'{path}분당선.csv'], '수인분당선'),
    ([f'{path}경의선.csv', f'{path}중앙선.csv'], '경의중앙선')

]

for df_list, line_name in line_info:
    df_copies = []
    for file in df_list:
        df = pd.read_csv(file)
        df_copies.append(df.copy())
    result = pd.concat(df_copies, axis=0)
    result = result.reset_index(drop=True)
    result.호선명 = line_name
    cols = list(result.columns)[:3]
    target = list(result.columns)[3:]
    res = result.groupby(cols)[target].agg('sum').reset_index()
    res.to_csv(f'data/main//lines/주중/{line_name}.csv', index=False)

In [15]:
df = pd.read_csv('data/main//lines/주중/1호선.csv')
df['지하철역'].value_counts()

지하철역
가능      12
아산      12
온양온천    12
온수      12
옥수      12
        ..
두정      12
동인천     12
동암      12
동묘앞     12
회룡      12
Name: count, Length: 103, dtype: int64

In [16]:
df['지하철역'].unique()

array(['가능', '가산디지털단지', '간석', '개봉', '관악', '광명', '광운대', '구로', '구일', '군포',
       '금정', '금천구청', '남영', '노량진', '녹양', '녹천', '당정', '대방', '덕계', '덕정',
       '도봉', '도봉산', '도원', '도화', '독산', '동대문', '동두천', '동두천중앙', '동묘앞', '동암',
       '동인천', '두정', '망월사', '명학', '방학', '배방', '백운', '병점', '보산', '봉명', '부개',
       '부천', '부평', '서동탄', '서빙고', '서울역', '서정리', '석계', '석수', '성균관대', '성환',
       '세류', '세마', '소사', '소요산', '송내', '송탄', '수원', '시청', '신길', '신도림',
       '신설동', '신이문', '신창', '쌍용', '아산', '안양', '양주', '역곡', '영등포', '오류동',
       '오산', '오산대', '옥수', '온수', '온양온천', '왕십리', '외대앞', '용산', '월계', '응봉',
       '의왕', '의정부', '이촌', '인천', '제기동', '제물포', '종각', '종로3가', '종로5가', '주안',
       '중동', '지제', '지행', '직산', '진위', '창동', '천안', '청량리', '평택', '한남', '화서',
       '회룡'], dtype=object)