In [3]:
from tabula import read_pdf
import pandas as pd
import numpy as np
import json

### 1. pdf -> csv : 1차 가공

In [4]:
# Load the station records from the project's JSON file (read as UTF-8).
# Each record is later read for its 'direction', 'start_station' and
# 'end_station' keys by preprocessing_station_data.
with open('data/subway/subway_station.json', 'r', encoding='utf8') as f:
    station_lst = json.load(f)

In [5]:
def preprocessing_station_data(station_lst, direction):
    """Return the ordered station names for one travel direction.

    Args:
        station_lst: Iterable of dicts, each with the keys 'direction',
            'start_station' and 'end_station'.
        direction: Value compared (with ``==``) against each record's
            'direction'; only matching records are used.

    Returns:
        A list of ``{'station_name': ...}`` dicts: the 'start_station' of
        every matching record, in input order, followed by the
        'end_station' of the last matching record. Empty when no record
        matches.
    """
    segments = [seg for seg in station_lst if seg['direction'] == direction]

    stations = [{'station_name': seg['start_station']} for seg in segments]
    if segments:
        # Only the final record contributes its end station (the terminus).
        stations.append({'station_name': segments[-1]['end_station']})
    return stations

In [6]:
# Ordered station lists for the two direction codes used in the JSON (1 and 2).
# Direction 1 runs 양촌 -> ... -> 김포공항 (see the output displayed in the next cell);
# direction 2 is presumably the reverse order -- TODO confirm against the data.
# Both lists are read as globals by the *_extract_dataframe functions.
station_reverse_f_lst = preprocessing_station_data(station_lst, 1)
station_reverse_t_lst = preprocessing_station_data(station_lst, 2)

In [7]:
# Display the direction-1 station order as the cell output (sanity check).
station_reverse_f_lst

[{'station_name': '양촌'},
 {'station_name': '구래'},
 {'station_name': '마산'},
 {'station_name': '장기'},
 {'station_name': '운양'},
 {'station_name': '걸포북변'},
 {'station_name': '사우(김포시청)'},
 {'station_name': '풍무'},
 {'station_name': '고촌'},
 {'station_name': '김포공항'}]

In [8]:
# Parse every page of the two timetable PDFs with tabula. The results are
# iterated as sequences of DataFrames by the *_extract_dataframe functions.
# pandas_options={'header': None} presumably keeps the first row of each table
# as data instead of a header -- confirm against the tabula-py documentation.
kimpo_airport_lst = read_pdf('data/subway_schedule/김포공항행-변경열차시각표.pdf', pages='all', pandas_options={'header': None})
yangchon_lst = read_pdf('data/subway_schedule/양촌행-변경열차시각표.pdf', pages='all', pandas_options={'header': None})

In [9]:
## Gimpo (Kimpo) Airport-bound timetable
def kimpo_airport_extract_dataframe(df_lst):
    """Collect the Gimpo Airport-bound timetable tables into one DataFrame.

    ``df_lst`` is treated as alternating tables: every even position holds a
    station title (read from row 0 of column 0) and the following odd
    position holds that station's timetable. Titles are checked, in order,
    against the module-level ``station_reverse_f_lst``.

    Args:
        df_lst: Sequence of DataFrames whose first column is labelled ``0``.

    Returns:
        The DataFrame built by ``data_to_df`` from a mapping of station name
        to the flat list of non-missing timetable cells (column ``0`` of each
        timetable is dropped; cells are taken row by row).

    Raises:
        RuntimeError: If a title is not contained in the next expected
            station name. (The original bare ``raise`` produced the same
            exception type, but without any context.)
    """
    total_data = {}
    index = 0

    for i, df in enumerate(df_lst):
        if i % 2 == 0:
            # Title table: it must name the next station in line order.
            ori_name = df[0].values[0]
            name = station_reverse_f_lst[index]['station_name']
            # Substring test: the PDF title may be a shorter form of the name.
            if ori_name not in name:
                raise RuntimeError(
                    f'table {i}: title {ori_name!r} does not match '
                    f'expected station {name!r}'
                )
            index += 1
        else:
            # Timetable for the station validated on the previous iteration.
            data = []
            body = df.drop(0, axis=1)
            for v in body.values:
                data += v[pd.notna(v)].tolist()
            total_data[name] = data
    return data_to_df(total_data)

## Yangchon-bound timetable
def yangchon_extract_dataframe(df_lst):
    """Collect the Yangchon-bound timetable tables into one DataFrame.

    The table at position ``k`` in ``df_lst`` is paired with the ``k``-th
    entry of the module-level ``station_reverse_t_lst``.

    Args:
        df_lst: Sequence of DataFrames whose first column is labelled ``0``.
            Non-missing cells must be strings, since ``.split`` is called
            on them.

    Returns:
        The DataFrame built by ``data_to_df`` from a mapping of station name
        to a flat list of tokens: column ``0`` is dropped, and every
        non-missing cell is split on single spaces, row by row.
    """
    total_data = {}

    for position, table in enumerate(df_lst):
        station = station_reverse_t_lst[position]['station_name']

        tokens = []
        for row in table.drop(0, axis=1).values:
            for cell in row[pd.notna(row)].tolist():
                # A single cell may hold several space-separated entries.
                tokens.extend(cell.split(' '))

        total_data[station] = tokens
    return data_to_df(total_data)

def data_to_df(data):
    """Build a DataFrame from columns of unequal length.

    Shorter columns are right-padded with empty strings so that all columns
    reach the length of the longest one. The input mapping is left
    untouched, and values keep their original Python types (no NumPy string
    coercion).

    Args:
        data: Mapping of column name to a sequence of values.

    Returns:
        A DataFrame with one column per key. An empty mapping yields an
        empty DataFrame.
    """
    # default=0 keeps an empty mapping from raising in max().
    max_len = max((len(values) for values in data.values()), default=0)
    padded = {
        key: list(values) + [''] * (max_len - len(values))
        for key, values in data.items()
    }
    return pd.DataFrame(padded)

In [10]:
# Run both extractors on the parsed PDF tables; each returns the DataFrame
# built by data_to_df (one column per station).
kimpo_airport_extracted = kimpo_airport_extract_dataframe(kimpo_airport_lst)
yangchon_extracted = yangchon_extract_dataframe(yangchon_lst)

In [11]:
# Save the first-pass extractions as cp949-encoded CSV files without the row
# index. `index` is a boolean flag, so pass False explicitly rather than
# relying on None being falsy.
kimpo_airport_extracted.to_csv('data/subway_schedule/kimpo_airport_subway_schedule.csv', encoding='cp949', index=False)
yangchon_extracted.to_csv('data/subway_schedule/yangchon_subway_schedule.csv', encoding='cp949', index=False)

### 2. pdf에서 추출한 시간표 전처리 : 2차 가공 (수작업)

In [12]:
# Load the two cleaned-up timetable workbooks, reading every cell as a string
# (dtype=str) so time values are not converted by pandas.
# NOTE(review): `yongchon_df` is spelled differently from the `yangchon_*`
# names used earlier; the name is kept as-is because it is a notebook global.
yongchon_df = pd.read_excel('data/subway_schedule/골드라인 운행시간표 정리_양촌행.xlsx', dtype=str)
kimpo_df = pd.read_excel('data/subway_schedule/골드라인 운행시간표 정리_김포공항행.xlsx', dtype=str)

In [13]:
def date_processing(df, direction):
    """Turn a wide per-train timetable into station-to-station legs and save it.

    Every cell (including the train ID) is first reduced to its last
    space-separated token; missing cells stay missing. All columns after the
    first are treated as stations in travel order. For each row, every pair
    of adjacent station columns that both hold a value becomes one output
    record.

    The train ID is read from the '철도번호_양촌행' column when it exists;
    otherwise the first column is used, so sheets whose ID column carries a
    different direction suffix are handled as well.

    Args:
        df: Wide timetable whose cells are strings or missing values.
            Station column names are split on '_' and the first part is
            used as the station name.
        direction: Label inserted into the output file name.

    Returns:
        None. The records are written to
        ``data/subway/{direction}_schedule.csv`` (cp949, no index column).
    """
    def _last_token(value):
        # Keep only the text after the last space; leave missing cells alone.
        return None if pd.isna(value) else value.split(' ')[-1]

    cleaned = df.apply(lambda column: column.map(_last_token))
    columns = cleaned.columns.tolist()
    station_cols = columns[1:]

    legacy_id_col = '철도번호_양촌행'
    if legacy_id_col in columns:
        id_col = legacy_id_col
    else:
        # Fall back to the leading column, which is already excluded from
        # the station columns above.
        id_col = columns[0] if columns else None

    records = []
    for _, row in cleaned.iterrows():
        row_data = row.to_dict()

        # Walk adjacent station columns: (departure, arrival) of one leg.
        for dep_col, arr_col in zip(station_cols, station_cols[1:]):
            start_time = row_data[dep_col]
            end_time = row_data[arr_col]

            # A leg exists only when the train has a time at both stations.
            if pd.isna(start_time) or pd.isna(end_time):
                continue
            records.append({
                '철도_id': row_data[id_col],
                '출발시간': start_time,
                '도착시간': end_time,
                '출발정류장': dep_col.split('_')[0],
                '도착정류장': arr_col.split('_')[0],
            })

    # Explicit columns keep the header stable even when no leg was found.
    out_columns = ['철도_id', '출발시간', '도착시간', '출발정류장', '도착정류장']
    result = pd.DataFrame(records, columns=out_columns)
    result.to_csv(f'data/subway/{direction}_schedule.csv', index=False, encoding='cp949')

# Write data/subway/yongchon_schedule.csv and data/subway/kimpo_schedule.csv
# from the two cleaned timetables.
date_processing(yongchon_df, 'yongchon')
date_processing(kimpo_df, 'kimpo')