In [1]:
import pandas as pd
import json

### 1. 스마트 카드 데이터 (7일)

In [2]:
# Trip-level records used by every later cell.
# NOTE(review): the file name suggests 08/25-08/31 -- confirm the date range.
df = pd.read_csv('data/kimpo_0825_0831.csv')

# Decode the station JSON as UTF-8 explicitly: without `encoding`, open() uses
# the platform default, which can differ between machines and would break the
# later comparisons against the Korean station names in `df`.
with open('data/subway/subway_station.json', 'r', encoding='utf-8') as f:
    stations = json.load(f)

In [3]:
# Display the column labels of the loaded frame so later cells can refer to them.
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', '승차일시', '하차일시', '차이시간', '교통수단코드', '이용자수',
       '이용거리', '탑승시간', '승차정류장ID', '하차정류장ID', '승차지역코드', '하차지역코드', '운행일자_x',
       '노선 ID_승차', '노선 명칭_승차', '정류장 순번_승차', '정류장 ID_승차', '정류장 명칭_승차',
       '정류장 X 좌표_승차', '정류장 Y 좌표_승차', '노선 누적 거리_승차', '정류장 거리_승차', '노선 ID_하차',
       '노선 명칭_하차', '정류장 순번_하차', '정류장 ID_하차', '정류장 명칭_하차', '정류장 X 좌표_하차',
       '정류장 Y 좌표_하차', '노선 누적 거리_하차', '정류장 거리_하차', '교통수단구분(B:버스,T:지하철)'],
      dtype='object')

### 2. subway vs bus rate

In [4]:
# Columns kept for the origin-destination analysis: boarding / alighting stop names.
od_columns = ['정류장 명칭_승차', '정류장 명칭_하차']

# A trip is kept for a mode only when BOTH its boarding and alighting route
# names equal that mode's route name.
subway_cond1 = df['노선 명칭_승차'].eq('김포골드라인')
subway_cond2 = df['노선 명칭_하차'].eq('김포골드라인')
bus_cond1 = df['노선 명칭_승차'].eq('70')
bus_cond2 = df['노선 명칭_하차'].eq('70')

# Subway: select the matching trips, then drop rows whose boarding and
# alighting stop names are identical.
subway_df = df.loc[subway_cond1 & subway_cond2, od_columns]
subway_df = subway_df.loc[subway_df['정류장 명칭_승차'].ne(subway_df['정류장 명칭_하차'])]

# Bus: same two steps.
bus_df = df.loc[bus_cond1 & bus_cond2, od_columns]
bus_df = bus_df.loc[bus_df['정류장 명칭_승차'].ne(bus_df['정류장 명칭_하차'])]

subway_df.shape, bus_df.shape

((72489, 2), (5298, 2))

In [5]:
# Percentage share of each mode among the filtered trips, rounded to one decimal.
total_trips = len(subway_df) + len(bus_df)
subway_rate = round(len(subway_df) / total_trips * 100, 1)
bus_rate = round(len(bus_df) / total_trips * 100, 1)

subway_rate, bus_rate

(93.2, 6.8)

### 3. 지하철 O-D 비율

In [6]:
def get_station_name(stations, direction):
    """Return the station names for one direction, in input order.

    Every entry of ``stations`` whose ``'direction'`` equals ``direction``
    contributes its ``'start_station'``; the last matching entry additionally
    contributes its ``'end_station'`` as the final element.

    Args:
        stations: Iterable of dicts with the keys ``'start_station'``,
            ``'end_station'`` and ``'direction'``.
        direction: Value compared against each entry's ``'direction'``.

    Returns:
        A list of ``{'station_name': ..., 'direction': ...}`` dicts; empty
        when no entry matches ``direction``.
    """
    segments = [segment for segment in stations if segment['direction'] == direction]

    names = [
        {'station_name': segment['start_station'], 'direction': segment['direction']}
        for segment in segments
    ]

    # Only the last segment's end station is missing from the start stations.
    if segments:
        last_segment = segments[-1]
        names.append({
            'station_name': last_segment['end_station'],
            'direction': last_segment['direction'],
        })

    return names

In [7]:
# Station entries for direction 1 followed by those for direction 2.
total_station_names = [
    entry
    for direction in (1, 2)
    for entry in get_station_name(stations, direction)
]

In [8]:
def find_direction(row):
    """Classify a boarding/alighting station pair as direction '1' or '2'.

    Both station names are located among the direction-1 entries of the
    module-level ``total_station_names`` list (first match wins). When the
    boarding station comes strictly before the alighting station the pair is
    direction ``'1'``; otherwise it is ``'2'``.

    Args:
        row: Object exposing ``to_dict()`` (e.g. a DataFrame row) whose dict
            contains the keys '정류장 명칭_승차' and '정류장 명칭_하차'.

    Returns:
        The string ``'1'`` or ``'2'``.

    Raises:
        IndexError: If either station name is not among the direction-1
            entries. The message names the missing station instead of the
            bare "list index out of range" the lookup used to produce.
    """
    data = row.to_dict()
    d1_station_names = [
        entry['station_name']
        for entry in total_station_names
        if entry['direction'] == 1
    ]

    def _position(station_name):
        # Index of the first direction-1 entry equal to `station_name`.
        position = next(
            (i for i, name in enumerate(d1_station_names) if name == station_name),
            None,
        )
        if position is None:
            raise IndexError(
                f"station {station_name!r} is not in the direction-1 station list"
            )
        return position

    first_index = _position(data['정류장 명칭_승차'])
    end_index = _position(data['정류장 명칭_하차'])
    return '1' if first_index < end_index else '2'

In [9]:
# Count every distinct (boarding, alighting) station pair; counts go in 'count'.
subway_station_value_count = subway_df.value_counts().reset_index(name='count')

# Sum the counts once; the lambda previously re-summed the whole column for
# every row, which is quadratic work for an unchanged result.
total_count = subway_station_value_count['count'].sum()

# Share of each pair: percentage rounded to 2 decimals, scaled back to a fraction.
subway_station_value_count['rate'] = subway_station_value_count['count'].map(lambda x: round(x / total_count * 100, 2)) / 100

# Label each pair with its travel direction ('1' or '2').
subway_station_value_count['direction'] = subway_station_value_count.apply(find_direction, axis=1)

In [10]:
# Write the O-D table without the row index. `index` is a boolean flag;
# the previous `index=None` only worked because None is falsy.
subway_station_value_count.to_csv('data/passenger/subway_rate.csv', index=False)