# Preprocess the dataset for obtaining the segments

## Import the modules

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
import calendar as cal
import matplotlib.pyplot as plt

## read the GTFS data
### generate the route set for choosing
Every GTFS feed ships exactly the same `routes.txt`, which covers all routes, so the routes that actually run in this district have to be read from the `trips.txt` file instead.
The GTFS data used here is the Staten Island feed from Jan 4, 2016.

In [2]:
# Load the GTFS trips table and summarise its columns.
trips = pd.read_csv(filepath_or_buffer='../data/GTFS/gtfs/trips.txt')
trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15604 entries, 0 to 15603
Data columns (total 6 columns):
route_id         15604 non-null object
service_id       15604 non-null object
trip_id          15604 non-null object
trip_headsign    15604 non-null object
direction_id     15604 non-null int64
shape_id         15604 non-null object
dtypes: int64(1), object(5)
memory usage: 731.5+ KB


In [3]:
# Load the GTFS stop_times table and summarise its columns.
stop_times = pd.read_csv(filepath_or_buffer='../data/GTFS/gtfs/stop_times.txt')
stop_times.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 823708 entries, 0 to 823707
Data columns (total 7 columns):
trip_id           823708 non-null object
arrival_time      823708 non-null object
departure_time    823708 non-null object
stop_id           823708 non-null int64
stop_sequence     823708 non-null int64
pickup_type       823708 non-null int64
drop_off_type     823708 non-null int64
dtypes: int64(4), object(3)
memory usage: 44.0+ MB


In [4]:
# Unique route ids that actually appear in trips.txt.  Sorting gives the list a
# stable order: iterating a bare set yields an arbitrary order, so a slice such
# as routes[:5] would not be reproducible across runs or interpreters.
routes = sorted(set(trips.route_id))

Select the first five routes as the candidate set, and take the route of the first trip as the single route to examine.

In [5]:
# Candidate routes: the leading five entries of the route list.
select_routes = routes[0:5]
# The single route examined below is the one in the first row of trips.
select_route = trips.route_id.iloc[0]

Obtain the corresponding trip ids (direction 0 only).

In [6]:
# Trip ids of the selected route, restricted to direction 0.
select_trips = trips.loc[
    (trips.route_id == select_route) & (trips.direction_id == 0), 'trip_id'
].tolist()
# Parenthesised single-argument print behaves the same on Python 2 and 3,
# whereas the statement form is a SyntaxError on Python 3.
print(len(select_trips))
print(select_trips[:3])

44
['CA_H6-Weekday-031500_MISC_402', 'CA_H6-Weekday-037500_MISC_461', 'CA_H6-Weekday-041500_MISC_481']


### Generate the set of dates to choose from
Build the list of daily history files to choose from.

In [7]:
# Directory holding the daily bus_time_YYYYMMDD.csv history dumps.  It is given
# relative to the notebook, the same way the history files are read elsewhere
# in this notebook, so the listing does not depend on one machine's home
# directory layout.
path = '../data/history/'
dirs = os.listdir(path)
# os.listdir returns entries in arbitrary order; sorting the names orders the
# files by the date embedded in them.  The generator variable is named `entry`
# so the builtin `dir` is no longer shadowed.
file_list = sorted(entry for entry in dirs if entry.endswith('.csv'))
# Single-argument print calls run unchanged on Python 2 and Python 3.
print(len(file_list))
print(file_list[:5])

57
['bus_time_20160104.csv', 'bus_time_20160105.csv', 'bus_time_20160106.csv', 'bus_time_20160107.csv', 'bus_time_20160108.csv']


Select the first month: `20160104` ~ `20160131`.

test the history file

In [8]:
# Load a single day of history as a sample.  low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk,
# which avoids mixed-type object columns and the DtypeWarning they trigger.
test_history = pd.read_csv('../data/history/bus_time_20160104.csv', low_memory=False)
test_history.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3082617 entries, 0 to 3082616
Data columns (total 12 columns):
timestamp           object
vehicle_id          int64
latitude            float64
longitude           float64
bearing             float64
progress            int64
service_date        int64
trip_id             object
block_assigned      int64
next_stop_id        object
dist_along_route    object
dist_from_stop      object
dtypes: float64(3), int64(4), object(5)
memory usage: 282.2+ MB


In [9]:
# Peek at the leading rows (head() shows five rows by default).
test_history.head()

Unnamed: 0,timestamp,vehicle_id,latitude,longitude,bearing,progress,service_date,trip_id,block_assigned,next_stop_id,dist_along_route,dist_from_stop
0,2016-01-04T00:00:00Z,276,40.835108,-73.86775,343.64,0,20160103,WF_A6-Sunday-107000_BX36_166,1,103900,9792.33,43.99
1,2016-01-04T00:00:00Z,318,40.835946,-73.883579,340.92,0,20160103,WF_A6-Sunday-108500_BX36_159,1,102387,8705.34,439.21
2,2016-01-04T00:00:00Z,601,40.723844,-73.804858,275.72,0,20160103,11093232-CPPA6-CP_A6-Sunday-10,1,551806,10922.15,37.06
3,2016-01-04T00:00:00Z,618,40.792042,-73.850391,203.45,2,20160103,11093434-CPPA6-CP_A6-Sunday-10,1,553292,13474.09,25.03
4,2016-01-04T00:00:00Z,1228,40.824188,-73.825813,206.11,0,20160103,GH_A6-Sunday-108500_BX44A_113,0,102615,11236.35,259.23


In [10]:
# For every daily history file, report how many records belong to the
# selected trips.  The loop variable is `file_name` rather than `file` so the
# Python 2 builtin is not shadowed, and print is called in its single-argument
# function form so the cell runs on Python 2 and Python 3 alike.
for file_name in file_list:
    print(file_name)
    # low_memory=False: infer dtypes over the whole file to avoid mixed-type
    # columns and the per-file DtypeWarning.
    ptr_history = pd.read_csv('../data/history/' + file_name, low_memory=False)
    tmp_history = ptr_history[ptr_history.trip_id.isin(select_trips)]
    print(len(tmp_history))

bus_time_20160104.csv
1530
bus_time_20160105.csv
2105
bus_time_20160106.csv
1989
bus_time_20160107.csv
2327
bus_time_20160108.csv
1857
bus_time_20160109.csv
0
bus_time_20160110.csv
0
bus_time_20160111.csv
1943
bus_time_20160112.csv
2018
bus_time_20160113.csv
1864
bus_time_20160114.csv
1705
bus_time_20160115.csv
1552
bus_time_20160116.csv
0
bus_time_20160117.csv
0
bus_time_20160118.csv
811
bus_time_20160119.csv
1938
bus_time_20160120.csv
1953
bus_time_20160121.csv
1934
bus_time_20160122.csv
1807
bus_time_20160123.csv
0
bus_time_20160124.csv
0
bus_time_20160125.csv
1540
bus_time_20160126.csv
2322
bus_time_20160127.csv
2144
bus_time_20160128.csv
2047
bus_time_20160129.csv
1941
bus_time_20160130.csv
0
bus_time_20160131.csv
0
bus_time_20160201.csv
2120
bus_time_20160202.csv
2047
bus_time_20160203.csv
1951
bus_time_20160204.csv
2234
bus_time_20160205.csv
2292
bus_time_20160206.csv
0
bus_time_20160207.csv
0
bus_time_20160208.csv
1624
bus_time_20160209.csv
1947
bus_time_20160210.csv
1706
bus_t