# Import module

In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import csv
import os
import calendar as cal
import matplotlib.pyplot as plt
import urllib

# Data preprocess

**Use Staten Island as the example**

1. select a single route and direction from a specific district from the GTFS
2. find the corresponding `trip_id` and `service_id`
3. find the corresponding dates of the `service_id`
4. Download the historical file and prepare the data analysis

## 1. select a single route and direction from a specific district from the GTFS


read the data into dataframe

In [3]:
# Load the GTFS routes table (routes.txt) and print a summary of its columns.
routes = pd.read_csv('../data/GTFS/gtfs/routes.txt')
routes.info()

IOError: File ../data/GTFS/gtfs/routes.txt does not exist

In [None]:
# Load the GTFS trips table (trips.txt) and print a summary of its columns.
trips = pd.read_csv('../data/GTFS/gtfs/trips.txt')
trips.info()

In [None]:
# Load the GTFS calendar_dates table (per-date service exceptions) and
# print a summary of its columns.
calendar_dates = pd.read_csv('../data/GTFS/gtfs/calendar_dates.txt')
calendar_dates.info()

In [None]:
# Load the GTFS calendar table and print a summary of its columns.
# Note: this DataFrame is named `calendar`; the stdlib calendar module is
# imported under the alias `cal`, so the two do not clash.
calendar = pd.read_csv('../data/GTFS/gtfs/calendar.txt')
calendar.info()

In [None]:
# Load the GTFS stop_times table and print a summary of its columns.
stop_times = pd.read_csv('../data/GTFS/gtfs/stop_times.txt')
stop_times.info()

select a single route and a direction

In [None]:
# Take the route and direction of the row labelled 0 in trips as the example.
# The spelling `directon_id` is kept because the following cell refers to it.
example_row = 0
route_id = trips.at[example_row, 'route_id']
directon_id = trips.at[example_row, 'direction_id']
print(route_id)
print(directon_id)

## 2. find the corresponding trip_id and service_id

find the trip_id

In [None]:
# Keep only the trips that match both the chosen route and the chosen direction.
on_route = trips['route_id'] == route_id
in_direction = trips['direction_id'] == directon_id
select_trips = trips[on_route & in_direction]
select_trips.info()

find the service_id

In [None]:
# Display the filtered trips so their service_id values can be inspected.
select_trips

In [None]:
# Distinct service_id values used by the selected trips.
select_service_id_set = set(select_trips['service_id'])
print(select_service_id_set)

## 3. find the corresponding dates of the service_id

read the calendar dataframe to obtain the normal schedule of the trips

In [None]:
# Calendar rows that belong to one of the selected service_ids.
has_selected_service = calendar['service_id'].isin(select_service_id_set)
select_calendar = calendar[has_selected_service]
select_calendar.info()

In [None]:
# Display the calendar rows of the selected service_ids.
select_calendar

obtain normal dates for the service_id

In [None]:
def weekday_dates(year, months, first_date, last_date):
    """Return the Monday-Friday dates of the given months as 'YYYYMMDD' strings.

    Parameters
    ----------
    year : int
        Calendar year.
    months : iterable of int
        Month numbers (1-12), visited in the order given.
    first_date, last_date : str
        Inclusive bounds in 'YYYYMMDD' form; dates outside them are dropped.

    Returns
    -------
    list of str
        Dates in chronological order within each month.
    """
    month_calendar = cal.Calendar()
    result = []
    for month in months:
        for day in month_calendar.itermonthdates(year, month):
            # itermonthdates yields whole weeks, so it also returns padding
            # days from the neighbouring months; skip those and weekends.
            if day.month != month or day.weekday() > cal.FRIDAY:
                continue
            # strftime zero-pads month and day, so months 10-12 are formatted
            # correctly and plain string comparison orders dates properly.
            stamp = day.strftime('%Y%m%d')
            if first_date <= stamp <= last_date:
                result.append(stamp)
    return result


# Weekdays of January-March 2016, starting on Monday 2016-01-04.
# The upper bound 20160401 is never reached because April is not iterated.
dates = weekday_dates(2016, range(1, 4), '20160104', '20160401')
print(dates)

Check the `calendar_dates` file for dates that need to be added or deleted

In [None]:
# calendar_dates rows that belong to one of the selected service_ids.
is_selected_service = calendar_dates['service_id'].isin(select_service_id_set)
select_calendar_date = calendar_dates[is_selected_service]
select_calendar_date.info()

In [None]:
# Show the exception rows whose exception_type equals 1.
is_type_one = select_calendar_date['exception_type'] == 1
select_calendar_date[is_type_one]

In [None]:
# Merge the calendar_dates exceptions into the regular weekday dates.
# Only rows with exception_type == 1 are applied: their dates are added to
# the set, and "add" is printed once for each date that was not yet present.
# NOTE(review): rows with any other exception_type are ignored, so no date is
# ever removed here -- confirm whether removals should be handled as well.
date_set = set(dates)
added_rows = select_calendar_date[select_calendar_date.exception_type == 1]
for tmp_date in added_rows['date']:
    tmp_date = str(tmp_date)  # dates in `dates` are 'YYYYMMDD' strings
    if tmp_date not in date_set:
        print("add")
        date_set.add(tmp_date)

In [None]:
# Display the set of service dates after the exceptions were merged in.
date_set

## 4. Download the historical files from website
url: http://data.mytransit.nyc/bus_time/2016/

prepare the dates list for downloading

In [None]:
# Base location of the 2016 bus_time archives and the dates to fetch,
# in ascending order.
basic_url = 'http://data.mytransit.nyc.s3.amazonaws.com/bus_time/2016/'
sorted_dates = sorted(date_set)
print(sorted_dates)
print(basic_url)

use the urllib package to start downloading
The link address for a specific file is: http://data.mytransit.nyc.s3.amazonaws.com/bus_time/2016/2016-01/bus_time_20160101.csv.xz

In [None]:
# Download step, left commented out. For every entry of sorted_dates it
# builds basic_url + 'YYYY-MM/' + 'bus_time_YYYYMMDD.csv.xz' and saves the
# file in the working directory under that file name.
# download_file = urllib.URLopener()
# for i in xrange(len(sorted_dates)):
#     print i
#     cur_str = sorted_dates[i]
#     filename = 'bus_time_' + cur_str + '.csv.xz'
#     url = basic_url + cur_str[:-4] + '-' + cur_str[-4:-2] + '/' + filename
#     download_file.retrieve(url, filename)

since the unzipped historical file is very large, we will only select several days as the samples.

# baseline algorithm
Calculate the travel time within a segment of a specific route.
Prepare four different types of the baseline algorithm:

- baseline algorithm without any features
- baseline algorithm with only weather
- baseline algorithm with only peak hour
- baseline algorithm with both of the weather and the peak hour

Step:
1. randomly select several days: 5 days
2. randomly select multiple trips within the same route and direction
3. use these multiple trips at different days to form a dataframe
4. calculate the arrival time for each trip at each sample day
5. obtain the simple baseline data for the first one.

## 1. randomly select several days: 5 days

prepare the directory

In [None]:
# Directory that holds the historical files; list everything inside it.
# The trailing slash matters: a later cell builds file paths from dir_name.
dir_name = '../data/history/'
file_list = os.listdir(dir_name)

select the five files by random from the file_list

In [None]:
# Keep only the entries of file_list whose name ends with '.csv'.
files = [entry for entry in file_list if entry.endswith('.csv')]
print(files)

for simplicity, we select the first 5 days as the result

In [None]:
# Take the first few CSV files as the sample days.
n_sample_days = 5
sample_days = files[:n_sample_days]
print(sample_days)

## 2. randomly select a trip within the specific route and direction

In [None]:
# Pick one trip by its fixed position in select_trips (not a random draw).
trip_position = 10
specific_trip = select_trips.iloc[trip_position]
specific_trip

## 3. use these multiple trips at different days to form a dataframe

Read the dataframes for these days and find the corresponding trips. Reselect the sample dates by filtering the dates with the trip_id.

In [None]:
# Collect, per historical file, the records of the selected trip.
# Files in which the trip does not appear contribute nothing to the list.
historical_data = []
# Hand-picked sample files (used here instead of sample_days).
date_list = [
    'bus_time_20160208.csv',
    'bus_time_20160217.csv',
    'bus_time_20160218.csv',
    'bus_time_20160219.csv',
]
for file_name in date_list:
    # Build the path once; os.path.join also works when dir_name has no
    # trailing separator, which plain string concatenation would not.
    file_path = os.path.join(dir_name, file_name)
    print(file_path)
    cur_data = pd.read_csv(file_path)
    tmp_data = cur_data[cur_data.trip_id == specific_trip.trip_id]
    if len(tmp_data) > 0:
        historical_data.append(tmp_data)

In [None]:
# Number of files from date_list in which the selected trip was found.
len(historical_data)

In [None]:
# First five records of the selected trip in the first file that contained it.
historical_data[0].head(5)

In [None]:
# Last three records of the selected trip in the first file that contained it.
historical_data[0].tail(3)

A specific trip is operated only once per day.

In [None]:
# Stack the per-file frames into a single DataFrame and print its summary.
# pd.concat keeps each frame's original row labels (no ignore_index here).
sample_history = pd.concat(historical_data)
sample_history.info()

## 4. calculate the arrival time for each trip at each sample day

In [None]:
# Scheduled stop_times rows of the selected trip.
is_specific_trip = stop_times['trip_id'] == specific_trip['trip_id']
tmp_trip = stop_times[is_specific_trip]
tmp_trip.info()

In [None]:
# First five stop_times rows of the selected trip.
tmp_trip.head(5)

In [None]:
# stop_id values of the selected trip, in the row order of tmp_trip.
station_list = list(tmp_trip['stop_id'])
print(station_list)

In [None]:
# idx = 0
# arrival_station_list = []
# arrival_time_list = []
# for item in date_list:
#     date = item[9:17]
#     print date
#     operation_hour = sample_history[sample_history.service_date == int(date)]
#     for i in xrange(1, len(station_list)):
#         station = station_list[i - 1]
#         next_station = station_list[i]
#         while operation_hour.iloc[idx].next_stop_id != station or operation_hour.iloc[idx + 1].next_stop_id != next_station:
#             idx += 1
#         distance_location = float(operation_hour.iloc[idx + 1].dist_along_route) - float(operation_hour.iloc[idx].dist_along_route)
#         distance_station = float(operation_hour.iloc[idx].dist_from_stop)
#         ratio = distance_station / distance_location
#         if ratio < 0:
#             print station, next_station
#         time1 = operation_hour.iloc[idx].timestamp
#         time2 = operation_hour.iloc[idx + 1].timestamp
#         time_span = calculateTimeSpan(time1, time2)
#         if ratio > 0:
#             travel_time = time_span * ratio
#             print station, next_station, travel_time, time1
#             arrival_station_list.append(station)