# GTFS_Dublin Bus Time Table

## This notebook prepares the Timetable at each bus stop for each route using GTFS data

In [2]:
# Import all the important tools
import pandas as pd
import numpy as np

In [3]:
# Calendar data to tell the service days of week
df_calendar = pd.read_csv('google_transit_dublinbus/calendar.txt')

In [4]:
df_stops = pd.read_csv('google_transit_dublinbus/stops.txt')

In [5]:
df_stops_times = pd.read_csv('google_transit_dublinbus/stop_times.txt')

In [6]:
df_calendar

Unnamed: 0,service_id,start_date,end_date,monday,tuesday,wednesday,thursday,friday,saturday,sunday
0,y102x,20180729,20180825,1,0,0,0,0,0,1
1,y1024,20180405,20180728,0,0,0,0,0,1,0
2,y102w,20180729,20180825,0,0,0,0,0,1,0
3,y102v,20180729,20180825,1,1,1,1,1,0,0
4,y1023,20180405,20180728,0,0,0,0,0,0,1
5,y1022,20180405,20180728,1,1,1,1,1,0,0


In [7]:
df_stops.head()

Unnamed: 0,stop_lat,zone_id,stop_lon,stop_id,stop_name,location_type
0,53.35747,,-6.287639,8220DB000801,"Cabra East, North Circular Road",0
1,53.424598,,-6.234909,8240DB003663,"Dublin Airport, Airport Terminal 2",0
2,53.390386,,-6.278349,8220DB000131,"Glasnevin North, Glasnevin Avenue (Beneavin Dr...",0
3,53.28904,,-6.339486,8230DB002554,"Tallaght, Glenview Park",0
4,53.362907,,-6.302642,8220DB001806,"Cabra, Navan Road",0


In [8]:
# Stops table has all the stops with their names and location parameters. We just want to extract the stop ids
# First we will check if the stopid as something other than text
from string import punctuation

In [25]:
BAD_CHARS = list(set(punctuation))

In [26]:
'|'.join(BAD_CHARS)

'{|:|@|,|/|?|&|>|)|`|}|#|||"|-|<|(|\\|]|;|^|.|$|\'|~|[|!|*|+|=|%|_'

In [56]:
df_stops[df_stops['stop_id'].str.contains('{|:|@|,|/|\?|&|>|\)|`|}|#|\||"|=|<|\(|\\|]|;|\^|\.|\$|\'|~|\[|!|\*|\+|=|%|_')]

Unnamed: 0,stop_lat,zone_id,stop_lon,stop_id,stop_name,location_type
287,53.284858,,-6.360721,8230DB002605_merged_7690,"Tallaght, Old Bawn Estate (Watergate Estate)",0
401,53.284056,,-6.359912,8230DB002605_merged_7671,"Tallaght, Old Bawn Estate (Watergate Estate)",0
422,53.367144,,-6.255514,8220DB000019_merged_7703,"Dromcondra, Lower Drumcondra Road",0
485,53.276266,,-6.25013,8250DB002859_merged_7693,"Ballinteer, Ballinteer Avenue",0
545,53.281817,,-6.29578,8230DB007449_merged_7672,"Willbrook, Taylor's Lane",0
563,53.305198,,-6.217225,8250DB000765_merged_7688,"Belfield, University College Dublin",0
569,53.279888,,-6.272252,8230DB002968_merged_7698,"Rathfarnham, Three Rock Hockey Club",0
629,53.518904,,-6.116223,8240DB003774_merged_7668,"Rush, Whitestown Road (Old Road)",0
665,53.276828,,-6.119329,gen:57102:3606:0:1,"Glenageary, Barnhill Rd",0
770,53.281111,,-6.276627,8230DB002981_merged_7696,"Rathfarnham, Eden Avenue",0


> We can see that there two types of stops:
* The gen kind, these are virtual stops
* The merged kind.

-- First we drop the gen kind and remove the merged values and keep only distinct stopids forourselves

In [57]:
df_clean_stops = df_stops[~df_stops['stop_id'].str.contains(':')]

In [58]:
df_stop__ = df_stops[df_stops['stop_id'].str.contains('_')]

In [68]:
df_stop__['stop_id'].str.extract('(.*?)_')

  """Entry point for launching an IPython kernel.


287     8230DB002605
401     8230DB002605
422     8220DB000019
485     8250DB002859
545     8230DB007449
563     8250DB000765
569     8230DB002968
629     8240DB003774
770     8230DB002981
779     8220DB000119
783     8230DB007449
1027    8250DB002860
1207    8250DB000765
1246    8220DB000119
1302    8230DB002981
1473    8240DB003774
1540    8220DB000047
1622    8250DB002861
2214    8250DB002861
2218    8220DB000203
2296    8230DB002968
2383    8250DB002858
2445    8230DB002980
2497    8220DB000018
2783    8230DB004348
3251    8230DB004347
3272    8220DB000019
3331    8230DB004347
3599    8220DB000047
3877    8230DB004348
3909    8250DB002858
3915    8250DB002859
3990    8250DB002860
4241    8220DB000203
4285    8230DB002980
4416    8230DB002983
4462    8220DB000018
4700    8230DB002983
Name: stop_id, dtype: object

In [69]:
df_clean_stops['clean_stop_id'] = np.where(df_clean_stops['stop_id'].str.contains('_'),df_clean_stops['stop_id'].str.extract('(.*?)_').str[-4:],df_clean_stops['stop_id'].str[-4:])

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [81]:
df_clean_stops[df_clean_stops['clean_stop_id']==np.isnan]

Unnamed: 0,stop_lat,zone_id,stop_lon,stop_id,stop_name,location_type,clean_stop_id


In [83]:
df_clean_stops.head()

Unnamed: 0,stop_lat,zone_id,stop_lon,stop_id,stop_name,location_type,clean_stop_id
0,53.35747,,-6.287639,8220DB000801,"Cabra East, North Circular Road",0.0,801.0
1,53.424598,,-6.234909,8240DB003663,"Dublin Airport, Airport Terminal 2",0.0,3663.0
2,53.390386,,-6.278349,8220DB000131,"Glasnevin North, Glasnevin Avenue (Beneavin Dr...",0.0,131.0
3,53.28904,,-6.339486,8230DB002554,"Tallaght, Glenview Park",0.0,2554.0
4,53.362907,,-6.302642,8220DB001806,"Cabra, Navan Road",0.0,1806.0


In [91]:
df_clean_stops[df_clean_stops['clean_stop_id'].isnull()]

Unnamed: 0,stop_lat,zone_id,stop_lon,stop_id,stop_name,location_type,clean_stop_id


In [90]:
df_clean_stops = df_clean_stops[np.isfinite(df_clean_stops['clean_stop_id'])]

In [92]:
df_clean_stops[df_clean_stops['clean_stop_id'].isnull()]

Unnamed: 0,stop_lat,zone_id,stop_lon,stop_id,stop_name,location_type,clean_stop_id


In [93]:
df_clean_stops['clean_stop_id'] = df_clean_stops['clean_stop_id'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [94]:
df_clean_stops.head()

Unnamed: 0,stop_lat,zone_id,stop_lon,stop_id,stop_name,location_type,clean_stop_id
0,53.35747,,-6.287639,8220DB000801,"Cabra East, North Circular Road",0.0,801
1,53.424598,,-6.234909,8240DB003663,"Dublin Airport, Airport Terminal 2",0.0,3663
2,53.390386,,-6.278349,8220DB000131,"Glasnevin North, Glasnevin Avenue (Beneavin Dr...",0.0,131
3,53.28904,,-6.339486,8230DB002554,"Tallaght, Glenview Park",0.0,2554
4,53.362907,,-6.302642,8220DB001806,"Cabra, Navan Road",0.0,1806


> We now have a clean list of stops, now will prepare our timetable

In [95]:
# For timetable we will need our stops times data
df_stops_times.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,5366.y102v.60-140-d12-1.109.O,16:20:00,16:20:00,8220DB007149,1,Rathmines,,,
1,5366.y102v.60-140-d12-1.109.O,16:20:57,16:20:57,8220DB007277,2,Rathmines,,,282.831755
2,5366.y102v.60-140-d12-1.109.O,16:22:18,16:22:18,8220DB006363,3,Rathmines,,,790.04217
3,5366.y102v.60-140-d12-1.109.O,16:23:14,16:23:14,8220DB006361,4,Rathmines,,,1117.809126
4,5366.y102v.60-140-d12-1.109.O,16:23:44,16:23:44,8220DB006367,5,Rathmines,,,1351.71095


In [96]:
df_clean_stops_times = df_stops_times[['trip_id','arrival_time','stop_id','stop_sequence']]

In [97]:
len(df_clean_stops_times)

1845477

In [98]:
len(df_clean_stops)

4706

In [99]:
df_merged = pd.merge(df_clean_stops_times,df_clean_stops)

In [100]:
len(df_merged)

1844980

In [101]:
df_merged.head()

Unnamed: 0,trip_id,arrival_time,stop_id,stop_sequence,stop_lat,zone_id,stop_lon,stop_name,location_type,clean_stop_id
0,5366.y102v.60-140-d12-1.109.O,16:20:00,8220DB007149,1,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149
1,17210.y1023.60-140-b12-1.108.I,11:56:37,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149
2,5294.y102v.60-140-d12-1.111.I,15:27:28,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149
3,13182.y1024.60-140-b12-1.108.I,10:51:56,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149
4,6647.y1022.60-140-b12-1.108.I,08:55:34,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149


In [135]:
df_test = df_merged.head()

In [128]:
df_test

Unnamed: 0,trip_id,arrival_time,stop_id,stop_sequence,stop_lat,zone_id,stop_lon,stop_name,location_type,clean_stop_id
0,5366.y102v.60-140-d12-1.109.O,16:20:00,8220DB007149,1,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149
1,17210.y1023.60-140-b12-1.108.I,11:56:37,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149
2,5294.y102v.60-140-d12-1.111.I,15:27:28,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149
3,13182.y1024.60-140-b12-1.108.I,10:51:56,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149
4,6647.y1022.60-140-b12-1.108.I,08:55:34,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149


In [127]:
df_calendar[df_calendar['service_id'].str.contains('y1024|y1023|y1022')]

Unnamed: 0,service_id,start_date,end_date,monday,tuesday,wednesday,thursday,friday,saturday,sunday
1,y1024,20180405,20180728,0,0,0,0,0,1,0
4,y1023,20180405,20180728,0,0,0,0,0,0,1
5,y1022,20180405,20180728,1,1,1,1,1,0,0


In [137]:
df_test['service_id'] = df_test['trip_id'].str.extract('.*?\.(.*?)\..*?-\d*')

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [139]:
df_test = df_test[df_test['service_id'].str.contains('y1024|y1023|y1022')]

In [142]:
df_test['dayofservice'] = 'X'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [145]:
df_test.loc[df_test['service_id']=='y1024','dayofservice']='Sunday'
df_test.loc[df_test['service_id']=='y1023','dayofservice']='Saturday'
df_test.loc[df_test['service_id']=='y1022','dayofservice']='Weekday'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [146]:
df_test

Unnamed: 0,trip_id,arrival_time,stop_id,stop_sequence,stop_lat,zone_id,stop_lon,stop_name,location_type,clean_stop_id,service_id,dayofservice
1,17210.y1023.60-140-b12-1.108.I,11:56:37,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1023,S1
3,13182.y1024.60-140-b12-1.108.I,10:51:56,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1024,S
4,6647.y1022.60-140-b12-1.108.I,08:55:34,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1022,W


In [147]:
# We will now perform the above functionality on the entire dataframe
df_merged['service_id'] = df_merged['trip_id'].str.extract('.*?\.(.*?)\..*?-\d*')

  


In [148]:
df_merged = df_merged[df_merged['service_id'].str.contains('y1024|y1023|y1022')]

In [149]:
df_merged['dayofservice'] = 'X'
df_merged.loc[df_merged['service_id']=='y1024','dayofservice']='Sunday'
df_merged.loc[df_merged['service_id']=='y1023','dayofservice']='Saturday'
df_merged.loc[df_merged['service_id']=='y1022','dayofservice']='Weekday'

In [152]:
# df_merged['route'] = df_merged['trip_id'].str.extract('.*?\..*?')
df_merged['route'] = df_merged['trip_id'].str.extract('.*?\..*?\.\d*-(.*?)-')

  


In [153]:
df_merged

Unnamed: 0,trip_id,arrival_time,stop_id,stop_sequence,stop_lat,zone_id,stop_lon,stop_name,location_type,clean_stop_id,service_id,dayofservice,route
1,17210.y1023.60-140-b12-1.108.I,11:56:37,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1023,Saturday,140
3,13182.y1024.60-140-b12-1.108.I,10:51:56,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1024,Sunday,140
4,6647.y1022.60-140-b12-1.108.I,08:55:34,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1022,Weekday,140
5,17250.y1023.60-140-b12-1.108.I,19:22:56,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1023,Saturday,140
10,13163.y1024.60-140-b12-1.106.O,15:00:00,8220DB007149,1,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1024,Sunday,140
11,6631.y1022.60-140-b12-1.106.O,07:35:00,8220DB007149,1,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1022,Weekday,140
12,17206.y1023.60-140-b12-1.108.I,11:26:37,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1023,Saturday,140
13,17215.y1023.60-140-b12-1.106.O,12:40:00,8220DB007149,1,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1023,Saturday,140
14,17267.y1023.60-140-b12-1.108.I,20:22:56,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1023,Saturday,140
16,17245.y1023.60-140-b12-1.108.I,22:48:03,8220DB007149,45,53.407296,,-6.277997,"Ballymun, Saint Margaret's Road",0.0,7149,y1023,Saturday,140


In [167]:
df_merged_tt_temp = df_merged[['clean_stop_id','route','dayofservice','arrival_time']]

In [156]:
df_merged_tt_temp.dtypes

clean_stop_id     int32
route            object
dayofservice     object
arrival_time     object
dtype: object

In [169]:
df_merged_tt_temp['schedule'] = np.where(df_merged_tt_temp['arrival_time'].str.extract('(.*?):').str[0:]=='24','00:'+df_merged_tt_temp['arrival_time'].str.extract(':(.*?):').str[0:],df_merged_tt_temp['arrival_time'].str.extract('(.*?:.*?):'))

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [170]:
df_merged_tt_temp[df_merged_tt_temp['arrival_time'].str.contains('^24:')]

Unnamed: 0,clean_stop_id,route,dayofservice,arrival_time,schedule
21605,819,38,Sunday,24:01:37,00:01
70899,1182,83,Sunday,24:00:20,00:00
71165,1550,83,Sunday,24:00:47,00:00
71431,1551,83,Sunday,24:01:10,00:01
71697,1552,83,Sunday,24:02:19,00:02
72127,3665,747,Weekday,24:11:25,00:11
72375,3665,747,Saturday,24:30:00,00:30
72471,3665,747,Sunday,24:30:00,00:30
72477,3665,747,Sunday,24:00:00,00:00
72706,3665,747,Sunday,24:12:34,00:12


In [171]:
df_merged_tt_temp.drop('arrival_time',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [179]:
df_sorted_tt = df_merged_tt_temp.sort_values(by=['clean_stop_id','route','dayofservice','schedule'])

In [182]:
df_final_sorted_tt = pd.DataFrame(df_sorted_tt.groupby(['clean_stop_id','route','dayofservice'])['schedule'].apply(list)).reset_index()

In [183]:
df_final_sorted_tt

Unnamed: 0,clean_stop_id,route,dayofservice,schedule
0,2,38,Saturday,"[09:47, 10:51, 11:56, 12:57, 13:57, 14:57, 16:..."
1,2,38,Sunday,"[07:42, 08:21, 09:01, 09:41, 10:25, 11:05, 11:..."
2,2,38,Weekday,"[06:23, 06:43, 07:03, 07:23, 07:48, 08:03, 08:..."
3,2,38A,Saturday,"[10:18, 11:23, 12:28, 13:28, 14:28, 15:25, 16:..."
4,2,38A,Sunday,"[07:21, 08:01, 08:43, 09:23, 10:03, 10:49, 11:..."
5,2,38A,Weekday,"[08:25, 08:55, 09:15, 09:35, 09:55, 10:35, 11:..."
6,2,38B,Weekday,"[06:12, 06:32, 06:52, 07:12, 07:32]"
7,2,38D,Weekday,[07:57]
8,2,46A,Saturday,"[09:13, 09:33, 09:53, 10:13, 10:37, 10:52, 11:..."
9,2,46A,Sunday,"[07:36, 07:51, 08:09, 08:24, 08:39, 08:49, 08:..."


In [184]:
df_final_sorted_tt.columns = ['stopid','lineid','dayofservice','schedule']

In [185]:
df_final_sorted_tt

Unnamed: 0,stopid,lineid,dayofservice,schedule
0,2,38,Saturday,"[09:47, 10:51, 11:56, 12:57, 13:57, 14:57, 16:..."
1,2,38,Sunday,"[07:42, 08:21, 09:01, 09:41, 10:25, 11:05, 11:..."
2,2,38,Weekday,"[06:23, 06:43, 07:03, 07:23, 07:48, 08:03, 08:..."
3,2,38A,Saturday,"[10:18, 11:23, 12:28, 13:28, 14:28, 15:25, 16:..."
4,2,38A,Sunday,"[07:21, 08:01, 08:43, 09:23, 10:03, 10:49, 11:..."
5,2,38A,Weekday,"[08:25, 08:55, 09:15, 09:35, 09:55, 10:35, 11:..."
6,2,38B,Weekday,"[06:12, 06:32, 06:52, 07:12, 07:32]"
7,2,38D,Weekday,[07:57]
8,2,46A,Saturday,"[09:13, 09:33, 09:53, 10:13, 10:37, 10:52, 11:..."
9,2,46A,Sunday,"[07:36, 07:51, 08:09, 08:24, 08:39, 08:49, 08:..."


In [186]:
# Loading the timetable data into database
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://postgres:00001234@localhost:5433/jetaDb')

In [189]:
df_final_sorted_tt.to_sql('main_timetable', engine, if_exists='append', index=False)