In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the stop-times table from stop_times.txt and preview its first four rows.
stop_times = pd.read_csv("stop_times.txt")
stop_times.head(4)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,7712.y1009.60-1-d12-1.1.O,18:40:00,18:40:00,8240DB000226,1,Sandymount,0,0,0.0
1,7712.y1009.60-1-d12-1.1.O,18:40:44,18:40:44,8220DB000228,2,Sandymount,0,0,267.48
2,7712.y1009.60-1-d12-1.1.O,18:41:21,18:41:21,8240DB000229,3,Sandymount,0,0,483.53
3,7712.y1009.60-1-d12-1.1.O,18:42:20,18:42:20,8240DB000227,4,Sandymount,0,0,834.47


### Let's assess for preliminary columns that can be dropped 

In [3]:
# Count distinct values per column to spot columns that carry little information.
stop_times.nunique()

trip_id                32683
arrival_time           77087
departure_time         77087
stop_id                 4208
stop_sequence            109
stop_headsign            132
pickup_type                2
drop_off_type              2
shape_dist_traveled    12430
dtype: int64

In [4]:
# Show the distinct values taken by the two boarding-flag columns.
for flag_column in ("pickup_type", "drop_off_type"):
    print(f"{flag_column}: ", stop_times[flag_column].unique().tolist())

pickup_type:  [0, 1]
drop_off_type:  [0, 1]


### - I am unsure what value these columns add, so I will remove them for now

In [5]:
# Columns set aside for now; they are not used by the rest of this notebook.
unused_columns = ["pickup_type", "drop_off_type", "shape_dist_traveled"]
stop_times = stop_times.drop(columns=unused_columns)
stop_times

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign
0,7712.y1009.60-1-d12-1.1.O,18:40:00,18:40:00,8240DB000226,1,Sandymount
1,7712.y1009.60-1-d12-1.1.O,18:40:44,18:40:44,8220DB000228,2,Sandymount
2,7712.y1009.60-1-d12-1.1.O,18:41:21,18:41:21,8240DB000229,3,Sandymount
3,7712.y1009.60-1-d12-1.1.O,18:42:20,18:42:20,8240DB000227,4,Sandymount
4,7712.y1009.60-1-d12-1.1.O,18:42:58,18:42:58,8240DB000230,5,Sandymount
...,...,...,...,...,...,...
1874914,6667.y1003.60-H9-b12-1.97.I,08:03:12,08:03:12,8220DB000619,16,Abbey Street
1874915,6667.y1003.60-H9-b12-1.97.I,08:03:50,08:03:50,8220DB000675,17,Abbey Street
1874916,6667.y1003.60-H9-b12-1.97.I,08:05:22,08:05:22,8220DB000620,18,Abbey Street
1874917,6667.y1003.60-H9-b12-1.97.I,08:06:35,08:06:35,8220DB007569,19,Abbey Street


### The initial goal is to build a dataframe for each shape variation of a single route-line (route 155) to include all the relevant stop data. It should include:
- ### All stops information, such as stop_sequence, stop_number, etc, in their own columns. Each of these will be a list of all values of this on a route.
- ### The earliest times for each first stop departure - a timetable for the shape.
- ### The day of the week the service runs on.

### First, let's get the customer-facing route number for each row in the dataset

In [6]:
def line_id(row):
    """Return the route-line token embedded in a row's ``shape_id``.

    The shape id is split on ``-`` and the second piece is returned
    (``"60-155-d12-1.90.O"`` gives ``"155"``).

    Args:
        row: mapping-like row exposing a string under ``'shape_id'``.

    Returns:
        The second ``-``-separated field of the shape id.

    Raises:
        IndexError: if the shape id contains no ``-``.
    """
    return row['shape_id'].split('-')[1]

def shape_id(row):
    """Build the shape id from a row's ``trip_id``.

    The trip id is split on ``.``; fields 2, 3 and 4 (zero-based) are joined
    back with ``.`` (``"7712.y1009.60-1-d12-1.1.O"`` gives ``"60-1-d12-1.1.O"``).

    Args:
        row: mapping-like row exposing a string under ``'trip_id'``.

    Returns:
        The shape id string.

    Raises:
        IndexError: if the trip id has fewer than five ``.``-separated fields.
    """
    fields = row['trip_id'].split('.')
    return ".".join((fields[2], fields[3], fields[4]))

def service_id(row):
    """Build the service id from a row's ``trip_id``.

    The trip id is split on ``.``; fields 1 through 4 (zero-based) are joined
    back with ``.``, i.e. everything after the leading trip number
    (``"7712.y1009.60-1-d12-1.1.O"`` gives ``"y1009.60-1-d12-1.1.O"``).

    Args:
        row: mapping-like row exposing a string under ``'trip_id'``.

    Returns:
        The service id string.

    Raises:
        IndexError: if the trip id has fewer than five ``.``-separated fields.
    """
    fields = row['trip_id'].split(".")
    return '.'.join((fields[1], fields[2], fields[3], fields[4]))

# Day codes (second "."-separated field of a trip_id) and the label each maps to.
_DAY_CODE_LABELS = {
    "y1003": "weekday",
    "y1004": "sunday",
    "y1005": "saturday",
    "y1007": "weekday",
    "y1008": "sunday/bank holiday",
    "y1009": "saturday",
}


def day_of_week(row):
    """Return the service-day label encoded in a row's ``trip_id``.

    Args:
        row: mapping-like row exposing a string under ``"trip_id"``.

    Returns:
        The label for the trip's day code, or ``None`` when the code is not
        one of the known codes.

    Raises:
        IndexError: if the trip id contains no ``.``.
    """
    return _DAY_CODE_LABELS.get(row["trip_id"].split(".")[1])

In [7]:
# Derive the id columns row by row. Order matters: line_id() reads the
# shape_id column, so shape_id has to be filled in first.
derived_columns = {
    "shape_id": shape_id,
    "line_id": line_id,
    "service_id": service_id,
    "day_of_week": day_of_week,
}
for column_name, extractor in derived_columns.items():
    stop_times[column_name] = stop_times.apply(extractor, axis=1)
stop_times

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,shape_id,line_id,service_id,day_of_week
0,7712.y1009.60-1-d12-1.1.O,18:40:00,18:40:00,8240DB000226,1,Sandymount,60-1-d12-1.1.O,1,y1009.60-1-d12-1.1.O,saturday
1,7712.y1009.60-1-d12-1.1.O,18:40:44,18:40:44,8220DB000228,2,Sandymount,60-1-d12-1.1.O,1,y1009.60-1-d12-1.1.O,saturday
2,7712.y1009.60-1-d12-1.1.O,18:41:21,18:41:21,8240DB000229,3,Sandymount,60-1-d12-1.1.O,1,y1009.60-1-d12-1.1.O,saturday
3,7712.y1009.60-1-d12-1.1.O,18:42:20,18:42:20,8240DB000227,4,Sandymount,60-1-d12-1.1.O,1,y1009.60-1-d12-1.1.O,saturday
4,7712.y1009.60-1-d12-1.1.O,18:42:58,18:42:58,8240DB000230,5,Sandymount,60-1-d12-1.1.O,1,y1009.60-1-d12-1.1.O,saturday
...,...,...,...,...,...,...,...,...,...,...
1874914,6667.y1003.60-H9-b12-1.97.I,08:03:12,08:03:12,8220DB000619,16,Abbey Street,60-H9-b12-1.97.I,H9,y1003.60-H9-b12-1.97.I,weekday
1874915,6667.y1003.60-H9-b12-1.97.I,08:03:50,08:03:50,8220DB000675,17,Abbey Street,60-H9-b12-1.97.I,H9,y1003.60-H9-b12-1.97.I,weekday
1874916,6667.y1003.60-H9-b12-1.97.I,08:05:22,08:05:22,8220DB000620,18,Abbey Street,60-H9-b12-1.97.I,H9,y1003.60-H9-b12-1.97.I,weekday
1874917,6667.y1003.60-H9-b12-1.97.I,08:06:35,08:06:35,8220DB007569,19,Abbey Street,60-H9-b12-1.97.I,H9,y1003.60-H9-b12-1.97.I,weekday


### Now, let's take a sub-section to experiment on. Let's choose line 155.

In [8]:
# Keep only the rows that belong to line 155.
is_line_155 = stop_times["line_id"] == "155"
line_155 = stop_times[is_line_155]
line_155

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,shape_id,line_id,service_id,day_of_week
246561,9673.y1009.60-155-d12-1.90.O,06:40:00,06:40:00,8220DB007698,1,Bray,60-155-d12-1.90.O,155,y1009.60-155-d12-1.90.O,saturday
246562,9673.y1009.60-155-d12-1.90.O,06:42:14,06:42:14,8220DB007113,3,Bray,60-155-d12-1.90.O,155,y1009.60-155-d12-1.90.O,saturday
246563,9673.y1009.60-155-d12-1.90.O,06:43:31,06:43:31,8220DB000127,4,Bray,60-155-d12-1.90.O,155,y1009.60-155-d12-1.90.O,saturday
246564,9673.y1009.60-155-d12-1.90.O,06:44:36,06:44:36,8220DB000112,5,Bray,60-155-d12-1.90.O,155,y1009.60-155-d12-1.90.O,saturday
246565,9673.y1009.60-155-d12-1.90.O,06:45:24,06:45:24,8220DB000113,6,Bray,60-155-d12-1.90.O,155,y1009.60-155-d12-1.90.O,saturday
...,...,...,...,...,...,...,...,...,...,...
1207992,9733.y1005.60-155-b12-1.94.I,23:30:22,23:30:22,8220DB000909,52,O'Connell St,60-155-b12-1.94.I,155,y1005.60-155-b12-1.94.I,saturday
1207993,9733.y1005.60-155-b12-1.94.I,23:30:52,23:30:52,8220DB000786,53,O'Connell St,60-155-b12-1.94.I,155,y1005.60-155-b12-1.94.I,saturday
1207994,9733.y1005.60-155-b12-1.94.I,23:32:39,23:32:39,8220DB000792,54,O'Connell St,60-155-b12-1.94.I,155,y1005.60-155-b12-1.94.I,saturday
1207995,9733.y1005.60-155-b12-1.94.I,23:36:06,23:36:06,8220DB000319,55,O'Connell St,60-155-b12-1.94.I,155,y1005.60-155-b12-1.94.I,saturday


## Next step is to match the stop information with this dataframe

In [9]:
# Load the stops table (stop id, name, latitude and longitude).
stops_df = pd.read_csv("stops.txt")
stops_df

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
0,8220DB000002,"Parnell Square West, stop 2",53.352244,-6.263723
1,8220DB000003,"Parnell Square West, stop 3",53.352309,-6.263811
2,8220DB000004,"Parnell Square West, stop 4",53.352575,-6.264175
3,8220DB000006,"Parnell Square West, stop 6",53.352749,-6.264454
4,8220DB000007,"Parnell Square West, stop 7",53.352841,-6.264570
...,...,...,...,...
4203,8350DB007461,"Charlesland, stop 7461",53.128932,-6.062803
4204,8350DB007462,"Charlesland, stop 7462",53.128801,-6.062480
4205,8350DB007574,"Southern Cross, stop 7574",53.182348,-6.130064
4206,8350DB007823,"Enniskerry Village, stop 7823",53.194198,-6.170184


### We will merge the stop data from `stops_df` with the historical stops made on the route. The most important point here is that the correct coordinates stay aligned with the route's stop sequence.

In [10]:
# Attach each stop's name and coordinates to the line-155 rows (inner join on stop_id).
merged_df = line_155.merge(stops_df, on='stop_id')
merged_df

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,shape_id,line_id,service_id,day_of_week,stop_name,stop_lat,stop_lon
0,9673.y1009.60-155-d12-1.90.O,06:40:00,06:40:00,8220DB007698,1,Bray,60-155-d12-1.90.O,155,y1009.60-155-d12-1.90.O,saturday,"IKEA, stop 7698",53.406385,-6.274635
1,9683.y1009.60-155-d12-1.90.O,07:00:00,07:00:00,8220DB007698,1,Bray,60-155-d12-1.90.O,155,y1009.60-155-d12-1.90.O,saturday,"IKEA, stop 7698",53.406385,-6.274635
2,9710.y1009.60-155-d12-1.90.O,07:20:00,07:20:00,8220DB007698,1,Bray,60-155-d12-1.90.O,155,y1009.60-155-d12-1.90.O,saturday,"IKEA, stop 7698",53.406385,-6.274635
3,9642.y1009.60-155-d12-1.90.O,19:00:00,19:00:00,8220DB007698,1,Bray,60-155-d12-1.90.O,155,y1009.60-155-d12-1.90.O,saturday,"IKEA, stop 7698",53.406385,-6.274635
4,9662.y1009.60-155-d12-1.90.O,19:40:00,19:40:00,8220DB007698,1,Bray,60-155-d12-1.90.O,155,y1009.60-155-d12-1.90.O,saturday,"IKEA, stop 7698",53.406385,-6.274635
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48025,9704.y1005.60-155-b12-1.93.I,11:57:08,11:57:08,8220DB007697,81,IKEA,60-155-b12-1.93.I,155,y1005.60-155-b12-1.93.I,saturday,"IKEA, stop 7697",53.406379,-6.276079
48026,9737.y1005.60-155-b12-1.93.I,12:37:08,12:37:08,8220DB007697,81,IKEA,60-155-b12-1.93.I,155,y1005.60-155-b12-1.93.I,saturday,"IKEA, stop 7697",53.406379,-6.276079
48027,9642.y1005.60-155-b12-1.93.I,18:23:07,18:23:07,8220DB007697,81,IKEA,60-155-b12-1.93.I,155,y1005.60-155-b12-1.93.I,saturday,"IKEA, stop 7697",53.406379,-6.276079
48028,9716.y1005.60-155-b12-1.93.I,18:03:07,18:03:07,8220DB007697,81,IKEA,60-155-b12-1.93.I,155,y1005.60-155-b12-1.93.I,saturday,"IKEA, stop 7697",53.406379,-6.276079


### How many "shapes" exist on the 155 line

In [11]:
# Distinct shape ids present in the merged line-155 frame.
shapes = merged_df["shape_id"].unique().tolist()
shapes

['60-155-d12-1.90.O',
 '60-155-d12-1.91.O',
 '60-155-b12-1.91.O',
 '60-155-b12-1.92.O',
 '60-155-d12-1.92.I',
 '60-155-d12-1.93.I',
 '60-155-b12-1.93.I',
 '60-155-b12-1.94.I']

### - Four inbound routes and four outbound

### We know the leading string in the service id indicates the day of the week the service is run on. Let's split the shapes into its days of service

### - All shapes appear to run on all days of the week

In [12]:
# Map every shape id to the distinct day-of-week labels found on its rows.
shapes_to_days = {
    shape_key: merged_df.loc[merged_df["shape_id"] == shape_key, "day_of_week"].unique().tolist()
    for shape_key in shapes
}

shapes_to_days

{'60-155-d12-1.90.O': ['saturday', 'sunday/bank holiday', 'weekday'],
 '60-155-d12-1.91.O': ['saturday', 'weekday', 'sunday/bank holiday'],
 '60-155-b12-1.91.O': ['saturday', 'weekday', 'sunday'],
 '60-155-b12-1.92.O': ['sunday', 'weekday', 'saturday'],
 '60-155-d12-1.92.I': ['saturday', 'sunday/bank holiday', 'weekday'],
 '60-155-d12-1.93.I': ['saturday', 'weekday', 'sunday/bank holiday'],
 '60-155-b12-1.93.I': ['saturday', 'weekday', 'sunday'],
 '60-155-b12-1.94.I': ['sunday', 'weekday', 'saturday']}

### There are no clear correlations between the shape_ids and the days of the week they run on. We cannot use this as a way of dividing the routes into chunks for logistics

### The strategy instead will be to take the longest shape in a given line and use it as a default. Let's find the longest shape for the 155 now.

In [13]:
# The third "."-separated field of a shape id is its direction flag:
# "O" goes to the outbound list, "I" to the inbound list, anything else is ignored.
outbound_shapes = [candidate for candidate in shapes if candidate.split('.')[2] == "O"]
inbound_shapes = [candidate for candidate in shapes if candidate.split('.')[2] == "I"]

print(outbound_shapes)
print(inbound_shapes)

['60-155-d12-1.90.O', '60-155-d12-1.91.O', '60-155-b12-1.91.O', '60-155-b12-1.92.O']
['60-155-d12-1.92.I', '60-155-d12-1.93.I', '60-155-b12-1.93.I', '60-155-b12-1.94.I']


In [14]:
def _shape_with_most_rows(candidate_shapes):
    """Return ``(shape_id, row_count)`` for the candidate with the most rows in merged_df.

    Prints each candidate with its frame dimensions as it goes. Ties keep the
    earliest candidate; an empty candidate list yields ``("", 0)``.

    NOTE(review): the row count is trips x stops, so this favours the shape
    that runs most often rather than the one with the most stops -- confirm
    that this is the intended meaning of "longest".
    """
    best_shape, best_rows = "", 0
    for candidate in candidate_shapes:
        dimensions = merged_df[merged_df["shape_id"] == candidate].shape
        print(candidate, dimensions)
        if dimensions[0] > best_rows:
            best_shape, best_rows = candidate, dimensions[0]
    return best_shape, best_rows


longest_outbound, longest_length_outbound = _shape_with_most_rows(outbound_shapes)
longest_inbound, longest_length_inbound = _shape_with_most_rows(inbound_shapes)

longest_shapes = [longest_inbound, longest_outbound]
longest_shapes

60-155-d12-1.90.O (11700, 13)
60-155-d12-1.91.O (72, 13)
60-155-b12-1.91.O (11700, 13)
60-155-b12-1.92.O (72, 13)
60-155-d12-1.92.I (11907, 13)
60-155-d12-1.93.I (336, 13)
60-155-b12-1.93.I (11907, 13)
60-155-b12-1.94.I (336, 13)


['60-155-d12-1.92.I', '60-155-d12-1.90.O']

### Before we make some significant modifications, we need to collect a list of first-stop departures and create a column for them

In [15]:
def sorting_seconds(time_list):
    """Convert ``"HH:MM:SS"`` strings to seconds and return them in ascending order.

    Each string is split on ``:`` and its fields are weighted as hours,
    minutes and seconds. A missing trailing field (``"HH:MM"``) counts as zero.

    Args:
        time_list: iterable of time strings such as ``"06:40:00"``.

    Returns:
        A new list of integer second counts, sorted ascending.

    Raises:
        ValueError: if a field is not an integer literal.
    """
    seconds_per_field = (3600, 60, 1)  # hours, minutes, seconds

    times_in_seconds = []
    for time in time_list:
        # Pair every field with its own weight. The previous code multiplied
        # the hours field by the seconds weight, so the seconds were ignored.
        fields = time.split(':')
        times_in_seconds.append(
            sum(int(field) * weight for field, weight in zip(fields, seconds_per_field))
        )

    times_in_seconds.sort()
    return times_in_seconds

    
def to_timestamp(seconds):
    """Format a number of seconds as an ``"HH:MM:00"`` string.

    Hours and minutes are zero-padded to at least two digits.

    NOTE(review): the seconds field is always written as ``00``; any
    sub-minute remainder of the input is discarded. Confirm this truncation
    is intended before relying on it for times that are not on a whole minute.

    Args:
        seconds: number of seconds to format.

    Returns:
        The formatted timestamp string.
    """
    whole_hours = int(seconds / 3600)
    whole_minutes = int((seconds % 3600) / 60)
    return f"{whole_hours:02d}:{whole_minutes:02d}:00"

def sorted_timestamps(times_in_seconds):
    """Format every second count with ``to_timestamp``, keeping the input order.

    Args:
        times_in_seconds: iterable of second counts (the caller passes them
            already sorted).

    Returns:
        A list with one timestamp string per input value.
    """
    return [to_timestamp(second_count) for second_count in times_in_seconds]

In [16]:
def departure_times(df):
    """Build the comma-separated schedule of departures from a shape's first stop.

    Takes the distinct ``departure_time`` values on rows whose
    ``stop_sequence`` is 1, orders them with ``sorting_seconds`` and formats
    them back into timestamps with ``sorted_timestamps``.

    Args:
        df: frame with ``stop_sequence`` and ``departure_time`` columns.

    Returns:
        One string with the timestamps joined by ``", "``.
    """
    first_stop_departures = df.loc[df["stop_sequence"] == 1, "departure_time"]
    distinct_departures = first_stop_departures.unique().tolist()
    ordered_stamps = sorted_timestamps(sorting_seconds(distinct_departures))
    return ", ".join(str(stamp) for stamp in ordered_stamps)

In [17]:
# longest_shapes is [inbound id, outbound id]. Reading the ids from it, rather
# than from the names rebound below, keeps this cell re-runnable.
inbound_shape_id, outbound_shape_id = longest_shapes

# .copy() gives each frame its own data, so the column assignment below does
# not write onto a slice of merged_df.
longest_outbound = merged_df[merged_df["shape_id"] == outbound_shape_id].copy()
longest_inbound = merged_df[merged_df["shape_id"] == inbound_shape_id].copy()

longest_outbound["first_departure_schedule"] = departure_times(longest_outbound)
longest_inbound["first_departure_schedule"] = departure_times(longest_inbound)

### Let's remove the alternate/shorter routes for now and make some extra modifications for later

In [18]:
def route_direction(row):
    """Translate the last character of a row's ``trip_id`` into a direction label.

    Args:
        row: mapping-like row exposing a string under ``'trip_id'``.

    Returns:
        ``"outbound"`` for a trailing ``O``, ``"inbound"`` for a trailing
        ``I``, otherwise ``None``.
    """
    direction_by_flag = {"O": "outbound", "I": "inbound"}
    return direction_by_flag.get(row['trip_id'][-1])
    
# function for isolating the stop number for each row
def stop_finder(row):
    """Return the stop number that ends a row's ``stop_name``.

    The name is split on single spaces and the last token is inspected
    (``"IKEA, stop 7698"`` gives ``"7698"``).

    Args:
        row: mapping-like row exposing a string under ``'stop_name'``.

    Returns:
        The last token when it consists only of digits, otherwise the
        placeholder ``"No stop number."``.
    """
    last_token = row['stop_name'].split(' ')[-1]
    # isdigit must be *called*: the bare method object is always truthy, which
    # previously made the fallback branch unreachable.
    if last_token.isdigit():
        return last_token
    return "No stop number."

def stop_name(row):
    """Return the part of a row's ``stop_name`` that precedes the first comma.

    Args:
        row: mapping-like row exposing a string under ``"stop_name"``.

    Returns:
        The text before the first ``,``, or the whole name when it has no comma.
    """
    head, _, _ = row["stop_name"].partition(",")
    return head

In [19]:
def _one_row_per_stop(shape_df):
    """Add direction, stop number and short stop name, then keep one row per stop_sequence.

    The three columns are written onto ``shape_df`` itself. The returned frame
    keeps the first row seen for each ``stop_sequence`` and no longer has the
    ``shape_id`` and ``trip_id`` columns.
    """
    shape_df["direction"] = shape_df.apply(route_direction, axis=1)
    # stop_num must be derived before stop_name is overwritten with its short form.
    shape_df["stop_num"] = shape_df.apply(stop_finder, axis=1)
    shape_df["stop_name"] = shape_df.apply(stop_name, axis=1)
    per_stop = shape_df.drop_duplicates(subset=['stop_sequence'], keep='first')
    return per_stop.drop(["shape_id", "trip_id"], axis=1)


longest_outbound = _one_row_per_stop(longest_outbound)
longest_inbound = _one_row_per_stop(longest_inbound)

In [20]:
# Inspect the outbound frame: one row per stop_sequence after the de-duplication above.
longest_outbound

Unnamed: 0,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,line_id,service_id,day_of_week,stop_name,stop_lat,stop_lon,first_departure_schedule,direction,stop_num
0,06:40:00,06:40:00,8220DB007698,1,Bray,155,y1009.60-155-d12-1.90.O,saturday,IKEA,53.406385,-6.274635,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",outbound,7698
306,06:42:14,06:42:14,8220DB007113,3,Bray,155,y1009.60-155-d12-1.90.O,saturday,Northwood Avenue,53.402541,-6.264924,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",outbound,7113
612,06:43:31,06:43:31,8220DB000127,4,Bray,155,y1009.60-155-d12-1.90.O,saturday,Nursing Home,53.399108,-6.263680,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",outbound,127
918,06:44:36,06:44:36,8220DB000112,5,Bray,155,y1009.60-155-d12-1.90.O,saturday,Civic Centre,53.396154,-6.263951,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",outbound,112
1224,06:45:24,06:45:24,8220DB000113,6,Bray,155,y1009.60-155-d12-1.90.O,saturday,Trinity Comp School,53.392268,-6.263703,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",outbound,113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22044,07:40:57,07:40:57,8350DB004130,76,Bray,155,y1009.60-155-d12-1.90.O,saturday,Castle Street,53.207282,-6.113513,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",outbound,4130
22344,07:41:15,07:41:15,8350DB004131,77,Bray,155,y1009.60-155-d12-1.90.O,saturday,Dwyer Park,53.206567,-6.111523,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",outbound,4131
22644,07:41:43,07:41:43,8350DB007294,78,Bray,155,y1009.60-155-d12-1.90.O,saturday,Cornerstone Church,53.204724,-6.109254,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",outbound,7294
22944,07:43:17,07:43:17,8350DB004156,79,Bray,155,y1009.60-155-d12-1.90.O,saturday,Quinsborough Road,53.204967,-6.103614,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",outbound,4156


In [21]:
# Inspect the inbound frame: one row per stop_sequence after the de-duplication above.
longest_inbound

Unnamed: 0,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,line_id,service_id,day_of_week,stop_name,stop_lat,stop_lon,first_departure_schedule,direction,stop_num
23394,06:40:00,06:40:00,8350DB004168,1,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Bray Station,53.204432,-6.101018,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,4168
23850,06:41:21,06:41:21,8350DB004170,2,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Methodist Church,53.203484,-6.106973,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,4170
24156,06:42:18,06:42:18,8350DB004153,3,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Cornerstone Church,53.204891,-6.109531,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,4153
24462,06:43:11,06:43:11,8350DB004154,4,IKEA,155,y1009.60-155-d12-1.92.I,saturday,St Cronan's Road,53.206634,-6.112344,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,4154
24768,06:44:00,06:44:00,8350DB004416,5,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Roseville Court,53.208344,-6.114739,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,4416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46560,07:37:23,07:37:23,8220DB000094,77,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Civic Centre,53.396035,-6.264377,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,94
46854,07:38:09,07:38:09,8220DB000126,78,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Nursing Home,53.399095,-6.264087,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,126
47148,07:38:59,07:38:59,8220DB006182,79,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Santry Cross,53.402251,-6.265357,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,6182
47442,07:39:30,07:39:30,8220DB000322,80,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Gulliver's Retail Pk,53.404255,-6.265351,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,322


### Some further alterations are necessary, we need to nest the variable details of stops within a row for each df. Each bus line will become a single row with nested details containing stops data

In [22]:
# Shorter column names for the coordinates, and "destination" for the headsign.
column_renames = {'stop_lat': 'latitude', "stop_lon": "longitude", "stop_headsign": "destination"}
longest_inbound = longest_inbound.rename(columns=column_renames)
longest_outbound = longest_outbound.rename(columns=column_renames)
longest_inbound

Unnamed: 0,arrival_time,departure_time,stop_id,stop_sequence,destination,line_id,service_id,day_of_week,stop_name,latitude,longitude,first_departure_schedule,direction,stop_num
23394,06:40:00,06:40:00,8350DB004168,1,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Bray Station,53.204432,-6.101018,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,4168
23850,06:41:21,06:41:21,8350DB004170,2,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Methodist Church,53.203484,-6.106973,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,4170
24156,06:42:18,06:42:18,8350DB004153,3,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Cornerstone Church,53.204891,-6.109531,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,4153
24462,06:43:11,06:43:11,8350DB004154,4,IKEA,155,y1009.60-155-d12-1.92.I,saturday,St Cronan's Road,53.206634,-6.112344,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,4154
24768,06:44:00,06:44:00,8350DB004416,5,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Roseville Court,53.208344,-6.114739,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,4416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46560,07:37:23,07:37:23,8220DB000094,77,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Civic Centre,53.396035,-6.264377,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,94
46854,07:38:09,07:38:09,8220DB000126,78,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Nursing Home,53.399095,-6.264087,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,126
47148,07:38:59,07:38:59,8220DB006182,79,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Santry Cross,53.402251,-6.265357,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,6182
47442,07:39:30,07:39:30,8220DB000322,80,IKEA,155,y1009.60-155-d12-1.92.I,saturday,Gulliver's Retail Pk,53.404255,-6.265351,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...",inbound,322


In [23]:
def coordinates(row, df, coordinate):
    """Join every value of one coordinate column into a single string.

    Values are kept in row order and repeated values are kept too, so the
    result has one entry per row and lines up position-by-position with the
    list built by ``stops``. (De-duplicating here would shorten the list
    whenever two rows share a coordinate value.)

    Args:
        row: unused; present so the function can be passed to ``DataFrame.apply``.
        df: frame holding the coordinate column.
        coordinate: name of the column to read, e.g. ``"latitude"``.

    Returns:
        The values converted with ``str`` and joined by ``", "``, or the
        string ``"None"`` when the frame has no rows.
    """
    all_coords = df[coordinate].tolist()

    if not all_coords:
        return "None"
    return ", ".join(str(value) for value in all_coords)


def names(row, df):
    """Join every ``stop_name`` in the frame into a single string.

    Names are kept in row order and repeated names are kept too, so the
    result has one entry per row and lines up position-by-position with the
    list built by ``stops``. (Different stops can share a name once the
    ", stop N" suffix has been stripped, so de-duplicating would misalign
    the lists.)

    Args:
        row: unused; present so the function can be passed to ``DataFrame.apply``.
        df: frame with a ``stop_name`` column of strings.

    Returns:
        The names joined by ``", "``, or the string ``"None"`` when the frame
        has no rows.
    """
    all_names = df["stop_name"].tolist()

    if not all_names:
        return "None"
    return ", ".join(all_names)


def stops(row, df):
    """Join every ``stop_num`` in the frame into a single string, in row order.

    Args:
        row: unused; present so the function can be passed to ``DataFrame.apply``.
        df: frame with a ``stop_num`` column of strings.

    Returns:
        The stop numbers joined by ``", "``, or the string ``"None"`` when the
        frame has no rows.
    """
    stop_numbers = df["stop_num"]
    return ", ".join(stop_numbers) if len(stop_numbers) != 0 else "None"


def create_uniques_id(row):
    """Build the ``<line_id>_<direction>`` identifier for a row.

    Args:
        row: mapping-like row exposing strings under ``"line_id"`` and
            ``"direction"``.

    Returns:
        The two values joined by an underscore, e.g. ``"155_inbound"``.
    """
    return "_".join((row["line_id"], row["direction"]))


def modify_df(df):
    """Collapse a per-stop frame into route-level columns.

    For a non-empty frame this adds ``stops``, ``longitudes``, ``latitudes``
    and ``names`` (each a single joined string repeated on every row) plus
    ``id``. The per-stop columns are then dropped, for empty frames as well.

    The joined strings do not depend on the row, so each one is computed once
    and broadcast, instead of being rebuilt for every row. The work is done on
    a copy, so the caller's frame is left untouched.

    Args:
        df: frame holding the per-stop columns listed in the final ``drop``.

    Returns:
        A new frame without the per-stop columns.

    Raises:
        KeyError: if one of the columns to drop is missing.
    """
    df = df.copy()

    if not df.empty:
        # The helpers keep a ``row`` parameter for DataFrame.apply
        # compatibility; any row of the frame gives the same result.
        first_row = df.iloc[0]
        df['stops'] = stops(first_row, df=df)
        df['longitudes'] = coordinates(first_row, df=df, coordinate="longitude")
        df['latitudes'] = coordinates(first_row, df=df, coordinate="latitude")
        df['names'] = names(first_row, df=df)
        df['id'] = df.apply(create_uniques_id, axis=1)

    return df.drop(["stop_num", "latitude",
                    "longitude", "stop_name",
                    "direction", "day_of_week",
                    "line_id", "service_id",
                    "stop_sequence", "departure_time",
                    "arrival_time", "stop_id"], axis=1)

### We will need to create nested data in cells so the front-end can handle the data appropriately.

In [24]:
# Replace the per-stop columns with joined strings that repeat on every row.
longest_inbound = modify_df(longest_inbound)
longest_inbound

Unnamed: 0,destination,first_departure_schedule,stops,longitudes,latitudes,names,id
23394,IKEA,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","4168, 4170, 4153, 4154, 4416, 4201, 4202, 4203...","-6.10101779854804, -6.10697299154263, -6.10953...","53.2044316558395, 53.203483770056, 53.20489064...","Bray Station, Methodist Church, Cornerstone Ch...",155_inbound
23850,IKEA,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","4168, 4170, 4153, 4154, 4416, 4201, 4202, 4203...","-6.10101779854804, -6.10697299154263, -6.10953...","53.2044316558395, 53.203483770056, 53.20489064...","Bray Station, Methodist Church, Cornerstone Ch...",155_inbound
24156,IKEA,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","4168, 4170, 4153, 4154, 4416, 4201, 4202, 4203...","-6.10101779854804, -6.10697299154263, -6.10953...","53.2044316558395, 53.203483770056, 53.20489064...","Bray Station, Methodist Church, Cornerstone Ch...",155_inbound
24462,IKEA,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","4168, 4170, 4153, 4154, 4416, 4201, 4202, 4203...","-6.10101779854804, -6.10697299154263, -6.10953...","53.2044316558395, 53.203483770056, 53.20489064...","Bray Station, Methodist Church, Cornerstone Ch...",155_inbound
24768,IKEA,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","4168, 4170, 4153, 4154, 4416, 4201, 4202, 4203...","-6.10101779854804, -6.10697299154263, -6.10953...","53.2044316558395, 53.203483770056, 53.20489064...","Bray Station, Methodist Church, Cornerstone Ch...",155_inbound
...,...,...,...,...,...,...,...
46560,IKEA,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","4168, 4170, 4153, 4154, 4416, 4201, 4202, 4203...","-6.10101779854804, -6.10697299154263, -6.10953...","53.2044316558395, 53.203483770056, 53.20489064...","Bray Station, Methodist Church, Cornerstone Ch...",155_inbound
46854,IKEA,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","4168, 4170, 4153, 4154, 4416, 4201, 4202, 4203...","-6.10101779854804, -6.10697299154263, -6.10953...","53.2044316558395, 53.203483770056, 53.20489064...","Bray Station, Methodist Church, Cornerstone Ch...",155_inbound
47148,IKEA,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","4168, 4170, 4153, 4154, 4416, 4201, 4202, 4203...","-6.10101779854804, -6.10697299154263, -6.10953...","53.2044316558395, 53.203483770056, 53.20489064...","Bray Station, Methodist Church, Cornerstone Ch...",155_inbound
47442,IKEA,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","4168, 4170, 4153, 4154, 4416, 4201, 4202, 4203...","-6.10101779854804, -6.10697299154263, -6.10953...","53.2044316558395, 53.203483770056, 53.20489064...","Bray Station, Methodist Church, Cornerstone Ch...",155_inbound


In [25]:
# Replace the per-stop columns with joined strings that repeat on every row.
longest_outbound = modify_df(longest_outbound)
longest_outbound

Unnamed: 0,destination,first_departure_schedule,stops,longitudes,latitudes,names,id
0,Bray,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","7698, 7113, 127, 112, 113, 114, 115, 37, 38, 3...","-6.2746347975114, -6.26492426394677, -6.263680...","53.4063848668136, 53.4025410678893, 53.3991075...","IKEA, Northwood Avenue, Nursing Home, Civic Ce...",155_outbound
306,Bray,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","7698, 7113, 127, 112, 113, 114, 115, 37, 38, 3...","-6.2746347975114, -6.26492426394677, -6.263680...","53.4063848668136, 53.4025410678893, 53.3991075...","IKEA, Northwood Avenue, Nursing Home, Civic Ce...",155_outbound
612,Bray,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","7698, 7113, 127, 112, 113, 114, 115, 37, 38, 3...","-6.2746347975114, -6.26492426394677, -6.263680...","53.4063848668136, 53.4025410678893, 53.3991075...","IKEA, Northwood Avenue, Nursing Home, Civic Ce...",155_outbound
918,Bray,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","7698, 7113, 127, 112, 113, 114, 115, 37, 38, 3...","-6.2746347975114, -6.26492426394677, -6.263680...","53.4063848668136, 53.4025410678893, 53.3991075...","IKEA, Northwood Avenue, Nursing Home, Civic Ce...",155_outbound
1224,Bray,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","7698, 7113, 127, 112, 113, 114, 115, 37, 38, 3...","-6.2746347975114, -6.26492426394677, -6.263680...","53.4063848668136, 53.4025410678893, 53.3991075...","IKEA, Northwood Avenue, Nursing Home, Civic Ce...",155_outbound
...,...,...,...,...,...,...,...
22044,Bray,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","7698, 7113, 127, 112, 113, 114, 115, 37, 38, 3...","-6.2746347975114, -6.26492426394677, -6.263680...","53.4063848668136, 53.4025410678893, 53.3991075...","IKEA, Northwood Avenue, Nursing Home, Civic Ce...",155_outbound
22344,Bray,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","7698, 7113, 127, 112, 113, 114, 115, 37, 38, 3...","-6.2746347975114, -6.26492426394677, -6.263680...","53.4063848668136, 53.4025410678893, 53.3991075...","IKEA, Northwood Avenue, Nursing Home, Civic Ce...",155_outbound
22644,Bray,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","7698, 7113, 127, 112, 113, 114, 115, 37, 38, 3...","-6.2746347975114, -6.26492426394677, -6.263680...","53.4063848668136, 53.4025410678893, 53.3991075...","IKEA, Northwood Avenue, Nursing Home, Civic Ce...",155_outbound
22944,Bray,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","7698, 7113, 127, 112, 113, 114, 115, 37, 38, 3...","-6.2746347975114, -6.26492426394677, -6.263680...","53.4063848668136, 53.4025410678893, 53.3991075...","IKEA, Northwood Avenue, Nursing Home, Civic Ce...",155_outbound


In [26]:
# Rows are expected to be identical at this point, so this leaves a single outbound row.
outbound_155 = longest_outbound.drop_duplicates()
outbound_155

Unnamed: 0,destination,first_departure_schedule,stops,longitudes,latitudes,names,id
0,Bray,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","7698, 7113, 127, 112, 113, 114, 115, 37, 38, 3...","-6.2746347975114, -6.26492426394677, -6.263680...","53.4063848668136, 53.4025410678893, 53.3991075...","IKEA, Northwood Avenue, Nursing Home, Civic Ce...",155_outbound


In [27]:
# Rows are expected to be identical at this point, so this leaves a single inbound row.
inbound_155 = longest_inbound.drop_duplicates()
inbound_155

Unnamed: 0,destination,first_departure_schedule,stops,longitudes,latitudes,names,id
23394,IKEA,"06:00:00, 06:20:00, 06:40:00, 07:00:00, 07:20:...","4168, 4170, 4153, 4154, 4416, 4201, 4202, 4203...","-6.10101779854804, -6.10697299154263, -6.10953...","53.2044316558395, 53.203483770056, 53.20489064...","Bray Station, Methodist Church, Cornerstone Ch...",155_inbound


### This process will be modified and run in a loop to create such a pair of rows for every route line number. They will then be attached to a dataframe together, and pushed to a database. This process will be tested in parsing_all_routes.ipynb

### Check the number of shapes on each line

In [28]:
# Rebuild merged_df from the full stop_times table (every line, not just 155).
# This replaces the line-155 frame that carried the same name earlier on.
merged_df = stop_times.merge(stops_df, on='stop_id')
merged_df

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,shape_id,line_id,service_id,day_of_week,stop_name,stop_lat,stop_lon
0,7712.y1009.60-1-d12-1.1.O,18:40:00,18:40:00,8240DB000226,1,Sandymount,60-1-d12-1.1.O,1,y1009.60-1-d12-1.1.O,saturday,"Shanard Avenue, stop 226",53.391141,-6.262200
1,7728.y1009.60-1-d12-1.1.O,20:00:00,20:00:00,8240DB000226,1,Sandymount,60-1-d12-1.1.O,1,y1009.60-1-d12-1.1.O,saturday,"Shanard Avenue, stop 226",53.391141,-6.262200
2,7742.y1009.60-1-d12-1.1.O,19:40:00,19:40:00,8240DB000226,1,Sandymount,60-1-d12-1.1.O,1,y1009.60-1-d12-1.1.O,saturday,"Shanard Avenue, stop 226",53.391141,-6.262200
3,7757.y1009.60-1-d12-1.1.O,20:20:00,20:20:00,8240DB000226,1,Sandymount,60-1-d12-1.1.O,1,y1009.60-1-d12-1.1.O,saturday,"Shanard Avenue, stop 226",53.391141,-6.262200
4,7769.y1009.60-1-d12-1.1.O,19:20:00,19:20:00,8240DB000226,1,Sandymount,60-1-d12-1.1.O,1,y1009.60-1-d12-1.1.O,saturday,"Shanard Avenue, stop 226",53.391141,-6.262200
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1874914,7119.y1005.60-65-b12-1.263.I,08:50:39,08:50:39,8230DB002358,38,Poolbeg St,60-65-b12-1.263.I,65,y1005.60-65-b12-1.263.I,saturday,"Killinarden, stop 2358",53.280704,-6.389614
1874915,343.y1003.60-65-b12-1.263.I,06:49:58,06:49:58,8230DB002358,38,Poolbeg St,60-65-b12-1.263.I,65,y1003.60-65-b12-1.263.I,weekday,"Killinarden, stop 2358",53.280704,-6.389614
1874916,304.y1003.60-65-b12-1.263.I,18:51:31,18:51:31,8230DB002358,38,Poolbeg St,60-65-b12-1.263.I,65,y1003.60-65-b12-1.263.I,weekday,"Killinarden, stop 2358",53.280704,-6.389614
1874917,349.y1003.60-65-b12-1.263.I,17:54:25,17:54:25,8230DB002358,38,Poolbeg St,60-65-b12-1.263.I,65,y1003.60-65-b12-1.263.I,weekday,"Killinarden, stop 2358",53.280704,-6.389614


In [31]:
# Count the distinct shape ids per line in one grouped pass instead of
# re-filtering the whole frame once per line. sort=False keeps the lines in
# order of first appearance, so the printed order is unchanged.
lines = merged_df["line_id"].unique().tolist()
shapes_per_line = merged_df.groupby("line_id", sort=False)["shape_id"].nunique()
for line, shape_count in shapes_per_line.items():
    print(line, ": ", shape_count)

1 :  8
16 :  8
16D :  4
33 :  14
41 :  4
41B :  6
41C :  6
41D :  4
44 :  10
13 :  26
11 :  8
40 :  10
40B :  6
40D :  12
122 :  16
38 :  10
38A :  8
38B :  6
38D :  4
53 :  6
15A :  4
15B :  4
15D :  4
47 :  10
56A :  4
61 :  8
77A :  4
77X :  2
27 :  14
120 :  8
25 :  6
25A :  6
25B :  4
25D :  4
26 :  6
4 :  10
66 :  10
66A :  4
66B :  6
66E :  4
67 :  6
7 :  10
7A :  10
140 :  8
155 :  8
83 :  16
83A :  8
9 :  8
46A :  18
46E :  2
33E :  2
116 :  4
37 :  7
39 :  4
39A :  4
70 :  8
145 :  6
33D :  4
7B :  4
7D :  6
32X :  4
41X :  10
84X :  10
142 :  4
118 :  2
25X :  4
39X :  4
66X :  18
67X :  16
27X :  4
44B :  4
14 :  10
27B :  22
40E :  4
15 :  6
150 :  6
151 :  10
33X :  4
51D :  4
68 :  18
68A :  4
69 :  6
69X :  4
123 :  8
65 :  12
65B :  4
68X :  2
49 :  6
54A :  4
130 :  4
27A :  6
6 :  4
H1 :  4
H2 :  4
H3 :  4
42 :  4
43 :  6
H9 :  2
79 :  4
79A :  4
84 :  12
84A :  6
