In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

### First, let's import the stop_times file

In [2]:
stop_times = pd.read_csv("stop_times.txt")
ceann_scribe = pd.read_csv("ainms_ceann_scribe.csv")
stop_times = stop_times.rename(columns = {"stop_headsign":"destination"})
stop_times = stop_times.drop(["pickup_type", "drop_off_type", "shape_dist_traveled"], axis=1)
stop_times

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,destination
0,7712.y1009.60-1-d12-1.1.O,18:40:00,18:40:00,8240DB000226,1,Sandymount
1,7712.y1009.60-1-d12-1.1.O,18:40:44,18:40:44,8220DB000228,2,Sandymount
2,7712.y1009.60-1-d12-1.1.O,18:41:21,18:41:21,8240DB000229,3,Sandymount
3,7712.y1009.60-1-d12-1.1.O,18:42:20,18:42:20,8240DB000227,4,Sandymount
4,7712.y1009.60-1-d12-1.1.O,18:42:58,18:42:58,8240DB000230,5,Sandymount
...,...,...,...,...,...,...
1874914,6667.y1003.60-H9-b12-1.97.I,08:03:12,08:03:12,8220DB000619,16,Abbey Street
1874915,6667.y1003.60-H9-b12-1.97.I,08:03:50,08:03:50,8220DB000675,17,Abbey Street
1874916,6667.y1003.60-H9-b12-1.97.I,08:05:22,08:05:22,8220DB000620,18,Abbey Street
1874917,6667.y1003.60-H9-b12-1.97.I,08:06:35,08:06:35,8220DB007569,19,Abbey Street


In [3]:
def destination_whitespace(row):
    return row["destination"].lstrip()

stop_times["destination"] = stop_times.apply(destination_whitespace, axis=1)
stop_times

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,destination
0,7712.y1009.60-1-d12-1.1.O,18:40:00,18:40:00,8240DB000226,1,Sandymount
1,7712.y1009.60-1-d12-1.1.O,18:40:44,18:40:44,8220DB000228,2,Sandymount
2,7712.y1009.60-1-d12-1.1.O,18:41:21,18:41:21,8240DB000229,3,Sandymount
3,7712.y1009.60-1-d12-1.1.O,18:42:20,18:42:20,8240DB000227,4,Sandymount
4,7712.y1009.60-1-d12-1.1.O,18:42:58,18:42:58,8240DB000230,5,Sandymount
...,...,...,...,...,...,...
1874914,6667.y1003.60-H9-b12-1.97.I,08:03:12,08:03:12,8220DB000619,16,Abbey Street
1874915,6667.y1003.60-H9-b12-1.97.I,08:03:50,08:03:50,8220DB000675,17,Abbey Street
1874916,6667.y1003.60-H9-b12-1.97.I,08:05:22,08:05:22,8220DB000620,18,Abbey Street
1874917,6667.y1003.60-H9-b12-1.97.I,08:06:35,08:06:35,8220DB007569,19,Abbey Street


### And the stops file

In [4]:
stops_df = pd.read_csv("stops.txt")
stops_df = stops_df.rename(columns = {'stop_lat': 'latitude', 
                                      "stop_lon": "longitude"})
stops_df

Unnamed: 0,stop_id,stop_name,latitude,longitude
0,8220DB000002,"Parnell Square West, stop 2",53.352244,-6.263723
1,8220DB000003,"Parnell Square West, stop 3",53.352309,-6.263811
2,8220DB000004,"Parnell Square West, stop 4",53.352575,-6.264175
3,8220DB000006,"Parnell Square West, stop 6",53.352749,-6.264454
4,8220DB000007,"Parnell Square West, stop 7",53.352841,-6.264570
...,...,...,...,...
4203,8350DB007461,"Charlesland, stop 7461",53.128932,-6.062803
4204,8350DB007462,"Charlesland, stop 7462",53.128801,-6.062480
4205,8350DB007574,"Southern Cross, stop 7574",53.182348,-6.130064
4206,8350DB007823,"Enniskerry Village, stop 7823",53.194198,-6.170184


### Merging the two files

In [45]:
merged_df = pd.merge(stop_times, stops_df, left_on='stop_id', right_on='stop_id')
merged_df

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,destination,stop_name,latitude,longitude
0,7712.y1009.60-1-d12-1.1.O,18:40:00,18:40:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200
1,7728.y1009.60-1-d12-1.1.O,20:00:00,20:00:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200
2,7742.y1009.60-1-d12-1.1.O,19:40:00,19:40:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200
3,7757.y1009.60-1-d12-1.1.O,20:20:00,20:20:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200
4,7769.y1009.60-1-d12-1.1.O,19:20:00,19:20:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200
...,...,...,...,...,...,...,...,...,...
1874914,7119.y1005.60-65-b12-1.263.I,08:50:39,08:50:39,8230DB002358,38,Poolbeg St,"Killinarden, stop 2358",53.280704,-6.389614
1874915,343.y1003.60-65-b12-1.263.I,06:49:58,06:49:58,8230DB002358,38,Poolbeg St,"Killinarden, stop 2358",53.280704,-6.389614
1874916,304.y1003.60-65-b12-1.263.I,18:51:31,18:51:31,8230DB002358,38,Poolbeg St,"Killinarden, stop 2358",53.280704,-6.389614
1874917,349.y1003.60-65-b12-1.263.I,17:54:25,17:54:25,8230DB002358,38,Poolbeg St,"Killinarden, stop 2358",53.280704,-6.389614


In [46]:
ceann_scribe

Unnamed: 0,name,ainm
0,Abbey St,Sráid na Mainistreach
1,Abbey St Lower,Sráid na Mainistreach Íochtarach
2,Adamstown,Baile Adaim
3,Ashington,
4,Ashtown Stn,Stáisiún Baile an Ásaigh
...,...,...
125,Waterloo Rd,Bóthar Waterloo
126,Westmoreland St,Sráid Westmoreland
127,Whitechurch,An Teampall Geal
128,Wilton Terrace,Ardán Wilton


In [47]:
ceann_scribe[ceann_scribe["name"] =="Abbey St"]

Unnamed: 0,name,ainm
0,Abbey St,Sráid na Mainistreach


In [48]:
# function to find and append the irish name for the stop
def agus_ainm(row, first_list, filtered_list):
    if row['stop_id'] in filtered_list:
        item = first_list[filtered_list.index(row['stop_id'])]
        return item[1]

def agus_ceann_scribe(row):
    name = row["destination"]
    name_df = ceann_scribe[ceann_scribe["name"]==name]
    if name_df.empty:
        return name
    ainm = name_df["ainm"].unique().tolist()
    return ainm[0]

def no_ceann_scribe(row):
    destination = row["destination"]
    if row[row["ceann_scribe"].isnull()]:
        return destination

In [49]:
merged_df["ceann_scribe"] = merged_df.apply(agus_ceann_scribe, axis=1)
merged_df

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,destination,stop_name,latitude,longitude,ceann_scribe
0,7712.y1009.60-1-d12-1.1.O,18:40:00,18:40:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200,Dumhach Thrá
1,7728.y1009.60-1-d12-1.1.O,20:00:00,20:00:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200,Dumhach Thrá
2,7742.y1009.60-1-d12-1.1.O,19:40:00,19:40:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200,Dumhach Thrá
3,7757.y1009.60-1-d12-1.1.O,20:20:00,20:20:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200,Dumhach Thrá
4,7769.y1009.60-1-d12-1.1.O,19:20:00,19:20:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200,Dumhach Thrá
...,...,...,...,...,...,...,...,...,...,...
1874914,7119.y1005.60-65-b12-1.263.I,08:50:39,08:50:39,8230DB002358,38,Poolbeg St,"Killinarden, stop 2358",53.280704,-6.389614,Sráid an Phoill Bhig
1874915,343.y1003.60-65-b12-1.263.I,06:49:58,06:49:58,8230DB002358,38,Poolbeg St,"Killinarden, stop 2358",53.280704,-6.389614,Sráid an Phoill Bhig
1874916,304.y1003.60-65-b12-1.263.I,18:51:31,18:51:31,8230DB002358,38,Poolbeg St,"Killinarden, stop 2358",53.280704,-6.389614,Sráid an Phoill Bhig
1874917,349.y1003.60-65-b12-1.263.I,17:54:25,17:54:25,8230DB002358,38,Poolbeg St,"Killinarden, stop 2358",53.280704,-6.389614,Sráid an Phoill Bhig


In [54]:
# we need some gtfs_data from an extra file containing more info per each stop
all_routes_sequences = pd.read_csv("route_seqs.csv")
db_routes_sequences = all_routes_sequences[all_routes_sequences["Operator"] == "DB"]
db_stops_filtered = db_routes_sequences[["AtcoCode", "ShortCommonName_ga"]]

In [55]:
first_list = [tuple(r) for r in db_stops_filtered.to_numpy()]
filtered_list = []
for item in first_list:
    filtered_list.append(item[0])

merged_df['ainm'] = merged_df.apply(agus_ainm, first_list=first_list, filtered_list=filtered_list, axis=1)

In [56]:
merged_df

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,destination,stop_name,latitude,longitude,ceann_scribe,ainm
0,7712.y1009.60-1-d12-1.1.O,18:40:00,18:40:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird
1,7728.y1009.60-1-d12-1.1.O,20:00:00,20:00:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird
2,7742.y1009.60-1-d12-1.1.O,19:40:00,19:40:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird
3,7757.y1009.60-1-d12-1.1.O,20:20:00,20:20:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird
4,7769.y1009.60-1-d12-1.1.O,19:20:00,19:20:00,8240DB000226,1,Sandymount,"Shanard Avenue, stop 226",53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird
...,...,...,...,...,...,...,...,...,...,...,...
1874914,7119.y1005.60-65-b12-1.263.I,08:50:39,08:50:39,8230DB002358,38,Poolbeg St,"Killinarden, stop 2358",53.280704,-6.389614,Sráid an Phoill Bhig,Cill an Ardáin
1874915,343.y1003.60-65-b12-1.263.I,06:49:58,06:49:58,8230DB002358,38,Poolbeg St,"Killinarden, stop 2358",53.280704,-6.389614,Sráid an Phoill Bhig,Cill an Ardáin
1874916,304.y1003.60-65-b12-1.263.I,18:51:31,18:51:31,8230DB002358,38,Poolbeg St,"Killinarden, stop 2358",53.280704,-6.389614,Sráid an Phoill Bhig,Cill an Ardáin
1874917,349.y1003.60-65-b12-1.263.I,17:54:25,17:54:25,8230DB002358,38,Poolbeg St,"Killinarden, stop 2358",53.280704,-6.389614,Sráid an Phoill Bhig,Cill an Ardáin


### In order to make the correct modifications to the dataframe, we need to split it into the correct chunks. We only want the longest shape of each route. This will be easier with a cell containing the correct line_id per row

In [57]:
def line_id(row):
    shape_id = row['shape_id']
    line_id = shape_id.split('-')[1]
    return line_id


def shape_id(row):
    trip_id = row['trip_id']
    shape_strings = trip_id.split('.')
    shape_id = shape_strings[2] + '.' + shape_strings[3] + "." + shape_strings[4]
    return shape_id


def route_direction(row):
    trip_string = row['trip_id']
    direction = trip_string[-1]
    if direction == "O":
        return "outbound"
    if direction == "I":
        return "inbound"
    
    
# function for isolating the stop number for each row
def stop_number(row):
    stop_string = row['stop_name'].split(' ')
    if stop_string[-1].isdigit:
        return stop_string[-1]
    else:
        return "No stop number."
    
def stop_name(row):
    name = row["stop_name"].split(",")[0]
    return name

In [58]:
merged_df["shape_id"] = merged_df.apply(shape_id, axis=1)

In [59]:
merged_df["line_id"] = merged_df.apply(line_id, axis=1)

In [60]:
merged_df["direction"] = merged_df.apply(route_direction, axis=1)

In [61]:
merged_df["stop_num"] = merged_df.apply(stop_number, axis=1)

In [62]:
merged_df["stop_name"] = merged_df.apply(stop_name, axis=1)
merged_df

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,destination,stop_name,latitude,longitude,ceann_scribe,ainm,shape_id,line_id,direction,stop_num
0,7712.y1009.60-1-d12-1.1.O,18:40:00,18:40:00,8240DB000226,1,Sandymount,Shanard Avenue,53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird,60-1-d12-1.1.O,1,outbound,226
1,7728.y1009.60-1-d12-1.1.O,20:00:00,20:00:00,8240DB000226,1,Sandymount,Shanard Avenue,53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird,60-1-d12-1.1.O,1,outbound,226
2,7742.y1009.60-1-d12-1.1.O,19:40:00,19:40:00,8240DB000226,1,Sandymount,Shanard Avenue,53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird,60-1-d12-1.1.O,1,outbound,226
3,7757.y1009.60-1-d12-1.1.O,20:20:00,20:20:00,8240DB000226,1,Sandymount,Shanard Avenue,53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird,60-1-d12-1.1.O,1,outbound,226
4,7769.y1009.60-1-d12-1.1.O,19:20:00,19:20:00,8240DB000226,1,Sandymount,Shanard Avenue,53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird,60-1-d12-1.1.O,1,outbound,226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1874914,7119.y1005.60-65-b12-1.263.I,08:50:39,08:50:39,8230DB002358,38,Poolbeg St,Killinarden,53.280704,-6.389614,Sráid an Phoill Bhig,Cill an Ardáin,60-65-b12-1.263.I,65,inbound,2358
1874915,343.y1003.60-65-b12-1.263.I,06:49:58,06:49:58,8230DB002358,38,Poolbeg St,Killinarden,53.280704,-6.389614,Sráid an Phoill Bhig,Cill an Ardáin,60-65-b12-1.263.I,65,inbound,2358
1874916,304.y1003.60-65-b12-1.263.I,18:51:31,18:51:31,8230DB002358,38,Poolbeg St,Killinarden,53.280704,-6.389614,Sráid an Phoill Bhig,Cill an Ardáin,60-65-b12-1.263.I,65,inbound,2358
1874917,349.y1003.60-65-b12-1.263.I,17:54:25,17:54:25,8230DB002358,38,Poolbeg St,Killinarden,53.280704,-6.389614,Sráid an Phoill Bhig,Cill an Ardáin,60-65-b12-1.263.I,65,inbound,2358


In [64]:
# removing null values where irish names not present. 
# replace with english names
merged_df["ceann_scribe"].fillna(merged_df["destination"], inplace=True)
merged_df["ainm"].fillna(merged_df["stop_name"], inplace=True)
test = merged_df[merged_df["ceann_scribe"].isnull()]
test

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,destination,stop_name,latitude,longitude,ceann_scribe,ainm,shape_id,line_id,direction,stop_num


In [78]:
line_list = merged_df["line_id"].unique().tolist()
print(len(line_list))

102


### We are also going to need an empty dataframe to add to

In [88]:
col_names =  ['destination', 'ceann_scribe', 'first_departure_schedule', 
              'stops', 'longitudes', 'latitudes', 'names', 
              'id', 'gach_ainm', 'line_id', 'direction']
  
final_df  = pd.DataFrame(columns = col_names)
final_df

Unnamed: 0,destination,ceann_scribe,first_departure_schedule,stops,longitudes,latitudes,names,id,gach_ainm,line_id,direction


### The following functions will compile all the starting times of each route

In [89]:
def sorting_seconds(time_list):
    ftr = [3600, 60, 1]
    times_in_seconds = []

    for time in time_list:
        time_units = time.split(':')
        total_secs = (int(time_units[0]) * ftr[2]) + (int(time_units[1]) * ftr[1]) + (int(time_units[0]) * ftr[0])
        times_in_seconds.append(total_secs)

    times_in_seconds.sort() 
    return times_in_seconds

    
def to_timestamp(seconds):
    hour = 3600
    minute = 60

    hours = str(int(seconds/hour))
    minutes = str(int((seconds % hour)/minute))
    seconds = str(int((seconds % hour) % 60))

    if len(hours) == 1:
        hours = f"0{hours}"
    if len(minutes) == 1:
        minutes = f"0{minutes}"
    
    seconds = f"00"

    timestamp = f"{hours}:{minutes}:{seconds}"

    return timestamp

def sorted_timestamps(times_in_seconds):
    sorted_timestamps = []

    for time in times_in_seconds:
        sorted_timestamps.append(to_timestamp(time))

    return sorted_timestamps

def departure_times(df):    
    first_stop = df[df["stop_sequence"]==1]
    first_stop_times = first_stop["departure_time"].unique().tolist()
    sorted_seconds = sorting_seconds(first_stop_times)
    first_stop_times = sorted_timestamps(sorted_seconds)
    first_stop_times = ([str(x) for x in first_stop_times])
    first_stop_times = ", ".join(first_stop_times)
    return first_stop_times

### The following make further alterations to the data frame, creating nested lists

In [90]:
def coordinates(row, df, coordinate):
    all_coords = df[coordinate].unique().tolist()

    if len(all_coords) == 0:
        coord = "None"
    else:
        all_coords = ([str(x) for x in all_coords])
        coord = ", ".join(all_coords)
    return coord


def gach_ainm(row, df):
    gach_ainm = df['ainm'].tolist()
    gach_ainm = ([str(x) for x in gach_ainm])
    gach_ainm = ", ".join(gach_ainm)

    return gach_ainm



def names(row, df):
    route = df["line_id"].unique().tolist()
    all_names = df["stop_name"].tolist()
    names = ", ".join(all_names)
    return names


def stops(row, df):
    all_stops = df["stop_num"]

    if len(all_stops) == 0:
        stops = "None"
    else:
        stops = ", ".join(all_stops)
    return stops


def create_uniques_id(row):
    return row["line_id"] + "_" + row["direction"]


def modify_df(df):
    if df.empty:
        pass
    else:
        df['stops'] = df.apply(stops, df=df, axis=1)
        df['longitudes'] = df.apply(coordinates, df=df, coordinate="longitude", axis=1)
        df['latitudes'] = df.apply(coordinates, df=df, coordinate="latitude", axis=1)
        df['names'] = df.apply(names, df=df, axis=1)
        df['gach_ainm'] = df.apply(gach_ainm, df=df, axis=1)
        df['id'] = df.apply(create_uniques_id, axis=1)
        
        
    df = df.drop(["stop_num", "latitude", "longitude", 
                  "stop_name", "ainm", "stop_sequence", 
                  "departure_time", "arrival_time", "stop_id"], axis=1)
    return df


def make_string(row):
    return int(row["stop_sequence"])

def sort_by_sequence(df):
    df['sort'] = df.apply(make_string, axis=1)
    df.sort_values('sort',inplace=True, ascending=True)
    df = df.drop('sort', axis=1)
    
    return df

### We need to go over every line_id and get the longest shape associated with each 

In [91]:
for line in line_list:
    temp_df = merged_df[merged_df["line_id"]==line]
    shapes = temp_df["shape_id"].unique().tolist()
    
    # iterate over each of the shapes and split into outbound and inbound
    inbound_shapes = []
    outbound_shapes = []

    for shape in shapes:
        direction = shape.split('.')[2]
        if direction == "O":
            outbound_shapes.append(shape)
        if direction == "I":
            inbound_shapes.append(shape)
            
    # get the longest inbound and longest outbound
    longest_outbound = ""
    longest_length_outbound = 0
    for shape in outbound_shapes:
        temp = merged_df[merged_df["shape_id"]==shape]
        if temp.shape[0] > longest_length_outbound:
            longest_length_outbound = temp.shape[0]
            longest_outbound = shape

    longest_inbound = ""
    longest_length_inbound = 0
    for shape in inbound_shapes:
        temp = merged_df[merged_df["shape_id"]==shape]
        if temp.shape[0] > longest_length_inbound:
            longest_length_inbound = temp.shape[0]
            longest_inbound = shape
    
    longest_outbound = merged_df[merged_df["shape_id"]==longest_outbound]
    longest_inbound = merged_df[merged_df["shape_id"]==longest_inbound]

    longest_outbound["first_departure_schedule"] = departure_times(longest_outbound) 
    longest_inbound["first_departure_schedule"] = departure_times(longest_inbound)
    
    longest_outbound = longest_outbound.drop_duplicates(subset=['stop_sequence'], keep='first')
    longest_inbound = longest_inbound.drop_duplicates(subset=['stop_sequence'], keep='first')

    if not longest_outbound.empty:
        longest_outbound = sort_by_sequence(longest_outbound)
    if not longest_inbound.empty:
        longest_inbound = sort_by_sequence(longest_inbound)
    
    longest_outbound = longest_outbound.drop(["shape_id", "trip_id"], axis=1)
    longest_inbound = longest_inbound.drop(["shape_id", "trip_id"], axis=1)
 
    longest_inbound = modify_df(longest_inbound)
    longest_outbound = modify_df(longest_outbound)
    
    final_df = final_df.append(longest_inbound)
    final_df = final_df.append(longest_outbound)

In [92]:
final_df = final_df.drop_duplicates(subset=["id"], keep='first')

In [93]:
final_df.head(5)

Unnamed: 0,destination,ceann_scribe,first_departure_schedule,stops,longitudes,latitudes,names,id,gach_ainm,line_id,direction
49962,Shanard Road,Bóthar Sheanaird,"06:30:00, 06:42:00, 06:54:00, 07:00:00, 07:06:...","381, 7740, 7741, 387, 388, 389, 393, 371, 391,...","-6.21237419337661, -6.21472966508216, -6.21498...","53.3243237661094, 53.3268757414366, 53.3292794...","St John's Church, Park Avenue, Gilford Road, S...",1_inbound,"Séipeál Eoin, nan, nan, Dumhach Thrá, Páirc Fh...",1,inbound
0,Sandymount,Dumhach Thrá,"06:30:00, 06:42:00, 06:54:00, 07:00:00, 07:06:...","226, 228, 229, 227, 230, 231, 1641, 1642, 213,...","-6.26220046436849, -6.25971957291393, -6.25653...","53.391140564198, 53.3918773927815, 53.39139951...","Shanard Avenue, Shanliss Road, Oldtown Road, S...",1_outbound,"Ascail Sheanaird, Br an tSeanleasa, Br an tSea...",1,outbound
1046074,Dublin Airport,Aerfort Bhaile Átha Cliath,"05:30:00, 05:45:00, 06:00:00, 06:15:00, 06:30:...","5171, 2976, 2977, 2978, 2979, 2980, 2981, 2991...","-6.24826056513117, -6.2554595122767, -6.260210...","53.2717359437923, 53.2720572242554, 53.2735737...","Kingston, Grange Hall, Pine Valley, Grange Roa...",16_inbound,"Baile an Rí, Halla na Gráinsí, Gleann na Giúis...",16,inbound
1013778,Ballinteer,Baile an tSaoir,"06:00:00, 06:15:00, 06:30:00, 06:45:00, 07:00:...","7347, 3669, 7349, 1631, 1632, 5053, 1633, 1634...","-6.24202072636356, -6.23507978936044, -6.23471...","53.428019654753, 53.4284655392397, 53.42527838...","Zone 15, Maldron Hotel, Radisson Hotel, ALSAA ...",16_outbound,"Aerfort BÁC C 1, Óstán an Maldron, Óstán an Ra...",16,outbound
1014229,Ballinteer,Baile an tSaoir,"07:10:00, 07:50:00, 08:30:00","7347, 3669, 7349, 1631, 1632, 5053, 1633, 1634...","-6.24202072636356, -6.23507978936044, -6.23471...","53.428019654753, 53.4284655392397, 53.42527838...","Zone 15, Maldron Hotel, Radisson Hotel, ALSAA ...",16D_outbound,"Aerfort BÁC C 1, Óstán an Maldron, Óstán an Ra...",16D,outbound


### This dataframe, final df, will be used to accomodate much of the backend data parsing to occur during the use of our application

## Next we need to add some extra changes to the merged_df in order to facilitate our other schemas
### We will need a dataset that provides a list of all stops per route.
#### first we need a unique key and to parse excess data

In [98]:
# function to create id column values
def create_id(row):
    return row["shape_id"] + "_" + row["stop_num"]

In [99]:
merged_df["id"] = merged_df.apply(create_id, axis=1)

In [100]:
merged_df = merged_df.drop(["arrival_time", "departure_time", "trip_id"], axis=1)
merged_df

Unnamed: 0,stop_id,stop_sequence,destination,stop_name,latitude,longitude,ceann_scribe,ainm,shape_id,line_id,direction,stop_num,id
0,8240DB000226,1,Sandymount,Shanard Avenue,53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird,60-1-d12-1.1.O,1,outbound,226,60-1-d12-1.1.O_226
1,8240DB000226,1,Sandymount,Shanard Avenue,53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird,60-1-d12-1.1.O,1,outbound,226,60-1-d12-1.1.O_226
2,8240DB000226,1,Sandymount,Shanard Avenue,53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird,60-1-d12-1.1.O,1,outbound,226,60-1-d12-1.1.O_226
3,8240DB000226,1,Sandymount,Shanard Avenue,53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird,60-1-d12-1.1.O,1,outbound,226,60-1-d12-1.1.O_226
4,8240DB000226,1,Sandymount,Shanard Avenue,53.391141,-6.262200,Dumhach Thrá,Ascail Sheanaird,60-1-d12-1.1.O,1,outbound,226,60-1-d12-1.1.O_226
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1874914,8230DB002358,38,Poolbeg St,Killinarden,53.280704,-6.389614,Sráid an Phoill Bhig,Cill an Ardáin,60-65-b12-1.263.I,65,inbound,2358,60-65-b12-1.263.I_2358
1874915,8230DB002358,38,Poolbeg St,Killinarden,53.280704,-6.389614,Sráid an Phoill Bhig,Cill an Ardáin,60-65-b12-1.263.I,65,inbound,2358,60-65-b12-1.263.I_2358
1874916,8230DB002358,38,Poolbeg St,Killinarden,53.280704,-6.389614,Sráid an Phoill Bhig,Cill an Ardáin,60-65-b12-1.263.I,65,inbound,2358,60-65-b12-1.263.I_2358
1874917,8230DB002358,38,Poolbeg St,Killinarden,53.280704,-6.389614,Sráid an Phoill Bhig,Cill an Ardáin,60-65-b12-1.263.I,65,inbound,2358,60-65-b12-1.263.I_2358


#### time to get all the unique stops

In [25]:
unique_stops = merged_df.drop_duplicates(subset=['stop_num'], keep='first')
unique_stops = unique_stops[["stop_id",
                             "latitude",
                             "longitude",
                             "stop_name",
                             "ainm",
                             "stop_num"]].sort_values(by='stop_id')

unique_stops

Unnamed: 0,stop_id,latitude,longitude,stop_name,ainm,stop_num
1341704,8220DB000002,53.352244,-6.263723,Parnell Square West,Cg Parnell Thiar,2
246570,8220DB000003,53.352309,-6.263811,Parnell Square West,Cg Parnell Thiar,3
1743370,8220DB000004,53.352575,-6.264175,Parnell Square West,Cg Parnell Thiar,4
977134,8220DB000006,53.352749,-6.264454,Parnell Square West,Cg Parnell Thiar,6
461243,8220DB000007,53.352841,-6.264570,Parnell Square West,Cg Parnell Thiar,7
...,...,...,...,...,...,...
1828114,8350DB007461,53.128932,-6.062803,Charlesland,Acra na mBodach,7461
1828094,8350DB007462,53.128801,-6.062480,Charlesland,Acra na mBodach,7462
787366,8350DB007574,53.182348,-6.130064,Southern Cross,Cros an Deiscirt,7574
1562888,8350DB007823,53.194198,-6.170184,Enniskerry Village,,7823


### We need one more table with all stops, listing all routes attending that stop

#### Technically, this table isn't "neccessary" but the use of it reduces the amount of back end computation and improves our overall loading times

In [162]:
col_names =  ["stop_id", 
              "stop_sequence", 
              "destination",
              "ceann_scribe",
              "stop_name", 
              "latitude", 
              "longitude", "ainm",
              "shape_id", 
              "line_id", 
              "direction", 
              "stop_num", 
              "id"]

pruned_df  = pd.DataFrame(columns = col_names)
pruned_df

Unnamed: 0,stop_id,stop_sequence,destination,stop_name,latitude,longitude,ainm,shape_id,line_id,direction,stop_num,id


In [163]:
for line in line_list:
    temp_df = merged_df[merged_df["line_id"]==line]
    shapes = temp_df["shape_id"].unique().tolist()
    
    # iterate over each of the shapes and split into outbound and inbound
    inbound_shapes = []
    outbound_shapes = []

    for shape in shapes:
        direction = shape.split('.')[2]
        if direction == "O":
            outbound_shapes.append(shape)
        if direction == "I":
            inbound_shapes.append(shape)
            
    # get the longest inbound and longest outbound
    longest_outbound = ""
    longest_length_outbound = 0
    for shape in outbound_shapes:
        temp = merged_df[merged_df["shape_id"]==shape]
        if temp.shape[0] > longest_length_outbound:
            longest_length_outbound = temp.shape[0]
            longest_outbound = shape

    longest_inbound = ""
    longest_length_inbound = 0
    for shape in inbound_shapes:
        temp = merged_df[merged_df["shape_id"]==shape]
        if temp.shape[0] > longest_length_inbound:
            longest_length_inbound = temp.shape[0]
            longest_inbound = shape
    
    longest_outbound = merged_df[merged_df["shape_id"]==longest_outbound]
    longest_inbound = merged_df[merged_df["shape_id"]==longest_inbound]
    
    pruned_df = pruned_df.append(longest_inbound)
    pruned_df = pruned_df.append(longest_outbound)

In [164]:
print(merged_df.shape)
print(pruned_df.shape)
pruned_df

(1874919, 12)
(861532, 12)


Unnamed: 0,stop_id,stop_sequence,destination,stop_name,latitude,longitude,ainm,shape_id,line_id,direction,stop_num,id
169,8240DB000226,40,Shanard Road,Shanard Avenue,53.391141,-6.262200,Ascail Sheanaird,60-1-d12-1.3.I,1,inbound,226,60-1-d12-1.3.I_226
170,8240DB000226,40,Shanard Road,Shanard Avenue,53.391141,-6.262200,Ascail Sheanaird,60-1-d12-1.3.I,1,inbound,226,60-1-d12-1.3.I_226
171,8240DB000226,40,Shanard Road,Shanard Avenue,53.391141,-6.262200,Ascail Sheanaird,60-1-d12-1.3.I,1,inbound,226,60-1-d12-1.3.I_226
172,8240DB000226,40,Shanard Road,Shanard Avenue,53.391141,-6.262200,Ascail Sheanaird,60-1-d12-1.3.I,1,inbound,226,60-1-d12-1.3.I_226
173,8240DB000226,40,Shanard Road,Shanard Avenue,53.391141,-6.262200,Ascail Sheanaird,60-1-d12-1.3.I,1,inbound,226,60-1-d12-1.3.I_226
...,...,...,...,...,...,...,...,...,...,...,...,...
1817647,8250DB003128,16,Bray Station,Cabinteely PO,53.261298,-6.151055,OP Chában tSíle,60-84A-d12-1.339.O,84A,outbound,3128,60-84A-d12-1.339.O_3128
1817648,8250DB003128,16,Bray Station,Cabinteely PO,53.261298,-6.151055,OP Chában tSíle,60-84A-d12-1.339.O,84A,outbound,3128,60-84A-d12-1.339.O_3128
1817649,8250DB003128,16,Bray Station,Cabinteely PO,53.261298,-6.151055,OP Chában tSíle,60-84A-d12-1.339.O,84A,outbound,3128,60-84A-d12-1.339.O_3128
1817650,8250DB003128,16,Bray Station,Cabinteely PO,53.261298,-6.151055,OP Chában tSíle,60-84A-d12-1.339.O,84A,outbound,3128,60-84A-d12-1.339.O_3128


In [137]:
print(len(merged_df["stop_num"].unique().tolist()))
all_stops = pruned_df["stop_num"].unique().tolist()
print(len(all_stops))

4208
4122


In [165]:
def all_routes(row, df):
    all_lines_for_stop = df["line_id"].unique().tolist()
    
    return_list = []
    for line in all_lines_for_stop:
        #print(f"----{line}----")
        line_df = pruned_df[pruned_df["line_id"] == line]
        all_stops_seqs = line_df["stop_sequence"].unique().tolist()
        route_length = 0
        for stop in all_stops_seqs:
            if stop > route_length:
                route_length = stop
    
        #print("line length: ", route_length)
        filtered_df = df[df["line_id"] == line]
        sequence = filtered_df["stop_sequence"].unique().tolist()[0]
        destination = filtered_df["destination"].unique().tolist()[0]
        direction = filtered_df["direction"].unique().tolist()[0]
        #print("sequence: ", sequence)
        divisor = round(route_length/sequence, 2)
        
        return_list.append(f"[{line}, {divisor}, {direction}, {destination}]")
        
    return_list = ", ".join(return_list)
    return return_list

In [166]:
stop231 = pruned_df[pruned_df["stop_num"]=="231"]
stop231 = stop231.drop(["stop_name", 
                        "latitude", 
                        "longitude", 
                        "ainm", 
                        "stop_id",
                        "id",
                        "shape_id"], axis=1)
print(stop231.shape)
stop231 = stop231.drop_duplicates()
stop231

(718, 5)


Unnamed: 0,stop_sequence,destination,line_id,direction,stop_num
2026,6,Sandymount,1,outbound,231
2195,15,Ballinteer,16,outbound,231
2419,15,Ballinteer,16D,outbound,231
2425,85,Abbey St,33,inbound,231
2476,39,Abbey St,41,inbound,231
2647,32,Abbey St,41B,inbound,231
2660,44,Abbey St,41C,inbound,231
2777,22,Abbey St,41D,inbound,231


In [167]:
stop231["stop_route_data"] = stop231.apply(all_routes, df=stop231, axis=1)
stop231

Unnamed: 0,stop_sequence,destination,line_id,direction,stop_num,stop_route_data
2026,6,Sandymount,1,outbound,231,"[1, 6.67, outbound, Sandymount], [16, 5.13, o..."
2195,15,Ballinteer,16,outbound,231,"[1, 6.67, outbound, Sandymount], [16, 5.13, o..."
2419,15,Ballinteer,16D,outbound,231,"[1, 6.67, outbound, Sandymount], [16, 5.13, o..."
2425,85,Abbey St,33,inbound,231,"[1, 6.67, outbound, Sandymount], [16, 5.13, o..."
2476,39,Abbey St,41,inbound,231,"[1, 6.67, outbound, Sandymount], [16, 5.13, o..."
2647,32,Abbey St,41B,inbound,231,"[1, 6.67, outbound, Sandymount], [16, 5.13, o..."
2660,44,Abbey St,41C,inbound,231,"[1, 6.67, outbound, Sandymount], [16, 5.13, o..."
2777,22,Abbey St,41D,inbound,231,"[1, 6.67, outbound, Sandymount], [16, 5.13, o..."


In [168]:
stop231 = stop231.drop(["direction", "line_id", "destination", "stop_sequence"], axis=1)
stop231 = stop231.drop_duplicates()
stop231

Unnamed: 0,stop_num,stop_route_data
2026,231,"[1, 6.67, outbound, Sandymount], [16, 5.13, o..."


In [169]:
col_names =  ["stop_sequence", "line_id", "direction", "destination", "stop_num", "stop_route_data"]
stop_sequencing  = pd.DataFrame(columns = col_names)
stop_sequencing

Unnamed: 0,stop_sequence,line_id,direction,destination,stop_num,stop_route_data


In [170]:
pruned_df = pruned_df.drop(["stop_name", 
                            "latitude", 
                            "longitude", 
                            "ainm", 
                            "stop_id",
                            "id",
                            "shape_id"], axis=1)

pruned_df = pruned_df.drop_duplicates()
pruned_df

Unnamed: 0,stop_sequence,destination,line_id,direction,stop_num
169,40,Shanard Road,1,inbound,226
49962,1,Shanard Road,1,inbound,381
50402,2,Shanard Road,1,inbound,7740
50842,3,Shanard Road,1,inbound,7741
51282,4,Shanard Road,1,inbound,387
...,...,...,...,...,...
1817247,12,Bray Station,84A,outbound,3124
1817347,13,Bray Station,84A,outbound,3125
1817447,14,Bray Station,84A,outbound,3126
1817547,15,Bray Station,84A,outbound,3127


In [171]:
count = 1
for stop in all_stops:
    #print(f"Processing stop {stop}. {count} of {len(all_stops)+1}")
    stop_df = pruned_df[pruned_df["stop_num"]==stop]
    stop_df = stop_df.drop_duplicates()
    stop_df["stop_route_data"] = stop_df.apply(all_routes, df=stop_df, axis=1)
    stop_sequencing = stop_sequencing.append(stop_df)
    count += 1


In [172]:
stop_sequencing

Unnamed: 0,stop_sequence,line_id,direction,destination,stop_num,stop_route_data
169,40,1,inbound,Shanard Road,226,"[1, 1.0, inbound, Shanard Road]"
0,1,1,outbound,Sandymount,226,"[1, 1.0, inbound, Shanard Road]"
49962,1,1,inbound,Shanard Road,381,"[1, 40.0, inbound, Shanard Road], [47, 1.5, i..."
50131,32,47,inbound,Poolbeg St,381,"[1, 40.0, inbound, Shanard Road], [47, 1.5, i..."
50402,2,1,inbound,Shanard Road,7740,"[1, 20.0, inbound, Shanard Road], [47, 1.45, ..."
...,...,...,...,...,...,...
1821462,83,84,outbound,Newcastle,4257,"[84, 1.07, outbound, Newcastle]"
1821556,84,84,outbound,Newcastle,4258,"[84, 1.06, outbound, Newcastle]"
1821650,85,84,outbound,Newcastle,4259,"[84, 1.05, outbound, Newcastle]"
1821744,86,84,outbound,Newcastle,7272,"[84, 1.03, outbound, Newcastle]"


In [173]:
stop_sequencing = stop_sequencing.drop(["line_id", "stop_sequence", "direction", "destination"], axis=1)
stop_sequencing = stop_sequencing.drop_duplicates()
stop_sequencing

Unnamed: 0,stop_num,stop_route_data
169,226,"[1, 1.0, inbound, Shanard Road]"
49962,381,"[1, 40.0, inbound, Shanard Road], [47, 1.5, i..."
50402,7740,"[1, 20.0, inbound, Shanard Road], [47, 1.45, ..."
50842,7741,"[1, 13.33, inbound, Shanard Road], [47, 1.41,..."
51282,387,"[1, 10.0, inbound, Shanard Road], [47, 1.37, ..."
...,...,...
1821462,4257,"[84, 1.07, outbound, Newcastle]"
1821556,4258,"[84, 1.06, outbound, Newcastle]"
1821650,4259,"[84, 1.05, outbound, Newcastle]"
1821744,7272,"[84, 1.03, outbound, Newcastle]"
