# Creating Segmented Data - Part 3

## This is the continuity of NumpyCreatingSegmentFile2.ipynb

> **The following analysis will be performed in this book**:
* Adding City Center Flag (Oisin's notebook)
* Adding Kalman Filter Coefficient (Oisin's notebook)
* Adding Student Holidays Flag (Oisin's notebook)
* Adding distance between stops (Isaac's Notebook)
* Prepare the final Segmented data

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

In [2]:
# Read weather data into dataframe

URI="localhost"
PORT="5433"
DB="jetaDb"
PORT="5433"
USER="postgres"
PASSWORD='00001234'

engine = create_engine("postgresql://{}:{}@{}:{}/{}".format(USER,PASSWORD,URI,PORT,DB), echo=True)

In [3]:
# Reading the segments file
df = pd.read_csv("Sampled_Data_actualTime.csv")

In [4]:
df.head()

Unnamed: 0,actualtime_arr,is_school_holiday,traveltime,segments,dayofweek,rain,temp
0,79907,0,22.0,1000_1001,Tuesday,0.0,2.9
1,64277,0,160.5,1000_1001,Monday,0.0,9.3
2,63248,0,36.0,1000_1001,Saturday,0.0,12.8
3,27858,0,59.0,1000_1001,Wednesday,0.0,8.1
4,56554,1,79.0,1000_1001,Tuesday,0.0,17.2


In [5]:
# Since our df is too big, we will prepare a test df to run some basic tests
df_test = df.head()

> **Getting back the from stop and to stop columns**

In [6]:
# Test for from stop
df_test['segments'].str.extract("(.*?)_")

  


0    1000
1    1000
2    1000
3    1000
4    1000
Name: segments, dtype: object

Test succeeded

In [7]:
# Test for to stop
df_test['segments'].str.extract(".*?_(\d+)")

  


0    1001
1    1001
2    1001
3    1001
4    1001
Name: segments, dtype: object

Test Succeeded

In [8]:
# Getting back the from stop and to stop columns
df['from_stop'] = df['segments'].str.extract("(.*?)_")
df["to_stop"] = df['segments'].str.extract(".*?_(\d+)")

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
df.head()

Unnamed: 0,actualtime_arr,is_school_holiday,traveltime,segments,dayofweek,rain,temp,from_stop,to_stop
0,79907,0,22.0,1000_1001,Tuesday,0.0,2.9,1000,1001
1,64277,0,160.5,1000_1001,Monday,0.0,9.3,1000,1001
2,63248,0,36.0,1000_1001,Saturday,0.0,12.8,1000,1001
3,27858,0,59.0,1000_1001,Wednesday,0.0,8.1,1000,1001
4,56554,1,79.0,1000_1001,Tuesday,0.0,17.2,1000,1001


We have split up the stops and now moving on to adding features

### STEP 1: <font color="#2196F3">Adding necessary flag</font>

> **City Center**: Split bus stops (city centre and suburbs)

In [10]:
# First we will get all the stops from the database
sql = "SELECT * FROM main_stops"

stops = pd.read_sql(sql, engine)

2018-07-28 11:45:46,206 INFO sqlalchemy.engine.base.Engine select version()
2018-07-28 11:45:46,207 INFO sqlalchemy.engine.base.Engine {}
2018-07-28 11:45:46,236 INFO sqlalchemy.engine.base.Engine select current_schema()
2018-07-28 11:45:46,237 INFO sqlalchemy.engine.base.Engine {}
2018-07-28 11:45:46,267 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2018-07-28 11:45:46,268 INFO sqlalchemy.engine.base.Engine {}
2018-07-28 11:45:46,281 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2018-07-28 11:45:46,282 INFO sqlalchemy.engine.base.Engine {}
2018-07-28 11:45:46,313 INFO sqlalchemy.engine.base.Engine show standard_conforming_strings
2018-07-28 11:45:46,314 INFO sqlalchemy.engine.base.Engine {}
2018-07-28 11:45:46,342 INFO sqlalchemy.engine.base.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
20

In [11]:
stops.head(5)
# Add city centre column, 0 will mean NOT in city centre
stops['city_centre'] = 0
stops.head(5)

Unnamed: 0,stopid,address,lat,lng,lines,city_centre
0,2,"Rotunda, Parnell Square West",53.352241,-6.263695,"[38B, 38D, 38, 38A, 46A, 46E]",0
1,3,"Rotunda, Granby Place",53.352307,-6.263783,"[122, 120]",0
2,4,"Rotunda, Rotunda Hospital",53.352567,-6.264166,"[7, 7B, 9, 7D, 7A]",0
3,6,"Rotunda, Saint Martin's Chapel",53.352744,-6.264443,[4],0
4,7,"Rotunda, Rotunda Hospital",53.352836,-6.264562,"[40, 140, 40D, 40B, 13]",0


In [12]:
# A rectangle is chosen that contains all bus stops in the city centre, 
# roughly drawn between the Royal and Grand canals, coordinats are:

bottom_left = [53.331986, -6.292722]
bottom_right = [53.334034, -6.245183]

top_left = [53.361551, -6.298236]
top_right = [53.361860, -6.242419]

In [13]:
for index, row in stops.iterrows():
    if (row['lat'] > bottom_left[0] and row['lat'] < top_left[0]) and (row['lng'] < bottom_right[1] and row['lng'] > top_left[1]):
        stops.set_value(index, 'city_centre', 1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
stops.head()

Unnamed: 0,stopid,address,lat,lng,lines,city_centre
0,2,"Rotunda, Parnell Square West",53.352241,-6.263695,"[38B, 38D, 38, 38A, 46A, 46E]",1
1,3,"Rotunda, Granby Place",53.352307,-6.263783,"[122, 120]",1
2,4,"Rotunda, Rotunda Hospital",53.352567,-6.264166,"[7, 7B, 9, 7D, 7A]",1
3,6,"Rotunda, Saint Martin's Chapel",53.352744,-6.264443,[4],1
4,7,"Rotunda, Rotunda Hospital",53.352836,-6.264562,"[40, 140, 40D, 40B, 13]",1


The above is the list of all the city center stops. We will now join our segmented dataframe with this

In [15]:
centre_stops = stops['city_centre'] == 1
centre_stops = stops[centre_stops]
center_stops_list = centre_stops['stopid'].tolist()

In [16]:
type(center_stops_list)

list

In [17]:
df.dtypes

actualtime_arr         int64
is_school_holiday      int64
traveltime           float64
segments              object
dayofweek             object
rain                 float64
temp                 float64
from_stop             object
to_stop               object
dtype: object

In [18]:
df['from_stop'] = df['from_stop'].astype("int")
df['to_stop'] = df['to_stop'].astype("int")

In [19]:
def f(row):
    if (row['from_stop'] in center_stops_list) or (row['to_stop'] in center_stops_list):
        val = 1
    else:
        val = 0
    return val

In [20]:
df['is_citycenter']=0

In [21]:
df.loc[df.from_stop.isin(center_stops_list),'is_citycenter']=1
df.loc[df.to_stop.isin(center_stops_list),'is_citycenter']=1

In [22]:
df['is_citycenter'] = df['is_citycenter'].astype("int")

In [23]:
df[(df['from_stop']==100)&(df['to_stop']==307)].head()

Unnamed: 0,actualtime_arr,is_school_holiday,traveltime,segments,dayofweek,rain,temp,from_stop,to_stop,is_citycenter
43009,49458,0,160.5,100_307,Friday,0.0,24.3,100,307,1


In [24]:
stops[stops['stopid']==100]

Unnamed: 0,stopid,address,lat,lng,lines,city_centre
82,100,Wellington Lane (Wellington Green,53.30308,-6.322363,"[150, 54A]",0


In [25]:
stops[stops['stopid']==307]

Unnamed: 0,stopid,address,lat,lng,lines,city_centre
241,307,"Warrenmount, O'Donovan Road",53.334581,-6.280072,[150],1


> **Distance**: Split bus stops (city centre and suburbs)

In [26]:
# We will need the Stop times file from the GTFS data here
stop_times = pd.read_csv("GTFS/google_transit_dublinbus/stop_times.txt")

In [27]:
stop_times.head(5)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,8954.y102p.60-1-d12-1.1.O,12:00:00,12:00:00,8240DB000226,1,Sandymount,0,0,0.0
1,8954.y102p.60-1-d12-1.1.O,12:00:54,12:00:54,8240DB000228,2,Sandymount,0,0,261.136188
2,8954.y102p.60-1-d12-1.1.O,12:01:39,12:01:39,8240DB000229,3,Sandymount,0,0,484.925289
3,8954.y102p.60-1-d12-1.1.O,12:02:53,12:02:53,8240DB000227,4,Sandymount,0,0,836.995679
4,8954.y102p.60-1-d12-1.1.O,12:03:38,12:03:38,8240DB000230,5,Sandymount,0,0,1066.461783


In [28]:
stop_times['lineid'] = stop_times['trip_id'].apply(lambda x: (x.split("-"))[1])

In [29]:
def extract_stopid(x):
    stopid = []
    for i in range(1, len(x)):
        try:
            stopid.insert(0, int(x[-i]))
        except:
            break
    return int(''.join(map(str, stopid)))

stop_times['stopid'] = stop_times['stop_id'].apply(lambda x: extract_stopid(x))

In [30]:
stop_times[stop_times['stopid']==307]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,lineid,stopid
164521,4037.y102q.60-150-d12-1.118.I,20:45:45,20:45:45,8220DB000307,30,Hawkins Street,0,0,7601.869157,150,307
164559,4055.y102q.60-150-d12-1.118.I,21:35:45,21:35:45,8220DB000307,30,Hawkins Street,0,0,7601.869157,150,307
164597,4057.y102q.60-150-d12-1.118.I,23:05:45,23:05:45,8220DB000307,30,Hawkins Street,0,0,7601.869157,150,307
164635,4097.y102q.60-150-d12-1.118.I,22:05:45,22:05:45,8220DB000307,30,Hawkins Street,0,0,7601.869157,150,307
164673,4099.y102q.60-150-d12-1.118.I,23:35:45,23:35:45,8220DB000307,30,Hawkins Street,0,0,7601.869157,150,307
164711,4105.y102q.60-150-d12-1.118.I,21:10:45,21:10:45,8220DB000307,30,Hawkins Street,0,0,7601.869157,150,307
164749,4107.y102q.60-150-d12-1.118.I,22:35:45,22:35:45,8220DB000307,30,Hawkins Street,0,0,7601.869157,150,307
164787,11937.y102p.60-150-d12-1.118.I,13:03:33,13:03:33,8220DB000307,30,Hawkins Street,0,0,7601.869157,150,307
164825,11939.y102p.60-150-d12-1.118.I,14:43:33,14:43:33,8220DB000307,30,Hawkins Street,0,0,7601.869157,150,307
164863,11954.y102p.60-150-d12-1.118.I,12:23:33,12:23:33,8220DB000307,30,Hawkins Street,0,0,7601.869157,150,307


In [31]:
distances = stop_times[['lineid', 'stopid', 'stop_sequence', 'shape_dist_traveled']]
distances = distances.drop_duplicates(['lineid', 'stop_sequence'])
stopid_list = distances['stopid'].tolist()
stopid_list = list(map(str, stopid_list))
journey_segments = [ '_'.join(x) for x in zip(stopid_list[0:], stopid_list[1:])]
distances_list = distances['shape_dist_traveled'].tolist()
distances_segments = [ (x[1] - x[0]) for x in zip(distances_list[0:], distances_list[1:])]
distance_df = pd.DataFrame({'segments': journey_segments, 'distance': distances_segments})
distance_df = distance_df[['segments', 'distance']]
distance_df

Unnamed: 0,segments,distance
0,226_228,261.136188
1,228_229,223.789101
2,229_227,352.070390
3,227_230,229.466104
4,230_231,292.654153
5,231_1641,468.890293
6,1641_1642,293.773927
7,1642_213,287.229230
8,213_214,263.282842
9,214_4432,276.040589


In [32]:
# distance_df[distance_df['segment']=='100_307']

In [33]:
# df.head()

In [34]:
df_dist = pd.merge(df,distance_df,how="inner")

In [35]:
# final_segments = pd.unique(df_dist['segments'].values.ravel('K'))

In [36]:
# distance_segments = pd.unique(distance_df['segments'].values.ravel('K'))

In [37]:
# type(final_segments)

In [38]:
#np.setdiff1d(distance_segments,final_segments)

In [45]:
df_dist_new = pd.merge(df,distance_df,how="inner")

In [46]:
df_dist_new.head()

Unnamed: 0,actualtime_arr,is_school_holiday,traveltime,segments,dayofweek,rain,temp,from_stop,to_stop,is_citycenter,distance
0,79907,0,22.0,1000_1001,Tuesday,0.0,2.9,1000,1001,0,202.536493
1,64277,0,160.5,1000_1001,Monday,0.0,9.3,1000,1001,0,202.536493
2,63248,0,36.0,1000_1001,Saturday,0.0,12.8,1000,1001,0,202.536493
3,27858,0,59.0,1000_1001,Wednesday,0.0,8.1,1000,1001,0,202.536493
4,56554,1,79.0,1000_1001,Tuesday,0.0,17.2,1000,1001,0,202.536493


> The above cases are bad data and are not present for the final model

In [39]:
from pykalman import KalmanFilter

In [40]:
# Code adapted from https://www.quantopian.com/posts/quantopian-lecture-series-kalman-filters
# The Kalman filter
kf = KalmanFilter(transition_matrices = [1],
                  observation_matrices = [1],
                  initial_state_mean = 0,
                  initial_state_covariance = 1,
                  observation_covariance=1,
                  transition_covariance=.01)

# Use the observed values of the price to get a rolling mean
state_means, _ = kf.filter(df_dist.traveltime.values)
state_means = pd.Series(state_means.flatten(), index=df_dist.index)

In [47]:
df_dist_new = pd.concat([df_dist_new, state_means], axis=1)

In [48]:
df_dist_new.head()

Unnamed: 0,actualtime_arr,is_school_holiday,traveltime,segments,dayofweek,rain,temp,from_stop,to_stop,is_citycenter,distance,0
0,79907,0,22.0,1000_1001,Tuesday,0.0,2.9,1000,1001,0,202.536493,11.0
1,64277,0,160.5,1000_1001,Monday,0.0,9.3,1000,1001,0,202.536493,61.493377
2,63248,0,36.0,1000_1001,Saturday,0.0,12.8,1000,1001,0,202.536493,54.915532
3,27858,0,59.0,1000_1001,Wednesday,0.0,8.1,1000,1001,0,202.536493,55.778866
4,56554,1,79.0,1000_1001,Tuesday,0.0,17.2,1000,1001,0,202.536493,59.987633


In [50]:
df_dist_new.columns = ['actualtime_arr','is_school_holiday','traveltime','segments','dayofweek','rain','temp','from_stop','to_stop','is_citycenter','distance', 'kalman_time']

In [51]:
df_dist_new.head()

Unnamed: 0,actualtime_arr,is_school_holiday,traveltime,segments,dayofweek,rain,temp,from_stop,to_stop,is_citycenter,distance,kalman_time
0,79907,0,22.0,1000_1001,Tuesday,0.0,2.9,1000,1001,0,202.536493,11.0
1,64277,0,160.5,1000_1001,Monday,0.0,9.3,1000,1001,0,202.536493,61.493377
2,63248,0,36.0,1000_1001,Saturday,0.0,12.8,1000,1001,0,202.536493,54.915532
3,27858,0,59.0,1000_1001,Wednesday,0.0,8.1,1000,1001,0,202.536493,55.778866
4,56554,1,79.0,1000_1001,Tuesday,0.0,17.2,1000,1001,0,202.536493,59.987633


In [None]:
# df_final = pd.concat([df_dist_new, state_means], axis=1)

In [52]:
df_dist_new.drop(['from_stop','to_stop','temp'],axis=1,inplace=True)

In [53]:
df_sampled_merged = df_dist_new[['segments','actualtime_arr','dayofweek','rain','is_school_holiday','is_citycenter','distance','kalman_time','traveltime']]

In [55]:
for i, val in df_sampled_merged.groupby('segments'):
     val.to_csv('SegmentedSamples2_actual_final\\{}.csv'.format(i),header=False,index_label=False,index=False)

In [None]:
# # Raw data
# df_dist.plot(kind='scatter', x='actualtime_arr', y='traveltime')
# x = df_dist['actualtime_arr']
# y = df_dist['traveltime']

# plt.plot(x, y, 'o', color='blue');

In [56]:
df_dist_new.to_csv('full_segments.csv')