# Creating the Segments files for the model to predict travel time - Part 1

### We are using Numpy to vectorize pandas functions on the large dataframes in order to improve performance

#### Reading the relevant Datasets

In [1]:
import pandas as pd
import json

## Reading the Trips File

In [2]:
# Load the per-column dtypes saved alongside the trips data, then read the complete trips CSV with them
with open('rt_trips_type.json') as fin:
    column_type = json.load(fin)
# Read the file and immediately discard the columns that are not kept for the rest of the notebook
df_trips = pd.read_csv('rt_trips_full.csv', dtype=column_type).drop(
    ['datasource','basin','tenderlot','suppressed','justificationid','lastupdate','note'],
    axis=1,
)
df_trips.head(5)

Unnamed: 0,dayofservice,tripid,lineid,routeid,direction,plannedtime_arr,plannedtime_dep,actualtime_arr,actualtime_dep
0,09-FEB-16 00:00:00,2824642,145,145_105,2,45668,40800,45375.0,40782.0
1,09-FEB-16 00:00:00,2826717,9,9_7,2,65553,60600,66258.0,60582.0
2,09-FEB-16 00:00:00,2826730,54A,54A_12,2,65951,62100,66053.0,62078.0
3,09-FEB-16 00:00:00,2826743,7,7_51,1,54763,50400,54854.0,50383.0
4,09-FEB-16 00:00:00,2812908,39,39_20,1,27375,22920,27318.0,22931.0


### The trips file gives us all the information of the journey from depot to depot

<hr>

## Reading the Leave times File

In [3]:
# Load the per-column dtypes saved alongside the leave-times data, then read the complete CSV with them
with open('rt_leavetimes_type.json') as fin:
    column_type = json.load(fin)
# Read the file and immediately discard the columns that are not kept for the rest of the notebook
df_leavetimes = pd.read_csv('rt_leaveTime_full.csv', dtype=column_type).drop(
    ['datasource','vehicleid','passengers','passengersin','passengersout','distance','suppressed','justificationid','lastupdate','note'],
    axis=1,
)
df_leavetimes.head(5)

Unnamed: 0,dayofservice,tripid,progrnumber,stoppointid,plannedtime_arr,plannedtime_dep,actualtime_arr,actualtime_dep
0,31-JAN-16 00:00:00,2811044,63,3390,41820,41820,42920,42920
1,31-JAN-16 00:00:00,2820852,1,7158,69300,69300,69320,69320
2,31-JAN-16 00:00:00,2820852,5,7017,69498,69498,69532,69532
3,31-JAN-16 00:00:00,2820852,10,1893,69711,69711,69858,69858
4,31-JAN-16 00:00:00,2820853,16,1648,76822,76822,76636,76657


### Leave times provides a much more in-depth, segmented journey log from stop to stop

<hr>

> **We don't need the other time values; we will use the actual time of arrival for the following two reasons:**
* Actual time is the real time when the bus reaches the bus stop
* It takes care of dwell time and traffic between two stops

In [4]:
# Keep only the actual arrival time; the planned times and the actual departure time are removed
df_leavetimes = df_leavetimes.drop(
    ['plannedtime_dep','plannedtime_arr','actualtime_dep'],
    axis=1,
)

<hr>

## Preparing data

>### <font color="#9C27B0">Get the route values for the stops</font>

In [6]:
import os
from sqlalchemy import create_engine
import pandas as pd

# The connection URL contains the database password, so it is read from the environment
# instead of being stored in the notebook.
# Expected form: postgresql+psycopg2://<user>:<password>@<host>:<port>/<database>
db_url = os.environ.get('JETA_DB_URL')
if not db_url:
    # Fail early with an actionable message rather than with an opaque connection error later on
    raise RuntimeError('Set the JETA_DB_URL environment variable to the SQLAlchemy URL of the jetaDb database')
engine = create_engine(db_url)

In [7]:
# Getting the list of segments and routes with the segments
# Every route row carries its stops in 'stopids'; each pair of consecutive stops is one segment.
df_routes_for_segment = pd.read_sql_query("select * from main_routes;",engine)
segmentids = []
routeids = []
segmentnos = []
from_stops=[]
to_stops=[]
for routeid, stopids in zip(df_routes_for_segment['routeid'], df_routes_for_segment['stopids']):
    # Pair each stop with its successor; a route with fewer than two stops produces no segment.
    # Segment numbers start at 1 and follow the position of the segment within the route.
    for segmentno, (from_stop, to_stop) in enumerate(zip(stopids, stopids[1:]), start=1):
        from_stop = str(from_stop)
        to_stop = str(to_stop)
        segmentids.append(from_stop+"_"+to_stop)
        segmentnos.append(segmentno)
        routeids.append(routeid)
        from_stops.append(from_stop)
        to_stops.append(to_stop)
df_journey = pd.DataFrame({'routeid':routeids,'segmentno':segmentnos,'segmentid':segmentids,'from_stop':from_stops,'to_stop':to_stops})
# Take a real copy: a plain assignment would only create a second name for the same frame,
# so any in-place change to df_journey (e.g. dropping duplicates) would also alter df_journey_old.
df_journey_old = df_journey.copy()

> ### <font color="#9C27B0">Drop all duplicate segments</font> [Queuing Theory]

In [8]:
# Dropping all the duplicate segments -> Queueing Theory
# Only the first occurrence of each segmentid is kept; the remaining segments are ordered by route
df_journey.drop_duplicates(subset='segmentid', keep='first', inplace=True)
df_journey.sort_values(by='routeid', inplace=True)
df_journey.reset_index(drop=True, inplace=True)
# Unique (tripid, routeid) pairs taken from the trips data
fin_df = df_trips.loc[:, ['tripid','routeid']].drop_duplicates()
# Order the stop-level records by trip and then by actual arrival time
sdf_stops_ordered = df_leavetimes.sort_values(by=['tripid','actualtime_arr'])

> **The following steps have been performed in the above step:**
* Drop duplicate segments [Queuing Theory]
* Sort the segments by routeid, and the stop records by tripid and actual time of arrival

In [9]:
# Spot-check the ordering on the rows whose dayofservice contains 'APR-17'
sdf_stops_ordered.loc[lambda stops: stops['dayofservice'].str.contains('APR-17')]
# To display more rows, uncomment: pd.options.display.max_rows=999

Unnamed: 0,dayofservice,tripid,progrnumber,stoppointid,actualtime_arr
71780913,05-APR-17 00:00:00,4519041,1,6282,53996
61154180,03-APR-17 00:00:00,4519041,1,6282,54005
64356579,04-APR-17 00:00:00,4519041,1,6282,54005
62594626,06-APR-17 00:00:00,4519041,1,6282,54010
71791173,05-APR-17 00:00:00,4519041,2,6335,54024
61154181,03-APR-17 00:00:00,4519041,2,6335,54028
64356580,04-APR-17 00:00:00,4519041,2,6335,54035
62606215,06-APR-17 00:00:00,4519041,2,6335,54043
71802761,05-APR-17 00:00:00,4519041,3,6326,54044
61154182,03-APR-17 00:00:00,4519041,3,6326,54055


> The data is sorted

<hr>

> **<font color="#f44336">Using Numpy to get the to_stop time and the to_stop of the segment</font>**

In [14]:
import numpy as np

In [15]:
# Within each (tripid, dayofservice) group, give every row the stoppointid of the row that follows it.
# np.roll is circular, so the last row of a group receives the stoppointid of the group's first row.
stop_ids_per_trip_day = sdf_stops_ordered.groupby(['tripid','dayofservice'])['stoppointid']
sdf_stops_ordered['to_stop'] = stop_ids_per_trip_day.transform(np.roll, shift=-1)

In [17]:
# Within each (tripid, dayofservice) group, give every row the actualtime_arr of the row that follows it.
# np.roll is circular, so the last row of a group receives the arrival time of the group's first row.
arrival_times_per_trip_day = sdf_stops_ordered.groupby(['tripid','dayofservice'])['actualtime_arr']
sdf_stops_ordered['to_stop_time'] = arrival_times_per_trip_day.transform(np.roll, shift=-1)

In [18]:
# Persist the stop records together with their to_stop / to_stop_time columns; the index is not written
sdf_stops_ordered.to_csv(path_or_buf='final_stops_model_v5.csv', index=False)

**This is the end of this workbook because the computer runs out of memory later on. Refer to NumpyCreatingSegmentFile2.ipynb for more.**

*************************************************************************************************************************