# Creating the segmented model files

## 1. Reading both the files

In [1]:
import pandas as pd
import json

In [2]:
# Reading the complete trips file
with open('rt_trips_type.json') as fin:
    column_type = json.load(fin)
df_trips = pd.read_csv('rt_trips_full.csv', dtype=column_type)
df_trips.drop(['datasource','basin','tenderlot','suppressed','justificationid','lastupdate','note'],axis=1, inplace=True)
df_trips.head(5)

Unnamed: 0,dayofservice,tripid,lineid,routeid,direction,plannedtime_arr,plannedtime_dep,actualtime_arr,actualtime_dep
0,09-FEB-16 00:00:00,2824642,145,145_105,2,45668,40800,45375.0,40782.0
1,09-FEB-16 00:00:00,2826717,9,9_7,2,65553,60600,66258.0,60582.0
2,09-FEB-16 00:00:00,2826730,54A,54A_12,2,65951,62100,66053.0,62078.0
3,09-FEB-16 00:00:00,2826743,7,7_51,1,54763,50400,54854.0,50383.0
4,09-FEB-16 00:00:00,2812908,39,39_20,1,27375,22920,27318.0,22931.0


In [3]:
# Reading the complete leavetimes file
with open('rt_leavetimes_type.json') as fin:
    column_type = json.load(fin)
df_leavetimes = pd.read_csv('rt_leaveTime_full.csv', dtype=column_type)
df_leavetimes.drop(['datasource','vehicleid','passengers','passengersin','passengersout','distance','suppressed','justificationid','lastupdate','note'],axis=1, inplace=True)
df_leavetimes.head(5)

Unnamed: 0,dayofservice,tripid,progrnumber,stoppointid,plannedtime_arr,plannedtime_dep,actualtime_arr,actualtime_dep
0,31-JAN-16 00:00:00,2811044,63,3390,41820,41820,42920,42920
1,31-JAN-16 00:00:00,2820852,1,7158,69300,69300,69320,69320
2,31-JAN-16 00:00:00,2820852,5,7017,69498,69498,69532,69532
3,31-JAN-16 00:00:00,2820852,10,1893,69711,69711,69858,69858
4,31-JAN-16 00:00:00,2820853,16,1648,76822,76822,76636,76657


In [4]:
df_leavetimes.drop(['plannedtime_dep','actualtime_arr','actualtime_dep'],axis=1,inplace=True)

In [33]:
column_type

{'group': 'uint16',
 'datasource': 'category',
 'dayofservice': 'category',
 'tripid': 'uint32',
 'progrnumber': 'uint8',
 'stoppointid': 'uint16',
 'plannedtime_arr': 'uint32',
 'plannedtime_dep': 'uint32',
 'actualtime_arr': 'uint32',
 'actualtime_dep': 'uint32',
 'vehicleid': 'uint32',
 'passengers': 'float32',
 'passengersin': 'float32',
 'passengersout': 'float32',
 'distance': 'float32',
 'suppressed': 'float32',
 'justificationid': 'float32',
 'lastupdate': 'category',
 'note': 'float32'}

In [5]:
df_trips.dtypes

dayofservice       category
tripid               uint32
lineid             category
routeid            category
direction             uint8
plannedtime_arr      uint32
plannedtime_dep      uint32
actualtime_arr      float32
actualtime_dep      float32
dtype: object

In [6]:
df_leavetimes.dtypes

dayofservice       category
tripid               uint32
progrnumber           uint8
stoppointid          uint16
plannedtime_arr      uint32
dtype: object

## 2. Adding day of week in the leavetimes file

In [7]:
# Adding cython to improve the performance
%load_ext Cython

In [8]:
# df_leavetimes['date'] = pd.to_datetime(df_leavetimes['dayofservice'])
# df_leavetimes['dayOfWeek'] = df_leavetimes['date'].dt.weekday_name

In [9]:
df_leavetimes.drop(['progrnumber'],axis=1,inplace=True)

## 3. Add weather information

In [10]:
#This will be added in the sample created

## 4. Creating segment data

In [11]:
from sqlalchemy import create_engine
import pandas as pd
engine = create_engine('postgresql+psycopg2://postgres:00001234@localhost:5433/jetaDb')

In [13]:
# Getting the list of segments and routes with the segments
df_routes_for_segment = pd.read_sql_query("select * from main_routes;",engine)
segmentids = []
routeids = []
segmentnos = []
from_stops=[]
to_stops=[]
for i, rows in df_routes_for_segment.iterrows():
    for j in range(len(rows['stopids'])):
        if j < (len(rows['stopids'])-1):
            segment = str(rows['stopids'][j])+"_"+str(rows['stopids'][j+1])
            from_stop = str(rows['stopids'][j])
            to_stop = str(rows['stopids'][j+1])
            segmentids.append(segment)
            segmentnos.append(j+1)
            routeids.append(rows['routeid'])
            from_stops.append(from_stop)
            to_stops.append(to_stop)
        else:
            break
df_journey = pd.DataFrame({'routeid':routeids,'segmentno':segmentnos,'segmentid':segmentids,'from_stop':from_stops,'to_stop':to_stops})

In [14]:
df_journey_old = df_journey

In [15]:
# Dropping all the duplicate segments -> Queueing Theory
df_journey.drop_duplicates(subset='segmentid',keep='first',inplace=True)

In [16]:
df_journey.reset_index(inplace=True,drop=True)

In [17]:
df_journey.sort_values(by='routeid', inplace=True)

In [18]:
df_journey.reset_index(inplace=True,drop=True)

In [19]:
len(df_journey)

9096

In [20]:
fin_df = df_trips[['tripid','routeid']].drop_duplicates()

In [21]:
df_journey

Unnamed: 0,routeid,segmentno,segmentid,from_stop,to_stop
0,102_10,1,1073_3641,1073,3641
1,102_10,30,953_4381,953,4381
2,102_10,29,952_953,952,953
3,102_10,27,928_951,928,951
4,102_10,26,949_928,949,928
5,102_10,25,948_949,948,949
6,102_10,24,947_948,947,948
7,102_10,23,945_947,945,947
8,102_10,22,944_945,944,945
9,102_10,21,4465_944,4465,944


In [22]:
df_leavetimes.dtypes

dayofservice       category
tripid               uint32
stoppointid          uint16
plannedtime_arr      uint32
dtype: object

In [23]:
sdf_stops_ordered = df_leavetimes
sdf_stops_ordered = sdf_stops_ordered.sort_values(by=['tripid','plannedtime_arr'])
# Faster way to sort
#a = df.values
# In [12]: a.sort(axis=1)  #

### Creating From Stops Dataframe

In [24]:
# We need tripid, dayofservice, stopid, arrivaltime
# Getting routes from df_trips that are there in journey table
route_trips_unq = fin_df[fin_df['routeid'].values.isin(df_journey['routeid'].values)]['tripid'].values

In [32]:
df_journey

Unnamed: 0,routeid,segmentno,segmentid,from_stop,to_stop
0,102_10,1,1073_3641,1073,3641
1,102_10,30,953_4381,953,4381
2,102_10,29,952_953,952,953
3,102_10,27,928_951,928,951
4,102_10,26,949_928,949,928
5,102_10,25,948_949,948,949
6,102_10,24,947_948,947,948
7,102_10,23,945_947,945,947
8,102_10,22,944_945,944,945
9,102_10,21,4465_944,4465,944


In [25]:
# Once we have the route we can extract information for those routes from the leavetimes table
# Get arrivaltime of buses for a particular trip on from stops
# from_stops_arrival_time = sdf_stops_ordered[sdf_stops_ordered['tripid'].isin(route_trips_unq) & sdf_stops_ordered['stoppointid'].isin(df_journey['from_stop'].values)].values.tolist()
# # type(from_stops_arrival_time)
# columns_fs = ['dayOfService','tripid','from_stop','time_at_1']
# fs_df = pd.DataFrame(from_stops_arrival_time, columns=columns_fs)

In [26]:
# fs_df.to_csv('from_stops.csv',index=False)

In [27]:
# len(fs_df)

In [28]:
to_stops_arrival_time = sdf_stops_ordered[sdf_stops_ordered['tripid'].isin(route_trips_unq) & sdf_stops_ordered['stoppointid'].isin(df_journey['to_stop'].values)].values.tolist()

In [29]:
len(to_stops_arrival_time)

74619379

In [30]:
columns_ts = ['dayOfService','tripid','to_stop','time_at_2']
ts_df = pd.DataFrame(to_stops_arrival_time, columns=columns_ts)

In [31]:
ts_df.to_csv('to_stops.csv',index=False)

In [110]:
# route_trips_unq = []
# segments = df_journey['segmentid'].values.tolist()
# # from_stops = df_journey['from_stop'].values
# route_trips_unq = fin_df[fin_df['routeid'].values.isin(df_journey['routeid'].values)]['tripid'].values
# from_stop_list = sdf_stops_ordered[sdf_stops_ordered[sdf_stops_ordered['stoppointid'].isin(df_journey['from_stop'].values)]['tripid'].values.isin(fin_df[fin_df['routeid'].values.isin(df_journey['routeid'].values)]['tripid'].values)]['plannedtime_arr'].values


In [109]:
# from_stops_list = sdf_stops_ordered[sdf_stops_ordered[sdf_stops_ordered['stoppointid'].isin(df_journey['from_stop'].values)]['tripid'].isin(fin_df[fin_df['routeid'].values.isin(df_journey['routeid'].values)]['tripid'].values)]['plannedtime_arr'].values.tolist()
# # test_list = [sdf_stops_ordered['stoppointid'].values.isin(df_journey['from_stop'].values)]['plannedtime_arr'].values

In [112]:
sdf_stops_ordered.dtypes

dayofservice       category
tripid               uint32
stoppointid          uint16
plannedtime_arr      uint32
dtype: object

In [114]:
from_stop_tripids = sdf_stops_ordered[sdf_stops_ordered['stoppointid'].isin(df_journey['from_stop'].values)]['tripid'].values
to_stop_tripids = sdf_stops_ordered[sdf_stops_ordered['stoppointid'].isin(df_journey['to_stop'].values)]['tripid'].values

In [None]:
from_stop_time = sdf_stops_ordered[sdf_stops_ordered['stoppointid'].isin(df_journey['from_stop'].values)].values
to_stop_time = sdf_stops_ordered[sdf_stops_ordered['stoppointid'].isin(df_journey['to_stop'].values)].values

In [104]:
type(sdf_stops_ordered)

pandas.core.frame.DataFrame

In [101]:
print(np.__version__)

1.14.5


In [None]:
from_stop_list = sdf_stops_ordered['stoppointid'].isin.values()

In [57]:
routeUnq = generateSegments(df_journey,fin_df)

In [59]:
len(routeUnq)

361425