# Creating the Segments files for the model to predict travel time - Part 2

### We are using Numpy to vectorize pandas functions on the large dataframes in order to improve performance

## This workbook is continued from NumpyCreatingSegmentFile.ipynb

#### Reading the file generated by Part 1

In [1]:
columntype={
    "dayofservice":"category",
    "tripid":"uint32",
    "progrnumber":"uint8",
    "stoppointid":"category",
    "plannedtime_arr":"int32",
    "to_stop":"category",
    "to_stop_time":"int32"
}

In [2]:
import pandas as pd
import numpy as np
df = pd.read_csv('final_stops_model_v5.csv',dtype=columntype)

In [3]:
# df[df['dayofservice'].str.contains('JUN')]
df.head()

Unnamed: 0,dayofservice,tripid,progrnumber,stoppointid,actualtime_arr,to_stop_time,to_stop
0,02-JAN-16 00:00:00,2760214,1,324,79054,79333,1635
1,02-JAN-16 00:00:00,2760214,3,1635,79333,79367,1636
2,02-JAN-16 00:00:00,2760214,4,1636,79367,79435,1637
3,02-JAN-16 00:00:00,2760214,5,1637,79435,79462,1638
4,02-JAN-16 00:00:00,2760214,6,1638,79462,79544,992


> **Adding school holidays**

In [4]:
# Converting school holidays as per the day of service provided in the data
school_holidays = ['2016-01-01','2016-01-04','2016-01-05','2016-02-15','2016-02-16','2016-02-17','2016-02-18','2016-02-19','2016-02-19','2016-03-16','2016-03-17','2016-03-18','2016-03-21','2016-03-22','2016-03-23','2016-03-24','2016-03-25','2016-03-28','2016-03-29','2016-03-30','2016-03-31','2016-04-01','2016-06-06','2016-06-07','2016-06-08','2016-06-09','2016-06-10','2016-06-13','2016-06-14','2016-06-15','2016-06-16','2016-06-17','2016-06-20','2016-06-21','2016-06-22','2016-06-23','2016-06-24','2016-06-27','2016-06-28','2016-06-29','2016-06-30','2017-01-02','2017-01-03','2017-01-04','2017-01-05','2017-01-06','2017-02-20','2017-02-21','2017-02-22','2017-02-23','2017-02-24','2017-04-09','2017-04-10','2017-04-11','2017-04-12','2017-04-13','2017-04-14','2017-04-16','2017-04-17','2017-04-18','2017-04-19','2017-04-20','2017-04-21','2017-06-05','2017-06-06','2017-06-07','2017-06-08','2017-06-09','2017-06-12','2017-06-13','2017-06-14','2017-06-15','2017-06-16','2017-06-19','2017-06-20','2017-06-21','2017-06-22','2017-06-23','2017-06-26','2017-06-27','2017-06-28','2017-06-29','2017-06-30']
import datetime as dt
# %d-%b-%y %H:%M:%S
dates_list = [dt.datetime.strptime(date, '%Y-%m-%d').date() for date in school_holidays]
dates_list_2 = [date.strftime('%d-%b-%y %H:%M:%S').upper() for date in dates_list]

In [5]:
df.loc[df.dayofservice.isin(dates_list_2),'is_school_holiday'] = 1
df.loc[~df.dayofservice.isin(dates_list_2),'is_school_holiday'] = 0

In [6]:
df['is_school_holiday'] = df['is_school_holiday'].astype('int')

In [7]:
df[df['is_school_holiday']==1].head()

Unnamed: 0,dayofservice,tripid,progrnumber,stoppointid,actualtime_arr,to_stop_time,to_stop,is_school_holiday
315,01-JAN-16 00:00:00,2760234,1,324,71110,71468,1634,1
320,01-JAN-16 00:00:00,2760234,2,1634,71468,71477,1635,1
321,01-JAN-16 00:00:00,2760234,3,1635,71477,71543,1636,1
322,01-JAN-16 00:00:00,2760234,4,1636,71543,71580,1637,1
325,01-JAN-16 00:00:00,2760234,5,1637,71580,71601,1638,1


> **Calculating the travel time from the file below**

In [8]:
df['traveltime'] = df['to_stop_time'] - df['actualtime_arr']

In [9]:
df.head()

Unnamed: 0,dayofservice,tripid,progrnumber,stoppointid,actualtime_arr,to_stop_time,to_stop,is_school_holiday,traveltime
0,02-JAN-16 00:00:00,2760214,1,324,79054,79333,1635,0,279
1,02-JAN-16 00:00:00,2760214,3,1635,79333,79367,1636,0,34
2,02-JAN-16 00:00:00,2760214,4,1636,79367,79435,1637,0,68
3,02-JAN-16 00:00:00,2760214,5,1637,79435,79462,1638,0,27
4,02-JAN-16 00:00:00,2760214,6,1638,79462,79544,992,0,82


> **Removing all those values which are less than 0**

In [10]:
df = df[df['traveltime']>0]

> **Getting the segment names for the segments**

In [11]:
df['segments'] = df['stoppointid'].astype(str)+'_'+df['to_stop'].astype(str)

In [12]:
df.dtypes

dayofservice         category
tripid                 uint32
progrnumber             uint8
stoppointid          category
actualtime_arr          int64
to_stop_time            int32
to_stop              category
is_school_holiday       int32
traveltime              int64
segments               object
dtype: object

In [13]:
df['traveltime'].std()

96.91930172508243

In [14]:
df.head()

Unnamed: 0,dayofservice,tripid,progrnumber,stoppointid,actualtime_arr,to_stop_time,to_stop,is_school_holiday,traveltime,segments
0,02-JAN-16 00:00:00,2760214,1,324,79054,79333,1635,0,279,324_1635
1,02-JAN-16 00:00:00,2760214,3,1635,79333,79367,1636,0,34,1635_1636
2,02-JAN-16 00:00:00,2760214,4,1636,79367,79435,1637,0,68,1636_1637
3,02-JAN-16 00:00:00,2760214,5,1637,79435,79462,1638,0,27,1637_1638
4,02-JAN-16 00:00:00,2760214,6,1638,79462,79544,992,0,82,1638_992


> Dropping the unwanted columns

In [15]:
df.drop(['tripid','progrnumber','stoppointid','to_stop','to_stop_time'],axis=1,inplace=True)

In [18]:
df.head()

Unnamed: 0,dayofservice,actualtime_arr,is_school_holiday,traveltime,segments
0,02-JAN-16 00:00:00,79054,0,279,324_1635
1,02-JAN-16 00:00:00,79333,0,34,1635_1636
2,02-JAN-16 00:00:00,79367,0,68,1636_1637
3,02-JAN-16 00:00:00,79435,0,27,1637_1638
4,02-JAN-16 00:00:00,79462,0,82,1638_992


In [19]:
# df.to_csv('final_stops_model_v2.csv',index=False)

In [20]:
# sampling the data as per segment groups
# grouped = df.groupby('segments')

In [21]:
# df_sampled = grouped.apply(lambda x: x.sample(frac=0.5))

In [22]:
# len(df_sampled)

In [23]:
len(df)

90270267

In [24]:
unique_vals = df.segments.unique()

In [25]:
len(unique_vals)

92151

In [26]:
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://postgres:00001234@localhost:5433/jetaDb')

In [28]:
# Getting the list of segments and routes with the segments
df_routes_for_segment = pd.read_sql_query("select * from main_routes;",engine)
segmentids = []
routeids = []
segmentnos = []
from_stops=[]
to_stops=[]
for i, rows in df_routes_for_segment.iterrows():
    for j in range(len(rows['stopids'])):
        if j < (len(rows['stopids'])-1):
            segment = str(rows['stopids'][j])+"_"+str(rows['stopids'][j+1])
            from_stop = str(rows['stopids'][j])
            to_stop = str(rows['stopids'][j+1])
            segmentids.append(segment)
            segmentnos.append(j+1)
            routeids.append(rows['routeid'])
            from_stops.append(from_stop)
            to_stops.append(to_stop)
        else:
            break
df_journey = pd.DataFrame({'routeid':routeids,'segmentno':segmentnos,'segmentid':segmentids,'from_stop':from_stops,'to_stop':to_stops})

In [29]:
# Dropping all the duplicate segments -> Queueing Theory
df_journey.drop_duplicates(subset='segmentid',keep='first',inplace=True)

In [30]:
df_journey.reset_index(inplace=True,drop=True)

In [None]:
# df_segments = df[df['segments'].isin(df_journey['segmentid'].values)]

In [None]:
# len(df_segments)

In [None]:
# unique_df_seg = df.segments.unique()

In [None]:
# len(unique_df_seg)

#### Adding day of weeek

In [31]:
grouped = df.groupby('segments')

> Sampling the data by fraction of 30%

In [32]:
df_sampled = grouped.apply(lambda x: x.sample(frac=0.3))

> Adding day of week

In [33]:
df_sampled['dayofservice'] = pd.to_datetime(df_sampled['dayofservice'],format='%d-%b-%y %H:%M:%S',infer_datetime_format=True)

In [34]:
df_sampled['dayofweek'] = df_sampled['dayofservice'].dt.weekday_name

In [35]:
df_sampled['date'] = df_sampled['dayofservice'].dt.date

In [36]:
df_sampled['time'] = pd.to_datetime(df_sampled['actualtime_arr'],unit='s')

In [37]:
df_sampled['hour'] = df_sampled['time'].dt.hour

In [38]:
df_sampled.drop('time',axis=1,inplace=True)

In [39]:
df_sampled.drop('date',axis=1,inplace=True)

In [40]:
len(df_sampled.segments.unique())

56250

> Adding rain

In [41]:
df_weather = pd.read_sql_query("SELECT concat_ws(' ', date::text, time::text) AS date, rain, temp FROM main_weather;",engine)

In [42]:
df_weather.head()

Unnamed: 0,date,rain,temp
0,1/1/2016 0:00,0,1.2
1,1/1/2016 1:00,0,1.5
2,1/1/2016 2:00,0,-0.2
3,1/1/2016 3:00,0,-0.7
4,1/1/2016 4:00,0,-0.3


In [43]:
df_weather['date'] = pd.to_datetime(df_weather['date'],infer_datetime_format=True)

In [44]:
df_weather['dates'] = df_weather['date'].dt.date
df_weather['hour'] = df_weather['date'].dt.hour

In [45]:
df_weather.dtypes

date     datetime64[ns]
rain             object
temp             object
dates            object
hour              int64
dtype: object

In [46]:
df_weather['dates'] = pd.to_datetime(df_weather['dates'],infer_datetime_format=True)

In [47]:
df_sampled.dtypes

dayofservice         datetime64[ns]
actualtime_arr                int64
is_school_holiday             int32
traveltime                    int64
segments                     object
dayofweek                    object
hour                          int64
dtype: object

In [48]:
df_sampled_merged = df_sampled.merge(df_weather,left_on=['dayofservice','hour'],right_on=['dates','hour'],how='left')

In [49]:
df_sampled_merged.dtypes

dayofservice         datetime64[ns]
actualtime_arr                int64
is_school_holiday             int32
traveltime                    int64
segments                     object
dayofweek                    object
hour                          int64
date                 datetime64[ns]
rain                         object
temp                         object
dates                datetime64[ns]
dtype: object

> Adding Holiday Info

In [50]:
# Adding Holiday Information
holidays_series = pd.read_sql_query("select date from main_bankholidays;",engine)

In [51]:
holidays_series['date'] = pd.to_datetime(holidays_series['date'],infer_datetime_format=True)

In [52]:
holidays_series

Unnamed: 0,date
0,2016-01-01
1,2016-03-06
2,2016-03-17
3,2016-03-25
4,2016-03-28
5,2016-05-02
6,2016-06-06
7,2016-06-19
8,2016-08-01
9,2016-10-31


In [53]:
df_sampled_merged[df_sampled_merged['dayofservice']==pd.to_datetime('2016-01-01',infer_datetime_format=True)]

Unnamed: 0,dayofservice,actualtime_arr,is_school_holiday,traveltime,segments,dayofweek,hour,date,rain,temp,dates
201,2016-01-01,85092,1,16,1000_1001,Friday,23,2016-01-01 23:00:00,0.4,8.4,2016-01-01
373,2016-01-01,51622,1,33,1000_1001,Friday,14,2016-01-01 14:00:00,0.7,7.4,2016-01-01
439,2016-01-01,67879,1,34,1000_1001,Friday,18,2016-01-01 18:00:00,1.1,8.5,2016-01-01
692,2016-01-01,55171,1,41,1000_1001,Friday,15,2016-01-01 15:00:00,0.5,7.8,2016-01-01
710,2016-01-01,60703,1,18,1000_1001,Friday,16,2016-01-01 16:00:00,1,8.1,2016-01-01
1016,2016-01-01,79712,1,19,1000_1001,Friday,22,2016-01-01 22:00:00,0.8,8.5,2016-01-01
1510,2016-01-01,39816,1,14,1000_1001,Friday,11,2016-01-01 11:00:00,0,7.8,2016-01-01
1807,2016-01-01,44029,1,18,1000_1001,Friday,12,2016-01-01 12:00:00,0,8.1,2016-01-01
1985,2016-01-01,57354,1,21,1000_1001,Friday,15,2016-01-01 15:00:00,0.5,7.8,2016-01-01
2430,2016-01-01,49664,1,19,1000_1001,Friday,13,2016-01-01 13:00:00,0.7,7.2,2016-01-01


In [54]:
df_sampled_merged.loc[df_sampled_merged.dayofservice.isin(holidays_series['date'].values), 'dayofweek']='Sunday'

In [55]:
df_sampled_merged.drop(['dayofservice','hour','date','dates'],axis=1,inplace=True)

In [56]:
df_sampled_merged.head()

Unnamed: 0,actualtime_arr,is_school_holiday,traveltime,segments,dayofweek,rain,temp
0,79907,0,17,1000_1001,Tuesday,0,2.9
1,64277,0,220,1000_1001,Monday,0,9.3
2,63248,0,36,1000_1001,Saturday,0,12.8
3,27858,0,59,1000_1001,Wednesday,0,8.1
4,56554,1,79,1000_1001,Tuesday,0,17.2


In [57]:
# df_sampled_merged = df_sampled_merged[df_sampled_merged['traveltime']>0]

In [58]:
# df_sampled_merged[df_sampled_merged['traveltime']==0]

> Analysing the outliers and missing values

In [59]:
max(df_sampled_merged['traveltime'].values)

19987

In [60]:
min(df_sampled_merged['traveltime'].values)

1

In [61]:
df_sampled_merged.describe().T.to_csv('actualTimeArrivalDescription.csv')

In [62]:
import matplotlib.pyplot as plt
f,ax = plt.subplots(figsize=(8,8))
df_sampled_merged.boxplot(column='traveltime')
plt.savefig('Figures//actualtime_outliers.png',transparent=True)

In [63]:
df_sampled_merged.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actualtime_arr,27076602.0,53406.369271,17486.338904,15998.0,38063.0,53532.0,67477.0,94064.0
is_school_holiday,27076602.0,0.250752,0.433446,0.0,0.0,0.0,1.0,1.0
traveltime,27076602.0,71.787337,93.735687,1.0,33.0,52.0,84.0,19987.0


In [64]:
len(df_sampled_merged[df_sampled_merged['traveltime']>(84.0+(1.5*51))])

1779841

In [65]:
df_travelTimelessthan25iqr = df_sampled_merged[df_sampled_merged['traveltime']<(33.0)]

In [66]:
df_travelTimelessthan25iqr.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actualtime_arr,6572263.0,54859.259965,18771.697684,16391.0,38129.0,54839.0,71261.0,94064.0
is_school_holiday,6572263.0,0.254247,0.435437,0.0,0.0,0.0,1.0,1.0
traveltime,6572263.0,22.0075,6.642429,1.0,17.0,22.0,28.0,32.0


In [67]:
df_sampled_merged['traveltime'].clip(lower=22.0, upper=(84.0+(1.5*51)),inplace=True)

In [68]:
df_sampled_merged.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actualtime_arr,27076602.0,53406.369271,17486.338904,15998.0,38063.0,53532.0,67477.0,94064.0
is_school_holiday,27076602.0,0.250752,0.433446,0.0,0.0,0.0,1.0,1.0
traveltime,27076602.0,64.311604,40.454098,22.0,33.0,52.0,84.0,160.5


In [None]:
import matplotlib.pyplot as plt
f,ax = plt.subplots(figsize=(8,8))
df_sampled_merged.boxplot(column='traveltime')
# plt.savefig('Figures//actualtime_wo_outliers.png',transparent=True)

In [None]:
df_sampled_merged.head()

In [69]:
df_sampled_merged.to_csv('Sampled_Data_actualTime.csv',index_label=False,index=False)

**This is the end of the notebook, the following code was written to make the final segmented file. This now will be done in a new notebook where in flags for city center, school holiday, distance values, kalman filter coefficient and gradient descent algorithm will be implemented and then the following segments will be generated in the last file**

In [None]:
# # for i, val in df_sampled_merged.groupby('segments'):
# #     val.to_csv('SegmentedSamples2_actual\\{}.csv'.format(i),header=False,index_label=False,index=False)

# # df_journey.head()

# _39A_segments = df_journey[df_journey['routeid'].str.startswith('39A')]['segmentid'].values.tolist()

# unique_39A_segments = list(set(_39A_segments))

# df_sampled_merged.dtypes

# len(df_sampled_merged[df_sampled_merged['segments'].isin(unique_39A_segments)])

# df_39A_segments = df_sampled_merged[df_sampled_merged['segments'].isin(unique_39A_segments)]['segments'].values.tolist()

# unq_39_test = list(set(df_39A_segments))

# len(unq_39_test)

# (unique_39A_segments.isin(unq_39_test))

# set(unique_39A_segments) - set(unq_39_test)

# df_sampled_merged[df_sampled_merged['segments'].str.contains('')]

# df_39A_segments_sampled = df_sampled_merged[df_sampled_merged['segments'].isin(unique_39A_segments)]

# df_39A_segments_sampled

# for i, val in df_39A_segments_sampled.groupby('segments'):
#     val.to_csv('SegmentedSamples2_actual\\39A_segments\\{}.csv'.format(i),header=False,index_label=False,index=False)

<hr>