In [1]:
# -*- coding: utf-8 -*-
"""bus_stops_extraction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1OikJekLXp81Irr14GkLemhqGEyvsEXlh

Importing python libraries
"""

import pandas as pd
import numpy as np
from datetime import datetime,date, timedelta

! pip install geopandas
import geopandas as gpd
from geopandas import GeoDataFrame as gdf


import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

#Importing 
# 1. raw GPS data
# 2. splitted trip data from bus_trip_extraction.py
# 3. bus stops details (Latitude and longitude)
raw_files = [
    '../Data Sets/Raw GPS data Kandy Digana Buses/digana_2021_10.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2021_11.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2021_12.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2022_01.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2022_02.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2022_07.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2022_08.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2022_09.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2022_10.csv'
]


path_trip_ends = '../DataOut/trip_ends.csv'
path_bus_trips = '../DataOut/bus_trips.csv'
path_bus_stops = '../Data Sets/bus_stops_and_terminals_654.csv'

trip_ends = pd.read_csv(path_trip_ends)
bus_trips = pd.read_csv(path_bus_trips)
bus_stops= pd.read_csv(path_bus_stops)

# Create an empty list to store DataFrames
dfs = []

# Loop through the list of file names and read each CSV into a DataFrame
for file in raw_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate the DataFrames vertically
raw_data = pd.concat(dfs, ignore_index=True)






[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
def raw_data_cleaning(raw_data):
    """
    Removal of records with error records.
    Remove data with zero values for longitude and latitude columns.
    Remove data with dates outside the desired range.
    Sort data by time and device.

    Args:
        raw_data (pd.DataFrame): Crude raw GPS data filtered out from the server for the required time window.

    Returns:
        gps_data (pd.DataFrame): A cleaned dataframe object of GPS data.
    """

    # raw_data = raw_data.drop(drop_columns, axis=1)

    gps_data = raw_data[raw_data.latitude != 0]
    gps_data = gps_data[gps_data.longitude != 0]  # cleaning zero values for latitude & longitude

    gps_data['date'] = pd.to_datetime(gps_data['devicetime']).dt.date  # split date and time separately into datetime variables
    gps_data['time'] = pd.to_datetime(gps_data['devicetime']).dt.time

    # Remove rows with dates outside the desired range
    start_date = pd.Timestamp("2021-10-01").date()
    end_date = pd.Timestamp("2022-10-31").date()
    gps_data = gps_data[(gps_data['date'] >= start_date) & (gps_data['date'] <= end_date)]

    gps_data = gps_data.sort_values(['deviceid', 'date', 'time'])  # sorting dataset by time and device

    return gps_data


raw_data.columns

additional_columns = ['servertime','fixtime','address','routeid']

# drop the additional columns
raw_data = raw_data.drop(additional_columns, axis = 1)



gps_data= raw_data_cleaning(raw_data)

In [3]:
gps_data

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time
281253,574073556,116,2021-10-15 11:37:14,7.293917,80.736137,0.0,2021-10-15,11:37:14
281400,574078368,116,2021-10-15 11:39:26,7.294845,80.735427,0.0,2021-10-15,11:39:26
281492,574082837,116,2021-10-15 11:39:41,7.294825,80.735470,0.0,2021-10-15,11:39:41
281493,574082838,116,2021-10-15 11:39:56,7.294817,80.735472,0.0,2021-10-15,11:39:56
281494,574082840,116,2021-10-15 11:40:11,7.294813,80.735470,0.0,2021-10-15,11:40:11
...,...,...,...,...,...,...,...,...
6656384,912468415,1719,2022-03-01 23:59:14,7.263890,80.700732,0.0,2022-03-01,23:59:14
6656386,912468524,1719,2022-03-01 23:59:24,7.263890,80.700732,0.0,2022-03-01,23:59:24
6656388,912468599,1719,2022-03-01 23:59:34,7.263890,80.700732,0.0,2022-03-01,23:59:34
6656390,912468664,1719,2022-03-01 23:59:44,7.263890,80.700732,0.0,2022-03-01,23:59:44


In [4]:
# select raws where bus id = 279 and device time date is 2021-10-03
gps_data.loc[(gps_data['deviceid'] == 279) & (gps_data['devicetime'].str.contains('2021-10-03 06:'))]

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time
68291,562581159,279,2021-10-03 06:00:14,7.298890,80.733530,18.89850,2021-10-03,06:00:14
68289,562581103,279,2021-10-03 06:00:29,7.297698,80.732940,17.81860,2021-10-03,06:00:29
68288,562581102,279,2021-10-03 06:00:44,7.297433,80.731895,13.49890,2021-10-03,06:00:44
68287,562581101,279,2021-10-03 06:00:59,7.297200,80.730937,17.27860,2021-10-03,06:00:59
68286,562581100,279,2021-10-03 06:01:14,7.296095,80.730060,19.97840,2021-10-03,06:01:14
...,...,...,...,...,...,...,...,...
68774,562612451,279,2021-10-03 06:56:42,7.292058,80.634965,0.00000,2021-10-03,06:56:42
68775,562612606,279,2021-10-03 06:56:56,7.292128,80.635007,2.69978,2021-10-03,06:56:56
68778,562612884,279,2021-10-03 06:56:58,7.292165,80.635035,4.31966,2021-10-03,06:56:58
68777,562612881,279,2021-10-03 06:57:13,7.292538,80.635033,4.85961,2021-10-03,06:57:13


In [5]:
bus_stops

Unnamed: 0,stop_id,route_id,direction,address,latitude,longitude
0,BT01,654,Kandy-Digana,Kandy,7.292462,80.634978
1,101,654,Kandy-Digana,Wales Park,7.291186,80.637662
2,102,654,Kandy-Digana,Mahamaya,7.28784,80.64584
3,103,654,Kandy-Digana,Lewella junction,7.29443,80.65003
4,104,654,Kandy-Digana,Talwatta,7.286701,80.660336
5,105,654,Kandy-Digana,Tennekumbura Bridge,7.281866,80.66603
6,106,654,Kandy-Digana,Kalapura Junction Busstop,7.27983,80.67621
7,107,654,Kandy-Digana,Nattarampotha Junction Bus Stop,7.279117,80.67945
8,108,654,Kandy-Digana,Kundasale New town,7.2809,80.68416
9,109,654,Kandy-Digana,Warapitiya,7.28149,80.68635


In [6]:
#developing geo-buffer rings around every bus stops
def bus_stop_buffer_create(gps_data,bus_stops,stop_buffer,extra_buffer):

  """
  
    Buffer and additional buffer  created  to accomodate points if they were missed in standard stop buffer.

    Args:
        gps_data (pd.DataFrame): Cleaned gps data filtered out from the server for the required time window.
        bus_stops (pd.DataFrame) : Bus stops data for the trip route
        stop_buffer (int):  Radius of the buffer area to represent bus stops
        extra_buffer (int):  Extended radius of the buffer area to represent bus stops.
    
    Returns:
        bus_stops_buffer1 (GeoDataFrame) : Buffer created for filtered  Kandy-Digana direction.
        bus_stops_buffer2 (GeoDataFrame) : Buffer created for filtered  Digana-Kandy direction
        gps_data (GeoDataFrame) :  GPS data as GeoDataFrame with projected corrdinates.
        bus_stops_buffer1_add (GeoDataFrame) : Additional buffer created for filtered  Kandy-Digana direction.
        bus_stops_buffer2_add (GeoDataFrame) : Additional buffer created for filtered  Digana-Kandy direction.
  """

  #Create Geodataframe of GPS data and bus stops data
  gps_data = gpd.GeoDataFrame(gps_data, geometry=gpd.points_from_xy(gps_data.longitude,gps_data.latitude),crs='EPSG:4326')
  bus_stops = gpd.GeoDataFrame(bus_stops, geometry=gpd.points_from_xy(bus_stops.longitude,bus_stops.latitude),crs='EPSG:4326')

  #project the corrdinates in Local coordinate system
  bus_stops = bus_stops.to_crs('EPSG:5234')
  gps_data = gps_data.to_crs('EPSG:5234')

  #split bus stops dataframe into two based on route direction 
  bus_stops_direction1 = bus_stops[bus_stops['direction']=='Kandy-Digana']
  bus_stops_direction2 = bus_stops[bus_stops['direction']=='Digana-Kandy']

  bus_stops_direction2.reset_index(drop = True, inplace = True)

  #proximity analysis
  #creating a buffer
  bus_stops_buffer1 = gpd.GeoDataFrame(bus_stops_direction1, geometry = bus_stops_direction1.geometry.buffer(stop_buffer))
  bus_stops_buffer2 = gpd.GeoDataFrame(bus_stops_direction2, geometry = bus_stops_direction2.geometry.buffer(stop_buffer))

  #creating additional extra buffer to accomodate points if they were missed in standard stop buffer
  bus_stops_buffer1_add = gpd.GeoDataFrame(bus_stops_direction1, geometry = bus_stops_direction1.geometry.buffer(extra_buffer))
  bus_stops_buffer2_add = gpd.GeoDataFrame(bus_stops_direction2, geometry = bus_stops_direction2.geometry.buffer(extra_buffer))

  return bus_stops_buffer1, bus_stops_buffer2,gps_data,bus_stops_buffer1_add,bus_stops_buffer2_add

stop_buffer = 50
extra_buffer = 100
bus_stops_buffer1, bus_stops_buffer2,gps_data,bus_stops_buffer1_add,bus_stops_buffer2_add = bus_stop_buffer_create(gps_data,bus_stops,stop_buffer,extra_buffer)


In [7]:
bus_trips

Unnamed: 0,trip_id,deviceid,date,start_terminal,end_terminal,direction,start_time,end_time,duration,duration_in_mins,day_of_week,hour_of_day
0,1.0,116,2021-10-16,BT02,BT01,2,07:08:31,07:53:04,0:44:33,44.550000,5,7
1,2.0,116,2021-10-16,BT01,BT02,1,08:03:04,08:53:48,0:50:44,50.733333,5,8
2,3.0,116,2021-10-16,BT02,BT01,2,10:50:19,11:44:43,0:54:24,54.400000,5,10
3,4.0,116,2021-10-16,BT01,BT02,1,12:20:45,13:18:33,0:57:48,57.800000,5,12
4,5.0,116,2021-10-16,BT02,BT01,2,14:14:36,15:07:05,0:52:29,52.483333,5,14
...,...,...,...,...,...,...,...,...,...,...,...,...
15886,15887.0,1719,2022-02-25,BT01,BT02,1,09:42:39,10:34:59,0:52:20,52.333333,4,9
15887,15888.0,1719,2022-02-25,BT02,BT01,2,12:20:33,13:15:43,0:55:10,55.166667,4,12
15888,15889.0,1719,2022-02-25,BT01,BT02,1,13:41:19,14:38:03,0:56:44,56.733333,4,13
15889,15890.0,1719,2022-02-25,BT02,BT01,2,16:00:04,16:48:05,0:48:01,48.016667,4,16


In [8]:
trip_ends.head(10)

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,geometry,bus_stop,grouped_ends,entry/exit,trip_id
0,574670748,116,2021-10-16 07:08:31,7.299052,80.73441,7.01944,2021-10-16,07:08:31,POINT (195659.52317664068 232979.73272654996),BT02,2,0,1.0
1,574721062,116,2021-10-16 07:53:04,7.29171,80.635112,5.93953,2021-10-16,07:53:04,POINT (184695.3910443462 232169.99429151896),BT01,3,1,1.0
2,574733098,116,2021-10-16 08:03:04,7.293092,80.635573,9.17927,2021-10-16,08:03:04,POINT (184746.41595346577 232322.75960448402),BT01,3,0,2.0
3,574787724,116,2021-10-16 08:53:48,7.299068,80.73435,4.85961,2021-10-16,08:53:48,POINT (195652.88748366528 232981.57999858505),BT02,4,1,2.0
4,574907780,116,2021-10-16 10:50:19,7.298947,80.734155,8.09935,2021-10-16,10:50:19,POINT (195631.36679008024 232968.12393211995),BT02,4,0,3.0
5,574968864,116,2021-10-16 11:44:43,7.291692,80.635037,4.31966,2021-10-16,11:44:43,POINT (184687.1092663893 232167.96208088237),BT01,5,1,3.0
6,575010318,116,2021-10-16 12:20:45,7.293083,80.635543,5.93953,2021-10-16,12:20:45,POINT (184743.10322200865 232321.84276932332),BT01,5,0,4.0
7,575068506,116,2021-10-16 13:18:33,7.299008,80.734472,0.0,2021-10-16,13:18:33,POINT (195666.32424734606 232974.94395519554),BT02,6,1,4.0
8,575128640,116,2021-10-16 14:14:36,7.298962,80.734252,3.7797,2021-10-16,14:14:36,POINT (195642.03286579804 232969.78177808868),BT02,6,0,5.0
9,575185688,116,2021-10-16 15:07:05,7.291613,80.635232,4.85961,2021-10-16,15:07:05,POINT (184708.63767068012 232159.29696145633),BT01,7,1,5.0


In [9]:
# # trip_ends drop rows with index 4 and 5    
# trip_ends.drop([4,5], inplace = True)

In [10]:
# get the trip ids in trip_ends where ther are no couple of trip ends
trip_ends_no_couple = trip_ends[trip_ends['trip_id'].duplicated(keep=False)==False]
trip_ends_no_couple

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,geometry,bus_stop,grouped_ends,entry/exit,trip_id


In [11]:
# create a list of unique trip ids in trip_ends
trip_ids = bus_trips.trip_id.unique()

# loop through trip_ends_grouped rows, while counting from 1 to the number of rows
# if the count doesn't match the trip_id, add that count to missing_trips list
# and assign the current trip id to the count variable and continue the loop
missing_trips = []
count = 1
for tid in trip_ids:
    if count != tid:
        missing_trips.append(count)
    count = tid + 1  # Corrected this line to assign the next trip ID to count

print(missing_trips)


[]


In [12]:
# get rid of the column limit
pd.set_option('display.max_columns', None)

In [13]:
test=(pd.merge(left = gps_data, right  = trip_ends[['id','bus_stop','trip_id']],how = 'outer',left_on ='id', right_on= 'id'))

In [14]:
# print all rows
pd.set_option('display.max_rows', 10)

In [15]:
# select raws where bus id = 279 and device time date is 2021-10-03
test.loc[(test['deviceid'] == 279) & (test['devicetime'].str.contains('2021-10-03 05:'))]

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,geometry,bus_stop,trip_id
2888964,562572133,279,2021-10-03 05:23:41,7.298328,80.734303,0.539957,2021-10-03,05:23:41,POINT (195647.735 232899.750),,
2888965,562572272,279,2021-10-03 05:24:23,7.298328,80.734303,0.000000,2021-10-03,05:24:23,POINT (195647.735 232899.750),,
2888966,562572278,279,2021-10-03 05:24:26,7.299112,80.735150,0.000000,2021-10-03,05:24:26,POINT (195741.230 232986.361),,
2888967,562572351,279,2021-10-03 05:24:49,7.299287,80.735237,2.699780,2021-10-03,05:24:49,POINT (195750.793 233005.712),,
2888968,562572362,279,2021-10-03 05:24:54,7.299338,80.735253,0.000000,2021-10-03,05:24:54,POINT (195752.637 233011.429),,
...,...,...,...,...,...,...,...,...,...,...,...
2888980,562579202,279,2021-10-03 05:55:06,7.299277,80.734935,3.239740,2021-10-03,05:55:06,POINT (195717.492 233004.609),,
2888981,562579201,279,2021-10-03 05:55:09,7.299260,80.734892,0.000000,2021-10-03,05:55:09,POINT (195712.700 233002.774),,
2888982,562580604,279,2021-10-03 05:59:45,7.299187,80.734687,3.779700,2021-10-03,05:59:45,POINT (195690.065 232994.659),,
2888983,562580623,279,2021-10-03 05:59:51,7.299133,80.734590,0.000000,2021-10-03,05:59:51,POINT (195679.398 232988.766),,


In [16]:
#gps records that are not associated with the terminals are asssigned as trip id = 0
test["trip_id"].fillna(0, inplace = True)

#run a loop to assign trip_id to records that are in between the terminals
test.reset_index(drop = True, inplace = True)

trip =1
for i in range(len(test)-1):
    if (test.at[i,'trip_id']==trip) & (test.at[i+1, 'trip_id'] == 0):
        test.at[i+1,'trip_id'] = trip
    elif (test.at[i,'trip_id']==trip) & (test.at[i+1, 'trip_id'] == trip):
        trip = trip + 1

In [17]:
# print test where trip id  != 0
test.loc[(test['trip_id'] != 0)].head(1000)

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,geometry,bus_stop,trip_id
424,574670748,116,2021-10-16 07:08:31,7.299052,80.734410,7.01944,2021-10-16,07:08:31,POINT (195659.523 232979.733),BT02,1.0
425,574670749,116,2021-10-16 07:08:46,7.298598,80.733327,19.43850,2021-10-16,07:08:46,POINT (195539.897 232929.616),,1.0
426,574670750,116,2021-10-16 07:09:01,7.297437,80.732405,26.45790,2021-10-16,07:09:01,POINT (195438.129 232801.161),,1.0
427,574670751,116,2021-10-16 07:09:07,7.297405,80.731912,5.93953,2021-10-16,07:09:07,POINT (195383.650 232797.672),,1.0
428,574670752,116,2021-10-16 07:09:22,7.297420,80.731760,11.33910,2021-10-16,07:09:22,POINT (195366.912 232799.332),,1.0
...,...,...,...,...,...,...,...,...,...,...,...
1819,575165962,116,2021-10-16 14:48:30,7.279800,80.676060,11.87910,2021-10-16,14:48:30,POINT (189216.447 230851.797),,5.0
1820,575165963,116,2021-10-16 14:48:45,7.279825,80.675287,4.85961,2021-10-16,14:48:45,POINT (189131.050 230854.579),,5.0
1821,575165964,116,2021-10-16 14:49:00,7.279832,80.674960,3.77970,2021-10-16,14:49:00,POINT (189094.976 230855.317),,5.0
1822,575165965,116,2021-10-16 14:49:15,7.279823,80.674692,7.01944,2021-10-16,14:49:15,POINT (189065.351 230854.405),,5.0


In [18]:
#splitting trajectories
def bus_trajectory(gps_data,trip_ends,bus_trips):

  """
    Create bus trajectory data of sequence of bus stops with direction of trip.
    
    Args:
        gps_data (GeoDataFrame): Bus trips GPS data
        trip_ends (pd.DataFrame) : Splitted trip data from bus_trip_extraction.py
        bus_trips (pd.DataFrame) : Bus trips data
    
    Returns:
        bus_trajectory (pd.DataFrame): Sequence of bus trip trajectory data
  """
  
  #gps records that are matched with end terminals, are merged with whole GPS records
  trip_ends = trip_ends[['id','bus_stop','trip_id']]
  bus_trajectory = pd.merge(left = gps_data, right  = trip_ends,how = 'outer',left_on ='id', right_on= 'id')

  #gps records that are not associated with the terminals are asssigned as trip id = 0
  bus_trajectory["trip_id"].fillna(0, inplace = True)

  #run a loop to assign trip_id to records that are in between the terminals
  bus_trajectory.reset_index(drop = True, inplace = True)

  trip =1
  for i in range(len(bus_trajectory)-1):
    if (bus_trajectory.at[i,'trip_id']==trip) & (bus_trajectory.at[i+1, 'trip_id'] == 0):
      bus_trajectory.at[i+1,'trip_id'] = trip
    elif (bus_trajectory.at[i,'trip_id']==trip) & (bus_trajectory.at[i+1, 'trip_id'] == trip):
      trip = trip + 1
  
  bus_trajectory.drop(bus_trajectory[bus_trajectory['trip_id']==0].index, inplace = True ) #drop records that are not identified as a bus trip

  #Identify the directions of each bus trajectories using bus trips extracted data
  directions= bus_trips.set_index('trip_id').to_dict()['direction']
  bus_trajectory['direction'] = list(map(lambda x: directions[x]   ,bus_trajectory['trip_id']))

  return bus_trajectory

bus_trajectory = bus_trajectory(gps_data,trip_ends,bus_trips)

In [19]:
bus_trips[bus_trips['trip_id']==8007.0]

Unnamed: 0,trip_id,deviceid,date,start_terminal,end_terminal,direction,start_time,end_time,duration,duration_in_mins,day_of_week,hour_of_day
8006,8007.0,279,2021-10-03,BT02,BT01,2,08:58:20,09:52:02,0:53:42,53.7,6,8


In [20]:
# select raws where bus id = 279 and device time date is 2021-10-03
bus_trajectory.loc[(bus_trajectory['deviceid'] == 279) & (bus_trajectory['devicetime'].str.contains('2021-10-03 06:'))]

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,geometry,bus_stop,trip_id,direction
2888985,562581159,279,2021-10-03 06:00:14,7.298890,80.733530,18.89850,2021-10-03,06:00:14,POINT (195562.358 232961.871),,8005.0,2
2888986,562581103,279,2021-10-03 06:00:29,7.297698,80.732940,17.81860,2021-10-03,06:00:29,POINT (195497.203 232830.096),,8005.0,2
2888987,562581102,279,2021-10-03 06:00:44,7.297433,80.731895,13.49890,2021-10-03,06:00:44,POINT (195381.818 232800.801),,8005.0,2
2888988,562581101,279,2021-10-03 06:00:59,7.297200,80.730937,17.27860,2021-10-03,06:00:59,POINT (195275.995 232775.012),,8005.0,2
2888989,562581100,279,2021-10-03 06:01:14,7.296095,80.730060,19.97840,2021-10-03,06:01:14,POINT (195179.195 232652.827),,8005.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2889182,562604888,279,2021-10-03 06:45:36,7.290550,80.639013,19.97840,2021-10-03,06:45:36,POINT (185126.161 232041.590),,8005.0,2
2889183,562604887,279,2021-10-03 06:45:51,7.291018,80.637885,16.73870,2021-10-03,06:45:51,POINT (185001.595 232093.413),,8005.0,2
2889184,562604886,279,2021-10-03 06:46:06,7.291353,80.637058,16.73870,2021-10-03,06:46:06,POINT (184910.325 232130.485),,8005.0,2
2889185,562604885,279,2021-10-03 06:46:21,7.291627,80.635328,23.75810,2021-10-03,06:46:21,POINT (184719.315 232160.764),,8005.0,2


In [21]:
bus_stops_buffer2

Unnamed: 0,stop_id,route_id,direction,address,latitude,longitude,geometry
0,BT02,654,Digana-Kandy,Digana,7.298960,80.734720,"POLYGON ((195743.751 232969.601, 195743.510 23..."
1,201,654,Digana-Kandy,Kengalla,7.291726,80.721430,"POLYGON ((194276.294 232169.792, 194276.054 23..."
2,202,654,Digana-Kandy,BOI,7.284110,80.722100,"POLYGON ((194350.152 231327.589, 194349.912 23..."
3,203,654,Digana-Kandy,Balagolla,7.287349,80.714010,"POLYGON ((193456.924 231685.871, 193456.683 23..."
4,204,654,Digana-Kandy,Pachchakaatuwa,7.284730,80.705390,"POLYGON ((192505.093 231396.387, 192504.852 23..."
...,...,...,...,...,...,...,...
10,210,654,Digana-Kandy,Buwelikada,7.293120,80.649220,"POLYGON ((186303.219 232325.468, 186302.978 23..."
11,211,654,Digana-Kandy,Dharmaraja,7.290670,80.646510,"POLYGON ((186003.919 232054.624, 186003.678 23..."
12,212,654,Digana-Kandy,Children park,7.286810,80.647610,"POLYGON ((186125.259 231627.742, 186125.018 23..."
13,213,654,Digana-Kandy,Hilwood,7.290210,80.639800,"POLYGON ((185263.015 232003.967, 185262.774 23..."


In [22]:
bus_stops_buffer1

Unnamed: 0,stop_id,route_id,direction,address,latitude,longitude,geometry
0,BT01,654,Kandy-Digana,Kandy,7.292462,80.634978,"POLYGON ((184730.643 232253.182, 184730.402 23..."
1,101,654,Kandy-Digana,Wales Park,7.291186,80.637662,"POLYGON ((185026.961 232111.967, 185026.720 23..."
2,102,654,Kandy-Digana,Mahamaya,7.287840,80.645840,"POLYGON ((185929.853 231741.696, 185929.612 23..."
3,103,654,Kandy-Digana,Lewella junction,7.294430,80.650030,"POLYGON ((186392.694 232470.307, 186392.453 23..."
4,104,654,Kandy-Digana,Talwatta,7.286701,80.660336,"POLYGON ((187530.385 231615.322, 187530.144 23..."
...,...,...,...,...,...,...,...
11,111,654,Kandy-Digana,Pachchakaatuwa,7.284730,80.705390,"POLYGON ((192505.093 231396.387, 192504.852 23..."
12,112,654,Kandy-Digana,Balagolla,7.287349,80.714005,"POLYGON ((193456.384 231685.873, 193456.143 23..."
13,113,654,Kandy-Digana,BOI,7.284110,80.722100,"POLYGON ((194350.152 231327.589, 194349.912 23..."
14,114,654,Kandy-Digana,Kengalla,7.291520,80.721330,"POLYGON ((194265.223 232147.012, 194264.982 23..."


In [23]:
# def stop_buffer_filter(bus_trajectory,bus_stops_buffer1,bus_stops_buffer2,bus_stops_buffer1_add,bus_stops_buffer2_add):
    

#   """

#     Filter bus trip data of two buffer ranges with all the bus points, only bus stops points.
    
#     Args:
#         bus_trajectory (pd.DataFrame): Sequence of bus trip trajectory data
#         bus_stops_buffer1 (GeoDataFrame) : Buffer created for filtered  Kandy-Digana direction.
#         bus_stops_buffer2 (GeoDataFrame) : Buffer created for filtered  Digana-Kandy direction
#         bus_stops_buffer1_add (GeoDataFrame) : Additional buffer created for filtered  Kandy-Digana direction.
#         bus_stops_buffer2_add (GeoDataFrame) : Additional buffer created for filtered  Digana-Kandy direction.
    
#     Returns:
#         bus_trip_all_points (pd.DataFrame): Bus trip data with all points including null for bus_stop
#         bus_stop_all_points (pd.DataFrame): Bus trip data with only bus_stops points

#   """

#   #project to local coordinate system before buffer filtering
#   bus_trajectory = bus_trajectory.to_crs('EPSG:5234')

#   #split trajectories by direction 
#   trajectory_dir_1 = bus_trajectory[bus_trajectory['direction'] == 1]
#   trajectory_dir_2 = bus_trajectory[bus_trajectory['direction'] == 2]

#   #reset index before for loop
#   trajectory_dir_1.reset_index(drop = True, inplace = True)
#   trajectory_dir_2.reset_index(drop = True, inplace = True)

#   #filter records within bus stops buffer of both directions
#   for i in range(len(trajectory_dir_1)):
#     for stop in range(len(bus_stops_buffer1)):
#       if bus_stops_buffer1.iloc[stop].geometry.contains(trajectory_dir_1.iloc[i].geometry):
#         trajectory_dir_1.at[i,'bus_stop'] = bus_stops_buffer1.at[stop,'stop_id']
#       else:       
#         if bus_stops_buffer1_add.iloc[stop].geometry.contains(trajectory_dir_1.iloc[i].geometry):
#           trajectory_dir_1.at[i,'bus_stop'] = bus_stops_buffer1_add.at[stop,'stop_id']

#   #filter records within bus stops buffer of both directions
#   for i in range(len(trajectory_dir_2)):
#     for stop in range(len(bus_stops_buffer2)):
#       if bus_stops_buffer2.iloc[stop].geometry.contains(trajectory_dir_2.iloc[i].geometry):
#         trajectory_dir_2.at[i,'bus_stop'] = bus_stops_buffer2.at[stop,'stop_id']
#       else:       
#         if bus_stops_buffer2_add.iloc[stop].geometry.contains(trajectory_dir_2.iloc[i].geometry):
#           trajectory_dir_2.at[i,'bus_stop'] = bus_stops_buffer2_add.at[stop,'stop_id']    

#   #concatenate dataframes of both directions and keep only records filtered within bus stops
#   bus_trip_all_points = pd.concat([trajectory_dir_1,trajectory_dir_2])
#   bus_stop_all_points = bus_trip_all_points.dropna()

#   return bus_trip_all_points,bus_stop_all_points

# bus_trip_all_points,bus_stop_all_points = stop_buffer_filter(bus_trajectory,bus_stops_buffer1,bus_stops_buffer2,bus_stops_buffer1_add,bus_stops_buffer2_add)

In [24]:
# # save bus_trip_all_points,bus_stop_all_points as csv files
# bus_trip_all_points.to_csv('../DataOut/bus_trip_all_points.csv',index=False)
# bus_stop_all_points.to_csv('../DataOut/bus_stop_all_points.csv',index=False)



In [25]:
# load the csv files
bus_trip_all_points = pd.read_csv('../DataOut/bus_trip_all_points.csv')
bus_stop_all_points = pd.read_csv('../DataOut/bus_stop_all_points.csv')

In [26]:
bus_trip_all_points

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,geometry,bus_stop,trip_id,direction
0,574733098,116,2021-10-16 08:03:04,7.293092,80.635573,9.17927,2021-10-16,08:03:04,POINT (184746.41595346577 232322.75960448402),BT01,2.0,1
1,574733099,116,2021-10-16 08:03:19,7.293068,80.636430,14.03890,2021-10-16,08:03:19,POINT (184841.00784031872 232320.15459256113),,2.0,1
2,574733100,116,2021-10-16 08:03:34,7.293043,80.637552,10.25920,2021-10-16,08:03:34,POINT (184964.84865244984 232317.35306792473),,2.0,1
3,574733101,116,2021-10-16 08:03:49,7.292725,80.637707,7.55940,2021-10-16,08:03:49,POINT (184981.95255171193 232282.14959255166),,2.0,1
4,574733102,116,2021-10-16 08:04:04,7.292292,80.637663,3.77970,2021-10-16,08:04:04,POINT (184977.15733984314 232234.22458233108),,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4633537,898915939,1719,2022-02-25 16:47:53,7.291572,80.635452,2.15983,2022-02-25,16:47:53,POINT (184732.927692229 232154.67833579454),,15890.0,2
4633538,898916182,1719,2022-02-25 16:47:57,7.291587,80.635355,2.15983,2022-02-25,16:47:57,POINT (184722.26205350138 232156.3402929157),,15890.0,2
4633539,898917152,1719,2022-02-25 16:48:02,7.291597,80.635293,3.77970,2022-02-25,16:48:02,POINT (184715.44974995626 232157.44817551947),,15890.0,2
4633540,898917150,1719,2022-02-25 16:48:03,7.291605,80.635280,3.77970,2022-02-25,16:48:03,POINT (184713.9815042613 232158.37751184579),,15890.0,2


In [27]:
bus_stop_all_points

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,geometry,bus_stop,trip_id,direction
0,574733098,116,2021-10-16 08:03:04,7.293092,80.635573,9.17927,2021-10-16,08:03:04,POINT (184746.41595346577 232322.75960448402),BT01,2.0,1
1,574736662,116,2021-10-16 08:04:19,7.291640,80.637455,9.17927,2021-10-16,08:04:19,POINT (184954.1363920236 232162.16484193364),101,2.0,1
2,574736663,116,2021-10-16 08:04:34,7.291210,80.637713,8.63931,2021-10-16,08:04:34,POINT (184982.64260533327 232114.61697960715),101,2.0,1
3,574736664,116,2021-10-16 08:04:49,7.290943,80.638160,0.00000,2021-10-16,08:04:49,POINT (185031.9565310351 232085.11003064158),101,2.0,1
4,574736665,116,2021-10-16 08:05:04,7.290943,80.638157,0.00000,2021-10-16,08:05:04,POINT (185031.58111776915 232085.11014168937),101,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1106260,898907933,1719,2022-02-25 16:45:59,7.290445,80.639287,12.95900,2022-02-25,16:45:59,POINT (185156.33465390973 232029.95910975555),213,15890.0,2
1106261,898908022,1719,2022-02-25 16:46:00,7.290453,80.639238,11.33910,2022-02-25,16:46:00,POINT (185151.00184406477 232030.88956650213),213,15890.0,2
1106262,898908212,1719,2022-02-25 16:46:02,7.290472,80.639112,13.49890,2022-02-25,16:46:02,POINT (185137.01275872233 232032.9173312658),213,15890.0,2
1106263,898908210,1719,2022-02-25 16:46:03,7.290477,80.639043,14.03890,2022-02-25,16:46:03,POINT (185129.47152351646 232033.47245865376),213,15890.0,2


In [28]:
# print raws where bus stop column is not null in bus_trip_all_points df
bus_trip_all_points[bus_trip_all_points['bus_stop'].notnull()]


Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,geometry,bus_stop,trip_id,direction
0,574733098,116,2021-10-16 08:03:04,7.293092,80.635573,9.17927,2021-10-16,08:03:04,POINT (184746.41595346577 232322.75960448402),BT01,2.0,1
5,574736662,116,2021-10-16 08:04:19,7.291640,80.637455,9.17927,2021-10-16,08:04:19,POINT (184954.1363920236 232162.16484193364),101,2.0,1
6,574736663,116,2021-10-16 08:04:34,7.291210,80.637713,8.63931,2021-10-16,08:04:34,POINT (184982.64260533327 232114.61697960715),101,2.0,1
7,574736664,116,2021-10-16 08:04:49,7.290943,80.638160,0.00000,2021-10-16,08:04:49,POINT (185031.9565310351 232085.11003064158),101,2.0,1
8,574736665,116,2021-10-16 08:05:04,7.290943,80.638157,0.00000,2021-10-16,08:05:04,POINT (185031.58111776915 232085.11014168937),101,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4633478,898907933,1719,2022-02-25 16:45:59,7.290445,80.639287,12.95900,2022-02-25,16:45:59,POINT (185156.33465390973 232029.95910975555),213,15890.0,2
4633479,898908022,1719,2022-02-25 16:46:00,7.290453,80.639238,11.33910,2022-02-25,16:46:00,POINT (185151.00184406477 232030.88956650213),213,15890.0,2
4633480,898908212,1719,2022-02-25 16:46:02,7.290472,80.639112,13.49890,2022-02-25,16:46:02,POINT (185137.01275872233 232032.9173312658),213,15890.0,2
4633481,898908210,1719,2022-02-25 16:46:03,7.290477,80.639043,14.03890,2022-02-25,16:46:03,POINT (185129.47152351646 232033.47245865376),213,15890.0,2


In [30]:
# bus_trip_all_points date to datetime date year month day
bus_trip_all_points['date'] = pd.to_datetime(bus_trip_all_points['date'], format='%Y-%m-%d').dt.date
bus_stop_all_points['date'] = pd.to_datetime(bus_stop_all_points['date'], format='%Y-%m-%d').dt.date

In [32]:
def dwell_time_estimation(bus_stop_all_points):
       
  """
    Drop terminal points from all points data. 
    Calculate arrival_time, departure_time according to check whether grouped record has 0 speed values or not.
    Dwell time derived in seconds.
    
    Args:
        bus_stop_all_points (pd.DataFrame): Bus trip data with only bus_stops points
    
    Returns:
        bus_stop_times (pd.DataFrame): Bus stops data with arrival_time, departure_time, dwell_time for each stops excluding terminals.

   """
    
  #Drop records with End Bus terminals 
  bus_stop_all_points.drop(bus_stop_all_points[bus_stop_all_points['bus_stop'] == 'BT01'].index, inplace = True)
  bus_stop_all_points.drop(bus_stop_all_points[bus_stop_all_points['bus_stop'] == 'BT02'].index, inplace = True)

  #grouping all records filtered for every bus stop
  bus_stop_all_points['grouped_ends'] = ((bus_stop_all_points['bus_stop'].shift() != bus_stop_all_points['bus_stop'])).cumsum()

  #creating a new dataframe for bus stop times
  columns = ['trip_id','deviceid','date','direction','bus_stop', 'arrival_time','departure_time','dwell_time']
  bus_stop_times = pd.DataFrame(columns=columns)

  #Loop over every grouped filtered records and choose the two records that indicate bus arrival and departure to the stop 
  for name, group in bus_stop_all_points.groupby('grouped_ends'):
    if 0 in group['speed'].values:               #if the grouped filter record has '0" speed values, then bus has stopped more than 15 seconds there and first '0'speed record as the arrival
      values = []
      trip_id = np.unique(group['trip_id'].values)[0]
      direction = np.unique(group['direction'].values)[0]
      deviceid = np.unique(group['deviceid'].values)[0]
      date = np.unique(group['date'].values)[0]
      bus_stop = np.unique(group['bus_stop'].values)[0]
      
      print(date,type(date))

      arrival_time = group[group['speed']==0]['time'].min()
      
      buffer_leaving_time = group['time'].max()
      rough_departure_time = group[group['speed']==0]['time'].max() 

      if (datetime.combine(date.min,buffer_leaving_time) - datetime.combine(date.min,rough_departure_time)).total_seconds() > 15:
        departure_time = (datetime.combine(date.min,rough_departure_time) + timedelta(seconds =15)).time()
      else:
        departure_time = buffer_leaving_time

      values.extend([trip_id,deviceid,date,direction,bus_stop,arrival_time,departure_time])
      new_row = pd.DataFrame([dict(zip(columns, values))])
      bus_stop_times = pd.concat([bus_stop_times, new_row], ignore_index=True)


    else:
      values = []
      trip_id = np.unique(group[['trip_id']].values)[0]
      direction = np.unique(group['direction'].values)[0]
      deviceid = np.unique(group[['deviceid']].values)[0]
      date = np.unique(group['date'].values)[0]
      bus_stop = np.unique(group['bus_stop'].values)[0]  

      arrival_time = group['time'].min()
      departure_time = arrival_time

      values.extend([trip_id,deviceid,date,direction,bus_stop,arrival_time,departure_time])
      new_row = pd.DataFrame([dict(zip(columns, values))])
      bus_stop_times = pd.concat([bus_stop_times, new_row], ignore_index=True)

  for i in range(len(bus_stop_times)):
    bus_stop_times.at[i,'dwell_time'] = datetime.combine(date.min,bus_stop_times.at[i,'departure_time']) - datetime.combine(date.min,bus_stop_times.at[i,'arrival_time'])

  bus_stop_times['dwell_time_in_seconds'] =  bus_stop_times['dwell_time']/np.timedelta64(1,'s')

  return bus_stop_times

bus_stop_times = dwell_time_estimation(bus_stop_all_points)

2021-10-16T00:00:00.000000000 <class 'numpy.datetime64'>


TypeError: combine() argument 1 must be datetime.date, not builtin_function_or_method

In [None]:
def dwell_time_feature_addition(bus_stop_times):
    
  """
    To created aditonal derieved features for bus stops data.
    
    Args:
        bus_stop_times (pd.DataFrame): Bus stops data with arrival_time, departure_time, dwell_time for each stops excluding terminals.
    
    Returns:
        bus_stop_times (pd.DataFrame): Bus stops data with created features. 

  """
    
  #bus_stop_times = bus_stop_times.drop(bus_stop_times[bus_stop_times['dwell_time_in_seconds']>threshold].index )

  bus_stop_times['day_of_week'] = pd.to_datetime(bus_stop_times['date']).dt.weekday
  bus_stop_times['hour_of_day'] = list(map(lambda x: x.hour, (bus_stop_times['arrival_time'])))
  bus_stop_times['weekday/end'] = list(map(lambda x: 1 if x < 5 else 0 , (bus_stop_times['day_of_week'])))

  return bus_stop_times

threshold = 480
bus_stop_times = dwell_time_feature_addition(bus_stop_times)
# save as a csv file
bus_stop_times.to_csv('../DataOut/bus_stop_times.csv', index=False)

In [None]:
bus_stop_times

In [None]:
bus_stops

In [None]:
bus_stops_dir_1= bus_stops[bus_stops['direction']=='Kandy-Digana']

In [None]:
bus_stops_dir_1

In [None]:
from shapely.geometry import Point

def trip_visualization(trip_id, city_location,bus_stops_dir_1):
    """
    Using a  GPS data visualization package of Folium, project the coordinates on 
    Open Street Map (OSM) to explore how the records are spread and to gain some insights and overview.
    
    Args:
        trip_id (int):  A random trip id to visualize its coordinates' distribution
        city_location (arr): Longtitude and latitude of city.
        bus_stop_dir_1 (GeoDataFrame): 
    
    Returns:
        map (MapObject):  A visualizable Map Object for selected trip id .
        bus_stops_map () : 

    """
    geometry = [Point(xy) for xy in zip(bus_stops_dir_1['longitude'], bus_stops_dir_1['latitude'])]
    bus_stops_dir_1 = gpd.GeoDataFrame(bus_stops_dir_1, geometry=geometry)
    trip = bus_trajectory[bus_trajectory['trip_id']==trip_id]
    trip = trip.to_crs('EPSG:4326')
    bus_stops_dir_1 = bus_stops_dir_1.to_crs('EPSG:4326')

    map = folium.Map(location=city_location, tiles='openstreetmap', zoom_start=14)
    for idx, row in trip.iterrows():
        Marker([row['latitude'], row['longitude']]).add_to(map)

    bus_stops_map = folium.Map(location=city_location, tiles='openstreetmap', zoom_start=14)
    for idx, row in bus_stop_dir_1.iterrows():
        Marker([row['latitude'], row['longitude']]).add_to(bus_stops_map)
    
    return map, bus_stops_map

trip_id = 1560  #Choose a random trip id to visualize its coordinates' distribution
city_location = [7.2906,80.6337] #kandy city location

map, bus_stops_map = trip_visualization(trip_id,city_location,bus_stops_dir_1)
