In [1]:
# -*- coding: utf-8 -*-
"""bus_stops_extraction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1OikJekLXp81Irr14GkLemhqGEyvsEXlh

Importing python libraries
"""

import pandas as pd
import numpy as np
from datetime import datetime,date, timedelta

! pip install geopandas
import geopandas as gpd
from geopandas import GeoDataFrame as gdf


import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

# Inputs loaded by this cell:
# 1. raw GPS data
# 2. split trip data produced by bus_trip_extraction.py
# 3. bus stop details (latitude and longitude)
# Raw GPS exports to load (one CSV per month, judging by the file names);
# uncomment an entry to include that file in the run.
raw_files = [
    '../Data Sets/Raw GPS data Kandy Digana Buses/digana_2021_10.csv',
    # '../Data Sets/Raw GPS data Kandy Digana Buses/digana_2021_11.csv',
    # '../Data Sets/Raw GPS data Kandy Digana Buses/digana_2021_12.csv',
    # '../Data Sets/Raw GPS data Kandy Digana Buses/digana_2022_01.csv',
    # '../Data Sets/Raw GPS data Kandy Digana Buses/digana_2022_02.csv',
    # '../Data Sets/Raw GPS data Kandy Digana Buses/digana_2022_07.csv',
    # '../Data Sets/Raw GPS data Kandy Digana Buses/digana_2022_08.csv',
    # '../Data Sets/Raw GPS data Kandy Digana Buses/digana_2022_09.csv',
    # '../Data Sets/Raw GPS data Kandy Digana Buses/digana_2022_10.csv'
]


# Paths are relative to the notebook's working directory.
path_trip_ends = '../DataOut/trip_ends.csv'
path_bus_trips = '../DataOut/bus_trips.csv'
path_bus_stops = '../Data Sets/bus_stops_and_terminals_654.csv'

trip_ends = pd.read_csv(path_trip_ends)
bus_trips = pd.read_csv(path_bus_trips)
bus_stops= pd.read_csv(path_bus_stops)

# Collect one DataFrame per raw GPS file
dfs = []

# Read every file listed in raw_files
for file in raw_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Stack the per-file frames into a single frame with a fresh 0..n-1 index
raw_data = pd.concat(dfs, ignore_index=True)


[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [2]:
#initial data cleaning steps
def raw_data_cleaning(raw_data):

  """
    Clean raw GPS records and order them per device in time.

    Rows whose latitude or longitude equals zero are discarded, ``date`` and
    ``time`` columns are derived from ``devicetime``, and the result is sorted
    by ``deviceid``, ``date`` and ``time``.

    Args:
        raw_data (pd.DataFrame): Crude raw GPS data filtered out from the server for the required time window.
            Must provide the columns ``latitude``, ``longitude``, ``devicetime`` and ``deviceid``.

    Returns:
        gps_data (pd.DataFrame): A new, cleaned dataframe of GPS data. ``raw_data`` itself is not modified.
  """

  # Filter both coordinates in one pass and take an explicit copy, so the
  # column assignments below write to an independent frame instead of a
  # slice of raw_data (which triggers SettingWithCopyWarning).
  has_coordinates = (raw_data.latitude != 0) & (raw_data.longitude != 0)
  gps_data = raw_data[has_coordinates].copy()

  # Parse the timestamp column once and split it into date and time parts.
  device_time = pd.to_datetime(gps_data['devicetime'])
  gps_data['date'] = device_time.dt.date
  gps_data['time'] = device_time.dt.time

  gps_data = gps_data.sort_values(['deviceid', 'date', 'time']) #sorting dataset by time and device

  return gps_data

# Bare expression: a notebook only renders the value of the last statement in
# a cell, so this line has no visible effect where it stands.
raw_data.columns

# Column names kept for reference; not used again in the cells shown here.
additional_columns = ['servertime','fixtime','address','routeid']

#drop_columns = ['servertime','fixtime','address','routeid']


# Clean the concatenated raw GPS records (zero coordinates removed, sorted per device).
gps_data= raw_data_cleaning(raw_data)

In [3]:
# Display the bus stop table read from path_bus_stops.
bus_stops

Unnamed: 0,stop_id,route_id,direction,address,latitude,longitude
0,BT01,654,Kandy-Digana,Kandy,7.292462,80.634978
1,101,654,Kandy-Digana,Wales Park,7.291186,80.637662
2,102,654,Kandy-Digana,Mahamaya,7.28784,80.64584
3,103,654,Kandy-Digana,Lewella junction,7.29443,80.65003
4,104,654,Kandy-Digana,Talwatta,7.286701,80.660336
5,105,654,Kandy-Digana,Tennekumbura Bridge,7.281866,80.66603
6,106,654,Kandy-Digana,Kalapura Junction Busstop,7.27983,80.67621
7,107,654,Kandy-Digana,Nattarampotha Junction Bus Stop,7.279117,80.67945
8,108,654,Kandy-Digana,Kundasale New town,7.2809,80.68416
9,109,654,Kandy-Digana,Warapitiya,7.28149,80.68635


In [4]:
#developing geo-buffer rings around every bus stop
def bus_stop_buffer_create(gps_data,bus_stops,stop_buffer,extra_buffer):

  """
    Build standard and extended buffer polygons around the bus stops of each route direction.

    GPS records and bus stops are turned into point GeoDataFrames (EPSG:4326, built from
    their ``longitude``/``latitude`` columns) and re-projected to EPSG:5234. The stops are
    then split by their ``direction`` value and buffered twice: once with ``stop_buffer``
    and once with ``extra_buffer``, the latter meant to accommodate points missed by the
    standard stop buffer.

    Args:
        gps_data (pd.DataFrame): Cleaned gps data filtered out from the server for the required time window.
        bus_stops (pd.DataFrame) : Bus stops data for the trip route.
        stop_buffer (int):  Radius of the buffer area to represent bus stops, in units of the
            projected CRS (presumably metres for EPSG:5234 - confirm).
        extra_buffer (int):  Extended radius of the buffer area to represent bus stops, same units.

    Returns:
        bus_stops_buffer1 (GeoDataFrame) : Buffer created for filtered  Kandy-Digana direction.
        bus_stops_buffer2 (GeoDataFrame) : Buffer created for filtered  Digana-Kandy direction.
        gps_data (GeoDataFrame) :  GPS data as GeoDataFrame with projected coordinates.
        bus_stops_buffer1_add (GeoDataFrame) : Additional buffer created for filtered  Kandy-Digana direction.
        bus_stops_buffer2_add (GeoDataFrame) : Additional buffer created for filtered  Digana-Kandy direction.

        All four buffer frames carry a fresh 0..n-1 index.
  """

  #Create Geodataframe of GPS data and bus stops data
  gps_data = gpd.GeoDataFrame(gps_data, geometry=gpd.points_from_xy(gps_data.longitude,gps_data.latitude),crs='EPSG:4326')
  bus_stops = gpd.GeoDataFrame(bus_stops, geometry=gpd.points_from_xy(bus_stops.longitude,bus_stops.latitude),crs='EPSG:4326')

  #project the coordinates in Local coordinate system
  bus_stops = bus_stops.to_crs('EPSG:5234')
  gps_data = gps_data.to_crs('EPSG:5234')

  #split bus stops dataframe into two based on route direction
  #Both halves get a fresh positional index (returned as new frames rather than
  #mutated in place), so label-based and position-based lookups on the buffers
  #agree regardless of how the stops are ordered in the input file.
  bus_stops_direction1 = bus_stops[bus_stops['direction']=='Kandy-Digana'].reset_index(drop = True)
  bus_stops_direction2 = bus_stops[bus_stops['direction']=='Digana-Kandy'].reset_index(drop = True)

  #proximity analysis
  #creating a buffer
  bus_stops_buffer1 = gpd.GeoDataFrame(bus_stops_direction1, geometry = bus_stops_direction1.geometry.buffer(stop_buffer))
  bus_stops_buffer2 = gpd.GeoDataFrame(bus_stops_direction2, geometry = bus_stops_direction2.geometry.buffer(stop_buffer))

  #creating additional extra buffer to accommodate points if they were missed in standard stop buffer
  bus_stops_buffer1_add = gpd.GeoDataFrame(bus_stops_direction1, geometry = bus_stops_direction1.geometry.buffer(extra_buffer))
  bus_stops_buffer2_add = gpd.GeoDataFrame(bus_stops_direction2, geometry = bus_stops_direction2.geometry.buffer(extra_buffer))

  return bus_stops_buffer1, bus_stops_buffer2,gps_data,bus_stops_buffer1_add,bus_stops_buffer2_add

# Buffer radii handed to bus_stop_buffer_create: the standard stop buffer and
# the wider fallback buffer.
stop_buffer = 50
extra_buffer = 100
(
    bus_stops_buffer1,
    bus_stops_buffer2,
    gps_data,
    bus_stops_buffer1_add,
    bus_stops_buffer2_add,
) = bus_stop_buffer_create(gps_data, bus_stops, stop_buffer, extra_buffer)


In [5]:
#splitting trajectories
def bus_trajectory(gps_data,trip_ends,bus_trips):

  """
    Create bus trajectory data: GPS records labelled with the trip they belong to and the trip direction.

    The terminal records in ``trip_ends`` are merged onto the GPS records by ``id``. Every
    record lying between the two terminal records of a trip then inherits that trip's id,
    records outside any trip are dropped, and the direction of each trip is looked up in
    ``bus_trips``.

    Args:
        gps_data (GeoDataFrame): Bus trips GPS data; needs an ``id`` column.
        trip_ends (pd.DataFrame) : Split trip data from bus_trip_extraction.py; the columns
            ``id``, ``bus_stop`` and ``trip_id`` are used.
        bus_trips (pd.DataFrame) : Bus trips data; the columns ``trip_id`` and ``direction`` are used.

    Returns:
        bus_trajectory (pd.DataFrame): Sequence of bus trip trajectory data with the added
            columns ``bus_stop``, ``trip_id`` and ``direction``. Row labels are the positions
            in the merged frame (they are not renumbered after the drop).

    Raises:
        KeyError: If a trip id present in the trajectory has no entry in ``bus_trips``.
  """

  #gps records that are matched with end terminals, are merged with whole GPS records
  # NOTE(review): an outer merge orders the result by the join key `id`, not by the
  # incoming order of gps_data. The loop below relies on the records of one trip being
  # adjacent and on trips appearing in ascending trip_id order - confirm that `id`
  # order matches that assumption for multi-device data.
  trip_ends = trip_ends[['id','bus_stop','trip_id']]
  trajectory = pd.merge(left = gps_data, right  = trip_ends,how = 'outer',left_on ='id', right_on= 'id')
  trajectory = trajectory.reset_index(drop = True)

  #gps records that are not associated with the terminals are assigned as trip id = 0
  # fillna is applied to the column and its result is used directly: a chained
  # `df[col].fillna(..., inplace=True)` is deprecated and does nothing under
  # pandas Copy-on-Write. Working on a plain array also avoids per-row `.at` access.
  trip_ids = trajectory['trip_id'].fillna(0).to_numpy(copy = True)

  #run a loop to assign trip_id to records that are in between the terminals
  trip = 1
  for i in range(len(trip_ids) - 1):
    if trip_ids[i] == trip:
      if trip_ids[i + 1] == 0:
        # still inside the current trip: propagate its id forward
        trip_ids[i + 1] = trip
      elif trip_ids[i + 1] == trip:
        # next record is the closing terminal of this trip: move on to the next trip
        trip = trip + 1

  trajectory['trip_id'] = trip_ids

  #drop records that are not identified as a bus trip
  trajectory = trajectory[trip_ids != 0].copy()

  #Identify the directions of each bus trajectories using bus trips extracted data
  directions = bus_trips.set_index('trip_id')['direction'].to_dict()
  trajectory['direction'] = [directions[trip_id] for trip_id in trajectory['trip_id']]

  return trajectory

# NOTE(review): this rebinds the name `bus_trajectory` from the function to its
# result, so the function is no longer callable under that name after this line.
bus_trajectory = bus_trajectory(gps_data,trip_ends,bus_trips)

In [6]:
# Display the trajectory records produced by the previous cell.
bus_trajectory

Unnamed: 0,id,deviceid,servertime,devicetime,fixtime,latitude,longitude,speed,address,routeid,date,time,geometry,bus_stop,trip_id,direction
424,574670748,116,2021-10-16 07:12:08,2021-10-16 07:08:31,2021-10-16 07:08:31,7.299052,80.734410,7.01944,,0,2021-10-16,07:08:31,POINT (195659.523 232979.733),BT02,1.0,2
425,574670749,116,2021-10-16 07:12:08,2021-10-16 07:08:46,2021-10-16 07:08:46,7.298598,80.733327,19.43850,,0,2021-10-16,07:08:46,POINT (195539.897 232929.616),,1.0,2
426,574670750,116,2021-10-16 07:12:08,2021-10-16 07:09:01,2021-10-16 07:09:01,7.297437,80.732405,26.45790,,0,2021-10-16,07:09:01,POINT (195438.129 232801.161),,1.0,2
427,574670751,116,2021-10-16 07:12:08,2021-10-16 07:09:07,2021-10-16 07:09:07,7.297405,80.731912,5.93953,,0,2021-10-16,07:09:07,POINT (195383.650 232797.672),,1.0,2
428,574670752,116,2021-10-16 07:12:08,2021-10-16 07:09:22,2021-10-16 07:09:22,7.297420,80.731760,11.33910,,0,2021-10-16,07:09:22,POINT (195366.912 232799.332),,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651156,585125532,1377,2021-10-27 15:45:15,2021-10-27 15:42:42,2021-10-27 15:42:42,7.291853,80.635037,0.00000,,0,2021-10-27,15:42:42,POINT (184687.115 232185.843),BT01,1723.0,2
651258,585187547,1377,2021-10-27 16:34:15,2021-10-27 16:30:57,2021-10-27 16:30:57,7.293078,80.635622,5.93953,,0,2021-10-27,16:30:57,POINT (184751.749 232321.287),BT01,1724.0,1
651481,585258757,1377,2021-10-27 17:28:51,2021-10-27 17:25:17,2021-10-27 17:25:17,7.298902,80.733947,10.25920,,0,2021-10-27,17:25:17,POINT (195608.356 232963.150),BT02,1724.0,1
651579,585309688,1377,2021-10-27 18:13:08,2021-10-27 18:10:50,2021-10-27 18:10:50,7.298883,80.733975,7.01944,,0,2021-10-27,18:10:50,POINT (195611.481 232961.126),BT02,1725.0,2


In [7]:
# Display the standard stop buffers of the Digana-Kandy direction.
bus_stops_buffer2

Unnamed: 0,stop_id,route_id,direction,address,latitude,longitude,geometry
0,BT02,654,Digana-Kandy,Digana,7.29896,80.73472,"POLYGON ((195743.751 232969.601, 195743.510 23..."
1,201,654,Digana-Kandy,Kengalla,7.291726,80.72143,"POLYGON ((194276.294 232169.792, 194276.054 23..."
2,202,654,Digana-Kandy,BOI,7.28411,80.7221,"POLYGON ((194350.152 231327.589, 194349.912 23..."
3,203,654,Digana-Kandy,Balagolla,7.287349,80.71401,"POLYGON ((193456.924 231685.871, 193456.683 23..."
4,204,654,Digana-Kandy,Pachchakaatuwa,7.28473,80.70539,"POLYGON ((192505.093 231396.387, 192504.852 23..."
5,205,654,Digana-Kandy,Pallekele (Clock Tower),7.28036,80.69567,"POLYGON ((191431.758 230913.311, 191431.518 23..."
6,206,654,Digana-Kandy,Warapitiya,7.28148,80.68672,"POLYGON ((190443.540 231037.340, 190443.299 23..."
7,207,654,Digana-Kandy,Kundasale Police College,7.28039,80.68356,"POLYGON ((190094.596 230916.872, 190094.355 23..."
8,208,654,Digana-Kandy,Nattarampotha,7.2798,80.676594,"POLYGON ((189325.389 230851.765, 189325.148 23..."
9,209,654,Digana-Kandy,Tennekumbura,7.28193,80.66594,"POLYGON ((188149.069 231087.586, 188148.828 23..."


In [8]:
# Display the standard stop buffers of the Kandy-Digana direction.
bus_stops_buffer1

Unnamed: 0,stop_id,route_id,direction,address,latitude,longitude,geometry
0,BT01,654,Kandy-Digana,Kandy,7.292462,80.634978,"POLYGON ((184730.643 232253.182, 184730.402 23..."
1,101,654,Kandy-Digana,Wales Park,7.291186,80.637662,"POLYGON ((185026.961 232111.967, 185026.720 23..."
2,102,654,Kandy-Digana,Mahamaya,7.28784,80.64584,"POLYGON ((185929.853 231741.696, 185929.612 23..."
3,103,654,Kandy-Digana,Lewella junction,7.29443,80.65003,"POLYGON ((186392.694 232470.307, 186392.453 23..."
4,104,654,Kandy-Digana,Talwatta,7.286701,80.660336,"POLYGON ((187530.385 231615.322, 187530.144 23..."
5,105,654,Kandy-Digana,Tennekumbura Bridge,7.281866,80.66603,"POLYGON ((188159.005 231080.506, 188158.764 23..."
6,106,654,Kandy-Digana,Kalapura Junction Busstop,7.27983,80.67621,"POLYGON ((189283.010 230855.110, 189282.769 23..."
7,107,654,Kandy-Digana,Nattarampotha Junction Bus Stop,7.279117,80.67945,"POLYGON ((189640.749 230776.191, 189640.508 23..."
8,108,654,Kandy-Digana,Kundasale New town,7.2809,80.68416,"POLYGON ((190160.858 230973.256, 190160.617 23..."
9,109,654,Kandy-Digana,Warapitiya,7.28149,80.68635,"POLYGON ((190402.686 231038.453, 190402.445 23..."


In [9]:
def stop_buffer_filter(bus_trajectory,bus_stops_buffer1,bus_stops_buffer2,bus_stops_buffer1_add,bus_stops_buffer2_add):


  """

    Label trajectory points with the bus stop whose buffer they fall into.

    Trajectories are split by their ``direction`` value (1 / 2) and matched against the
    buffers of the corresponding direction. A point inside a standard buffer gets that
    stop's ``stop_id``; the additional (wider) buffer is used only as a fallback for
    points that no standard buffer contains. When several buffers of the same kind
    contain a point, the stop listed last wins. Points outside every buffer keep
    whatever ``bus_stop`` value they already had.

    Args:
        bus_trajectory (pd.DataFrame): Sequence of bus trip trajectory data; expected to be a
            GeoDataFrame with ``direction`` and ``bus_stop`` columns.
        bus_stops_buffer1 (GeoDataFrame) : Buffer created for filtered  Kandy-Digana direction.
        bus_stops_buffer2 (GeoDataFrame) : Buffer created for filtered  Digana-Kandy direction
        bus_stops_buffer1_add (GeoDataFrame) : Additional buffer created for filtered  Kandy-Digana direction.
        bus_stops_buffer2_add (GeoDataFrame) : Additional buffer created for filtered  Digana-Kandy direction.

    Returns:
        bus_trip_all_points (pd.DataFrame): Bus trip data with all points including null for bus_stop
        bus_stop_all_points (pd.DataFrame): Bus trip data with only bus_stops points

  """

  def _tag_points(trajectory, buffers):
    # Write each stop's id onto the points lying within its polygon. Stops are paired
    # with their own stop_id by position, so the buffers' index labels do not matter.
    for polygon, stop_id in zip(buffers.geometry, buffers['stop_id']):
      inside = trajectory.geometry.within(polygon)
      trajectory.loc[inside, 'bus_stop'] = stop_id

  #project to local coordinate system before buffer filtering
  bus_trajectory = bus_trajectory.to_crs('EPSG:5234')

  direction_buffers = (
      (1, bus_stops_buffer1, bus_stops_buffer1_add),
      (2, bus_stops_buffer2, bus_stops_buffer2_add),
  )

  tagged = []
  for direction, standard_buffers, extra_buffers in direction_buffers:
    #split trajectories by direction, as an independent frame with a fresh index
    trajectory = bus_trajectory[bus_trajectory['direction'] == direction].reset_index(drop = True)

    # Apply the wide fallback buffers first, then let the standard buffers overwrite
    # them: a standard-buffer match must not be replaced by the wider buffer of a
    # neighbouring stop.
    _tag_points(trajectory, extra_buffers)
    _tag_points(trajectory, standard_buffers)
    tagged.append(trajectory)

  #concatenate dataframes of both directions and keep only records filtered within bus stops
  bus_trip_all_points = pd.concat(tagged)

  # Only a missing bus_stop disqualifies a row; a blanket dropna() would also discard
  # rows with gaps in unrelated columns (e.g. an empty `address`).
  bus_stop_all_points = bus_trip_all_points.dropna(subset = ['bus_stop'])

  return bus_trip_all_points,bus_stop_all_points

# Tag every trajectory point with its bus stop, and keep a second frame
# restricted to the points that received a stop.
bus_trip_all_points, bus_stop_all_points = stop_buffer_filter(
    bus_trajectory,
    bus_stops_buffer1,
    bus_stops_buffer2,
    bus_stops_buffer1_add,
    bus_stops_buffer2_add,
)