In [1]:
# ! pip install geopandas
!pip install folium




[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime,date
import geopandas as gpd
from geopandas import GeoDataFrame as gdf
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
import glob
import os
import folium



In [3]:
raw_files = [
    '../Data Sets/Raw GPS data Kandy Digana Buses/digana_2021_10.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2021_11.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2021_12.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2022_01.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2022_02.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2022_07.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2022_08.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2022_09.csv',
    '../Data Sets\Raw GPS data Kandy Digana Buses/digana_2022_10.csv'
]

# Create an empty list to store DataFrames
dfs = []

# Loop through the list of file names and read each CSV into a DataFrame
for file in raw_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate the DataFrames vertically
raw_data = pd.concat(dfs, ignore_index=True)

In [4]:
bus_terminals=pd.read_csv("../Data Sets/bus_stops_and_terminals_654.csv")

In [5]:
raw_data

Unnamed: 0,id,deviceid,servertime,devicetime,fixtime,latitude,longitude,speed,address,routeid
0,560090989,250,2021-10-01 01:00:34,2021-10-01 00:58:59,2021-10-01 00:58:59,7.239907,80.674407,0.0000,,0.0
1,560090990,250,2021-10-01 01:00:34,2021-10-01 00:43:59,2021-10-01 00:43:59,7.239907,80.674407,0.0000,,0.0
2,560090991,250,2021-10-01 01:00:34,2021-10-01 00:28:59,2021-10-01 00:28:59,7.239907,80.674407,0.0000,,0.0
3,560090992,250,2021-10-01 01:00:34,2021-10-01 00:13:59,2021-10-01 00:13:59,7.239907,80.674407,0.0000,,0.0
4,560090993,250,2021-10-01 01:00:34,2021-09-30 23:58:59,2021-09-30 23:58:59,7.239907,80.674407,0.0000,,0.0
...,...,...,...,...,...,...,...,...,...,...
9084313,1550050726,123,,2022-11-01 22:54:40,,7.308682,80.766712,11.3391,,
9084314,1550050727,123,,2022-11-01 22:54:25,,7.307783,80.767427,20.5184,,
9084315,1550050728,123,,2022-11-01 22:54:10,,7.306388,80.767842,17.2786,,
9084316,1550050729,123,,2022-11-01 22:53:55,,7.305193,80.767347,15.1188,,


In [6]:
bus_terminals

Unnamed: 0,stop_id,route_id,direction,address,latitude,longitude
0,BT01,654,Kandy-Digana,Kandy,7.292462,80.634978
1,101,654,Kandy-Digana,Wales Park,7.291186,80.637662
2,102,654,Kandy-Digana,Mahamaya,7.28784,80.64584
3,103,654,Kandy-Digana,Lewella junction,7.29443,80.65003
4,104,654,Kandy-Digana,Talwatta,7.286701,80.660336
5,105,654,Kandy-Digana,Tennekumbura Bridge,7.281866,80.66603
6,106,654,Kandy-Digana,Kalapura Junction Busstop,7.27983,80.67621
7,107,654,Kandy-Digana,Nattarampotha Junction Bus Stop,7.279117,80.67945
8,108,654,Kandy-Digana,Kundasale New town,7.2809,80.68416
9,109,654,Kandy-Digana,Warapitiya,7.28149,80.68635


In [7]:
# Make a new dataa set by just using the two raws of bus_terminals where stop_idis equal to BT01 and BT02

bus_terminals = bus_terminals[bus_terminals['stop_id'].isin(['BT01', 'BT02'])]

# Drop duplicates
bus_terminals = bus_terminals.drop_duplicates(subset=['stop_id'], keep='first')

# change the column name stop_id to terminal_id
bus_terminals = bus_terminals.rename(columns={'stop_id': 'terminal_id'})

# reset index
bus_terminals = bus_terminals.reset_index(drop=True)

In [8]:
bus_terminals

Unnamed: 0,terminal_id,route_id,direction,address,latitude,longitude
0,BT01,654,Kandy-Digana,Kandy,7.292462,80.634978
1,BT02,654,Kandy-Digana,Digana,7.29896,80.73472


In [9]:
def raw_data_cleaning(raw_data):
    """
    Removal of records with error records.
    Remove data with zero values for longitude and latitude columns.
    Remove data with dates outside the desired range.
    Sort data by time and device.

    Args:
        raw_data (pd.DataFrame): Crude raw GPS data filtered out from the server for the required time window.

    Returns:
        gps_data (pd.DataFrame): A cleaned dataframe object of GPS data.
    """

    # raw_data = raw_data.drop(drop_columns, axis=1)

    gps_data = raw_data[raw_data.latitude != 0]
    gps_data = gps_data[gps_data.longitude != 0]  # cleaning zero values for latitude & longitude

    gps_data['date'] = pd.to_datetime(gps_data['devicetime']).dt.date  # split date and time separately into datetime variables
    gps_data['time'] = pd.to_datetime(gps_data['devicetime']).dt.time

    # Remove rows with dates outside the desired range
    start_date = pd.Timestamp("2021-10-01").date()
    end_date = pd.Timestamp("2022-10-31").date()
    gps_data = gps_data[(gps_data['date'] >= start_date) & (gps_data['date'] <= end_date)]

    gps_data = gps_data.sort_values(['deviceid', 'date', 'time'])  # sorting dataset by time and device

    return gps_data


raw_data.columns

additional_columns = ['servertime','fixtime','address','routeid']

# drop the additional columns
raw_data = raw_data.drop(additional_columns, axis = 1)



gps_data= raw_data_cleaning(raw_data)

In [10]:
gps_data

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time
281253,574073556,116,2021-10-15 11:37:14,7.293917,80.736137,0.0,2021-10-15,11:37:14
281400,574078368,116,2021-10-15 11:39:26,7.294845,80.735427,0.0,2021-10-15,11:39:26
281492,574082837,116,2021-10-15 11:39:41,7.294825,80.735470,0.0,2021-10-15,11:39:41
281493,574082838,116,2021-10-15 11:39:56,7.294817,80.735472,0.0,2021-10-15,11:39:56
281494,574082840,116,2021-10-15 11:40:11,7.294813,80.735470,0.0,2021-10-15,11:40:11
...,...,...,...,...,...,...,...,...
6656384,912468415,1719,2022-03-01 23:59:14,7.263890,80.700732,0.0,2022-03-01,23:59:14
6656386,912468524,1719,2022-03-01 23:59:24,7.263890,80.700732,0.0,2022-03-01,23:59:24
6656388,912468599,1719,2022-03-01 23:59:34,7.263890,80.700732,0.0,2022-03-01,23:59:34
6656390,912468664,1719,2022-03-01 23:59:44,7.263890,80.700732,0.0,2022-03-01,23:59:44


In [11]:
def trip_ends(gps_data,bus_terminals,end_buffer):
    
  """
    To extract trip ends dataframe with given buffer range.
    Filter the records within terminals selected buffer range. 
    Within the filtered records get entry & exit to terminals.


    Args:
        gps_data (pd.DataFrame): Cleaned gps data filtered out from the server for the required time window.
        bus_terminals (pd.DataFrame): End and start terminals for the trip.
        end_buffer (int):  Radius of the buffer area to represent terminals.
    
    Returns:
        trip_ends (pd.DataFrame): Trip data with extracted terminals.
  """

  #converting to GeoDataframe with Coordinate Reference system 4326 
  gps_data = gpd.GeoDataFrame(gps_data, geometry=gpd.points_from_xy(gps_data.longitude,gps_data.latitude),crs='EPSG:4326')
  bus_terminals = gpd.GeoDataFrame(bus_terminals, geometry=gpd.points_from_xy(bus_terminals.longitude,bus_terminals.latitude),crs='EPSG:4326') 
  
  #project them in local cordinate system
  gps_data = gps_data.to_crs('EPSG:5234')
  bus_terminals = bus_terminals.to_crs('EPSG:5234')

  #creating buffer area to extract records around bus terminals
  bus_terminals_buffer = gpd.GeoDataFrame(bus_terminals, geometry = bus_terminals.geometry.buffer(end_buffer))

  #filtering coordinates within bus terminals end buffer
  gps_data['bus_stop'] = pd.Series(dtype='object') #create a new column in gps data set
  gps_data.reset_index(drop = True, inplace = True) #reset indices to run a for loop

  for i in range(len(gps_data)):
    for stop in range(len(bus_terminals)):
        if bus_terminals_buffer.iloc[stop].geometry.contains(gps_data.iloc[i].geometry):
          gps_data.at[i, 'bus_stop'] = bus_terminals.at[stop, 'terminal_id']


  trip_ends = gps_data.dropna() #filter records within terminal buffer

  #EXTRACT TRIP ENDS

  #grouping the filtered records of one bus terminal and one date
  trip_ends['grouped_ends'] = ((trip_ends['bus_stop'].shift() != trip_ends['bus_stop']) | (trip_ends['date'].shift() != trip_ends['date'])).cumsum()

  #find the entry or exit record only of the terminals
  #Early records is the entry(1) to the terminal and last record as the exit(0) to the end terminal 
  trip_ends['entry/exit'] = pd.Series(dtype='object')
  trip_ends = trip_ends.reset_index(drop=True)

  for name, group in trip_ends.groupby('grouped_ends'):
    #if 0 in group['speed'].values:
    for index, row in group.iterrows():
      if row['devicetime'] == group['devicetime'].max():
        trip_ends.at[index,'entry/exit'] = '0'
      elif row['devicetime'] == group['devicetime'].min():
        trip_ends.at[index,'entry/exit'] = '1'
  
  trip_ends = trip_ends.dropna() #filter terminal entry/exit records only 
  
  trip_ends = trip_ends.reset_index(drop=True)

  trip_ends['trip_id'] = pd.Series(dtype='int')  # Create an empty 'trip_id' column
  #Providing unique trip id for trips which have entry / exit values within the 2 bus end terminals
  trip = 0
  for i in range(len(trip_ends)-1):
    if (trip_ends.at[i,'bus_stop'] != trip_ends.at[i+1,'bus_stop']) & (trip_ends.at[i,'date'] == trip_ends.at[i+1,'date']):
      trip= trip+1
      trip_ends.at[i,'trip_id'] = trip
      trip_ends.at[i+1,'trip_id'] = trip

  trip_ends = trip_ends.dropna()

  trip_ends = trip_ends.groupby('trip_id').filter(lambda x : len(x)>1)    #remove outliers where no defined 2 trip ends for a trip
  trip_ends = trip_ends.reset_index(drop=True)

  return trip_ends


end_buffer = 100
trip_ends = trip_ends(gps_data,bus_terminals,end_buffer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [12]:
trip_ends=trip_ends.dropna()

In [13]:
trip_ends

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,geometry,bus_stop,grouped_ends,entry/exit,trip_id
0,574670748,116,2021-10-16 07:08:31,7.299052,80.734410,7.01944,2021-10-16,07:08:31,POINT (195659.523 232979.733),BT02,2,0,1.0
1,574721062,116,2021-10-16 07:53:04,7.291710,80.635112,5.93953,2021-10-16,07:53:04,POINT (184695.391 232169.994),BT01,3,1,1.0
2,574733098,116,2021-10-16 08:03:04,7.293092,80.635573,9.17927,2021-10-16,08:03:04,POINT (184746.416 232322.760),BT01,3,0,2.0
3,574787724,116,2021-10-16 08:53:48,7.299068,80.734350,4.85961,2021-10-16,08:53:48,POINT (195652.887 232981.580),BT02,4,1,2.0
4,574907780,116,2021-10-16 10:50:19,7.298947,80.734155,8.09935,2021-10-16,10:50:19,POINT (195631.367 232968.124),BT02,4,0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31777,898420758,1719,2022-02-25 14:38:03,7.298883,80.733832,11.87910,2022-02-25,14:38:03,POINT (195595.658 232961.127),BT02,19394,1,15893.0
31778,898722926,1719,2022-02-25 16:00:04,7.298873,80.733850,14.57880,2022-02-25,16:00:04,POINT (195597.679 232960.021),BT02,19394,0,15894.0
31779,898917340,1719,2022-02-25 16:48:05,7.291622,80.635255,3.23974,2022-02-25,16:48:05,POINT (184711.222 232160.214),BT01,19395,1,15894.0
31780,899049276,1719,2022-02-25 17:20:21,7.293082,80.635617,8.63931,2022-02-25,17:20:21,POINT (184751.197 232321.652),BT01,19395,0,15895.0


In [14]:
def download_csv(data, filename):
    """
    Save DataFrame as a CSV file in the root folder of the project.

    Args:
        data (pd.DataFrame): DataFrame Object.
        filename (str): Name of the file.

    Returns:
        None
    """
    filename = filename + '.csv'
    file_path = filename
    data.to_csv(file_path, encoding='utf-8-sig', index=False)
    print(f"CSV file saved at: {file_path}")

# Example usage
download_csv(trip_ends, '../DataOut/trip_ends')


CSV file saved at: ../DataOut/trip_ends.csv


In [15]:
def trip_extraction(trip_ends):
      
  """
    To extract bus trips with derived columns.
    Create end_time, end_terminal for a bus trip.
    Create features of duration, duration_in_mins, day_of_the_week, hour_of_day

    Args:
        trip_ends (pd.DataFrame): Filtered bus trip data with terminals.
    
    Returns:
        bus_trips (pd.DataFrame): Bus trip terminals data with derived features.
  """

  bus_trips = trip_ends.copy()
  bus_trips[['end_time','end_terminal']] = bus_trips[['time','bus_stop']].shift(-1)
  bus_trips = bus_trips.iloc[::2]

  bus_trips = bus_trips.drop(['id','devicetime','latitude','longitude','speed','geometry','grouped_ends','entry/exit'],axis=1)
  bus_trips.insert(0,'trip_id',bus_trips.pop('trip_id'))
  bus_trips.rename(columns = {'time':'start_time','bus_stop': 'start_terminal'}, inplace =True)

  conditions = [(bus_trips['start_terminal'] == 'BT01'),
              (bus_trips['start_terminal'] == 'BT02')]
  values = [1,2]

  bus_trips['direction'] = np.select(conditions, values)

  bus_trips = bus_trips[['trip_id','deviceid','date','start_terminal','end_terminal','direction','start_time','end_time']]
  bus_trips=bus_trips.reset_index(drop = True)

  #Calculate trip duration
  bus_trips['duration'] = pd.Series(dtype='object')
  for i in range(len(bus_trips)):
    bus_trips.at[i,'duration'] = datetime.combine(date.min,bus_trips.at[i,'end_time']) - datetime.combine(date.min,bus_trips.at[i,'start_time'])
  
  bus_trips['duration_in_mins'] = bus_trips['duration']/np.timedelta64(1,'m')

  bus_trips['day_of_week'] = pd.to_datetime(bus_trips['date']).dt.weekday
  bus_trips['hour_of_day'] = list(map(lambda  x: x.hour, (bus_trips['start_time'])))
  
  return bus_trips

bus_trips = trip_extraction(trip_ends)
download_csv(bus_trips,'../DataOut/bus_trips')


CSV file saved at: ../DataOut/bus_trips.csv


In [16]:
bus_trips

Unnamed: 0,trip_id,deviceid,date,start_terminal,end_terminal,direction,start_time,end_time,duration,duration_in_mins,day_of_week,hour_of_day
0,1.0,116,2021-10-16,BT02,BT01,2,07:08:31,07:53:04,0:44:33,44.550000,5,7
1,2.0,116,2021-10-16,BT01,BT02,1,08:03:04,08:53:48,0:50:44,50.733333,5,8
2,3.0,116,2021-10-16,BT02,BT01,2,10:50:19,11:44:43,0:54:24,54.400000,5,10
3,4.0,116,2021-10-16,BT01,BT02,1,12:20:45,13:18:33,0:57:48,57.800000,5,12
4,5.0,116,2021-10-16,BT02,BT01,2,14:14:36,15:07:05,0:52:29,52.483333,5,14
...,...,...,...,...,...,...,...,...,...,...,...,...
15886,15891.0,1719,2022-02-25,BT01,BT02,1,09:42:39,10:34:59,0:52:20,52.333333,4,9
15887,15892.0,1719,2022-02-25,BT02,BT01,2,12:20:33,13:15:43,0:55:10,55.166667,4,12
15888,15893.0,1719,2022-02-25,BT01,BT02,1,13:41:19,14:38:03,0:56:44,56.733333,4,13
15889,15894.0,1719,2022-02-25,BT02,BT01,2,16:00:04,16:48:05,0:48:01,48.016667,4,16


In [17]:
def map_visualization(gps_data,city_location,bus_terminals,bus_terminals_buffer):
    
  """
    Using a  GPS data visualization package of Folium, project the coordinates on 
    Open Street Map (OSM) to explore how the records are spread and to gain some insights and overview.

    Args:
        gps_data (pd.DataFrame): GPS data with selected device ID.
        city_location (arr): Longtitude and lattitude of a city
        bus_terminals (GeoDataFrame) : Bus terminal data with geometry column
        bus_terminals_buffer (GeoDataFrame) :  Bus terminal data with geometry column buffer range      
    
    Returns:
        map (MapObject): A visualizable Map Object.
  """
    
  gps_data = gpd.GeoDataFrame(gps_data, geometry=gpd.points_from_xy(gps_data.longitude,gps_data.latitude),crs='EPSG:4326')  #converting to GeoDataframe with Coordinate Reference system 4326
  map =  folium.Map(location=city_location, tiles='openstreetmap', zoom_start=14)
  for idx, row in gps_data.iterrows():
    Marker([row['latitude'], row['longitude']]).add_to(map)
  
  # bus_terminals = gpd.GeoDataFrame(bus_terminals, geometry=gpd.points_from_xy(bus_terminals.longitude,bus_terminals.latitude),crs='EPSG:4326')
  # for idx, row in bus_terminals.iterrows():
  #   Marker([row['latitude'], row['longitude']]).add_to(map)

  folium.GeoJson(bus_terminals_buffer.to_crs(epsg=4326)).add_to(map)
  map
  return map



bus_terminals = gpd.GeoDataFrame(bus_terminals, geometry=gpd.points_from_xy(bus_terminals.longitude,bus_terminals.latitude),crs='EPSG:4326') 
bus_terminals = bus_terminals.to_crs('EPSG:5234')
bus_terminals_buffer = gpd.GeoDataFrame(bus_terminals, geometry = bus_terminals.geometry.buffer(end_buffer))

gps_data['deviceid'].value_counts()

data84 = gps_data[gps_data['deviceid']==10]

city_location = [7.2906,80.6337]  #Kandy city location
map = map_visualization(data84,city_location,bus_terminals,bus_terminals_buffer)

map


In [18]:
gps_data

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time
281253,574073556,116,2021-10-15 11:37:14,7.293917,80.736137,0.0,2021-10-15,11:37:14
281400,574078368,116,2021-10-15 11:39:26,7.294845,80.735427,0.0,2021-10-15,11:39:26
281492,574082837,116,2021-10-15 11:39:41,7.294825,80.735470,0.0,2021-10-15,11:39:41
281493,574082838,116,2021-10-15 11:39:56,7.294817,80.735472,0.0,2021-10-15,11:39:56
281494,574082840,116,2021-10-15 11:40:11,7.294813,80.735470,0.0,2021-10-15,11:40:11
...,...,...,...,...,...,...,...,...
6656384,912468415,1719,2022-03-01 23:59:14,7.263890,80.700732,0.0,2022-03-01,23:59:14
6656386,912468524,1719,2022-03-01 23:59:24,7.263890,80.700732,0.0,2022-03-01,23:59:24
6656388,912468599,1719,2022-03-01 23:59:34,7.263890,80.700732,0.0,2022-03-01,23:59:34
6656390,912468664,1719,2022-03-01 23:59:44,7.263890,80.700732,0.0,2022-03-01,23:59:44
