In [1]:
# ! pip install geopandas
!pip install folium




[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime,date
import geopandas as gpd
from geopandas import GeoDataFrame as gdf
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
import glob
import os



In [3]:
# Monthly raw GPS exports to load; un-comment further months to include them.
raw_files = [
    'Raw GPS data Kandy Digana Buses/digana_2021_10.csv',
    # 'Raw GPS data Kandy Digana Buses/digana_2021_11.csv',
    # 'Raw GPS data Kandy Digana Buses/digana_2021_12.csv',
    # 'Raw GPS data Kandy Digana Buses/digana_2022_01.csv',
    # 'Raw GPS data Kandy Digana Buses/digana_2022_02.csv',
    # 'Raw GPS data Kandy Digana Buses/digana_2022_07.csv',
    # 'Raw GPS data Kandy Digana Buses/digana_2022_08.csv',
    # 'Raw GPS data Kandy Digana Buses/digana_2022_09.csv',
    # 'Raw GPS data Kandy Digana Buses/digana_2022_10.csv'
]

# Read every selected CSV into its own DataFrame, in list order
dfs = [pd.read_csv(csv_path) for csv_path in raw_files]

# Stack the per-file frames vertically and renumber the rows from 0
raw_data = pd.concat(dfs, ignore_index=True)

In [4]:
# Load the bus stop / terminal reference table from CSV into `bus_terminals`.
bus_terminals=pd.read_csv("bus_stops_and_terminals_654.csv")

In [5]:
# Display the concatenated raw GPS records (the notebook shows a truncated preview).
raw_data

Unnamed: 0,id,deviceid,servertime,devicetime,fixtime,latitude,longitude,speed,address,routeid
0,560090989,250,2021-10-01 01:00:34,2021-10-01 00:58:59,2021-10-01 00:58:59,7.239907,80.674407,0.0000,,0
1,560090990,250,2021-10-01 01:00:34,2021-10-01 00:43:59,2021-10-01 00:43:59,7.239907,80.674407,0.0000,,0
2,560090991,250,2021-10-01 01:00:34,2021-10-01 00:28:59,2021-10-01 00:28:59,7.239907,80.674407,0.0000,,0
3,560090992,250,2021-10-01 01:00:34,2021-10-01 00:13:59,2021-10-01 00:13:59,7.239907,80.674407,0.0000,,0
4,560090993,250,2021-10-01 01:00:34,2021-09-30 23:58:59,2021-09-30 23:58:59,7.239907,80.674407,0.0000,,0
...,...,...,...,...,...,...,...,...,...,...
662035,590850102,123,2021-11-01 23:10:39,2021-11-01 23:06:53,2021-11-01 23:06:53,7.295320,80.736078,16.1987,,0
662036,590850103,123,2021-11-01 23:10:39,2021-11-01 23:06:38,2021-11-01 23:06:38,7.296228,80.735395,17.8186,,0
662037,590850104,123,2021-11-01 23:10:39,2021-11-01 23:06:23,2021-11-01 23:06:23,7.297178,80.734713,19.9784,,0
662038,590850105,123,2021-11-01 23:10:39,2021-11-01 23:06:08,2021-11-01 23:06:08,7.298578,80.734750,14.0389,,0


In [6]:
# Display the stop / terminal table as loaded from the CSV.
bus_terminals

Unnamed: 0,stop_id,route_id,direction,address,latitude,longitude
0,BT01,654,Kandy-Digana,Kandy,7.292462,80.634978
1,101,654,Kandy-Digana,Wales Park,7.291186,80.637662
2,102,654,Kandy-Digana,Mahamaya,7.28784,80.64584
3,103,654,Kandy-Digana,Lewella junction,7.29443,80.65003
4,104,654,Kandy-Digana,Talwatta,7.286701,80.660336
5,105,654,Kandy-Digana,Tennekumbura Bridge,7.281866,80.66603
6,106,654,Kandy-Digana,Kalapura Junction Busstop,7.27983,80.67621
7,107,654,Kandy-Digana,Nattarampotha Junction Bus Stop,7.279117,80.67945
8,108,654,Kandy-Digana,Kundasale New town,7.2809,80.68416
9,109,654,Kandy-Digana,Warapitiya,7.28149,80.68635


In [7]:
# Build the terminal table from bus_terminals in one chain:
#   1. keep only the rows whose stop_id is BT01 or BT02,
#   2. keep the first row for each stop_id (drop duplicates),
#   3. rename the stop_id column to terminal_id,
#   4. renumber the rows from 0.
bus_terminals = (
    bus_terminals
    .loc[lambda stops: stops['stop_id'].isin(['BT01', 'BT02'])]
    .drop_duplicates(subset=['stop_id'], keep='first')
    .rename(columns={'stop_id': 'terminal_id'})
    .reset_index(drop=True)
)

In [8]:
# Display the terminal table after filtering to BT01 / BT02.
bus_terminals

Unnamed: 0,terminal_id,route_id,direction,address,latitude,longitude
0,BT01,654,Kandy-Digana,Kandy,7.292462,80.634978
1,BT02,654,Kandy-Digana,Digana,7.29896,80.73472


In [9]:
def raw_data_cleaning(raw_data):
    """
    Clean raw GPS records.

    Removes records whose latitude or longitude is exactly zero, derives
    separate 'date' and 'time' columns from 'devicetime', and sorts the
    result by device, date and time. The input frame is not modified.

    Args:
        raw_data (pd.DataFrame): Crude raw GPS data filtered out from the server
            for the required time window. Must contain the columns 'latitude',
            'longitude', 'devicetime' and 'deviceid'.

    Returns:
        gps_data (pd.DataFrame): A cleaned dataframe object of GPS data with the
            added columns 'date' and 'time'. Original index labels are kept.
    """
    # Keep only rows where both coordinates are non-zero.
    has_position = (raw_data['latitude'] != 0) & (raw_data['longitude'] != 0)

    # Work on an explicit copy: adding columns to a filtered slice of the input
    # can raise SettingWithCopyWarning.
    gps_data = raw_data[has_position].copy()

    # Parse the timestamp once and split it into separate date and time columns.
    device_timestamp = pd.to_datetime(gps_data['devicetime'])
    gps_data['date'] = device_timestamp.dt.date
    gps_data['time'] = device_timestamp.dt.time

    # Order records per device chronologically.
    gps_data = gps_data.sort_values(['deviceid', 'date', 'time'])

    return gps_data

# Further columns that could be removed from the dataset (optional step; this list is not applied below)
additional_columns = ['servertime','fixtime','address','routeid']

# Columns that are actually removed after cleaning
drop_columns = ['address','routeid']

# Clean the raw records, then discard the columns listed in drop_columns
gps_data = raw_data_cleaning(raw_data).drop(columns=drop_columns)




In [10]:
# Display the cleaned, sorted GPS records.
gps_data

Unnamed: 0,id,deviceid,servertime,devicetime,fixtime,latitude,longitude,speed,date,time
281253,574073556,116,2021-10-15 11:37:19,2021-10-15 11:37:14,2021-10-15 11:37:14,7.293917,80.736137,0.0,2021-10-15,11:37:14
281400,574078368,116,2021-10-15 11:41:13,2021-10-15 11:39:26,2021-10-15 11:39:26,7.294845,80.735427,0.0,2021-10-15,11:39:26
281492,574082837,116,2021-10-15 11:44:49,2021-10-15 11:39:41,2021-10-15 11:39:41,7.294825,80.735470,0.0,2021-10-15,11:39:41
281493,574082838,116,2021-10-15 11:44:49,2021-10-15 11:39:56,2021-10-15 11:39:56,7.294817,80.735472,0.0,2021-10-15,11:39:56
281494,574082840,116,2021-10-15 11:44:49,2021-10-15 11:40:11,2021-10-15 11:40:11,7.294813,80.735470,0.0,2021-10-15,11:40:11
...,...,...,...,...,...,...,...,...,...,...
608425,587300473,1377,2021-10-29 14:40:38,2021-10-27 19:59:36,2021-10-27 19:59:36,7.263888,80.700603,0.0,2021-10-27,19:59:36
608426,587300474,1377,2021-10-29 14:40:38,2021-10-27 19:59:51,2021-10-27 19:59:51,7.263888,80.700603,0.0,2021-10-27,19:59:51
635647,589032240,1377,2021-10-31 10:06:08,2021-10-27 20:00:06,2021-10-27 20:00:06,7.263888,80.700603,0.0,2021-10-27,20:00:06
556115,585366438,1377,2021-10-27 20:00:12,2021-10-27 20:00:09,2021-10-27 20:00:09,7.263888,80.700603,0.0,2021-10-27,20:00:09


In [20]:
def trip_ends(gps_data, bus_terminals, end_buffer):
    """
    Extract the GPS records that lie inside the terminal buffer zones.

    Each GPS record and each terminal is turned into a point geometry
    (EPSG:4326), re-projected to EPSG:5234, and every record that falls inside
    the circular buffer of a terminal is labelled with that terminal's id.
    Records outside all buffers are discarded. Consecutive retained records
    that share the same terminal and the same date receive a common
    'grouped_ends' number.

    Args:
        gps_data (pd.DataFrame): Cleaned gps data filtered out from the server
            for the required time window. Needs the columns 'longitude',
            'latitude' and 'date'.
        bus_terminals (pd.DataFrame): End and start terminals for the trip.
            Needs the columns 'longitude', 'latitude' and 'terminal_id'.
        end_buffer (int): Radius of the buffer area to represent terminals, in
            the units of the projected CRS (EPSG:5234; presumably metres —
            TODO confirm).

    Returns:
        gpd.GeoDataFrame: The records inside a terminal buffer, with the added
            columns 'geometry', 'bus_stop' (terminal id) and 'grouped_ends'.
            Index labels are the row positions of the input records.
    """
    # Build point geometries from longitude/latitude in EPSG:4326.
    gps_data = gpd.GeoDataFrame(
        gps_data,
        geometry=gpd.points_from_xy(gps_data.longitude, gps_data.latitude),
        crs='EPSG:4326',
    )
    bus_terminals = gpd.GeoDataFrame(
        bus_terminals,
        geometry=gpd.points_from_xy(bus_terminals.longitude, bus_terminals.latitude),
        crs='EPSG:4326',
    )

    # Project both layers to the local coordinate system so the buffer radius
    # is expressed in projected units rather than degrees. The GPS index is
    # renumbered so the returned labels are row positions.
    gps_data = gps_data.to_crs('EPSG:5234').reset_index(drop=True)
    bus_terminals = bus_terminals.to_crs('EPSG:5234')

    # One circular zone per terminal.
    terminal_zones = bus_terminals.geometry.buffer(end_buffer)

    # Label every record with the terminal whose zone contains it. The test is
    # vectorised over all records per terminal instead of looping row by row.
    # If zones overlap, the terminal listed later wins.
    gps_data['bus_stop'] = pd.Series(index=gps_data.index, dtype='object')
    for terminal_id, zone in zip(bus_terminals['terminal_id'], terminal_zones):
        inside_zone = gps_data.geometry.within(zone)
        gps_data.loc[inside_zone, 'bus_stop'] = terminal_id

    # Keep only records inside a terminal zone. Restricting dropna to
    # 'bus_stop' avoids discarding records because of an unrelated missing
    # value; the copy makes the column assignment below warning-free.
    terminal_records = gps_data.dropna(subset=['bus_stop']).copy()

    # Start a new group whenever the terminal or the date differs from the
    # previous retained record.
    # NOTE(review): a change of deviceid alone does not start a new group —
    # confirm whether records of different buses should be separated here.
    starts_new_group = (
        (terminal_records['bus_stop'].shift() != terminal_records['bus_stop'])
        | (terminal_records['date'].shift() != terminal_records['date'])
    )
    terminal_records['grouped_ends'] = starts_new_group.cumsum()

    # TODO: tagging the entry/exit record of each group and assigning trip ids
    # is not implemented in this function yet.

    return terminal_records


# Buffer radius around each terminal, in the units of the projected CRS used
# inside trip_ends (EPSG:5234) — presumably metres; TODO confirm.
end_buffer = 100
# NOTE: this rebinds the name `trip_ends` from the function to its result, so
# the function can no longer be called by that name after this line has run.
trip_ends = trip_ends(gps_data,bus_terminals,end_buffer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [21]:
# Drop every row that still has a missing value in any column.
trip_ends=trip_ends.dropna()

In [22]:
# Display the records retained around the terminals.
trip_ends

Unnamed: 0,id,deviceid,servertime,devicetime,fixtime,latitude,longitude,speed,date,time,geometry,bus_stop,grouped_ends
140,574125244,116,2021-10-15 12:21:04,2021-10-15 12:17:46,2021-10-15 12:17:46,7.298880,80.733898,5.39957,2021-10-15,12:17:46,POINT (195603.023 232960.762),BT02,1
141,574125245,116,2021-10-15 12:21:04,2021-10-15 12:18:01,2021-10-15 12:18:01,7.299080,80.734393,5.93953,2021-10-15,12:18:01,POINT (195657.680 232982.873),BT02,1
142,574125246,116,2021-10-15 12:21:04,2021-10-15 12:18:16,2021-10-15 12:18:16,7.299300,80.735048,6.47948,2021-10-15,12:18:16,POINT (195730.002 233007.196),BT02,1
143,574125247,116,2021-10-15 12:21:04,2021-10-15 12:18:31,2021-10-15 12:18:31,7.299253,80.735265,0.00000,2021-10-15,12:18:31,POINT (195753.928 233002.029),BT02,1
144,574125248,116,2021-10-15 12:21:04,2021-10-15 12:18:46,2021-10-15 12:18:46,7.298937,80.735217,6.47948,2021-10-15,12:18:46,POINT (195748.582 232967.008),BT02,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
651813,585354121,1377,2021-10-27 19:12:40,2021-10-27 19:12:36,2021-10-27 19:12:36,7.292425,80.634905,0.00000,2021-10-27,19:12:36,POINT (184672.603 232249.057),BT01,2085
651814,585355461,1377,2021-10-27 19:15:55,2021-10-27 19:12:51,2021-10-27 19:12:51,7.292425,80.634905,0.00000,2021-10-27,19:12:51,POINT (184672.603 232249.057),BT01,2085
651815,585355462,1377,2021-10-27 19:15:55,2021-10-27 19:13:06,2021-10-27 19:13:06,7.292488,80.635005,1.61987,2021-10-27,19:13:06,POINT (184683.647 232256.064),BT01,2085
651816,585355463,1377,2021-10-27 19:15:55,2021-10-27 19:13:21,2021-10-27 19:13:21,7.292700,80.635043,5.39957,2021-10-27,19:13:21,POINT (184687.883 232279.473),BT01,2085


In [14]:
def download_csv(data, filename):
    """
    Write a DataFrame to a CSV file named '<filename>.csv'.

    The '.csv' suffix is always appended to ``filename`` and the resulting
    path is passed to ``data.to_csv`` as given. The file is written with
    'utf-8-sig' encoding and without the index, and the path is printed
    once the write has finished.

    Args:
        data (pd.DataFrame): DataFrame Object.
        filename (str): Name of the file, without the '.csv' extension.

    Returns:
        None
    """
    file_path = filename + '.csv'
    data.to_csv(file_path, index=False, encoding='utf-8-sig')
    print("CSV file saved at: {}".format(file_path))

# Save the trip_ends table as 'trip_ends.csv'.
download_csv(trip_ends, 'trip_ends')


PermissionError: [Errno 13] Permission denied: 'trip_ends.csv'