In [None]:
import numpy as np
import pandas as pd
import os
from data.buses import LTA
from data_proc import *
import plotly

In [None]:
# API key handed to the LTA client below. It is read from the LTA_API_KEY
# environment variable so the secret never has to be typed into (or committed
# with) the notebook; when the variable is unset the value is "" as before.
apiKey: str = os.environ.get("LTA_API_KEY", "")

def to_df(data):
    """Build a DataFrame from the 'value' entry of a response mapping.

    Parameters
    ----------
    data : mapping
        Must contain the key 'value'; its content is handed unchanged to
        ``pd.DataFrame.from_dict``.

    Returns
    -------
    pandas.DataFrame
        The frame built from ``data['value']``.

    Raises
    ------
    KeyError
        If ``data`` has no 'value' key.
    """
    records = data['value']
    return pd.DataFrame.from_dict(records)

# Client object (class LTA imported from data.buses) built with the API key;
# later cells call its get_bus_routes / get_bus_stops / get_bus_stops2 methods.
lta = LTA(apiKey)



In [None]:
# Get df_bus_route
# Fetch the bus-route payload through the LTA client and turn its 'value'
# entry into a DataFrame with to_df. Later cells filter this frame on
# 'ServiceNo', 'Direction' and 'StopSequence' and read 'BusStopCode' and
# 'Distance' from it.
bus_route = lta.get_bus_routes()
df_bus_route = to_df(bus_route)
df_bus_route

In [None]:
# preprocess_df_bus_route_1d: function to preprocess df_bus_route
# Prototype of the one-direction preprocessing: for every service that runs in
# two directions, keep Direction 1 only and reduce it to a single row holding
# the first stop, the last stop and the 'Distance' value of the last stop.
# Input
df = df_bus_route

# Get ServiceNo with two directions (highest 'Direction' value per service is 2)
service_no_with_two_directions = df.groupby('ServiceNo')['Direction'].max()
service_no_with_two_directions = service_no_with_two_directions[service_no_with_two_directions == 2].index

# Filter the original DataFrame to include only ServiceNo with two directions
df = df[df['ServiceNo'].isin(service_no_with_two_directions)]

# Keep Direction == 1 and order every route by StopSequence. The 'last'
# aggregations below take the final row of each group, so without this sort
# the destination would depend on the row order of the input instead of on
# the highest StopSequence.
df = df[df['Direction'] == 1].sort_values(['ServiceNo', 'StopSequence'])

# Get the bus stop code when StopSequence is 1 for each bus
origin_bus_stop = df[df['StopSequence'] == 1].groupby('ServiceNo').agg(
    origin_bus_stop=('BusStopCode', 'first')
)
# Get the bus stop code and distance at the highest StopSequence for each bus
destination_bus_stop = df.groupby('ServiceNo').agg(
    dest_bus_stop=('BusStopCode', 'last'),
    origin_dest_distance=('Distance', 'last')
)

# Merge origin_bus_stop into destination_bus_stop (left join: a service without
# a StopSequence == 1 row keeps its destination and gets a missing origin),
# add the constant Direction column, turn the ServiceNo index into a column
# and put the columns in their final order.
df_bus_route_processed = (
    destination_bus_stop
    .merge(origin_bus_stop, left_index=True, right_index=True, how='left')
    .assign(Direction=1)
    .reset_index()
)[['ServiceNo', 'Direction', 'origin_bus_stop', 'dest_bus_stop', 'origin_dest_distance']]

df_bus_route_processed


In [None]:
# preprocess_df_bus_route_2d: function to preprocess df_bus_route
# Input: df_bus_route. This part only prepares df1 / df2, the per-direction
# route rows of the services that have two directions.

# Services whose highest 'Direction' value is 2
service_no_with_two_directions = (
    df_bus_route.groupby('ServiceNo')['Direction'].max().loc[lambda peak: peak == 2].index
)

# Restrict the route table to those services
df = df_bus_route.loc[df_bus_route['ServiceNo'].isin(service_no_with_two_directions)]

# Split the remaining rows into one frame per direction
df1 = df.loc[df['Direction'].eq(1)]
df2 = df.loc[df['Direction'].eq(2)]

def process_df_by_serviceNo(df):
    """Collapse the route rows of one direction into one row per service.

    Parameters
    ----------
    df : pandas.DataFrame
        Route rows with the columns 'ServiceNo', 'Direction', 'StopSequence',
        'BusStopCode' and 'Distance'. The 'Direction' value of the first row
        is written to every output row, so the frame is expected to hold a
        single direction.

    Returns
    -------
    pandas.DataFrame
        One row per ServiceNo with the columns 'ServiceNo', 'Direction',
        'origin_bus_stop' (stop at StopSequence == 1, missing if there is no
        such row), 'dest_bus_stop' and 'origin_dest_distance' (stop code and
        'Distance' at the highest StopSequence). An empty input yields an
        empty frame with these columns.
    """
    columns = ['ServiceNo', 'Direction', 'origin_bus_stop', 'dest_bus_stop', 'origin_dest_distance']

    # Nothing to aggregate; also avoids indexing into an empty 'Direction' column.
    if df.empty:
        return pd.DataFrame(columns=columns)

    # First value in input order, taken before the rows are re-ordered.
    direction = df['Direction'].iloc[0]

    # 'last' picks the final row of each group, so order every route by
    # StopSequence to make that row the one with the highest StopSequence
    # regardless of how the input happens to be ordered.
    ordered = df.sort_values(['ServiceNo', 'StopSequence'])

    # Get the bus stop code when StopSequence is 1 for each bus
    origin_bus_stop = ordered[ordered['StopSequence'] == 1].groupby('ServiceNo').agg(
        origin_bus_stop=('BusStopCode', 'first')
    )
    # Get the bus stop code and distance at the highest StopSequence for each bus
    destination_bus_stop = ordered.groupby('ServiceNo').agg(
        dest_bus_stop=('BusStopCode', 'last'),
        origin_dest_distance=('Distance', 'last')
    )

    # Left join keeps services that have no StopSequence == 1 row
    result = destination_bus_stop.merge(origin_bus_stop, left_index=True, right_index=True, how='left')
    result['Direction'] = direction

    # ServiceNo becomes a regular column; columns are put in their final order
    return result.reset_index()[columns]

# Reduce each direction to one origin/destination row per service
df1_processed, df2_processed = (process_df_by_serviceNo(part) for part in (df1, df2))

# Stack both directions, order the rows by ServiceNo then Direction,
# and renumber the index from 0
concatenated_df = pd.concat([df1_processed, df2_processed])
sorted_df = concatenated_df.sort_values(by=['ServiceNo', 'Direction']).reset_index(drop=True)

# Text dump of the two per-direction frames followed by the combined one
print(df1_processed, df2_processed, sorted_df, sep='\n')


In [None]:
# Test preprocess_df_bus_route_1d
# The function is not defined in this notebook; presumably it comes from
# `from data_proc import *` and is the packaged form of the one-direction
# prototype cell — TODO confirm.
df_1d = preprocess_df_bus_route_1d(df_bus_route)
df_1d

In [None]:
# Test preprocess_df_bus_route_2d
# The function is not defined in this notebook; presumably it comes from
# `from data_proc import *` and is the packaged form of the two-direction
# prototype cell — TODO confirm. df_2d is the left side of the merges below.
df_2d = preprocess_df_bus_route_2d(df_bus_route)
df_2d

In [None]:
# Each 'ServiceNo' -> min-max 'StopSequence' -> min-max 'Distance'
# Show all route rows of service '10'
df_bus_route.loc[df_bus_route['ServiceNo'].eq('10')]


In [None]:
# 'Direction' == 1 and 'Direction' == 2 have different origin-destination
# distances; the maximum of the two could be taken as well.
# Better to pick only 'Direction' == 1 since it reduces frontend work when
# plotting the bus stops.
df_bus_route.loc[df_bus_route['ServiceNo'].eq('10') & df_bus_route['Direction'].eq(1)]

In [None]:
# Service '10', Direction 2 — start: 16009, end: 75009
df_bus_route.loc[df_bus_route['ServiceNo'].eq('10') & df_bus_route['Direction'].eq(2)]

In [None]:
# Loop service bus route?
# Conclusion: only 'Direction' == 1 matters, together with the maximum
# 'Distance' of that direction.
df_bus_route.loc[df_bus_route['ServiceNo'].eq('101')]

In [None]:
# Bus 100: route rows of Direction 2
df_bus_route.loc[df_bus_route['ServiceNo'].eq('100') & df_bus_route['Direction'].eq(2)]

In [None]:
# Get df_bus_stop

In [None]:
# Load existing zipped data
# The archive is looked up under <current working directory>/data and read
# directly with pandas.
data_folder_name, data_file_name = "data", "origin_destination_bus_202402.zip"
current_folder_path = os.getcwd()
data_file_path = os.path.join(current_folder_path, data_folder_name, data_file_name)
df = pd.read_csv(data_file_path)
df

In [None]:
# Preprocess total-trips df to get monthly origin-dest total trips
# Keep the rows with 'PT_TYPE' == 'BUS'; `df` stays bound to the filtered
# frame, which later cells reuse.
df = df.loc[df['PT_TYPE'].eq('BUS')]

# One row per (YEAR_MONTH, ORIGIN_PT_CODE, DESTINATION_PT_CODE) with TOTAL_TRIPS summed
condensed_df = (
    df.groupby(['YEAR_MONTH', 'ORIGIN_PT_CODE', 'DESTINATION_PT_CODE'])
    .agg(TOTAL_TRIPS=('TOTAL_TRIPS', 'sum'))
    .reset_index()
)

# Display the condensed DataFrame
print(condensed_df)

In [None]:
# Test preprocess_totalTrips_df
# `df` is whatever the preceding cells left bound to that name; when the cells
# are run in order that is the origin-destination frame already filtered to
# 'PT_TYPE' == 'BUS'. The function is not defined in this notebook; presumably
# it comes from `from data_proc import *` — TODO confirm.
df_total_trips = preprocess_totalTrips_df(df)
df_total_trips

In [None]:
# df_2d
# Show the frame returned by preprocess_df_bus_route_2d before it is merged
# with df_total_trips.
df_2d

In [None]:
# Column dtypes of the prototype route frame and of the total-trips frame,
# printed one after the other.
# NOTE(review): the merge cell uses df_2d, not sorted_df — confirm this is the
# frame whose dtypes should be checked.
print(sorted_df.dtypes, df_total_trips.dtypes, sep='\n')

In [None]:
# Merge df_total_trips into df_2d
# Left join: every route row of df_2d is kept; rows without a matching
# (origin, destination) pair in df_total_trips get missing values.
merged_df = pd.merge(
    df_2d,
    df_total_trips,
    how='left',
    left_on=['origin_bus_stop', 'dest_bus_stop'],
    right_on=['ORIGIN_PT_CODE', 'DESTINATION_PT_CODE'],
)
merged_df

In [None]:
# Test merge_distance_totalTrips
# The function is not defined in this notebook; presumably it comes from
# `from data_proc import *` and is the packaged form of the merge prototype
# cell — TODO confirm.
df_distance_totalTrips = merge_distance_totalTrips(df_2d, df_total_trips)
df_distance_totalTrips

In [None]:
# df_bus_stops
# First batch of bus-stop records
bus_stops = lta.get_bus_stops()['value']

# Arguments for the follow-up calls: 500, 1000, ..., 5000. The API hands out
# 500 records per call, so several calls are needed to collect all records.
# NOTE(review): the number of follow-up calls is fixed at ten — confirm this
# still covers every bus stop.
nums = list(range(500, 5001, 500))

for num in nums:
    # get_bus_stops2 is the modified client method that takes the offset argument
    bus_stops2 = lta.get_bus_stops2(num)['value']
    bus_stops += bus_stops2

print(len(bus_stops))
df_bus_stops = pd.DataFrame(bus_stops)
df_bus_stops

In [None]:
# df_taps
# The tap data is read from a local CSV file. An earlier variant of this cell
# fetched it with lta.get_passenger_vol_by_bus_stops('transport_node_bus_202402.zip')
# and to_df instead.

# Path to your CSV file
csv_file_path = 'data/transport_node_bus_202402.csv'

# Read the CSV file into a DataFrame
df_taps = pd.read_csv(csv_file_path)

# Display the first few rows of the DataFrame
df_taps.head()

In [None]:
# Preprocess df_taps
# Keep the rows with 'PT_TYPE' == 'BUS'
df = df_taps.loc[df_taps['PT_TYPE'].eq('BUS')]

# One row per (YEAR_MONTH, PT_CODE) with tap-in and tap-out volumes summed,
# plus their total in TOTAL_TAP_VOLUME
condensed_df = (
    df.groupby(['YEAR_MONTH', 'PT_CODE'])
    .agg(
        TOTAL_TAP_IN_VOLUME=('TOTAL_TAP_IN_VOLUME', 'sum'),
        TOTAL_TAP_OUT_VOLUME=('TOTAL_TAP_OUT_VOLUME', 'sum'),
    )
    .reset_index()
    .assign(TOTAL_TAP_VOLUME=lambda taps: taps['TOTAL_TAP_IN_VOLUME'] + taps['TOTAL_TAP_OUT_VOLUME'])
)

# Display the condensed DataFrame
condensed_df

In [None]:
# Test preprocess_df_taps
# NOTE(review): this rebinds df_taps from the raw CSV frame to the function's
# result, so running the cell a second time passes the already-processed frame
# back into preprocess_df_taps, and the prototype cell that reads df_taps then
# no longer sees the raw data. Re-run the CSV load cell first when re-executing.
# The function is not defined in this notebook; presumably it comes from
# `from data_proc import *` — TODO confirm.
df_taps = preprocess_df_taps(df_taps)
df_taps

In [None]:
# Check very large tap number: rows of stop 75009
df_taps.loc[df_taps['PT_CODE'].eq(75009)]

In [None]:
# Merge df_taps in df_distance_totalTrips
# df_taps is joined twice on (YEAR_MONTH, stop code): once for the origin stop
# and once for the destination stop. The tap columns of the two joins collide
# in the second merge, which is where the '_origin' / '_destination' suffixes
# are applied. Both joins are left joins, so rows without tap data are kept
# with missing values.
merged_df = (
    df_distance_totalTrips
    # Tap volumes of the origin stop
    .merge(df_taps,
           left_on=['YEAR_MONTH', 'ORIGIN_PT_CODE'],
           right_on=['YEAR_MONTH', 'PT_CODE'],
           how='left')
    # Tap volumes of the destination stop
    .merge(df_taps,
           left_on=['YEAR_MONTH', 'DESTINATION_PT_CODE'],
           right_on=['YEAR_MONTH', 'PT_CODE'],
           suffixes=('_origin', '_destination'),
           how='left')
    # Passenger volume = TOTAL_TAP_VOLUME at the origin plus at the destination
    .assign(passenger_volume=lambda merged: merged['TOTAL_TAP_VOLUME_origin']
            + merged['TOTAL_TAP_VOLUME_destination'])
    # Drop unnecessary columns
    .drop(columns=['PT_CODE_origin', 'TOTAL_TAP_IN_VOLUME_origin', 'TOTAL_TAP_OUT_VOLUME_origin',
                   'PT_CODE_destination', 'TOTAL_TAP_IN_VOLUME_destination',
                   'TOTAL_TAP_OUT_VOLUME_destination', 'ORIGIN_PT_CODE', 'DESTINATION_PT_CODE'])
)

merged_df

In [None]:
# Test merge_taps_distance_totalTrips
# The function is not defined in this notebook; presumably it comes from
# `from data_proc import *` and is the packaged form of the taps merge
# prototype cell — TODO confirm. Note the argument order: taps frame first.
df_taps_distance_totalTrips = merge_taps_distance_totalTrips(df_taps, df_distance_totalTrips)
df_taps_distance_totalTrips

In [None]:
# Compute bus/car CO2 emission
# Work on a copy: assigning columns through a plain alias would also add them
# to df_taps_distance_totalTrips, which a later cell passes to get_df_co2.
df = df_taps_distance_totalTrips.copy()
bus_CO2_rate = 0.48  # kg/km
car_CO2_rate = 0.167  # kg/km
bus2car_ratio = 1/4  # approximation: 1/4 passengers taking car and carpool
df['co2_by_bus'] = df['origin_dest_distance']*df['TOTAL_TRIPS']*bus_CO2_rate
# NOTE(review): the car figure multiplies TOTAL_TRIPS by passenger_volume —
# confirm that both factors are intended here.
df['co2_by_car'] = df['origin_dest_distance']*df['TOTAL_TRIPS']*df['passenger_volume']*bus2car_ratio*car_CO2_rate
# Positive values mean the car estimate exceeds the bus estimate
df['co2_reduction'] = df['co2_by_car'] - df['co2_by_bus']
df

In [None]:
# Test df_co2
# The function is not defined in this notebook; presumably it comes from
# `from data_proc import *` and is the packaged form of the CO2 prototype
# cell — TODO confirm. The cells below read df_co2 columns by position.
df_co2 = get_df_co2(df_taps_distance_totalTrips)
df_co2

In [None]:
# Show the three columns the plot cell uses side by side: the first column
# and the third- and second-from-last columns. A notebook cell only displays
# its last expression, so three separate bare expressions would show just one
# of them.
df_co2.iloc[:, [0, -3, -2]]


In [None]:
# demo graph visualization
import plotly.graph_objects as go

# Columns are picked by position: the first column goes on the x axis, the
# second-from-last column is the car series and the third-from-last the bus
# series.
# NOTE(review): presumably these are 'co2_by_car' and 'co2_by_bus' as in the
# CO2 prototype cell — confirm against get_df_co2.
x = df_co2.iloc[:, 0]
y_car = df_co2.iloc[:, -2]
y_bus = df_co2.iloc[:, -3]
fig = go.Figure(data=[
    go.Bar(name="Car", x=x, y=y_car),
    go.Bar(name="Bus", x=x, y=y_bus),
])

# Title and axis labels so the figure can be read on its own; the x label is
# taken from the name of the plotted column.
fig.update_layout(
    title="CO2 emission: car vs bus",
    xaxis_title=str(x.name),
    yaxis_title="CO2 emission (log scale)",
    barmode="group",
)

# Set the y-axis to log scale
fig.update_yaxes(type='log')

# Show the plot
fig.show()

## Checking df_OD_volume

In [None]:
# Load existing zipped data
# Re-reads the origin-destination archive from <current working directory>/data,
# so `df` holds the unfiltered file again for the checks below.
data_folder_name, data_file_name = "data", "origin_destination_bus_202402.zip"
current_folder_path = os.getcwd()
data_file_path = os.path.join(current_folder_path, data_folder_name, data_file_name)
df = pd.read_csv(data_file_path)
df

In [None]:
# Bus 10: rows from stop 75009 to stop 76051
df.loc[df['ORIGIN_PT_CODE'].eq(75009) & df['DESTINATION_PT_CODE'].eq(76051)]

In [None]:
# Find rows where ORIGIN_PT_CODE equals DESTINATION_PT_CODE
# Findings: no origin-destination total trips for loop services
df.loc[df['ORIGIN_PT_CODE'].eq(df['DESTINATION_PT_CODE'])]

In [None]:
# Bus 101: loop service
# Loop-service buses (e.g. 101) may need to be removed from the map, since
# there are no origin-destination total trips for this bus.
origin, destination = 66009, 62189
df.loc[df['ORIGIN_PT_CODE'].eq(origin) & df['DESTINATION_PT_CODE'].eq(destination)]
# Variant that filters on the origin only:
# df[(df['ORIGIN_PT_CODE'] == origin)]

In [None]:
# Bus 100
# Is there no origin-destination total-trips data for this route?
origin, destination = 11009, 66009
df.loc[df['ORIGIN_PT_CODE'].eq(origin) & df['DESTINATION_PT_CODE'].eq(destination)]
# Variant that filters on the origin only:
# df[(df['ORIGIN_PT_CODE'] == origin)]

In [None]:
# Bus 100
# Does Direction 1 have the same total trips as Direction 2?
# Findings: seems not
origin, destination = 66009, 11009
df.loc[df['ORIGIN_PT_CODE'].eq(origin) & df['DESTINATION_PT_CODE'].eq(destination)]
# Variant that filters on the origin only:
# df[(df['ORIGIN_PT_CODE'] == origin)]

In [None]:
# Bus 10: rows from stop 75009 to stop 16009
origin, destination = 75009, 16009
df.loc[df['ORIGIN_PT_CODE'].eq(origin) & df['DESTINATION_PT_CODE'].eq(destination)]

In [None]:
# Reverse pair of the previous check: rows from stop 16009 to stop 75009
origin, destination = 16009, 75009
df.loc[df['ORIGIN_PT_CODE'].eq(origin) & df['DESTINATION_PT_CODE'].eq(destination)]