In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import multiprocessing as mp



import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


# Add Geometry Function

In [2]:
def addGeometryIdToDataFrame(df, gdf, xcol, ycol, idColumn="geometry", df_geom='epsg:32610'): 
    gdf.crs = {'init': 'epsg:4326'}
    gdf_data = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[xcol], df[ycol]))
    gdf_data.crs = {'init': df_geom}
    joined = gpd.sjoin(gdf_data.to_crs('epsg:26910'), gdf.to_crs('epsg:26910'))
    gdf_data = gdf_data.merge(joined['blkgrpid'], left_index=True, right_index=True, how="left")
    gdf_data.rename(columns={'blkgrpid': idColumn}, inplace=True)
    df = pd.DataFrame(gdf_data.drop(columns='geometry'))
    # df.drop(columns=[xcol, ycol], inplace=True)

    return df.loc[~df.index.duplicated(keep='first'), :]

# Inputs

In [3]:
block_info = pd.read_csv('https://github.com/LBNL-UCB-STI/beam-core-analysis/raw/main/Users/Nazanin/JoeFish_BlockGroup_Labels/bg_w_geog_labels.csv')
BGs = gpd.read_file('inputs/641aa0d4-ce5b-4a81-9c30-8790c4ab8cfb202047-1-wkkklf.j5ouj.shp')
geoAggregationType = 'BlockGroup' # bgid - tractid - juris_name - county_name - mpo
nrows_links = None #10000
nrows_events = None #6000000
# BGs = gpd.read_file('https://github.com/LBNL-UCB-STI/beam-core-analysis/raw/main/Users/Nazanin/Shapefile2010/641aa0d4-ce5b-4a81-9c30-8790c4ab8cfb202047-1-wkkklf.j5ouj.shp')

# ******LINK-BASED Simulation Outputs******


In [4]:
linkResults = pd.read_csv('gs://beam-core-outputs/sfbay-baseline-20230526/beam/year-2020-iteration-4/ITERS/it.0/0.linkstats.csv.gz',nrows = nrows_links)

In [5]:
# linkResults['averageSpeed'] = linkResults['traveltime']/1000/1.609/(linkResults['length']/3600)
# linkResults['congestionLevel'] = linkResults['volume']/linkResults['capacity']
# linkResults['speedVsFreeflow'] = linkResults['avergaeSpeed']/linkResults['freespeed']

In [6]:
network = pd.read_csv('gs://beam-core-outputs/sfbay-baseline-20230526/beam/year-2020-iteration-4/network.csv.gz', usecols = ['linkId','fromLocationX','toLocationX','fromLocationY','toLocationY'])
# We can add Number of Lanes, Road Yype and Allowed Modes

In [7]:
#Add link baricenter X and Y
network['X'] = (network['fromLocationX']+network['toLocationX'])/2
network['Y'] = (network['fromLocationY']+network['toLocationY'])/2
#Add geometry info
network = addGeometryIdToDataFrame(network, BGs, 'X', 'Y', 'BlockGroup')
#Add additional geographical info
network['BlockGroup'] = network['BlockGroup'].fillna(0).astype(int)
network = pd.merge(network, block_info,  how='left',  left_on = 'BlockGroup',right_on = 'bgid')
#Merge Linkstats with Link attributes 
linkResults = pd.merge(linkResults, network,  how='left',  left_on = 'link',right_on = 'linkId')

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


# ******AGGREGATE Link-Based Results PER ZONE******


In [8]:
# #AGGREGATE LINK results by geography
# linkResultsAggregated = linkResults.groupby([geoAggregationType, 'hour']).agg(
#     total_flow=('volume', 'sum'),
#     total_vmt=('volume', lambda x: (x * linkResults['length']).sum()),
#     total_vmo=('capacity', lambda x: (x * linkResults['length']).sum()),
#     total_vht=('volume', lambda x: (x * linkResults['traveltime']).sum()),
#     total_flow_truck=('TruckVolume', 'sum'),
#     total_vmt_truck=('TruckVolume', lambda x: (x * linkResults['length']).sum()),
#     total_flow_HDtruck=('HDTruckVolume', 'sum'),
#     total_vmt_HDtruck=('HDTruckVolume', lambda x: (x * linkResults['length']).sum()),
# ).reset_index()
# linkResultsAggregated['total_vmt'] = linkResultsAggregated['total_vmt']/1000/1.609
# linkResultsAggregated['total_vmo'] = linkResultsAggregated['total_vmo']/1000/1.609
# linkResultsAggregated['total_vht'] = linkResultsAggregated['total_vht']/3600

# linkResultsAggregated['average_speed_mph'] = linkResultsAggregated['total_vmt']/linkResultsAggregated['total_vht']
# linkResultsAggregated['level_congestion_percentage'] = linkResultsAggregated['total_vmt']/linkResultsAggregated['total_vmo']*100



In [9]:
import pandas as pd

# Optimize data types
linkResults[geoAggregationType] = linkResults[geoAggregationType].astype('category')
linkResults['hour'] = linkResults['hour'].astype('category')

# Pre-calculate values to be used in aggregation
linkResults['volume_length'] = linkResults['volume'] * linkResults['length']
linkResults['capacity_length'] = linkResults['capacity'] * linkResults['length']
linkResults['TruckVolume_length'] = linkResults['TruckVolume'] * linkResults['length']
linkResults['HDTruckVolume_length'] = linkResults['HDTruckVolume'] * linkResults['length']
linkResults['volume_traveltime'] = linkResults['volume'] * linkResults['traveltime']

# Aggregate
linkResultsAggregated = linkResults.groupby([geoAggregationType, 'hour']).agg(
    total_flow=('volume', 'sum'),
    total_vmt=('volume_length', 'sum'),
    total_vmo=('capacity_length', 'sum'),
    total_vht=('volume_traveltime', 'sum'),
    total_flow_truck=('TruckVolume', 'sum'),
    total_vmt_truck=('TruckVolume_length', 'sum'),
    total_flow_HDtruck=('HDTruckVolume', 'sum'),
    total_vmt_HDtruck=('HDTruckVolume_length', 'sum'),
).reset_index()

# Post-aggregation calculations
conversion_factor = 1000 / 1.609
linkResultsAggregated['total_vmt'] /= conversion_factor
linkResultsAggregated['total_vmt_truck'] /= conversion_factor
linkResultsAggregated['total_vmt_HDtruck'] /= conversion_factor
linkResultsAggregated['total_vmo'] /= conversion_factor
linkResultsAggregated['total_vht'] /= 3600

linkResultsAggregated['average_speed_mph'] = linkResultsAggregated['total_vmt'] / linkResultsAggregated['total_vht']
linkResultsAggregated['level_congestion_percentage'] = (linkResultsAggregated['total_vmt'] / linkResultsAggregated['total_vmo']) * 100

# Now linkResultsAggregated contains your aggregated data


In [10]:
#Make some Plots

# import matplotlib.pyplot as plt
# import seaborn as sns

# # Set the aesthetic style of the plots
# sns.set_style("whitegrid")

# # Plotting hourly distributions of some variables
# fig, axes = plt.subplots(3, 1, figsize=(12, 18))

# # Total Flow by Hour
# sns.barplot(x='hour', y='total_flow', data=aggregated_data, ax=axes[0])
# axes[0].set_title('Hourly Distribution of Total Flow')
# axes[0].set_xlabel('Hour of the Day')
# axes[0].set_ylabel('Total Flow')

# # Average Speed by Hour
# sns.barplot(x='hour', y='average_speed', data=aggregated_data, ax=axes[1])
# axes[1].set_title('Hourly Distribution of Average Speed')
# axes[1].set_xlabel('Hour of the Day')
# axes[1].set_ylabel('Average Speed (mph)')

# # Level of Congestion by Hour
# sns.barplot(x='hour', y='level_congestion', data=aggregated_data, ax=axes[2])
# axes[2].set_title('Hourly Distribution of Level of Congestion')
# axes[2].set_xlabel('Hour of the Day')
# axes[2].set_ylabel('Level of Congestion')

# plt.tight_layout()
# plt.show()


# ******VEHICLE-BASED Simulation Outputs******


In [11]:
#Read Vehicle Types
# vTypes = pd.read_csv('gs://beam-core-outputs/sfbay-baseline-20230526/beam/year-2020-iteration-4/vehicles.csv.gz',nrows = None)
vTypes = pd.read_csv('inputs/vehicletypes-baseline.csv',nrows = None)
events = pd.read_csv('gs://beam-core-outputs/sfbay-baseline-20230526/beam/year-2020-iteration-4/ITERS/it.0/0.events.csv.gz',
                     usecols = ['type','vehicle', 'links','mode','vehicleType'],nrows = nrows_events)
#we could add 'departureTime','departTime','linkTravelTime',

  events = pd.read_csv('gs://beam-core-outputs/sfbay-baseline-20230526/beam/year-2020-iteration-4/ITERS/it.0/0.events.csv.gz',


In [12]:
#Dilter per mode
pathTraversal = events[events['type'] == 'PathTraversal']
print(len(pathTraversal),'LEN pathTraversal')
print(pathTraversal['mode'].value_counts())
pathTraversal = pathTraversal.dropna(subset=['links'])
print(len(pathTraversal),'LEN pathTraversal after dropping Nan links')
print(pathTraversal['mode'].value_counts())
pathTraversal = pathTraversal[pathTraversal['mode'].isin(['car','bus','car_hov2','car_hov3'])]
print(len(pathTraversal),'LEN pathTraversal after dropping active modes')
print(pathTraversal['mode'].value_counts())

8965988 LEN pathTraversal
walk         4159618
car          2697477
bus          1286361
car_hov2      405331
car_hov3      287540
tram           54929
bike           40763
cable_car      17836
subway         14358
rail            1702
ferry             73
Name: mode, dtype: int64
6821997 LEN pathTraversal after dropping Nan links
car         2697477
walk        2119938
bus         1270948
car_hov2     405331
car_hov3     287540
bike          40763
Name: mode, dtype: int64
4661296 LEN pathTraversal after dropping active modes
car         2697477
bus         1270948
car_hov2     405331
car_hov3     287540
Name: mode, dtype: int64


In [13]:
pathTraversal['links'] = pathTraversal['links'].str.split(',')

# Guess first and last link for each trip
pathTraversal['firstLink'] = pathTraversal['links'].apply(lambda x: x[0] if x else None)
pathTraversal['lastLink'] = pathTraversal['links'].apply(lambda x: x[-1] if x else None)
depLinkVehicles = pathTraversal.groupby('firstLink')['vehicleType'].agg(list).reset_index()
arrLinkVehicles = pathTraversal.groupby('lastLink')['vehicleType'].agg(list).reset_index()
depLinkVehicles.columns = ['firstLink', 'vehicleType']
depLinkVehicles['firstLink'] = depLinkVehicles['firstLink'].astype(int)
arrLinkVehicles.columns = ['lastLink', 'vehicleType']
arrLinkVehicles['lastLink'] = arrLinkVehicles['lastLink'].astype(int)

In [14]:
# Guess all vehicles traveling through each link
explodedPathTraversal = pathTraversal.explode('links')
linkVehicles = explodedPathTraversal.groupby('links')['vehicleType'].agg(list).reset_index()
linkVehicles.columns = ['link', 'vehicleType']
linkVehicles['link'] = linkVehicles['link'].astype(int)

In [15]:
#Car Buses and Cars and Ridehail - train?
#For each trip, departing locations, crossing locations, and arriving locations by TAZ, BLOCKID, or COUNTY

In [16]:
#Define vehicle park per TAZ, BLOCKID, or COUNTY and per departin, arriving and crossing

# ******Get VEHICLE PARK per Zone******
# Crossing, Departing from and arriving to


In [17]:
linkVehicles = pd.merge(linkVehicles, network, how='left', left_on='link', right_on='linkId')
depLinkVehicles = pd.merge(depLinkVehicles, network, how='left', left_on='firstLink', right_on='linkId')
arrLinkVehicles = pd.merge(arrLinkVehicles, network, how='left', left_on='lastLink', right_on='linkId')

vehicle_types = list(vTypes.vehicleTypeId)

def count_vehicle_types(chunk):
    for v_type in vehicle_types:
        chunk[v_type] = chunk['vehicleType'].apply(lambda x: x.count(v_type))
    return chunk

def process_in_parallel(dataframe, processing_function, num_partitions):
    """
    Process a pandas DataFrame in parallel using a specified function.

    :param dataframe: pandas DataFrame to be processed
    :param processing_function: Function to apply to each chunk of the DataFrame
    :param num_partitions: Number of partitions to divide the DataFrame into
    :return: Processed pandas DataFrame
    """
    chunk_size = len(dataframe) // num_partitions
    chunks = [dataframe.iloc[i:i + chunk_size] for i in range(0, len(dataframe), chunk_size)]

    # Handle any remaining rows
    if len(dataframe) % num_partitions:
        chunks.append(dataframe.iloc[num_partitions * chunk_size:])

    with mp.Pool(num_partitions) as pool:
        result_chunks = pool.map(processing_function, chunks)

    return pd.concat(result_chunks)

num_partitions = mp.cpu_count()

# chunk_size = len(linkVehicles) // num_partitions
# chunks = [linkVehicles.iloc[i:i + chunk_size] for i in range(0, len(linkVehicles), chunk_size)]
# with mp.Pool(num_partitions) as pool:
#     vehiclePark_chunk = pool.map(count_vehicle_types, chunks)
# crossVehiclePark = pd.concat(vehiclePark_chunk)

# chunk_size = len(depLinkVehicles) // num_partitions
# chunks = [depLinkVehicles.iloc[i:i + chunk_size] for i in range(0, len(depLinkVehicles), chunk_size)]
# with mp.Pool(num_partitions) as pool:
#     vehiclePark_chunk = pool.map(count_vehicle_types, chunks)
# depVehiclePark = pd.concat(vehiclePark_chunk)

# chunk_size = len(arrLinkVehicles) // num_partitions
# chunks = [arrLinkVehicles.iloc[i:i + chunk_size] for i in range(0, len(arrLinkVehicles), chunk_size)]
# with mp.Pool(num_partitions) as pool:
#     vehiclePark_chunk = pool.map(count_vehicle_types, chunks)
# arrVehiclePark = pd.concat(vehiclePark_chunk)

# Process linkVehicles DataFrame
crossVehiclePark = process_in_parallel(linkVehicles, count_vehicle_types, num_partitions)

# Process depLinkVehicles DataFrame
depVehiclePark = process_in_parallel(depLinkVehicles, count_vehicle_types, num_partitions)

# Process arrLinkVehicles DataFrame
arrVehiclePark = process_in_parallel(arrLinkVehicles, count_vehicle_types, num_partitions)

crossVehicleParkAggregated = crossVehiclePark.groupby(geoAggregationType).sum()
crossVehicleParkAggregated.reset_index(inplace=True)
depVehicleParkAggregated = depVehiclePark.groupby(geoAggregationType).sum()
depVehicleParkAggregated.reset_index(inplace=True)
arrVehicleParkAggregated = arrVehiclePark.groupby(geoAggregationType).sum()
arrVehicleParkAggregated.reset_index(inplace=True)

  crossVehicleParkAggregated = crossVehiclePark.groupby(geoAggregationType).sum()
  depVehicleParkAggregated = depVehiclePark.groupby(geoAggregationType).sum()
  arrVehicleParkAggregated = arrVehiclePark.groupby(geoAggregationType).sum()


In [18]:
def process_in_parallel(dataframe, processing_function, num_partitions):
    """
    Process a pandas DataFrame in parallel using a specified function.

    :param dataframe: pandas DataFrame to be processed
    :param processing_function: Function to apply to each chunk of the DataFrame
    :param num_partitions: Number of partitions to divide the DataFrame into
    :return: Processed pandas DataFrame
    """
    chunk_size = len(dataframe) // num_partitions
    chunks = [dataframe.iloc[i:i + chunk_size] for i in range(0, len(dataframe), chunk_size)]

    # Handle any remaining rows
    if len(dataframe) % num_partitions:
        chunks.append(dataframe.iloc[num_partitions * chunk_size:])

    with mp.Pool(num_partitions) as pool:
        result_chunks = pool.map(processing_function, chunks)

    return pd.concat(result_chunks)

# Example usage
# num_partitions = number of partitions you want to divide your dataframe into
# count_vehicle_types = your processing function

# Process linkVehicles DataFrame
crossVehiclePark = process_in_parallel(linkVehicles, count_vehicle_types, num_partitions)

# Process depLinkVehicles DataFrame
depVehiclePark = process_in_parallel(depLinkVehicles, count_vehicle_types, num_partitions)

# Process arrLinkVehicles DataFrame
arrVehiclePark = process_in_parallel(arrLinkVehicles, count_vehicle_types, num_partitions)

# ******Save Outputs******


In [19]:
# Aggregated link outputs per zone and hour
linkResultsAggregated.to_csv('outputs/linkResultsAggregated_'+geoAggregationType+'.csv')

In [20]:
# Crossing Vehicle Park per zone: number of vehicles per link traveling through the zone for each vehicle type
crossVehicleParkAggregated = crossVehicleParkAggregated.drop(['link','linkId','fromLocationX','fromLocationY','toLocationX','toLocationY','X','Y',], axis = 1)
crossVehicleParkAggregated.to_csv('outputs/crossVehicleParkAggregated_'+geoAggregationType+'.csv', index = False)

In [21]:
# Departing Vehicle Park per zone: number of vehicles departing from the zone for each vehicle type
depVehicleParkAggregated = depVehicleParkAggregated.drop(['firstLink','linkId','fromLocationX','fromLocationY','toLocationX','toLocationY','X','Y',], axis = 1)
depVehicleParkAggregated.to_csv('outputs/depVehicleParkAggregated_'+geoAggregationType+'.csv', index = False)

In [22]:
# Arriving Vehicle Park per zone: number of vehicles arriving to the zone for each vehicle type
arrVehicleParkAggregated = arrVehicleParkAggregated.drop(['lastLink','linkId','fromLocationX','fromLocationY','toLocationX','toLocationY','X','Y',], axis = 1)
arrVehicleParkAggregated.to_csv('outputs/arrVehicleParkAggregated_'+geoAggregationType+'.csv', index = False)

In [23]:
# Vehicle types and their attributes
vTypes.to_csv('outputs/vTypes_'+geoAggregationType+'.csv')

# ******PLOT******


In [24]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

# linkResultsAggregated['BlockGroup'] = linkResultsAggregated['BlockGroup'].astype(str)
# max_length = linkResultsAggregated['BlockGroup'].str.len().max() + 1
# linkResultsAggregated['BlockGroup'] = linkResultsAggregated['BlockGroup'].str.zfill(max_length)


BGs_with_link_results = BGs.merge(linkResultsAggregated, left_on='blkgrpid', right_on='BlockGroup')

# Plotting
fig, ax = plt.subplots(1, 1, figsize=(20, 20))
BGs_with_link_results = BGs_with_link_results[BGs_with_link_results['hour']==9]
BGs_with_link_results['total_flow/area'] = BGs_with_link_results['total_flow']/BGs_with_link_results['Shape__Are']
BGs_with_link_results = BGs_with_link_results[BGs_with_link_results['total_flow/area']<50000000]
BGs_with_link_results.plot(column='total_flow/area', ax=ax, legend=True, cmap='viridis')
# plt.show()
plt.savefig('outputs/test_fig.png', dpi = 1800)



ValueError: cannot convert float NaN to integer

Error in callback <function _draw_all_if_interactive at 0x7f331d286050> (for post_execute):


ValueError: cannot convert float NaN to integer

ValueError: cannot convert float NaN to integer

<Figure size 2000x2000 with 1 Axes>

# ******Future Possible Directions******


In [None]:
# Split Vehicle Parks per hour to match the hourly link results
# Get deltas from 2 scenarios
# Transfer data to EMFACT to get emissions and then INMAP to get dispersions
# Scale from 10% to 100%