**Scope of this notebook:**
- Reads in events and person to vehicles file
- Gets links travelled, time, and num of passengers from events files over to person to vehicles
- Adds population characteristics to dataframe from person and household info
- Labels scenario
- Exports to csv
- You must run it three times for each year: e.g. baseline/flat/income-based for 2020.

In [1]:
scenario = 'baseline' #income-based, flat, baseline
year = '2020'
scenario_year = scenario + "_" + year

Data from paths below is available upon request.

In [2]:
if scenario == 'baseline':
    url_ptv = 'https://storage.googleapis.com/beam-core-outputs/sfbay-baseline3_20240728/postprocessOutputs/year-2020-iteration-3/personToVehicles.csv.gz'
    url_events = 'https://storage.googleapis.com/beam-core-outputs/sfbay-baseline3_20240728/beam/year-2020-iteration-3/ITERS/it.0/0.events.csv.gz'
    url_pop = 'https://storage.googleapis.com/beam-core-outputs/sfbay-baseline3_20240728/activitysim/final_persons.csv'
    url_hh = 'https://storage.googleapis.com/beam-core-outputs/sfbay-baseline3_20240728/activitysim/final_households.csv'
elif scenario == 'income-based':
    url_ptv = 'https://storage.googleapis.com/beam-core-outputs/sfbay-cordon_income_20241023/postprocessOutputs/year-2020-iteration-3/personToVehicles.csv.gz'
    url_events = 'https://storage.googleapis.com/beam-core-outputs/sfbay-cordon_income_20241023/beam/year-2020-iteration-3/ITERS/it.0/0.events.csv.gz'
    url_pop = 'https://storage.googleapis.com/beam-core-outputs/sfbay-cordon_income_20241023/activitysim/final_persons.csv'
    url_hh = 'https://storage.googleapis.com/beam-core-outputs/sfbay-cordon_income_20241023/activitysim/final_households.csv'
elif scenario == 'flat':
    url_ptv = 'https://storage.googleapis.com/beam-core-outputs/sfbay-cordon_flatrate_20241023/postprocessOutputs/year-2020-iteration-3/personToVehicles.csv.gz'
    url_events = 'https://storage.googleapis.com/beam-core-outputs/sfbay-cordon_flatrate_20241023/beam/year-2020-iteration-3/ITERS/it.0/0.events.csv.gz'
    url_pop = 'https://storage.googleapis.com/beam-core-outputs/sfbay-cordon_flatrate_20241023/activitysim/final_persons.csv'
    url_hh = 'https://storage.googleapis.com/beam-core-outputs/sfbay-cordon_flatrate_20241023/activitysim/final_households.csv'

In [3]:
import gzip
import pandas as pd
import requests
from io import BytesIO

In [4]:
import psutil
from IPython import get_ipython

def get_memory_limit():
    ip = get_ipython()
    if ip is not None:
        config = ip.config
        if 'ResourceUseDisplay' in config:
            return config['ResourceUseDisplay'].get('mem_limit', 'Not set')
    return 'Not available'

print(f"Current Jupyter memory limit: {get_memory_limit()}")
print(f"Total system memory: {psutil.virtual_memory().total / (1024**3):.2f} GB")
print(f"Available memory: {psutil.virtual_memory().available / (1024**3):.2f} GB")

Current Jupyter memory limit: Not available
Total system memory: 16.00 GB
Available memory: 12.27 GB


In [5]:
import os
from IPython.core.magic import register_cell_magic

@register_cell_magic
def beep(line, cell):
    get_ipython().run_cell(cell)
    os.system("echo -n '\a'")
    
print("Beep cell magic registered. Use %%beep at the start of a cell to beep when it finishes.")

Beep cell magic registered. Use %%beep at the start of a cell to beep when it finishes.


**1. Load person-to-vehicles**

In [6]:
# URL of the public file
url = url_ptv

try:
    # Download the file
    response = requests.get(url)
    response.raise_for_status()  # Raise an exception for bad status codes
    
    # Read the content directly as CSV without gzip decompression
    df_base_ptv = pd.read_csv(BytesIO(response.content))
    
    # Display the first few rows of the DataFrame
    print(df_base_ptv.head())
    
    # Print additional information about the DataFrame
    print(f"\nDataFrame shape: {df_base_ptv.shape}")
    print(f"DataFrame columns: {df_base_ptv.columns.tolist()}")

except requests.exceptions.RequestException as e:
    print(f"Error downloading file: {e}")
except pd.errors.EmptyDataError:
    print("The CSV file is empty")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


  df_base_ptv = pd.read_csv(BytesIO(response.content))


   Unnamed: 0 vehicleID  pathTraversalID  personID  planIndex mode     length  \
0           0         0         23086219   3285289          1  car   3963.716   
1           1         0         23159166   3285289          1  car   1264.194   
2           2         0         24444032   3285289          3  car   4801.618   
3           3         0         24463377   3285289          3  car    257.111   
4           4      1000          9060446   2416480          1  car  24174.065   

   duration primaryFuelType   primaryFuel  secondaryFuel vehicle2  
0       204        Gasoline  8.780682e+06            0.0        0  
1        94        Gasoline  3.179390e+06            0.0        0  
2       294        Gasoline  1.157165e+07            0.0        0  
3        23        Gasoline  7.953087e+05            0.0        0  
4      1024        Gasoline  5.071594e+07            0.0      100  

DataFrame shape: (7145898, 12)
DataFrame columns: ['Unnamed: 0', 'vehicleID', 'pathTraversalID', 'person

**2. Load events file**

In [7]:
%%beep
# URL of the public file
url = url_events

try:
    # Download the file
    response = requests.get(url)
    response.raise_for_status()  # Raise an exception for bad status codes
    
    # Decompress the gzip content and read as CSV
    with gzip.GzipFile(fileobj=BytesIO(response.content)) as gzipped_file:
        df_base = pd.read_csv(gzipped_file, low_memory=False)
    
    # Display the first few rows of the DataFrame
    print(df_base.head())
    
    # Print additional information about the DataFrame
    print(f"\nDataFrame shape: {df_base.shape}")
    print(f"DataFrame columns: {df_base.columns.tolist()}")

except requests.exceptions.RequestException as e:
    print(f"Error downloading file: {e}")
except gzip.BadGzipFile:
    print("The file is not in gzip format.")
except pd.errors.EmptyDataError:
    print("The CSV file is empty")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

  currentTourMode  endY  endX  startY  startX  arrivalTime  departureTime  \
0             NaN   NaN   NaN     NaN     NaN          NaN            NaN   
1             NaN   NaN   NaN     NaN     NaN          NaN            NaN   
2             NaN   NaN   NaN     NaN     NaN          NaN            NaN   
3             NaN   NaN   NaN     NaN     NaN          NaN            NaN   
4             NaN   NaN   NaN     NaN     NaN          NaN            NaN   

  person  time          type  ...  personalVehicleAvailable tourIndex  \
0    NaN   0.0  ParkingEvent  ...                       NaN       NaN   
1    NaN   0.0  ParkingEvent  ...                       NaN       NaN   
2    NaN   0.0  ParkingEvent  ...                       NaN       NaN   
3    NaN   0.0  ParkingEvent  ...                       NaN       NaN   
4    NaN   0.0  ParkingEvent  ...                       NaN       NaN   

  legModes legVehicleIds currentActivity  nextActivity  shiftStatus  \
0      NaN           NaN   

In [8]:
# Bring links from events into PTV:
df_base_ptv['links_ptv'] = df_base.loc[df_base_ptv['pathTraversalID'], 'links'].values
df_base_ptv['numPassengers_ptv'] = df_base.loc[df_base_ptv['pathTraversalID'], 'numPassengers'].values
df_base_ptv['time_ptv'] = df_base.loc[df_base_ptv['pathTraversalID'], 'time'].values

In [9]:
df_base_ptv.head()

Unnamed: 0.1,Unnamed: 0,vehicleID,pathTraversalID,personID,planIndex,mode,length,duration,primaryFuelType,primaryFuel,secondaryFuel,vehicle2,links_ptv,numPassengers_ptv,time_ptv
0,0,0,23086219,3285289,1,car,3963.716,204,Gasoline,8780682.0,0.0,0,"137340,137344,93630,136386,136390,136394,13641...",0.0,58484.0
1,1,0,23159166,3285289,1,car,1264.194,94,Gasoline,3179390.0,0.0,0,"138362,138394,138390,146606,138398,24256,14462...",0.0,58578.0
2,2,0,24444032,3285289,3,car,4801.618,294,Gasoline,11571650.0,0.0,0,"24248,140429,140425,142407,138381,24251,28524,...",0.0,60455.0
3,3,0,24463377,3285289,3,car,257.111,23,Gasoline,795308.7,0.0,0,137345137341,0.0,60478.0
4,4,1000,9060446,2416480,1,car,24174.065,1024,Gasoline,50715940.0,0.0,100,"17497,69706,39,14480,14477,41104,74660,64935,6...",0.0,34532.0


In [10]:
del df_base

In [11]:
%%beep
# Assuming you have a DataFrame called 'df'
df_base_ptv.to_csv(f'{scenario_year}_ptv_links.csv', index=False)

-n 


Run below if file has already been enriched (ptv and events)

In [6]:
df_base_ptv=pd.read_csv(f'{scenario_year}_ptv_links.csv', low_memory=False)

df_base_ptv.head()

Unnamed: 0.1,Unnamed: 0,vehicleID,pathTraversalID,personID,planIndex,mode,length,duration,primaryFuelType,primaryFuel,secondaryFuel,vehicle2,links_ptv,numPassengers_ptv,time_ptv
0,0,0,23086219,3285289,1,car,3963.716,204,Gasoline,8780682.0,0.0,0,"137340,137344,93630,136386,136390,136394,13641...",0.0,58484.0
1,1,0,23159166,3285289,1,car,1264.194,94,Gasoline,3179390.0,0.0,0,"138362,138394,138390,146606,138398,24256,14462...",0.0,58578.0
2,2,0,24444032,3285289,3,car,4801.618,294,Gasoline,11571650.0,0.0,0,"24248,140429,140425,142407,138381,24251,28524,...",0.0,60455.0
3,3,0,24463377,3285289,3,car,257.111,23,Gasoline,795308.7,0.0,0,137345137341,0.0,60478.0
4,4,1000,9060446,2416480,1,car,24174.065,1024,Gasoline,50715940.0,0.0,100,"17497,69706,39,14480,14477,41104,74660,64935,6...",0.0,34532.0


**Read in population**

In [7]:
# URL of the public file
url = url_pop
try:
    # Download the file
    response = requests.get(url)
    response.raise_for_status()  # Raise an exception for bad status codes
    
    # Read the content directly as CSV without gzip decompression
    df_pop = pd.read_csv(BytesIO(response.content))
    
    # Display the first few rows of the DataFrame
    print(df_pop.head())
    
    # Print additional information about the DataFrame
    print(f"\nDataFrame shape: {df_pop.shape}")
    print(f"DataFrame columns: {df_pop.columns.tolist()}")

except requests.exceptions.RequestException as e:
    print(f"Error downloading file: {e}")
except pd.errors.EmptyDataError:
    print("The CSV file is empty")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

   person_id  age   earning   edu  hispanic  hours  PNUM  race_id  relate  \
0       2630   67     860.0  21.0         0   40.0     1        2       0   
1       4553   33   78000.0  22.0         0   45.0     1        1       0   
2       4554   33   30000.0  22.0         0   40.0     2        1       1   
3       6571   68       0.0  22.0         0    0.0     1        2       0   
4       7281   50  101000.0  22.0         0   40.0     1        2       0   

   sex  ...  num_joint_tours  non_mandatory_tour_frequency  num_non_mand  \
0    1  ...                0                            16             1   
1    2  ...                0                             0             0   
2    1  ...                0                             0             0   
3    2  ...                0                             8             1   
4    1  ...                0                             0             0   

   num_escort_tours num_eatout_tours num_shop_tours num_maint_tours  \
0        

In [8]:
print(f"DataFrame columns: {df_base_ptv.columns.tolist()}")

DataFrame columns: ['Unnamed: 0', 'vehicleID', 'pathTraversalID', 'personID', 'planIndex', 'mode', 'length', 'duration', 'primaryFuelType', 'primaryFuel', 'secondaryFuel', 'vehicle2', 'links_ptv', 'numPassengers_ptv', 'time_ptv']


In [9]:
def count_matching_values(df1, df2, column1, column2):
    # Convert the specified columns to sets
    set1 = set(df1[column1])
    set2 = set(df2[column2])
    
    # Find the intersection of the two sets
    matching_values = set1.intersection(set2)
    
    # Return the count of matching values
    return len(matching_values)

# Example usage:
# df1 = pd.DataFrame({'A': [1, 2, 3, 4, 5]})
# df2 = pd.DataFrame({'B': [3, 4, 5, 6, 7]})
# result = count_matching_values(df1, df2, 'A', 'B')
# print(f"Number of matching values: {result}")

In [10]:
count_matching_values(df_base_ptv,df_pop,'personID','person_id')

551654

In [11]:
# Define the columns we want to copy from df_pop
columns_to_copy = [
    'person_id',  # This will be our join key
    'household_id'
]

# Create a subset of df_pop with only the columns we need
df_pop_subset = df_pop[columns_to_copy]

# Convert ID columns to strings
df_pop_subset['person_id'] = df_pop_subset['person_id'].astype(str)
df_base_ptv['personID'] = df_base_ptv['personID'].astype(str)

# Merge this subset with df_base_ptv
df_ptv_enriched = pd.merge(
    df_base_ptv,
    df_pop_subset,
    left_on='personID',
    right_on='person_id',
    how='left'
)

# Remove the redundant 'person_id' column
df_ptv_enriched = df_ptv_enriched.drop(columns=['person_id'])

# Print the first few rows to verify the result
print(df_ptv_enriched[['personID', 'household_id']].head())

# Print summary statistics
total_rows = len(df_ptv_enriched)
matched_rows = df_ptv_enriched['household_id'].notna().sum()
print(f"Matched {matched_rows} out of {total_rows} rows")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pop_subset['person_id'] = df_pop_subset['person_id'].astype(str)


  personID  household_id
0  3285289       1267147
1  3285289       1267147
2  3285289       1267147
3  3285289       1267147
4  2416480        908385
Matched 7145898 out of 7145898 rows


In [12]:
df_ptv_enriched.head()

Unnamed: 0.1,Unnamed: 0,vehicleID,pathTraversalID,personID,planIndex,mode,length,duration,primaryFuelType,primaryFuel,secondaryFuel,vehicle2,links_ptv,numPassengers_ptv,time_ptv,household_id
0,0,0,23086219,3285289,1,car,3963.716,204,Gasoline,8780682.0,0.0,0,"137340,137344,93630,136386,136390,136394,13641...",0.0,58484.0,1267147
1,1,0,23159166,3285289,1,car,1264.194,94,Gasoline,3179390.0,0.0,0,"138362,138394,138390,146606,138398,24256,14462...",0.0,58578.0,1267147
2,2,0,24444032,3285289,3,car,4801.618,294,Gasoline,11571650.0,0.0,0,"24248,140429,140425,142407,138381,24251,28524,...",0.0,60455.0,1267147
3,3,0,24463377,3285289,3,car,257.111,23,Gasoline,795308.7,0.0,0,137345137341,0.0,60478.0,1267147
4,4,1000,9060446,2416480,1,car,24174.065,1024,Gasoline,50715940.0,0.0,100,"17497,69706,39,14480,14477,41104,74660,64935,6...",0.0,34532.0,908385


**Read in households**

In [13]:
# URL of the public file
url = url_hh
try:
    # Download the file
    response = requests.get(url)
    response.raise_for_status()  # Raise an exception for bad status codes
    
    # Read the content directly as CSV without gzip decompression
    df_hh = pd.read_csv(BytesIO(response.content))
    
    # Display the first few rows of the DataFrame
    print(df_hh.head())
    
    # Print additional information about the DataFrame
    print(f"\nDataFrame shape: {df_hh.shape}")
    print(f"DataFrame columns: {df_hh.columns.tolist()}")

except requests.exceptions.RequestException as e:
    print(f"Error downloading file: {e}")
except pd.errors.EmptyDataError:
    print("The CSV file is empty")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


   household_id       serialno  VEHICL    income  race_of_head  age_of_head  \
0       1230947  2009000507504     0.0   10000.0             2           74   
1       2186170  2013000629215     1.0  162600.0             1           29   
2         98044  2009000696712     2.0   55500.0             9           59   
3       1930944  2010001160546     1.0   40800.0             8           39   
4       2480907  2011001446617     2.0   73700.0             1           63   

   num_workers  hispanic_status_of_head  tenure  recent_mover  ...  \
0          0.0                        0       2             0  ...   
1          1.0                        0       2             0  ...   
2          2.0                        1       1             0  ...   
3          2.0                        1       2             0  ...   
4          1.0                        0       1             0  ...   

  hh_work_auto_savings_ratio num_under16_not_at_school num_travel_active  \
0                   0.000000

In [14]:
# Define the columns we want to copy from df_hh
columns_to_copy = [ # This will be our join key
    'household_id',
    'income_in_thousands',
    'hhsize'
]

# Create a subset of df_pop with only the columns we need
df_hh_subset = df_hh[columns_to_copy]

# Convert ID columns to strings
df_hh_subset['household_id'] = df_hh_subset['household_id'].astype(str)
df_ptv_enriched['household_id'] = df_ptv_enriched['household_id'].astype(str)

# Merge this subset with df_base_ptv
df_ptv_enriched = pd.merge(
    df_ptv_enriched,
    df_hh_subset,
    left_on='household_id',
    right_on='household_id',
    how='left'
)


df_ptv_enriched.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hh_subset['household_id'] = df_hh_subset['household_id'].astype(str)


Unnamed: 0.1,Unnamed: 0,vehicleID,pathTraversalID,personID,planIndex,mode,length,duration,primaryFuelType,primaryFuel,secondaryFuel,vehicle2,links_ptv,numPassengers_ptv,time_ptv,household_id,income_in_thousands,hhsize
0,0,0,23086219,3285289,1,car,3963.716,204,Gasoline,8780682.0,0.0,0,"137340,137344,93630,136386,136390,136394,13641...",0.0,58484.0,1267147,40.0,1
1,1,0,23159166,3285289,1,car,1264.194,94,Gasoline,3179390.0,0.0,0,"138362,138394,138390,146606,138398,24256,14462...",0.0,58578.0,1267147,40.0,1
2,2,0,24444032,3285289,3,car,4801.618,294,Gasoline,11571650.0,0.0,0,"24248,140429,140425,142407,138381,24251,28524,...",0.0,60455.0,1267147,40.0,1
3,3,0,24463377,3285289,3,car,257.111,23,Gasoline,795308.7,0.0,0,137345137341,0.0,60478.0,1267147,40.0,1
4,4,1000,9060446,2416480,1,car,24174.065,1024,Gasoline,50715940.0,0.0,100,"17497,69706,39,14480,14477,41104,74660,64935,6...",0.0,34532.0,908385,35.5,2


In [15]:
df_ptv_enriched['scenario'] = scenario
df_ptv_enriched['year'] = year

In [16]:
%%beep

df_ptv_enriched.to_csv(f'{scenario_year}_ptv_enriched.csv')

-n 
