In [32]:
import pandas as pd
import numpy as np
import glob

#### BEAM 

In [2]:
# S3 base URL of the BEAM outputs (ITERS/it.0) for the sfbay-2010-lessrh run
locSF_rh = (
    "https://beam-outputs.s3.amazonaws.com/pilates-outputs/"
    "sfbay-2010-lessrh-20220223/beam/year-2010-iteration-2/ITERS/it.0/"
)

In [3]:
%%time
# Read the BEAM events file (gzip-compressed CSV) from the S3 location above.
# Explicit dtypes: label-like columns are loaded as category, the id-like
# columns (person, driver, riders) as plain objects.
# Each key appears exactly once ("legMode" used to be listed twice).
dtypes = {
    "time": "float32",
    "type": "category",
    "legMode": "category",
    "actType": "category",
    "reason": "category",
    "primaryFuelLevel": "float64",
    "chargingPointType": "category",
    "pricingModel": "category",
    "parkingType": "category",
    "mode": "category",
    "personalVehicleAvailable": "category",
    "person": "object",
    "driver": "object",
    "riders": "object",
}

# No usecols filter is applied: every column of the file is loaded, and columns
# not listed in `dtypes` get whatever dtype pandas infers.
eventsSF = pd.read_csv(locSF_rh + '0.events.csv.gz', compression='gzip', dtype=dtypes)

  exec(code, glob, local_ns)


Wall time: 2min 59s


In [4]:
# Lift pandas' display limits so neither columns nor rows are truncated in outputs
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [5]:
# Relabel the "mode" column as "modeType" (done on the existing frame, no copy)
eventsSF.columns = ["modeType" if label == "mode" else label for label in eventsSF.columns]

In [6]:
# Replace "Work" with "work" in the actType column.
# The result is assigned back explicitly: replace(..., inplace=True) on the Series
# returned by eventsSF["actType"] is a chained in-place update, which pandas warns
# about and which leaves eventsSF unchanged when copy-on-write is enabled.
eventsSF["actType"] = eventsSF["actType"].replace({"Work": "work"})

In [7]:
# Helper columns from which IDMerged is derived in the next step
personCol = eventsSF['person']
driverCol = eventsSF['driver']

# UniqueID: the person column as-is
eventsSF['UniqueID'] = personCol

# personID: the person value when it also occurs anywhere in the driver column, else NaN
personSeenAsDriver = personCol.isin(driverCol)
eventsSF['personID'] = np.where(personSeenAsDriver, personCol, np.nan)

# driverID: the driver value when it also occurs anywhere in the person column, else NaN
driverSeenAsPerson = driverCol.isin(personCol)
eventsSF['driverID'] = np.where(driverSeenAsPerson, driverCol, np.nan)

In [8]:
# Collapse the helper ids into one column.
# Priority per row: UniqueID first, then personID, then driverID.
personThenDriver = eventsSF['personID'].combine_first(eventsSF['driverID'])
eventsSF['IDMerged'] = eventsSF['UniqueID'].combine_first(personThenDriver)

In [9]:
# The three helper columns are no longer needed once IDMerged exists
eventsSF = eventsSF.drop(columns=['personID', 'driverID', 'UniqueID'])

In [10]:
%%time
# Break the ':'-separated riders string into one column per rider
ridersText = eventsSF['riders']
dfRiders = ridersText.str.split(pat=':', expand=True)

Wall time: 5min 51s


In [11]:
# Column count before the rider columns are appended; used later as the
# positional start of the appended columns
riderStartIndex = eventsSF.shape[1]

In [12]:
# Append the split rider columns to the right of the events frame,
# aligned on the events index
ridersAligned = dfRiders.reindex(index=eventsSF.index)
eventsSF = pd.concat([eventsSF, ridersAligned], axis="columns")

In [13]:
%%time
# Build an 'ID' column holding, per row, the first rider value that also occurs
# in IDMerged; it is merged into IDMerged in the next cell.
NaN = np.nan

# Select the rider columns explicitly. The previous slice
# columns[riderStartIndex:-2] assumed that both 'ID' and 'riderID' had already
# been appended; on a fresh run only 'ID' exists when the slice is evaluated,
# so the last rider column was silently skipped. Filtering by name also keeps
# the cell correct when it is re-run.
riderCols = [col for col in eventsSF.columns[riderStartIndex:] if col not in ('ID', 'riderID')]

# IDMerged is not modified inside the loop, so its distinct values are taken once.
# Missing values are left out: a missing rider can never contribute an id anyway.
knownIDs = eventsSF['IDMerged'].dropna().unique()

eventsSF['ID'] = NaN
for col in riderCols:
    # keep the rider value only when it is a known id
    eventsSF['riderID'] = np.where(eventsSF[col].isin(knownIDs), eventsSF[col], np.nan)
    # earlier rider columns win; later ones only fill remaining gaps
    eventsSF['ID'] = eventsSF['ID'].combine_first(eventsSF['riderID'])

Wall time: 5min 51s


In [14]:
# Rows that still have no IDMerged fall back to the rider-derived ID
mergedIDs = eventsSF['IDMerged']
eventsSF['IDMerged'] = mergedIDs.combine_first(other=eventsSF['ID'])

In [15]:
# Remove every column appended from position riderStartIndex onwards
# (the split rider columns plus the ID helper columns)
appendedCols = eventsSF.columns[riderStartIndex:]
eventsSF = eventsSF.drop(columns=appendedCols)

In [96]:
# Move the 'IDMerged' column to the first position:
# take it out of the frame, then re-insert it at location 0
idMergedCol = eventsSF.pop('IDMerged')
eventsSF.insert(loc=0, column='IDMerged', value=idMergedCol)

In [98]:
%%time
# Keep only the rows whose IDMerged parses as a number, ordered by IDMerged and
# then time, with a fresh 0..n-1 index.
# NOTE(review): IDMerged is only converted to a numeric dtype in a later cell, so
# this sort presumably compares the raw id values - confirm that is acceptable.
hasNumericID = pd.to_numeric(eventsSF['IDMerged'], errors='coerce').notna()
eventsSFnum = (
    eventsSF[hasNumericID]
    .sort_values(by=['IDMerged', 'time'])
    .reset_index(drop=True)
)

Wall time: 2min 1s


In [76]:
# actEndTime: the event time on 'actend' rows, NaN on every other row
isActEnd = eventsSFnum['type'] == 'actend'
eventsSFnum['actEndTime'] = np.where(isActEnd, eventsSFnum['time'], np.nan)

In [77]:
# actStartTime: the event time on 'actstart' rows, NaN on every other row
isActStart = eventsSFnum['type'] == 'actstart'
eventsSFnum['actStartTime'] = np.where(isActStart, eventsSFnum['time'], np.nan)

In [78]:
# PTduration: arrivalTime minus departureTime on 'PathTraversal' rows, NaN elsewhere
isPathTraversal = eventsSFnum['type'] == 'PathTraversal'
legSpan = eventsSFnum['arrivalTime'] - eventsSFnum['departureTime']
eventsSFnum['PTduration'] = np.where(isPathTraversal, legSpan, np.nan)

In [79]:
# duration_walking: PTduration on rows whose modeType is 'walk', NaN elsewhere
isWalk = eventsSFnum['modeType'] == 'walk'
eventsSFnum['duration_walking'] = np.where(isWalk, eventsSFnum['PTduration'], np.nan)

In [80]:
# duration_in_privateCar: PTduration on rows whose modeType is one of the
# private-car labels listed below, NaN elsewhere
privateCarModes = ['car', 'car_hov3', 'car_hov2', 'hov2_teleportation', 'hov3_teleportation']
isPrivateCar = eventsSFnum['modeType'].isin(privateCarModes)
eventsSFnum['duration_in_privateCar'] = np.where(isPrivateCar, eventsSFnum['PTduration'], np.nan)

In [81]:
# duration_on_bike: PTduration on rows whose modeType is 'bike', NaN elsewhere
isBike = eventsSFnum['modeType'] == 'bike'
eventsSFnum['duration_on_bike'] = np.where(isBike, eventsSFnum['PTduration'], np.nan)

In [82]:
# duration_in_ridehail: PTduration on rows whose modeType is 'ride_hail' or
# 'ride_hail_pooled', NaN elsewhere.
# NOTE(review): the head() sample shown further down has 0.0 in this column on
# every row - confirm these labels actually occur on PathTraversal rows.
rideHailModes = ['ride_hail', 'ride_hail_pooled']
isRideHail = eventsSFnum['modeType'].isin(rideHailModes)
eventsSFnum['duration_in_ridehail'] = np.where(isRideHail, eventsSFnum['PTduration'], np.nan)

In [83]:
# duration_in_public: PTduration on rows whose modeType is 'bike_transit',
# 'drive_transit' or 'walk_transit', NaN elsewhere.
# NOTE(review): the head() sample shown further down has 0.0 in this column on
# every row - confirm these labels actually occur on PathTraversal rows.
transitModes = ['bike_transit', 'drive_transit', 'walk_transit']
isTransit = eventsSFnum['modeType'].isin(transitModes)
eventsSFnum['duration_in_public'] = np.where(isTransit, eventsSFnum['PTduration'], np.nan)

In [84]:
# tripIndex: within each IDMerged, rank the rows by tourIndex (ties broken by row
# order); rows without a tourIndex get no rank.
eventsSFnum["tripIndex"] = eventsSFnum.groupby("IDMerged")["tourIndex"].rank(method="first", ascending=True)
# Carry the last rank forward onto the following unranked rows.
# .ffill() replaces the deprecated fillna(method='ffill') spelling; same result.
# NOTE(review): the forward fill is not grouped by IDMerged, so leading unranked
# rows of one id inherit the previous id's last tripIndex - confirm this is intended.
eventsSFnum["tripIndex"] = eventsSFnum["tripIndex"].ffill()

In [85]:
# currentTourModeFillna: currentTourMode with gaps filled from the last preceding
# non-missing row. .ffill() replaces the deprecated fillna(method='ffill') spelling.
# NOTE(review): the fill is not grouped by IDMerged, so values can carry over from
# one id's rows into the next id's leading rows - confirm this is intended.
eventsSFnum["currentTourModeFillna"] = eventsSFnum["currentTourMode"].ffill()

In [86]:
# Convert IDMerged to a numeric dtype
eventsSFnum["IDMerged"] = eventsSFnum["IDMerged"].pipe(pd.to_numeric)

In [87]:
# actEndType: the activity type on 'actend' rows, empty string on every other row
endsActivity = eventsSFnum['type'] == 'actend'
eventsSFnum['actEndType'] = np.where(endsActivity, eventsSFnum['actType'], "")

In [88]:
# actStartType: the activity type on 'actstart' rows, empty string on every other row
startsActivity = eventsSFnum['type'] == 'actstart'
eventsSFnum['actStartType'] = np.where(startsActivity, eventsSFnum['actType'], "")

In [91]:
# Aggregate the event rows to one row per (IDMerged, tripIndex, tour mode).
# Every listed column is summed; for the two text columns the sum concatenates
# the (mostly empty) strings of the group.
# The string "sum" is used instead of the np.sum callable: recent pandas
# deprecates passing np.sum here and recommends the string to keep the same result.
# NOTE(review): a group with no actend/actstart row sums to 0.0 (visible in the
# head() sample below as actEndTime 0.0 with an empty actEndType), which makes the
# DDtime derived later equal to actStartTime for such rows - confirm intended.
eventsSFtrips = pd.pivot_table(
   eventsSFnum,
   index=['IDMerged','tripIndex','currentTourModeFillna'],
   aggfunc={'actStartTime': "sum", 'actEndTime': "sum", 'PTduration': "sum", 'primaryFuel': "sum",
            'netCost': "sum", 'actStartType': "sum", 'actEndType': "sum", 'duration_walking': "sum",
            'duration_in_privateCar': "sum", 'duration_on_bike': "sum", 'duration_in_ridehail': "sum",
            'duration_in_public': "sum"}).reset_index()

In [92]:
# Door-to-door time: summed activity-start time minus summed activity-end time
eventsSFtrips['DDtime'] = eventsSFtrips['actStartTime'].sub(eventsSFtrips['actEndTime'])

In [93]:
# actPurpose: "<actEndType>_to_<actStartType>" label for each aggregated row
endedActivity = eventsSFtrips['actEndType']
startedActivity = eventsSFtrips['actStartType']
eventsSFtrips['actPurpose'] = endedActivity + "_to_" + startedActivity

In [94]:
# Give the forward-filled tour-mode column its final name
eventsSFtrips = eventsSFtrips.rename({'currentTourModeFillna': 'currentTourMode'}, axis='columns')

In [95]:
# Preview the first five aggregated trip rows
eventsSFtrips.head(n=5)

Unnamed: 0,IDMerged,tripIndex,currentTourMode,PTduration,actEndTime,actEndType,actStartTime,actStartType,duration_in_privateCar,duration_in_public,duration_in_ridehail,duration_on_bike,duration_walking,netCost,primaryFuel,DDtime,actPurpose
0,4,1.0,car,477.0,54694.0,Home,55180.0,othdiscr,477.0,0.0,0.0,0.0,0.0,1.258902,17178400.0,486.0,Home_to_othdiscr
1,4,2.0,car,0.0,0.0,,60702.0,Home,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60702.0,_to_Home
2,8,1.0,car,11817.0,71933.0,shopping,83781.0,Home,0.0,0.0,0.0,0.0,11817.0,0.052631,0.0,11848.0,shopping_to_Home
3,8,2.0,car,11510.0,60454.0,Home,71933.0,shopping,31.0,0.0,0.0,0.0,11479.0,0.0,649895.2,11479.0,Home_to_shopping
4,66,1.0,car,102.0,0.0,,0.0,,0.0,0.0,0.0,0.0,102.0,1.210568,6998.915,0.0,_to_


#### ActivitySim

In [108]:
# Base URL (S3) of the ActivitySim outputs for the same run
loc = "https://beam-outputs.s3.amazonaws.com/pilates-outputs/sfbay-2010-lessrh-20220223/activitysim/"

# Load the final ActivitySim tables used below
trips = pd.read_csv(f"{loc}final_trips.csv")
households = pd.read_csv(f"{loc}final_households.csv")
persons = pd.read_csv(f"{loc}final_persons.csv")
tours = pd.read_csv(f"{loc}final_tours.csv")
plans = pd.read_csv(f"{loc}final_plans.csv")
landuse = pd.read_csv(f"{loc}final_land_use.csv")

# Not loaded here: final_checkpoints.csv, final_joint_tour_participants.csv,
# final_trip_mode_choice_specs.csv, final_trip_mode_choice_raw.csv,
# final_trip_mode_choice_utilities.csv

In [60]:
# Merge households onto persons (left join on household_id)
persons = persons.sort_values(by=['household_id'])
households = households.sort_values(by=['household_id'])
hhpersons = pd.merge(left=persons, right=households, how='left', on='household_id', suffixes=('', '_drop'))
# Columns present on both sides come back from the right side with a '_drop'
# suffix; remove exactly those. Matching on the suffix (rather than on "drop"
# anywhere in the name) keeps genuine columns whose name merely contains "drop".
hhpersons = hhpersons.drop(columns=[col for col in hhpersons.columns if str(col).endswith('_drop')])

In [61]:
# Merge the household/person attributes onto tours (left join on person_id)
tours = tours.sort_values(by=['person_id'])
hhpersons = hhpersons.sort_values(by=['person_id'])
hhperTours = pd.merge(left=tours, right=hhpersons, how='left', on='person_id', suffixes=('', '_drop'))
# Columns present on both sides come back from the right side with a '_drop'
# suffix; remove exactly those. Matching on the suffix (rather than on "drop"
# anywhere in the name) keeps genuine columns whose name merely contains "drop".
hhperTours = hhperTours.drop(columns=[col for col in hhperTours.columns if str(col).endswith('_drop')])

In [62]:
%%time
# Merge the tour/household/person attributes onto trips
# (left join on person_id and tour_id)
trips = trips.sort_values(by=['person_id', 'tour_id'])
hhperTours = hhperTours.sort_values(by=['person_id','tour_id'])
tourTripsMerged = pd.merge(left=trips, right=hhperTours, how='left', on=['person_id','tour_id'], suffixes=('', '_drop'))
# Columns present on both sides come back from the right side with a '_drop'
# suffix; remove exactly those. Matching on the suffix (rather than on "drop"
# anywhere in the name) keeps genuine columns whose name merely contains "drop".
tourTripsMerged = tourTripsMerged.drop(columns=[col for col in tourTripsMerged.columns if str(col).endswith('_drop')])

Wall time: 42 s


In [122]:
%%time
# Keep only the 'leg' rows of plans and merge the trip/tour/household/person
# attributes onto them (left join on person_id and trip_id)
plans = plans.sort_values(by=['person_id', 'trip_id']).loc[plans['ActivityElement'] == 'leg']
tourTripsMerged = tourTripsMerged.sort_values(by=['person_id','trip_id'])
plansTripsMerged = pd.merge(left=plans, right=tourTripsMerged, how='left', on=['person_id','trip_id'], suffixes=('', '_drop'))
# Columns present on both sides come back from the right side with a '_drop'
# suffix; remove exactly those. Matching on the suffix (rather than on "drop"
# anywhere in the name) keeps genuine columns whose name merely contains "drop".
plansTripsMerged = plansTripsMerged.drop(columns=[col for col in plansTripsMerged.columns if str(col).endswith('_drop')])

Wall time: 53.1 s


In [123]:
# Halve PlanElementIndex in place.
# NOTE(review): running this cell again halves the values again - run it exactly once.
planElementPos = plansTripsMerged['PlanElementIndex']
plansTripsMerged['PlanElementIndex'] = planElementPos.div(2)

In [114]:
# Concatenate every "*raw.csv" trip_mode_choice file into one frame.
# NOTE(review): absolute, user-specific local path - consider a configurable data directory.
path = "C:/Users/nazanin/Downloads/trip_mode_choice/trip_mode_choice/"
# glob returns matches in arbitrary order; sorting makes the concatenation order reproducible
all_files = sorted(glob.glob(path + "*raw.csv"))
if not all_files:
    # fail with a clear message instead of pd.concat's generic "No objects to concatenate"
    raise FileNotFoundError(f"No '*raw.csv' files found under {path}")
li2 = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]
SFmode_choice_raw = pd.concat(li2, axis=0, ignore_index=True)

In [36]:
# Concatenate every "*utilities.csv" trip_mode_choice file into one frame.
# NOTE(review): absolute, user-specific local path - consider a configurable data directory.
path = "C:/Users/nazanin/Downloads/trip_mode_choice/trip_mode_choice/"
# glob returns matches in arbitrary order; sorting makes the concatenation order reproducible
all_files = sorted(glob.glob(path + "*utilities.csv"))
if not all_files:
    # fail with a clear message instead of pd.concat's generic "No objects to concatenate"
    raise FileNotFoundError(f"No '*utilities.csv' files found under {path}")
li2 = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]
SFmode_choice_utilities = pd.concat(li2, axis=0, ignore_index=True)

In [37]:
# Merge the mode-choice utilities onto the raw mode-choice table (left join on trip_id)
SFmode_choice_raw = SFmode_choice_raw.sort_values(by=['trip_id'])
SFmode_choice_utilities = SFmode_choice_utilities.sort_values(by=['trip_id'])
rawUtil = pd.merge(left=SFmode_choice_raw, right=SFmode_choice_utilities, how='left', on='trip_id', suffixes=('', '_drop'))
# Columns present on both sides come back from the right side with a '_drop'
# suffix; remove exactly those. Matching on the suffix (rather than on "drop"
# anywhere in the name) keeps genuine columns whose name merely contains "drop".
rawUtil = rawUtil.drop(columns=[col for col in rawUtil.columns if str(col).endswith('_drop')])

In [125]:
%%time
# Merge the mode-choice raw/utilities table onto the plans/trips/tours/households/
# persons table (left join on trip_id)
plansTripsMerged = plansTripsMerged.sort_values(by=['trip_id'])
rawUtil = rawUtil.sort_values(by=['trip_id'])
SFActMerged= pd.merge(left=plansTripsMerged, right=rawUtil, how='left', on=['trip_id'], suffixes=('', '_drop'))
# Columns present on both sides come back from the right side with a '_drop'
# suffix; remove exactly those. Matching on the suffix (rather than on "drop"
# anywhere in the name) keeps genuine columns whose name merely contains "drop".
SFActMerged = SFActMerged.drop(columns=[col for col in SFActMerged.columns if str(col).endswith('_drop')])

Wall time: 1min 26s


In [127]:
# Take the first 1000 rows as a small sample and save it to CSV without the index
SFActMerged_chunk = SFActMerged.iloc[:1000]
SFActMerged_chunk.to_csv(path_or_buf='C:/Shared-Work/Data/CleanData/SFActMerged_chunk.csv', index=False)

#### Merging

In [126]:
# Load the previously saved sample of the aggregated BEAM trips.
# NOTE(review): this file is not written anywhere in the visible notebook -
# confirm where it is produced.
eventsSFtrips_chunk = pd.read_csv(filepath_or_buffer="C:/Shared-Work/Data/CleanData/Chunks/eventsSFtrips_chunk.csv")

In [134]:
%%time
# Merge the ActivitySim sample onto the BEAM trip sample: left join matching
# (IDMerged, tripIndex) on the BEAM side to (person_id, PlanElementIndex) on the
# ActivitySim side
eventsSFtrips_chunk = eventsSFtrips_chunk.sort_values(by=['IDMerged'])
SFActMerged_chunk = SFActMerged_chunk.sort_values(by=['person_id'])
SFBeamActMerged_chunk = pd.merge(eventsSFtrips_chunk, SFActMerged_chunk, how='left', left_on = ["IDMerged", 'tripIndex'] , right_on=['person_id', 'PlanElementIndex'], suffixes=('', '_drop'))
# Columns present on both sides come back from the right side with a '_drop'
# suffix; remove exactly those. Matching on the suffix (rather than on "drop"
# anywhere in the name) keeps genuine columns whose name merely contains "drop".
SFBeamActMerged_chunk = SFBeamActMerged_chunk.drop(columns=[col for col in SFBeamActMerged_chunk.columns if str(col).endswith('_drop')])

Wall time: 88.8 ms


In [136]:
# Save the merged BEAM/ActivitySim sample to CSV without the index
SFBeamActMerged_chunk.to_csv(path_or_buf='C:/Shared-Work/Data/CleanData/SFBeamActMerged_chunk.csv', index=False)

In [67]:
%%time
# Merge the full ActivitySim table onto the full BEAM trip table.
# The join keys now match the sample merge: (IDMerged, tripIndex) on the BEAM side
# to (person_id, PlanElementIndex) on the ActivitySim side. Joining on the person
# id alone paired every BEAM trip of a person with every ActivitySim trip of that
# person (a many-to-many join).
eventsSFtrips = eventsSFtrips.sort_values(by=['IDMerged'])
SFActMerged = SFActMerged.sort_values(by=['person_id'])
SFBeamActMerged = pd.merge(eventsSFtrips, SFActMerged, how='left', left_on=['IDMerged', 'tripIndex'], right_on=['person_id', 'PlanElementIndex'], suffixes=('', '_drop'))
# Columns present on both sides come back from the right side with a '_drop'
# suffix; remove exactly those. Matching on the suffix (rather than on "drop"
# anywhere in the name) keeps genuine columns whose name merely contains "drop".
SFBeamActMerged = SFBeamActMerged.drop(columns=[col for col in SFBeamActMerged.columns if str(col).endswith('_drop')])

Wall time: 15min 55s
