In [1]:
import pandas as pd
import numpy as np
from dask import dataframe as dd

In [2]:
#File location S3
# Base URLs of two output folders (ITERS/it.0) hosted in the beam-outputs S3 bucket.
# `loc` is not referenced by any of the cells shown in this part of the notebook.
loc = "https://beam-outputs.s3.amazonaws.com/pilates-outputs/sfbay-2010-base-20200206/beam/sfbay-pilates-base__2022-02-06_07-26-58_kwy/ITERS/it.0/"
# `locSF_rh` is the folder the events file ('0.events.csv.gz') is read from below.
# NOTE(review): "lessrh" presumably denotes a reduced ride-hail scenario — confirm.
locSF_rh = "https://beam-outputs.s3.amazonaws.com/pilates-outputs/sfbay-2010-lessrh-20220223/beam/year-2010-iteration-2/ITERS/it.0/"

In [3]:
%%time
# Read the events file into eventsSF.
# Columns listed here are parsed with an explicit dtype; text columns are loaded
# as pandas categoricals. Every column appears exactly once: the previous dict
# repeated "legMode", which Python silently collapses into a single entry.
dtypes = {
    "time": "float32",
    "type": "category",
    "legMode": "category",
    "actType": "category",
    "reason": "category",
    "primaryFuelLevel": "float64",
    "chargingPointType": "category",
    "pricingModel": "category",
    "parkingType": "category",
    "mode": "category",
    "personalVehicleAvailable": "category",
}

# No `usecols` filter is passed, so every column of the CSV is loaded.
eventsSF = pd.read_csv(locSF_rh + '0.events.csv.gz', compression='gzip', dtype=dtypes)

  exec(code, glob, local_ns)


Wall time: 2min 53s


In [4]:
# Remove pandas' display limits so frames are rendered with every column and row
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [5]:
# The "mode" column is renamed to "modeType"; all other columns keep their names
columnRenames = {"mode": "modeType"}
eventsSF = eventsSF.rename(columns=columnRenames)

In [6]:
# Normalise the activity label "Work" to "work" in the actType column.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the eventsSF["actType"] selection is chained assignment, which pandas warns
# about and which leaves eventsSF itself unchanged under Copy-on-Write.
eventsSF["actType"] = eventsSF["actType"].replace({"Work": "work"})

In [7]:
# Helper columns that the next cells combine into IDMerged:
#   UniqueID - copy of the person column
#   personID - person, kept only where that value also occurs in the driver column
#   driverID - driver, kept only where that value also occurs in the person column
eventsSF['UniqueID'] = eventsSF['person']
for idCol, sourceCol, otherCol in (('personID', 'person', 'driver'),
                                   ('driverID', 'driver', 'person')):
    sourceValues = eventsSF[sourceCol]
    seenInOther = sourceValues.isin(eventsSF[otherCol])
    eventsSF[idCol] = np.where(seenInOther, sourceValues, np.nan)

In [8]:
# IDMerged takes the first non-null value among UniqueID, personID and driverID,
# in that order of priority.
fallbackID = eventsSF['personID'].combine_first(eventsSF['driverID'])
eventsSF['IDMerged'] = eventsSF['UniqueID'].combine_first(fallbackID)

In [9]:
# The helper columns are no longer needed once IDMerged exists
eventsSF = eventsSF.drop(columns=['personID', 'driverID', 'UniqueID'])

In [10]:
%%time
# Split the ':'-separated riders string into one column per rider
ridersText = eventsSF['riders']
dfRiders = ridersText.str.split(pat=':', expand=True)

Wall time: 5min 39s


In [11]:
# Column position at which the rider columns will start once they are appended
riderStartIndex = eventsSF.shape[1]

In [12]:
# Append the rider columns to the right of the events frame, aligned on its index
ridersAligned = dfRiders.reindex(eventsSF.index)
eventsSF = pd.concat([eventsSF, ridersAligned], axis="columns")

In [13]:
%%time
# For every event row, store in 'ID' the first rider (left to right across the
# rider columns) whose value also occurs in the IDMerged column.
NaN = np.nan  # alias retained from the original cell

# Pick the rider columns by excluding the two helper columns by name. The old
# slice `columns[riderStartIndex:-2]` was evaluated once, after 'ID' had been
# appended but before 'riderID' existed, so on a fresh kernel it skipped the
# last rider column and was only correct when the cell was re-run.
riderCols = [c for c in eventsSF.columns[riderStartIndex:] if c not in ('ID', 'riderID')]

eventsSF['ID'] = NaN

for col in riderCols:
    # Keep the rider value only where it is a known IDMerged value
    eventsSF['riderID'] = np.where(eventsSF[col].isin(eventsSF["IDMerged"]), eventsSF[col], np.nan)
    # combine_first keeps an already-found ID and fills the remaining gaps
    eventsSF['ID'] = eventsSF['ID'].combine_first(eventsSF['riderID'])

Wall time: 6min 15s


In [14]:
# Rows still lacking an IDMerged value fall back to the rider-derived ID
mergedID = eventsSF['IDMerged']
riderDerivedID = eventsSF['ID']
eventsSF['IDMerged'] = mergedID.combine_first(riderDerivedID)

In [15]:
# Drop every column from riderStartIndex onwards (the rider columns and helpers)
trailingCols = eventsSF.columns[riderStartIndex:]
eventsSF = eventsSF.drop(trailingCols, axis=1)

In [16]:
# Events for which no IDMerged value could be determined
missingID = eventsSF['IDMerged'].isnull()
IDnan = eventsSF.loc[missingID]
IDnan.head()

Unnamed: 0,modeType,currentTourMode,vehicleType,links,numPassengers,length,primaryFuel,secondaryFuelLevel,riders,toStopIndex,fromStopIndex,seatingCapacity,tollPaid,primaryFuelLevel,endY,endX,startY,startX,capacity,arrivalTime,departureTime,linkTravelTime,secondaryFuel,secondaryFuelType,primaryFuelType,driver,vehicle,time,type,price,shiftStatus,parkingTaz,chargingPointType,pricingModel,parkingType,locationY,locationX,parkingZoneId,fuel,duration,actType,person,reason,link,legMode,score,incentive,tollCost,netCost,facility,cost,expectedMaximumUtility,availableAlternatives,location,personalVehicleAvailable,tourIndex,departTime,IDMerged
180487,walk,car,BODY-TYPE-DEFAULT,,0.0,0.0,0.0,0.0,,,,0.0,0.0,2210000.0,37.690458,-122.115738,37.690458,-122.115738,0.0,18010.0,18010.0,,0.0,,Food,672274,body-672274,18010.0,PathTraversal,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
180492,,,,,,,,,,,,,,,,,,,,,,,,,,672274,34819,18010.0,LeavingParkingEvent,,,856.0,,FlatFee,Public,,,,,,,,,,,-0.0,,,,,,,,,,,,
180493,car,car,conv-L1-10000-to-25000-LowTech-2019,"120470,120256,120260,120264,120253,4609,62042,...",0.0,952.61,1295710.0,0.0,,,,3.0,0.0,3654684000.0,37.683603,-122.116171,37.693085,-122.118928,3.0,18074.0,18010.0,"25.087,5.634,17.556,10.411,0.309,6.204,1.572,2...",0.0,,Gasoline,672274,34819,18074.0,PathTraversal,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
180583,walk,car,BODY-TYPE-DEFAULT,,0.0,0.0,0.0,0.0,,,,0.0,0.0,2210000.0,37.719204,-121.940466,37.719204,-121.940466,0.0,18003.0,18003.0,,0.0,,Food,1257491,body-1257491,18003.0,PathTraversal,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
180586,walk,car,BODY-TYPE-DEFAULT,,0.0,0.0,0.0,0.0,,,,0.0,0.0,2210000.0,38.007934,-121.861463,38.007934,-121.861463,0.0,18007.0,18007.0,,0.0,,Food,2327723,body-2327723,18007.0,PathTraversal,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [17]:
# Keep only the events whose IDMerged can be parsed as a number, ordered by
# IDMerged and then time, with a fresh 0..n-1 index.
idIsNumeric = pd.to_numeric(eventsSF['IDMerged'], errors='coerce').notna()
eventsSFnum = (
    eventsSF[idIsNumeric]
    .sort_values(by=['IDMerged', 'time'])
    .reset_index(drop=True)
)

In [18]:
# actEndT: the event time on 'actend' rows, NaN on every other row
isActEnd = eventsSFnum['type'] == 'actend'
eventsSFnum['actEndT'] = np.where(isActEnd, eventsSFnum['time'], np.nan)

In [19]:
# actStartT: the event time on 'actstart' rows, NaN on every other row
isActStart = eventsSFnum['type'] == 'actstart'
eventsSFnum['actStartT'] = np.where(isActStart, eventsSFnum['time'], np.nan)

In [20]:
# PTduration: arrivalTime minus departureTime on 'PathTraversal' rows, NaN elsewhere
isPathTraversal = eventsSFnum['type'] == 'PathTraversal'
traversalSpan = eventsSFnum['arrivalTime'] - eventsSFnum['departureTime']
eventsSFnum['PTduration'] = np.where(isPathTraversal, traversalSpan, np.nan)

In [21]:
# Number the rows that carry a tourIndex value 1, 2, 3, ... within each IDMerged
# (rank with method="first" breaks ties by order of appearance); rows without a
# tourIndex get NaN here.
eventsSFnum["tripIndex"] = eventsSFnum.groupby("IDMerged")["tourIndex"].rank(method="first", ascending=True)
# Propagate each trip number down to the following rows of the SAME person.
# The forward-fill is done per IDMerged so that rows preceding a person's first
# numbered row stay NaN instead of inheriting the previous person's last trip
# number. `.ffill()` also replaces the deprecated `fillna(method='ffill')`.
eventsSFnum["tripIndex"] = eventsSFnum.groupby("IDMerged")["tripIndex"].ffill()

In [22]:
# Carry the last known currentTourMode forward onto the following rows of the
# same person. Filling per IDMerged keeps one person's mode from leaking into
# the first rows of the next person; `.ffill()` replaces the deprecated
# `fillna(method='ffill')`.
eventsSFnum["currentTourModeFillna"] = eventsSFnum.groupby("IDMerged")["currentTourMode"].ffill()

In [23]:
# Preview the first 200 rows of the per-person event table
eventsSFnum.iloc[:200]

Unnamed: 0,modeType,currentTourMode,vehicleType,links,numPassengers,length,primaryFuel,secondaryFuelLevel,riders,toStopIndex,fromStopIndex,seatingCapacity,tollPaid,primaryFuelLevel,endY,endX,startY,startX,capacity,arrivalTime,departureTime,linkTravelTime,secondaryFuel,secondaryFuelType,primaryFuelType,driver,vehicle,time,type,price,shiftStatus,parkingTaz,chargingPointType,pricingModel,parkingType,locationY,locationX,parkingZoneId,fuel,duration,actType,person,reason,link,legMode,score,incentive,tollCost,netCost,facility,cost,expectedMaximumUtility,availableAlternatives,location,personalVehicleAvailable,tourIndex,departTime,IDMerged,actEndT,actStartT,PTduration,tripIndex,currentTourModeFillna
0,car,car,,,,11478.017,,,,,,,,,,,,,,,,,,,,,,54694.0,ModeChoice,,,,,,,,,,,,,4.0,,,,,,,,,,,CAR,62008.0,True,1.0,,4.0,,,,1.0,car
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,54694.0,actend,,,,,,,,,,,,Home,4.0,,62008.0,,,,,,,,,,,,,,4.0,54694.0,,,1.0,car
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,54694.0,departure,,,,,,,,,,,,,4.0,,62008.0,car,,,,,,,,,,,,,4.0,,,,1.0,car
3,,,,,,,,,,,,,,,,,,,,,,,,,,,body-4,54694.0,PersonEntersVehicle,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,4.0,,,,1.0,car
4,,,,,,,,,,,,,,,,,,,,,,,,,,,259819,54694.0,PersonEntersVehicle,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,4.0,,,,1.0,car
5,car,,,,,,,,,,,,,,,,,,,,,,,,,,,55180.0,PersonCost,,,,,,,,,,,,,4.0,,,,,0.0,0.0,1.258902,,,,,,,,,4.0,,,,1.0,car
6,,,,,,,,,,,,,,,,,,,,,,,,,,,259819,55180.0,PersonLeavesVehicle,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,4.0,,,,1.0,car
7,,,,,,,,,,,,,,,,,,,,,,,,,,,body-4,55180.0,PersonLeavesVehicle,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,4.0,,,,1.0,car
8,,,,,,,,,,,,,,,,,,,,,,,,,,,,55180.0,arrival,,,,,,,,,,,,,4.0,,113226.0,car,,,,,,,,,,,,,4.0,,,,1.0,car
9,,,,,,,,,,,,,,,,,,,,,,,,,,,,55180.0,actstart,,,,,,,,,,,,othdiscr,4.0,,113226.0,,,,,,,,,,,,,,4.0,,55180.0,,1.0,car


In [24]:
# Aggregate the events to one row per (IDMerged, tripIndex, tour mode), summing
# the listed columns. The aggregation is given as the string "sum": passing the
# callable np.sum is deprecated by pandas for aggregation and "sum" is its
# documented replacement with the same result.
tripAggregations = {
    'actStartT': 'sum',
    'actEndT': 'sum',
    'PTduration': 'sum',
    'primaryFuel': 'sum',
    'netCost': 'sum',
}
eventsSFtrips = pd.pivot_table(
    eventsSFnum,
    index=['IDMerged', 'tripIndex', 'currentTourModeFillna'],
    aggfunc=tripAggregations,
).reset_index()

In [25]:
# DDtime: summed activity-start time minus summed activity-end time of the trip
eventsSFtrips['DDtime'] = eventsSFtrips['actStartT'].sub(eventsSFtrips['actEndT'])

In [26]:
# Give the forward-filled mode column its plain name in the trips table
eventsSFtrips = eventsSFtrips.rename({'currentTourModeFillna': 'currentTourMode'}, axis='columns')

In [27]:
# Preview the first 20 trips
eventsSFtrips.iloc[:20]

Unnamed: 0,IDMerged,tripIndex,currentTourMode,PTduration,actEndT,actStartT,netCost,primaryFuel,DDtime
0,4.0,1.0,car,0.0,54694.0,55180.0,1.258902,0.0,486.0
1,4.0,2.0,car,0.0,60062.0,60702.0,1.292657,0.0,640.0
2,8.0,1.0,car,0.0,60454.0,71933.0,0.0,0.0,11479.0
3,8.0,2.0,car,0.0,71933.0,83781.0,0.052631,0.0,11848.0
4,66.0,1.0,car,0.0,64965.0,65470.0,1.210568,0.0,505.0
5,66.0,2.0,car,0.0,65901.0,66436.0,1.213589,0.0,535.0
6,66.0,3.0,car,0.0,67863.0,68572.0,1.358076,0.0,709.0
7,66.0,4.0,car,0.0,68572.0,68791.0,0.282286,0.0,219.0
8,113.0,1.0,walk_transit,0.0,23842.0,27300.0,0.0,0.0,3458.0
9,113.0,2.0,walk_transit,5.0,59050.0,60559.0,0.0,367.661,1509.0


In [28]:
# Positional row slice 1..9999 of the events frame `eventsSF`.
# NOTE(review): the variable is named after `eventsSFtrips` but the slice is taken
# from `eventsSF`, and it starts at 1, so row 0 is left out — confirm both are intended.
eventsSFtrips_chunk = eventsSF[1:10000]

In [29]:
#eventsSFtrips_chunk.to_csv('C:/Shared-Work/Data/CleanData/SFpersonRidehail.csv', index = False)

In [30]:
# Show every event of person 1000104.
# IDMerged holds numeric values in eventsSFnum (rendered as 4.0, 8.0, ...), so a
# comparison with the string '1000104' never matches and returns an empty frame.
# Compare numerically instead; to_numeric also covers IDs stored as digit strings.
personOfInterest = 1000104
eventsSFnum[pd.to_numeric(eventsSFnum['IDMerged'], errors='coerce') == personOfInterest]

Unnamed: 0,modeType,currentTourMode,vehicleType,links,numPassengers,length,primaryFuel,secondaryFuelLevel,riders,toStopIndex,fromStopIndex,seatingCapacity,tollPaid,primaryFuelLevel,endY,endX,startY,startX,capacity,arrivalTime,departureTime,linkTravelTime,secondaryFuel,secondaryFuelType,primaryFuelType,driver,vehicle,time,type,price,shiftStatus,parkingTaz,chargingPointType,pricingModel,parkingType,locationY,locationX,parkingZoneId,fuel,duration,actType,person,reason,link,legMode,score,incentive,tollCost,netCost,facility,cost,expectedMaximumUtility,availableAlternatives,location,personalVehicleAvailable,tourIndex,departTime,IDMerged,actEndT,actStartT,PTduration,tripIndex,currentTourModeFillna
