In [1]:
import pandas as pd
import numpy as np
from dask import dataframe as dd

In [2]:
# Base URLs (S3, iteration folder it.0) that the input files are read from.
# NOTE(review): among the cells visible here only locSF_rh is used; `loc` looks unused — confirm before removing.
loc = "https://beam-outputs.s3.amazonaws.com/pilates-outputs/sfbay-2010-base-20200206/beam/sfbay-pilates-base__2022-02-06_07-26-58_kwy/ITERS/it.0/"
locSF_rh = "https://beam-outputs.s3.amazonaws.com/pilates-outputs/sfbay-2010-lessrh-20220223/beam/year-2010-iteration-2/ITERS/it.0/"

In [3]:
%%time
# Column dtypes used when reading the events file.
# Low-cardinality text columns are read as categoricals to save memory;
# the id-like columns (person, driver, riders) stay plain objects.
# Each key appears exactly once (the literal previously listed "legMode" twice).
dtypes = {
    "time": "float32",
    "type": "category",
    "legMode": "category",
    "actType": "category",
    "reason": "category",
    "primaryFuelLevel": "float64",
    "chargingPointType": "category",
    "pricingModel": "category",
    "parkingType": "category",
    "mode": "category",
    "personalVehicleAvailable": "category",
    "person": "object",
    "driver": "object",
    "riders": "object",
}

# Read the gzip-compressed events CSV found under locSF_rh, applying the dtypes above.
# low_memory=False makes pandas infer the dtype of every column not listed in
# `dtypes` from the whole file rather than chunk by chunk, which prevents
# mixed-type columns (and the DtypeWarning pandas emits for them).
eventsSF = pd.read_csv(
    locSF_rh + '0.events.csv.gz',
    compression='gzip',
    dtype=dtypes,
    low_memory=False,
)

  exec(code, glob, local_ns)


Wall time: 2min 54s


In [4]:
# Lift pandas' display limits so frames are shown with every column and every row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [5]:
# Relabel the "mode" column as "modeType" (done on the column index, in place).
eventsSF.columns = ["modeType" if label == "mode" else label for label in eventsSF.columns]

In [6]:
# Normalise the activity label "Work" to "work" in the actType column.
# The result is assigned back to the frame: calling replace(..., inplace=True) on
# the Series returned by eventsSF["actType"] is a chained operation that is not
# guaranteed to update eventsSF (it does not under pandas copy-on-write).
# actType is read as a categorical, so the replacement goes through object dtype;
# that also works when both spellings already exist as categories.
eventsSF["actType"] = (
    eventsSF["actType"]
    .astype(object)
    .replace({"Work": "work"})
    .astype("category")
)

In [7]:
# Helper columns from which IDMerged is assembled:
#   UniqueID - copy of the person column
#   personID - person value when it also occurs in the driver column, else NaN
#   driverID - driver value when it also occurs in the person column, else NaN
personSeenAsDriver = eventsSF['person'].isin(eventsSF['driver'])
driverSeenAsPerson = eventsSF['driver'].isin(eventsSF['person'])

eventsSF['UniqueID'] = eventsSF['person']
eventsSF['personID'] = np.where(personSeenAsDriver, eventsSF['person'], np.nan)
eventsSF['driverID'] = np.where(driverSeenAsPerson, eventsSF['driver'], np.nan)

In [8]:
# IDMerged takes UniqueID where it is set, otherwise personID, otherwise driverID.
fallbackID = eventsSF['personID'].combine_first(eventsSF['driverID'])
eventsSF['IDMerged'] = eventsSF['UniqueID'].combine_first(fallbackID)

In [9]:
# The three helper columns are no longer needed once IDMerged exists.
eventsSF = eventsSF.drop(columns=['personID', 'driverID', 'UniqueID'])

In [10]:
%%time
# Break the ':'-separated riders string into one column per rider.
ridersText = eventsSF['riders']
dfRiders = ridersText.str.split(pat=':', expand=True)

Wall time: 5min 42s


In [11]:
# Position at which the rider columns will start once they are appended to eventsSF.
riderStartIndex = eventsSF.shape[1]

In [12]:
# Append the per-rider columns to the right of the events frame, aligned on its index.
ridersAligned = dfRiders.reindex(index=eventsSF.index)
eventsSF = pd.concat([eventsSF, ridersAligned], axis="columns")

In [13]:
%%time
# Build an 'ID' column holding, for each event, the first rider value (scanning
# the rider columns left to right) that also occurs in IDMerged.
#
# The rider columns are picked by name rather than with columns[riderStartIndex:-2].
# That slice was evaluated before 'riderID' existed, so on a fresh run "-2" removed
# 'ID' and the last rider column, which was then never scanned. Excluding the two
# helper names also keeps the cell correct when it is re-run.
NaN = np.nan
riderCols = [
    col for col in eventsSF.columns[riderStartIndex:]
    if col not in ('ID', 'riderID')
]
knownIDs = eventsSF['IDMerged'].unique()  # loop-invariant lookup values

eventsSF['ID'] = NaN
for col in riderCols:
    eventsSF['riderID'] = np.where(eventsSF[col].isin(knownIDs), eventsSF[col], np.nan)
    eventsSF['ID'] = eventsSF['ID'].combine_first(eventsSF['riderID'])

Wall time: 5min 55s


In [14]:
# Rows still lacking an IDMerged value take the rider-derived ID.
mergedWithRiders = eventsSF['IDMerged'].combine_first(eventsSF['ID'])
eventsSF['IDMerged'] = mergedWithRiders

In [15]:
# Remove every column from riderStartIndex onwards (the rider columns and their helpers).
riderHelperCols = list(eventsSF.columns[riderStartIndex:])
eventsSF = eventsSF.drop(riderHelperCols, axis=1)

In [16]:
# Move the 'IDMerged' column to the first position:
# take it out of the frame, then re-insert it at index 0.
first_column = eventsSF.pop('IDMerged')
eventsSF.insert(loc=0, column='IDMerged', value=first_column)

In [17]:
# Events that ended up without any merged id; show how many there are.
missingID = eventsSF['IDMerged'].isna()
IDnan = eventsSF.loc[missingID]
IDnan.shape

(4690, 58)

In [115]:
%%time
# Keep only the events whose IDMerged can be parsed as a number, ordered by
# IDMerged and then time, with a fresh 0..n-1 index.
# NOTE(review): IDMerged is only converted to a numeric dtype in a later cell, so
# this sort presumably compares the unconverted values — confirm that is intended.
hasNumericID = pd.to_numeric(eventsSF['IDMerged'], errors='coerce').notna()
eventsSFnum = (
    eventsSF[hasNumericID]
    .sort_values(by=['IDMerged', 'time'])
    .reset_index(drop=True)
)

Wall time: 2min 30s


In [246]:
# Event time on 'actend' rows, NaN on every other row.
isActEnd = eventsSFnum['type'] == 'actend'
eventsSFnum['actEndTime'] = np.where(isActEnd, eventsSFnum['time'], np.nan)

In [245]:
# Event time on 'actstart' rows, NaN on every other row.
isActStart = eventsSFnum['type'] == 'actstart'
eventsSFnum['actStartTime'] = np.where(isActStart, eventsSFnum['time'], np.nan)

In [118]:
# Duration of each 'PathTraversal' event (arrivalTime - departureTime); NaN elsewhere.
isPathTraversal = eventsSFnum['type'] == 'PathTraversal'
traversalTime = eventsSFnum['arrivalTime'] - eventsSFnum['departureTime']
eventsSFnum['PTduration'] = np.where(isPathTraversal, traversalTime, np.nan)

In [228]:
# PTduration of rows whose modeType is 'walk'; NaN elsewhere.
isWalk = eventsSFnum['modeType'] == 'walk'
eventsSFnum['duration_walking'] = np.where(isWalk, eventsSFnum['PTduration'], np.nan)

In [229]:
# PTduration of rows travelled in one of the private-car modes listed below; NaN elsewhere.
privateCarModes = ['car', 'car_hov3', 'car_hov2', 'hov2_teleportation', 'hov3_teleportation']
isPrivateCar = eventsSFnum['modeType'].isin(privateCarModes)
eventsSFnum['duration_in_privateCar'] = np.where(isPrivateCar, eventsSFnum['PTduration'], np.nan)

In [230]:
# PTduration of rows whose modeType is 'bike'; NaN elsewhere.
isBike = eventsSFnum['modeType'] == 'bike'
eventsSFnum['duration_on_bike'] = np.where(isBike, eventsSFnum['PTduration'], np.nan)

In [231]:
# PTduration of rows whose modeType is a ride-hail mode (solo or pooled); NaN elsewhere.
rideHailModes = ['ride_hail', 'ride_hail_pooled']
isRideHail = eventsSFnum['modeType'].isin(rideHailModes)
eventsSFnum['duration_in_ridehail'] = np.where(isRideHail, eventsSFnum['PTduration'], np.nan)

In [232]:
# PTduration of rows whose modeType is one of the *_transit modes; NaN elsewhere.
transitModes = ['bike_transit', 'drive_transit', 'walk_transit']
isTransit = eventsSFnum['modeType'].isin(transitModes)
eventsSFnum['duration_in_public'] = np.where(isTransit, eventsSFnum['PTduration'], np.nan)

In [233]:
# List the distinct modeType values present in the data.
modeValues = eventsSFnum['modeType'].unique().tolist()
print(modeValues)

['car', nan, 'walk', 'car_hov2', 'ride_hail', 'hov3_teleportation', 'walk_transit', 'car_hov3', 'hov2_teleportation', 'bike', 'drive_transit', 'bike_transit', 'ride_hail_pooled']


In [192]:
# Number the rows that carry a tourIndex within each IDMerged group
# (rank by tourIndex, ties broken by row order); other rows get NaN.
eventsSFnum["tripIndex"] = eventsSFnum.groupby("IDMerged")["tourIndex"].rank(method="first", ascending=True)
# Carry each trip number forward onto the rows that follow it.
# - ffill() replaces the deprecated fillna(method='ffill').
# - The fill is done per IDMerged so that the last trip number of one person
#   cannot leak into the leading rows of the next person.
eventsSFnum["tripIndex"] = eventsSFnum.groupby("IDMerged")["tripIndex"].ffill()

In [193]:
# Carry currentTourMode forward onto the rows that follow it.
# - ffill() replaces the deprecated fillna(method='ffill').
# - The fill is done per IDMerged so that one person's tour mode cannot leak
#   into the leading rows of the next person.
eventsSFnum["currentTourModeFillna"] = eventsSFnum.groupby("IDMerged")["currentTourMode"].ffill()

In [194]:
# Convert IDMerged to a numeric dtype (only numeric-looking ids were kept in eventsSFnum).
numericIDs = pd.to_numeric(eventsSFnum["IDMerged"])
eventsSFnum["IDMerged"] = numericIDs

In [160]:
# Activity type on 'actend' rows, empty string on every other row.
endsActivity = eventsSFnum['type'] == 'actend'
eventsSFnum['actEndType'] = np.where(endsActivity, eventsSFnum['actType'], "")

In [161]:
# Activity type on 'actstart' rows, empty string on every other row.
startsActivity = eventsSFnum['type'] == 'actstart'
eventsSFnum['actStartType'] = np.where(startsActivity, eventsSFnum['actType'], "")

In [162]:
# Recompute currentTourModeFillna (this repeats an earlier cell of the notebook).
# - ffill() replaces the deprecated fillna(method='ffill').
# - The fill is done per IDMerged so that one person's tour mode cannot leak
#   into the leading rows of the next person.
eventsSFnum["currentTourModeFillna"] = eventsSFnum.groupby("IDMerged")["currentTourMode"].ffill()

In [110]:
#SF_chunk = eventsSFnum[0:500]
#SF_chunk.to_csv('C:/Shared-Work/Data/CleanData/SF_chunk.csv', index = False)

In [247]:
# Collapse the event rows into one row per (IDMerged, tripIndex, tour mode),
# summing every listed column within the group; for the two activity-type text
# columns the sum concatenates the strings.
# The aggregation is given as the string "sum": passing np.sum is deprecated in
# pandas, which currently maps it to the same group-wise sum.
summedColumns = [
    'actStartTime', 'actEndTime', 'PTduration', 'primaryFuel',
    'netCost', 'actStartType', 'actEndType', 'duration_walking',
    'duration_in_privateCar', 'duration_on_bike', 'duration_in_ridehail',
    'duration_in_public',
]
eventsSFtrips = pd.pivot_table(
    eventsSFnum,
    index=['IDMerged', 'tripIndex', 'currentTourModeFillna'],
    aggfunc={column: "sum" for column in summedColumns},
).reset_index()

In [248]:
# DDtime: summed activity-start time minus summed activity-end time of the trip row.
eventsSFtrips['DDtime'] = eventsSFtrips['actStartTime'].sub(eventsSFtrips['actEndTime'])

In [None]:
######?????
#eventsSFtrips['duration_wait'] = eventsSFtrips['DDtime'] - eventsSFtrips['PTduration'] 
######?????

In [249]:
# Label each trip row as "<actEndType>_to_<actStartType>".
endedActivity = eventsSFtrips['actEndType']
startedActivity = eventsSFtrips['actStartType']
eventsSFtrips['actPurpose'] = endedActivity + "_to_" + startedActivity

In [250]:
# Give the forward-filled mode column its final name.
eventsSFtrips = eventsSFtrips.rename({'currentTourModeFillna': 'currentTourMode'}, axis='columns')

In [251]:
# Preview the first ten trip rows.
eventsSFtrips.iloc[:10]

Unnamed: 0,IDMerged,tripIndex,currentTourMode,PTduration,actEndTime,actEndType,actStartTime,actStartType,duration_in_privateCar,duration_in_public,duration_in_ridehail,duration_on_bike,duration_walking,netCost,primaryFuel,DDtime,actPurpose
0,4,1.0,car,486.0,54694.0,Home,55180.0,othdiscr,486.0,0.0,0.0,0.0,0.0,1.258902,17410780.0,486.0,Home_to_othdiscr
1,4,2.0,car,640.0,60062.0,othdiscr,60702.0,Home,508.0,0.0,0.0,0.0,132.0,1.292657,17837410.0,640.0,othdiscr_to_Home
2,8,1.0,car,11479.0,60454.0,Home,71933.0,shopping,0.0,0.0,0.0,0.0,11479.0,0.0,0.0,11479.0,Home_to_shopping
3,8,2.0,car,11848.0,71933.0,shopping,83781.0,Home,31.0,0.0,0.0,0.0,11817.0,0.052631,649895.2,11848.0,shopping_to_Home
4,66,1.0,car,505.0,64965.0,Home,65470.0,shopping,505.0,0.0,0.0,0.0,0.0,1.210568,17854620.0,505.0,Home_to_shopping
5,66,2.0,car,535.0,65901.0,shopping,66436.0,eatout,531.0,0.0,0.0,0.0,4.0,1.213589,18632770.0,535.0,shopping_to_eatout
6,66,3.0,car,709.0,67863.0,eatout,68572.0,shopping,531.0,0.0,0.0,0.0,178.0,1.358076,18565600.0,709.0,eatout_to_shopping
7,66,4.0,car,219.0,68572.0,shopping,68791.0,Home,219.0,0.0,0.0,0.0,0.0,0.282286,4517600.0,219.0,shopping_to_Home
8,113,1.0,walk_transit,85.0,23842.0,Home,27300.0,work,0.0,0.0,0.0,0.0,85.0,0.0,5850.882,3458.0,Home_to_work
9,113,2.0,walk_transit,39.0,59050.0,work,60559.0,Home,0.0,0.0,0.0,0.0,39.0,0.0,2708.247,1509.0,work_to_Home


In [252]:
# Export the first 1000 trip rows as a CSV sample (row index not written).
eventsSFtrips_chunk = eventsSFtrips.iloc[:1000]
eventsSFtrips_chunk.to_csv(
    'C:/Shared-Work/Data/CleanData/eventsSFtrips_chunk.csv',
    index=False,
)

In [None]:
# Show the trip rows whose tour mode is "bike_transit".
eventsSFtrips.loc[eventsSFtrips['currentTourMode'].eq("bike_transit")]

In [145]:
# All events of the agent whose IDMerged is 8.
s8 = eventsSFnum.loc[eventsSFnum['IDMerged'].eq(8)]

In [227]:
# Select the events of the agent whose IDMerged is 1840884 and export them to CSV.
# The selection is made here because the cell that defines `sfcomplex` sits below
# this one, so a top-to-bottom run would otherwise stop with a NameError.
sfcomplex = eventsSFnum[eventsSFnum['IDMerged']==1840884]
sfcomplex.to_csv('C:/Shared-Work/Data/CleanData/sfcomplex.csv', index = False)

In [226]:
# All events of the agent whose IDMerged is 1840884.
sfcomplex = eventsSFnum.loc[eventsSFnum['IDMerged'].eq(1840884)]