In [1]:
import os
import os.path as op
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import seaborn as sns
# Plot styling preferences. A single call is enough: the earlier
# set_theme(style="darkgrid", ...) was fully overridden by the sns.set(...) call
# that followed it (sns.set is an alias of set_theme and re-applies every setting).
sns.set_theme(style="ticks", font_scale=1.35, color_codes=True)
import statsmodels.api as sm
from scipy import stats
import math
from math import pi
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from scipy.spatial.distance import cdist
from scipy.cluster.vq import kmeans2,vq, whiten

In [2]:
# Lift pandas' display limits so wide/long frames are rendered without truncation.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

#### Baseline

In [3]:
%%time
# Location of the cleaned San Francisco 2018 baseline export.
baseline2018 = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/Baseline/"
# Read the baseline utilities table straight from the bucket URL.
sfbase = pd.read_csv(f"{baseline2018}sf_2018_base_utilities.csv")



CPU times: total: 1min 9s
Wall time: 6min 16s


#### Transit and Stacking Process

In [None]:
%%time
# Folder holding the transit-frequency scenario exports.
transit_fr_05_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/TransitHead_Frequencies/"
# Scenario file with the "0p5" suffix.
sf_transit_fr_05 = pd.read_csv(f"{transit_fr_05_path}sf_2018_tr_fr_0p5.csv")

In [None]:
%%time
# Folder holding the transit-frequency scenario exports.
transit_fr_1p5_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/TransitHead_Frequencies/"
# Scenario file with the "1p5" suffix.
sf_transit_fr_1p5 = pd.read_csv(f"{transit_fr_1p5_path}sf_2018_tr_fr_1p5.csv")

In [26]:
%%time
# Folder holding the transit-frequency scenario exports.
transit_fr_2_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/TransitHead_Frequencies/"
# Scenario file with the "2" suffix.
sf_transit_fr_2 = pd.read_csv(f"{transit_fr_2_path}sf_2018_tr_fr_2.csv")



CPU times: total: 4min 34s
Wall time: 11min 7s


In [13]:
# Baseline frame dimensions as (rows, columns).
sfbase.shape

(2568424, 266)

In [14]:
# Dimensions of the 0p5 scenario frame as (rows, columns).
# The recorded output shows 646 columns, i.e. the width before the positional
# column drop applied to this frame further down.
sf_transit_fr_05.shape

(2529285, 646)

In [47]:
# Dimensions of the 1p5 scenario frame as (rows, columns).
sf_transit_fr_1p5.shape

(2529440, 266)

In [48]:
# Dimensions of the "2" scenario frame as (rows, columns).
sf_transit_fr_2.shape

(2529265, 266)

In [44]:
#sf_transit_fr_2 = sf_transit_fr_2.drop(['23','24'], axis=1)

In [28]:
# Remove the surplus block of columns at positions 245-624 (380 columns) so the
# frame can be stacked with the others; what those columns hold is not shown
# here — presumably export artefacts, confirm against the raw file.
# The drop is positional, so it must only run on the untrimmed frame: re-running
# it on an already-trimmed 266-column frame would slice positions 245-265 and
# silently delete 21 real columns. Skip it unless the whole slice is in range.
if sf_transit_fr_2.shape[1] >= 625:
    sf_transit_fr_2 = sf_transit_fr_2.drop(columns=sf_transit_fr_2.columns[245:625])

In [49]:
# Remove the surplus block of columns at positions 245-624 (380 columns); the
# recorded outputs show this frame going from 646 to 266 columns.
# The drop is positional, so it must only run on the untrimmed frame: re-running
# it on the trimmed 266-column frame would slice positions 245-265 and silently
# delete 21 real columns. Skip it unless the whole slice is in range.
if sf_transit_fr_05.shape[1] >= 625:
    sf_transit_fr_05 = sf_transit_fr_05.drop(columns=sf_transit_fr_05.columns[245:625])

In [53]:
#sf_transit_fr_05 = sf_transit_fr_05.drop(['5','6','7','8','9','10','11'], axis=1)

In [55]:
# Re-check the 0p5 frame's dimensions; the recorded output shows 266 columns.
sf_transit_fr_05.shape

(2529285, 266)

In [121]:
# Stack the baseline and the three transit-frequency scenarios row-wise.
frames = [sfbase, sf_transit_fr_05, sf_transit_fr_1p5, sf_transit_fr_2]
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every input
# keeps its own row labels, so labels repeat across the four frames and any later
# label-based selection or index-aligned assignment becomes ambiguous.
sf_stacked = pd.concat(frames, ignore_index=True)

In [122]:
# First pass of column pruning: 40 columns removed from the stacked frame.
columns_to_drop_first_pass = [
    'has_school_kid', 'has_preschool_kid', 'has_retiree', 'has_non_worker',
    'adult', 'hispanic', 'relate',
    'pstudent', 'pemploy', 'ptype', 'p_hispanic', 'age_16_p', 'age_16_to_19',
    'work_zone_id',
    'race_id', 'school_zone_id', 'person_age', 'MAR', 'hours', 'hispanic.1',
    'student',
    'worker', 'stop_frequency', 'composition', 'tdd', 'number_of_participants',
    'tour_type_count',
    'tour_type_num', 'tour_num', 'tour_count', 'outbound', 'trip_num',
    'num_under16_not_at_school',
    'family', 'non_family', 'num_young_adults', 'num_college_age',
    'num_children_16_to_17',
    'num_children_5_to_15', 'num_young_children',
]
sf_stacked = sf_stacked.drop(columns=columns_to_drop_first_pass)

In [123]:
# Second pass of column pruning: 87 further columns removed from the stacked frame.
# The original list named 'is_gradeschool' twice; the duplicate entry is removed
# here (the set of dropped columns is unchanged).
columns_to_drop_second_pass = [
    'BlockGroupEnd', 'BlockGroupStart', 'vehicleIds_estimate', 'destination_x',
    'origin_x', 'depart',
    'origin_y', 'destination_y', 'parent_tour_id', 'atwork_subtour_frequency',
    'household_id_y',
    'TAZ_x', 'home_x', 'home_y', 'male', 'female', 'has_driving_kid',
    'has_full_time', 'has_part_time',
    'has_university', 'student_is_employed', 'nonstudent_to_school',
    'is_student', 'is_gradeschool',
    'is_highschool', 'is_university', 'school_segment', 'is_worker', 'home_taz',
    'school_taz', 'distance_to_school', 'roundtrip_auto_time_to_school',
    'workplace_taz',
    'distance_to_work', 'workplace_in_cbd', 'work_taz_area_type',
    'hh_race_of_head',
    'roundtrip_auto_time_to_work', 'work_auto_savings_ratio', 'cdap_activity',
    'travel_active',
    'under16_not_at_school', 'has_preschool_kid_at_home', 'block_id', 'gt2',
    'hispanic_head',
    'has_school_kid_at_home', 'work_and_school_and_worker', 'age_of_head',
    'race_of_head',
    'work_and_school_and_student', 'VEHICL', 'hh_children', 'hh_age_of_head',
    'num_workers',
    'gt55', 'seniors', 'recent_mover', 'hh_workers', 'hispanic_status_of_head',
    'hh_seniors',
    'hh_type', 'TAZ_y', 'HHT', 'sample_rate', 'chunk_id', 'income_segment',
    'num_non_workers',
    'num_drivers', 'num_adults', 'num_children', 'hh_work_auto_savings_ratio',
    'num_travel_active',
    'num_travel_active_adults', 'num_travel_active_preschoolers',
    'num_travel_active_children',
    'num_travel_active_non_preschoolers', 'participates_in_jtf_model',
    'joint_tour_frequency',
    'num_hh_joint_tours', 'SERIALNO', 'HINCP', 'JWMNP', 'JWRIP', 'JWTR', 'RAC1P',
    'RAC2P05',
]
sf_stacked = sf_stacked.drop(columns=columns_to_drop_second_pass)

In [124]:
# Remove the lower-case 'serialno' column as well.
sf_stacked = sf_stacked.drop(columns='serialno')

In [125]:
# Dimensions of the stacked frame after the column drops, as (rows, columns).
sf_stacked.shape

(10156414, 138)

In [126]:
# Create the 'lever' column with the same starting label on every row.
sf_stacked['lever'] = "default"

In [127]:
# Cast the label column to a categorical dtype.
# NOTE(review): the next cell rebuilds 'lever' with np.where, which returns a plain
# NumPy array, so this dtype presumably does not survive — confirm whether the
# cast is still needed (or should be applied after the relabelling instead).
sf_stacked['lever'] = sf_stacked['lever'].astype("category")

In [128]:
# Rows from any of the three transit-frequency scenarios get the 'Frequency'
# lever label; every other row keeps its current value.
transit_frequency_rows = sf_stacked['scenario'].isin(['TransitFreq0p5', 'TransitFreq1p5', 'TransitFreq2'])
sf_stacked['lever'] = np.where(transit_frequency_rows, 'Frequency', sf_stacked['lever'])

In [129]:
# Create 'lever_position' with 1 on every row; the following cells overwrite it
# for the TransitFreq* scenarios (0.5, 1.5 and 2).
sf_stacked['lever_position'] = 1

In [130]:
# TransitFreq0p5 rows sit at lever position 0.5; all other rows are left as they are.
sf_stacked['lever_position'] = np.where(
    sf_stacked['scenario'].eq('TransitFreq0p5'),
    0.5,
    sf_stacked['lever_position'],
)

In [131]:
# TransitFreq1p5 rows sit at lever position 1.5; all other rows are left as they are.
sf_stacked['lever_position'] = np.where(
    sf_stacked['scenario'].eq('TransitFreq1p5'),
    1.5,
    sf_stacked['lever_position'],
)

In [132]:
# TransitFreq2 rows sit at lever position 2; all other rows are left as they are.
sf_stacked['lever_position'] = np.where(
    sf_stacked['scenario'].eq('TransitFreq2'),
    2,
    sf_stacked['lever_position'],
)

In [133]:
# Collapse the TransitFreq0p5 scenario name into the shared "transit" label.
sf_stacked['scenario'] = np.where(
    sf_stacked['scenario'].eq('TransitFreq0p5'),
    "transit",
    sf_stacked['scenario'],
)

In [134]:
# Collapse the TransitFreq1p5 scenario name into the shared "transit" label.
sf_stacked['scenario'] = np.where(
    sf_stacked['scenario'].eq('TransitFreq1p5'),
    "transit",
    sf_stacked['scenario'],
)

In [135]:
# Collapse the TransitFreq2 scenario name into the shared "transit" label.
sf_stacked['scenario'] = np.where(
    sf_stacked['scenario'].eq('TransitFreq2'),
    "transit",
    sf_stacked['scenario'],
)

In [137]:
# Dimensions after adding 'lever' and 'lever_position', as (rows, columns).
sf_stacked.shape

(10156414, 140)

In [None]:
#%%time
#sf_stacked.to_csv('s3://beam-core-act/deepDive/CleanData/SanFrancisco/Stacked/sf_2018_stacked_tr_fr.csv', index=False)

#### Stacked File

In [3]:
%%time
# Folder holding the stacked (baseline + transit-frequency) export.
stacked_tr_fr_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/Stacked/"
# Read the stacked table back from the bucket URL.
sf_stacked_tr_fr = pd.read_csv(f"{stacked_tr_fr_path}sf_2018_stacked_tr_fr.csv")



CPU times: total: 3min 31s
Wall time: 9min 24s


In [4]:
# Order rows by IDMerged, then tripIndex, then ascending lever position, and
# renumber the index 0..n-1 in the sorted order.
stacked_sort_keys = ['IDMerged', 'tripIndex', 'lever_position']
sf_stacked_tr_fr = sf_stacked_tr_fr.sort_values(by=stacked_sort_keys, ignore_index=True)

In [5]:
# Move 'lever', 'lever_position' and 'logsum_trip_mode_AS_trips' to column
# positions 2, 3 and 4, directly after the first two columns.
leading_columns = ['lever', 'lever_position', 'logsum_trip_mode_AS_trips']
# Pop all three first so the insert positions refer to the frame without them.
first_column, second_column, third_column = [sf_stacked_tr_fr.pop(label) for label in leading_columns]
# DataFrame.insert(position, column_name, values) puts each one back in place.
for target_position, (label, values) in enumerate(
        zip(leading_columns, (first_column, second_column, third_column)), start=2):
    sf_stacked_tr_fr.insert(target_position, label, values)

In [8]:
#sf_stacked_tr_fr['logsum_relative_to_baseline'] = sf_stacked_tr_fr.groupby(['IDMerged','tripIndex'])['logsum_trip_mode_AS_trips'].apply(lambda x: x.diff())

In [9]:
#sf_stacked_tr_fr['logsum_relative_to_baseline'] = sf_stacked_tr_fr['logsum_trip_mode_AS_trips'] - sf_stacked_tr_fr.loc[sf_stacked_tr_fr.groupby(['IDMerged','tripIndex'])['lever_position'].transform('idxmin'), 'logsum_trip_mode_AS_trips'].values

In [6]:
%%time
# Baseline logsum per (IDMerged, tripIndex): blank out every row whose lever
# position is not 1.0, then broadcast the first remaining value of each group
# to all rows of that group.
baseline_logsum = (
    sf_stacked_tr_fr['logsum_trip_mode_AS_trips']
    .where(sf_stacked_tr_fr['lever_position'] == 1.0)
    .groupby([sf_stacked_tr_fr['IDMerged'], sf_stacked_tr_fr['tripIndex']])
    .transform('first')
    .convert_dtypes()
)
# Difference between each row's logsum and its group's baseline value.
sf_stacked_tr_fr['logsum_relative_to_baseline'] = (
    sf_stacked_tr_fr['logsum_trip_mode_AS_trips'] - baseline_logsum
)

CPU times: total: 2.83 s
Wall time: 2.6 s


In [11]:
# Baseline door-to-door duration per (IDMerged, tripIndex): blank out every row
# whose lever position is not 1.0, then broadcast the first remaining value of
# each group to all rows of that group.
baseline_door_to_door = (
    sf_stacked_tr_fr['duration_door_to_door']
    .where(sf_stacked_tr_fr['lever_position'] == 1.0)
    .groupby([sf_stacked_tr_fr['IDMerged'], sf_stacked_tr_fr['tripIndex']])
    .transform('first')
    .convert_dtypes()
)
# Difference between each row's duration and its group's baseline value.
sf_stacked_tr_fr['door_to_door_time_relative_to_baseline'] = (
    sf_stacked_tr_fr['duration_door_to_door'] - baseline_door_to_door
)

In [12]:
# Preview the first five rows; the two *_relative_to_baseline columns appear last.
sf_stacked_tr_fr.head()

Unnamed: 0,IDMerged,tripIndex,lever,lever_position,logsum_trip_mode_AS_trips,actEndTime,actEndType,actStartTime,actStartType,cost_BEAM,distance_bike,distance_mode_choice,distance_privateCar,distance_ridehail,distance_transit,distance_travelling,distance_walking,duration_in_privateCar,duration_in_ridehail,duration_in_transit,duration_on_bike,duration_travelling,duration_walking,emissionBiodiesel,emissionDiesel,emissionElectricity,emissionFood,emissionGasoline,emission_marginal,fuelBiodiesel,fuelDiesel,fuelElectricity,fuelFood,fuelGasoline,fuel_marginal,fuel_not_Food,mode_choice_actual_BEAM,mode_choice_planned_BEAM,numPassengers,parkingType,reason,replanning_status,ride_hail_pooled,scenario,transit_bus,transit_cable_car,transit_rail,transit_subway,transit_tram,vehicleIds,year,duration_door_to_door,waitTime,actPurpose,mode_choice_actual_5,trip_id,person_id,household_id,tour_id,primary_purpose_x,trip_count,purpose,destination_logsum_x,trip_mode_AS_trips,tour_type,tour_category,household_id_x,start,end,duration,destination_logsum_y,tour_mode_AS_tours,logsum_tours_mode_AS_tours,primary_purpose_y,earning,person_sex,PNUM,sex,edu,race,work_at_home,age,value_of_time,workplace_location_logsum,work_auto_savings,free_parking_at_work,mandatory_tour_frequency,num_mand,num_work_tours,num_joint_tours,non_mandatory_tour_frequency,num_non_mand,num_escort_tours,num_eatout_tours,num_shop_tours,num_maint_tours,num_discr_tours,num_social_tours,num_non_escort_tours,lcm_county_id,tenure_mover,hh_size,sf_detached,tenure,hh_cars,income,hh_income,hhsize,income_in_thousands,median_value_of_time,hh_value_of_time,home_is_urban,home_is_rural,auto_ownership,SPORDER,AGEP,SEX,DIS,VEH,DRIVEALONEFREE,DRIVEALONEPAY,SHARED2FREE,SHARED2PAY,SHARED3FREE,SHARED3PAY,WALK,BIKE,WALK_LOC,WALK_LRF,WALK_EXP,WALK_HVY,WALK_COM,DRIVE_LOC,DRIVE_LRF,DRIVE_EXP,DRIVE_HVY,DRIVE_COM,TAXI,TNC_SINGLE,TNC_SHARED,logsum_relative_to_baseline,door_to_door_time_relative_to_baseline
0,1,577.0,Frequency,0.5,-1.977407,29977.0,Home,31382.0,othmaint,0.0,0.0,16638.152,16638.152,0.0,0.0,16638.152,0.0,1405.0,0.0,0.0,0.0,1405.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,hov3_teleportation,hov3_teleportation,,,,0,0,transit,0,0,0,0,0,,2018,1405.0,0.0,Home_to_othmaint,car,577.0,1.0,1735309.0,72.0,school,2.0,othmaint,6.874664,SHARED3FREE,school,mandatory,1735309.0,8.0,15.0,7.0,,SHARED3FREE,-0.969775,school,0.0,male,3.0,1.0,1.0,asian,0.0,3.0,19.798008,,0.0,False,school1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6085.0,own not recent,four or more,yes,1.0,two or more,240000.0,gt150,4.0,240.0,12.86,29.682171,False,False,2.0,3.0,3.0,1.0,2.0,2.0,-999.02509,-999.025207,-2.76541,-2.765477,-1.796358,-1.796405,-1.962304,-999.712596,-1000.490635,-1998.779651,-1999.490635,-1998.779651,-1998.779651,-1998.976381,-2997.418679,-2998.052261,-2997.418679,-2997.418679,-8.970132,-9.314771,-7.841193,0.172925,841.0
1,1,577.0,default,1.0,-2.150332,29037.0,Home,29601.0,othmaint,0.0,0.0,6498.758,6498.758,0.0,0.0,6498.758,0.0,564.0,0.0,0.0,0.0,564.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,hov3_teleportation,hov3_teleportation,,,,0,0,baseline,0,0,0,0,0,,2018,564.0,0.0,Home_to_othmaint,car,577.0,1.0,1735309.0,72.0,school,2.0,othmaint,5.730384,SHARED2FREE,school,mandatory,1735309.0,8.0,14.0,6.0,,SHARED2PAY,-1.011892,school,0.0,male,3.0,1.0,1.0,asian,0.0,3.0,19.798008,,0.0,False,school1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6085.0,own not recent,four or more,yes,1.0,two or more,240000.0,gt150,4.0,240.0,12.86,29.682171,False,False,2.0,3.0,3.0,1.0,2.0,2.0,-999.02509,-999.025207,-2.76541,-2.765477,-1.796358,-1.796405,-1.962304,-999.712596,-1000.490635,-1998.779651,-1999.490635,-1998.779651,-1998.779651,-1998.976381,-2997.418679,-2998.052261,-2997.418679,-2997.418679,-8.970132,-9.314771,-7.82824,0.0,0.0
2,1,577.0,Frequency,1.5,-1.923495,29926.0,Home,30352.0,othmaint,0.0,0.0,4856.777,4856.777,0.0,0.0,4856.777,0.0,426.0,0.0,0.0,0.0,426.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,hov3_teleportation,hov3_teleportation,,,,0,0,transit,0,0,0,0,0,,2018,426.0,0.0,Home_to_othmaint,car,577.0,1.0,1735309.0,72.0,school,2.0,othmaint,6.50218,SHARED2FREE,school,mandatory,1735309.0,8.0,15.0,7.0,,SHARED2PAY,-0.678291,school,0.0,male,3.0,1.0,1.0,asian,0.0,3.0,19.798008,,0.0,False,school1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6085.0,own not recent,four or more,yes,1.0,two or more,240000.0,gt150,4.0,240.0,12.86,29.682171,False,False,2.0,3.0,3.0,1.0,2.0,2.0,-999.02509,-999.025207,-2.76541,-2.765477,-1.796358,-1.796405,-1.962304,-999.712596,-1000.490635,-1998.779651,-1999.490635,-1998.779651,-1998.779651,-1998.976381,-2997.418679,-2998.052261,-2997.418679,-2997.418679,-8.970132,-9.314771,-7.829821,0.226837,-138.0
3,1,577.0,Frequency,2.0,-2.028036,30999.0,Home,32376.0,othmaint,0.0,0.0,16246.598,16246.598,0.0,0.0,16246.598,0.0,1377.0,0.0,0.0,0.0,1377.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,hov3_teleportation,hov3_teleportation,,,,0,0,transit,0,0,0,0,0,,2018,1377.0,0.0,Home_to_othmaint,car,577.0,1.0,1735309.0,72.0,school,2.0,othmaint,5.459001,SHARED2FREE,school,mandatory,1735309.0,8.0,15.0,7.0,,SHARED2PAY,-0.970423,school,0.0,male,3.0,1.0,1.0,asian,0.0,3.0,19.798008,,0.0,False,school1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6085.0,own not recent,four or more,yes,1.0,two or more,240000.0,gt150,4.0,240.0,12.86,29.682171,False,False,2.0,3.0,3.0,1.0,2.0,2.0,-999.203047,-999.203862,-2.943366,-2.943832,-1.974314,-1.97464,-2.388872,-999.92588,-1000.490635,-1998.779651,-1999.490635,-1998.779651,-1998.779651,-1998.976381,-2997.418679,-2998.052261,-2997.418679,-2997.418679,-9.202719,-9.493426,-8.09413,0.122297,813.0
4,1,578.0,Frequency,0.5,-1.686592,31532.0,othmaint,31670.0,school,0.0,0.0,1641.981,1641.981,0.0,0.0,1641.981,0.0,138.0,0.0,0.0,0.0,138.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,hov3_teleportation,hov3_teleportation,,,,0,0,transit,0,0,0,0,0,,2018,138.0,0.0,othmaint_to_school,car,578.0,1.0,1735309.0,72.0,school,2.0,school,,SHARED2FREE,school,mandatory,1735309.0,8.0,15.0,7.0,,SHARED3FREE,-0.969775,school,0.0,male,3.0,1.0,1.0,asian,0.0,3.0,19.798008,,0.0,False,school1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6085.0,own not recent,four or more,yes,1.0,two or more,240000.0,gt150,4.0,240.0,12.86,29.682171,False,False,2.0,3.0,3.0,1.0,2.0,2.0,-999.573865,-999.575992,-3.314185,-3.3154,-2.345133,-2.345983,-1.962304,-999.712596,-1998.779651,-1998.779651,-1999.050651,-1998.779651,-1998.779651,-1999.528158,-2997.418679,-2998.777234,-2997.418679,-2997.418679,-9.687229,-9.902951,-8.869006,0.333575,-1.0


In [139]:
%%time
# Persist the stacked frame to S3 without the row index.
# NOTE(review): this cell sits below the "Stacked File" section, which reads the
# same object key over HTTPS; in a top-to-bottom run the read therefore happens
# before this write. Consider moving this cell to the end of the stacking section.
sf_stacked.to_csv(
    path_or_buf='s3://beam-core-act/deepDive/CleanData/SanFrancisco/Stacked/sf_2018_stacked_tr_fr.csv',
    index=False,
)

CPU times: total: 14min 36s
Wall time: 19min 7s
