In [1]:
import os
import os.path as op
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import seaborn as sns
# Single theme call: each set_theme/set call resets every theme parameter, so a
# preceding "darkgrid" call would be overridden by this one anyway.
# color_codes=True is kept explicit (it is also the default).
sns.set_theme(font_scale=1.35, style="ticks", color_codes=True)  # set styling preferences
import statsmodels.api as sm
from scipy import stats
import math
from math import pi
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from scipy.spatial.distance import cdist
from scipy.cluster.vq import kmeans2,vq, whiten

In [2]:
# Lift pandas' display limits so neither columns nor rows are elided when a
# frame is rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

#### Baseline

In [3]:
%%time
# S3 URL prefix of the baseline outputs, followed by the CSV read from it.
baseline2018 = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/Baseline/"
sfbase = pd.read_csv(f"{baseline2018}sf_2018_base_utilities.csv")



CPU times: total: 1min 16s
Wall time: 3min 53s


#### Transit and Stacking Process

In [7]:
%%time
# S3 URL prefix of the transit-frequency outputs, followed by the 0p5 CSV read from it.
transit_fr_05_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/TransitHead_Frequencies/"
sf_transit_fr_05 = pd.read_csv(f"{transit_fr_05_path}sf_2018_tr_fr_0p5.csv")



CPU times: total: 4min 6s
Wall time: 9min 51s


In [8]:
%%time
# S3 URL prefix of the transit-frequency outputs, followed by the 1p5 CSV read from it.
transit_fr_1p5_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/TransitHead_Frequencies/"
sf_transit_fr_1p5 = pd.read_csv(f"{transit_fr_1p5_path}sf_2018_tr_fr_1p5.csv")



CPU times: total: 1min 47s
Wall time: 4min 29s


In [26]:
%%time
# S3 URL prefix of the transit-frequency outputs, followed by the "2" CSV read from it.
transit_fr_2_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/TransitHead_Frequencies/"
sf_transit_fr_2 = pd.read_csv(f"{transit_fr_2_path}sf_2018_tr_fr_2.csv")



CPU times: total: 4min 34s
Wall time: 11min 7s


In [13]:
# (rows, columns) of the baseline table.
sfbase.shape

(2568424, 266)

In [14]:
# (rows, columns) of sf_transit_fr_05; the recorded 646-column output was
# produced before the positional column drop was executed on this frame.
sf_transit_fr_05.shape

(2529285, 646)

In [47]:
# (rows, columns) of sf_transit_fr_1p5.
sf_transit_fr_1p5.shape

(2529440, 266)

In [48]:
# (rows, columns) of sf_transit_fr_2; going by the execution counts, the
# recorded output was taken after the positional column drop on this frame.
sf_transit_fr_2.shape

(2529265, 266)

In [44]:
#sf_transit_fr_2 = sf_transit_fr_2.drop(['23','24'], axis=1)

In [28]:
# Remove the 380 columns at positions 245-624. The selection is positional, so
# it is applied only while the frame is still wide enough to contain the whole
# slice: re-running this cell on an already-trimmed frame would otherwise
# delete whatever columns now sit at position 245 onward.
if sf_transit_fr_2.shape[1] >= 625:
    sf_transit_fr_2 = sf_transit_fr_2.drop(columns=sf_transit_fr_2.columns[245:625])

In [49]:
# Remove the 380 columns at positions 245-624 (646 -> 266 columns in the
# recorded outputs). The selection is positional, so it is applied only while
# the frame is still wide enough to contain the whole slice: re-running this
# cell on an already-trimmed frame would otherwise delete whatever columns now
# sit at position 245 onward.
if sf_transit_fr_05.shape[1] >= 625:
    sf_transit_fr_05 = sf_transit_fr_05.drop(columns=sf_transit_fr_05.columns[245:625])

In [53]:
#sf_transit_fr_05 = sf_transit_fr_05.drop(['5','6','7','8','9','10','11'], axis=1)

In [55]:
# Re-check (rows, columns) of sf_transit_fr_05 after the positional column drop.
sf_transit_fr_05.shape

(2529285, 266)

In [121]:
# Stack the baseline table and the three transit-frequency tables row-wise.
frames = [sfbase, sf_transit_fr_05, sf_transit_fr_1p5, sf_transit_fr_2]
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every
# input keeps its own row labels, so the same labels repeat across the four
# inputs and label-aligned operations on the stacked frame become ambiguous
# or raise.
sf_stacked = pd.concat(frames, ignore_index=True)

In [122]:
# First batch of columns removed from the stacked table (40 labels).
sf_stacked = sf_stacked.drop(
    columns=[
        'has_school_kid', 'has_preschool_kid', 'has_retiree', 'has_non_worker',
        'adult', 'hispanic', 'relate',
        'pstudent', 'pemploy', 'ptype', 'p_hispanic',
        'age_16_p', 'age_16_to_19', 'work_zone_id',
        'race_id', 'school_zone_id', 'person_age', 'MAR',
        'hours', 'hispanic.1', 'student',
        'worker', 'stop_frequency', 'composition', 'tdd',
        'number_of_participants', 'tour_type_count',
        'tour_type_num', 'tour_num', 'tour_count', 'outbound',
        'trip_num', 'num_under16_not_at_school',
        'family', 'non_family', 'num_young_adults', 'num_college_age',
        'num_children_16_to_17',
        'num_children_5_to_15', 'num_young_children',
    ]
)

In [123]:
# Second batch of columns removed from the stacked table (87 distinct labels).
# Each label is listed once; a repeated 'is_gradeschool' entry added nothing.
sf_stacked = sf_stacked.drop(
    columns=[
        'BlockGroupEnd', 'BlockGroupStart', 'vehicleIds_estimate',
        'destination_x', 'origin_x', 'depart',
        'origin_y', 'destination_y', 'parent_tour_id',
        'atwork_subtour_frequency', 'household_id_y',
        'TAZ_x', 'home_x', 'home_y', 'male', 'female',
        'has_driving_kid', 'has_full_time', 'has_part_time',
        'has_university', 'student_is_employed', 'nonstudent_to_school',
        'is_student', 'is_gradeschool',
        'is_highschool', 'is_university', 'school_segment',
        'is_worker', 'home_taz',
        'school_taz', 'distance_to_school',
        'roundtrip_auto_time_to_school', 'workplace_taz',
        'distance_to_work', 'workplace_in_cbd', 'work_taz_area_type',
        'hh_race_of_head',
        'roundtrip_auto_time_to_work', 'work_auto_savings_ratio',
        'cdap_activity', 'travel_active',
        'under16_not_at_school', 'has_preschool_kid_at_home', 'block_id',
        'gt2', 'hispanic_head',
        'has_school_kid_at_home', 'work_and_school_and_worker',
        'age_of_head', 'race_of_head',
        'work_and_school_and_student', 'VEHICL', 'hh_children',
        'hh_age_of_head', 'num_workers',
        'gt55', 'seniors', 'recent_mover', 'hh_workers',
        'hispanic_status_of_head', 'hh_seniors',
        'hh_type', 'TAZ_y', 'HHT', 'sample_rate', 'chunk_id',
        'income_segment', 'num_non_workers',
        'num_drivers', 'num_adults', 'num_children',
        'hh_work_auto_savings_ratio', 'num_travel_active',
        'num_travel_active_adults', 'num_travel_active_preschoolers',
        'num_travel_active_children',
        'num_travel_active_non_preschoolers', 'participates_in_jtf_model',
        'joint_tour_frequency',
        'num_hh_joint_tours', 'SERIALNO', 'HINCP', 'JWMNP', 'JWRIP',
        'JWTR', 'RAC1P', 'RAC2P05',
    ]
)

In [124]:
# Remove the lower-case 'serialno' column as well.
sf_stacked = sf_stacked.drop(columns=['serialno'])

In [125]:
# (rows, columns) of the stacked table after the column drops.
sf_stacked.shape

(10156414, 138)

In [126]:
# Tag each row with its lever name and lever position, then collapse the three
# transit-frequency scenario labels into the single label "transit".

# Scenario label -> lever position for the transit-frequency runs. Rows with
# any other scenario keep lever "default" at position 1.
frequency_lever_positions = {
    'TransitFreq0p5': 0.5,
    'TransitFreq1p5': 1.5,
    'TransitFreq2': 2.0,
}

# Build the mask once, while 'scenario' still holds the original labels.
is_frequency_run = sf_stacked['scenario'].isin(list(frequency_lever_positions)).to_numpy()

# Stored as plain strings: np.where returns an ordinary array, so a categorical
# cast applied before it would not survive the assignment.
sf_stacked['lever'] = np.where(is_frequency_run, 'Frequency', 'default')

# Unmapped scenarios come back as NaN from .map and are filled with 1.0;
# .to_numpy() makes the assignment positional rather than label-aligned.
sf_stacked['lever_position'] = (
    sf_stacked['scenario'].map(frequency_lever_positions).fillna(1.0).to_numpy()
)

# Overwrite the scenario label last, since the steps above read it.
sf_stacked['scenario'] = np.where(is_frequency_run, "transit", sf_stacked['scenario'])

In [137]:
# (rows, columns) of the stacked table once 'lever' and 'lever_position' exist.
sf_stacked.shape

(10156414, 140)

In [None]:
#%%time
#sf_stacked.to_csv('s3://beam-core-act/deepDive/CleanData/SanFrancisco/Stacked/sf_2018_stacked_tr_fr.csv', index=False)

#### Stacked File

In [None]:
%%time
# S3 URL prefix of the stacked outputs, followed by the stacked CSV read from it.
stacked_tr_fr_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/Stacked/"
sf_stacked_tr_fr = pd.read_csv(f"{stacked_tr_fr_path}sf_2018_stacked_tr_fr.csv")

In [None]:
# TODO: not implemented. Kept as a comment because the expression is
# pseudo-code, not valid Python, and would stop a full run with a SyntaxError.
# Intended result:
#   sf_stacked['logsum_relative_to_baseline'] = logsum - baseline logsum
# Still to be decided: which logsum column is meant and how each row is matched
# to its baseline counterpart.

In [None]:
# TODO: not implemented. Kept as a comment because the assignment has no
# right-hand side and would stop a full run with a SyntaxError.
# Intended target column:
#   sf_stacked['door_to_door_time_relative_to_baseline'] = ...
# Still to be decided: the source column and how each row is matched to its
# baseline counterpart.

In [138]:
# Preview the first five rows of the stacked table.
sf_stacked.head()

Unnamed: 0,IDMerged,tripIndex,actEndTime,actEndType,actStartTime,actStartType,cost_BEAM,distance_bike,distance_mode_choice,distance_privateCar,distance_ridehail,distance_transit,distance_travelling,distance_walking,duration_in_privateCar,duration_in_ridehail,duration_in_transit,duration_on_bike,duration_travelling,duration_walking,emissionBiodiesel,emissionDiesel,emissionElectricity,emissionFood,emissionGasoline,emission_marginal,fuelBiodiesel,fuelDiesel,fuelElectricity,fuelFood,fuelGasoline,fuel_marginal,fuel_not_Food,mode_choice_actual_BEAM,mode_choice_planned_BEAM,numPassengers,parkingType,reason,replanning_status,ride_hail_pooled,scenario,transit_bus,transit_cable_car,transit_rail,transit_subway,transit_tram,vehicleIds,year,duration_door_to_door,waitTime,actPurpose,mode_choice_actual_5,trip_id,person_id,household_id,tour_id,primary_purpose_x,trip_count,purpose,destination_logsum_x,trip_mode_AS_trips,logsum_trip_mode_AS_trips,tour_type,tour_category,household_id_x,start,end,duration,destination_logsum_y,tour_mode_AS_tours,logsum_tours_mode_AS_tours,primary_purpose_y,earning,person_sex,PNUM,sex,edu,race,work_at_home,age,value_of_time,workplace_location_logsum,work_auto_savings,free_parking_at_work,mandatory_tour_frequency,num_mand,num_work_tours,num_joint_tours,non_mandatory_tour_frequency,num_non_mand,num_escort_tours,num_eatout_tours,num_shop_tours,num_maint_tours,num_discr_tours,num_social_tours,num_non_escort_tours,lcm_county_id,tenure_mover,hh_size,sf_detached,tenure,hh_cars,income,hh_income,hhsize,income_in_thousands,median_value_of_time,hh_value_of_time,home_is_urban,home_is_rural,auto_ownership,SPORDER,AGEP,SEX,DIS,VEH,DRIVEALONEFREE,DRIVEALONEPAY,SHARED2FREE,SHARED2PAY,SHARED3FREE,SHARED3PAY,WALK,BIKE,WALK_LOC,WALK_LRF,WALK_EXP,WALK_HVY,WALK_COM,DRIVE_LOC,DRIVE_LRF,DRIVE_EXP,DRIVE_HVY,DRIVE_COM,TAXI,TNC_SINGLE,TNC_SHARED,lever,lever_position
0,1,577.0,29037.0,Home,29601.0,othmaint,0.0,0.0,6498.758,6498.758,0.0,0.0,6498.758,0.0,564.0,0.0,0.0,0.0,564.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,hov3_teleportation,hov3_teleportation,,,,0,0,baseline,0,0,0,0,0,,2018,564.0,0.0,Home_to_othmaint,car,577.0,1.0,1735309.0,72.0,school,2.0,othmaint,5.730384,SHARED2FREE,-2.150332,school,mandatory,1735309.0,8.0,14.0,6.0,,SHARED2PAY,-1.011892,school,0.0,male,3.0,1.0,1.0,asian,0.0,3.0,19.798008,,0.0,False,school1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6085.0,own not recent,four or more,yes,1.0,two or more,240000.0,gt150,4.0,240.0,12.86,29.682171,False,False,2.0,3.0,3.0,1.0,2.0,2.0,-999.02509,-999.025207,-2.76541,-2.765477,-1.796358,-1.796405,-1.962304,-999.712596,-1000.490635,-1998.779651,-1999.490635,-1998.779651,-1998.779651,-1998.976381,-2997.418679,-2998.052261,-2997.418679,-2997.418679,-8.970132,-9.314771,-7.82824,default,1.0
1,1,578.0,29751.0,othmaint,29890.0,school,0.0,0.0,1641.981,1641.981,0.0,0.0,1641.981,0.0,139.0,0.0,0.0,0.0,139.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,hov3_teleportation,hov3_teleportation,,,,0,0,baseline,0,0,0,0,0,,2018,139.0,0.0,othmaint_to_school,car,578.0,1.0,1735309.0,72.0,school,2.0,school,,SHARED2FREE,-2.020166,school,mandatory,1735309.0,8.0,14.0,6.0,,SHARED2PAY,-1.011892,school,0.0,male,3.0,1.0,1.0,asian,0.0,3.0,19.798008,,0.0,False,school1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6085.0,own not recent,four or more,yes,1.0,two or more,240000.0,gt150,4.0,240.0,12.86,29.682171,False,False,2.0,3.0,3.0,1.0,2.0,2.0,-999.02509,-999.025207,-2.76541,-2.765477,-1.796358,-1.796405,-1.962304,-999.712596,-1998.779651,-1998.779651,-1999.050651,-1998.779651,-1998.779651,-1999.528158,-2997.418679,-2998.777234,-2997.418679,-2997.418679,-8.970132,-9.314771,-7.95832,default,1.0
2,1,581.0,52812.0,school,53394.0,Home,0.0,0.0,6326.206,6326.206,0.0,0.0,6326.206,0.0,582.0,0.0,0.0,0.0,582.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,hov3_teleportation,hov3_teleportation,,,,0,0,baseline,0,0,0,0,0,,2018,582.0,0.0,school_to_Home,car,581.0,1.0,1735309.0,72.0,school,1.0,Home,,SHARED2FREE,-1.999284,school,mandatory,1735309.0,8.0,14.0,6.0,,SHARED2PAY,-1.011892,school,0.0,male,3.0,1.0,1.0,asian,0.0,3.0,19.798008,,0.0,False,school1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6085.0,own not recent,four or more,yes,1.0,two or more,240000.0,gt150,4.0,240.0,12.86,29.682171,False,False,2.0,3.0,3.0,1.0,2.0,2.0,-999.689724,-999.692354,-3.428142,-3.429645,-2.458329,-2.459381,-2.303144,-999.883016,-1000.424595,-1998.779651,-1999.424595,-1998.779651,-1998.779651,-2997.418679,-2997.418679,-2997.418679,-2997.418679,-2997.418679,-9.975599,-10.166096,-9.463768,default,1.0
3,2,689.0,46533.0,work,48475.0,atwork,3.656274,0.0,34122.434,33336.011,0.0,0.0,34122.434,786.423,1337.0,0.0,0.0,0.0,1942.0,605.0,0.0,0.0,0.0,0.0,0.003731,0.0,0.0,0.0,0.0,41680.419,50479440.0,50521120.0,50479440.0,car,car,"1.0, 0.0, 0.0, 1.0","Public, Public",,0,0,baseline,0,0,0,0,0,"539168, body-2",2018,1942.0,0.0,work_to_atwork,car,689.0,2.0,1735309.0,86.0,atwork,1.0,atwork,,DRIVEALONEFREE,-0.505729,eat,atwork,1735309.0,12.0,14.0,2.0,15.981749,DRIVEALONEFREE,-0.390809,atwork,77000.0,female,2.0,2.0,21.0,asian,0.0,30.0,29.682171,13.778423,707.1312,False,work1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6085.0,own not recent,four or more,yes,1.0,two or more,240000.0,gt150,4.0,240.0,12.86,29.682171,False,False,2.0,2.0,30.0,2.0,2.0,2.0,-0.81591,-0.818141,-999.81591,-999.817185,-999.81591,-999.816802,-14.840951,-1999.742783,-1998.140532,-1998.140532,-1998.140532,-1998.140532,-1998.140532,-1998.308616,-1998.308616,-1998.308616,-1998.308616,-1998.308616,-10.575011,-10.203032,-9.792319,default,1.0
4,2,693.0,49010.0,atwork,49126.0,work,0.182227,0.0,1691.729,1661.452,0.0,0.0,1691.729,30.277,93.0,0.0,0.0,0.0,116.0,23.0,0.0,0.0,0.0,0.0,0.000165,0.0,0.0,0.0,0.0,1604.681,2238445.0,2240049.0,2238445.0,car,car,"1.0, 0.0, 0.0, 1.0","Public, Workplace",,0,0,baseline,0,0,0,0,0,"539168, body-2",2018,116.0,0.0,atwork_to_work,car,693.0,2.0,1735309.0,86.0,atwork,1.0,Work,,DRIVEALONEPAY,-0.5078,eat,atwork,1735309.0,12.0,14.0,2.0,15.981749,DRIVEALONEFREE,-0.390809,atwork,77000.0,female,2.0,2.0,21.0,asian,0.0,30.0,29.682171,13.778423,707.1312,False,work1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6085.0,own not recent,four or more,yes,1.0,two or more,240000.0,gt150,4.0,240.0,12.86,29.682171,False,False,2.0,2.0,30.0,2.0,2.0,2.0,-0.81695,-0.819181,-999.816504,-999.817779,-999.816326,-999.817218,-14.840951,-1999.742783,-1998.140532,-1998.140532,-1998.140532,-1998.140532,-1998.140532,-1998.308616,-1998.308616,-1998.308616,-1998.308616,-1998.308616,-10.575011,-10.203032,-9.792319,default,1.0


In [139]:
%%time
# Write the stacked table to S3 as CSV, leaving out the index column.
sf_stacked.to_csv(
    path_or_buf='s3://beam-core-act/deepDive/CleanData/SanFrancisco/Stacked/sf_2018_stacked_tr_fr.csv',
    index=False,
)

CPU times: total: 14min 36s
Wall time: 19min 7s
