In [1]:
import os
import os.path as op
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import seaborn as sns
# Plot styling: "ticks" style with enlarged fonts and short colour codes enabled.
# A single call is enough: each set_theme call re-applies the whole theme, so an
# earlier call with style="darkgrid" would be overridden anyway.
sns.set_theme(style="ticks", font_scale=1.35, color_codes=True)
import statsmodels.api as sm
from scipy import stats
import math
from math import pi
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from scipy.spatial.distance import cdist
from scipy.cluster.vq import kmeans2,vq, whiten

In [2]:
# Lift the display limits so every column and every row of a frame is rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

#### Baseline

In [3]:
%%time
# Folder holding the cleaned 2018 San Francisco baseline output.
baseline2018 = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/Baseline/"
# Read the baseline utilities table over HTTPS.
sfbase = pd.read_csv(f"{baseline2018}sf_2018_base_utilities.csv")



CPU times: total: 1min 9s
Wall time: 6min 16s


#### Transit and Stacking Process

In [None]:
%%time
# Folder holding the transit-frequency scenario outputs.
transit_fr_05_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/TransitHead_Frequencies/"
# Read the 0.5 frequency scenario table.
sf_transit_fr_05 = pd.read_csv(f"{transit_fr_05_path}sf_2018_tr_fr_0p5.csv")

In [None]:
%%time
# Folder holding the transit-frequency scenario outputs.
transit_fr_1p5_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/TransitHead_Frequencies/"
# Read the 1.5 frequency scenario table.
sf_transit_fr_1p5 = pd.read_csv(f"{transit_fr_1p5_path}sf_2018_tr_fr_1p5.csv")

In [26]:
%%time
# Folder holding the transit-frequency scenario outputs.
transit_fr_2_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/TransitHead_Frequencies/"
# Read the 2.0 frequency scenario table.
sf_transit_fr_2 = pd.read_csv(f"{transit_fr_2_path}sf_2018_tr_fr_2.csv")



CPU times: total: 4min 34s
Wall time: 11min 7s


In [13]:
# Row and column counts of the baseline frame.
sfbase.shape

(2568424, 266)

In [14]:
# Row and column counts of the 0.5 frequency frame as loaded.
sf_transit_fr_05.shape

(2529285, 646)

In [47]:
# Row and column counts of the 1.5 frequency frame.
sf_transit_fr_1p5.shape

(2529440, 266)

In [48]:
# Row and column counts of the 2.0 frequency frame.
sf_transit_fr_2.shape

(2529265, 266)

In [44]:
#sf_transit_fr_2 = sf_transit_fr_2.drop(['23','24'], axis=1)

In [28]:
# Remove the columns at positions 245-624 so this frame is left with the same column
# count as the baseline frame (presumably these are surplus columns -- confirm).
# The width guard makes the cell safe to re-run: on an already-trimmed frame the same
# positional slice would silently remove the real columns from position 245 onward.
if sf_transit_fr_2.shape[1] >= 625:
    sf_transit_fr_2 = sf_transit_fr_2.drop(columns=sf_transit_fr_2.columns[245:625])

In [49]:
# Remove the columns at positions 245-624, taking the frame from 646 columns to 266.
# The width guard makes the cell safe to re-run: on an already-trimmed frame the same
# positional slice would silently remove the real columns from position 245 onward.
if sf_transit_fr_05.shape[1] >= 625:
    sf_transit_fr_05 = sf_transit_fr_05.drop(columns=sf_transit_fr_05.columns[245:625])

In [53]:
#sf_transit_fr_05 = sf_transit_fr_05.drop(['5','6','7','8','9','10','11'], axis=1)

In [55]:
# Confirm the column count of the 0.5 frequency frame after trimming.
sf_transit_fr_05.shape

(2529285, 266)

In [121]:
# Stack the baseline and the three transit-frequency scenarios row-wise.
# ignore_index=True gives the result a fresh 0..n-1 row index; without it each input
# keeps its own row labels, so the stacked frame would contain repeated labels.
frames = [sfbase, sf_transit_fr_05, sf_transit_fr_1p5, sf_transit_fr_2]
sf_stacked = pd.concat(frames, ignore_index=True)

In [122]:
# First batch of columns removed from the stacked frame.
sf_stacked = sf_stacked.drop(columns=[
    'has_school_kid', 'has_preschool_kid', 'has_retiree', 'has_non_worker', 'adult',
    'hispanic', 'relate', 'pstudent', 'pemploy', 'ptype', 'p_hispanic', 'age_16_p',
    'age_16_to_19', 'work_zone_id', 'race_id', 'school_zone_id', 'person_age', 'MAR',
    'hours', 'hispanic.1', 'student', 'worker', 'stop_frequency', 'composition', 'tdd',
    'number_of_participants', 'tour_type_count', 'tour_type_num', 'tour_num',
    'tour_count', 'outbound', 'trip_num', 'num_under16_not_at_school', 'family',
    'non_family', 'num_young_adults', 'num_college_age', 'num_children_16_to_17',
    'num_children_5_to_15', 'num_young_children',
])

In [123]:
# Second batch of columns removed from the stacked frame.
# Each label appears exactly once ('is_gradeschool' had been listed twice).
sf_stacked = sf_stacked.drop(columns=[
    'BlockGroupEnd', 'BlockGroupStart', 'vehicleIds_estimate', 'destination_x', 'origin_x',
    'depart', 'origin_y', 'destination_y', 'parent_tour_id', 'atwork_subtour_frequency',
    'household_id_y', 'TAZ_x', 'home_x', 'home_y', 'male', 'female', 'has_driving_kid',
    'has_full_time', 'has_part_time', 'has_university', 'student_is_employed',
    'nonstudent_to_school', 'is_student', 'is_gradeschool', 'is_highschool',
    'is_university', 'school_segment', 'is_worker', 'home_taz', 'school_taz',
    'distance_to_school', 'roundtrip_auto_time_to_school', 'workplace_taz',
    'distance_to_work', 'workplace_in_cbd', 'work_taz_area_type', 'hh_race_of_head',
    'roundtrip_auto_time_to_work', 'work_auto_savings_ratio', 'cdap_activity',
    'travel_active', 'under16_not_at_school', 'has_preschool_kid_at_home', 'block_id',
    'gt2', 'hispanic_head', 'has_school_kid_at_home', 'work_and_school_and_worker',
    'age_of_head', 'race_of_head', 'work_and_school_and_student', 'VEHICL', 'hh_children',
    'hh_age_of_head', 'num_workers', 'gt55', 'seniors', 'recent_mover', 'hh_workers',
    'hispanic_status_of_head', 'hh_seniors', 'hh_type', 'TAZ_y', 'HHT', 'sample_rate',
    'chunk_id', 'income_segment', 'num_non_workers', 'num_drivers', 'num_adults',
    'num_children', 'hh_work_auto_savings_ratio', 'num_travel_active',
    'num_travel_active_adults', 'num_travel_active_preschoolers',
    'num_travel_active_children', 'num_travel_active_non_preschoolers',
    'participates_in_jtf_model', 'joint_tour_frequency', 'num_hh_joint_tours', 'SERIALNO',
    'HINCP', 'JWMNP', 'JWRIP', 'JWTR', 'RAC1P', 'RAC2P05',
])

In [124]:
# Remove the lowercase 'serialno' column as well.
sf_stacked = sf_stacked.drop(columns='serialno')

In [125]:
# Row and column counts of the stacked frame after the column drops.
sf_stacked.shape

(10156414, 138)

In [126]:
# Start every row with the lever label "default"; scenario rows are relabelled in a later cell.
sf_stacked['lever'] = "default"

In [127]:
# Store the lever label as a pandas categorical.
# NOTE(review): the following cell reassigns 'lever' from an np.where result, which
# looks like it replaces this categorical column -- confirm whether the cast is still needed.
sf_stacked['lever'] = sf_stacked['lever'].astype("category")

In [128]:
# Rows from any of the three transit-frequency scenarios get the lever label 'Frequency';
# all other rows keep their current label.
frequency_scenarios = ['TransitFreq0p5', 'TransitFreq1p5', 'TransitFreq2']
sf_stacked['lever'] = np.where(sf_stacked['scenario'].isin(frequency_scenarios),
                               'Frequency', sf_stacked['lever'])

In [129]:
# Default lever position is 1; the scenario-specific values are filled in by the next cells.
sf_stacked['lever_position'] = 1

In [130]:
# TransitFreq0p5 rows get lever position 0.5; other rows are left as they are.
is_freq_0p5 = sf_stacked['scenario'].eq('TransitFreq0p5')
sf_stacked['lever_position'] = np.where(is_freq_0p5, 0.5, sf_stacked['lever_position'])

In [131]:
# TransitFreq1p5 rows get lever position 1.5; other rows are left as they are.
is_freq_1p5 = sf_stacked['scenario'].eq('TransitFreq1p5')
sf_stacked['lever_position'] = np.where(is_freq_1p5, 1.5, sf_stacked['lever_position'])

In [132]:
# TransitFreq2 rows get lever position 2; other rows are left as they are.
is_freq_2 = sf_stacked['scenario'].eq('TransitFreq2')
sf_stacked['lever_position'] = np.where(is_freq_2, 2, sf_stacked['lever_position'])

In [133]:
# Collapse the TransitFreq0p5 scenario name into the common label "transit".
is_freq_0p5 = sf_stacked['scenario'].eq('TransitFreq0p5')
sf_stacked['scenario'] = np.where(is_freq_0p5, "transit", sf_stacked['scenario'])

In [134]:
# Collapse the TransitFreq1p5 scenario name into the common label "transit".
is_freq_1p5 = sf_stacked['scenario'].eq('TransitFreq1p5')
sf_stacked['scenario'] = np.where(is_freq_1p5, "transit", sf_stacked['scenario'])

In [135]:
# Collapse the TransitFreq2 scenario name into the common label "transit".
is_freq_2 = sf_stacked['scenario'].eq('TransitFreq2')
sf_stacked['scenario'] = np.where(is_freq_2, "transit", sf_stacked['scenario'])

In [137]:
# Row and column counts after adding 'lever' and 'lever_position'.
sf_stacked.shape

(10156414, 140)

In [None]:
#%%time
#sf_stacked.to_csv('s3://beam-core-act/deepDive/CleanData/SanFrancisco/Stacked/sf_2018_stacked_tr_fr.csv', index=False)

#### Stacked File

In [3]:
%%time
# Folder holding the stacked transit-frequency files.
stacked_tr_fr_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/Stacked/"
# Read the first stacked file.
sf_stacked_tr_fr = pd.read_csv(f"{stacked_tr_fr_path}sf_2018_stacked_tr_fr.csv")



CPU times: total: 3min 46s
Wall time: 8min 2s


In [4]:
# Order rows by person, trip and lever position, and renumber the row index from 0.
sf_stacked_tr_fr = sf_stacked_tr_fr.sort_values(
    ['IDMerged', 'tripIndex', 'lever_position'],
    ignore_index=True,
)

In [5]:
# Move the lever descriptors and the trip logsum to column positions 2, 3 and 4.
# All three are popped before any is re-inserted, so the final order does not depend
# on where the columns were sitting beforehand.
leading_names = ['lever', 'lever_position', 'logsum_trip_mode_AS_trips']
leading_values = [sf_stacked_tr_fr.pop(name) for name in leading_names]
for position, (name, values) in enumerate(zip(leading_names, leading_values), start=2):
    sf_stacked_tr_fr.insert(position, name, values)

In [8]:
#sf_stacked_tr_fr['logsum_relative_to_baseline'] = sf_stacked_tr_fr.groupby(['IDMerged','tripIndex'])['logsum_trip_mode_AS_trips'].apply(lambda x: x.diff())

In [9]:
#sf_stacked_tr_fr['logsum_relative_to_baseline'] = sf_stacked_tr_fr['logsum_trip_mode_AS_trips'] - sf_stacked_tr_fr.loc[sf_stacked_tr_fr.groupby(['IDMerged','tripIndex'])['lever_position'].transform('idxmin'), 'logsum_trip_mode_AS_trips'].values

In [6]:
%%time
# For each (IDMerged, tripIndex) group, take the logsum of the row whose lever_position is 1.0
# and subtract it from every row of that group.
trip_keys = [sf_stacked_tr_fr['IDMerged'], sf_stacked_tr_fr['tripIndex']]
baseline_logsum = (
    sf_stacked_tr_fr['logsum_trip_mode_AS_trips']
    .where(sf_stacked_tr_fr['lever_position'].eq(1.0))  # blank out rows not at lever position 1.0
    .groupby(trip_keys)
    .transform('first')                                 # broadcast the first non-null value per group
    .convert_dtypes()
)
sf_stacked_tr_fr['logsum_relative_to_baseline'] = (
    sf_stacked_tr_fr['logsum_trip_mode_AS_trips'] - baseline_logsum
)

CPU times: total: 2.95 s
Wall time: 2.85 s


In [7]:
# For each (IDMerged, tripIndex) group, take the door-to-door duration of the row whose
# lever_position is 1.0 and subtract it from every row of that group.
trip_keys = [sf_stacked_tr_fr['IDMerged'], sf_stacked_tr_fr['tripIndex']]
baseline_duration = (
    sf_stacked_tr_fr['duration_door_to_door']
    .where(sf_stacked_tr_fr['lever_position'].eq(1.0))  # blank out rows not at lever position 1.0
    .groupby(trip_keys)
    .transform('first')                                 # broadcast the first non-null value per group
    .convert_dtypes()
)
sf_stacked_tr_fr['door_to_door_time_relative_to_baseline'] = (
    sf_stacked_tr_fr['duration_door_to_door'] - baseline_duration
)

In [10]:
# Move the two relative-to-baseline measures, income and actPurpose to column positions 5-8.
# All four are popped before any is re-inserted, so the final order does not depend
# on where the columns were sitting beforehand.
moved_names = ['logsum_relative_to_baseline', 'door_to_door_time_relative_to_baseline',
               'income', 'actPurpose']
moved_values = [sf_stacked_tr_fr.pop(name) for name in moved_names]
for position, (name, values) in enumerate(zip(moved_names, moved_values), start=5):
    sf_stacked_tr_fr.insert(position, name, values)

In [12]:
# Remove columns that are not used further on.
sf_stacked_tr_fr = sf_stacked_tr_fr.drop(columns=[
    'sf_detached',
    'destination_logsum_x',
    'household_id_x',
    'destination_logsum_y',
    'logsum_tours_mode_AS_tours',
    'primary_purpose_y',
    'PNUM',
    'workplace_location_logsum',
    'work_auto_savings',
    'mandatory_tour_frequency',
    'SPORDER',
])

In [13]:
# The trip-level logsum is published under the name "potential INEXUS".
sf_stacked_tr_fr.rename(
    {"logsum_trip_mode_AS_trips": "logsum_trip_potential_INEXUS"},
    axis="columns",
    inplace=True,
)

In [16]:
# Boolean masks that fold the detailed BEAM modes into six summary modes.
# `conditions` and `choices` are parallel lists: the i-th mask selects the i-th label.
beam_mode = sf_stacked_tr_fr['mode_choice_actual_BEAM']
conditions = [
    beam_mode.isin(['ride_hail', 'ride_hail_pooled']),
    beam_mode.isin(['walk_transit', 'drive_transit', 'bike_transit']),
    beam_mode.eq('walk'),
    beam_mode.eq('bike'),
    beam_mode.isin(['car', 'car_hov2', 'car_hov3', 'hov2_teleportation', 'hov3_teleportation']),
    beam_mode.eq('ride_hail_transit'),
]
choices = ['ride_hail', 'transit', 'walk', 'bike', 'car', 'ride_hail_transit']

In [17]:
# Assign the six-way mode label; rows matching none of the masks are left missing.
# default=None (not np.nan): the choices are strings, and a float NaN default either gets
# coerced to the literal text 'nan' or is rejected by NumPy for lacking a common dtype.
# None keeps the result an object column that pandas treats as missing.
sf_stacked_tr_fr['mode_choice_actual_6'] = np.select(conditions, choices, default=None)

In [30]:
# Place the six-way mode summary at column position 9 and the raw BEAM mode at position 10.
# Both columns are popped before either is re-inserted, so the outcome is the same on a
# fresh run (where 'mode_choice_actual_6' has just been appended as the last column)
# and on a re-run of this cell.
mode_6_column = sf_stacked_tr_fr.pop('mode_choice_actual_6')
mode_beam_column = sf_stacked_tr_fr.pop('mode_choice_actual_BEAM')
sf_stacked_tr_fr.insert(9, 'mode_choice_actual_6', mode_6_column)
sf_stacked_tr_fr.insert(10, 'mode_choice_actual_BEAM', mode_beam_column)

In [28]:
# Size of the subset whose summary mode is ride_hail_transit.
sf_stacked_tr_fr.loc[sf_stacked_tr_fr['mode_choice_actual_6'].eq('ride_hail_transit')].shape

(2607, 132)

In [26]:
# Size of the subset with a positive replanning_status.
sf_stacked_tr_fr.loc[sf_stacked_tr_fr['replanning_status'].gt(0)].shape

(24516, 132)

In [34]:
# Preview the last rows to check the new columns and the column order.
sf_stacked_tr_fr.tail()

Unnamed: 0,IDMerged,tripIndex,lever,lever_position,logsum_trip_potential_INEXUS,logsum_relative_to_baseline,door_to_door_time_relative_to_baseline,income,actPurpose,mode_choice_actual_6,mode_choice_actual_BEAM,actEndTime,actEndType,actStartTime,actStartType,cost_BEAM,distance_bike,distance_mode_choice,distance_privateCar,distance_ridehail,distance_transit,distance_travelling,distance_walking,duration_in_privateCar,duration_in_ridehail,duration_in_transit,duration_on_bike,duration_travelling,duration_walking,emissionBiodiesel,emissionDiesel,emissionElectricity,emissionFood,emissionGasoline,emission_marginal,fuelBiodiesel,fuelDiesel,fuelElectricity,fuelFood,fuelGasoline,fuel_marginal,fuel_not_Food,mode_choice_planned_BEAM,numPassengers,parkingType,reason,replanning_status,ride_hail_pooled,scenario,transit_bus,transit_cable_car,transit_rail,transit_subway,transit_tram,vehicleIds,year,duration_door_to_door,waitTime,mode_choice_actual_5,trip_id,person_id,household_id,tour_id,primary_purpose_x,trip_count,purpose,trip_mode_AS_trips,tour_type,tour_category,start,end,duration,tour_mode_AS_tours,earning,person_sex,sex,edu,race,work_at_home,age,value_of_time,free_parking_at_work,num_mand,num_work_tours,num_joint_tours,non_mandatory_tour_frequency,num_non_mand,num_escort_tours,num_eatout_tours,num_shop_tours,num_maint_tours,num_discr_tours,num_social_tours,num_non_escort_tours,lcm_county_id,tenure_mover,hh_size,tenure,hh_cars,hh_income,hhsize,income_in_thousands,median_value_of_time,hh_value_of_time,home_is_urban,home_is_rural,auto_ownership,AGEP,SEX,DIS,VEH,DRIVEALONEFREE,DRIVEALONEPAY,SHARED2FREE,SHARED2PAY,SHARED3FREE,SHARED3PAY,WALK,BIKE,WALK_LOC,WALK_LRF,WALK_EXP,WALK_HVY,WALK_COM,DRIVE_LOC,DRIVE_LRF,DRIVE_EXP,DRIVE_HVY,DRIVE_COM,TAXI,TNC_SINGLE,TNC_SHARED
10156409,7399583,2427063000.0,Frequency,2.0,-0.233137,-0.009141,-172.0,50300.0,othmaint_to_othmaint,car,car,40502.0,othmaint,41042.0,othmaint,1.222932,0.0,11150.061,11150.061,0.0,0.0,11150.061,0.0,540.0,0.0,0.0,0.0,540.0,0.0,0.0,0.0,0.0,0.0,0.001361,0.0,0.0,0.0,0.0,0.0,18419710.0,18419710.0,18419710.0,car,"1.0, 0.0, 0.0, 1.0","Public, Public",,0,0,transit,0,0,0,0,0,"body-7399583, 397535",2018,540.0,0.0,car,2427063000.0,7399583.0,2895003.0,303382931.0,othmaint,2.0,othmaint,DRIVEALONEFREE,othmaint,non_mandatory,8.0,20.0,12.0,SHARED2FREE,0.0,female,2.0,19.0,white,0.0,63.0,5.413348,False,0.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,6081.0,own not recent,one,1.0,one,gt30-lt60,1.0,50.3,8.81,5.413348,False,False,1.0,63.0,2.0,2.0,1.0,-0.809381,-0.818719,-1000.531417,-1000.536753,-1000.526392,-1000.530127,-19.998541,-1001.430446,-1001.079926,-1999.280771,-2000.079926,-2000.040215,-1998.228851,-1998.258251,-1998.258251,-1998.258251,-1998.258251,-1998.258251,-5.312021,-4.156547,-2.842031
10156410,7399583,2427063000.0,Frequency,0.5,-0.474279,-0.863033,-161.0,50300.0,othmaint_to_Home,car,car,42854.0,othmaint,43866.0,Home,2.405412,0.0,21931.3,21931.3,0.0,0.0,21931.3,0.0,1012.0,0.0,0.0,0.0,1012.0,0.0,0.0,0.0,0.0,0.0,0.002369,0.0,0.0,0.0,0.0,0.0,32048400.0,32048400.0,32048400.0,car,"1.0, 0.0, 0.0, 1.0","Public, Residential",,0,0,transit,0,0,0,0,0,"397535, body-7399583",2018,1012.0,0.0,car,2427063000.0,7399583.0,2895003.0,303382931.0,othmaint,2.0,Home,DRIVEALONEPAY,othmaint,non_mandatory,8.0,20.0,12.0,SHARED2FREE,0.0,female,2.0,19.0,white,0.0,63.0,5.413348,False,0.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,6081.0,own not recent,one,1.0,one,gt30-lt60,1.0,50.3,8.81,5.413348,False,False,1.0,63.0,2.0,2.0,1.0,-0.703996,-0.711579,-1000.430404,-1000.434738,-1000.427128,-1000.430161,-46.067103,-1014.463761,-1000.282505,-1998.226915,-1999.457505,-1998.226915,-1998.226915,-1998.256315,-1998.256315,-1998.256315,-1998.256315,-1998.256315,-7.636213,-5.524961,-2.618138
10156411,7399583,2427063000.0,default,1.0,0.388754,0.0,0.0,50300.0,othmaint_to_Home,car,car,41436.0,othmaint,42609.0,Home,2.257056,0.0,20882.013,20578.669,0.0,0.0,20882.013,303.344,940.0,0.0,0.0,0.0,1173.0,233.0,0.0,0.0,0.0,0.0,0.002218,0.0,0.0,0.0,0.0,16077.232,30006300.0,30022380.0,30006300.0,car,"1.0, 0.0, 0.0, 1.0","Public, Residential",,0,0,baseline,0,0,0,0,0,"397535, body-7399583",2018,1173.0,0.0,car,2427063000.0,7399583.0,2895003.0,303382931.0,othmaint,2.0,Home,DRIVEALONEPAY,othmaint,non_mandatory,8.0,20.0,12.0,SHARED2FREE,0.0,female,2.0,19.0,white,0.0,63.0,5.413348,False,0.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,6081.0,own not recent,one,1.0,one,gt30-lt60,1.0,50.3,8.81,5.413348,False,False,1.0,63.0,2.0,2.0,1.0,-0.703996,-0.711579,-1000.430404,-1000.434738,-1000.427128,-1000.430161,-46.067103,-1014.463761,-1000.282505,-1998.226915,-1999.457505,-1998.226915,-1998.226915,-1998.256315,-1998.256315,-1998.256315,-1998.256315,-1998.256315,-7.636213,-5.524961,-2.617893
10156412,7399583,2427063000.0,Frequency,1.5,0.055413,-0.333342,-2.0,50300.0,othmaint_to_Home,car,car,42253.0,othmaint,43424.0,Home,2.433778,0.0,22364.823,22189.927,0.0,0.0,22364.823,174.896,1036.0,0.0,0.0,0.0,1171.0,135.0,0.0,0.0,0.0,0.0,0.002592,0.0,0.0,0.0,0.0,9269.488,35064730.0,35074000.0,35064730.0,car,"1.0, 0.0, 0.0, 1.0","Public, Residential",,0,0,transit,0,0,0,0,0,"397535, body-7399583",2018,1171.0,0.0,car,2427063000.0,7399583.0,2895003.0,303382931.0,othmaint,2.0,Home,DRIVEALONEPAY,othmaint,non_mandatory,8.0,20.0,12.0,SHARED2FREE,0.0,female,2.0,19.0,white,0.0,63.0,5.413348,False,0.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,6081.0,own not recent,one,1.0,one,gt30-lt60,1.0,50.3,8.81,5.413348,False,False,1.0,63.0,2.0,2.0,1.0,-1.0929,-1.105661,-1000.819308,-1000.826601,-1000.816032,-1000.821136,-46.067103,-1014.463761,-1000.282505,-1998.226915,-1999.457505,-1998.226915,-1998.226915,-1998.256315,-1998.256315,-1998.256315,-1998.256315,-1998.256315,-8.544944,-6.98241,-3.641737
10156413,7399583,2427063000.0,Frequency,2.0,-0.050872,-0.439626,-167.0,50300.0,othmaint_to_Home,car,car,42393.0,othmaint,43399.0,Home,2.267462,0.0,20682.763,20673.541,0.0,0.0,20682.763,9.222,999.0,0.0,0.0,0.0,1006.0,7.0,0.0,0.0,0.0,0.0,0.002406,0.0,0.0,0.0,0.0,488.766,32546820.0,32547310.0,32546820.0,car,"1.0, 0.0, 0.0, 1.0","Public, Residential",,0,0,transit,0,0,0,0,0,"body-7399583, 397535",2018,1006.0,0.0,car,2427063000.0,7399583.0,2895003.0,303382931.0,othmaint,2.0,Home,DRIVEALONEPAY,othmaint,non_mandatory,8.0,20.0,12.0,SHARED2FREE,0.0,female,2.0,19.0,white,0.0,63.0,5.413348,False,0.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,6081.0,own not recent,one,1.0,one,gt30-lt60,1.0,50.3,8.81,5.413348,False,False,1.0,63.0,2.0,2.0,1.0,-0.703996,-0.711579,-1000.430404,-1000.434738,-1000.427128,-1000.430161,-46.067103,-1014.463761,-1000.282505,-1998.226915,-1999.457505,-1998.226915,-1998.226915,-1998.256315,-1998.256315,-1998.256315,-1998.256315,-1998.256315,-7.636213,-5.524961,-2.618079


In [51]:
%%time
# Write the enriched stacked table as version 2; the row index is left out of the file.
stacked_v2_uri = 's3://beam-core-act/deepDive/CleanData/SanFrancisco/Stacked/sf_2018_stacked_tr_fr_v2.csv'
sf_stacked_tr_fr.to_csv(stacked_v2_uri, index=False)

CPU times: total: 12min 32s
Wall time: 16min 43s


#### Stacked v2

In [None]:
%%time
# Folder holding the stacked transit-frequency files.
stacked_tr_fr_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/Stacked/"
# Read version 2 of the stacked file.
sf_stacked_tr_fr = pd.read_csv(f"{stacked_tr_fr_path}sf_2018_stacked_tr_fr_v2.csv")

In [32]:
# Create 'mandatoryCat': trips that both leave and arrive at a mandatory activity are
# labelled 'from_M_to_M'; every other row starts out as None.
mandatory = ['work' , 'univ', 'school']
ends_mandatory = sf_stacked_tr_fr['actEndType'].isin(mandatory)
starts_mandatory = sf_stacked_tr_fr['actStartType'].isin(mandatory)
sf_stacked_tr_fr['mandatoryCat'] = np.where(ends_mandatory & starts_mandatory, 'from_M_to_M', None)

In [33]:
# Trips that leave Home and arrive at a mandatory activity.
home_to_mandatory = sf_stacked_tr_fr['actEndType'].eq('Home') & sf_stacked_tr_fr['actStartType'].isin(mandatory)
sf_stacked_tr_fr.loc[home_to_mandatory, 'mandatoryCat'] = 'from_H_to_M'

In [34]:
# Trips that leave a mandatory activity and arrive at Home.
mandatory_to_home = sf_stacked_tr_fr['actEndType'].isin(mandatory) & sf_stacked_tr_fr['actStartType'].eq("Home")
sf_stacked_tr_fr.loc[mandatory_to_home, 'mandatoryCat'] = 'from_M_to_H'

In [35]:
# Trips that both leave and arrive at a non-mandatory activity.
non_mandatory = ['othmaint' , 'othdiscr', 'escort', 'eatout', 'social', 'shopping', 'atwork']
both_non_mandatory = (
    sf_stacked_tr_fr['actEndType'].isin(non_mandatory)
    & sf_stacked_tr_fr['actStartType'].isin(non_mandatory)
)
sf_stacked_tr_fr.loc[both_non_mandatory, 'mandatoryCat'] = 'from_N_to_N'

In [36]:
# List the distinct labels present in 'mandatoryCat' (None marks rows that matched no rule).
sf_stacked_tr_fr.mandatoryCat.unique()

array([None, 'from_M_to_H', 'from_H_to_M', 'from_M_to_M', 'from_N_to_N'],
      dtype=object)

In [42]:
# Move 'mandatoryCat' to column position 11.
# pop() runs while the arguments are evaluated, i.e. before insert() executes.
sf_stacked_tr_fr.insert(11, 'mandatoryCat', sf_stacked_tr_fr.pop('mandatoryCat'))

In [48]:
# Remove the per-person tour-count columns that are not used further on.
sf_stacked_tr_fr = sf_stacked_tr_fr.drop(columns=[
    'num_work_tours',
    'num_joint_tours',
    'non_mandatory_tour_frequency',
    'num_non_mand',
    'num_escort_tours',
    'num_eatout_tours',
    'num_shop_tours',
    'num_maint_tours',
    'num_discr_tours',
    'num_social_tours',
    'num_non_escort_tours',
])

In [50]:
# Preview the first rows after the column drop.
sf_stacked_tr_fr.head()

(10156414, 122)

In [None]:
# NOTE(review): this is the same np.where expression that fills 'from_M_to_H' into
# 'mandatoryCat', so 'realizedINEXUS' ends up holding the mandatoryCat labels rather than
# a numeric measure. It looks like an unfinished copy of that cell -- confirm the intended
# realized-INEXUS definition before this column is written to the v3 CSV.
sf_stacked_tr_fr['realizedINEXUS'] = np.where((sf_stacked_tr_fr.actEndType.isin(mandatory)) & (sf_stacked_tr_fr.actStartType == "Home"), 'from_M_to_H' , sf_stacked_tr_fr['mandatoryCat'])

In [44]:
%%time
# Write the stacked table as version 3; the row index is left out of the file.
stacked_v3_uri = 's3://beam-core-act/deepDive/CleanData/SanFrancisco/Stacked/sf_2018_stacked_tr_fr_v3.csv'
sf_stacked_tr_fr.to_csv(stacked_v3_uri, index=False)

CPU times: total: 13min 29s
Wall time: 18min 9s
