In [88]:
import os
import os.path as op
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import seaborn as sns
sns.set_theme(style="darkgrid", color_codes=True)
sns.set(font_scale=1.35, style="ticks") #set styling preferences
import statsmodels.api as sm
from scipy import stats
import math
from math import pi
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from scipy.spatial.distance import cdist
from scipy.cluster.vq import kmeans2,vq, whiten
import geopandas as gpd
import h5py
import boto.s3
import glob    
import boto3
from zipfile import ZipFile
import shutil

In [89]:
# Show all columns and rows
pd.options.display.max_columns = None
pd.options.display.max_rows = None

#### Baseline

In [3]:
%%time  
sf_rh_base_0p01 = pd.read_csv('gs://beam-core-outputs/wheelchair/may2023/sfbay-10pop-1fleet/inexus/sfbay_baseline_default-1.0_2019__20230529.csv.gz', compression = 'gzip')



CPU times: total: 11min 48s
Wall time: 13min 56s


In [4]:
sf_rh_base_0p01.shape

(12433113, 275)

In [5]:
# Adding scenario info
sf_rh_base_0p01['scenario'] = "baseline"
sf_rh_base_0p01['scenario'] = sf_rh_base_0p01['scenario'].astype("category")
sf_rh_base_0p01['lever'] = "WAV"
sf_rh_base_0p01['lever'] = sf_rh_base_0p01['lever'].astype("category")
sf_rh_base_0p01['year'] = 2018
sf_rh_base_0p01['lever_position_wav'] = 0.01

In [6]:
#baseline2018 = "https://beam-core-act.s3.amazonaws.com/beam-outputs/pilates-outputs/sfbay-baseline-20221220/inexus"
#sfbase = pd.read_csv(baseline2018 + 'sfbay_baseline_default-1.0_2012__20221219.csv.gz')

In [7]:
# Showing the entire number in dataframe
pd.set_option('float_format', '{:f}'.format)

In [8]:
#describe = sfbase.describe()

In [9]:
#describe.to_csv('C:/Shared-Work/Data/CleanData/ASIM_BEAM_Merged/describe.csv')

#### Ridehail Price Scenarios

In [10]:
%%time  
sf_wav_5 = pd.read_csv('gs://beam-core-outputs/wheelchair/may2023/sfbay-10pop-5fleet/inexus/sfbay_baseline_default-1.0_2019__20230513.csv.gz', compression = 'gzip')



CPU times: total: 40min 9s
Wall time: 42min 22s


In [11]:
# Adding scenario info
sf_wav_5['scenario'] = "ridehail"
sf_wav_5['scenario'] = sf_wav_5['scenario'].astype("category")
sf_wav_5['lever'] = "WAV"
sf_wav_5['lever'] = sf_wav_5['lever'].astype("category")
sf_wav_5['year'] = 2018
sf_wav_5['lever_position_wav'] = 0.05

In [12]:
%%time
sf_wav_10 = pd.read_csv('gs://beam-core-outputs/wheelchair/may2023/sfbay-10pop-10fleet/inexus/sfbay_baseline_default-1.0_2019__20230521.csv.gz', compression = 'gzip')



CPU times: total: 1h 5min 19s
Wall time: 1h 7min 33s


In [13]:
# Adding scenario info
sf_wav_10['scenario'] = "ridehail"
sf_wav_10['scenario'] = sf_wav_10['scenario'].astype("category")
sf_wav_10['lever'] = "WAV"
sf_wav_10['lever'] = sf_wav_10['lever'].astype("category")
sf_wav_10['year'] = 2018
sf_wav_10['lever_position_wav'] = 0.1

In [14]:
%%time 
sf_wav_20 = pd.read_csv('gs://beam-core-outputs/wheelchair/may2023/sfbay-10pop-20fleet/inexus/sfbay_baseline_default-1.0_2019__20230521.csv.gz', compression = 'gzip')



CPU times: total: 1h 19min 19s
Wall time: 1h 21min 40s


In [15]:
# Adding scenario info
sf_wav_20['scenario'] = "ridehail"
sf_wav_20['scenario'] = sf_wav_10['scenario'].astype("category")
sf_wav_20['lever'] = "WAV"
sf_wav_20['lever'] = sf_wav_10['lever'].astype("category")
sf_wav_20['year'] = 2018
sf_wav_20['lever_position_wav'] = 0.2

In [16]:
%%time 
sf_wav_50 = pd.read_csv('gs://beam-core-outputs/wheelchair/may2023/sfbay-10pop-50fleet/inexus/sfbay_baseline_default-1.0_2018__20230527.csv.gz', compression = 'gzip')

CPU times: total: 1h 26min 35s
Wall time: 1h 28min 56s


In [17]:
# Adding scenario info
sf_wav_50['scenario'] = "ridehail"
sf_wav_50['scenario'] = sf_wav_50['scenario'].astype("category")
sf_wav_50['lever'] = "WAV"
sf_wav_50['lever'] = sf_wav_50['lever'].astype("category")
sf_wav_50['year'] = 2018
sf_wav_50['lever_position_wav'] = 0.5

In [18]:
%%time  
sf_wav_100 = pd.read_csv('gs://beam-core-outputs/wheelchair/may2023/sfbay-10pop-100fleet/inexus/sfbay_baseline_default-1.0_2019__20230501.csv.gz', compression = 'gzip')



CPU times: total: 1h 46min 29s
Wall time: 1h 47min 56s


In [19]:
# Adding scenario info
sf_wav_100['scenario'] = "ridehail"
sf_wav_100['scenario'] = sf_wav_100['scenario'].astype("category")
sf_wav_100['lever'] = "WAV"
sf_wav_100['lever'] = sf_wav_100['lever'].astype("category")
sf_wav_100['year'] = 2018
sf_wav_100['lever_position_wav'] = 1

In [20]:
#sf_rh_flsz_175 = sf_rh_flsz_175.drop(sf_rh_flsz_175.columns[233:235], axis=1)

In [21]:
frames = [sf_rh_base_0p01, sf_wav_5, sf_wav_10, sf_wav_20, sf_wav_50, sf_wav_100]
sf_stacked = pd.concat(frames)

In [22]:
sf_stacked = sf_stacked.drop(['has_school_kid','has_preschool_kid','has_retiree','has_non_worker','adult','hispanic','relate',
                             'pstudent','pemploy','ptype','p_hispanic','age_16_p','age_16_to_19',
                             'person_age','MAR','hours','hispanic.1','student',
                             'worker','stop_frequency','composition','tdd','number_of_participants','tour_type_count',
                             'tour_type_num','tour_num','tour_count', 'outbound', 'trip_num','num_under16_not_at_school',
                             'family','non_family','num_young_adults','num_college_age','num_children_16_to_17',
                              'num_children_5_to_15','num_young_children'], axis=1)

In [23]:
sf_stacked = sf_stacked.drop(['vehicleIds_estimate','parent_tour_id','atwork_subtour_frequency','household_id_y',
                              'male','female', 'has_driving_kid','has_full_time','has_part_time',
                              'has_university','student_is_employed', 'nonstudent_to_school', 'is_student','is_gradeschool',
                              'is_gradeschool','is_highschool','is_university','school_segment','is_worker',  
                              'distance_to_school','roundtrip_auto_time_to_school',
                              'distance_to_work','workplace_in_cbd', 'work_taz_area_type', 'hh_race_of_head',
                             'roundtrip_auto_time_to_work','work_auto_savings_ratio','cdap_activity', 'travel_active', 
                              'under16_not_at_school','has_preschool_kid_at_home', 'gt2','hispanic_head',
                             'has_school_kid_at_home','work_and_school_and_worker', 'age_of_head', 'race_of_head',
                              'work_and_school_and_student', 'VEHICL','hh_children','hh_age_of_head','num_workers',
                              'gt55','seniors', 'recent_mover', 'hh_workers', 'hispanic_status_of_head', 'hh_seniors', 
                             'hh_type', 'HHT', 'sample_rate', 'chunk_id', 'income_segment', 'num_non_workers', 
                             'num_drivers', 'num_adults', 'num_children', 'hh_work_auto_savings_ratio', 'num_travel_active', 
                             'num_travel_active_adults','num_travel_active_preschoolers', 'num_travel_active_children', 
                              'num_travel_active_non_preschoolers', 'participates_in_jtf_model', 'joint_tour_frequency',
                              'num_hh_joint_tours'], axis=1)

In [24]:
sf_stacked = sf_stacked.drop(['serialno'], axis=1)

#### Stacked File

In [25]:
#stacked_tr_fr_path = "https://beam-core-act.s3.amazonaws.com/deepDive/CleanData/SanFrancisco/Stacked/"
#sf_stacked_tr_fr = pd.read_csv(stacked_tr_fr_path + 'sf_2018_stacked_tr_fr.csv')

In [26]:
sf_stacked = sf_stacked.sort_values(by=['IDMerged', 'tripIndex', 'lever_position_wav']).reset_index(drop=True)

In [27]:
#sf_stacked['lever_position_price'] = 1

In [30]:
%%time
sf_stacked['logsum_relative_to_baseline'] = sf_stacked['logsum_trip_Potential_INEXUS'] - (sf_stacked['logsum_trip_Potential_INEXUS'].where(sf_stacked['lever_position_wav'].eq(0.01))
                                                                                                  .groupby([sf_stacked['IDMerged'],sf_stacked['tripIndex']])
                                                                                                   .transform('first')
                                                                                                   .convert_dtypes())

CPU times: total: 19.7 s
Wall time: 19.2 s


In [32]:
sf_stacked['door_to_door_time_relative_to_baseline'] = sf_stacked['duration_door_to_door'] - (sf_stacked['duration_door_to_door'].where(sf_stacked['lever_position_wav'].eq(0.01))
                                                                                                  .groupby([sf_stacked['IDMerged'],sf_stacked['tripIndex']])
                                                                                                   .transform('first')
                                                                                                   .convert_dtypes())

In [33]:
# Drop unused columns
sf_stacked = sf_stacked.drop(['sf_detached', 'household_id_x','destination_logsum_y','logsum_tours_mode_AS_tours',
                              'primary_purpose_y','PNUM','work_auto_savings','mandatory_tour_frequency'], axis=1)

In [35]:
sf_stacked.lever_position_wav.unique()

array([0.01, 0.05, 0.1 , 0.2 , 0.5 , 1.  ])

In [36]:
sf_stacked[sf_stacked['replanning_status'] > 0].shape

(1719592, 170)

In [37]:
# Add the mandatory category column
mandatory = ['work' , 'univ', 'school']
sf_stacked['mandatoryCat'] = np.where((sf_stacked.actEndType.isin(mandatory)) & (sf_stacked.actStartType.isin(mandatory)), 'from_M_to_M' , None)

In [38]:
sf_stacked['mandatoryCat'] = np.where((sf_stacked.actEndType == 'Home') & (sf_stacked.actStartType.isin(mandatory)), 'from_H_to_M' , sf_stacked['mandatoryCat'])

In [39]:
sf_stacked['mandatoryCat'] = np.where((sf_stacked.actEndType.isin(mandatory)) & (sf_stacked.actStartType == "Home"), 'from_M_to_H' , sf_stacked['mandatoryCat'])

In [40]:
non_mandatory = ['othmaint' , 'othdiscr', 'escort', 'eatout', 'social', 'shopping', 'atwork']
sf_stacked['mandatoryCat'] = np.where((sf_stacked.actEndType.isin(non_mandatory)) & (sf_stacked.actStartType.isin(non_mandatory)), 'from_N_to_N' , sf_stacked['mandatoryCat'])

In [41]:
# Drop unused columns
sf_stacked = sf_stacked.drop(['num_work_tours','num_joint_tours', 'non_mandatory_tour_frequency',
                                          'num_non_mand','num_escort_tours','num_eatout_tours','num_shop_tours',
                                          'num_maint_tours','num_discr_tours','num_social_tours','num_non_escort_tours'], axis=1)

In [42]:
#sf_stacked[sf_stacked['Realized_INEXUS'].isna()].head(10000).to_csv('C:/Shared-Work/Data/CleanData/ASIM_BEAM_Merged/realizedIN_empty.csv', index = False)

In [43]:
#%%time
#sf_stacked_tr_fr.to_csv('s3://beam-core-act/deepDive/CleanData/SanFrancisco/Stacked/sf_2018_stacked_tr_fr_v3.csv', index=False)

In [49]:
sf_stacked = sf_stacked.drop(['Unnamed: 0.1','Unnamed: 0'], axis=1)

In [47]:
sf_stacked[sf_stacked['SHARED3FREE'].isna()].shape

(640569, 160)

In [None]:
sf_stacked.shape

In [51]:
sf_stacked = sf_stacked[sf_stacked['income'].notna()]

In [None]:
# add a lower and upper range for the bins in pd.cut
#quartiles = [0] + quartiles + [float('inf')]

In [None]:
# add a quartiles column to groups, using pd.cut
#sf_stacked['income_quartiles'] = pd.cut(sf_stacked['income'], bins=quartiles, labels=['1stQ', '2ndQ', '3rdQ', '4thQ'])

In [52]:
sf_stacked['socialCarbonCost'] = sf_stacked['emission_marginal']*185 # 51 or 102

In [None]:
# Drop unused columns
sf_stacked = sf_stacked.drop(['home_is_urban', 'home_is_rural','median_value_of_time','tenure',
                              'tenure_mover','emissionFood','emissionElectricity','emissionDiesel',
                             'emissionGasoline','emissionBiodiesel','fuelFood','fuelElectricity',
                             'fuelBiodiesel','fuel_not_Food','fuelDiesel','transit_rail','transit_bus',
                             'ride_hail_pooled','transit_cable_car','transit_tram','transit_subway',
                             'parkingType','transit_cable_car','transit_tram','transit_subway'], axis=1)

In [None]:
# Drop unused columns
sf_stacked = sf_stacked.drop(['trip_count', 'mode_choice_actual_5','distance_mode_choice', 'person_sex',
                              'income_in_thousands', 'TAZ_y', 'hh_income', 'socialCarbonCost',
                              'income_quartiles','num_mand','free_parking_at_work','workplace_location_logsum',
                             'workplace_taz','school_taz','home_taz','home_y', 'home_x', 
                             'TAZ_x','work_zone_id','work_at_home','race','race_id', 'school_zone_id',
                             'edu','sex','earning','tour_mode_AS_tours', 'destination_x', 'origin_x', 
                             'primary_purpose_x','tour_id','household_id','person_id', 'trip_id', 'fuelGasoline'], axis=1)

In [61]:
# Drop unused columns
sf_stacked = sf_stacked.drop(['fuelGasoline', 'duration_in_transit', 'distance_walking', 'distance_bike',
                             'distance_ridehail', 'distance_bike', 'distance_privateCar', 'distance_transit', 
                              'duration_in_ridehail', 'duration_on_bike', 'duration_in_privateCar'], axis=1)

In [64]:
%%time
#sf_stacked.to_csv('s3://beam-core-act/deepDive/CleanData/SanFrancisco/Stacked/sf_2018_stacked_rh_WAV.csv', index=False)

CPU times: total: 0 ns
Wall time: 0 ns


In [84]:
import gcsfs
from google.cloud import storage
# Authenticate the GCS client
client = storage.Client()

In [85]:
# Specify the GCS bucket and file path
bucket_name = 'beam-core-analysis'
file_path = 'sf_2018_stacked_rh_WAV_20230603.csv'

In [80]:
# Convert the DataFrame to a CSV string
csv_data = sf_stacked.to_csv(index=False)

In [87]:
# Upload the CSV data to GCS
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(file_path)
blob.upload_from_string(csv_data, content_type='text/csv')

MemoryError: 

In [None]:
%%time
sf_stacked.to_csv('gs://beam-core-analysis/stacked/sf_2018_stacked_rh_WAV_20230603.csv', index=False)