In [5]:
import os
import os.path as op
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import seaborn as sns
sns.set_theme(style="darkgrid", color_codes=True)
sns.set(font_scale=1.35, style="ticks") #set styling preferences
import statsmodels.api as sm
from scipy import stats
import math
from math import pi
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from scipy.spatial.distance import cdist
from scipy.cluster.vq import kmeans2,vq, whiten
import geopandas as gpd
import h5py
import boto.s3
import glob
import boto3
from zipfile import ZipFile
import shutil

In [6]:
# Lift pandas' display truncation so every column and every row is rendered.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

In [7]:
# Show floats in plain fixed-point notation instead of scientific notation.
# The full option key is used on purpose: abbreviated keys such as
# 'float_format' are resolved by pattern matching and stop working as soon as
# another option with a similar name exists.
pd.set_option('display.float_format', '{:f}'.format)

### Reading Ridehail Price Scenarios from AWS S3

In [11]:
%%time 
# Function to load data and add scenario info
def load_data_and_add_scenario_info(key, scenario, lever, lever_position,
                                    year=2018, lever_position_fleetsize=1,
                                    bucket="beam-outputs", s3_client=None):
    """Read one gzip-compressed CSV from S3 and tag it with scenario metadata.

    Parameters
    ----------
    key : str
        Object key of the ``.csv.gz`` file inside ``bucket``.
    scenario : str
        Value written to the categorical ``scenario`` column.
    lever : str
        Value written to the categorical ``lever`` column.
    lever_position : int or float
        Value written to the ``lever_position_price`` column.
    year : int, default 2018
        Value written to the ``year`` column.
    lever_position_fleetsize : int or float, default 1
        Value written to the ``lever_position_fleetsize`` column.
    bucket : str, default "beam-outputs"
        Name of the S3 bucket that holds ``key``.
    s3_client : object, optional
        Anything exposing ``get_object(Bucket=..., Key=...)`` that returns a
        mapping with a readable ``'Body'``. When omitted, a client is created
        with ``boto3.client("s3")``. Passing one client lets several loads
        share it.

    Returns
    -------
    pandas.DataFrame
        The file contents, indexed by its ``'Unnamed: 0'`` column, with the
        five metadata columns appended.
    """
    if s3_client is None:
        s3_client = boto3.client("s3")
    obj = s3_client.get_object(Bucket=bucket, Key=key)
    # pandas reads and decompresses straight from the response body.
    data = pd.read_csv(obj['Body'], compression='gzip', index_col='Unnamed: 0')

    # Constant label columns are stored as categoricals.
    data['scenario'] = scenario
    data['scenario'] = data['scenario'].astype("category")
    data['lever'] = lever
    data['lever'] = data['lever'].astype("category")
    data['year'] = year
    data['lever_position_price'] = lever_position
    data['lever_position_fleetsize'] = lever_position_fleetsize

    return data

# Baseline run: the price lever sits at its default position (1).
sfbase = load_data_and_add_scenario_info(
    key="pilates-outputs/sfbay-baseline-2022124/inexus/sfbay_baseline_default-1.0_2020__20221224.csv.gz",
    scenario="baseline",
    lever="default",
    lever_position=1,
)

# Ridehail price runs: one frame per position of the price lever.
sf_rh_price_0 = load_data_and_add_scenario_info(
    key="pilates-outputs/sfbay-rh-price-0.000-20221224/inexus/sfbay_rh_price_0_000_rh_price-1.0_2020__20221224.csv.gz",
    scenario="ridehail",
    lever="price",
    lever_position=0,
)
sf_rh_price_0p125 = load_data_and_add_scenario_info(
    key="pilates-outputs/sfbay-rh-price-0.125-20221224/inexus/sfbay_rh_price_0_125_rh_price-1.0_2020__20221224.csv.gz",
    scenario="ridehail",
    lever="price",
    lever_position=0.125,
)
sf_rh_price_0p25 = load_data_and_add_scenario_info(
    key="pilates-outputs/sfbay-rh-price-0.250-20221224/inexus/sfbay_rh_price_0_250_rh_price-1.0_2020__20221224.csv.gz",
    scenario="ridehail",
    lever="price",
    lever_position=0.25,
)
sf_rh_price_0p5 = load_data_and_add_scenario_info(
    key="pilates-outputs/sfbay-rh-price-0.500-20221224/inexus/sfbay_rh_price_0_50_rh_price-1.0_2020__20230126.csv.gz",
    scenario="ridehail",
    lever="price",
    lever_position=0.5,
)
sf_rh_price_1p75 = load_data_and_add_scenario_info(
    key="pilates-outputs/sfbay-rh-price-1.750-20221224/inexus/sfbay_rh_price_1_750_rh_price-1.0_2020__20221224.csv.gz",
    scenario="ridehail",
    lever="price",
    lever_position=1.75,
)
sf_rh_price_3 = load_data_and_add_scenario_info(
    key="pilates-outputs/sfbay-rh-price-3.000-20221224/inexus/sfbay_rh_price_3_000_rh_price-1.0_2020__20221224.csv.gz",
    scenario="ridehail",
    lever="price",
    lever_position=3,
)
sf_rh_price_5 = load_data_and_add_scenario_info(
    key="pilates-outputs/sfbay-rh-price-5.000-20221224/inexus/sfbay_rh_price_5_000_rh_price-1.0_2020__20221224.csv.gz",
    scenario="ridehail",
    lever="price",
    lever_position=5,
)
sf_rh_price_8 = load_data_and_add_scenario_info(
    key="pilates-outputs/sfbay-rh-price-8.000-20221224/inexus/sfbay_rh_price_8_000_rh_price-1.0_2020__20221224.csv.gz",
    scenario="ridehail",
    lever="price",
    lever_position=8,
)



CPU times: total: 5min 46s
Wall time: 5min 49s


### Stacking the files

In [13]:
# Columns removed from the stacked frame before it is saved.
columns_to_drop = [
    "has_school_kid", "has_preschool_kid", "has_retiree", "has_non_worker",
    "adult", "hispanic", "relate", "pstudent",
    "pemploy", "ptype", "p_hispanic", "age_16_p",
    "age_16_to_19", "person_age", "MAR", "hours",
    "hispanic.1", "student", "worker", "stop_frequency",
    "composition", "tdd", "number_of_participants", "tour_type_count",
    "tour_type_num", "tour_num", "tour_count", "outbound",
    "trip_num", "num_under16_not_at_school", "family", "non_family",
    "num_young_adults", "num_college_age", "num_children_16_to_17", "num_children_5_to_15",
    "num_young_children", "vehicleIds_estimate", "parent_tour_id", "atwork_subtour_frequency",
    "household_id_y", "male", "female", "has_driving_kid",
    "has_full_time", "has_part_time", "has_university", "student_is_employed",
    "nonstudent_to_school", "is_student", "is_gradeschool", "is_highschool",
    "is_university", "school_segment", "is_worker", "distance_to_school",
    "roundtrip_auto_time_to_school", "distance_to_work", "workplace_in_cbd", "work_taz_area_type",
    "hh_race_of_head", "roundtrip_auto_time_to_work", "work_auto_savings_ratio", "cdap_activity",
    "travel_active", "under16_not_at_school", "has_preschool_kid_at_home", "gt2",
    "hispanic_head", "has_school_kid_at_home", "work_and_school_and_worker", "age_of_head",
    "race_of_head", "work_and_school_and_student", "VEHICL", "hh_children",
    "hh_age_of_head", "num_workers", "gt55", "seniors",
    "recent_mover", "hh_workers", "hispanic_status_of_head", "hh_seniors",
    "HHT", "sample_rate", "chunk_id", "income_segment",
    "num_non_workers", "num_adults", "num_children", "hh_work_auto_savings_ratio",
    "num_travel_active", "num_travel_active_adults", "num_travel_active_preschoolers", "num_travel_active_children",
    "num_travel_active_non_preschoolers", "participates_in_jtf_model", "joint_tour_frequency", "num_hh_joint_tours",
    "serialno", "sf_detached", "household_id_x", "destination_logsum_y",
    "logsum_tours_mode_AS_tours", "primary_purpose_y", "PNUM", "work_auto_savings",
    "mandatory_tour_frequency", "num_work_tours", "num_joint_tours", "non_mandatory_tour_frequency",
    "num_non_mand", "num_escort_tours", "num_eatout_tours", "num_shop_tours",
    "num_maint_tours", "num_discr_tours", "num_social_tours", "num_non_escort_tours",
]

# Define the list of dataframes to be concatenated; the baseline frame sits
# between the 0.5 and 1.75 price runs.
frames = [sf_rh_price_0, sf_rh_price_0p125, sf_rh_price_0p25, sf_rh_price_0p5, sfbase, sf_rh_price_1p75, sf_rh_price_3,
          sf_rh_price_5, sf_rh_price_8]

# Combine all dataframes into one and drop the specified columns.
# drop(..., inplace=True) returns None, so assigning its result would replace
# the stacked frame with None; the returning form of drop is used instead.
sf_stacked = pd.concat(frames, ignore_index=True).drop(columns=columns_to_drop)

In [None]:
# Order rows by person, trip, and price-lever position. The loader creates
# 'lever_position_price' (there is no 'lever_position' column among the ones
# it adds), so that is the sort key used here.
# NOTE(review): confirm the input CSVs do not carry their own 'lever_position' column.
sf_stacked = sf_stacked.sort_values(by=['IDMerged', 'tripIndex', 'lever_position_price']).reset_index(drop=True)

In [43]:
# Rows where the price lever is at 1.0 supply the reference duration.
is_baseline_row = sf_stacked['lever_position_price'].eq(1.0)

# Blank out non-baseline durations, then broadcast the first non-null value
# within each (IDMerged, tripIndex) group to every row of that group.
baseline_door_to_door = (
    sf_stacked['duration_door_to_door']
    .where(is_baseline_row)
    .groupby([sf_stacked['IDMerged'], sf_stacked['tripIndex']])
    .transform('first')
    .convert_dtypes()
)

sf_stacked['door_to_door_time_relative_to_baseline'] = sf_stacked['duration_door_to_door'] - baseline_door_to_door

In [17]:
def categorize_mandatory(row):
    """Label a row by the activity types in 'actEndType' and 'actStartType'.

    Activities are split into mandatory (work, univ, school), non-mandatory
    (othmaint, othdiscr, escort, eatout, social, shopping, atwork) and 'Home'.

    Returns
    -------
    str or None
        'from_M_to_M'  actEndType mandatory     and actStartType mandatory
        'from_H_to_M'  actEndType 'Home'        and actStartType mandatory
        'from_M_to_H'  actEndType mandatory     and actStartType 'Home'
        'from_N_to_N'  actEndType non-mandatory and actStartType non-mandatory
        None           every other combination
    """
    mandatory = ('work', 'univ', 'school')
    non_mandatory = ('othmaint', 'othdiscr', 'escort', 'eatout', 'social', 'shopping', 'atwork')

    ended = row['actEndType']

    if ended in mandatory:
        started = row['actStartType']
        if started in mandatory:
            return 'from_M_to_M'
        if started == 'Home':
            return 'from_M_to_H'
        return None

    if ended == 'Home':
        return 'from_H_to_M' if row['actStartType'] in mandatory else None

    if ended in non_mandatory:
        return 'from_N_to_N' if row['actStartType'] in non_mandatory else None

    return None

# Apply the categorization function to create 'mandatoryCat' column.
# categorize_mandatory only reads 'actEndType' and 'actStartType', so only
# those two columns are handed to the row-wise apply; the labels are the same
# but pandas no longer builds a full-width row object for every trip.
sf_stacked['mandatoryCat'] = sf_stacked[['actEndType', 'actStartType']].apply(categorize_mandatory, axis=1)

In [64]:
# Keep only the rows that carry ActivitySim information; rows whose trip id
# did not match have no 'income' value.
has_activitysim_info = sf_stacked['income'].notna()
sf_stacked = sf_stacked.loc[has_activitysim_info]

In [65]:
%%time
# Write the stacked frame to S3 as CSV, without the row index.
stacked_output_uri = 's3://beam-core-act/deepDive/CleanData/SanFrancisco/Stacked/sf_2018_stacked_rh_price_5_5_23.csv'
sf_stacked.to_csv(stacked_output_uri, index=False)

CPU times: total: 44min 28s
Wall time: 3h 14min 20s
