In [35]:
import pandas as pd
!pip install dbfread
from dbfread import DBF



In [55]:
# Build the random-forest (RF) intersection inputs: street attributes taken from
# the intersection DBF exports are joined onto Nick's RF input tables through MEID.

# Recode of the HPMS_NAT_F string codes '7'..'1' onto 0..6
# (this variable is not readily available in the EPA smart data).
_HPMS_CLASS_RECODE = {'7': 0, '6': 1, '5': 2, '4': 3, '3': 4, '2': 5, '1': 6}

# DBF field name -> street-attribute column name, kept consistent across inputs
_STREET_ATTRIBUTE_NAMES = {
    'SHOULDER_m': 'Shoulder',
    'MDN_CODE_m': 'Median_type',
    'Nub_of_Lan': 'Number_of_lanes',
    'RD_CLASS_RE': 'National_functional_classification',
    'ONE_WAY_ma': 'One_way',
    'SPEEDLIM_m': 'Speed_limit',
}

# Join key plus the street attributes that get appended to Nick's tables
columns_to_merge = ['MEID', 'Shoulder', 'Median_type', 'Number_of_lanes',
                    'National_functional_classification', 'One_way', 'Speed_limit']


def _read_intersection_streets(dbf_path):
    """Load one intersections DBF, add the recoded road class, and rename the street fields."""
    streets = pd.DataFrame(iter(DBF(dbf_path)))
    streets['RD_CLASS_RE'] = streets['HPMS_NAT_F'].map(_HPMS_CLASS_RECODE)
    return streets.rename(columns=_STREET_ATTRIBUTE_NAMES)


# Street attributes per mode
df_b = _read_intersection_streets('Intersections_V1.1_Bike.dbf')
df_p = _read_intersection_streets('Intersections_V1.1_Ped.dbf')

# Nick's RF inputs (with MEID added); these keep the full EPA variable names
df_b2 = pd.read_csv("Intersections_Bike_Nick.csv", encoding='unicode_escape')
df_p2 = pd.read_csv("Intersections_Ped_Nick.csv", encoding='unicode_escape')

# Join on MEID with the default merge behaviour, street attributes first
df_b3 = df_b[columns_to_merge].merge(df_b2, on='MEID')
df_p3 = df_p[columns_to_merge].merge(df_p2, on='MEID')

# Column names must not contain spaces: swap them for underscores
df_b3.columns = df_b3.columns.str.replace(' ', '_')
df_p3.columns = df_p3.columns.str.replace(' ', '_')

# Write the final model input CSVs
df_b3.to_csv('RF_Intersections_bike_final.csv', index=False)
df_p3.to_csv('RF_Intersections_ped_final.csv', index=False)


In [56]:
# Drop incomplete rows from the RF intersection inputs and report how many remain.
#
# These CSVs are produced by DataFrame.to_csv, which writes UTF-8 by default, so
# they are read back as UTF-8. Decoding them with 'unicode_escape' would treat the
# bytes as Latin-1 and interpret backslash sequences, corrupting any non-ASCII
# text or literal backslash on the round trip.
df_b4 = pd.read_csv("RF_Intersections_bike_final.csv", encoding='utf-8')
df_p4 = pd.read_csv("RF_Intersections_ped_final.csv", encoding='utf-8')

# Keep only rows without any NaN value
df_b5 = df_b4.dropna()
df_p5 = df_p4.dropna()

# Row counts before and after the drop
num_rows_bike = len(df_b4)
num_rows_bike_left = len(df_b5)
num_rows_ped = len(df_p4)
num_rows_ped_left = len(df_p5)

# Save the cleaned frames as the final model input CSVs.
# NOTE: this overwrites the files read above, so re-running this cell on its own
# reports identical "before" and "after" counts.
df_b5.to_csv('RF_Intersections_bike_final.csv', index=False)
df_p5.to_csv('RF_Intersections_ped_final.csv', index=False)

print(f"Number of bike rows before removing NaN values: {num_rows_bike}")
print(f"Number of bike rows left after removing NaN values: {num_rows_bike_left}")
print(f"Number of ped rows before removing NaN values: {num_rows_ped}")
print(f"Number of ped rows left after removing NaN values: {num_rows_ped_left}")

Number of bike rows before removing NaN values: 3559
Number of bike rows left after removing NaN values: 3300
Number of ped rows before removing NaN values: 3348
Number of ped rows left after removing NaN values: 3112
Number of veh rows before removing NaN values: 2263
Number of veh rows left after removing NaN values: 2133


In [57]:
# Load the bike and ped link networks, derive the variables that are not readily
# available in the EPA smart data, and rename the fields to the model's names.

# Recode tables for the categorical street attributes
_MEDIAN_TYPE_RECODE = {'NONE': 0, 'PAINTED': 1, 'PAVED - OPEN': 2,
                       'DEPRESSED - OPEN': 3, 'BARRIER': 4, 'RAISED': 5}
_HPMS_CLASS_RECODE = {'7': 0, '6': 1, '5': 2, '4': 3, '3': 4, '2': 5, '1': 6}

# DBF field name -> column name expected by the model
_LINK_FIELD_NAMES = {
    'StL_Averag': 'Average_Daily_Zone_Traffic_(StL_Volume)',
    'Nub_of_Lan': 'Number_of_lanes',
    'D3A': 'Total_road_network_density',
    'Ac_Unpr': 'Total_land_area_(acres)_that_is_not_protected_from_development_(i.e.,_not_a_park,_natural_area_or_conservation_area)',
    'Ac_Land': 'Total_land_area_(acres)',
    'D1C8_IND': 'Gross_industrial_(8-tier)_employment_density_(jobs/acre)_on_unprotected_land',
    'Pct_AO2p': 'Percent_of_two-plus-car_households_in_CBG,_2018',
    'D1A': 'Gross_residential_density_(HU/acre)_on_unprotected_land',
    'CountHU': 'Housing_units,_2018',
    'D1C5_IND': 'Gross_industrial_(5-tier)_employment_density_(jobs/acre)_on_unprotected_land',
    'D2A_EPHHM': 'Employment_and_household_entropy',
    'Pct_AO1': 'Percent_of_one-car_households_in_CBG,_2018',
    'D2B_E8MIXA': '8-tier_employment_entropy_(denominator_set_to_the_static_8_employment_types_in_the_CBG)',
    'TotPop': 'Population,_2018',
    'HH': 'Households_(occupied_housing_units),_2018',
    'Pct_AO0': 'Percent_of_zero-car_households_in_CBG,_2018',
    'TotEmp': 'Total_employment,_2017',
    'D5AR': 'Jobs_within_45_minutes_auto_travel_time,_time-_decay_(network_travel_time)_weighted',
    'D3BAO': 'Intersection_density_in_terms_of_auto-oriented_intersections_per_square_mile',
}


def _load_link_network(dbf_path):
    """Read a link-network DBF and return it with derived variables and model column names."""
    links = pd.DataFrame(iter(DBF(dbf_path)))

    # Wage shares: each wage band as a percentage of the three bands summed
    wage_workers = links['E_MedWageW'] + links['E_LowWageW'] + links['E_HiWageWk']
    links['Percent_medium_wage_workers_(workplace)'] = 100 * links['E_MedWageW'] / wage_workers
    links['Percent_of_high_wage_workers_(workplace)'] = 100 * links['E_HiWageWk'] / wage_workers

    # Street attributes: 0/1 flags and integer recodes of the categorical fields
    links['Shoulder'] = links['TOT_SHLDR_'].gt(0).astype(int)
    links['Median_type'] = links['MDN_TYP_DE'].map(_MEDIAN_TYPE_RECODE)
    links['National_functional_classification'] = links['HPMS_NAT_F'].map(_HPMS_CLASS_RECODE)
    links['One_way'] = links['FCLTY_TYP_'].eq(1).astype(int)

    return links.rename(columns=_LINK_FIELD_NAMES)


df_bike = _load_link_network('Bik_Network_StL_V3.dbf')
df_ped = _load_link_network('Ped_Network_StL_V2.dbf')

# Write the final model input CSVs
df_bike.to_csv('Links_bike_final.csv', index=False)
df_ped.to_csv('Links_ped_final.csv', index=False)


In [58]:
# Reduce the link frames to the columns the model uses (plus TARGET_FID), fill the
# blanks with zero, and report the row counts around the NaN drop.
_LINK_MODEL_COLUMNS = [
    'TARGET_FID',
    'Average_Daily_Zone_Traffic_(StL_Volume)',
    'Number_of_lanes',
    'Total_road_network_density',
    'Total_land_area_(acres)_that_is_not_protected_from_development_(i.e.,_not_a_park,_natural_area_or_conservation_area)',
    'Total_land_area_(acres)',
    'Gross_industrial_(8-tier)_employment_density_(jobs/acre)_on_unprotected_land',
    'Percent_of_two-plus-car_households_in_CBG,_2018',
    'Gross_residential_density_(HU/acre)_on_unprotected_land',
    'Housing_units,_2018',
    'Gross_industrial_(5-tier)_employment_density_(jobs/acre)_on_unprotected_land',
    'Employment_and_household_entropy',
    'Percent_of_one-car_households_in_CBG,_2018',
    '8-tier_employment_entropy_(denominator_set_to_the_static_8_employment_types_in_the_CBG)',
    'Population,_2018',
    'Households_(occupied_housing_units),_2018',
    'Percent_of_zero-car_households_in_CBG,_2018',
    'Total_employment,_2017',
    'Jobs_within_45_minutes_auto_travel_time,_time-_decay_(network_travel_time)_weighted',
    'Intersection_density_in_terms_of_auto-oriented_intersections_per_square_mile',
    'Shoulder',
    'Median_type',
    'National_functional_classification',
    'One_way',
    'Percent_medium_wage_workers_(workplace)',
    'Percent_of_high_wage_workers_(workplace)',
]

df_bike2 = df_bike.loc[:, _LINK_MODEL_COLUMNS]
df_ped2 = df_ped.loc[:, _LINK_MODEL_COLUMNS]

# Replace blanks with zero.
# NOTE(review): the fill covers every selected column, not only the street
# attributes, so no NaN is left for the dropna below to remove - confirm this
# is the intended treatment of the non-street variables.
for _links in (df_bike2, df_ped2):
    _links.fillna(0, inplace=True)

# Drop any rows that still hold NaN values
df_bike3 = df_bike2.dropna()
df_ped3 = df_ped2.dropna()

# Write the final model input CSVs (the zero-filled frames)
df_bike2.to_csv('Links_bike_final.csv', index=False)
df_ped2.to_csv('Links_ped_final.csv', index=False)

# Row counts before and after the drop
num_rows_bike, num_rows_bike_left = len(df_bike2), len(df_bike3)
num_rows_ped, num_rows_ped_left = len(df_ped2), len(df_ped3)

print(f"Number of bike rows before removing NaN values: {num_rows_bike}")
print(f"Number of bike rows left after removing NaN values: {num_rows_bike_left}")
print(f"Number of ped rows before removing NaN values: {num_rows_ped}")
print(f"Number of ped rows left after removing NaN values: {num_rows_ped_left}")

Number of bike rows before removing NaN values: 66949
Number of bike rows left after removing NaN values: 66949
Number of ped rows before removing NaN values: 68994
Number of ped rows left after removing NaN values: 68994
Number of veh rows before removing NaN values: 50432
Number of veh rows left after removing NaN values: 50432


In [66]:
# Load the intersections used for prediction, derive the variables that are not
# readily available in the EPA smart data, and rename the fields to the model's names.

# DBF field name -> model column name, shared by both modes. The traffic-volume
# field differs per mode and is supplied separately.
_INTERSECTION_FIELD_NAMES = {
    'RF_Number_': 'Number_of_lanes',
    'D3A': 'Total_road_network_density',
    'Ac_Unpr': 'Total_land_area_(acres)_that_is_not_protected_from_development_(i.e.,_not_a_park,_natural_area_or_conservation_area)',
    'Ac_Land': 'Total_land_area_(acres)',
    'D1C8_IND': 'Gross_industrial_(8-tier)_employment_density_(jobs/acre)_on_unprotected_land',
    'Pct_AO2p': 'Percent_of_two-plus-car_households_in_CBG,_2018',
    'D1A': 'Gross_residential_density_(HU/acre)_on_unprotected_land',
    'CountHU': 'Housing_units,_2018',
    'D1C5_IND': 'Gross_industrial_(5-tier)_employment_density_(jobs/acre)_on_unprotected_land',
    'D2A_EPHHM': 'Employment_and_household_entropy',
    'Pct_AO1': 'Percent_of_one-car_households_in_CBG,_2018',
    'D2B_E8MIXA': '8-tier_employment_entropy_(denominator_set_to_the_static_8_employment_types_in_the_CBG)',
    'TotPop': 'Population,_2018',
    'HH': 'Households_(occupied_housing_units),_2018',
    'Pct_AO0': 'Percent_of_zero-car_households_in_CBG,_2018',
    'TotEmp': 'Total_employment,_2017',
    'D5AR': 'Jobs_within_45_minutes_auto_travel_time,_time-_decay_(network_travel_time)_weighted',
    'D3BAO': 'Intersection_density_in_terms_of_auto-oriented_intersections_per_square_mile',
    'RF_Speed_l': 'Speed_limit',
    'RF_One_way': 'One_way',
    'RF_Shoulde': 'Shoulder',
    'RF_Median_': 'Median_type',
    'RF_Nationa': 'National_functional_classification',
}


def _load_prediction_intersections(dbf_path, volume_field):
    """Read an intersections DBF, add the wage-share columns, and apply the model column names.

    `volume_field` is the DBF field that is renamed to
    'Average_Daily_Zone_Traffic_(StL_Volume)'.
    """
    nodes = pd.DataFrame(iter(DBF(dbf_path)))

    # Wage shares: each wage band as a percentage of the three bands summed
    wage_workers = nodes['E_MedWageW'] + nodes['E_LowWageW'] + nodes['E_HiWageWk']
    nodes['Percent_medium_wage_workers_(workplace)'] = 100 * nodes['E_MedWageW'] / wage_workers
    nodes['Percent_of_high_wage_workers_(workplace)'] = 100 * nodes['E_HiWageWk'] / wage_workers

    field_names = {volume_field: 'Average_Daily_Zone_Traffic_(StL_Volume)',
                   **_INTERSECTION_FIELD_NAMES}
    return nodes.rename(columns=field_names)


df_int_bike = _load_prediction_intersections(
    'Intersections_Input_V2.2_RD_EPA_Filtered_Bike.dbf', 'AADT_Bike')
df_int_ped = _load_prediction_intersections(
    'Intersections_Input_V2.2_RD_EPA_Filtered_Ped.dbf', 'AADT_Ped')

# Write the final model input CSVs
df_int_bike.to_csv('Intersections_bike_final.csv', index=False)
df_int_ped.to_csv('Intersections_ped_final.csv', index=False)

In [70]:
# Reduce the prediction intersection frames to the columns the model uses (plus
# MEID), drop incomplete rows, and report how many rows remain.
_INTERSECTION_MODEL_COLUMNS = [
    'MEID',
    'Average_Daily_Zone_Traffic_(StL_Volume)',
    'Number_of_lanes',
    'Speed_limit',
    'Total_road_network_density',
    'Total_land_area_(acres)_that_is_not_protected_from_development_(i.e.,_not_a_park,_natural_area_or_conservation_area)',
    'Total_land_area_(acres)',
    'Gross_industrial_(8-tier)_employment_density_(jobs/acre)_on_unprotected_land',
    'Percent_of_two-plus-car_households_in_CBG,_2018',
    'Gross_residential_density_(HU/acre)_on_unprotected_land',
    'Housing_units,_2018',
    'Gross_industrial_(5-tier)_employment_density_(jobs/acre)_on_unprotected_land',
    'Employment_and_household_entropy',
    'Percent_of_one-car_households_in_CBG,_2018',
    '8-tier_employment_entropy_(denominator_set_to_the_static_8_employment_types_in_the_CBG)',
    'Population,_2018',
    'Households_(occupied_housing_units),_2018',
    'Percent_of_zero-car_households_in_CBG,_2018',
    'Total_employment,_2017',
    'Jobs_within_45_minutes_auto_travel_time,_time-_decay_(network_travel_time)_weighted',
    'Intersection_density_in_terms_of_auto-oriented_intersections_per_square_mile',
    'Shoulder',
    'Median_type',
    'National_functional_classification',
    'One_way',
    'Percent_medium_wage_workers_(workplace)',
    'Percent_of_high_wage_workers_(workplace)',
]

df_int_bike2 = df_int_bike.loc[:, _INTERSECTION_MODEL_COLUMNS]
df_int_ped2 = df_int_ped.loc[:, _INTERSECTION_MODEL_COLUMNS]

# Keep only rows without any NaN value
df_int_bike3 = df_int_bike2.dropna()
df_int_ped3 = df_int_ped2.dropna()

# Write the cleaned frames as the final model input CSVs
df_int_bike3.to_csv('Intersections_bike_final.csv', index=False)
df_int_ped3.to_csv('Intersections_ped_final.csv', index=False)

# Row counts before and after the drop
num_rows_bike, num_rows_bike_left = len(df_int_bike2), len(df_int_bike3)
num_rows_ped, num_rows_ped_left = len(df_int_ped2), len(df_int_ped3)

print(f"Number of bike rows before removing NaN values: {num_rows_bike}")
print(f"Number of bike rows left after removing NaN values: {num_rows_bike_left}")
print(f"Number of ped rows before removing NaN values: {num_rows_ped}")
print(f"Number of ped rows left after removing NaN values: {num_rows_ped_left}")

Number of bike rows before removing NaN values: 8760
Number of bike rows left after removing NaN values: 8579
Number of ped rows before removing NaN values: 8760
Number of ped rows left after removing NaN values: 8574
Number of veh rows before removing NaN values: 8760
Number of veh rows left after removing NaN values: 8566
