In [19]:
# Data preparation

import pandas as pd

#Loading data
Bik_NE = pd.read_csv('Bik_Not_Entering.CSV')
Ped_NE = pd.read_csv('Ped_Not_Entering.CSV')
Veh_NE = pd.read_csv('Veh_Not_Entering.CSV')
Bik_df_Cat = pd.read_csv('Predicted_bike_volumes_Cat.CSV')
Ped_df_Cat = pd.read_csv('Predicted_ped_volumes_Cat.CSV')
Veh_df_Cat = pd.read_csv('Predicted_vehicle_volumes_Cat.CSV')
Bik_df_Cont = pd.read_csv('Predicted_bike_volumes_Cont.CSV')
Ped_df_Cont = pd.read_csv('Predicted_ped_volumes_Cont.CSV')
Veh_df_Cont = pd.read_csv('Predicted_vehicle_volumes_Cont.CSV')

Bik_Cat = pd.merge(Bik_df_Cat, Bik_NE, how='left', on='TARGET_FID')
Bik_Cont = pd.merge(Bik_df_Cont, Bik_NE, how='left', on='TARGET_FID')
Ped_Cat = pd.merge(Ped_df_Cat, Ped_NE, how='left', on='TARGET_FID')
Ped_Cont = pd.merge(Ped_df_Cont, Ped_NE, how='left', on='TARGET_FID')
Veh_Cat = pd.merge(Veh_df_Cat, Veh_NE, how='left', on='TARGET_FID')
Veh_Cont = pd.merge(Veh_df_Cont, Veh_NE, how='left', on='TARGET_FID')

# Filling blanks with INTERSECT = 'Y'
Bik_Cat['INTERSECT'] = Bik_Cat['INTERSECT'].fillna('Y')
Bik_Cont['INTERSECT'] = Bik_Cont['INTERSECT'].fillna('Y')
Ped_Cat['INTERSECT'] = Ped_Cat['INTERSECT'].fillna('Y')
Ped_Cont['INTERSECT'] = Ped_Cont['INTERSECT'].fillna('Y')
Veh_Cat['INTERSECT'] = Veh_Cat['INTERSECT'].fillna('Y')
Veh_Cont['INTERSECT'] = Veh_Cont['INTERSECT'].fillna('Y')

Veh_Cont.to_csv('Veh_Cont.csv', index=False)
Ped_Cont.to_csv('Ped_Cont.csv', index=False)

In [13]:
# ANOVA for Bike Model Variables

import scipy.stats as stats

# Excluding 'TARGET_FID' and 'INTERSECT'
test_features = [col for col in Bik_Cont.columns if col not in ['TARGET_FID', 'INTERSECT']]

# Looping through features
anova_results = {}
for col in test_features:
    # Splitting the data based on the value of 'INTERSECT' (Y/N)
    group_Y = Bik_Cont[Bik_Cont['INTERSECT'] == 'Y'][col]
    group_N = Bik_Cont[Bik_Cont['INTERSECT'] == 'N'][col]
    
    # Performing one-way ANOVA
    f_stat, p_value = stats.f_oneway(group_Y, group_N)
    
    # Storing the result
    anova_results[col] = {'F-statistic': f_stat, 'p-value': p_value}

# Displaying the results
anova_results

{'Number_of_lanes': {'F-statistic': np.float64(4009.774112638549),
  'p-value': np.float64(0.0)},
 'Total_road_network_density': {'F-statistic': np.float64(342.11831074291484),
  'p-value': np.float64(3.4184332216718403e-76)},
 'Gross_industrial_(8-tier)_employment_density_(jobs/acre)_on_unprotected_land': {'F-statistic': np.float64(6.934862794611794),
  'p-value': np.float64(0.008455097144466458)},
 'Gross_residential_density_(HU/acre)_on_unprotected_land': {'F-statistic': np.float64(390.05701756988344),
  'p-value': np.float64(1.4207436420763727e-86)},
 'Percent_of_one-car_households_in_CBG,_2018': {'F-statistic': np.float64(477.6210305694989),
  'p-value': np.float64(1.648773944598431e-105)},
 '8-tier_employment_entropy_(denominator_set_to_the_static_8_employment_types_in_the_CBG)': {'F-statistic': np.float64(368.3467478288339),
  'p-value': np.float64(7.12179124736177e-82)},
 'Percent_medium_wage_workers_(workplace)': {'F-statistic': np.float64(28.24647458520397),
  'p-value': np.f

In [14]:
# ANOVA for Ped Model Variables

import scipy.stats as stats

# Excluding 'TARGET_FID' and 'INTERSECT'
test_features = [col for col in Ped_Cont.columns if col not in ['TARGET_FID', 'INTERSECT']]

# Looping through features
anova_results = {}
for col in test_features:
    # Split the data based on the value of 'INTERSECT' (Y/N)
    group_Y = Ped_Cont[Ped_Cont['INTERSECT'] == 'Y'][col]
    group_N = Ped_Cont[Ped_Cont['INTERSECT'] == 'N'][col]
    
    # Performing one-way ANOVA
    f_stat, p_value = stats.f_oneway(group_Y, group_N)
    
    # Storing the result
    anova_results[col] = {'F-statistic': f_stat, 'p-value': p_value}

# Displaying the results
anova_results

{'Number_of_lanes': {'F-statistic': np.float64(4280.045773065163),
  'p-value': np.float64(0.0)},
 'Population,_2018': {'F-statistic': np.float64(410.48300767565394),
  'p-value': np.float64(5.301424698961906e-91)},
 'Households_(occupied_housing_units),_2018': {'F-statistic': np.float64(185.6630368796243),
  'p-value': np.float64(3.189956083570955e-42)},
 'Percent_of_zero-car_households_in_CBG,_2018': {'F-statistic': np.float64(1098.9916121903516),
  'p-value': np.float64(4.19024365946539e-239)},
 'Total_employment,_2017': {'F-statistic': np.float64(825.9145073199115),
  'p-value': np.float64(1.4637526283490748e-180)}}

In [21]:
# ANOVA for Veh Model Variables

import scipy.stats as stats
import numpy as np

# Excluding 'TARGET_FID' and 'INTERSECT'
test_features = [col for col in Veh_Cont.columns if col not in ['TARGET_FID', 'INTERSECT']]

# Looping through features
anova_results = {}
for col in test_features:
    # Splitting the data based on the value of 'INTERSECT' (Y/N)
    group_Y = Veh_Cont[Veh_Cont['INTERSECT'] == 'Y'][col].dropna()  # Removing NaNs
    group_N = Veh_Cont[Veh_Cont['INTERSECT'] == 'N'][col].dropna()  # Removing NaNs
    
    # Checking if both groups have enough data and variance
    if len(group_Y) > 1 and len(group_N) > 1 and np.var(group_Y) > 0 and np.var(group_N) > 0:
        # Performing one-way ANOVA
        f_stat, p_value = stats.f_oneway(group_Y, group_N)
    else:
        f_stat, p_value = np.nan, np.nan  # Assign NaN if conditions aren't met

    # Storing the result
    anova_results[col] = {'F-statistic': f_stat, 'p-value': p_value}

# Displaying the results
anova_results

{'Ac_Unpr': {'F-statistic': np.float64(255.78747618681697),
  'p-value': np.float64(2.0793337418930282e-57)},
 'E8_Svc': {'F-statistic': np.float64(598.7922409819575),
  'p-value': np.float64(2.4144317894840414e-131)},
 'D3A': {'F-statistic': np.float64(413.8928676894444),
  'p-value': np.float64(1.4023201859048543e-91)},
 'D3BAO': {'F-statistic': np.float64(418.9058276422307),
  'p-value': np.float64(1.1644400813031174e-92)},
 'D5CEI': {'F-statistic': np.float64(397.2261600086245),
  'p-value': np.float64(5.509331552266949e-88)},
 'RF_Number_': {'F-statistic': np.float64(2704.6170466649864),
  'p-value': np.float64(0.0)},
 'RF_Speed_l': {'F-statistic': np.float64(2185.9334626355544),
  'p-value': np.float64(0.0)}}

In [22]:
# Chi-sq test for Bike model
import pandas as pd
import scipy.stats as stats

# Excluding 'TARGET_FID' and 'INTERSECT'
cat_columns = [col for col in Bik_Cat.columns if col not in ['TARGET_FID', 'INTERSECT']]

# Initialize a dictionary to store Chi-Square test results
chi2_results = {}

# Looping through features and comparing 'Y' vs 'N' groups in 'INTERSECT'
for col in cat_columns:
    
    contingency_table = pd.crosstab(Bik_Cat['INTERSECT'], Bik_Cat[col])
    
    # Checking if the table has more than 1 value per row and column
    if contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
        # Performing the Chi-Square test
        chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency_table)
        
        # Storing the result
        chi2_results[col] = {
            'Chi2 Statistic': chi2_stat,
            'p-value': p_value,
            'Degrees of Freedom': dof,
            'Expected Table': expected
        }
    else:
        # If contingency table is invalid, store None values
        chi2_results[col] = {
            'Chi2 Statistic': None,
            'p-value': None,
            'Degrees of Freedom': None,
            'Expected Table': None
        }

# Displaying the results
chi2_results

{'Number_of_lanes': {'Chi2 Statistic': np.float64(4316.715504378186),
  'p-value': np.float64(0.0),
  'Degrees of Freedom': 6,
  'Expected Table': array([[  850.16124214, 38201.09139793,  8440.06228622,  2362.75581413,
           1018.65473719,   313.13631272,   323.13820968],
         [  254.83875786, 11450.90860207,  2529.93771378,   708.24418587,
            305.34526281,    93.86368728,    96.86179032]])},
 'National_functional_classification': {'Chi2 Statistic': np.float64(16367.849754418656),
  'p-value': np.float64(0.0),
  'Degrees of Freedom': 6,
  'Expected Table': array([[4.79321678e+02, 3.44865407e+04, 3.14675066e+03, 1.02434812e+04,
          3.14290378e+03, 2.30813007e+00, 7.69376690e+00],
         [1.43678322e+02, 1.03374593e+04, 9.43249339e+02, 3.07051875e+03,
          9.42096222e+02, 6.91869931e-01, 2.30623310e+00]])},
 'One_way': {'Chi2 Statistic': None,
  'p-value': None,
  'Degrees of Freedom': None,
  'Expected Table': None}}

In [23]:
# Chi-sq test for Ped model
import pandas as pd
import scipy.stats as stats

# Excluding 'TARGET_FID' and 'INTERSECT'
cat_columns = [col for col in Ped_Cat.columns if col not in ['TARGET_FID', 'INTERSECT']]

# Initialize a dictionary to store Chi-Square test results
chi2_results = {}

# Looping through features and comparing 'Y' vs 'N' groups in 'INTERSECT'
for col in cat_columns:
  
    contingency_table = pd.crosstab(Ped_Cat['INTERSECT'], Ped_Cat[col])
    
    # Checking if the table has more than 1 value per row and column
    if contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
        # Performing the Chi-Square test
        chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency_table)
        
        # Storing the result
        chi2_results[col] = {
            'Chi2 Statistic': chi2_stat,
            'p-value': p_value,
            'Degrees of Freedom': dof,
            'Expected Table': expected
        }
    else:
        # If contingency table is invalid, store None values
        chi2_results[col] = {
            'Chi2 Statistic': None,
            'p-value': None,
            'Degrees of Freedom': None,
            'Expected Table': None
        }

# Displaying the results
chi2_results

{'Number_of_lanes': {'Chi2 Statistic': np.float64(4638.366350443465),
  'p-value': np.float64(0.0),
  'Degrees of Freedom': 6,
  'Expected Table': array([[ 1177.05597588, 40273.30260602,  8520.94674899,  2366.62550367,
           1021.41867409,   290.15798475,   310.49250659],
         [  327.94402412, 11220.69739398,  2374.05325101,   659.37449633,
            284.58132591,    80.84201525,    86.50749341]])},
 'Shoulder': {'Chi2 Statistic': np.float64(14.613324958405011),
  'p-value': np.float64(0.00013197805016745265),
  'Degrees of Freedom': 1,
  'Expected Table': array([[51795.15552077,  2164.84447923],
         [14430.84447923,   603.15552077]])},
 'Median_type': {'Chi2 Statistic': np.float64(11120.38349882744),
  'p-value': np.float64(0.0),
  'Degrees of Freedom': 5,
  'Expected Table': array([[4.71878221e+04, 2.72951851e+02, 4.69258196e+00, 6.49140505e+01,
          1.56419399e+00, 6.42805519e+03],
         [1.31471779e+04, 7.60481491e+01, 1.30741804e+00, 1.80859495e+01,
       

In [25]:
# Chi-sq test for Veh model
import pandas as pd
import scipy.stats as stats

# Excluding 'TARGET_FID' and 'INTERSECT'
cat_columns = [col for col in Veh_Cat.columns if col not in ['TARGET_FID', 'INTERSECT']]

# Initialize a dictionary to store Chi-Square test results
chi2_results = {}

# Looping through features and comparing 'Y' vs 'N' groups in 'INTERSECT'
for col in cat_columns:
    
    contingency_table = pd.crosstab(Veh_Cat['INTERSECT'], Veh_Cat[col])
    
    # Checking if the table has more than 1 value per row and column
    if contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
        # Performing the Chi-Square test
        chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency_table)
        
        # Storing the result
        chi2_results[col] = {
            'Chi2 Statistic': chi2_stat,
            'p-value': p_value,
            'Degrees of Freedom': dof,
            'Expected Table': expected
        }
    else:
        # If contingency table is invalid, store None values
        chi2_results[col] = {
            'Chi2 Statistic': None,
            'p-value': None,
            'Degrees of Freedom': None,
            'Expected Table': None
        }

# Displaying the results
chi2_results

{'RF_Number_': {'Chi2 Statistic': np.float64(3025.72834028355),
  'p-value': np.float64(0.0),
  'Degrees of Freedom': 6,
  'Expected Table': array([[7.48289889e+02, 2.99906711e+04, 4.36730747e+03, 1.17894185e+03,
          4.39213631e+02, 1.25000371e+02, 1.21575703e+02],
         [1.25710111e+02, 5.03832891e+03, 7.33692534e+02, 1.98058149e+02,
          7.37863693e+01, 2.09996295e+01, 2.04242972e+01]])},
 'RF_Median_': {'Chi2 Statistic': np.float64(9742.63102139009),
  'p-value': np.float64(0.0),
  'Degrees of Freedom': 5,
  'Expected Table': array([[3.35052363e+04, 1.57534714e+02, 2.56850076e+00, 5.05138484e+01,
          1.71233384e+00, 3.25343430e+03],
         [5.62876370e+03, 2.64652865e+01, 4.31499236e-01, 8.48615164e+00,
          2.87666157e-01, 5.46565699e+02]])},
 'RF_Nationa': {'Chi2 Statistic': np.float64(13391.695268152987),
  'p-value': np.float64(0.0),
  'Degrees of Freedom': 6,
  'Expected Table': array([[6.55823862e+02, 2.77723426e+04, 1.93750574e+03, 5.07535751e+03,
 