In [5]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt

### Get Land Use data

In [6]:
# Load the statewide tax-parcel point layer; each row is one parcel point
# that is later matched to a tract by spatial join.
parcel_points_gdf = gpd.read_file(
    './data/NYS_Tax_Parcels_Public_Points_2022.geojson'
)

In [7]:
# Load the shortage-tract layer and derive per-km2 density columns.
shortage_tracts_gdf = gpd.read_file('./data/shortage_tracts_gdf_1pcp220.geojson')

# Each density is head_count / ALAND * 1e6, rounded to one decimal.
# NOTE(review): the 1e6 factor implies ALAND is land area in square metres
# -- confirm against the data dictionary. Tracts with ALAND == 0 would yield
# inf/NaN here; none are filtered out.
density_sources = {
    'unserved medicaid enrollees per km2': shortage_tracts_gdf['unserved_medicaid'],
    'unserved commercial insurance enrollees per km2': shortage_tracts_gdf['unserved_commercial'],
    'insured population per km2': shortage_tracts_gdf['medicaid'] + shortage_tracts_gdf['commercial'],
}
for density_col, head_count in density_sources.items():
    shortage_tracts_gdf[density_col] = (head_count / shortage_tracts_gdf['ALAND'] * 1000000).round(1)

columns_to_keep = ['COUNTYFP', 'GEOID', 'geometry', 'insured population per km2', 'unserved medicaid enrollees per km2', 'unserved commercial insurance enrollees per km2', 'medicaid', 'commercial']
# .copy() gives an independent frame, so the GEOID cast below is a plain
# column assignment rather than a write onto a slice of the loaded layer
# (which can raise SettingWithCopyWarning).
shortage_tracts_gdf = shortage_tracts_gdf[columns_to_keep].copy()
# GEOID is kept as text so it can be matched against the string tract ids
# of the other tables.
shortage_tracts_gdf['GEOID'] = shortage_tracts_gdf['GEOID'].astype('string')
print(shortage_tracts_gdf.shape)
print(shortage_tracts_gdf.columns)

(3533, 8)
Index(['COUNTYFP', 'GEOID', 'geometry', 'insured population per km2',
       'unserved medicaid enrollees per km2',
       'unserved commercial insurance enrollees per km2', 'medicaid',
       'commercial'],
      dtype='object')


In [8]:
# Attach every parcel point to the shortage tract that contains it.
# how="inner" discards parcel points that fall within no tract.
joined_gdf = gpd.sjoin(parcel_points_gdf, shortage_tracts_gdf, how="inner", predicate='within')
print(joined_gdf.columns)

# Summarise land use per tract in a single groupby pass.
#   *_area_sum_sqft : sum of SQFT_LIVING / GFA over the tract's parcels.
#   *_ratio         : mean of the per-parcel land-use column. This equals
#                     sum / count, since both skip missing values.
#                     (presumably 0/1 indicator columns -- TODO confirm)
#   'commercial_left' is the parcel-side column: sjoin adds the suffix because
#   the tract frame also has a column named 'commercial'.
land_use_by_tract = joined_gdf.groupby('GEOID').agg(
    residential_area_sum_sqft=('SQFT_LIVING', 'sum'),
    commercial_area_sum_sqft=('GFA', 'sum'),
    agriculture_ratio=('agriculture', 'mean'),
    residential_ratio=('residential', 'mean'),
    vacant_ratio=('vacant', 'mean'),
    commercial_ratio=('commercial_left', 'mean'),
    industrial_ratio=('industrial', 'mean'),
    avg_land_price_per_sqft=('land_price_per_sqft(dollars)', 'mean'),
)

# Ratios and the average price are reported to two decimals; the area sums
# are left unrounded.
two_decimal_cols = [
    'agriculture_ratio', 'residential_ratio', 'vacant_ratio',
    'commercial_ratio', 'industrial_ratio', 'avg_land_price_per_sqft',
]
land_use_by_tract = land_use_by_tract.round({col: 2 for col in two_decimal_cols}).reset_index()

# Merge the per-tract land-use summary back onto the tract polygons.
# NOTE: the default inner merge drops tracts that contain no parcel point, and
# dropna() then drops tracts with any missing value. Compare the shape printed
# in the next cell with the earlier one to see how many tracts were lost.
shortage_tracts_gdf = shortage_tracts_gdf.merge(land_use_by_tract, on='GEOID')
shortage_tracts_gdf = shortage_tracts_gdf.dropna()

Index(['SQFT_LIVING', 'GFA', 'land_price_per_sqft(dollars)', 'agriculture',
       'residential', 'vacant', 'commercial_left', 'industrial', 'geometry',
       'index_right', 'COUNTYFP', 'GEOID', 'insured population per km2',
       'unserved medicaid enrollees per km2',
       'unserved commercial insurance enrollees per km2', 'medicaid',
       'commercial_right'],
      dtype='object')


In [9]:
# Ratio of summed parcel floor area (sqft) to the tract's own footprint (sqft).
#
# geometry.area is expressed in squared units of the layer's CRS. A GeoJSON
# layer is normally in geographic degrees, which would make the denominator
# "square degrees" and the ratio meaningless. The polygons are therefore
# reprojected to an equal-area CRS in metres (EPSG:5070, NAD83 / CONUS Albers)
# before measuring, and the result is converted from m^2 to ft^2.
SQFT_PER_SQM = 10.763910416709722  # (1 / 0.3048) ** 2
tract_area_sqft = shortage_tracts_gdf.geometry.to_crs(epsg=5070).area * SQFT_PER_SQM

shortage_tracts_gdf["residential_area_per_sqft"] = (shortage_tracts_gdf["residential_area_sum_sqft"] / tract_area_sqft).round(2)
shortage_tracts_gdf["commercial_area_per_sqft"] = (shortage_tracts_gdf["commercial_area_sum_sqft"] / tract_area_sqft).round(2)

print(shortage_tracts_gdf.shape)
print(shortage_tracts_gdf.columns)

(3533, 18)
Index(['COUNTYFP', 'GEOID', 'geometry', 'insured population per km2',
       'unserved medicaid enrollees per km2',
       'unserved commercial insurance enrollees per km2', 'medicaid',
       'commercial', 'residential_area_sum_sqft', 'commercial_area_sum_sqft',
       'agriculture_ratio', 'residential_ratio', 'vacant_ratio',
       'commercial_ratio', 'industrial_ratio', 'avg_land_price_per_sqft',
       'residential_area_per_sqft', 'commercial_area_per_sqft'],
      dtype='object')


### Get Mode of Transportation data

In [19]:
# Load the ACS B08141 table and keep one worker-count column per commute mode.
columns_to_keep = ['GEO_ID', 'B08141_006E', 'B08141_011E', 'B08141_016E', 'B08141_021E', 'B08141_026E', 'B08141_031E']
mode_names = {'B08141_006E': 'drove alone', 'B08141_011E': 'carpooled', 'B08141_016E': 'public transit', 'B08141_021E': 'walked', 'B08141_026E': 'taxicab, motorcycle, bicycle, or other means', 'B08141_031E': 'worked from home'}

# Building the frame as one chain returns a fresh DataFrame, so the column
# assignments below do not write onto a slice of the raw table.
transportation_df = (
    pd.read_csv('./data/Means of Transportation to Work by Vehicles Available (ACSDT5Y2022.B08141)/ACSDT5Y2022.B08141-Data.csv')
    .iloc[1:, :]  # skip the first data row (presumably the ACS label row under the header -- TODO confirm)
    .loc[:, columns_to_keep]
    .rename(columns=mode_names)
)

# Keep only the last 11 characters of GEO_ID so it lines up with the tract
# GEOID used for the merge later on.
transportation_df['GEO_ID'] = transportation_df['GEO_ID'].str.slice(-11)

# Ensure numeric datatype for every count column.
# errors='coerce' turns any non-numeric entry into NaN instead of raising.
count_cols = [col for col in transportation_df.columns if col != 'GEO_ID']
for col in count_cols:
    transportation_df[col] = pd.to_numeric(transportation_df[col], errors='coerce')

print(transportation_df.shape)
print(transportation_df.columns)
print(transportation_df.dtypes)

(5411, 7)
Index(['GEO_ID', 'drove alone', 'carpooled', 'public transit', 'walked',
       'taxicab, motorcycle, bicycle, or other means', 'worked from home'],
      dtype='object')
GEO_ID                                          object
drove alone                                      int64
carpooled                                        int64
public transit                                   int64
walked                                           int64
taxicab, motorcycle, bicycle, or other means     int64
worked from home                                 int64
dtype: object


In [20]:
# Convert commute-mode counts into shares of all counted workers.
# The total is deliberately not named `sum`: that would shadow the builtin
# for every later cell in the kernel session.
mode_cols = [
    'drove alone', 'carpooled', 'public transit', 'walked',
    'taxicab, motorcycle, bicycle, or other means', 'worked from home',
]
# skipna=False keeps the semantics of adding the columns with `+`: a missing
# count in any mode makes the tract's total (and its shares) NaN.
total_workers = transportation_df[mode_cols].sum(axis=1, skipna=False)

# Despite the '%' in the column names, values are fractions in [0, 1] rounded
# to two decimals. Tracts with zero counted workers come out as NaN (0 / 0).
share_sources = {
    '% workers drove alone': 'drove alone',
    '% workers public transit': 'public transit',
    '% workers walked': 'walked',
}
transportation_df = transportation_df.assign(**{
    share_col: (transportation_df[mode_col] / total_workers).round(2)
    for share_col, mode_col in share_sources.items()
})

# Only the tract id and the three shares are needed downstream.
transportation_df = transportation_df[['GEO_ID', *share_sources]]
print(transportation_df.sample(10))

           GEO_ID  % workers drove alone  % workers public transit  \
2870  36061028500                   0.06                      0.59   
4338  36087011511                   0.66                      0.14   
522   36009940301                   0.72                      0.01   
5231  36119004802                   0.43                      0.23   
4679  36103124401                   0.76                      0.12   
3632  36081028200                   0.37                      0.44   
2577  36061000500                    NaN                       NaN   
5256  36119006801                   0.52                      0.16   
1015  36029014703                   0.76                      0.02   
26    36001002200                   0.49                      0.06   

      % workers walked  
2870              0.18  
4338              0.01  
522               0.10  
5231              0.00  
4679              0.05  
3632              0.01  
2577               NaN  
5256              0.01  
1015

### Merge Land Use & Mode of Transportation data

In [22]:
# Final column layout of the exported layer, in output order.
output_columns = [
    'GEOID', 'geometry', 'insured population per km2',
    'unserved medicaid enrollees per km2', 'unserved commercial insurance enrollees per km2',
    'medicaid enrollees', 'commercial insurance enrollees', 'agriculture_ratio', 'residential_ratio',
    'vacant_ratio', 'commercial_ratio', 'industrial_ratio', 'avg_land_price_per_sqft',
    'residential_area_per_sqft', 'commercial_area_per_sqft', '% workers drove alone',
    '% workers public transit', '% workers walked',
]

# Left-join the commute shares onto the tract layer (every tract is kept,
# with NaN shares where no transportation row matches), give the enrollee
# counts their descriptive names, and keep only the output columns.
merged_gdf = (
    shortage_tracts_gdf
    .merge(transportation_df, how='left', left_on='GEOID', right_on='GEO_ID')
    .rename(columns={
        'medicaid': 'medicaid enrollees',
        'commercial': 'commercial insurance enrollees',
    })
)[output_columns]

print(merged_gdf.columns)
print(merged_gdf.shape)

Index(['GEOID', 'geometry', 'insured population per km2',
       'unserved medicaid enrollees per km2',
       'unserved commercial insurance enrollees per km2', 'medicaid enrollees',
       'commercial insurance enrollees', 'agriculture_ratio',
       'residential_ratio', 'vacant_ratio', 'commercial_ratio',
       'industrial_ratio', 'avg_land_price_per_sqft',
       'residential_area_per_sqft', 'commercial_area_per_sqft',
       '% workers drove alone', '% workers public transit',
       '% workers walked'],
      dtype='object')
(3533, 18)


In [23]:
# Write the combined tract layer to disk as GeoJSON (path is relative to the
# notebook's working directory).
merged_gdf.to_file(
    'built_environment_analysis.geojson',
    driver='GeoJSON',
)