In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

In [3]:
file_path = './data/NYS_2022_Tax_Parcels_Public.geojson'
gdf = gpd.read_file(file_path)

In [36]:
# Filter columns to keep
columns_to_keep =  ["geometry", "OBJECTID", "COUNTY_NAME", "PARCEL_ADDR", "SBL", "CITYTOWN_NAME", "LOC_STREET", "LOC_ZIP", "PROP_CLASS", "LAND_AV", "TOTAL_AV", "FULL_MARKET_VAL", "YR_BLT", "FRONT", "DEPTH", "SQ_FT", "SQFT_LIVING", "GFA", "SPATIAL_YR"]
gdf_subset = gdf[columns_to_keep]


# Match land classification codes of NYC with other counties
counties_nyc = ["Kings", "Bronx", "NewYork", "Queens", "Richmond"]

# Edit "PROP_CLASS" for non-NYC counties
gdf_subset.loc[~gdf_subset['COUNTY_NAME'].isin(counties_nyc), 'PROP_CLASS'] = gdf_subset.loc[~gdf_subset['COUNTY_NAME'].isin(counties_nyc), 'PROP_CLASS'].str[0]

# Edit "PROP_CLASS" for NYC counties (nyc_residential --> '2' Residential)
nyc_residential = ['01', '02', '03']
gdf_subset.loc[gdf_subset['PROP_CLASS'].isin(nyc_residential), 'PROP_CLASS'] = '2'

# Edit "PROP_CLASS" for NYC counties (nyc_commercial --> '4' Commercial)
nyc_commercial = ['05']
gdf_subset.loc[gdf_subset['PROP_CLASS'].isin(nyc_commercial), 'PROP_CLASS'] = '4'

# Edit "PROP_CLASS" for NYC counties (nyc_industrial --> '7' Industrial)
nyc_industrial = ['06']
gdf_subset.loc[gdf_subset['PROP_CLASS'].isin(nyc_industrial), 'PROP_CLASS'] = '7'

# Edit "PROP_CLASS" for NYC counties (nyc_transportation --> '8' Public services)
nyc_transportation = ['07']
gdf_subset.loc[gdf_subset['PROP_CLASS'].isin(nyc_transportation), 'PROP_CLASS'] = '8'

# Edit "PROP_CLASS" for NYC counties (nyc_institutions --> '6' Community services)
nyc_institutions = ['08']
gdf_subset.loc[gdf_subset['PROP_CLASS'].isin(nyc_institutions), 'PROP_CLASS'] = '6'

# Edit "PROP_CLASS" for NYC counties (nyc_outdoor --> '9' public parks)
nyc_outdoor = ['09']
gdf_subset.loc[gdf_subset['PROP_CLASS'].isin(nyc_outdoor), 'PROP_CLASS'] = '9'

# Edit "PROP_CLASS" for NYC counties (nyc_parking --> null)
nyc_parking = ['10']
gdf_subset.loc[gdf_subset['PROP_CLASS'].isin(nyc_parking), 'PROP_CLASS'] = None

# Edit "PROP_CLASS" for NYC counties (nyc_vacant --> '3' vacant land)
nyc_vacant = ['11']
gdf_subset.loc[gdf_subset['PROP_CLASS'].isin(nyc_vacant), 'PROP_CLASS'] = '3'

# Edit "PROP_CLASS" for NYC counties ('04' --> '10' Mixed Residential & Commercial)
nyc_mixed = ['04']
gdf_subset.loc[gdf_subset['PROP_CLASS'].isin(nyc_mixed), 'PROP_CLASS'] = '10'

In [38]:
gdf_subset.to_file('./outputs/NYS_2022_Tax_Parcels_Public_32Counties.geojson', driver='GeoJSON')

### Prepare parcel data for Profitability / built environment clustering

In [4]:
parcels_gdf = gpd.read_file('./data/NYS_2022_Tax_Parcels_Public_32Counties.geojson')

In [5]:
print(parcels_gdf.crs)

EPSG:32016


In [47]:
parcels_gdf['PROP_CLASS'] = pd.to_numeric(parcels_gdf['PROP_CLASS'], errors='coerce')

parcels_gdf['area_sqft'] = parcels_gdf.area
parcels_gdf['land_price_per_sqft(dollars)'] = (parcels_gdf['LAND_AV'] / parcels_gdf['area_sqft']).round(2)

parcels_gdf['agriculture'] = (parcels_gdf['PROP_CLASS'] == 1).astype(int)
parcels_gdf['residential'] = (parcels_gdf['PROP_CLASS'] == 2).astype(int)
parcels_gdf['vacant'] = (parcels_gdf['PROP_CLASS'] == 3).astype(int)
parcels_gdf['commercial'] = (parcels_gdf['PROP_CLASS'] == 4).astype(int)
parcels_gdf['industrial'] = (parcels_gdf['PROP_CLASS'] == 7).astype(int)
parcels_gdf.loc[parcels_gdf['PROP_CLASS'] == 10, 'residential'] = 0.5
parcels_gdf.loc[parcels_gdf['PROP_CLASS'] == 10, 'commercial'] = 0.5

  parcels_gdf.loc[parcels_gdf['PROP_CLASS'] == 10, 'residential'] = 0.5
  parcels_gdf.loc[parcels_gdf['PROP_CLASS'] == 10, 'commercial'] = 0.5


In [54]:
parcels_gdf['centroid'] = parcels_gdf.geometry.centroid
columns_to_keep = ['SQFT_LIVING', 'GFA', 'centroid', 'land_price_per_sqft(dollars)', 'agriculture', 'residential', 'vacant', 'commercial', 'industrial']
parcels_gdf = parcels_gdf[columns_to_keep]

AttributeError: You are calling a geospatial method on the GeoDataFrame, but the active geometry column ('geometry') is not present. 
There are columns with geometry data type (['centroid']), and you can either set one as the active geometry with df.set_geometry("name") or access the column as a GeoSeries (df["name"]) and call the method directly on it.

In [59]:
parcels_gdf.rename(columns={'centroid': 'geometry'}, inplace=True)
parcels_gdf['SQFT_LIVING'].fillna(0, inplace=True)
parcels_gdf['GFA'].fillna(0, inplace=True)

In [60]:
parcels_gdf.to_file('NYS_Tax_Parcels_Public_Points_2022.geojson', driver='GeoJSON')