#### This notebook builds address lists for specific New Jersey counties (Morris, Middlesex, and Ocean) from the NG911 address geodatabase


In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import fiona

# Display the value of every bare expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Attribute fields requested from the geodatabase.
req_cols = [
    'FULLADDR',
    'POST_CODE',
    'POST_COMM',
    'COUNTY',
    'PLACE_TYPE',
    'PLACEMENT',
    'LONG_',
    'LAT',
]

# Read the address geodatabase, passing req_cols as the fields to include.
# The original author timed this at 8m 22.4s for the full frame.
gdb_path = '../../Downloads/Addr_NG911.gdb/Addr_NG911.gdb'
data = gpd.read_file(gdb_path, include_fields=req_cols)

In [3]:
# Drop the geometry column and keep an independent copy to work on; the
# remaining columns are the ones requested at load time.
df = data.drop(columns=["geometry"]).copy()
# Show one randomly chosen row as a quick sanity check.
df.sample(1)

Unnamed: 0,COUNTY,FULLADDR,POST_COMM,POST_CODE,PLACE_TYPE,PLACEMENT,LONG_,LAT
1866552,882276,39 Pacio Court,Roseland,7068,Residence,Structure - Rooftop,-74.308694,40.828572


In [4]:
# Preview the first ten rows whose place type is neither "Residence" nor "Unknown".
is_residence_or_unknown = df["PLACE_TYPE"].isin(["Residence", "Unknown"])
df[~is_residence_or_unknown].head(10)

Unnamed: 0,COUNTY,FULLADDR,POST_COMM,POST_CODE,PLACE_TYPE,PLACEMENT,LONG_,LAT
271,882910,405 Union Avenue,Brielle,8730,COMM,Structure - Rooftop,-74.056324,40.113623
358,882228,40 Pittstown Road,Clinton,8809,Hospital,Structure - Rooftop,-74.926723,40.627704
359,882228,40 Pittstown Road,Clinton,8809,Hospital,Structure - Rooftop,-74.927245,40.627334
360,882228,40 Pittstown Road,Clinton,8809,Hospital,Structure - Rooftop,-74.927305,40.627364
361,882228,40 Pittstown Road,Clinton,8809,Hospital,Structure - Rooftop,-74.926955,40.627527
362,882228,40 Pittstown Road,Clinton,8809,Hospital,Structure - Rooftop,-74.928485,40.629972
363,882275,1676 East Landis Avenue,Vineland,8361,Hospital,Property Access Point,-74.99543,39.484836
446,882910,360 Main Street,Matawan,7747,Industrial,Structure - Rooftop,-74.238636,40.40141
461,882228,40 Pittstown Road,Clinton,8809,Hospital,Structure - Rooftop,-74.926663,40.627158
462,882228,40 Pittstown Road,Clinton,8809,Hospital,Structure - Rooftop,-74.928565,40.628133


In [5]:
# Frequency of each place type across the whole address table.
# Open questions from the original author:
#   - does "Outdoors" correspond to farm land?
#   - could hospitals / shopping areas be a signal of gentrification?
place_type_counts = df["PLACE_TYPE"].value_counts()
place_type_counts

Residence           3136410
Unknown              476588
Industrial            24768
Outdoors              14567
Office                10507
School                 3629
COMM                   2702
Store                  2144
Shopping-Area          1772
Hospital               1124
Other                  1024
Government              663
EMER                    300
Place-Of-Worship        276
Restaurant              238
Hotel                   176
Public                  172
Warehouse                95
Street                   86
Bank                     31
Parking                  29
Cafe                     23
TRAN                     22
Club                     17
Prison                   14
Bar                      12
Theater                   9
Library                   5
Stadium                   3
Classroom                 2
Arena                     2
Public-Transport          2
Train-Station             1
Name: PLACE_TYPE, dtype: int64

In [6]:
# Collapse every place type outside the four kept categories into a single
# "Commercial" bucket, since all the remaining types fit that category.
#
# The result is assigned back to the column instead of calling
# .mask(..., inplace=True) on df["PLACE_TYPE"]: the in-place form mutates a
# Series selected from the frame, and under pandas Copy-on-Write that does not
# update df itself. Plain assignment works the same way on every pandas version.
KEPT_PLACE_TYPES = ['Residence', 'Unknown', 'Industrial', 'Outdoors']
df["PLACE_TYPE"] = df["PLACE_TYPE"].mask(~df["PLACE_TYPE"].isin(KEPT_PLACE_TYPES), "Commercial")

In [7]:
# Rough per-county address-count estimates. Each land-area figure is divided by
# (100 * 5) -- per the original note, addresses 100m apart along a 5m wide
# roadway -- and that raw figure is then divided by 100000, truncated to an
# integer, and multiplied by 1000.
def _addr_estimate(land_area):
    """Return the raw estimate: the land area divided by (100 * 5)."""
    return land_area / (100 * 5)


def _addr_target(raw_estimate):
    """Divide a raw estimate by 100000, truncate to int, then multiply by 1000."""
    return int(raw_estimate / 100000) * 1000


# morris county land size: 1.24713107e9 m^2
MORRIS_COUNTY_ADDR_N = _addr_estimate(1247131070)
MORRIS_COUNTY_ADDR_N_Actual = _addr_target(MORRIS_COUNTY_ADDR_N)
MORRIS_COUNTY_ADDR_N_Actual

MIDDLESEX_COUNTY_ADDR_N = _addr_estimate(836307161)
MIDDLESEX_COUNTY_ADDR_N_Actual = _addr_target(MIDDLESEX_COUNTY_ADDR_N)
MIDDLESEX_COUNTY_ADDR_N_Actual

OCEAN_COUNTY_ADDR_N = _addr_estimate(2369839120)
OCEAN_COUNTY_ADDR_N_Actual = _addr_target(OCEAN_COUNTY_ADDR_N)
OCEAN_COUNTY_ADDR_N_Actual


24000

16000

47000

In [8]:
# County boundary table. Only three of its columns are kept; they are used to
# cross-reference actual county names against GNIS values.
raw_boundry = pd.read_csv("../../Downloads/County_Boundaries_of_NJ.csv")
boundry_df_list = ['COUNTY', 'GNIS_NAME', 'GNIS']
boundry_df = raw_boundry[boundry_df_list].copy()

In [9]:
# Cast the address table's county code to int64 before joining -- presumably so
# it matches the dtype of the boundary table's GNIS column (confirm against the CSV).
df["COUNTY"] = df["COUNTY"].astype("int64")

# Join each address to its county record on the GNIS code. Both frames carry a
# COUNTY column, so the merge suffixes them; the address-side code (COUNTY_x)
# is dropped and the boundary-side name (COUNTY_y) is kept.
merged = df.merge(boundry_df, left_on='COUNTY', right_on='GNIS')
combined = merged.drop(columns=["COUNTY_x"])
combined.head(1)

Unnamed: 0,FULLADDR,POST_COMM,POST_CODE,PLACE_TYPE,PLACEMENT,LONG_,LAT,COUNTY_y,GNIS_NAME,GNIS
0,349 Fairview Avenue,Orange,7050,Residence,Structure - Rooftop,-74.240721,40.761214,ESSEX,County of Essex,882276


In [10]:
# Build a one-line address of the form '950 Cambridge St, Cambridge, MA 02141',
# because lat/long alone is too inaccurate.
#
# The middle component of that format is the city, so the postal community
# (POST_COMM) is used there instead of the county name (COUNTY_y), which made
# strings like '<street>, ESSEX, NJ <zip>'. Where POST_COMM is missing the
# county name is kept as a fallback so no address turns into NaN because of
# this change.
locality = combined["POST_COMM"].fillna(combined["COUNTY_y"])
combined["ADDRESS"] = combined["FULLADDR"] + ", " + locality + ", NJ " + combined["POST_CODE"]

In [14]:
# Per-county address tables destined for export.
#
# All three tables share one column order. Previously the Morris table selected
# LONG_ before LAT while Middlesex and Ocean selected LAT before LONG_, so the
# exported files disagreed on column order.
#
# Tip from the original author: to find how a county is spelled in COUNTY_y,
# filter with combined["COUNTY_y"].str.startswith(<prefix>, na=False) and look
# at the value_counts(). Morris alone was counted at 202132 addresses.
ADDRESS_LIST_COLS = ["PLACE_TYPE", "ADDRESS", "LAT", "LONG_"]


def _county_address_list(county_name):
    """Return the ADDRESS_LIST_COLS columns of `combined` for rows whose COUNTY_y equals `county_name`."""
    return combined[combined["COUNTY_y"] == county_name][ADDRESS_LIST_COLS]


morris_address_list = _county_address_list("MORRIS")
middlesex_address_list = _county_address_list("MIDDLESEX")
ocean_address_list = _county_address_list("OCEAN")

In [15]:
# Write each county's complete address table to its own CSV, without the index.
#
# The lists are deliberately not subsampled here: every address still has to be
# checked for validity, so spare addresses are needed as backups for any that
# fail. The entire sets are exported instead.
address_list_exports = {
    '../../Downloads/morris_address_list.csv': morris_address_list,
    '../../Downloads/middlesex_address_list.csv': middlesex_address_list,
    '../../Downloads/ocean_address_list.csv': ocean_address_list,
}
for csv_path, address_list in address_list_exports.items():
    address_list.to_csv(csv_path, index=False)