In [10]:
import os
import boto3
import getpass
from io import StringIO
import pandas as pd
import geopandas as gpd

In [2]:
aws_access_key_id = getpass.getpass(prompt="Enter the AWS access key id: ")
aws_secret_access_key = getpass.getpass(prompt="Enter the AWS secret access key id: ")

credentials = {
    'region_name': 'us-east-2',
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key
}

Enter the AWS access key id: ········
Enter the AWS secret access key id: ········


In [3]:
session = boto3.session.Session(**credentials)
s3 = session.client('s3')

**Load in the address data**

In [4]:
bucket_name = 'charlotte-choropleth-data'
file_path = 's3://charlotte-choropleth-data/charlotte-addresses/Master_Address.csv'

response = s3.list_objects(Bucket=bucket_name)

In [5]:
for obj in response.get('Contents', []):
    print('Object Key:', obj['Key'])

Object Key: charlotte-addresses/
Object Key: charlotte-addresses/Master_Address.csv


In [6]:
s3_client = boto3.client('s3')
bucket_name = 'charlotte-choropleth-data'

#response = s3_client.list_objects(Bucket=bucket_name)

obj = s3.get_object(Bucket=bucket_name, Key='charlotte-addresses/Master_Address.csv')
data = obj['Body'].read().decode('utf-8')
df = pd.read_csv(StringIO(data))

In [7]:
df.head()

Unnamed: 0,X,Y,OBJECTID,AddressID,HouseNumber,Direction,StreetName,StreetType,Suffix,Unit,...,ZipCode,GISParcelID,TaxParcelID,ParcelID,DateUpdated,StatusCode,E911,XCoordinate,YCoordinate,FullAddress
0,1422115.0,616465.000202,1,37382,13623,,N C 73,HY,,,...,28078,101102,101102.0,101102,2020/05/05 17:43:28+00,A,2932.0,1422115.0,616465.000202,13623 N C 73 HY
1,1422447.0,617840.000077,2,436958,13520,,HAGERS FERRY,RD,,,...,28078,101108,101108.0,101108,2020/05/05 17:53:52+00,A,4435.0,1422447.0,617840.000077,13520 HAGERS FERRY RD
2,1422173.0,617803.000151,3,36800,13516,,HAGERS FERRY,RD,,,...,28078,101108,101108.0,101108,2020/05/05 18:40:36+00,A,4435.0,1422173.0,617803.000151,13516 HAGERS FERRY RD
3,1422874.0,617469.000163,4,37719,13710,,HAGERS FERRY,RD,,,...,28078,101109,101109.0,101109,2020/05/05 17:46:38+00,A,3504.0,1422874.0,617469.000163,13710 HAGERS FERRY RD
4,1422759.0,617256.00026,5,37824,13720,,HAGERS FERRY,RD,,,...,28078,101110,101110.0,101110,2020/05/05 17:43:38+00,A,3504.0,1422759.0,617256.00026,13720 HAGERS FERRY RD


**Now use geopandas to convert the weird X and Y coordinates to latitude and longitude**

In [8]:
df_filtered = df[['X', 'Y', 'HouseNumber', 'Direction', 'StreetName', 'StreetType', 'Suffix', 
                  'Unit', 'State', 'ZipCode', 'XCoordinate', 'YCoordinate']]

In [30]:
df_filtered.head()

Unnamed: 0,X,Y,HouseNumber,Direction,StreetName,StreetType,Suffix,Unit,State,ZipCode,XCoordinate,YCoordinate
0,1422115.0,616465.000202,13623,,N C 73,HY,,,NC,28078,1422115.0,616465.000202
1,1422447.0,617840.000077,13520,,HAGERS FERRY,RD,,,NC,28078,1422447.0,617840.000077
2,1422173.0,617803.000151,13516,,HAGERS FERRY,RD,,,NC,28078,1422173.0,617803.000151
3,1422874.0,617469.000163,13710,,HAGERS FERRY,RD,,,NC,28078,1422874.0,617469.000163
4,1422759.0,617256.00026,13720,,HAGERS FERRY,RD,,,NC,28078,1422759.0,617256.00026


In [31]:
import geopandas as gpd
from shapely.geometry import Point

geometry = [Point(xy) for xy in zip(df_filtered['X'], df_filtered['Y'])]
gdf = gpd.GeoDataFrame(df, geometry=geometry)

In [32]:
gdf.crs = 'epsg:2264'

In [29]:
gdf = gdf.to_crs('epsg:4326')

In [31]:
gdf.to_file("../gpd-dfs/address-geometry-gdf.geojson", driver="GeoJSON")

In [32]:
address_geometry_gdf = gpd.read_file("../gpd-dfs/address-geometry-gdf.geojson")

In [12]:
len(address_geometry_gdf.index)

634334

In [33]:
# This library allows for fuzzy input
import Levenshtein

# This library parses user input and extracts different elements
import usaddress

In [34]:
# Example of how to use usaddress library

address = "1234 Elm St NW Washington DC 20001"
parsed = usaddress.parse(address)

print(parsed)

[('1234', 'AddressNumber'), ('Elm', 'StreetName'), ('St', 'StreetNamePostType'), ('NW', 'StreetNamePostDirectional'), ('Washington', 'PlaceName'), ('DC', 'StateName'), ('20001', 'ZipCode')]


In [35]:
def closest_addresses(search_address, df, top_n=5):
    # Directional can be Stre
    
    
    df['distance'] = df['FullAddress'].apply(lambda x: Levenshtein.distance(search_address, x))
    return df.nsmallest(top_n, 'distance')[['FullAddress', 'geometry']]

In [38]:
search_address = '300 Camp Rd, Charlotte, NC 28206'

print(closest_addresses(search_address, address_geometry_gdf))

[('300', 'AddressNumber'), ('Camp', 'StreetName'), ('Rd,', 'StreetNamePostType'), ('Charlotte,', 'PlaceName'), ('NC', 'StateName'), ('28206', 'ZipCode')]
['HouseNumber', 'StreetName', 'StreetType', 'State', 'ZipCode']


KeyError: 'PlaceName'

In [17]:
address_geometry_gdf.iloc[1034]

X                                          1436135.000195
Y                                            636976.00019
OBJECTID                                             1035
AddressID                                           69508
HouseNumber                                         20444
Direction                                             NaN
StreetName                               GREENWAY HEIGHTS
StreetType                                             DR
Suffix                                                NaN
Unit                                                  NaN
Jurisdiction                                    CORNELIUS
PostalCity                                      CORNELIUS
State                                                  NC
ZipCode                                             28031
GISParcelID                                      00113367
TaxParcelID                                      113367.0
ParcelID                                         00113367
DateUpdated   

In [18]:
#DF: HouseNumber, Direction, StreetName, StreetType, ZipCode, State, PostalCity
#US: AddressNumber, StreetNamePreDirectional, StreetName, StreetNamePostType, StreetNamePostDirectional, CityName, StateName, ZipCode



# FIX ME

In [57]:
def closest_addresses(search_address, df, top_n=5):
    # used to connect us address labels to column names
    map_usaddress_to_cols = {
        'AddressNumber': 'HouseNumber',
        'StreetNamePreDirectional': 'Direction',
        'StreetNamePostDirectional': 'Direction',
        'StreetName': 'StreetName',
        'StreetNamePostType': 'StreetType',
        'ZipCode': 'ZipCode',
        'StateName': 'State',
        'CityName': 'PostalCity'
    }
    
    address_parsed = usaddress.parse(search_address)
    
    # combine repeated elements (i.e. ('GREENWAY', 'StreetName'), ('HEIGHTS', 'StreetName'))
    output = []
    seen_labels = {}
    for value, label in address_parsed:
        if label in seen_labels:
            idx = seen_labels[label]
            prev_value, _ = output[idx]
            output[idx] = (f"{prev_value} {value}", label)
        else:
            seen_labels[label] = len(output)
            output.append((value, label))
            
    # update to include merged values
    address_parsed = output
    
    cols_present = list(map(lambda x: map_usaddress_to_cols.get(x[1], None), address_parsed))
    cols_present = [r for r in cols_present if r]
    
    print(address_parsed)
    print(cols_present)
    
    # stores the found values under names understood by df
    address_df_context = {}
    for value, col_key in address_parsed:
        address_df_context[map_usaddress_to_cols[col_key]] = value

    add_geom_gdf_copy = address_geometry_gdf.copy()
    if 'StreetName' in cols_present and any(add_geom_gdf_copy['StreetName'].str.contains(address_df_context['StreetName'], case=False, na=False).values):
        add_geom_gdf_copy = add_geom_gdf_copy[add_geom_gdf_copy['StreetName'].str.contains(address_df_context['StreetName'], case=False, na=False)]
    if 'ZipCode' in cols_present and any(add_geom_gdf_copy['ZipCode'].str.contains(address_df_context['ZipCode'], case=False, na=False).values):
        add_geom_gdf_copy = add_geom_gdf_copy[add_geom_gdf_copy['ZipCode'].str.contains(address_df_context['ZipCode'], case=False, na=False)]
    if 'StreetType' in cols_present and any(add_geom_gdf_copy['StreetType'].str.contains(address_df_context['StreetType'], case=False, na=False).values):
        add_geom_gdf_copy = add_geom_gdf_copy[add_geom_gdf_copy['StreetType'].str.contains(address_df_context['StreetType'], case=False, na=False)]
        
    # Find the Levenshtein distance for each of the sub-parts and sum them together
    def compute_distance(row):
        total_distance = 0
        for col, address_part in address_df_context.items():
            # Ensure the dataframe's column and the dictionary value are both strings
            row_value = str(row[col]) if not pd.isna(row[col]) else ""
            total_distance += Levenshtein.distance(row_value, address_part)
        return total_distance
    
    add_geom_gdf_copy['distance'] = add_geom_gdf_copy.apply(compute_distance, axis=1)

    return add_geom_gdf_copy.nsmallest(top_n, 'distance')[['FullAddress', 'geometry']]

In [56]:
#address_geometry_gdf[['FullAddress']].values
if 'StreetName' in cols_present and any(address_geometry_gdf['StreetName'].str.contains('HAGERS', case=False, na=False).values):
    pass
if 'ZipCode' in cols_present and any(address_geometry_gdf['ZipCode'].str.contains('HAGERS', case=False, na=False).values):
    pass
if 'StreetType' in cols_present and any(address_geometry_gdf['StreetType'].str.contains('HAGERS', case=False, na=False).values):
    pass


True

In [61]:
closest_addresses(search_address="20444 GREENWAY HEIGHTS DRIVE", df=address_geometry_gdf)

[('20444', 'AddressNumber'), ('GREENWAY HEIGHTS', 'StreetName'), ('DRIVE', 'StreetNamePostType')]
['HouseNumber', 'StreetName', 'StreetType']


Unnamed: 0,FullAddress,geometry
1034,20444 GREENWAY HEIGHTS DR,POINT (-80.89427 35.48542)
1033,20448 GREENWAY HEIGHTS DR,POINT (-80.89428 35.48525)
1035,20440 GREENWAY HEIGHTS DR,POINT (-80.89430 35.48558)
1037,20424 GREENWAY HEIGHTS DR,POINT (-80.89432 35.48636)
1031,20456 GREENWAY HEIGHTS DR,POINT (-80.89395 35.48499)


In [23]:
address_geometry_gdf.crs

<Projected CRS: EPSG:2264>
Name: NAD83 / North Carolina (ftUS)
Axis Info [cartesian]:
- X[east]: Easting (US survey foot)
- Y[north]: Northing (US survey foot)
Area of Use:
- name: United States (USA) - North Carolina - counties of Alamance; Alexander; Alleghany; Anson; Ashe; Avery; Beaufort; Bertie; Bladen; Brunswick; Buncombe; Burke; Cabarrus; Caldwell; Camden; Carteret; Caswell; Catawba; Chatham; Cherokee; Chowan; Clay; Cleveland; Columbus; Craven; Cumberland; Currituck; Dare; Davidson; Davie; Duplin; Durham; Edgecombe; Forsyth; Franklin; Gaston; Gates; Graham; Granville; Greene; Guilford; Halifax; Harnett; Haywood; Henderson; Hertford; Hoke; Hyde; Iredell; Jackson; Johnston; Jones; Lee; Lenoir; Lincoln; Macon; Madison; Martin; McDowell; Mecklenburg; Mitchell; Montgomery; Moore; Nash; New Hanover; Northampton; Onslow; Orange; Pamlico; Pasquotank; Pender; Perquimans; Person; Pitt; Polk; Randolph; Richmond; Robeson; Rockingham; Rowan; Rutherford; Sampson; Scotland; Stanly; Stokes; Sur

In [81]:
usaddress.parse("20444 GREENWAY HEIGHTS DR")

[('20444', 'AddressNumber'),
 ('GREENWAY', 'StreetName'),
 ('HEIGHTS', 'StreetName'),
 ('DR', 'StreetNamePostType')]