In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import random

# module imports
! pip install polyleven
#from data.pairs import make_entries
import distances
from geocoding import geocode




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [29]:
def make_entries():
    """
    Creates a pandas dataframe with the following columns: address1, address2, label, 
    where each entry represents a pair of addresses and their label (0 for different, 1 for same)
    """
    # initialize pandas dataframe with defined columns
    df = pd.DataFrame(columns=['address1', 'address2', 'label'])

    # open addresses json file
    with open("data/addresses_data.json") as f:
        addresses = json.load(f)
    addresses = addresses["res"]

    # add case1 entries
    df = pd.concat([df, case1(addresses)], ignore_index=True)

    # add case4 entries
    df = pd.concat([df, case4(addresses)], ignore_index=True)

    return df


def case1(addresses):
    """
    Given a list of addresses, return a DF with the following entries as epcified by case1:
    For every address object, create an entry with the unstructured address to itself with a label of 1
    """
    df = pd.DataFrame(columns=['address1', 'address2', 'label'])
    for address in addresses:
        string_add = address["street_address"] + ", " + \
            address["city"] + ", " + address["state"] + " " + address["zip"]
        df.loc[len(df)] = {'address1': string_add, 'address2': string_add,
                           'label': 1}
    return df


def case4(addresses):
    """
    Given a list of addresses, return a DF with the 10000 entries as epcified by case4:
    Select two random addresses with label 0
    """
    df = pd.DataFrame(columns=['address1', 'address2', 'label'])
    for i in range(10000):
        # select two random addresses
        address1 = random.choice(addresses)
        address2 = random.choice(addresses)
        # make sure they are different
        while address1 == address2:
            address2 = random.choice(addresses)
        # create entry
        string_add1 = address1["street_address"] + ", " + \
            address1["city"] + ", " + address1["state"] + " " + address1["zip"]
        string_add2 = address2["street_address"] + ", " + \
            address2["city"] + ", " + address2["state"] + " " + address2["zip"]
        df.loc[len(df)] = {'address1': string_add1,
                           'address2': string_add2, 'label': 0}
    return df


In [31]:
addresses = make_entries()

Unnamed: 0,address1,address2,label
count,11121,11121,11121
unique,920,920,2
top,"111 COLCHESTER AVE, BURLINGTON, VT 05401","111 COLCHESTER AVE, BURLINGTON, VT 05401",0
freq,81,94,10000


### Vectorization of Addresses

In [35]:
! pip install usaddress
import label

Collecting usaddress
  Downloading usaddress-0.5.10-py2.py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting probableparsing (from usaddress)
  Downloading probableparsing-0.0.1-py2.py3-none-any.whl (3.1 kB)
Collecting python-crfsuite>=0.7 (from usaddress)
  Downloading python_crfsuite-0.9.9-cp310-cp310-macosx_10_9_x86_64.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.8/184.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, probableparsing, usaddress
Successfully installed probableparsing-0.0.1 python-crfsuite-0.9.9 usaddress-0.5.10

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [45]:
from distances import distance_jaro_winkler
import usaddress

# Create a new dataframe to store the labeled entries and distance features
labeled_df = pd.DataFrame(columns=['address_number_dist', 'street_name_dist', 'secondary_address_dist', 'city_dist', 'state_dist', 'postal_code', 'label'])
labels = ['address_number', 'street_name', 'secondary_address', 'city', 'state', 'postal_code']
# Iterate over each entry in the addresses dataframe
for index, row in addresses.iterrows():
  address1 = row['address1']
  address2 = row['address2']
  labeled_1 = usaddress.tag(address1, tag_mapping=label.address_map_dict)[0]
  labeled_2 = usaddress.tag(address1, tag_mapping=label.address_map_dict)[0]
  
  for label in labels:
    if label not in labeled_1:
      labeled_1.update({label:''})
    if label not in labeled_2:
      labeled_2.update({label:''})

  #handle cases with different number of labels here, but for now, assume same
  print(labeled_1)
  
  # Add the labeled entry and distance feature to the new dataframe
  labeled_df.loc[index] = {
    'address_number_dist':distance_jaro_winkler(labeled_1['address_number'], labeled_2['address_number']), 
    'street_name_dist':distance_jaro_winkler(labeled_1['street_name'], labeled_2['street_name']),
    'secondary_address_dist':distance_jaro_winkler(labeled_1['secondary_address'], labeled_2['secondary_address']),
    'city_dist':distance_jaro_winkler(labeled_1['city'], labeled_2['city']),
    'state_dist':distance_jaro_winkler(labeled_1['state'], labeled_2['state']),
    'postal_code':distance_jaro_winkler(labeled_1['postal_code'], labeled_2['postal_code']),
    'label': row['label']}

labeled_df.head()


OrderedDict([('address_number', '2841'), ('street_name', 'DEBARR RD'), ('city', 'ANCHORAGE'), ('state', 'AK'), ('postal_code', '99508')])


KeyError: 'secondary_address'