# Read in the dataset

In [1]:
import gzip
import os
import humanfriendly
import numpy as np
import pandas as pd

from openclean.data.source.socrata import Socrata

import warnings
warnings.filterwarnings('ignore')

# Socrata dataset identifiers for the datasets this notebook knows how to clean
dob_historical_permit_issuance_id = 'bty7-2jhb'
dob_cellular_antenna_filings_id = 'iz2q-9x8d'
dob_C_of_O_id = 'bs8b-p36w'
dob_violations_id = '3h2n-5cm9'

# For every dataset id: logical field name -> actual column name in that
# dataset. A dataset that has no column for a field simply omits the key
# (two of the datasets below have no 'City' entry).
column_name_mapping = {
    dob_historical_permit_issuance_id: {
        'Borough': 'BOROUGH',
        'City': 'Owner’s House City',
        'Street': 'Street',
        'Number': 'Number',
        'Block': 'Block',
        'Lot': 'Lot',
    },
    dob_cellular_antenna_filings_id: {
        'Borough': 'Borough',
        'City': 'City',
        'Street': 'Street Name',
        'Number': 'House #',
        'Block': 'Block',
        'Lot': 'Lot',
    },
    dob_C_of_O_id: {
        'Borough': 'BOROUGH',
        'Street': 'STREET',
        'Number': 'NUMBER',
        'Block': 'BLOCK',
        'Lot': 'LOT',
    },
    dob_violations_id: {
        'Borough': 'BORO',
        'Street': 'STREET',
        'Number': 'HOUSE_NUMBER',
        'Block': 'BLOCK',
        'Lot': 'LOT',
    },
}

# Set the unique id of the dataset you want to use here
db_id = dob_cellular_antenna_filings_id

dataset = Socrata().dataset(db_id)

datafile = f'./{db_id}.tsv.gz'

# Download file if it doesn't exist.
# The data is written to a temporary '.part' file and only renamed to
# `datafile` once the write has finished. Otherwise an interrupted download
# would leave a truncated archive behind that every later run would accept,
# because only the existence of `datafile` is checked.
if not os.path.isfile(datafile):
    partial_datafile = f'{datafile}.part'
    try:
        with gzip.open(partial_datafile, 'wb') as f:
            print('Downloading ...\n')
            dataset.write(f)
        os.replace(partial_datafile, datafile)
    finally:
        # Only still present when the download did not complete.
        if os.path.isfile(partial_datafile):
            os.remove(partial_datafile)

fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print(f'Using "{dataset.name}" in file {datafile} of size {fsize}')

Using "DOB Cellular Antenna Filings" in file ./iz2q-9x8d.tsv.gz of size 711.77 KB


## Create data stream

In [2]:
from openclean.pipeline import stream

# Open the downloaded file as an openclean data stream. The cleaning cell
# further down reassigns `ds_full` with the result of each cleaning step.
ds_full = stream(datafile)

## Initial Profile

In [3]:
from openclean.profiling.column import DefaultColumnProfiler

# Profile only the columns that the mapping lists for the selected dataset
target_cols = [*column_name_mapping[db_id].values()]
profiles = (
    ds_full
    .select(columns=target_cols)
    .profile(default_profiler=DefaultColumnProfiler)
)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Borough,6090,22,66,0.010877,2.219848
City,6090,103,282,0.047102,4.315146
Street Name,6090,87,2185,0.363985,10.36927
House #,6090,62,2447,0.405939,10.656315
Block,6090,93,2862,0.477239,11.151813
Lot,6090,98,283,0.04723,6.294356


### Method to generate a histogram

In [4]:
def get_histogram(data_set, column_name):
    """Print the distinct values of a column with their frequencies.

    Parameters
    ----------
    data_set:
        Object whose ``distinct(column_name)`` returns a counter that
        supports ``most_common()``.
    column_name:
        Name of the column to summarize; also printed as the heading.

    One line is printed per distinct value, most frequent first: the 1-based
    rank, the value, and the frequency with thousands separators.
    """
    print(f'\n{column_name}:')
    counts = data_set.distinct(column_name)
    for position, (entry, count) in enumerate(counts.most_common(), start=1):
        print(f'{position:<3} {entry} {count:>10,}')

## Clean Borough data

In [5]:
def clean_borough_name(name):
    """Map a raw borough value to an upper-case borough name or 'N/A'.

    Accepted inputs are the codes '1'-'5' and the five borough names in any
    letter case; everything else, including empty values, becomes 'N/A'.

    Note: ``is_empty`` is imported in the "Clean city data" cell, so that
    cell has to run before this function is called.
    """
    if is_empty(name):
        return 'N/A'

    # (code, borough name) pairs, in code order
    code_borough_pairs = (
        ('1', 'MANHATTAN'),
        ('2', 'BRONX'),
        ('3', 'BROOKLYN'),
        ('4', 'QUEENS'),
        ('5', 'STATEN ISLAND'),
    )
    for code, borough in code_borough_pairs:
        if name == code:
            return borough

    upper_name = name.upper()
    if upper_name in [borough for _, borough in code_borough_pairs]:
        return upper_name
    return 'N/A'

def clean_borough_data(ds, column_name):
    """Return ``ds`` with ``clean_borough_name`` applied to ``column_name``."""
    return ds.update(column_name, lambda value: clean_borough_name(value))

## Clean city data

In [6]:
from fuzzywuzzy import fuzz
import json

from openclean.data.refdata import RefStore
from openclean.function.value.null import is_empty

def clean_city_name(name, valid_city_lookup):
    """Translate a raw city value through ``valid_city_lookup``.

    The lookup is keyed by the upper-cased raw value. Empty values and values
    missing from the lookup both yield 'N/A'.
    """
    return 'N/A' if is_empty(name) else valid_city_lookup.get(name.upper(), 'N/A')


def _best_city_match(name, valid_cities, min_ratio):
    """Return the entry of ``valid_cities`` most similar to ``name``.

    Similarity is ``fuzz.ratio``. Only scores strictly greater than
    ``min_ratio`` qualify; 'N/A' is returned when no entry qualifies. On equal
    scores the earliest entry is kept.
    """
    best_city, best_score = 'N/A', min_ratio
    for valid_city in valid_cities:
        score = fuzz.ratio(valid_city, name)
        if score > best_score:
            best_city, best_score = valid_city, score
    return best_city


def clean_city_data(ds, column_name, min_ratio=70):
    """Standardize a city column by fuzzy matching against a reference list.

    Parameters
    ----------
    ds:
        Data stream that holds the column; must offer ``distinct`` and
        ``update``.
    column_name:
        Name of the city column.
    min_ratio:
        ``fuzz.ratio`` score a candidate has to exceed to count as a match
        (default 70, the value previously hard-coded).

    Returns
    -------
    The result of ``ds.update`` with every city replaced through the lookup
    table (see ``clean_city_name``).

    For every distinct, non-empty city value the upper-cased name is mapped:
      1. to itself when it is exactly one of the reference cities,
      2. to 'NEW YORK' when it is, or closely resembles, 'NY' / 'NYC',
      3. otherwise to the best-scoring reference city above ``min_ratio``,
         or 'N/A' when there is none.

    Step 3 used to stop at the first reference city above the threshold,
    which could assign a merely similar city even though an exact or closer
    match appeared later in the reference list.

    The lookup table is cached in 'city_ref_lookup.json' in the working
    directory. Names already in that file with a non-empty value are not
    re-matched, so delete the file to rebuild the mapping from scratch.
    """
    # Distinct city names of the reference dataset, upper-cased once up front.
    city_ref = RefStore()\
        .load('encyclopaedia_britannica:us_cities', auto_download=True)\
        .distinct('city')
    valid_cities = [valid_city.upper() for valid_city in city_ref]
    known_cities = set(valid_cities)

    # Start from the cached lookup table, if one exists.
    ref_file_name = 'city_ref_lookup.json'
    city_ref_lookup = {}
    if os.path.isfile(ref_file_name):
        with open(ref_file_name) as f:
            city_ref_lookup = json.load(f)

    # Resolve every distinct city value that is not in the table yet.
    for city in ds.distinct(column_name):
        if is_empty(city):
            continue
        name = city.upper()
        if city_ref_lookup.get(name):
            continue
        if name in known_cities:
            city_ref_lookup[name] = name
        elif (name in ('NYC', 'NY') or
                fuzz.ratio('NY', name) > min_ratio or
                fuzz.ratio('NYC', name) > min_ratio):
            city_ref_lookup[name] = 'NEW YORK'
        else:
            city_ref_lookup[name] = _best_city_match(name, valid_cities, min_ratio)

    cleaned_data = ds.update(column_name, lambda x: clean_city_name(x, city_ref_lookup))
    with open(ref_file_name, 'w') as f:
        json.dump(city_ref_lookup, f)
    return cleaned_data

## Clean U.S. Street data

In [7]:
from openclean_geo.address.usstreet import StandardizeUSStreetName

def clean_street_name(name):
    """Standardize a single street name.

    Empty input, or input that is empty after standardization, yields 'N/A'.
    Otherwise the name is run through ``StandardizeUSStreetName`` and then
    patched with a few dataset-specific corrections:

    * whole-name fixes for outliers that showed up in the histogram,
    * a leading 'ST' token is expanded to 'SAINT',
    * the trailing token is normalized (street / boulevard misspellings,
      'PL' -> 'PLACE', single-letter directions), and a trailing number
      gets 'ST' appended.

    The leading-'ST' rule is applied independently of the trailing-token
    rules. It used to be one branch of the same ``elif`` chain, so
    'ST MARKS PL' became 'ST MARKS PLACE' while 'ST MARKS PLACE' became
    'SAINT MARKS PLACE'.
    """
    # Replace empty data with 'N/A'
    if is_empty(name):
        return 'N/A'
    # Function to help standardize the street names
    street_func = StandardizeUSStreetName(characters='upper', alphanum=True, repeated=False)
    name = ''.join(street_func.apply([name], threads=None))

    # Whole-name corrections that reduce the remaining outliers by fixing
    # some common errors revealed in the histogram.
    whole_name_fixes = {
        'CLARKE PLACE EAST': 'EAST CLARKE PLACE',
        'EAST BEDFORD PARK BLVD': 'BEDFORD PARK BLVD EAST',
        'WTC': 'WORLD TRADE CTR',
        'TIME SQ': 'TIMES SQ',
        'PITT': 'PITT ST',
        'BOGARDUS': 'BOGARDUS PLACE',
        'NAGLE': 'NAGLE AVE',
        'SHEPHERD': 'SHEPHERD AVE',
    }
    name = whole_name_fixes.get(name, name)

    split_name = name.split()
    if len(split_name) == 0:
        return 'N/A'

    # Leading token: done first so that a lone 'ST' still becomes 'SAINT'
    # and is not touched again by the trailing-token rules below.
    if split_name[0] == 'ST':
        split_name[0] = 'SAINT'

    # Trailing token: replacement for known variants / abbreviations.
    last_token_fixes = {
        'SSTREET': 'ST', 'STRET': 'ST', 'STREET': 'ST', 'STREE': 'ST',
        'PL': 'PLACE',
        'E': 'EAST', 'W': 'WEST', 'N': 'NORTH',
        'S': 'SOUTH', 'SOUIH': 'SOUTH',
        'BLDV': 'BLVD', 'BLV': 'BLVD', 'BOULEVARD': 'BLVD', 'BOOULEVARD': 'BLVD',
    }
    last_token = split_name[-1]
    if last_token in last_token_fixes:
        split_name[-1] = last_token_fixes[last_token]
    elif last_token.isnumeric():
        # A bare number at the end is taken to be a numbered street.
        split_name.append('ST')

    return ' '.join(split_name)

def clean_street_data(ds, column_name):
    """Return ``ds`` with ``clean_street_name`` applied to ``column_name``."""
    return ds.update(column_name, lambda value: clean_street_name(value))

## Clean building number

In [8]:
def clean_number(num):
    """Normalize a building number.

    Leading zeros are removed. Empty input, or input that consists only of
    zeros, yields 'N/A'.
    """
    if is_empty(num):
        return 'N/A'
    without_leading_zeros = num.lstrip('0')
    return without_leading_zeros if without_leading_zeros else 'N/A'

def clean_building_number_data(ds, column_name):
    """Return ``ds`` with ``clean_number`` applied to ``column_name``."""
    return ds.update(column_name, lambda value: clean_number(value))

## Clean block and lot data

In [9]:
import requests
import urllib.parse

# Make a request to the url to try and find the block and lot for an address.
# Replace missing data with N/A if it fails to find a value
# This requires you to first clean street and number data
def _lookup_block_and_lot(borough, number, street, timeout):
    """Query the stevemorse.org block/lot lookup once.

    Returns a ``(block, lot)`` tuple of stripped strings parsed from a
    ``Callback('<block>', '<lot>');`` response (either part may be '?'), or
    ``None`` when the request raises, the status code is not 200, or the body
    does not have that shape.
    """
    # Quote every component so spaces, '#', '&' etc. cannot break the query.
    query = '&'.join(
        f'{key}={urllib.parse.quote(str(value))}'
        for key, value in (('borough', borough), ('number', number), ('street', street))
    )
    try:
        r = requests.get(f'https://stevemorse.org/vital/nycblocklot.php?{query}',
                         timeout=timeout)
    except requests.RequestException:
        # Best effort: a failed lookup must not abort the whole cleaning run.
        return None
    if r.status_code != 200:
        return None
    text = r.text.strip()
    if not text.startswith('Callback('):
        return None
    fields = text.removeprefix('Callback(').removesuffix(');').replace("'", '').split(',')
    if len(fields) < 2:
        return None
    return fields[0].strip(), fields[1].strip()


def get_block_and_lot(borough, number, street, block, lot, timeout=10):
    """Fill in a missing block and/or lot by looking the address up online.

    Parameters
    ----------
    borough, number, street:
        Already cleaned address parts. The lookup is skipped when any of
        them is empty or the 'N/A' placeholder written by the cleaning steps.
    block, lot:
        Current values; only empty ones are replaced.
    timeout:
        Seconds to wait for each HTTP request (default 10).

    Returns
    -------
    ``(borough, number, street, block, lot)`` with the first three unchanged.
    A block or lot that is still empty, or that the lookup answered with '?',
    is returned as 'N/A'.
    """
    # Return if block and lot are already filled in
    if not is_empty(block) and not is_empty(lot):
        return borough, number, street, block, lot

    address_is_usable = not any(
        is_empty(part) or part == 'N/A' for part in (borough, number, street)
    )
    if address_is_usable:
        found = _lookup_block_and_lot(borough.title(), number, street.title(), timeout)
        if found == ('?', '?') and street.title() != street:
            # Try again without applying title() to street value
            found = _lookup_block_and_lot(borough.title(), number, street, timeout)
        if found is not None:
            if is_empty(block):
                block = found[0]
            if is_empty(lot):
                lot = found[1]

    # Return 'N/A' if the web app was unable to find the block and lot data
    # for this input
    if is_empty(block) or block == '?':
        block = 'N/A'
    if is_empty(lot) or lot == '?':
        lot = 'N/A'
    return borough, number, street, block, lot

# needed_columns is a list of names for the five columns that hold borough, number, street, block, and lot
## Ex: clean_block_and_lot(ds_full, ['BOROUGH','Number','Street','Block','Lot'])
def clean_block_and_lot(ds, needed_columns):
    """Return ``ds`` with ``get_block_and_lot`` applied across ``needed_columns``.

    The five column values are passed to ``get_block_and_lot`` in the order
    given by ``needed_columns``: borough, number, street, block, lot.
    """
    return ds.update(
        needed_columns,
        lambda boro, num, street, block, lot: get_block_and_lot(boro, num, street, block, lot),
    )

## Run clean data methods

In [10]:
col_mapper = column_name_mapping.get(db_id)

# Single-column cleaners, applied in this order whenever the selected
# dataset maps the corresponding field to a column.
single_column_cleaners = (
    ('Borough', clean_borough_data),
    ('City', clean_city_data),
    ('Street', clean_street_data),
    ('Number', clean_building_number_data),
)
for field, cleaner in single_column_cleaners:
    if col_mapper.get(field):
        ds_full = cleaner(ds_full, col_mapper[field])

# The block/lot lookup needs all five of these columns, in exactly this order.
block_lot_fields = ('Borough', 'Number', 'Street', 'Block', 'Lot')
if all(col_mapper.get(field) for field in block_lot_fields):
    args_list = [col_mapper[field] for field in block_lot_fields]
    ds_full = clean_block_and_lot(ds_full, args_list)

## Profile after clean

In [11]:
# Re-profile the same mapped columns on the stream returned by the cleaning steps
target_cols = [*column_name_mapping[db_id].values()]
profiles = (
    ds_full
    .select(columns=target_cols)
    .profile(default_profiler=DefaultColumnProfiler)
)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Borough,6090,0,6,0.000985,2.161557
City,6090,0,55,0.009031,1.010187
Street Name,6090,0,1607,0.263875,9.817057
House #,6090,0,2448,0.40197,10.629816
Block,6090,0,2863,0.470115,11.095507
Lot,6090,0,284,0.046634,6.311963
