# Read in the dataset

In [1]:
import gzip
import os
import humanfriendly
import numpy as np
import pandas as pd

from openclean.data.source.socrata import Socrata

import warnings
warnings.filterwarnings('ignore')

# Original dataset
dob_historical_permit_issuance_id = 'bty7-2jhb'
# Other datasets with overlapping fields
dob_cellular_antenna_filings_id = 'iz2q-9x8d'
dob_C_of_O_id = 'bs8b-p36w'
nyc_restaurant_inspection_id = '43nn-pn8j'

residential_addresses_id = '3ub5-4ph8'
charges_id = '5fn4-dr26'
trade_waste_hauler_licensees_id = '867j-5pgi'
inspections_id = 'jzhd-m6uv'

self_hauler_registrants_id = 'a8wp-rerh'
dob_electrical_permit_applications_id = 'dm9a-ab7w'
dob_stalled_const_sites_id = 'i296-73x5'

# Set the unique id of the dataset you want to use here
db_id = residential_addresses_id

# Dictionary for the name of target columns in each data set
## Dictionary String values:
#### - Borough
#### - Street
#### - Number
#### - Block
#### - Lot
#### - Community Board
## Dictionary List values:
#### - City
#### - State
#### - First Name
#### - Last Name
#### - Additional Street
#### - Additional Number
column_name_mapping = {dob_historical_permit_issuance_id: {'Borough': 'BOROUGH', 
                                                           'City': ["Owner’s House City",],
                                                           'Street': 'Street',
                                                           'Number': 'Number',
                                                           'Block': 'Block',
                                                           'Lot': 'Lot',
                                                           'State': ["Owner’s House State",],
                                                           'Community Board': 'Community Board',
                                                           'First Name': [
                                                               "Permittee's First Name",
                                                               "Site Safety Mgr's First Name",
                                                               "Owner's First Name",
                                                           ],
                                                          'Last Name': [
                                                              "Permittee's Last Name",
                                                              "Site Safety Mgr's Last Name",
                                                              "Owner's Last Name",
                                                          ],
                                                          'Additional Street': [
                                                              "Owner's House Street Name",
                                                          ],
                                                          'Additional Number': [
                                                              "Owner's House #",
                                                          ],},
                       dob_cellular_antenna_filings_id: {'Borough': 'Borough', 
                                                           'City': ['City',],
                                                           'Street': 'Street Name',
                                                           'Number': 'House #',
                                                           'Block': 'Block',
                                                           'Lot': 'Lot',
                                                            'State': ['State',],
                                                           'Community Board': 'Community - Board',
                                                           'First Name': [
                                                               "Applicant's First Name",
                                                               "Owner's First Name",
                                                           ],
                                                           'Last Name': [
                                                             "Applicant's Last Name",
                                                               "Owner's Last Name",
                                                           ],
                                                          'Additional Street': [
                                                              "Owner's  House Street",
                                                          ],
                                                          'Additional Number': [
                                                              "Owner's  House #",
                                                          ],},
                       dob_C_of_O_id: {'Borough': 'BOROUGH', 
                                       'Street': 'STREET',
                                       'Number': 'NUMBER',
                                       'Block': 'BLOCK',
                                       'Lot': 'LOT',
                                      'Community Board': 'COMMUNITY_BOARD',},
                       nyc_restaurant_inspection_id: {'Borough': 'BORO', 
                                           'Street': 'STREET',
                                           'Number': 'BUILDING',
                                           'Community Board': 'Community Board',},
                       residential_addresses_id: {'Borough': 'BOROUGH', 
                                       'Street': 'STREET',
                                       'Number': 'HOUSE #',
                                       'Block': 'BLOCK',
                                       'Lot': 'LOT',
                                      'Community Board': 'COMMUNITY DISTRICT',},
                       
                       charges_id: {'Borough' : 'Borough',
                                    'Street': 'Street',
                                    'Number': 'Building Number',
                                    'City' : ['City',],
                                    'State': ['State',],},
                       
                       trade_waste_hauler_licensees_id: {'Borough' : 'BORO',
                                                        'Street': 'ADDRESS',
                                                        'City' : ['CITY',],
                                                        'State': ['STATE',],},
                       
                       inspections_id: {'Borough' : 'Borough',
                                        'Street': 'Street',
                                        'Number': 'Building Number',
                                        'City' : ['City',],
                                        'State': ['State',],},
                       self_hauler_registrants_id: {'Borough' : 'BORO',
                                                    'Street' : 'ADDRESS',
                                                     'City': ['CITY',],
                                                     'State': ['STATE',],
                                                     'Community Board': 'COMMUNITY BOARD',
                                                   },
                       dob_electrical_permit_applications_id : {'Borough': 'BOROUGH',
                                                           'City': ["CITY","OWNER_CITY"],
                                                           'Street': 'STREET_NAME',
                                                           'Number': 'HOUSE_NUMBER',
                                                           'Block': 'BLOCK',
                                                           'Lot': 'LOT',
                                                           'State': ["STATE","OWNER_STATE"],
                                                           'Community Board': 'COMMUNITY_BOARD',
                                                           'First Name': [
                                                               "APPLICANT_FIRST_NAME",
                                                               "OWNER_FIRST_NAME",
                                                               "AUTH_REP_FIRST_NAME",
                                                           ],
                                                          'Last Name': [
                                                              "APPLICANT_LAST_NAME",
                                                              "OWNER_LAST_NAME",
                                                              "AUTH_REP_LAST_NAME",
                                                          ],
                                                        },
                       dob_stalled_const_sites_id: {'Borough' : 'Borough Name',
                                                    'Street' : 'Street Name',
                                                    'Number' : 'House Number',
                                                     'Community Board': 'Community Board',
                                                   },
                      }

dataset = Socrata().dataset(db_id)

datafile = f'./{db_id}.tsv.gz'

# Download file if it doesn't exist
if not os.path.isfile(datafile):
    with gzip.open(datafile, 'wb') as  f:
        print('Downloading ...\n')
        dataset.write(f)

fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print(f'Using "{dataset.name}" in file {datafile} of size {fsize}')

Using "NYCHA Residential Addresses" in file ./3ub5-4ph8.tsv.gz of size 129.25 KB


## Create data stream

In [2]:
from openclean.pipeline import stream

ds_full = stream(datafile)

# sampling n = 385
# Porportion, Condifence level 95%, 
# population size = infinite, Expected proportion = 0.5, Margin of error = 5%

ds_portion = ds_full.sample(n = 385) 

## Initial Profile

In [4]:
from openclean.profiling.column import DefaultColumnProfiler

target_cols = []
for v in column_name_mapping[db_id].values():
    if type(v) == list:
        target_cols.extend(v)
    else:
        target_cols.append(v)
profiles = ds_portion.select(columns=target_cols).profile(default_profiler=DefaultColumnProfiler)
beforeCleaned = profiles.stats()
beforeCleaned

Unnamed: 0,total,empty,distinct,uniqueness,entropy
BOROUGH,385,0,5,0.012987,2.063419
STREET,385,0,230,0.597403,7.56836
HOUSE #,385,0,354,0.919481,8.423754
BLOCK,385,0,197,0.511688,7.184238
LOT,385,0,64,0.166234,3.336418
COMMUNITY DISTRICT,385,0,17,0.044156,3.835946


## For comparison at the end

In [5]:
beforeBoro = ds_portion.distinct('BOROUGH')
beforeStreet = ds_portion.distinct('STREET')
beforeHouse = ds_portion.distinct('HOUSE #')
beforeBlock = ds_portion.distinct('BLOCK')
beforeLot = ds_portion.distinct('LOT')
beforeComDistrict = ds_portion.distinct('COMMUNITY DISTRICT')

### Method to generate a histogram

In [6]:
def get_histogram(data_set, column_name):
    print(f'\n{column_name}:')
    value = data_set.distinct(column_name)
    for rank, val in enumerate(value.most_common()):
        v, freq = val
        print(f'{rank+1:<3} {v} {freq:>10,}')

## Clean Borough data

In [7]:
def clean_borough_name(name):
    boroughs = ['MANHATTAN', 'BRONX', 'BROOKLYN', 'QUEENS', 'STATEN ISLAND']
    if is_empty(name):
        return 'N/A'
    elif name == '1':
        return 'MANHATTAN'
    elif name == '2':
        return 'BRONX'
    elif name == '3':
        return 'BROOKLYN'
    elif name == '4':
        return 'QUEENS'
    elif name == '5':
        return 'STATEN ISLAND'
    elif name.upper() in boroughs:
        return name.upper()
    else:
        return 'N/A'

def clean_borough_data(ds, column_name):
    cleaned_data = ds.update(column_name, lambda x: clean_borough_name(x))
    return cleaned_data

## Clean city data

In [8]:
from fuzzywuzzy import fuzz
import json

from openclean.data.refdata import RefStore
from openclean.function.value.null import is_empty

def clean_city_name(name, valid_city_lookup):
    if is_empty(name):
        return 'N/A'
    return valid_city_lookup.get(name.upper(), 'N/A')


def clean_city_data(ds, column_name):
    # Download the license plate state codes dataset.
    refdata = RefStore()
    refdata\
        .load('encyclopaedia_britannica:us_cities', auto_download=True)\
        .df()\

    # Get set of distinct state codes.
    city_ref = refdata.load('encyclopaedia_britannica:us_cities', auto_download=True).distinct('city')
    # Get list of distinct owner house city names
    city_names = ds.distinct(column_name)
    # Init lookup dictionary for fuzzy matching
    city_ref_lookup = {}
    ref_file_name = 'city_ref_lookup.json'
    # 
    if os.path.isfile(ref_file_name):
        with open(ref_file_name) as f:
            city_ref_lookup = json.load(f)
    # compare city name to each valid city and add to lookup table
    # if the similarity is high enough
    for city in city_names:
        if is_empty(city):
            continue
        name = city.lstrip().rstrip().upper()
        if city_ref_lookup.get(name):
            continue
        if (name == 'NYC' or 
              name == 'NY' or
              fuzz.ratio('NY', name) > 70 or
              fuzz.ratio('NYC', name) > 70):
            city_ref_lookup[name] = 'NEW YORK'
            continue
        found_match = False
        for valid_city in city_ref:
            percent_match = fuzz.ratio(valid_city.upper(), name)
            if percent_match > 70:
                found_match = True
                city_ref_lookup[name] = valid_city.upper()
                break
        if not found_match:
            city_ref_lookup[name] = 'N/A'

    cleaned_data = ds.update(column_name, lambda x: clean_city_name(x, city_ref_lookup))
    with open(ref_file_name, 'w') as f:
        json.dump(city_ref_lookup, f)
    return cleaned_data

## Clean state data

In [9]:
def clean_state(name, states_ref):
    # Return 'N/A' if the state value is invalid
    if name not in states_ref:
        return 'N/A'
    else:
        return name

def clean_state_data(ds, column_name):
    # Download the license plate state codes dataset.
    refdata = RefStore()
    refdata\
        .load('nyc.gov:dof:state_codes', auto_download=True)\
        .df()\
        .head()

    # Get set of distinct state codes.
    states_ref = refdata.load('nyc.gov:dof:state_codes', auto_download=True).distinct('code')

    cleaned_data = ds.update(column_name, lambda x: clean_state(x, states_ref))
    return cleaned_data

## Clean U.S. Street data

In [10]:
from openclean_geo.address.usstreet import StandardizeUSStreetName

def clean_street_name(name):
    # Replace empty data with 'N/A'
    if is_empty(name):
        return 'N/A'
    # Function to help standardize the street names
    street_func = StandardizeUSStreetName(characters='upper', alphanum=True, repeated=False)
    name = ''.join(street_func.apply([name], threads=None))
    # The conditional statements below are used to try and reduce the remaining
    # number of outlier data by fixing some common errors revealed in the histogram.
    if name == 'CLARKE PLACE EAST':
        name = 'EAST CLARKE PLACE'
    elif name == 'EAST BEDFORD PARK BLVD':
        name = 'BEDFORD PARK BLVD EAST'
    elif name == 'WTC':
        name = 'WORLD TRADE CTR'
    elif name == 'TIME SQ':
        name = 'TIMES SQ'
    elif name == 'PITT':
        name = 'PITT ST'
    elif name == 'BOGARDUS':
        name = 'BOGARDUS PLACE'
    elif name == 'NAGLE':
        name = 'NAGLE AVE'
    elif name == 'SHEPHERD':
        name = 'SHEPHERD AVE'

    split_name = name.split()
    if len(split_name) == 0:
        return 'N/A'
    
    if split_name[-1] in ['SSTREET', 'STRET', 'STREET', 'STREE']:
        split_name[-1] = 'ST'
    elif split_name[-1] == 'PL':
        split_name[-1] = 'PLACE'
    elif split_name[-1].isnumeric():
        split_name.append('ST')
    elif split_name[0] == 'ST':
        split_name[0] = 'SAINT'
    elif split_name[-1] == 'E':
        split_name[-1] = 'EAST'
    elif split_name[-1] == 'W':
        split_name[-1] = 'WEST'
    elif split_name[-1] == 'N':
        split_name[-1] = 'NORTH'
    elif split_name[-1] in ['S', 'SOUIH']:
        split_name[-1] = 'SOUTH'
    elif split_name[-1] in ['BLDV', 'BLV', 'BOULEVARD', 'BOOULEVARD']:
        split_name[-1] = 'BLVD'

    name = ' '.join(split_name)

    return name

def clean_street_data(ds, column_name):
    cleaned_data = ds.update(column_name, lambda x: clean_street_name(x))
    return cleaned_data

## Clean building number

In [11]:
def clean_number(num):
    if is_empty(num):
        return 'N/A'
    # remove any leading zero's
    num = num.lstrip('0')
    if len(num) == 0:
        return 'N/A'

    return num

def clean_building_number_data(ds, column_name):
    cleaned_data = ds.update(column_name, lambda x: clean_number(x)) 
    return cleaned_data

## Clean block and lot data

In [12]:
import requests
import urllib.parse

# Make a request the url to try and find the block and lot for an address.
# Replace missing data with N/A if it fails to find a value
# This requires you to first clean street and number data
def get_block_and_lot(borough, number, street, block, lot):
    # Return if block and lot are already filled in
    if not is_empty(block) and not is_empty(lot):
        return borough, number, street, block, lot
    if not is_empty(number) and not is_empty(street):
        req = f'https://stevemorse.org/vital/nycblocklot.php?borough={borough.title()}&number={number}&street={urllib.parse.quote(street.title())}'
        r = requests.get(req)
        if r.text == "Callback('?', '?');":
            # Try again without applying title() to street value
            req = f'https://stevemorse.org/vital/nycblocklot.php?borough={borough.title()}&number={number}&street={urllib.parse.quote(street)}'
            r = requests.get(req)
        if r.status_code == 200:
            r_str = r.text.removeprefix('Callback(').removesuffix(');').replace("'", '')
            block_lot = r_str.split(',')
            if is_empty(block):
                block = block_lot[0].strip()
            if is_empty(lot):
                lot = block_lot[1].strip()
    # Return 'N/A' if the web app was unable to find the block and lot data
    # for this input
    block = 'N/A' if block == '?' else block
    if is_empty(block):
        block = 'N/A'
    lot = 'N/A' if lot == '?' else lot
    if is_empty(lot):
        lot = 'N/A'
    return borough, number, street, block, lot

# needed_columns is a list of names for the five columns that hold borough, number, street, block, and lot
## Ex: clean_block_and_lot(ds_full, ['BOROUGH','Number','Street','Block','Lot'])
def clean_block_and_lot(ds, needed_columns):
    cleaned_data = ds.update(needed_columns, lambda bo, n, s, bl, l: get_block_and_lot(bo, n, s, bl, l))
    return cleaned_data

## Clean community board data

In [13]:
# Replace missing or incorrect data with N/A
def fix_community_board_data(data):
    is_valid = True
    if not is_empty(data) and len(data) == 3:
        for i in range(3):
            if not data[i].isnumeric():
                is_valid = False
                break
    else:
        is_valid = False
    if is_valid and data[0].isnumeric():
        if int(data[0]) > 5:
            is_valid = False
    if is_valid:
        return data
    else:
        return 'N/A'

def clean_community_board_data(ds, column_name):
    cleaned_data = ds.update(column_name, lambda x: fix_community_board_data(x)) 
    return cleaned_data

## Clean first name

In [14]:
def clean_first_name(name):
    if is_empty(name):
        return 'N/A'
    
    name = name.removeprefix('MR. ')
    name = name.removeprefix('MR ')
    name = name.removeprefix('\\')
    name = name.removeprefix(' ')
    name = name.strip('_')

    # Getting the first name only
    # some names contain '-', " ", "_", "/" between first and middle name
    name = name.split(" ")[0]
    name = name.split("-")[0]
    name = name.split("/")[0]
    name = name.split("\\")[0]
    name = name.split("_")[0]

    # further clean the first name
    name = name.strip("_")
    name = name.strip("`")
    name = name.strip("\\")
    name = name.strip("{")
    name = name.strip(".")                                                                      
    name = name.strip("-")                                                                    
    name = name.strip(",")

    # Not-a-name & missing value rows are given "NA"
    if is_empty(name) or name.isnumeric():
        return 'N/A'

    return name

def clean_first_name_data(ds, column_name):
    cleaned_data = ds.update(column_name, lambda x: clean_first_name(x)) 
    return cleaned_data

## Clean last name

In [15]:
def clean_last_name(name):
    if is_empty(name):
        return 'N/A'
    name = name.removeprefix('\\\\')
    name = name.removeprefix('\\')
    name = name.removeprefix('\\ ')
    name = name.removeprefix('/')
    name = name.removeprefix('\\\'')
    name = name.removeprefix('0 ')
    name = name.removeprefix('11 ')
    name = name.strip('_')
    # Getting the last name only
    # some names contain '-', " ", "_", "/" between first and middle name
    name = name.split(" ")[0]
    name = name.split("-")[0]
    name = name.split("/")[0]
    name = name.split("\\")[0]
    name = name.split("_")[0]

    # further clean the last name
    name = name.strip("_")
    name = name.strip("`")
    name = name.strip("\\")
    name = name.strip("{")
    name = name.strip(".")                                                                      
    name = name.strip("-")                                                                    
    name = name.strip(",")

    # Not-a-name & missing value rows are given "NA"
    if is_empty(name) or name.isnumeric():
        return 'N/A'

    return name

def clean_last_name_data(ds, column_name):
    cleaned_data = ds.update(column_name, lambda x: clean_last_name(x)) 
    return cleaned_data

## Run clean data methods

In [16]:
col_mapper = column_name_mapping.get(db_id)
if col_mapper.get('Borough'):
    #ds_full = clean_borough_data(ds_full, col_mapper['Borough'])
    ds_portion = clean_borough_data(ds_portion, col_mapper['Borough'])
    
if col_mapper.get('Street'):
    #ds_full = clean_street_data(ds_full, col_mapper['Street'])
    ds_portion = clean_street_data(ds_portion, col_mapper['Street'])
if col_mapper.get('Number'):
    #ds_full = clean_building_number_data(ds_full, col_mapper['Number'])
    ds_portion = clean_building_number_data(ds_portion, col_mapper['Number'])
       
if (col_mapper.get('Block') 
      and col_mapper.get('Lot')
      and col_mapper.get('Borough')
      and col_mapper.get('Number')
      and col_mapper.get('Street')):
    args_list = [col_mapper['Borough'], col_mapper['Number'], col_mapper['Street'],
                 col_mapper['Block'], col_mapper['Lot']]
    #ds_full = clean_block_and_lot(ds_full, args_list)
    ds_portion = clean_block_and_lot(ds_portion, args_list)
    
if col_mapper.get('Community Board'):
    #ds_full = clean_community_board_data(ds_full, col_mapper['Community Board'])
    ds_portion = clean_community_board_data(ds_portion, col_mapper['Community Board'])
    
# The following attributes are stored as lists
if col_mapper.get('City'):
    for col in col_mapper['City']:
        #ds_full = clean_city_data(ds_full, col)
        ds_portion = clean_city_data(ds_portion, col)
        
if col_mapper.get('State'):
    for col in col_mapper['State']:
        #ds_full = clean_state_data(ds_full, col)
        ds_portion = clean_state_data(ds_portion, col)
        
if col_mapper.get('First Name'):
    for col in col_mapper['First Name']:
        #ds_full = clean_first_name_data(ds_full, col)
        ds_portion = clean_first_name_data(ds_portion, col)
        
if col_mapper.get('Last Name'):
    for col in col_mapper['Last Name']:
        #ds_full = clean_last_name_data(ds_full, col)
        ds_portion = clean_last_name_data(ds_portion, col)
        
# Some data sets have multiple street and number columns that need to be cleaned.
# Since the Block and Lot data cleaning depend on a specific street and number column
# The "Additional" key is used to store the other relevant columns
if col_mapper.get('Additional Street'):
    for col in col_mapper['Additional Street']:
        #ds_full = clean_street_data(ds_full, col)
        ds_portion = clean_street_data(ds_portion, col)
    
if col_mapper.get('Additional Number'):
    for col in col_mapper['Additional Number']:
        #ds_full = clean_building_number_data(ds_full, col)
        ds_portion = clean_building_number_data(ds_portion, col)

## Profile after clean

In [17]:
target_cols = []
for v in column_name_mapping[db_id].values():
    if type(v) == list:
        target_cols.extend(v)
    else:
        target_cols.append(v)
profiles = ds_portion.select(columns=target_cols).profile(default_profiler=DefaultColumnProfiler)
afterCleaned = profiles.stats()

## Compare Results
#### Having a instant view of the data before and after the cleaning

In [18]:
beforeCleaned

Unnamed: 0,total,empty,distinct,uniqueness,entropy
BOROUGH,385,0,5,0.012987,2.063419
STREET,385,0,230,0.597403,7.56836
HOUSE #,385,0,354,0.919481,8.423754
BLOCK,385,0,197,0.511688,7.184238
LOT,385,0,64,0.166234,3.336418
COMMUNITY DISTRICT,385,0,17,0.044156,3.835946


In [19]:
afterCleaned

Unnamed: 0,total,empty,distinct,uniqueness,entropy
BOROUGH,385,0,5,0.012987,2.025649
STREET,385,0,218,0.566234,7.478004
HOUSE #,385,0,354,0.919481,8.418559
BLOCK,385,0,189,0.490909,7.175365
LOT,385,0,66,0.171429,3.74154
COMMUNITY DISTRICT,385,0,1,0.002597,0.0


In [20]:
afterBoro = ds_portion.distinct('BOROUGH')
afterStreet = ds_portion.distinct('STREET')
afterHouse = ds_portion.distinct('HOUSE #')
afterBlock = ds_portion.distinct('BLOCK')
afterLot = ds_portion.distinct('LOT')
afterComDistrict = ds_portion.distinct('COMMUNITY DISTRICT')

In [21]:
beforeBoro

Counter({'BROOKLYN': 160,
         'MANHATTAN': 65,
         'BRONX': 82,
         'QUEENS': 61,
         'STATEN ISLAND': 17})

In [22]:
afterBoro

Counter({'BROOKLYN': 166,
         'BRONX': 89,
         'MANHATTAN': 67,
         'QUEENS': 53,
         'STATEN ISLAND': 10})

In [23]:
beforeStreet

Counter({'71ST AVENUE': 5,
         'HAMMELS BOULEVARD': 2,
         'METCALF AVENUE': 3,
         'RICHMOND TERRACE': 1,
         'BATCHELDER STREET': 4,
         'BRUCKNER BOULEVARD': 1,
         'BOND STREET': 1,
         'RALPH AVENUE': 8,
         'LORRAINE STREET': 1,
         'PITT STREET': 2,
         'AVENUE W': 3,
         'WESTCHESTER AVENUE': 1,
         '137TH STREET': 1,
         'WEST 103RD STREET': 3,
         'CLASON POINT LANE NORTH': 3,
         'TROY AVENUE': 1,
         'BENNETT COURT': 1,
         'ALBANY AVENUE': 2,
         'GRAFTON STREET': 1,
         'EAST 141ST STREET': 1,
         'UNION AVENUE': 1,
         'BERGEN STREET': 1,
         'MONROE STREET': 1,
         'EAGLE AVENUE': 3,
         'HOYT STREET': 3,
         '120TH AVENUE': 1,
         'FOUNTAIN AVENUE': 5,
         'AMSTERDAM AVENUE': 3,
         'CLAREMONT PARKWAY': 1,
         'LIVONIA AVENUE': 1,
         'MYRTLE AVENUE': 1,
         'DUMONT AVENUE': 6,
         'UNIVERSITY AVENUE': 3,
      

In [24]:
afterStreet

Counter({'LEXINGTON AVE': 2,
         '109 RD': 1,
         'KINGSBOROUGH 4 WALK': 1,
         'SCHOLES ST': 6,
         'COLUMBIA ST': 2,
         'SAINT JOHNS PLACE': 3,
         'AVENUE D': 3,
         'NOSTRAND AVE': 4,
         'AMSTERDAM AVE': 3,
         'EAST 152 ST': 2,
         'JACKSON ST': 2,
         'BEACH CHANNEL DR': 3,
         'LONGFELLOW AVE': 2,
         'LEWIS AVE': 2,
         'CLASON POINT LANE NORTH': 1,
         'LAFAYETTE AVE': 7,
         'LORING AVE': 1,
         'NEWTOWN RD': 2,
         'ALBANY AVE': 2,
         'LORRAINE ST': 2,
         'VERNON BLVD': 8,
         'FULTON AVE': 1,
         'BATCHELDER ST': 3,
         'EAST 163 ST': 2,
         'COURTLANDT AVE': 1,
         'BLAKE AVE': 3,
         'LA SALLE ST': 1,
         'GRAFTON ST': 2,
         'NORTH ELLIOT WALK': 2,
         'PARSONS BLVD': 7,
         'STERLING PLACE': 7,
         'EAST 143 ST': 1,
         'MOORE ST': 1,
         '1 AVE': 2,
         'BRAGG ST': 1,
         'NOBLE AVE': 4,
     

In [25]:
beforeHouse

Counter({'388': 2,
         '104': 1,
         '67-41': 1,
         '875': 1,
         '231': 1,
         '157': 1,
         '39': 1,
         '41-01': 1,
         '855': 1,
         '85-02': 1,
         '427': 1,
         '865': 1,
         '546': 1,
         '1075': 2,
         '1940': 1,
         '2185': 2,
         '3204': 1,
         '2729': 1,
         '205': 1,
         '2352': 1,
         '1768': 1,
         '1451': 1,
         '714': 1,
         '41-14': 1,
         '215': 1,
         '2220': 1,
         '1131': 1,
         '159-38': 1,
         '625': 1,
         '515': 1,
         '40': 3,
         '155-18': 1,
         '1385': 1,
         '726': 1,
         '1790A': 1,
         '4044': 1,
         '413': 1,
         '108-41': 1,
         '60': 4,
         '67-19': 1,
         '3024': 1,
         '1000': 1,
         '105-11': 1,
         '178': 1,
         '72': 1,
         '672': 1,
         '720': 1,
         '797': 1,
         '3045': 2,
         '921': 1,
         '28-02

In [26]:
afterHouse

Counter({'574': 1,
         '160': 2,
         '95': 1,
         '207': 1,
         '1677': 1,
         '12-21': 1,
         '126': 1,
         '2354': 1,
         '1825': 1,
         '353': 1,
         '113-32': 1,
         '201': 4,
         '1480': 1,
         '1131': 2,
         '2323': 1,
         '1735': 1,
         '1380': 1,
         '67-12': 1,
         '119-12': 1,
         '2770': 1,
         '77': 2,
         '508B': 1,
         '36': 1,
         '1180': 1,
         '2125': 1,
         '109-20': 1,
         '275': 1,
         '60': 2,
         '70': 4,
         '865': 3,
         '67-02': 1,
         '216': 2,
         '580': 2,
         '2630': 1,
         '345': 2,
         '359': 2,
         '644': 1,
         '228': 1,
         '229': 1,
         '48': 2,
         '1744': 1,
         '34': 2,
         '213': 1,
         '3549': 1,
         '40': 2,
         '291': 1,
         '583': 1,
         '50-03': 1,
         '101': 2,
         '778': 1,
         '1135': 1,
      

In [27]:
beforeBlock

Counter({'3130': 1,
         '1204': 3,
         '3633': 8,
         '1696': 2,
         '1874': 1,
         '2011': 1,
         '738': 3,
         '10125': 4,
         '1808': 2,
         '2654': 1,
         '3787': 1,
         '3026': 7,
         '396': 1,
         '7405': 1,
         '4292': 6,
         '3027': 3,
         '465': 4,
         '2901': 1,
         '10146': 2,
         '1203': 1,
         '1470': 8,
         '538': 8,
         '12825': 1,
         '6792': 10,
         '5582': 4,
         '4905': 9,
         '2626': 5,
         '1640': 3,
         '255': 2,
         '1719': 4,
         '1154': 4,
         '3723': 2,
         '10148': 2,
         '3593': 2,
         '3993': 1,
         '1631': 3,
         '2628': 1,
         '2040': 2,
         '4594': 1,
         '1471': 5,
         '1984': 1,
         '111': 3,
         '1933': 2,
         '1875': 1,
         '2106': 3,
         '7389': 4,
         '2894': 1,
         '2867': 1,
         '490': 1,
         '5567': 2,
  

In [28]:
afterBlock

Counter({'15782': 1,
         '538': 4,
         '2372': 1,
         '7978': 1,
         '3737': 1,
         '1610': 1,
         '12778': 1,
         '307': 1,
         '3637': 3,
         '4581': 2,
         '7387': 3,
         '1470': 15,
         '3243': 3,
         '1344': 3,
         '4444': 4,
         '533': 4,
         '3598': 1,
         '470': 3,
         '3026': 4,
         '3024': 7,
         '7067': 1,
         '332': 4,
         '3725': 2,
         '12438': 1,
         '260': 7,
         '2626': 1,
         '2557': 1,
         '2895': 1,
         '4480': 3,
         '7405': 4,
         '3098': 1,
         '3633': 12,
         '2325': 3,
         '7053': 1,
         '6792': 14,
         '1154': 1,
         '367': 1,
         '4795': 1,
         '3561': 3,
         '1221': 1,
         '196': 2,
         '4488': 1,
         '3025': 3,
         '490': 2,
         '3576': 3,
         '2429': 1,
         '1330': 1,
         '2106': 2,
         '1580': 1,
         '4508': 1,
   

In [29]:
beforeLot

Counter({'100': 17,
         '31': 2,
         '21': 7,
         '1': 211,
         '2': 11,
         '29': 1,
         '20': 2,
         '14': 1,
         '11': 8,
         '60': 4,
         '101': 5,
         '10': 6,
         '50': 4,
         '41': 1,
         '28': 2,
         '30': 11,
         '63': 1,
         '39': 2,
         '33': 2,
         '83': 1,
         '49': 1,
         '5': 5,
         '26': 1,
         '4': 4,
         '8': 4,
         '15': 4,
         '34': 2,
         '68': 1,
         '75': 1,
         '53': 1,
         '24': 2,
         '51': 1,
         '40': 2,
         '35': 1,
         '225': 3,
         '45': 2,
         '78': 1,
         '57': 1,
         '110': 1,
         '46': 4,
         '12': 1,
         '500': 3,
         '27': 1,
         '18': 3,
         '156': 1,
         '135': 1,
         '200': 3,
         '61': 4,
         '1001': 1,
         '9': 2,
         '7': 1,
         '180': 1,
         '16': 4,
         '205': 2,
         '136': 1,

In [30]:
afterLot

Counter({'1': 201,
         '21': 7,
         '9': 2,
         '19': 2,
         '55': 1,
         '2': 16,
         '11': 4,
         '20': 5,
         '50': 6,
         '28': 2,
         '12': 2,
         '22': 2,
         '80': 1,
         '30': 10,
         '8': 1,
         '101': 7,
         '23': 4,
         '75': 4,
         '3': 5,
         '24': 3,
         '41': 3,
         '18': 4,
         '60': 3,
         '53': 2,
         '15': 9,
         '46': 3,
         '32': 2,
         '59': 1,
         '90': 1,
         '100': 9,
         '5': 5,
         '225': 5,
         '47': 1,
         '61': 6,
         '500': 3,
         '51': 1,
         '45': 1,
         '17': 2,
         '1001': 2,
         '109': 1,
         '25': 2,
         '180': 1,
         '40': 4,
         '49': 2,
         '63': 3,
         '29': 1,
         '26': 2,
         '57': 1,
         '16': 3,
         '205': 2,
         '200': 2,
         '10': 3,
         '6': 1,
         '64': 1,
         '37': 1,
   

In [31]:
beforeComDistrict

Counter({'16': 47,
         '9': 40,
         '1': 55,
         '2': 19,
         '3': 55,
         '12': 32,
         '10': 16,
         '4': 6,
         '8': 21,
         '7': 12,
         '15': 4,
         '5': 20,
         '14': 7,
         '11': 22,
         '6': 8,
         '13': 8,
         '18': 13})

In [32]:
afterComDistrict

Counter({'N/A': 385})

# Export Results to csv

In [33]:
# Uncomment to write the cleaned data to a csv file
#ds_full.write('./cleaned_data.csv')
#ds_portion.write('./afterCleaned_data.csv')

[]