This notebook contains code to profile and clean the Historical DOB Permit Issuance data found here https://data.cityofnewyork.us/Housing-Development/Historical-DOB-Permit-Issuance/bty7-2jhb

# Download Dataset

In [1]:
import gzip
import os
import humanfriendly
import numpy as np
import pandas as pd

from openclean.data.source.socrata import Socrata

import warnings
warnings.filterwarnings('ignore')

# Unique id for Historical DOB Permit Issuance data set
db_id = 'bty7-2jhb'
dataset = Socrata().dataset(db_id)

datafile = f'./{db_id}.tsv.gz'

# Download file if it doesn't exist
if not os.path.isfile(datafile):
    with gzip.open(datafile, 'wb') as  f:
        print('Downloading ...\n')
        dataset.write(f)

fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print(f'Using "{dataset.name}" in file {datafile} of size {fsize}')


Using "Historical DOB Permit Issuance" in file ./bty7-2jhb.tsv.gz of size 321.34 MB


# Profiling the Data

We load the data in using opencleans stream method.

In [2]:
from openclean.pipeline import stream

ds_full = stream(datafile)

Profiling the data to get the data type for each column

In [3]:
from openclean.profiling.column import DefaultColumnProfiler

# Print the most frequent data type for each column.

print('Schema\n------')
profiles = ds_full.profile(default_profiler=DefaultColumnProfiler)
for col in ds_full.columns:
    p = profiles.column(col)
    print("  '{}' ({})".format(col, p['datatypes']['distinct'].most_common(1)[0][0]))


Schema
------
  'BOROUGH' (str)
  'BIN' (int)
  'Number' (str)
  'Street' (str)
  'Job #' (int)
  'Job doc. #' (int)
  'Job Type' (str)
  'Self_Cert' (str)
  'Block' (int)
  'Lot' (int)
  'Community Board' (int)
  'Postcode' (int)
  'Bldg Type' (int)
  'Residential' (str)
  'Special District 1' (str)
  'Special District 2' (str)
  'Work Type' (str)
  'Permit Status' (str)
  'Filing Status' (str)
  'Permit Type' (str)
  'Permit Sequence #' (int)
  'Permit Subtype' (str)
  'Oil Gas' (str)
  'Site Fill' (str)
  'Filing Date' (date)
  'Issuance Date' (date)
  'Expiration Date' (date)
  'Job Start Date' (date)
  'Permittee's First Name' (str)
  'Permittee's Last Name' (str)
  'Permittee's Business Name' (str)
  'Permittee's Phone #' (int)
  'Permittee's License Type' (str)
  'Permittee's License #' (int)
  'Act as Superintendent' (str)
  'Permittee's Other Title' (str)
  'HIC License' (int)
  'Site Safety Mgr's First Name' (str)
  'Site Safety Mgr's Last Name' (str)
  'Site Safety Mgr Busin

We used openclean profiles to create a summary of the stats for each column.

In [4]:
profiles = ds_full.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
BOROUGH,2428526,0,5,2.058862e-06,2.045161
BIN,2428526,0,300024,0.1235416,16.764061
Number,2428526,4,28639,0.01179277,11.933581
Street,2428526,4,20201,0.008318228,11.223448
Job #,2428526,0,1110544,0.4572914,19.723021
Job doc. #,2428526,0,12,4.941269e-06,0.496506
Job Type,2428526,0,6,2.470634e-06,1.855338
Self_Cert,2428526,1527841,1,1.110266e-06,0.0
Block,2428526,498,13625,0.00561155,12.54555
Lot,2428526,507,1718,0.0007075727,6.659702


We made a method to generate a histogram of a column in order to identify outliers.

In [5]:
def get_histogram(data_set, column_name):
    print(f'\n{column_name}:')
    value = data_set.distinct(column_name)
    for rank, val in enumerate(value.most_common()):
        v, freq = val
        print(f'{rank+1:<3} {v} {freq:>10,}')
# Uncomment to see example of histogram function
#get_histogram(ds_full, 'Street')

# Cleaning the Data

### Cleaning Street Column

In [6]:
from openclean_geo.address.usstreet import StandardizeUSStreetName
from openclean.function.value.null import is_empty

def clean_street_name(name):
    # Replace empty data with 'N/A'
    if is_empty(name):
        return 'N/A'
    # Function to help standardize the street names
    street_func = StandardizeUSStreetName(characters='upper', alphanum=True, repeated=False)
    name = ''.join(street_func.apply([name], threads=None))
    # The conditional statements below are used to try and reduce the remaining
    # number of outlier data by fixing some common errors revealed in the histogram.
    if name == 'CLARKE PLACE EAST':
        name = 'EAST CLARKE PLACE'
    elif name == 'EAST BEDFORD PARK BLVD':
        name = 'BEDFORD PARK BLVD EAST'
    elif name == 'WTC':
        name = 'WORLD TRADE CTR'
    elif name == 'TIME SQ':
        name = 'TIMES SQ'
    elif name == 'PITT':
        name = 'PITT ST'
    elif name == 'BOGARDUS':
        name = 'BOGARDUS PLACE'
    elif name == 'NAGLE':
        name = 'NAGLE AVE'
    elif name == 'SHEPHERD':
        name = 'SHEPHERD AVE'

    split_name = name.split()
    if len(split_name) == 0:
        return 'N/A'
    
    if split_name[-1] in ['SSTREET', 'STRET', 'STREET', 'STREE']:
        split_name[-1] = 'ST'
    elif split_name[-1] == 'PL':
        split_name[-1] = 'PLACE'
    elif split_name[-1].isnumeric():
        split_name.append('ST')
    elif split_name[0] == 'ST':
        split_name[0] = 'SAINT'
    elif split_name[-1] == 'E':
        split_name[-1] = 'EAST'
    elif split_name[-1] == 'W':
        split_name[-1] = 'WEST'
    elif split_name[-1] == 'N':
        split_name[-1] = 'NORTH'
    elif split_name[-1] in ['S', 'SOUIH']:
        split_name[-1] = 'SOUTH'
    elif split_name[-1] in ['BLDV', 'BLV', 'BOULEVARD', 'BOOULEVARD']:
        split_name[-1] = 'BLVD'

    name = ' '.join(split_name)

    return name

def clean_street_data(ds):
    cleaned_data = ds.update('Street', lambda x: clean_street_name(x))
    return cleaned_data

# Street profile after cleaning
ds_full = clean_street_data(ds_full)
street_data = ds_full.select(columns=['Street'])
after_clean_street = street_data.profile(default_profiler=DefaultColumnProfiler)
# The number of unique values in Street is significantly lower after
# standardizing and cleaning the data
after_clean_street.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Street,2428526,0,8576,0.003531,10.576541


### Cleaning Number Column

In [7]:
def clean_number(num):
    if is_empty(num):
        return 'N/A'
    # remove any leading zero's
    num = num.lstrip('0')
    if len(num) == 0:
        return 'N/A'

    return num

def clean_number_data(ds):
    cleaned_data = ds.update('Number', lambda x: clean_number(x)) 
    return cleaned_data

ds_full = clean_number_data(ds_full)

### Clean Permit Type Column
Profiling revealed that there is only one row missing a value for Permit Type. The Work Type for that row is PL, which stands for Plumbing. Every other tuple in the dataset that had a Work Type of PL also had a Permit Type PL. So I think it's fair to assume that the missing Permit Type should be filled with PL since it has Work Type PL.

In [8]:
# Replace the only missing Permit Type with PL since its work type is PL and
# all other rows with work type PL have permit type PL also
def clean_permit_type(ds):
    cleaned_data = ds.update('Permit Type', lambda x: 'PL' if is_empty(x) else x) 
    return cleaned_data

ds_full = clean_permit_type(ds_full)

### Clean Block and Lot Column
Block and Lot are both values that are assigned by the Deparment of Finance and depend on the address of the building. This web application https://stevemorse.org/vital/nycblocklot.html takes in Borough, Number, and Street and outputs the Block and Lot values by scraping http://maps.nyc.gov/doitt/nycitymap/ . So I used this web app to try and fill in some of the missing Block and Lot data.

In [9]:
import requests
import urllib.parse

# Make a request the url to try and find the block and lot for an address.
# Replace missing data with N/A if it fails to find a value
# This requires you to first clean street and number data
def get_block_and_lot(borough, number, street, block, lot):
    # Return if block and lot are already filled in
    if not is_empty(block) and not is_empty(lot):
        return borough, number, street, block, lot
    if not is_empty(number) and not is_empty(street):
        req = f'https://stevemorse.org/vital/nycblocklot.php?borough={borough.title()}&number={number}&street={urllib.parse.quote(street.title())}'
        r = requests.get(req)
        if r.text == "Callback('?', '?');":
            # Try again without applying title() to street value
            req = f'https://stevemorse.org/vital/nycblocklot.php?borough={borough.title()}&number={number}&street={urllib.parse.quote(street)}'
            r = requests.get(req)
        if r.status_code == 200:
            r_str = r.text.removeprefix('Callback(').removesuffix(');').replace("'", '')
            block_lot = r_str.split(',')
            if is_empty(block):
                block = block_lot[0].strip()
            if is_empty(lot):
                lot = block_lot[1].strip()
    # Return 'N/A' if the web app was unable to find the block and lot data
    # for this input
    block = 'N/A' if block == '?' else block
    lot = 'N/A' if lot == '?' else lot
    return borough, number, street, block, lot

def clean_block_and_lot(ds):
    needed_columns = ['BOROUGH','Number','Street','Block','Lot']
    cleaned_data = ds.update(needed_columns, lambda bo, n, s, bl, l: get_block_and_lot(bo, n, s, bl, l))
    return cleaned_data

ds_full = clean_block_and_lot(ds_full)

### Clean Community Board Data
Community Board is a 3-digit identifier with the first digit being the Borough code and the last two digits representing the community board code for that building. This data contained a lot of missing and incorrect values for community board. We were unable to find an API to identify the correct community board code for an address, so we are just replacing all empty and invalid data with 'N/A'. 

In [10]:
# Replace missing or incorrect data with N/A
def fix_community_board_data(data):
    is_valid = True
    if not is_empty(data) and len(data) == 3:
        for i in range(3):
            if not data[i].isnumeric():
                is_valid = False
                break
    else:
        is_valid = False
    if is_valid and data[0].isnumeric():
        if int(data[0]) > 5:
            is_valid = False
    if is_valid:
        return data
    else:
        return 'N/A'

def clean_community_board_data(ds):
    cleaned_data = ds.update('Community Board', lambda x: fix_community_board_data(x)) 
    return cleaned_data

ds_full = clean_community_board_data(ds_full)

### Clean Remaining missing data
For columns that are marked as not required or don't have a means to identify the correct value for missing data, we simply replaced the empty value with the string 'N/A'. 

In [11]:
# Replace unrequired and missing data with N/A
def replace_empty_data(self_cert, postcode, bldg_type, residential, special1, special2, work_type, permit_status):
    if is_empty(self_cert):
        self_cert = 'N/A'
    if is_empty(postcode):
        postcode = 'N/A'
    if is_empty(bldg_type):
        bldg_type = 'N/A'
    if is_empty(residential):
        residential = 'N/A'
    if is_empty(special1):
        special1 = 'N/A'
    if is_empty(special2):
        special2 = 'N/A'
    if is_empty(work_type):
        work_type = 'N/A'
    if is_empty(permit_status):
        permit_status = 'N/A'
    return self_cert, postcode, bldg_type, residential, special1, special2, work_type, permit_status

def clean_missing_data(ds):
    COLUMNS= [
        'Self_Cert',
        'Postcode',
        'Bldg Type',
        'Residential',
        'Special District 1',
        'Special District 2',
        'Work Type',
        'Permit Status'
    ]
    cleaned_data = ds.update(COLUMNS, lambda s, po, b, r, s1, s2, w, pe: replace_empty_data(s, po, b, r, s1, s2, w, pe))
    return cleaned_data

ds_full = clean_missing_data(ds_full)

### Profiling after cleaning first 20 columns

In [12]:
# Example with first 20 columns
COLUMNS_SUBSET = ['BOROUGH','BIN','Number','Street','Job #',
                  'Job doc. #','Job Type','Self_Cert','Block',
                  'Lot','Community Board','Postcode','Bldg Type',
                  'Residential','Special District 1','Special District 2',
                  'Work Type','Permit Status','Filing Status','Permit Type'
]
ds = ds_full.select(columns=COLUMNS_SUBSET)

profiles = ds.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
BOROUGH,2428526,0,5,2.058862e-06,2.045161
BIN,2428526,0,300024,0.1235416,16.764061
Number,2428526,0,28600,0.01177669,11.932774
Street,2428526,0,8576,0.00353136,10.576541
Job #,2428526,0,1110544,0.4572914,19.723021
Job doc. #,2428526,0,12,4.941269e-06,0.496506
Job Type,2428526,0,6,2.470634e-06,1.855338
Self_Cert,2428526,0,2,8.235448e-07,0.951343
Block,2428526,0,13626,0.005610811,12.545786
Lot,2428526,0,1719,0.0007078368,6.661165


### Clean Owner's  House State
We imported a dataset of state code data as a reference of valid state values. Then replaced any state value that didn't occurr in the reference data with 'N/A'

In [13]:
from openclean.data.refdata import RefStore

def clean_state(name, states_ref):
    # Return 'N/A' if the state value is invalid
    if name not in states_ref:
        return 'N/A'
    else:
        return name

def clean_owner_state_data(ds):
    # Download the license plate state codes dataset.
    refdata = RefStore()
    refdata\
        .load('nyc.gov:dof:state_codes', auto_download=True)\
        .df()\
        .head()

    # Get set of distinct state codes.
    states_ref = refdata.load('nyc.gov:dof:state_codes', auto_download=True).distinct('code')

    cleaned_data = ds.update('Owner’s House State', lambda x: clean_state(x, states_ref))
    return cleaned_data

ds_full = clean_owner_state_data(ds_full)

### Clean Owner's House Street

In [14]:
def clean_owner_street_data(ds):
    cleaned_data = ds.update("Owner's House Street Name", lambda x: clean_street_name(x))
    return cleaned_data

ds_full = clean_owner_street_data(ds_full)

### Clean Owner's House City
Used a reference of data of all the cities in the united states. Then I used fuzzy matching to match the input city to a valid city value in the reference data set.

In [15]:
from fuzzywuzzy import fuzz

def clean_city_name(name, valid_city_lookup):
    if is_empty(name):
        return 'N/A'
    return valid_city_lookup.get(name.upper(), 'N/A')


def clean_owner_city_data(ds):
    # Download the license plate state codes dataset.
    refdata = RefStore()
    refdata\
        .load('encyclopaedia_britannica:us_cities', auto_download=True)\
        .df()\

    # Get set of distinct state codes.
    city_ref = refdata.load('encyclopaedia_britannica:us_cities', auto_download=True).distinct('city')
    # Get list of distinct owner house city names
    city_names = ds.distinct('Owner’s House City')
    # Init lookup dictionary for fuzzy matching
    city_ref_lookup = {}
    # compare city name to each valid city and add to lookup table
    # if the similarity is high enough
    for city in city_names:
        if is_empty(city):
            continue
        name = city.upper()
        found_match = False
        for valid_city in city_ref:
            percent_match = fuzz.ratio(valid_city.upper(), name)
            if percent_match > 90:
                found_match = True
                city_ref_lookup[name] = valid_city.upper()
                break
        if not found_match:
            city_ref_lookup[name] = 'N/A'

    cleaned_data = ds.update('Owner’s House City', lambda x: clean_city_name(x, city_ref_lookup))
    return cleaned_data

ds_full = clean_owner_city_data(ds_full)

### Permit Subtype

In [None]:
def clean_permit_subtype_data(ds):
    cleaned_data = ds.update('Permit Subtype', lambda x: 'N/A' if is_empty(x) else x) 
    return cleaned_data

ds_full = clean_permit_subtype_data(ds_full)

### Oil Gas

In [None]:
def clean_oil_gas_data(ds):
    cleaned_data = ds.update('Oil Gas', lambda x: 'N/A' if is_empty(x) else x) 
    return cleaned_data

ds_full = clean_oil_gas_data(ds_full)

### Site Fill

In [None]:
def clean_site_fill(val):
    if is_empty(val):
        return 'N/A'
    if val in ['NONE', 'USE UNDER 300 CU.YD']
        return 'N/A'

    return val

def clean_site_fill_data(ds):
    cleaned_data = ds.update('Site Fill', lambda x: clean_site_fill(x)) 
    return cleaned_data

ds_full = clean_site_fill_data(ds_full)

### Filing Date

In [None]:
def clean_date(val):
    if is_empty(val):
        return 'N/A'
    # Remove Time after Date
    val = val.split("T")
    val = val[0]
    
    #filter out invalid date
    startDate = '1989-01-01'
    endDate = '2013-12-31'

    afterStartDate = val >= startDate
    beforeEndDate = val <= endDate
    daysBetween = afterStartDate & beforeEndDate

    if daysBetween:
        return val
    else:
        return 'N/A'

def clean_filing_date_data(ds):
    cleaned_data = ds.update('Filing Date', lambda x: clean_date(x)) 
    return cleaned_data

ds_full = clean_filing_date_data(ds_full)

### Issuance Date

In [None]:
def clean_issuance_date_data(ds):
    cleaned_data = ds.update('Issuance Date', lambda x: clean_date(x)) 
    return cleaned_data

ds_full = clean_issuance_date_data(ds_full)

### Expiration Date

In [None]:
def clean_expiration_date_data(ds):
    cleaned_data = ds.update('Expiration Date', lambda x: clean_date(x)) 
    return cleaned_data

ds_full = clean_expiration_date_data(ds_full)

### Job Start Date

In [None]:
def clean_job_start_date_data(ds):
    cleaned_data = ds.update('Job Start Date', lambda x: clean_date(x)) 
    return cleaned_data

ds_full = clean_job_start_date_data(ds_full)

### Permittee's First Name

In [None]:
def clean_first_name(name):
    if is_empty(name):
        return 'N/A'
    
    name = name.removeprefix('MR. ')
    name = name.removeprefix('MR ')
    name = name.removeprefix('\\')
    name = name.removeprefix(' ')
    name = name.strip('_')

    # Getting the first name only
    # some names contain '-', " ", "_", "/" between first and middle name
    name = name.split(" ")[0]
    name = name.split("-")[0]
    name = name.split("/")[0]
    name = name.split("\\")[0]
    name = name.split("_")[0]

    # further clean the first name
    name = name.strip("_")
    name = name.strip("`")
    name = name.strip("\\")
    name = name.strip("{")
    name = name.strip(".")                                                                      
    name = name.strip("-")                                                                    
    name = name.strip(",")

    # Not-a-name & missing value rows are given "NA"
    if is_empty(name) or name.isnumeric():
        return 'N/A'

    return name

def clean_permittee_first_name_data(ds):
    cleaned_data = ds.update("Permittee's First Name", lambda x: clean_first_name(x)) 
    return cleaned_data

ds_full = clean_permitee_first_name(ds_full)

### Permittee's Last Name

In [None]:
def clean_last_name(name):
    if is_empty(name):
        return 'N/A'
    name = name.removeprefix('\\\\')
    name = name.removeprefix('\\')
    name = name.removeprefix('\\ ')
    name = name.removeprefix('/')
    name = name.removeprefix('\\\'')
    name = name.removeprefix('0 ')
    name = name.removeprefix('11 ')
    name = name.strip('_')
    # Getting the last name only
    # some names contain '-', " ", "_", "/" between first and middle name
    name = name.split(" ")[0]
    name = name.split("-")[0]
    name = name.split("/")[0]
    name = name.split("\\")[0]
    name = name.split("_")[0]

    # further clean the last name
    name = name.strip("_")
    name = name.strip("`")
    name = name.strip("\\")
    name = name.strip("{")
    name = name.strip(".")                                                                      
    name = name.strip("-")                                                                    
    name = name.strip(",")

    # Not-a-name & missing value rows are given "NA"
    if is_empty(name) or name.isnumeric():
        return 'N/A'

    return name

def clean_permittee_last_name_data(ds):
    cleaned_data = ds.update("Permittee's Last Name", lambda x: clean_last_name(x)) 
    return cleaned_data

ds_full = clean_permitee_last_name(ds_full)

### Permittee's Business Name

In [None]:
def clean_permittee_business_name(name):
    if is_empty(name):
        return 'N/A'
    na_list = ['N.A', 'na', 'n.a', 'NA', 'OWNER']
    for na in na_list:
        if name == na:
            return 'N/A'

    street_values = ["st", "St", "ST", "street", "Street", "STREET"]
    stripped_name = name.strip()
    for part in stripped_name:
        if part in street_values:
            return 'N/A'

    return name

def clean_permittee_business_name_data(ds):
    cleaned_data = ds.update("Permittee's Business Name", lambda x: clean_permittee_business_name(x)) 
    return cleaned_data

ds_full = clean_permittee_business_name_data(ds_full)

### Permittee's Phone #

In [None]:
def clean_permittee_phone(num):
    if is_empty(num):
        return 'N/A'
    num = num.split('.')[0]
    return num

def clean_permittee_phone_data(ds):
    cleaned_data = ds.update("Permittee's Phone #", lambda x: clean_permittee_phone(x)) 
    return cleaned_data

ds_full = clean_permittee_phone_data(ds_full)

### Permittee's License Type

In [None]:
def clean_permittee_license_type_data(ds):
    cleaned_data = ds.update("Permittee's License Type", lambda x: 'N/A' if is_empty(x) else x) 
    return cleaned_data

ds_full = clean_permittee_license_type_data(ds_full)

### Permittee's License #

In [None]:
def clean_permittee_license_num(num):
    if is_empty(num):
        return 'N/A'
    
    num = num.strip(" ")
    num = num.strip("`")
    num = num.strip("\\")
    num = num.strip("*")                                                                    
    num = num.strip(",")
    
    num = num.split('.')[0]

    # Not-a-name & missing value rows are given "NA"
    if is_empty(num):
        return 'N/A'

    return num

def clean_permittee_license_num_data(ds):
    cleaned_data = ds.update("Permittee's License #", lambda x: clean_permittee_license_num(x)) 
    return cleaned_data

ds_full = clean_permittee_license_num_data(ds_full)

### Act as Superintendent

In [None]:
def clean_act_superintendent_data(ds):
    cleaned_data = ds.update('Act as Superintendent', lambda x: 'N/A' if is_empty(x) else x) 
    return cleaned_data

ds_full = clean_act_superintendent_data(ds_full)

### Permittee's Other Title

In [None]:
def clean_permittee_other_title(val):
    if is_empty(val):
        return 'N/A'

    GC = ["GC","GC .","G.C .","GCC.","G?C","G.,C.","G..C.","G,C,","G.C. (SIGNS)","DEMO CON","DEMO. CONTRACTR","GEN CONTACTOR","G .C.","G.CONTRACTOR","GEN  CONTRACTOR","G.C.","_GC","_G.C","G.C","G C","GC.","G.C..","G/C","_G.C.","___GC","G. C.","G.G..","G. C","G..C","G..","G/C/","G,C.","GEN. CONTRACTOR","GEN CONTRACTOR","GEN CONTR.","GEN. CONTR.","GENERAL CONTR.","GENERAL CONTRAC","GC/OWNER","GEN CON","GENERAL","GEN CONT"]
    CONTRACTOR = ["CONTR.","CONTR","CONT","CON.","CONT.","C"]
    DEMO_CONTRACTOR = ["DEMO CONTR","DEMO. CONTR.","DEMO.CON.","DEMO CONT","DEMO CONTR{","DEMO CONT.","DEMO.CON","DEMO. CONT.","DEMO. CONTRACTO","DEMO CONTR.","DEMO.","DEMO.CONTRACTOR","DEMOL CONTR"]
    OIL_BURN_INST = ["OIL BURNER INS.","OIL BURNER INSL","OIL BURNER INST","B INSTALLER","OILBURNER","O.B.INSTALLER","FUEL OIL INST","OIL BURNER INS","B. INSTALLER","OIL BURNER INST","OIL BURNER","FUEL OIL","OIL BUR. INSTR.","OBI","OIL INSTALLER","OIL INSTALLER","OIL BURN. INSTA","OIL BUR.","OIL BRN INSTL","TANK INSTALLER","FUEL OIL INST.","BOILER INSTALL.","BOILER INSTALL","BOILER INSTALLE","BURNER INSTALLE","O.B. INSTALLER","O.B.I.","OB INSTALLER","O.B.","BOIL INSTALLER",""]
    SIGN = ["SIGN MFG.","SIGN MFG","SIGN MGF","SIGN HANGER-164","SIGN HGR","SIGN MGR","SIGN HANG. #168","SIGN MANUFACT.","SIGN RIGGER","SIGN ERECTOR","SGN.HNGR.","SIGN HANGER-168","SIGN HANGERS","SIGN BROKER","SIGN HANGER-157","SG MFG","SGN HNGR","SIGN CONTRACTOR","SIGN MFGR.",]
    for g in GC:
        if val == g:
            return 'GENERAL CONTRACTOR'
    for c in CONTRACTOR:
        if val == c:
            return 'CONTRACTOR'
    for dc in DEMO_CONTRACTOR:
        if val == dc:
            return 'DEMOLITION CONTRACTOR'
    for o in OIL_BURN_INST:
        if val == o:
            return 'OIL BURNER INSTALLLER'
    for s in SIGN:
        if val == s:
            return 'SIGN HANGER'
    return val

def clean_permittee_other_title_data(ds):
    cleaned_data = ds.update("Permittee's Other Title", lambda x: clean_permittee_other_title(x)) 
    return cleaned_data

ds_full = clean_permittee_other_title_data(ds_full)

### HIC License

In [None]:
def clean_hic_license_num(num):
    if is_empty(num):
        return 'N/A'
    num = num.split('.')[0]
    single_number = ['0','1','2','3','4','5','6','7','8','9']

    for n in single_number:
        if num == n:
            return 'N/A'
        
    if is_empty(num):
        return 'N/A'

    return num

def clean_hic_license_num_data(ds):
    cleaned_data = ds.update("HIC License", lambda x: clean_hic_license_num(x)) 
    return cleaned_data

ds_full = clean_hic_license_num_data(ds_full)

### Site Safety Mgr's First Name

In [None]:
def clean_safety_mgr_first_name_data(ds):
    cleaned_data = ds.update("Site Safety Mgr's First Name", lambda x: clean_first_name(x)) 
    return cleaned_data

ds_full = clean_safety_mgr_first_name(ds_full)

### Site Safety Mgr's Last Name

In [None]:
def clean_safety_mgr_last_name_data(ds):
    cleaned_data = ds.update("Site Safety Mgr's Last Name", lambda x: clean_last_name(x)) 
    return cleaned_data

ds_full = clean_safety_mgr_last_name(ds_full)

### Site Safety Mgr Business Name

In [None]:
def clean_safety_mgr_business_name(name):
    if is_empty(name):
        return 'N/A'
    name = name.strip("_")
    name = name.strip("`")
    name = name.strip("\\")
    name = name.strip("{")
    name = name.strip(".")                                                                      
    name = name.strip("-")                                                                    
    name = name.strip(",")

    if is_empty(name):
        return 'N/A'
    
    return name

def clean_safety_mgr_business_name_data(ds):
    cleaned_data = ds.update("Site Safety Mgr Business Name", lambda x: clean_safety_mgr_business_name(x)) 
    return cleaned_data

ds_full = clean_safety_mgr_business_name_data(ds_full)

## Profile cleaned data

In [None]:
profiles = ds_full.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

## Export Result Data Set

In [94]:
ds_full.write('./cleaned_data.csv')