This notebook contains code to profile and clean the Historical DOB Permit Issuance data found here https://data.cityofnewyork.us/Housing-Development/Historical-DOB-Permit-Issuance/bty7-2jhb

# Download Dataset

In [1]:
import gzip
import os
import humanfriendly
import numpy as np
import pandas as pd

from openclean.data.source.socrata import Socrata

import warnings
warnings.filterwarnings('ignore')

# Unique id for Historical DOB Permit Issuance data set
db_id = 'bty7-2jhb'
dataset = Socrata().dataset(db_id)

datafile = f'./{db_id}.tsv.gz'

# Download file if it doesn't exist
if not os.path.isfile(datafile):
    with gzip.open(datafile, 'wb') as  f:
        print('Downloading ...\n')
        dataset.write(f)

fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print(f'Using "{dataset.name}" in file {datafile} of size {fsize}')


Using "Historical DOB Permit Issuance" in file ./bty7-2jhb.tsv.gz of size 321.34 MB


# Profiling the Data

We load the data in using opencleans stream method.

In [2]:
from openclean.pipeline import stream

ds_full = stream(datafile)

Profiling the data to get the data type for each column

In [3]:
from openclean.profiling.column import DefaultColumnProfiler

# Print the most frequent data type for each column.

print('Schema\n------')
profiles = ds_full.profile(default_profiler=DefaultColumnProfiler)
for col in ds_full.columns:
    p = profiles.column(col)
    print("  '{}' ({})".format(col, p['datatypes']['distinct'].most_common(1)[0][0]))


Schema
------
  'BOROUGH' (str)
  'BIN' (int)
  'Number' (str)
  'Street' (str)
  'Job #' (int)
  'Job doc. #' (int)
  'Job Type' (str)
  'Self_Cert' (str)
  'Block' (int)
  'Lot' (int)
  'Community Board' (int)
  'Postcode' (int)
  'Bldg Type' (int)
  'Residential' (str)
  'Special District 1' (str)
  'Special District 2' (str)
  'Work Type' (str)
  'Permit Status' (str)
  'Filing Status' (str)
  'Permit Type' (str)
  'Permit Sequence #' (int)
  'Permit Subtype' (str)
  'Oil Gas' (str)
  'Site Fill' (str)
  'Filing Date' (date)
  'Issuance Date' (date)
  'Expiration Date' (date)
  'Job Start Date' (date)
  'Permittee's First Name' (str)
  'Permittee's Last Name' (str)
  'Permittee's Business Name' (str)
  'Permittee's Phone #' (int)
  'Permittee's License Type' (str)
  'Permittee's License #' (int)
  'Act as Superintendent' (str)
  'Permittee's Other Title' (str)
  'HIC License' (int)
  'Site Safety Mgr's First Name' (str)
  'Site Safety Mgr's Last Name' (str)
  'Site Safety Mgr Busin

We used openclean profiles to create a summary of the stats for each column.

In [4]:
profiles = ds_full.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
BOROUGH,2428526,0,5,2.058862e-06,2.045161
BIN,2428526,0,300024,0.1235416,16.764061
Number,2428526,4,28639,0.01179277,11.933581
Street,2428526,4,20201,0.008318228,11.223448
Job #,2428526,0,1110544,0.4572914,19.723021
Job doc. #,2428526,0,12,4.941269e-06,0.496506
Job Type,2428526,0,6,2.470634e-06,1.855338
Self_Cert,2428526,1527841,1,1.110266e-06,0.0
Block,2428526,498,13625,0.00561155,12.54555
Lot,2428526,507,1718,0.0007075727,6.659702


We made a method to generate a histogram of a column in order to identify outliers.

In [5]:
def get_histogram(data_set, column_name):
    print(f'\n{column_name}:')
    value = data_set.distinct(column_name)
    for rank, val in enumerate(value.most_common()):
        v, freq = val
        print(f'{rank+1:<3} {v} {freq:>10,}')
# Uncomment to see example of histogram function
#get_histogram(ds_full, 'Street')

# Cleaning the Data

### Cleaning Street Column

In [6]:
from openclean_geo.address.usstreet import StandardizeUSStreetName
from openclean.function.value.null import is_empty

def clean_street_name(name):
    # Replace empty data with 'N/A'
    if is_empty(name):
        return 'N/A'
    # Function to help standardize the street names
    street_func = StandardizeUSStreetName(characters='upper', alphanum=True, repeated=False)
    name = ''.join(street_func.apply([name], threads=None))
    # The conditional statements below are used to try and reduce the remaining
    # number of outlier data by fixing some common errors revealed in the histogram.
    if name == 'CLARKE PLACE EAST':
        name = 'EAST CLARKE PLACE'
    elif name == 'EAST BEDFORD PARK BLVD':
        name = 'BEDFORD PARK BLVD EAST'
    elif name == 'WTC':
        name = 'WORLD TRADE CTR'
    elif name == 'TIME SQ':
        name = 'TIMES SQ'
    elif name == 'PITT':
        name = 'PITT ST'
    elif name == 'BOGARDUS':
        name = 'BOGARDUS PLACE'
    elif name == 'NAGLE':
        name = 'NAGLE AVE'
    elif name == 'SHEPHERD':
        name = 'SHEPHERD AVE'

    split_name = name.split()
    if len(split_name) == 0:
        return 'N/A'
    
    if split_name[-1] in ['SSTREET', 'STRET', 'STREET', 'STREE']:
        split_name[-1] = 'ST'
    elif split_name[-1] == 'PL':
        split_name[-1] = 'PLACE'
    elif split_name[-1].isnumeric():
        split_name.append('ST')
    elif split_name[0] == 'ST':
        split_name[0] = 'SAINT'
    elif split_name[-1] == 'E':
        split_name[-1] = 'EAST'
    elif split_name[-1] == 'W':
        split_name[-1] = 'WEST'
    elif split_name[-1] == 'N':
        split_name[-1] = 'NORTH'
    elif split_name[-1] in ['S', 'SOUIH']:
        split_name[-1] = 'SOUTH'
    elif split_name[-1] in ['BLDV', 'BLV', 'BOULEVARD', 'BOOULEVARD']:
        split_name[-1] = 'BLVD'

    name = ' '.join(split_name)

    return name

def clean_street_data(ds):
    cleaned_data = ds.update('Street', lambda x: clean_street_name(x))
    return cleaned_data

# Street profile after cleaning
ds_full = clean_street_data(ds_full)
street_data = ds_full.select(columns=['Street'])
after_clean_street = street_data.profile(default_profiler=DefaultColumnProfiler)
# The number of unique values in Street is significantly lower after
# standardizing and cleaning the data
after_clean_street.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Street,2428526,0,8576,0.003531,10.576541


### Cleaning Number Column

In [7]:
def clean_number(num):
    if is_empty(num):
        return 'N/A'
    # remove any leading zero's
    num = num.lstrip('0')
    if len(num) == 0:
        return 'N/A'

    return num

def clean_number_data(ds):
    cleaned_data = ds.update('Number', lambda x: clean_number(x)) 
    return cleaned_data

ds_full = clean_number_data(ds_full)

### Clean Permit Type Column
Profiling revealed that there is only one row missing a value for Permit Type. The Work Type for that row is PL, which stands for Plumbing. Every other tuple in the dataset that had a Work Type of PL also had a Permit Type PL. So I think it's fair to assume that the missing Permit Type should be filled with PL since it has Work Type PL.

In [8]:
# Replace the only missing Permit Type with PL since its work type is PL and
# all other rows with work type PL have permit type PL also
def clean_permit_type(ds):
    cleaned_data = ds.update('Permit Type', lambda x: 'PL' if is_empty(x) else x) 
    return cleaned_data

ds_full = clean_permit_type(ds_full)

### Clean Block and Lot Column
Block and Lot are both values that are assigned by the Deparment of Finance and depend on the address of the building. This web application https://stevemorse.org/vital/nycblocklot.html takes in Borough, Number, and Street and outputs the Block and Lot values by scraping http://maps.nyc.gov/doitt/nycitymap/ . So I used this web app to try and fill in some of the missing Block and Lot data.

In [9]:
import requests
import urllib.parse

# Make a request the url to try and find the block and lot for an address.
# Replace missing data with N/A if it fails to find a value
# This requires you to first clean street and number data
def get_block_and_lot(borough, number, street, block, lot):
    # Return if block and lot are already filled in
    if not is_empty(block) and not is_empty(lot):
        return borough, number, street, block, lot
    if not is_empty(number) and not is_empty(street):
        req = f'https://stevemorse.org/vital/nycblocklot.php?borough={borough.title()}&number={number}&street={urllib.parse.quote(street.title())}'
        r = requests.get(req)
        if r.text == "Callback('?', '?');":
            # Try again without applying title() to street value
            req = f'https://stevemorse.org/vital/nycblocklot.php?borough={borough.title()}&number={number}&street={urllib.parse.quote(street)}'
            r = requests.get(req)
        if r.status_code == 200:
            r_str = r.text.removeprefix('Callback(').removesuffix(');').replace("'", '')
            block_lot = r_str.split(',')
            if is_empty(block):
                block = block_lot[0].strip()
            if is_empty(lot):
                lot = block_lot[1].strip()
    # Return 'N/A' if the web app was unable to find the block and lot data
    # for this input
    block = 'N/A' if block == '?' else block
    lot = 'N/A' if lot == '?' else lot
    return borough, number, street, block, lot

def clean_block_and_lot(ds):
    needed_columns = ['BOROUGH','Number','Street','Block','Lot']
    cleaned_data = ds.update(needed_columns, lambda bo, n, s, bl, l: get_block_and_lot(bo, n, s, bl, l))
    return cleaned_data

ds_full = clean_block_and_lot(ds_full)

### Clean Community Board Data
Community Board is a 3-digit identifier with the first digit being the Borough code and the last two digits representing the community board code for that building. This data contained a lot of missing and incorrect values for community board. We were unable to find an API to identify the correct community board code for an address, so we are just replacing all empty and invalid data with 'N/A'. 

In [10]:
# Replace missing or incorrect data with N/A
def fix_community_board_data(data):
    is_valid = True
    if not is_empty(data) and len(data) == 3:
        for i in range(3):
            if not data[i].isnumeric():
                is_valid = False
                break
    else:
        is_valid = False
    if is_valid and data[0].isnumeric():
        if int(data[0]) > 5:
            is_valid = False
    if is_valid:
        return data
    else:
        return 'N/A'

def clean_community_board_data(ds):
    cleaned_data = ds.update('Community Board', lambda x: fix_community_board_data(x)) 
    return cleaned_data

ds_full = clean_community_board_data(ds_full)

### Clean Remaining missing data
For columns that are marked as not required or don't have a means to identify the correct value for missing data, we simply replaced the empty value with the string 'N/A'. 

In [11]:
# Replace unrequired and missing data with N/A
def replace_empty_data(self_cert, postcode, bldg_type, residential, special1, special2, work_type, permit_status):
    if is_empty(self_cert):
        self_cert = 'N/A'
    if is_empty(postcode):
        postcode = 'N/A'
    if is_empty(bldg_type):
        bldg_type = 'N/A'
    if is_empty(residential):
        residential = 'N/A'
    if is_empty(special1):
        special1 = 'N/A'
    if is_empty(special2):
        special2 = 'N/A'
    if is_empty(work_type):
        work_type = 'N/A'
    if is_empty(permit_status):
        permit_status = 'N/A'
    return self_cert, postcode, bldg_type, residential, special1, special2, work_type, permit_status

def clean_missing_data(ds):
    COLUMNS= [
        'Self_Cert',
        'Postcode',
        'Bldg Type',
        'Residential',
        'Special District 1',
        'Special District 2',
        'Work Type',
        'Permit Status'
    ]
    cleaned_data = ds.update(COLUMNS, lambda s, po, b, r, s1, s2, w, pe: replace_empty_data(s, po, b, r, s1, s2, w, pe))
    return cleaned_data

ds_full = clean_missing_data(ds_full)

### Profiling after cleaning first 20 columns

In [12]:
# Example with first 20 columns
COLUMNS_SUBSET = ['BOROUGH','BIN','Number','Street','Job #',
                  'Job doc. #','Job Type','Self_Cert','Block',
                  'Lot','Community Board','Postcode','Bldg Type',
                  'Residential','Special District 1','Special District 2',
                  'Work Type','Permit Status','Filing Status','Permit Type'
]
ds = ds_full.select(columns=COLUMNS_SUBSET)

profiles = ds.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
BOROUGH,2428526,0,5,2.058862e-06,2.045161
BIN,2428526,0,300024,0.1235416,16.764061
Number,2428526,0,28600,0.01177669,11.932774
Street,2428526,0,8576,0.00353136,10.576541
Job #,2428526,0,1110544,0.4572914,19.723021
Job doc. #,2428526,0,12,4.941269e-06,0.496506
Job Type,2428526,0,6,2.470634e-06,1.855338
Self_Cert,2428526,0,2,8.235448e-07,0.951343
Block,2428526,0,13626,0.005610811,12.545786
Lot,2428526,0,1719,0.0007078368,6.661165


### Clean Owner's  House State
We imported a dataset of state code data as a reference of valid state values. Then replaced any state value that didn't occurr in the reference data with 'N/A'

In [13]:
from openclean.data.refdata import RefStore

def clean_state(name, states_ref):
    # Return 'N/A' if the state value is invalid
    if name not in states_ref:
        return 'N/A'
    else:
        return name

def clean_owner_state_data(ds):
    # Download the license plate state codes dataset.
    refdata = RefStore()
    refdata\
        .load('nyc.gov:dof:state_codes', auto_download=True)\
        .df()\
        .head()

    # Get set of distinct state codes.
    states_ref = refdata.load('nyc.gov:dof:state_codes', auto_download=True).distinct('code')

    cleaned_data = ds.update('Owner’s House State', lambda x: clean_state(x, states_ref))
    return cleaned_data

ds_full = clean_owner_state_data(ds_full)

### Clean Owner's House Street

In [14]:
def clean_owner_street_data(ds):
    cleaned_data = ds.update("Owner's House Street Name", lambda x: clean_street_name(x))
    return cleaned_data

ds_full = clean_owner_street_data(ds_full)

### Clean Owner's House City
Used a reference of data of all the cities in the united states. Then I used fuzzy matching to match the input city to a valid city value in the reference data set.

In [15]:
from fuzzywuzzy import fuzz

def clean_city_name(name, valid_city_lookup):
    if is_empty(name):
        return 'N/A'
    return valid_city_lookup.get(name.upper(), 'N/A')


def clean_owner_city_data(ds):
    # Download the license plate state codes dataset.
    refdata = RefStore()
    refdata\
        .load('encyclopaedia_britannica:us_cities', auto_download=True)\
        .df()\

    # Get set of distinct state codes.
    city_ref = refdata.load('encyclopaedia_britannica:us_cities', auto_download=True).distinct('city')
    # Get list of distinct owner house city names
    city_names = ds.distinct('Owner’s House City')
    # Init lookup dictionary for fuzzy matching
    city_ref_lookup = {}
    # compare city name to each valid city and add to lookup table
    # if the similarity is high enough
    for city in city_names:
        if is_empty(city):
            continue
        name = city.upper()
        found_match = False
        for valid_city in city_ref:
            percent_match = fuzz.ratio(valid_city.upper(), name)
            if percent_match > 90:
                found_match = True
                city_ref_lookup[name] = valid_city.upper()
                break
        if not found_match:
            city_ref_lookup[name] = 'N/A'

    cleaned_data = ds.update('Owner’s House City', lambda x: clean_city_name(x, city_ref_lookup))
    return cleaned_data

ds_full = clean_owner_city_data(ds_full)

## Profiling Cols 21 - 40

In [16]:
# load
#data = pd.read_csv('Historical_DOB_Permit_Issuance.csv')
data = ds_full.to_df()
cols = data[data.columns[20:40]]
cols.columns

Index(['Permit Sequence #', 'Permit Subtype', 'Oil Gas', 'Site Fill',
       'Filing Date', 'Issuance Date', 'Expiration Date', 'Job Start Date',
       'Permittee's First Name', 'Permittee's Last Name',
       'Permittee's Business Name', 'Permittee's Phone #',
       'Permittee's License Type', 'Permittee's License #',
       'Act as Superintendent', 'Permittee's Other Title', 'HIC License',
       'Site Safety Mgr's First Name', 'Site Safety Mgr's Last Name',
       'Site Safety Mgr Business Name'],
      dtype='object')

In [17]:
cols.head()

Unnamed: 0,Permit Sequence #,Permit Subtype,Oil Gas,Site Fill,Filing Date,Issuance Date,Expiration Date,Job Start Date,Permittee's First Name,Permittee's Last Name,Permittee's Business Name,Permittee's Phone #,Permittee's License Type,Permittee's License #,Act as Superintendent,Permittee's Other Title,HIC License,Site Safety Mgr's First Name,Site Safety Mgr's Last Name,Site Safety Mgr Business Name
0,1,,,,2010-11-05T00:00:00,2010-11-05T00:00:00,2011-11-05T00:00:00,2010-11-05T00:00:00,LAWRENCE,LEVINE,"PAR PLUMBING CO., INC",2129261088,MASTER PLUMBER,161,,,,,,
1,12,FN,,NONE,2012-01-30T00:00:00,2012-01-30T00:00:00,2013-01-29T00:00:00,2002-08-08T00:00:00,ANTHONY,RASULO,RIVERBAY CORP,7183203300,GENERAL CONTRACTOR,1962,,,,,,
2,3,,,NONE,2008-02-04T00:00:00,2008-02-04T00:00:00,2009-02-03T00:00:00,2005-08-29T00:00:00,OSCAR,JACKSON,PERFECT PLUMBING & HETING CORP,7185157055,MASTER PLUMBER,594,,,,,,
3,1,,,NONE,1998-08-31T00:00:00,1998-08-31T00:00:00,1999-08-31T00:00:00,1998-08-31T00:00:00,GERI,KAUUMBA,GOWIE PLUMBING,7188821281,MASTER PLUMBER,1137,Y,,,,,
4,1,MH,,NONE,2007-04-30T00:00:00,2007-04-30T00:00:00,2008-01-08T00:00:00,2007-04-30T00:00:00,GARY,ZYSMAN,THE DU-RITE INC,2013877000,GENERAL CONTRACTOR,9872,Y,,,,,


In [18]:
cols.tail()

Unnamed: 0,Permit Sequence #,Permit Subtype,Oil Gas,Site Fill,Filing Date,Issuance Date,Expiration Date,Job Start Date,Permittee's First Name,Permittee's Last Name,Permittee's Business Name,Permittee's Phone #,Permittee's License Type,Permittee's License #,Act as Superintendent,Permittee's Other Title,HIC License,Site Safety Mgr's First Name,Site Safety Mgr's Last Name,Site Safety Mgr Business Name
2428521,1,,,ON-SITE,2003-10-08T00:00:00,2003-10-08T00:00:00,2004-04-24T00:00:00,2003-10-08T00:00:00,SALVATORE,CALCAGNO,SALVATORE CALCAGNO CONSTRUCTION,7184426800,GENERAL CONTRACTOR,3107,Y,,,,,
2428522,1,,,NONE,1996-07-29T00:00:00,1997-07-28T00:00:00,1998-07-10T00:00:00,1996-07-29T00:00:00,VITO,MELELEO,LAVINIO CONSTR. INC.,7189487735,GENERAL CONTRACTOR,5209,Y,GC,,,,
2428523,1,,,NONE,1999-07-09T00:00:00,1999-07-09T00:00:00,2000-07-08T00:00:00,1999-07-09T00:00:00,CHARLES,BERWIND,A. BERWIND PLUMBING & HEATING,5165935333,MASTER PLUMBER,180,Y,,,,,
2428524,2,,,NONE,1996-06-25T00:00:00,1996-06-25T00:00:00,1997-06-07T00:00:00,1994-08-09T00:00:00,FRANK,CIOLLO,FRANK C. PL. & HEATING CORP.,7189812366,MASTER PLUMBER,1357,Y,,,,,
2428525,1,,,OFF-SITE,1999-09-20T00:00:00,1999-09-20T00:00:00,2000-01-01T00:00:00,1999-09-20T00:00:00,KENNETH,FROHLICK,GATEWAY DEMOLITION CORP.,7183591400,GENERAL CONTRACTOR,1023,Y,,,,,


In [19]:
# types of cols with missing values
cols.dtypes[cols.isnull().any()]

Series([], dtype: object)

In [20]:
# show the number (percentage) of missing values for each col
x = cols.isnull().sum()
y = cols.isnull().sum() / cols.shape[0] * 100
z = {'Number of missing values' : x, 'Percentage of missing values' : y}
df = pd.DataFrame(z, columns = ['Number of missing values', 'Percentage of missing values'])
df.sort_values(by = 'Percentage of missing values', ascending = False)

Unnamed: 0,Number of missing values,Percentage of missing values
Permit Sequence #,0,0.0
Permit Subtype,0,0.0
Site Safety Mgr's Last Name,0,0.0
Site Safety Mgr's First Name,0,0.0
HIC License,0,0.0
Permittee's Other Title,0,0.0
Act as Superintendent,0,0.0
Permittee's License #,0,0.0
Permittee's License Type,0,0.0
Permittee's Phone #,0,0.0


## Data Clean Cols 21 - 40

### Permit Subtype

In [21]:
cols['Permit Subtype'].value_counts()

      1010293
OT     585308
FN     215908
MH     214293
SH     125409
SP      80185
BL      51996
FP      50161
FB      31488
SF      31106
FS      17035
SD       9189
EA       5005
CH        909
FA        239
SC          2
Name: Permit Subtype, dtype: int64

In [22]:
cols['Permit Subtype'].isnull().sum()

0

In [23]:
# Fill missing value with NA
cols['Permit Subtype'].fillna("N/A", inplace=True)

In [24]:
cols['Permit Subtype'].isnull().sum()

0

### Oil Gas

In [25]:
cols['Oil Gas'].value_counts()

       2397492
OIL      29215
GAS       1819
Name: Oil Gas, dtype: int64

In [26]:
cols['Oil Gas'].isnull().sum()

0

In [27]:
# Fill missing value with NA
cols['Oil Gas'].fillna("N/A", inplace=True)

### Site Fill

In [28]:
cols["Site Fill"].value_counts()

NONE                   1553658
NOT APPLICABLE          371778
ON-SITE                 243255
                        167703
OFF-SITE                 83449
USE UNDER 300 CU.YD       8683
Name: Site Fill, dtype: int64

In [29]:
cols['Site Fill'].fillna("N/A", inplace=True)
cols['Site Fill'].replace("NONE", "N/A", inplace=True)
cols['Site Fill'].replace("USE UNDER 300 CU.YD", "N/A", inplace=True)

In [30]:
cols["Site Fill"].isnull().sum()

0

### Filling Data

In [31]:
cols['Filing Date'].value_counts()

2007-03-29T00:00:00    998
2007-03-30T00:00:00    981
2006-12-28T00:00:00    927
2008-01-07T00:00:00    920
2007-12-28T00:00:00    900
                      ... 
2011-11-08T00:00:00      1
2012-02-25T00:00:00      1
2011-12-24T00:00:00      1
2008-09-06T00:00:00      1
2003-10-18T00:00:00      1
Name: Filing Date, Length: 6415, dtype: int64

In [32]:
# Remove Time after Date
cols['Filing Date'] = cols['Filing Date'].str.split("T")
cols['Filing Date'] = cols['Filing Date'].apply(lambda x: x[0])

cols['Filing Date'].value_counts()

2007-03-29    998
2007-03-30    981
2006-12-28    927
2008-01-07    920
2007-12-28    900
             ... 
2011-11-08      1
2012-02-25      1
2011-12-24      1
2008-09-06      1
2003-10-18      1
Name: Filing Date, Length: 6415, dtype: int64

In [33]:
#filter out invalid date

startDate = '1989-01-01'
endDate = '2013-12-31'

afterStartDate = cols['Job Start Date'] >= startDate
beforeEndDate = cols['Job Start Date'] <= endDate
daysBetween = afterStartDate & beforeEndDate

cols['Filing Date'] = cols['Filing Date'].loc[daysBetween]

cols['Filing Date'].value_counts()

2007-03-29    998
2007-03-30    981
2006-12-28    927
2008-01-07    920
2007-12-28    900
             ... 
2011-11-08      1
2012-02-25      1
2011-12-24      1
2008-09-06      1
2003-10-18      1
Name: Filing Date, Length: 6415, dtype: int64

### Issuance Date

In [34]:
cols['Issuance Date'].value_counts()

2007-03-29T00:00:00    994
2007-03-30T00:00:00    959
2006-12-28T00:00:00    947
2007-12-28T00:00:00    918
2008-06-27T00:00:00    909
                      ... 
2006-06-18T00:00:00      1
2013-04-13T00:00:00      1
2012-11-04T00:00:00      1
2011-02-20T00:00:00      1
2003-10-18T00:00:00      1
Name: Issuance Date, Length: 6409, dtype: int64

In [35]:
cols['Issuance Date'].isnull().sum()

0

In [36]:
# Remove Time after Date
cols['Issuance Date'] = cols['Issuance Date'].str.split("T")
cols['Issuance Date'] = cols['Issuance Date'].apply(lambda x: x[0])

cols['Issuance Date'].value_counts()

2007-03-29    994
2007-03-30    959
2006-12-28    947
2007-12-28    918
2008-06-27    909
             ... 
2006-06-18      1
2013-04-13      1
2012-11-04      1
2011-02-20      1
2003-10-18      1
Name: Issuance Date, Length: 6409, dtype: int64

In [37]:
#filter out invalid date

startDate = '1989-01-01'
endDate = '2013-12-31'

afterStartDate = cols['Job Start Date'] >= startDate
beforeEndDate = cols['Job Start Date'] <= endDate
daysBetween = afterStartDate & beforeEndDate

cols['Issuance Date'] = cols['Issuance Date'].loc[daysBetween]

cols['Issuance Date'].value_counts()

2007-03-29    994
2007-03-30    959
2006-12-28    947
2007-12-28    918
2008-06-27    909
             ... 
2007-05-28      1
2013-04-07      1
2012-03-17      1
2011-01-01      1
2003-10-18      1
Name: Issuance Date, Length: 6409, dtype: int64

### Expiration Date

In [38]:
cols['Expiration Date'].value_counts()

2007-12-31T00:00:00    18638
2006-12-31T00:00:00    18065
2005-12-31T00:00:00    16359
2004-12-31T00:00:00    13974
2009-04-01T00:00:00    11426
                       ...  
2014-09-05T00:00:00        1
2015-03-05T00:00:00        1
2209-04-28T00:00:00        1
1989-09-12T00:00:00        1
2016-09-23T00:00:00        1
Name: Expiration Date, Length: 9025, dtype: int64

In [39]:
cols['Expiration Date'].isnull().sum()

0

In [40]:
# take care of missing value
cols['Expiration Date'].fillna("N/A", inplace=True)

In [41]:
# Remove Time after Date
cols['Expiration Date'] = cols['Expiration Date'].str.split("T")
cols['Expiration Date'] = cols['Expiration Date'].apply(lambda x: x[0])

cols['Expiration Date'].value_counts()

2007-12-31    18638
2006-12-31    18065
2005-12-31    16359
2004-12-31    13974
2009-04-01    11426
              ...  
2014-09-05        1
2015-03-05        1
2209-04-28        1
1989-09-12        1
2016-09-23        1
Name: Expiration Date, Length: 9025, dtype: int64

In [42]:
#filter out invalid date

startDate = '1989-01-01'
endDate = '2013-12-31'

afterStartDate = cols['Job Start Date'] >= startDate
beforeEndDate = cols['Job Start Date'] <= endDate
daysBetween = afterStartDate & beforeEndDate

cols['Expiration Date'] = cols['Expiration Date'].loc[daysBetween]

cols['Expiration Date'].value_counts()

2007-12-31    18634
2006-12-31    18063
2005-12-31    16357
2004-12-31    13972
2009-04-01    11426
              ...  
2014-05-27        1
2014-08-06        1
2014-12-01        1
2014-07-01        1
1990-01-06        1
Name: Expiration Date, Length: 9011, dtype: int64

### Job Start Date

In [43]:
cols['Job Start Date'].value_counts()

2008-06-27T00:00:00    1376
2008-06-25T00:00:00    1095
2007-07-17T00:00:00    1077
2004-06-09T00:00:00    1063
2008-07-29T00:00:00    1051
                       ... 
2077-10-24T00:00:00       1
2002-06-29T00:00:00       1
2201-06-15T00:00:00       1
2001-08-05T00:00:00       1
2015-01-25T00:00:00       1
Name: Job Start Date, Length: 7476, dtype: int64

In [44]:
cols['Job Start Date'].isnull().sum()

0

In [45]:
cols['Job Start Date'].fillna("N/A", inplace=True)

In [46]:
cols['Job Start Date'] = cols['Job Start Date'].str.split("T")
cols['Job Start Date'] = cols['Job Start Date'].apply(lambda x: x[0])

cols['Job Start Date'].value_counts()

2008-06-27    1376
2008-06-25    1095
2007-07-17    1077
2004-06-09    1063
2008-07-29    1051
              ... 
2077-10-24       1
2002-06-29       1
2201-06-15       1
2001-08-05       1
2015-01-25       1
Name: Job Start Date, Length: 7476, dtype: int64

In [47]:
#filter out invalid date

startDate = '1989-01-01'
endDate = '2013-12-31'

afterStartDate = cols['Job Start Date'] >= startDate
beforeEndDate = cols['Job Start Date'] <= endDate
daysBetween = afterStartDate & beforeEndDate

cols['Job Start Date'] = cols['Job Start Date'].loc[daysBetween]
cols['Job Start Date'].value_counts()

2008-06-27    1376
2008-06-25    1095
2007-07-17    1077
2004-06-09    1063
2008-07-29    1051
              ... 
2007-07-08       1
1990-07-08       1
2005-10-01       1
1994-04-03       1
2001-08-26       1
Name: Job Start Date, Length: 7261, dtype: int64

### Permittee's First Name

In [48]:
cols["Permittee's First Name"].isnull().sum()

0

In [49]:
# issues: having different symbols before or after
# ex: _ (__)` MR.  \{ . /ANN`   -   (, JR)   X.
# contains first name and middle name (or middle initial)
cols["Permittee's First Name"].value_counts()

JOHN           112451
MICHAEL         76081
ROBERT          69481
JOSEPH          57833
PETER           52864
                ...  
TONIE               1
LEONARD{            1
KLEANTHAIES         1
GEORGEQ             1
VILAJET             1
Name: Permittee's First Name, Length: 35868, dtype: int64

In [50]:
# Getting the first name only
# some names contain '-', " ", "_", "/" between first and middle name
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.split(" ").str[0]
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.split("-").str[0]
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.split("/").str[0]
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.split("\\").str[0]
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.split("_").str[0]

# further clean the first name
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.strip("_")
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.strip("`")
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.strip("MR.")
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.strip("\\")
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.strip("{")
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.strip(".")                                                                      
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.strip("-")                                                                    
cols["Permittee's First Name"] = cols["Permittee's First Name"].str.strip(",")

# Not-a-name & missing value rows are given "NA"
cols["Permittee's First Name"].str.isnumeric().replace("N/A",inplace=True)
cols["Permittee's First Name"].fillna("N/A", inplace=True)

In [51]:
cols["Permittee's First Name"].value_counts()

JOHN         112921
ICHAEL        76390
OBERT         69635
JOSEPH        58011
PETE          53722
              ...  
AIGOZATA          1
DINATALIE         1
APENCE            1
JANAN             1
VILAJET           1
Name: Permittee's First Name, Length: 28984, dtype: int64

In [52]:
cols["Permittee's First Name"].isnull().sum()

0

### Permittee's Last Name

In [53]:
# having similar issue with "Permittee's First Name"
cols["Permittee's Last Name"].value_counts()

SINGH        28500
WHITE        19733
LEE          16399
MARTINEZ     15557
BROWN        14565
             ...  
FERRAIR          1
'HKRELI          1
GLEXANDER        1
DENNIGER         1
DANSIGER         1
Name: Permittee's Last Name, Length: 85626, dtype: int64

In [54]:
cols["Permittee's Last Name"].isnull().sum()

0

In [55]:
# some names contain '-', " ", "_", "/" between last name & other coarse strings
cols["Permittee's Last Name"] = cols["Permittee's Last Name"].str.split(" ").str[0]
cols["Permittee's Last Name"] = cols["Permittee's Last Name"].str.split("-").str[0]
cols["Permittee's Last Name"] = cols["Permittee's Last Name"].str.split("/").str[0]
cols["Permittee's Last Name"] = cols["Permittee's Last Name"].str.split("\\").str[0]
cols["Permittee's Last Name"] = cols["Permittee's Last Name"].str.split("_").str[0]

# further clean the Last name
cols["Permittee's Last Name"] = cols["Permittee's Last Name"].str.strip("_")
cols["Permittee's Last Name"] = cols["Permittee's Last Name"].str.strip("`")
cols["Permittee's Last Name"] = cols["Permittee's Last Name"].str.strip("\\")
cols["Permittee's Last Name"] = cols["Permittee's Last Name"].str.strip("{")
cols["Permittee's Last Name"] = cols["Permittee's Last Name"].str.strip(".")                                                                      
cols["Permittee's Last Name"] = cols["Permittee's Last Name"].str.strip("-")                                                                    
cols["Permittee's Last Name"] = cols["Permittee's Last Name"].str.strip(",")

# Not-a-name & missing value rows are given "NA"
cols["Permittee's Last Name"].str.isnumeric().replace("N/A",inplace=True)
cols["Permittee's Last Name"].fillna("N/A", inplace=True)

In [56]:
cols["Permittee's Last Name"].value_counts()

SINGH       28530
WHITE       19817
LEE         16425
MARTINEZ    15560
BROWN       14578
            ...  
PERCHUK         1
VIRANZA         1
RAQMOS          1
GURBQLL         1
DANSIGER        1
Name: Permittee's Last Name, Length: 80924, dtype: int64

In [57]:
cols["Permittee's Last Name"].isnull().sum()

0

### Permittee's Business Name

In [58]:
cols["Permittee's Business Name"].value_counts()

N/A                              27050
ROCKLEDGE SCAFFOLD                7732
STRUCTURE TONE INC                7461
EVEREST SCAFFOLDING INC           7368
OWNER                             5090
                                 ...  
HAMDAL CONSTRUCTION CO.              1
REDMOND WINDOWS AND DOORS            1
SVC. PLUS PLBG & HTG CO. INC         1
CORP. CONSTR. PROJ. MANA.            1
FINE LIVING CONSTRUCTION CORP        1
Name: Permittee's Business Name, Length: 351179, dtype: int64

In [59]:
cols["Permittee's Business Name"].isnull().sum()

0

In [60]:
# Missing value means the name is not available (replace with NA)
#cols["Permittee's Business Name"].fillna("NA")
cols["Permittee's Business Name"].replace(("N.A", "na","n.a"), ("N/A","N/A","N/A"),inplace=True)

# some contains business address instead of business name
streetValue = ["st", "St", "ST", "street", "Street", "STREET"]
cols["Permittee's Business Name"].str.contains('|'.join(streetValue)).replace("N/A", inplace=True)

In [61]:
cols["Permittee's Business Name"].value_counts()

N/A                              28787
ROCKLEDGE SCAFFOLD                7732
STRUCTURE TONE INC                7461
EVEREST SCAFFOLDING INC           7368
OWNER                             5090
                                 ...  
FUNCTIOAL CONSTR INC                 1
ALL CITY MECHANICAL, INC             1
PHOENX 132 SUTTON ST INC             1
J. HARONE , CO                       1
FINE LIVING CONSTRUCTION CORP        1
Name: Permittee's Business Name, Length: 351178, dtype: int64

### Permittee's Phone #

In [62]:
cols["Permittee's Phone #"].isnull().sum()

0

In [63]:
# some #s followed by a .0 (xxxxxxxxx.0)
# missing value means no # is available
cols["Permittee's Phone #"] = cols["Permittee's Phone #"].astype(str)
cols["Permittee's Phone #"] = cols["Permittee's Phone #"].str.split(".")
cols["Permittee's Phone #"] = cols["Permittee's Phone #"].apply(lambda x: x[0])
cols["Permittee's Phone #"].fillna("N/A",inplace=True)

In [64]:
cols["Permittee's Phone #"].isnull().sum()

0

In [65]:
cols["Permittee's Phone #"].value_counts()

2124816100    16556
7183924921    11749
9144230400    11396
2129261088     9812
7185894900     9110
              ...  
7182633277        1
7183942021        1
718828111P        1
7188516462        1
7189485837        1
Name: Permittee's Phone #, Length: 145122, dtype: int64

### Permittee's License Type

In [66]:
cols["Permittee's License Type"].value_counts()

GENERAL CONTRACTOR             1343651
MASTER PLUMBER                  528867
                                254952
FIRE SUPPRESSION CONTRACTOR     128567
OIL BURNER INSTALLER             58574
SIGN HANGER                      50004
OWNER                            36295
NO WORK                          15367
DEMOLITION CONTRACTOR             5188
PROFESSIONAL ENGINEER             3433
REGISTERED ARCHITECT              2944
HOME IMPROVEMENT CONTRACTOR        684
Name: Permittee's License Type, dtype: int64

In [67]:
cols["Permittee's License Type"].isnull().sum()

0

In [68]:
# Not sure what "NO WORK" means, but can't do anything to it
# missing value means no # is available
cols["Permittee's License Type"].fillna("N/A",inplace=True)

In [69]:
cols["Permittee's License Type"].isnull().sum()

0

### Permittee's License #

In [70]:
cols["Permittee's License #"].value_counts()

          221196
0          78260
2660       14500
1982       13282
2593       11656
           ...  
605173         1
31668          1
36559          1
3137           1
18330          1
Name: Permittee's License #, Length: 46747, dtype: int64

In [71]:
cols["Permittee's License #"].isnull().sum()

0

In [72]:
# a single digit from 0 to 9 means not available, or wrong input

singleNumber = ['0','1','2','3','4','5','6','7','8','9']
cols["Permittee's License #"].replace(singleNumber,'N/A',inplace=True)


In [73]:
# missing value means no # is available

cols["Permittee's License #"].str.strip(" ")
cols["Permittee's License #"].str.strip("\\")
cols["Permittee's License #"].str.strip(",")
cols["Permittee's License #"].str.strip("`")
cols["Permittee's License #"].str.strip("*")

cols["Permittee's License #"] = cols["Permittee's License #"].astype(str)
cols["Permittee's License #"] = cols["Permittee's License #"].str.split(".")
cols["Permittee's License #"] = cols["Permittee's License #"].apply(lambda x: x[0])
cols["Permittee's License #"].fillna("N/A",inplace=True)

In [74]:
cols["Permittee's License #"].isnull().sum()

0

### Act as Superintendent

In [75]:
cols['Act as Superintendent'].value_counts()

Y    1586974
      833300
N       8252
Name: Act as Superintendent, dtype: int64

In [76]:
cols['Act as Superintendent'].isnull().sum()

0

In [77]:
# Fill missing value with NA
cols['Act as Superintendent'].fillna("N/A", inplace=True)

In [78]:
cols['Act as Superintendent'].isnull().sum()

0

### Permittee's Other Title

In [79]:
cols["Permittee's Other Title"].value_counts()

                   2160267
GC                  105964
G.C.                 63698
G.C                  15668
CONTRACTOR           13935
                    ...   
SPRIN CONTRACTO          1
SUB CONT.                1
REFIR CONTRAC            1
IL BURNER INST           1
TANK #61130423           1
Name: Permittee's Other Title, Length: 2798, dtype: int64

In [80]:
cols["Permittee's Other Title"].isnull().sum()

0

In [81]:
GC = ["GC","GC .","G.C .","GCC.","G?C","G.,C.","G..C.","G,C,","G.C. (SIGNS)","DEMO CON","DEMO. CONTRACTR","GEN CONTACTOR","G .C.","G.CONTRACTOR","GEN  CONTRACTOR","G.C.","_GC","_G.C","G.C","G C","GC.","G.C..","G/C","_G.C.","___GC","G. C.","G.G..","G. C","G..C","G..","G/C/","G,C.","GEN. CONTRACTOR","GEN CONTRACTOR","GEN CONTR.","GEN. CONTR.","GENERAL CONTR.","GENERAL CONTRAC","GC/OWNER","GEN CON","GENERAL","GEN CONT"]
CONTRACTOR = ["CONTR.","CONTR","CONT","CON.","CONT.","C"]
DEMO_CONTRACTOR = ["DEMO CONTR","DEMO. CONTR.","DEMO.CON.","DEMO CONT","DEMO CONTR{","DEMO CONT.","DEMO.CON","DEMO. CONT.","DEMO. CONTRACTO","DEMO CONTR.","DEMO.","DEMO.CONTRACTOR","DEMOL CONTR"]
OIL_BURN_INST = ["OIL BURNER INS.","OIL BURNER INSL","OIL BURNER INST","B INSTALLER","OILBURNER","O.B.INSTALLER","FUEL OIL INST","OIL BURNER INS","B. INSTALLER","OIL BURNER INST","OIL BURNER","FUEL OIL","OIL BUR. INSTR.","OBI","OIL INSTALLER","OIL INSTALLER","OIL BURN. INSTA","OIL BUR.","OIL BRN INSTL","TANK INSTALLER","FUEL OIL INST.","BOILER INSTALL.","BOILER INSTALL","BOILER INSTALLE","BURNER INSTALLE","O.B. INSTALLER","O.B.I.","OB INSTALLER","O.B.","BOIL INSTALLER",""]
SIGN = ["SIGN MFG.","SIGN MFG","SIGN MGF","SIGN HANGER-164","SIGN HGR","SIGN MGR","SIGN HANG. #168","SIGN MANUFACT.","SIGN RIGGER","SIGN ERECTOR","SGN.HNGR.","SIGN HANGER-168","SIGN HANGERS","SIGN BROKER","SIGN HANGER-157","SG MFG","SGN HNGR","SIGN CONTRACTOR","SIGN MFGR.",]

cols["Permittee's Other Title"].replace(GC,"GENERAL CONTRACTOR",inplace=True)
cols["Permittee's Other Title"].replace(CONTRACTOR,"CONTRACTOR",inplace=True)
cols["Permittee's Other Title"].replace(DEMO_CONTRACTOR,"DEMOLITION CONTRACTOR",inplace=True)
cols["Permittee's Other Title"].replace(OIL_BURN_INST,"OIL BURNER INSTALLER",inplace=True)
cols["Permittee's Other Title"].replace(SIGN,"SIGN HANGER",inplace=True)

### HIC License

In [82]:
cols["HIC License"].value_counts()

           2401722
0              845
9999999        551
45141          405
1251318        332
            ...   
1011135          1
354933           1
880320           1
904922           1
1115514          1
Name: HIC License, Length: 5513, dtype: int64

In [83]:
cols["HIC License"].isnull().sum()

0

In [84]:
# remove ending .0s
cols["HIC License"] = cols["HIC License"].astype(str)
cols["HIC License"] = cols["HIC License"].str.split(".")
cols["HIC License"] = cols["HIC License"].apply(lambda x: x[0])
singleNumber = ['0','1','2','3','4','5','6','7','8','9']

# remove invalid inputs
cols["HIC License"].replace(singleNumber,'N/A',inplace=True)
cols["HIC License"].replace('nan','N/A',inplace=True)

# Fill missing value with NA
cols["HIC License"].fillna("N/A", inplace=True)

### Site Safety Mgr's First Name

In [85]:
cols["Site Safety Mgr's First Name"].value_counts()

                  2417665
N/A                   547
JOHN                  503
MICHAEL               476
ADMINISTRATIVE        376
                   ...   
JERZY                   1
RASHID                  1
ALAAN                   1
STERN                   1
SABAN                   1
Name: Site Safety Mgr's First Name, Length: 509, dtype: int64

In [86]:
cols["Site Safety Mgr's First Name"].isnull().sum()

0

In [87]:
# some names contain '-', " ", "_", "/" between name & other coarse strings
cols["Site Safety Mgr's First Name"] = cols["Site Safety Mgr's First Name"].str.split(" ").str[0]
cols["Site Safety Mgr's First Name"] = cols["Site Safety Mgr's First Name"].str.split("-").str[0]
cols["Site Safety Mgr's First Name"] = cols["Site Safety Mgr's First Name"].str.split("/").str[0]
cols["Site Safety Mgr's First Name"] = cols["Site Safety Mgr's First Name"].str.split("\\").str[0]
cols["Site Safety Mgr's First Name"] = cols["Site Safety Mgr's First Name"].str.split("_").str[0]

# further clean the name
cols["Site Safety Mgr's First Name"] = cols["Site Safety Mgr's First Name"].str.strip("_")
cols["Site Safety Mgr's First Name"] = cols["Site Safety Mgr's First Name"].str.strip("`")
cols["Site Safety Mgr's First Name"] = cols["Site Safety Mgr's First Name"].str.strip("\\")
cols["Site Safety Mgr's First Name"] = cols["Site Safety Mgr's First Name"].str.strip("{")
cols["Site Safety Mgr's First Name"] = cols["Site Safety Mgr's First Name"].str.strip(".")                                                                      
cols["Site Safety Mgr's First Name"] = cols["Site Safety Mgr's First Name"].str.strip("-")                                                                    
cols["Site Safety Mgr's First Name"] = cols["Site Safety Mgr's First Name"].str.strip(",")

# Not-a-name & missing value rows are given "NA"
cols["Site Safety Mgr's First Name"].str.isnumeric().replace("N/A",inplace=True)
cols["Site Safety Mgr's First Name"].fillna("N/A", inplace=True)

### Site Safety Mgr's Last Name

In [88]:
cols["Site Safety Mgr's Last Name"].value_counts()

             2417665
N/A              522
WAIVER           376
ESPOSITO          96
SAME              94
              ...   
CLAIRA             1
SHUELEN            1
ROSENBERG          1
SARCONA            1
CRISPIANO          1
Name: Site Safety Mgr's Last Name, Length: 1238, dtype: int64

In [89]:
cols["Site Safety Mgr's Last Name"].isnull().sum()

0

In [90]:
# some names contain '-', " ", "_", "/" between name & other coarse strings
cols["Site Safety Mgr's Last Name"] = cols["Site Safety Mgr's Last Name"].str.split(" ").str[0]
cols["Site Safety Mgr's Last Name"] = cols["Site Safety Mgr's Last Name"].str.split("-").str[0]
cols["Site Safety Mgr's Last Name"] = cols["Site Safety Mgr's Last Name"].str.split("/").str[0]
cols["Site Safety Mgr's Last Name"] = cols["Site Safety Mgr's Last Name"].str.split("\\").str[0]
cols["Site Safety Mgr's Last Name"] = cols["Site Safety Mgr's Last Name"].str.split("_").str[0]

# further clean the name
cols["Site Safety Mgr's Last Name"] = cols["Site Safety Mgr's Last Name"].str.strip("_")
cols["Site Safety Mgr's Last Name"] = cols["Site Safety Mgr's Last Name"].str.strip("`")
cols["Site Safety Mgr's Last Name"] = cols["Site Safety Mgr's Last Name"].str.strip("\\")
cols["Site Safety Mgr's Last Name"] = cols["Site Safety Mgr's Last Name"].str.strip("{")
cols["Site Safety Mgr's Last Name"] = cols["Site Safety Mgr's Last Name"].str.strip(".")                                                                      
cols["Site Safety Mgr's Last Name"] = cols["Site Safety Mgr's Last Name"].str.strip("-")                                                                    
cols["Site Safety Mgr's Last Name"] = cols["Site Safety Mgr's Last Name"].str.strip(",")

# Not-a-name & missing value rows are given "NA"
cols["Site Safety Mgr's Last Name"].str.isnumeric().replace("N/A",inplace=True)
cols["Site Safety Mgr's Last Name"].fillna("N/A", inplace=True)

### Site Safety Mgr Business Name

In [91]:
cols["Site Safety Mgr Business Name"].value_counts()

                                    2419949
N/A                                     621
TOTAL SAFETY CONSULTING                 507
SITE SAFETY LLC                         428
NYC DEPT. OF BUILDINGS                  378
                                     ...   
B.T.U. CONTRACTING                        1
RICHARD PLUMBING & HEASTING CO.I          1
SELECT SAFTY CONSULANTS                   1
TURNER CONSTRUCTION COMP                  1
ACE PIPING CONTRACTORS                    1
Name: Site Safety Mgr Business Name, Length: 1260, dtype: int64

In [92]:
cols["Site Safety Mgr Business Name"].isnull().sum()

0

In [93]:
# clean the name
cols["Site Safety Mgr Business Name"] = cols["Site Safety Mgr Business Name"].str.strip("_")
cols["Site Safety Mgr Business Name"] = cols["Site Safety Mgr Business Name"].str.strip("`")
cols["Site Safety Mgr Business Name"] = cols["Site Safety Mgr Business Name"].str.strip("\\")
cols["Site Safety Mgr Business Name"] = cols["Site Safety Mgr Business Name"].str.strip("{")
cols["Site Safety Mgr Business Name"] = cols["Site Safety Mgr Business Name"].str.strip(".")                                                                      
cols["Site Safety Mgr Business Name"] = cols["Site Safety Mgr Business Name"].str.strip("-")                                                                    
cols["Site Safety Mgr Business Name"] = cols["Site Safety Mgr Business Name"].str.strip(",")


# Not-a-name & missing value rows are given "NA"
cols["Site Safety Mgr Business Name"].str.isnumeric().replace("N/A",inplace=True)
cols["Site Safety Mgr Business Name"].fillna("N/A", inplace=True)

## Export Result Data Set Columns 21-40

In [94]:
cols.to_csv(r'./resultCols.csv', index = False)