In [24]:
from openclean.data.load import dataset
from openclean.pipeline import stream
import pandas as pd

data_dir = '../project_data/'

data_list = [
    'data-cityofnewyork-us.8eq5-dtjb.csv', 
    'data-cityofnewyork-us.emuv-tx7t.csv', 
    'data-cityofnewyork-us.gt6r-wh7c.csv', 
    'data-cityofnewyork-us.un8d-rbed.csv', 
    'data-cityofnewyork-us.m6ad-jy3s.csv', 
    'data-cityofnewyork-us.wye7-nyek.csv', 
    'data-cityofnewyork-us.bty7-2jhb.csv', 
    'data-cityofnewyork-us.xrwg-eczf.csv', 
    'data-cityofnewyork-us.3rfa-3xsf.csv', 
    'data-cityofnewyork-us.aiww-p3af.csv', 
    'data-cityofnewyork-us.cwy2-px8b.csv', 
    'data-cityofnewyork-us.hy4q-igkk.csv'
]


# Park Borough also included in igkk & p3af

data_column = [
    'Borough', 
    'Borough', 
    'Borough', 
    'Borough', 
    'Borough', 
    'Borough', 
    'BOROUGH', 
    'Borough', 
    'Borough', 
    'Borough', 
    'Borough', 
    'Borough'
]

In [145]:
# our original strategy:
# 1. padding missing value with value 'UNKNOWN'
# 2. using uppercase for columns which could be used as reference data, such as BOROUGH
# 3. using knn cluster to check the spelling error
from openclean.cluster.knn import knn_clusters, knn_collision_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan

from openclean.function.value.null import is_empty
from openclean.operator.transform.update import update


def calc_effectiveness(problem_rows, cleaned_rows):
    precision = intersected_num / cleaned_rows * 1.0 if cleaned_rows != 0 else 0.0
    recall = intersected_num / problem_rows * 1.0 if problem_rows != 0 else 0.0
    print(f"Data cleaned with precision {precision} and recall {recall} in {intersected_num} cleaning rows")
    return precision, recall

def print_cluster(cnumber, cluster):
    print('Cluster {} (of size {})\n'.format(cnumber, len(cluster)))
    for val, count in cluster.items():
        print('{} ({})'.format(val, count))
    print('\nSuggested value: {}\n\n'.format(cluster.suggestion()))

def perform_knn_cluster(ds_full, column, using_collision=True, minsize=2, t=0.6):
    values = ds_full.select(column).distinct()
    clusters = knn_clusters(values=values, sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(t)), minsize=minsize) \
        if using_collision else knn_collision_clusters(values=values, sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(t)), minsize=minsize)
    print('{} clusters of size {} or greater'.format(len(clusters), minsize))
    clusters.sort(key=lambda c: len(c), reverse=True)
    for i, cluster in enumerate(clusters):
        print_cluster(i + 1, cluster)

def profiling_data(datafile, column):
    ds_full = stream(datafile, encoding='utf-8')
    df = ds_full.to_df()
    borough = df[column].value_counts()
    print(borough)
    print("Total locations: ", len(borough))
    perform_knn_cluster(ds_full, column, using_collision=True)
    return df
    
def cleaning_data_original(df, column):
    rows_affected_upper = df[df[column] != df[column].str.upper()].index
    df = update(df, columns=column, func=str.upper)
    rows_affected_empty = df.isnull().index
    df = update(df, columns=column, func=lambda x: 'OTHER' if is_empty(x) else x)
    cleaned_rows = rows_affected_upper.union(rows_affected_empty)
    return df, cleaned_rows
    
def save_cleaned_data(df, output='result.csv'):
    df.to_csv('output')

## dataset data-cityofnewyork-us.8eq5-dtjb.csv

In [108]:
# Profile dataset
datafile = data_dir + data_list[0]
column = data_column[0]
df = profiling_data(datafile, column)
df, cleaned_rows = cleaning_data_original(df, column)
df

MN    25
BK    23
QN    13
BX    11
SI     4
Name: Borough, dtype: int64
Total locations:  5
0 clusters of size 3 or greater


Unnamed: 0,BID Name:,Borough,Service Area (Linear Feet),Full-time staff,Sanitation staff employed,Public Safety staff employed,Part-time staff,"Individual businesses (retail, restaurant, office, etc.)",Occupied storefronts,Vacant storefronts,...,Streetscape & beautification expenses,Other program expenses,Capital improvement expenses,Salaries,Outside contractor expenses,Insurance costs,Rent and utilities,Supplies and equipment costs,Other G&A expenses,Total expenses
0,125th Street,MN,10270,3,6,4,8,154,180,10,...,,77534,,345388,24699,25267,118989,,25700,1263213
1,161st Street,BX,5220,1,2,0,3,145,150,2,...,10000,,,133000,18000,7500,8500,3200,,312200
2,165th Street Mall,QN,1740,1,3,0,0,168,86,6,...,,,,64910,11180,43336,13281,,6748,233579
3,180th Street,QN,8150,1,0,4,0,42,41,1,...,15750,,,16000,7380,2885,2400,,2700,63377
4,34th Street Partnership,MN,41110,54,68,30,10,I don't know,293,56,...,1930174,4642102,132237,918142,127785,80795,96207,9605,63462,14803896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,Washington Heights BID,MN,7030,2,5,0,0,248,210,38,...,,,,138100,9000,15500,93900,7690,11000,569890
72,West Shore,SI,26880,1,2,0,3,72,Industrial Space - do not track,Industrical Space - do not track,...,,3000,20930,35000,,,,,3173,94753
73,Westchester Square,BX,9100,2,2,1,0,190,163,10,...,,5620,,96664,,3895,33046,,16051,394569
74,White Plains Road,BX,3610,1,2,0,1,106,99,7,...,1000,,,39833,6465,2088,5980,250,,91516


In [110]:
# Get effectiveness
# Step 1: Get rows which should be cleaned
calc_effectiveness(0, len(cleaned_rows))

Data cleaned with precision 0.0 and recall 0


(0.0, 0)

## data-cityofnewyork-us.emuv-tx7t.csv

In [43]:
# Profile dataset
datafile = data_dir + data_list[1]
column = data_column[1]
profiling_data(datafile, column)

MN    25
BK    23
QN    12
BX     9
SI     3
Name: Borough, dtype: int64
Total locations:  5
0 clusters of size 2 or greater


## data-cityofnewyork-us.gt6r-wh7c.csv

In [44]:
# Profile dataset
datafile = data_dir + data_list[2]
column = data_column[2]
profiling_data(datafile, column)

MN    25
BK    23
QN    13
BX    10
SI     4
Name: Borough, dtype: int64
Total locations:  5
0 clusters of size 2 or greater


## data-cityofnewyork-us.un8d-rbed.csv

In [115]:
# Profile dataset
datafile = data_dir + data_list[3]
column = data_column[3]
profiling_data(datafile, column)

                    11989
Brooklyn             3014
Manhattan            2029
Queens               1986
Bronx                 759
LIC                   265
Staten Island         213
Long Island City      111
S.I.                   30
Jackson Heights        24
Flushing               20
Jamaica                 9
Bayside                 9
Brooklyhn               9
10014                   6
Broorlyn                2
Name: Borough, dtype: int64
Total locations:  16
1 clusters of size 2 or greater
Cluster 1 (of size 2)

Brooklyhn (9)
Brooklyn (3014)

Suggested value: Brooklyn




Unnamed: 0,Contracting Company,Company Representative,Representative Phone,Representative Email,Site Address,Block,Lot,Borough,Work Description,Keywords,...,CB,Upload Date,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,545 Broadway BK LLC,Seiden & Schein PC,212 857-0110,jayseiden@seidenschein.com,545 Broadway,3076,6,Brooklyn,"Commercial, retail, alteration, addition to ex...","demolition, SOE, excavation, foundation, exter...",...,34,06/20/2017 09:52:03 AM,40.704937,-73.949311,1,34,491,3071328,3030760006,East Williamsburg
1,"JP Morgan Chase Bank, N.A.",Marcus & Pollack LLP,212 490-2900,jmarcus@marcuspollack.com,270 Park Avenue,1283,21,Manhattan,"Commercial Alteration, retail, addition to exi...","Demolition, steel, concrete, hvac, electric, c...",...,Manhattan 5,05/24/2021 08:20:00 PM,40.755641,-73.975109,5,4,94,1035421,1012830021,Midtown-Midtown South
2,545 Broadway BK LLC,Seiden & Schein PC,212 857-0110,jayseiden@seidenschein.com,545 Broadway,3076,6,Brooklyn,"Commercial, retail, alteration, addition to ex...","demolition, SOE, excavation, foundation, exter...",...,34,06/02/2017 02:50:42 PM,40.704937,-73.949311,1,34,491,3071328,3030760006,East Williamsburg
3,Brooklyn Veterinary Emergency Service dba Vete...,Marcus & Pollack LLP,212 490-2900,jmarcus@marcuspollack.com,196 Fourth Avenue,427,42,Brooklyn,Commercial alteration,"concrete, electrical, carpentry, HVAC, plumbing",...,39,06/02/2017 02:50:42 PM,40.678290,-73.982464,6,33,119,3006997,3004270042,Park Slope-Gowanus
4,145-36 Realty LLC,Law Office of Thomas Berinato,718 575-3400,tberinato@verizon.net,145-38 34 Ave.,4995,46,Queens,"Commercial, residential, new bldg","plumbing, electrical",...,20,06/02/2017 02:50:42 PM,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20470,5 Bay St. LLC,Triangle Equities,718 463-5757,egoldman@triequities.com,5 Bay St.,1,"58, 60",Staten Island,"Commerical, retail, new building, hotel","demolition, site work, paving, fence, structur...",...,49,09/26/2017 09:18:46 AM,40.641476,-74.073976,1,49,3,5131766,5000010068,West New Brighton-New Brighton-St. George
20471,PR III/MD Astoria Storage LLC,Anthony Como Assoc.,718 273-3080,acomo@nyrets.com,31-07 20 Avenue,850,380,Queens,"Commercial, new building","fencing, landscaping, piling, site work, concr...",...,22,09/06/2017 03:33:30 PM,40.779545,-73.906030,1,22,10701,4601079,4008500380,Steinway
20472,Sunset Park Storage LLC,Marcus & Pollack LLP,212 490-2900,jmarcus@marcuspollack.com,5002-12 2nd Avenue,788,32,Brooklyn,"Commercial, new building","concrete, steel, electrical, carpentry, HVAC, ...",...,Brooklyn 7,05/26/2019 09:19:01 AM,40.649006,-74.017031,7,38,18,3426393,3007880032,Sunset Park West
20473,Bond St. Owner LLC,"Tuchman, Korngold, Weiss, Liebman",212 687-3747,bgottlieb@tkwlg.com,61 Bond St.,172,"5, 7, 10, 109, 13",Brooklyn,"Commercial, retail, new building","steel, framing, carpentry, hvac, electrical, p...",...,35,09/06/2017 03:33:30 PM,40.687864,-73.984181,2,33,41,3426306,3001720005,DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill


## dataset data-cityofnewyork-us.m6ad-jy3s.csv

In [11]:
# Profile dataset
datafile = data_dir + data_list[4]
column = data_column[4]
profiling_data(datafile, column)

MN    25
BK    23
QN    13
BX     9
SI     4
Name: Borough, dtype: int64
Total locations:  5


## dataset data-cityofnewyork-us.wye7-nyek.csv

In [116]:
# Profile dataset
datafile = data_dir + data_list[5]
column = data_column[5]
profiling_data(datafile, column)

Manhattan                                            252
Queens                                               216
Brooklyn                                             200
Bronx                                                128
Staten Island                                         75
Bronx;#Brooklyn;#Manhattan;#Queens;#Staten Island     14
Manhattan;#Queens                                      7
Staten Island;#Queens;#Manhattan;#Brooklyn;#Bronx      4
Bronx;#Manhattan                                       4
Brooklyn;#Manhattan                                    3
Brooklyn;#Staten Island                                3
Queens;#Bronx                                          3
Manhattan;#Bronx                                       3
Bronx;#Brooklyn;#Manhattan;#Queens                     3
Manhattan;#Brooklyn                                    3
Bronx;#Queens                                          3
Queens;#Brooklyn                                       3
Brooklyn;#Bronx;#Manhattan;#Que

Unnamed: 0,Project Name,Project Received Date,Project ID Number OP,Project Description,Impacted Park ID(s),Impacted Park Name(s),Address or Intersection,Borough,Multisite Project?,Sponsoring Entity,...,Substantial Completion Date,Final Inspection Date,Latest Inspection Site Visit,Punchlist Generated,Conditional Letter issued,Final Acceptance of Park by Borough,Final Acceptance Letter Date,Created,Modified,Coordinates
0,Filling of Five Bridges over Abandoned CSX Lin...,09/25/2018,HBX163,Project to fill-in bridges passing over an aba...,XZ79;X052;X053;,GREENSTREET;Triangle;O'Neill Triangle;,,Bronx,Single Site,CDOT,...,,,05/08/2020,05/04/2020,05/18/2020,,,12/07/2018,05/04/2021,"40.823690, -73.911201"
1,Red flag repairs at the Passerelle bridge,03/08/2021,LGA-124.431 WO14A,Red flag repairs at the Passerelle bridge,Q099;,Flushing Meadows Corona Park;,Passerelle Bridge,Queens,Single Site,PANYNJ,...,,,,,,,,05/03/2021,05/03/2021,"40.753287, -73.844672"
2,"Rehabilitation of East 169th Street, East 175t...",04/08/2016,HBX1670; HBX1215; HBX180,Rehabilitation of three bridges. No parkland.,,,,Bronx,Single Site,CDOT,...,,,,,,,,05/03/2021,05/03/2021,
3,Mobility Improvement on Northbound Bruckner Ex...,08/19/2015,X731.27,Reconfiguration of expressway and parkway inte...,X039;X101;,Pelham Bay Park;Hutchinson River Parkway;,Bruckner Expressway & Pelham Parkway,Bronx,Multi Site,SDOT,...,,,,,,,,11/29/2016,04/30/2021,"40.857479, -73.825591"
4,Groundwater Sustainability of the Long Island ...,02/01/2019,LK00GHP,Installation of five deep monitoring wells for...,Q220C;B247;B057;B058;B052;Q107;B372;Q084;Q131;,Dr. Charles R. Drew Park;Breukelen Ballfields;...,,Queens;#Brooklyn,Multi Borough,USGS,...,,,,,,,,02/01/2019,04/28/2021,"40.719018, -73.951823"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
947,"Cleaning and Relocation of Koenig Sphere, and ...",05/09/2017,WTC-244.080.07,See project name.,M005;,The Battery;,,Manhattan,Single Site,PANYNJ,...,,,,,,10/03/2017 12:00:00 AM,10/03/2017 12:00:00 AM,01/25/2018,11/05/2019,"40.703905, -74.017148"
948,Con Ed New Service Installation,09/29/2017,,Installation of sidewalk conduit to provide se...,B400;,Hattie Carthan Community Garden;,,Brooklyn,Single Site,ConEd,...,,,,,,,,02/02/2018,11/05/2019,"40.6898874,-73.94844549999999"
949,PS 354Q - North Rochdale Playground,11/21/2016,,PS 354Q - Roof Replacement and Masonry Work,,,"Baisley Blvd. 168 St. And Bedell St., Queens",Queens,Single Site,SCA,...,,,,,,,,03/14/2017,11/05/2019,
950,PS 272K - Bayview Playground,11/21/2016,,Installations and exterior work on school buil...,,,"Seaview Ave. Between E. 99 St. And E. 101 St.,...",Brooklyn,Single Site,SCA,...,,,,,,,,03/14/2017,11/05/2019,


## dataset data-cityofnewyork-us.bty7-2jhb.csv

In [51]:
# Profile dataset
datafile = data_dir + data_list[6]
column = data_column[6]
profiling_data(datafile, column)

MANHATTAN        1008004
BROOKLYN          532384
QUEENS            517986
BRONX             215035
STATEN ISLAND     155117
Name: BOROUGH, dtype: int64
Total locations:  5
0 clusters of size 3 or greater


## dataset data-cityofnewyork-us.xrwg-eczf.csv

In [52]:
# Profile dataset
datafile = data_dir + data_list[7]
column = data_column[7]
profiling_data(datafile, column)

MANHATTAN        928
BROOKLYN         493
QUEENS           395
BRONX            322
STATEN ISLAND    179
Bronx              2
Name: Borough, dtype: int64
Total locations:  6
0 clusters of size 3 or greater


## dataset data-cityofnewyork-us.3rfa-3xsf.csv

In [53]:
# Profile dataset
datafile = data_dir + data_list[8]
column = data_column[8]
profiling_data(datafile, column)

Unspecified      689461
QUEENS           328154
BROOKLYN         316593
MANHATTAN        229076
BRONX            142234
STATEN ISLAND     77615
Name: Borough, dtype: int64
Total locations:  6
0 clusters of size 3 or greater


## dataset data-cityofnewyork-us.aiww-p3af.csv

In [54]:
# Profile dataset
datafile = data_dir + data_list[9]
column = data_column[9]
profiling_data(datafile, column)

Unspecified      706747
QUEENS           347530
BROOKLYN         343072
MANHATTAN        258171
BRONX            143763
STATEN ISLAND     83124
Name: Borough, dtype: int64
Total locations:  6
0 clusters of size 3 or greater


## dataset data-cityofnewyork-us.cwy2-px8b.csv

In [55]:
# Profile dataset
datafile = data_dir + data_list[10]
column = data_column[10]
profiling_data(datafile, column)

QUEENS           4891
BROOKLYN         3858
MANHATTAN        2914
BRONX            2033
STATEN ISLAND     397
Name: Borough, dtype: int64
Total locations:  5
0 clusters of size 3 or greater


## dataset data-cityofnewyork-us.hy4q-igkk.csv

In [56]:
# Profile dataset
datafile = data_dir + data_list[11]
column = data_column[11]
profiling_data(datafile, column)

Unspecified      669438
QUEENS           322878
BROOKLYN         319698
MANHATTAN        245673
BRONX            149974
STATEN ISLAND     81698
Name: Borough, dtype: int64
Total locations:  6
0 clusters of size 3 or greater


In [131]:
# Since our aim was only to clean the columns which were overlap with the starting data set 
# hence we load all the dataset at once then performing data cleaning to create reference data

borough_df = pd.DataFrame(columns=['Borough'])

for i in range(len(data_list)):
    datafile = data_dir + data_list[i]
    print("Load data: ", datafile)
    ds = dataset(datafile, encoding='utf-8')
    ds.rename(columns={'BOROUGH': 'Borough'}, inplace=True)
    borough = ds[['Borough']]
    borough_df = borough_df.append(borough)

Load data:  ../project_data/data-cityofnewyork-us.8eq5-dtjb.csv
Load data:  ../project_data/data-cityofnewyork-us.emuv-tx7t.csv
Load data:  ../project_data/data-cityofnewyork-us.gt6r-wh7c.csv
Load data:  ../project_data/data-cityofnewyork-us.un8d-rbed.csv
Load data:  ../project_data/data-cityofnewyork-us.m6ad-jy3s.csv
Load data:  ../project_data/data-cityofnewyork-us.wye7-nyek.csv
Load data:  ../project_data/data-cityofnewyork-us.bty7-2jhb.csv
Load data:  ../project_data/data-cityofnewyork-us.xrwg-eczf.csv
Load data:  ../project_data/data-cityofnewyork-us.3rfa-3xsf.csv
Load data:  ../project_data/data-cityofnewyork-us.aiww-p3af.csv
Load data:  ../project_data/data-cityofnewyork-us.cwy2-px8b.csv
Load data:  ../project_data/data-cityofnewyork-us.hy4q-igkk.csv


In [133]:
pd.set_option('display.max_rows', None)
print(borough_df.value_counts())

Borough                                          
Unspecified                                          2065646
MANHATTAN                                            1744766
QUEENS                                               1521834
BROOKLYN                                             1516098
BRONX                                                 653361
STATEN ISLAND                                         398130
                                                       11989
Brooklyn                                                3214
Manhattan                                               2281
Queens                                                  2202
Bronx                                                    889
Staten Island                                            288
LIC                                                      265
Long Island City                                         111
MN                                                       100
BK                                 

In [138]:
group = borough_df.value_counts().reset_index()
group.columns = ['Borough', 'count']
intermediate_df = group[['Borough', 'count']]
intermediate_df.to_csv('../reference_data/borough_reference_data.csv', index=None)

In [141]:
# Next we cleaned the overall column 'Borough' and generate our reference data
# First we extracted only the column 'Borough'
borough_df.to_csv('../reference_data/borough_rows_all.csv', index=None)

In [146]:
# Load it as stream format
ds_full = stream('../reference_data/borough_rows_all.csv', encoding='utf-8')

In [147]:
# And perform knn cluster
perform_knn_cluster(ds_full, 'Borough')

34 clusters of size 2 or greater
Cluster 1 (of size 7)

Bronx;#Brooklyn;#Manhattan;#Queens;#Staten Island (14)
Brooklyn;#Staten Island (3)
Bronx;#Brooklyn;#Queens;#Staten Island;#Manhattan (1)
Manhattan;#Brooklyn;#Bronx;#Queens;#Staten Island (1)
Queens;#Brooklyn;#Staten Island (1)
Bronx;#Queens;#Staten Island (1)
Bronx;#Brooklyn;#Queens;#Staten Island (1)

Suggested value: Bronx;#Brooklyn;#Manhattan;#Queens;#Staten Island


Cluster 2 (of size 6)

Brooklyn;#Manhattan (3)
Brooklyn;#Manhattan;#Staten Island (1)
Queens;#Brooklyn;#Staten Island (1)
Bronx;#Brooklyn;#Queens;#Staten Island (1)
Bronx;#Queens;#Staten Island (1)
Brooklyn;#Staten Island (3)

Suggested value: Brooklyn;#Manhattan


Cluster 3 (of size 5)

Brooklyn;#Manhattan;#Staten Island (1)
Bronx;#Brooklyn;#Manhattan;#Queens (3)
Manhattan;#Brooklyn;#Bronx;#Queens;#Staten Island (1)
Bronx;#Brooklyn;#Queens;#Staten Island (1)
Bronx;#Brooklyn;#Manhattan;#Queens;#Staten Island (14)

Suggested value: Bronx;#Brooklyn;#Manhattan;#Queens

In [148]:
# From cluster 29 we noticed there was a typo for 'Brooklyhn' ('Brooklyn'), transform the overall stream to DataFrame

df_overall = ds_full.to_df()

In [152]:
df_clean = update(df_overall, columns='Borough', func=str.upper)
df_clean = update(df_clean, columns='Borough', func=lambda x: 'BROOKLYN' if str(x) == 'BROOKLYHN' else x)
df_clean = update(df_clean, columns='Borough', func=lambda x: 'UNSPECIFIED' if is_empty(x) else x)
borough = df_clean['Borough'].value_counts()
print(borough)

UNSPECIFIED                                          2077635
MANHATTAN                                            1747047
QUEENS                                               1524036
BROOKLYN                                             1519321
BRONX                                                 654250
STATEN ISLAND                                         398418
LIC                                                      265
LONG ISLAND CITY                                         111
MN                                                       100
BK                                                        92
QN                                                        51
BX                                                        39
S.I.                                                      30
JACKSON HEIGHTS                                           24
FLUSHING                                                  20
SI                                                        15
BRONX;#BROOKLYN;#MANHATT

In [139]:
# Measure effectiveness of the borough column
# In this step, we would calculate the overall effectiveness of our original data clean strategy regarding borough
# From the data above, we inspected the problem of data set manually

# 1. In our original strategy, we fill the borough which was unknown to 'OTHER', while in this case some were marked as 'Unspecified', hence we use UNSPECIFIED instead of UIKNOWN
# 2. We converted the all the values to uppercase
# 3. We use cluster to check the spelling error

# From the saved intermediate data set, we found total number of problem rows from the total 12 data set with column borough: 2087274
# Next we cleaned the overall data set to check the effectiveness of our method:

# The overall rows changed by our strategy was: 2086772
# For some columns, we found our strategy could not clean the data into correct format, the number of row was: 679
# Hence we get our precision : 
precision = 1.0 * (2086772 - 679) / 2086772
recall = 1.0 * (2086772 - 679) / 2087274
print(f'Precision: {precision}, Recall: {recall}')

Precision: 0.9996746170640588, Recall: 0.9994341902404763


In [154]:
# We refined our strategy
# Since we found for some rows such as "BRONX;#QUEENS;#STATEN ISLAND", we changed this by only using the first borough
df_clean = update(df_clean, columns='Borough', func=lambda x: str(x).split(';')[0])
print(df_clean.value_counts())

Borough         
UNSPECIFIED         2077635
MANHATTAN           1747062
QUEENS              1524049
BROOKLYN            1519333
BRONX                654282
STATEN ISLAND        398427
LIC                     265
LONG ISLAND CITY        111
MN                      100
BK                       92
QN                       51
BX                       39
S.I.                     30
JACKSON HEIGHTS          24
FLUSHING                 20
SI                       15
JAMAICA                   9
BAYSIDE                   9
10014                     6
BROORLYN                  2
dtype: int64


In [156]:
# We also noticed that the knn cluster could not specify a typo 'BROORLYN'
# There were a digital item '10014', after we referred from the internet, we assdf_clean = update(df_clean, columns='Borough', func=lambda x: 'MANHATTAN' if str(x) == '10014' else x)
df_clean = update(df_clean, columns='Borough', func=lambda x: 'BROOKLYN' if str(x) == 'BROORLYN' else x)ume it was a zipcode located in Manhattan, hence we replace it with 'MANHATTAN'

print(df_clean.value_counts())

Borough         
UNSPECIFIED         2077635
MANHATTAN           1747068
QUEENS              1524049
BROOKLYN            1519335
BRONX                654282
STATEN ISLAND        398427
LIC                     265
LONG ISLAND CITY        111
MN                      100
BK                       92
QN                       51
BX                       39
S.I.                     30
JACKSON HEIGHTS          24
FLUSHING                 20
SI                       15
JAMAICA                   9
BAYSIDE                   9
dtype: int64


In [157]:
# Now we noticed that there were also some abbreviation representation of the borough, for example, 'MN' for 'Manhattan', 'LIC' for 'Long ISLAND CITY'
# We fixed them using their full names
# We also generated a reference data from the abbreviation we encountered
# Our new strategy was mapping unformatted value from formatted reference data generated from starting dataset

abb_map = {'MN': 'MANHATTAN', 
           'BK': 'BROOKLYN', 
           'QN': 'QUEENS', 
           'BX': 'BRONX', 
           'S.I.': 'STATEN ISLAND', 
           'LIC': 'LONG ISLAND CITY', 
           'SI': 'STATEN ISLAND'}

df_clean = update(df_clean, columns='Borough', func=lambda x: abb_map[str(x)] if str(x) in abb_map else x)
print(df_clean.value_counts())

Borough         
UNSPECIFIED         2077635
MANHATTAN           1747168
QUEENS              1524100
BROOKLYN            1519427
BRONX                654321
STATEN ISLAND        398472
LONG ISLAND CITY        376
JACKSON HEIGHTS          24
FLUSHING                 20
BAYSIDE                   9
JAMAICA                   9
dtype: int64


In [189]:
# Now we implement our new data clean strategy
from openclean.function.eval.base import Col

def improved_data_clean_borough(ds_full, column):
    abb_map = {'MN': 'MANHATTAN', 'BK': 'BROOKLYN', 'QN': 'QUEENS', 'BX': 'BRONX', 'S.I.': 'STATEN ISLAND', 'LIC': 'LONG ISLAND CITY', 'SI': 'STATEN ISLAND'}
    ds_full = update(ds_full, columns=column, func=str.upper)
    ds_full = update(ds_full, columns=column, func=lambda x: 'BROOKLYN' if str(x) == 'BROOKLYHN' else x)
    ds_full = update(ds_full, columns=column, func=lambda x: 'UNSPECIFIED' if is_empty(x) else x)
    ds_full = update(ds_full, columns=column, func=lambda x: str(x).split(';')[0])
    ds_full = update(ds_full, columns=column, func=lambda x: 'MANHATTAN' if str(x) == '10014' else x)
    ds_full = update(ds_full, columns=column, func=lambda x: 'BROOKLYN' if str(x) == 'BROORLYN' else x)
    ds_full = update(ds_full, columns=column, func=lambda x: abb_map[str(x)] if str(x) in abb_map else x)
    return ds_full

In [190]:
# Next we perform our new cleaning strategy regarding rolumn 'Borough' to all the data sets

datafile = data_dir + data_list[5]
print(datafile)
ds_test = dataset(datafile, encoding='utf-8')
# column = data_column[5]
ds_test = improved_data_clean_borough(ds_test, column)
bor_test = ds_test['Borough']
print(bor_test.value_counts())

../project_data/data-cityofnewyork-us.wye7-nyek.csv
MANHATTAN        267
QUEENS           229
BROOKLYN         212
BRONX            160
STATEN ISLAND     84
Name: Borough, dtype: int64


In [None]:
# And then we visualisation our result with treemap and Histogram

In [None]:
# We generate a reference data regarding potential abbreviation and the full name