In [24]:
from openclean.data.load import dataset
from openclean.pipeline import stream
import pandas as pd

# Directory (relative to this notebook) holding the downloaded CSV extracts.
data_dir = '../project_data/'

# CSV extracts to profile; the position in this list is the index used by
# the profiling cells further down.
data_list = [
    'data-cityofnewyork-us.8eq5-dtjb.csv',
    'data-cityofnewyork-us.emuv-tx7t.csv',
    'data-cityofnewyork-us.gt6r-wh7c.csv',
    'data-cityofnewyork-us.un8d-rbed.csv',
    'data-cityofnewyork-us.m6ad-jy3s.csv',
    'data-cityofnewyork-us.wye7-nyek.csv',
    'data-cityofnewyork-us.bty7-2jhb.csv',
    'data-cityofnewyork-us.xrwg-eczf.csv',
    'data-cityofnewyork-us.3rfa-3xsf.csv',
    'data-cityofnewyork-us.aiww-p3af.csv',
    'data-cityofnewyork-us.cwy2-px8b.csv',
    'data-cityofnewyork-us.hy4q-igkk.csv',
]

# Author's note: a "Park Borough" column is also present in the igkk and
# p3af extracts.

# Borough column name for each entry of data_list (same order). Only the
# bty7-2jhb extract uses the upper-case header.
data_column = [
    'BOROUGH' if 'bty7-2jhb' in file_name else 'Borough'
    for file_name in data_list
]

In [49]:
# Our original cleaning strategy:
# 1. Replace missing values with a placeholder (planned as 'UNKNOWN'; note that
#    cleaning_data_original below currently substitutes 'OTHER').
# 2. Upper-case columns that can be matched against reference data, such as BOROUGH.
# 3. Use kNN clustering to detect spelling errors.
from openclean.cluster.knn import knn_clusters, knn_collision_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.null import is_empty
from openclean.function.value.threshold import GreaterThan
from openclean.operator.transform.update import update


def calc_effectiveness(problem_rows, cleaned_rows):
    """Compute precision and recall of a cleaning step.

    Rows are matched on the columns the two frames have in common (the same
    key pandas uses for an inner merge without ``on``).

    Args:
        problem_rows: DataFrame of rows known to be problematic.
        cleaned_rows: DataFrame of rows the cleaning step touched.

    Returns:
        Tuple ``(precision, recall)`` where precision is the share of
        ``cleaned_rows`` that also occur in ``problem_rows`` and recall is
        the share of ``problem_rows`` that also occur in ``cleaned_rows``.
        A ratio whose denominator frame is empty is reported as ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    shared = [col for col in problem_rows.columns if col in cleaned_rows.columns]
    # ``on=None`` lets pandas raise its usual error when nothing is shared.
    key = shared or None

    # De-duplicate the lookup side: a plain inner merge multiplies duplicated
    # rows (m copies x n copies), which could push the ratios above 1.
    problem_keys = problem_rows[shared].drop_duplicates()
    cleaned_keys = cleaned_rows[shared].drop_duplicates()

    cleaned_hits = len(pd.merge(cleaned_rows, problem_keys, how='inner', on=key))
    problem_hits = len(pd.merge(problem_rows, cleaned_keys, how='inner', on=key))

    precision = cleaned_hits / len(cleaned_rows) if len(cleaned_rows) else 0.0
    recall = problem_hits / len(problem_rows) if len(problem_rows) else 0.0
    return precision, recall

def print_cluster(cnumber, cluster):
    """Print one cluster: header, each value with its count, and the suggestion.

    Args:
        cnumber: Number shown in the header line.
        cluster: Object supporting ``len()``, ``items()`` yielding
            ``(value, count)`` pairs, and ``suggestion()``.
    """
    header = 'Cluster {} (of size {})\n'.format(cnumber, len(cluster))
    print(header)
    for entry in cluster.items():
        # Each entry is a (value, count) pair.
        print('{} ({})'.format(*entry))
    footer = '\nSuggested value: {}\n\n'.format(cluster.suggestion())
    print(footer)

def perform_knn_cluster(ds_full, column, using_collision=False, minsize=3, t=0.7):
    """Cluster the distinct values of ``column`` and print every cluster found.

    Args:
        ds_full: Data source that supports ``select(column).distinct()``
            (this notebook passes an openclean stream).
        column: Name of the column whose distinct values are clustered.
        using_collision: When True, cluster with ``knn_collision_clusters``;
            otherwise (the default) with ``knn_clusters``.
        minsize: Minimum cluster size, forwarded to the clustering function
            and echoed in the summary line.
        t: Threshold handed to ``GreaterThan`` for the Levenshtein-based
            similarity constraint.
    """
    values = ds_full.select(column).distinct()
    sim = SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(t))

    # The original conditional was inverted: using_collision=True ran plain
    # knn_clusters and the default ran the collision variant.
    cluster_func = knn_collision_clusters if using_collision else knn_clusters
    clusters = cluster_func(values=values, sim=sim, minsize=minsize)

    print('{} clusters of size {} or greater'.format(len(clusters), minsize))
    # Largest clusters first.
    clusters.sort(key=len, reverse=True)
    for number, cluster in enumerate(clusters, start=1):
        print_cluster(number, cluster)

def profiling_data(datafile, column):
    """Print value frequencies for ``column`` and run kNN clustering on it.

    Args:
        datafile: Path of the CSV file, opened as a UTF-8 stream.
        column: Name of the column to profile.
    """
    source = stream(datafile, encoding='utf-8')
    # Materialise the stream as a DataFrame only to count the column's values.
    frequencies = source.to_df()[column].value_counts()
    print(frequencies)
    print("Total locations: ", len(frequencies))
    # Clustering runs on the stream itself, not on the DataFrame.
    perform_knn_cluster(source, column)
    
def cleaning_data_original(datafile, column, cluster_results, fill_value='OTHER'):
    """Load ``datafile`` and normalise ``column``; return the cleaned data.

    Empty values are replaced with ``fill_value`` and string values are
    upper-cased. The original version built the cleaned frame but never
    returned it, so the work was lost.

    Args:
        datafile: Path of the CSV file, loaded with openclean's ``dataset``.
        column: Column (or columns) passed to ``update``.
        cluster_results: Currently unused; kept so existing callers keep working.
        fill_value: Replacement for empty values (defaults to ``'OTHER'``,
            the value that was previously hard-coded).

    Returns:
        The result of ``update`` applied to the loaded data.
    """
    def _normalize(value):
        # Check for emptiness first so a None value is replaced instead of
        # reaching str.upper, which would raise TypeError.
        if is_empty(value):
            return fill_value
        return value.upper() if isinstance(value, str) else value

    ds_full = dataset(datafile)
    return update(ds_full, columns=column, func=_normalize)
    
def save_cleaned_data(ds_full, output='result.csv'):
    """Write the cleaned data to a CSV file.

    Args:
        ds_full: Object exposing ``to_csv(path)`` (e.g. a pandas DataFrame).
        output: Destination path; defaults to ``'result.csv'``.
    """
    # The original passed the string literal 'output', so every call wrote to
    # a file named "output" and the parameter was ignored.
    ds_full.to_csv(output)

## dataset data-cityofnewyork-us.8eq5-dtjb.csv

In [42]:
# Profile the borough column of the first listed extract (8eq5-dtjb).
datafile, column = data_dir + data_list[0], data_column[0]
profiling_data(datafile, column)

MN    25
BK    23
QN    13
BX    11
SI     4
Name: Borough, dtype: int64
Total locations:  5
0 clusters of size 2 or greater


## dataset data-cityofnewyork-us.emuv-tx7t.csv

In [43]:
# Profile the borough column of the second listed extract (emuv-tx7t).
datafile, column = data_dir + data_list[1], data_column[1]
profiling_data(datafile, column)

MN    25
BK    23
QN    12
BX     9
SI     3
Name: Borough, dtype: int64
Total locations:  5
0 clusters of size 2 or greater


## dataset data-cityofnewyork-us.gt6r-wh7c.csv

In [44]:
# Profile the borough column of the third listed extract (gt6r-wh7c).
datafile, column = data_dir + data_list[2], data_column[2]
profiling_data(datafile, column)

MN    25
BK    23
QN    13
BX    10
SI     4
Name: Borough, dtype: int64
Total locations:  5
0 clusters of size 2 or greater


## dataset data-cityofnewyork-us.un8d-rbed.csv

In [45]:
# Profile the borough column of the fourth listed extract (un8d-rbed).
datafile, column = data_dir + data_list[3], data_column[3]
profiling_data(datafile, column)

                    11989
Brooklyn             3014
Manhattan            2029
Queens               1986
Bronx                 759
LIC                   265
Staten Island         213
Long Island City      111
S.I.                   30
Jackson Heights        24
Flushing               20
Jamaica                 9
Bayside                 9
Brooklyhn               9
10014                   6
Broorlyn                2
Name: Borough, dtype: int64
Total locations:  16
0 clusters of size 2 or greater


## dataset data-cityofnewyork-us.m6ad-jy3s.csv

In [11]:
# Profile the borough column of the fifth listed extract (m6ad-jy3s).
datafile, column = data_dir + data_list[4], data_column[4]
profiling_data(datafile, column)

MN    25
BK    23
QN    13
BX     9
SI     4
Name: Borough, dtype: int64
Total locations:  5


## dataset data-cityofnewyork-us.wye7-nyek.csv

In [50]:
# Profile the borough column of the sixth listed extract (wye7-nyek).
datafile, column = data_dir + data_list[5], data_column[5]
profiling_data(datafile, column)

Manhattan                                            252
Queens                                               216
Brooklyn                                             200
Bronx                                                128
Staten Island                                         75
Bronx;#Brooklyn;#Manhattan;#Queens;#Staten Island     14
Manhattan;#Queens                                      7
Staten Island;#Queens;#Manhattan;#Brooklyn;#Bronx      4
Bronx;#Manhattan                                       4
Brooklyn;#Manhattan                                    3
Brooklyn;#Staten Island                                3
Queens;#Bronx                                          3
Manhattan;#Bronx                                       3
Bronx;#Brooklyn;#Manhattan;#Queens                     3
Manhattan;#Brooklyn                                    3
Bronx;#Queens                                          3
Queens;#Brooklyn                                       3
Brooklyn;#Bronx;#Manhattan;#Que

## dataset data-cityofnewyork-us.bty7-2jhb.csv

In [51]:
# Profile the borough column of the seventh listed extract (bty7-2jhb).
datafile, column = data_dir + data_list[6], data_column[6]
profiling_data(datafile, column)

MANHATTAN        1008004
BROOKLYN          532384
QUEENS            517986
BRONX             215035
STATEN ISLAND     155117
Name: BOROUGH, dtype: int64
Total locations:  5
0 clusters of size 3 or greater


## dataset data-cityofnewyork-us.xrwg-eczf.csv

In [52]:
# Profile the borough column of the eighth listed extract (xrwg-eczf).
datafile, column = data_dir + data_list[7], data_column[7]
profiling_data(datafile, column)

MANHATTAN        928
BROOKLYN         493
QUEENS           395
BRONX            322
STATEN ISLAND    179
Bronx              2
Name: Borough, dtype: int64
Total locations:  6
0 clusters of size 3 or greater


## dataset data-cityofnewyork-us.3rfa-3xsf.csv

In [53]:
# Profile the borough column of the ninth listed extract (3rfa-3xsf).
datafile, column = data_dir + data_list[8], data_column[8]
profiling_data(datafile, column)

Unspecified      689461
QUEENS           328154
BROOKLYN         316593
MANHATTAN        229076
BRONX            142234
STATEN ISLAND     77615
Name: Borough, dtype: int64
Total locations:  6
0 clusters of size 3 or greater


## dataset data-cityofnewyork-us.aiww-p3af.csv

In [54]:
# Profile the borough column of the tenth listed extract (aiww-p3af).
datafile, column = data_dir + data_list[9], data_column[9]
profiling_data(datafile, column)

Unspecified      706747
QUEENS           347530
BROOKLYN         343072
MANHATTAN        258171
BRONX            143763
STATEN ISLAND     83124
Name: Borough, dtype: int64
Total locations:  6
0 clusters of size 3 or greater


## dataset data-cityofnewyork-us.cwy2-px8b.csv

In [55]:
# Profile the borough column of the eleventh listed extract (cwy2-px8b).
datafile, column = data_dir + data_list[10], data_column[10]
profiling_data(datafile, column)

QUEENS           4891
BROOKLYN         3858
MANHATTAN        2914
BRONX            2033
STATEN ISLAND     397
Name: Borough, dtype: int64
Total locations:  5
0 clusters of size 3 or greater


## dataset data-cityofnewyork-us.hy4q-igkk.csv

In [56]:
# Profile the borough column of the twelfth listed extract (hy4q-igkk).
datafile, column = data_dir + data_list[11], data_column[11]
profiling_data(datafile, column)

Unspecified      669438
QUEENS           322878
BROOKLYN         319698
MANHATTAN        245673
BRONX            149974
STATEN ISLAND     81698
Name: Borough, dtype: int64
Total locations:  6
0 clusters of size 3 or greater
