## Work Location Borough: standardizing values so they can be mapped from previously cleaned data

In [1]:
from openclean.data.load import dataset
from openclean.pipeline import stream
import pandas as pd

# Relative location and file name of the input CSV.
data_dir = '../project_data/'
file_name = 'data-cityofnewyork-us.k397-673e.csv'

# Column names for this dataset, grouped by theme.
# NOTE(review): presumably mirrors the CSV header order -- confirm against the file.
COLUMNS = [
    # Record identification
    "Fiscal Year", "Payroll Number", "Agency Name",
    # Employee
    "Last Name", "First Name", "Mid Init",
    # Employment details
    "Agency Start Date", "Work Location Borough", "Title Description",
    "Leave Status as of June 30",
    # Pay
    "Base Salary", "Pay Basis",
    "Regular Hours", "Regular Gross Paid",
    "OT Hours", "Total OT Paid",
    "Total Other Pay",
]

In [3]:
# Build the CSV path from the directory and file name, open it as an
# openclean stream, then materialize the stream as a DataFrame.
datafile = f'{data_dir}{file_name}'
ds_full = stream(datafile, encoding='utf-8')
df = ds_full.to_df()

In [6]:
# Count how often each distinct value occurs in the borough column
# before any cleaning is applied.
column = "Work Location Borough"
borough_values = df[column]
work_loc = borough_values.value_counts()
print(work_loc)

MANHATTAN        2394979
                  506226
QUEENS            379695
BROOKLYN          323565
BRONX             177881
OTHER              83688
RICHMOND           46156
WESTCHESTER         3417
ULSTER              1953
Manhattan           1622
Bronx                935
SULLIVAN             822
Queens               660
DELAWARE             551
NASSAU               245
PUTNAM               243
SCHOHARIE            175
DUTCHESS             140
Richmond             112
ALBANY                95
GREENE                61
WASHINGTON DC         47
ORANGE                22
Name: Work Location Borough, dtype: int64


In [7]:
from openclean.function.value.null import is_empty
from openclean.operator.transform.update import update
def _other_if_empty(value):
    """Return 'OTHER' when openclean's is_empty() flags the value, else the value itself."""
    if is_empty(value):
        return 'OTHER'
    return value


# Pass 1: upper-case every value so case variants (e.g. 'Manhattan' / 'MANHATTAN') merge.
df = update(df, columns=column, func=str.upper)
# Pass 2: fold empty values into the existing 'OTHER' category.
df = update(df, columns=column, func=_other_if_empty)

In [9]:
# Re-count the borough values to confirm the effect of the cleaning step.
cleaned_counts = df[column].value_counts()
print(cleaned_counts)

MANHATTAN        2396601
OTHER             589914
QUEENS            380355
BROOKLYN          323565
BRONX             178816
RICHMOND           46268
WESTCHESTER         3417
ULSTER              1953
SULLIVAN             822
DELAWARE             551
NASSAU               245
PUTNAM               243
SCHOHARIE            175
DUTCHESS             140
ALBANY                95
GREENE                61
WASHINGTON DC         47
ORANGE                22
Name: Work Location Borough, dtype: int64


In [10]:
from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan

from collections import Counter

# Minimum number of values a cluster must contain to be reported.
minsize = 5

# Cluster the *cleaned* column held in `df`. The previous version took its
# values from `ds_full`, the raw stream, which ignores the upper-casing and
# empty -> 'OTHER' normalisation applied to `df` and re-reads the whole CSV.
# A Counter of value frequencies has the same shape as `distinct()` output.
values = Counter(df[column])

# Group values whose Levenshtein similarity to each other exceeds 0.9.
clusters = knn_clusters(
    values=values,
    sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.9)),
    minsize=minsize
)

print('{} clusters of size {} or greater'.format(len(clusters), minsize))

0 clusters of size 5 or greater
