Because values in the datasets we clean may need to be mapped against previously cleaned data, this Jupyter notebook was used to generate the cleaned reference data, which is also part of our contribution to the data reference repository.

In [1]:
from openclean.data.load import dataset
from openclean.pipeline import stream
import pandas as pd

# Directory holding the raw CSV exports and the first file loaded from it.
# Both are plain strings because later cells build paths with `data_dir + file_name`.
data_dir = '../project_data/'
file_name = 'data-cityofnewyork-us.k397-673e.csv'

# Column headers kept for reference; presumably the header row of the file
# named above (TODO confirm). Not referenced by the other visible cells.
COLUMNS = [
    # record identity
    "Fiscal Year", "Payroll Number", "Agency Name",
    # employee
    "Last Name", "First Name", "Mid Init", "Agency Start Date",
    # position
    "Work Location Borough", "Title Description", "Leave Status as of June 30",
    # pay
    "Base Salary", "Pay Basis",
    "Regular Hours", "Regular Gross Paid",
    "OT Hours", "Total OT Paid", "Total Other Pay",
]

In [3]:
# Open the configured file as an openclean stream, then materialise it
# as a pandas DataFrame for the exploratory cells below.
datafile = f'{data_dir}{file_name}'
ds_full = stream(datafile, encoding='utf-8')
df = ds_full.to_df()

In [6]:
# Frequency of every distinct value in the borough column, before any cleaning.
column = "Work Location Borough"
work_loc = df.loc[:, column].value_counts()
print(work_loc)

MANHATTAN        2394979
                  506226
QUEENS            379695
BROOKLYN          323565
BRONX             177881
OTHER              83688
RICHMOND           46156
WESTCHESTER         3417
ULSTER              1953
Manhattan           1622
Bronx                935
SULLIVAN             822
Queens               660
DELAWARE             551
NASSAU               245
PUTNAM               243
SCHOHARIE            175
DUTCHESS             140
Richmond             112
ALBANY                95
GREENE                61
WASHINGTON DC         47
ORANGE                22
Name: Work Location Borough, dtype: int64


In [7]:
from openclean.function.value.null import is_empty
from openclean.operator.transform.update import update
def _blank_to_other(value):
    """Return 'OTHER' for values that `is_empty` reports as empty, else the value unchanged."""
    return 'OTHER' if is_empty(value) else value


# Pass 1: fold case so that e.g. 'Manhattan' and 'MANHATTAN' become one value.
df = update(df, columns=column, func=str.upper)
# Pass 2: replace empty values with the existing 'OTHER' category.
df = update(df, columns=column, func=_blank_to_other)

In [9]:
# Re-count the column after the updates above to check the result.
borough_counts = df[column].value_counts()
print(borough_counts)

MANHATTAN        2396601
OTHER             589914
QUEENS            380355
BROOKLYN          323565
BRONX             178816
RICHMOND           46268
WESTCHESTER         3417
ULSTER              1953
SULLIVAN             822
DELAWARE             551
NASSAU               245
PUTNAM               243
SCHOHARIE            175
DUTCHESS             140
ALBANY                95
GREENE                61
WASHINGTON DC         47
ORANGE                22
Name: Work Location Borough, dtype: int64


In [10]:
from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan

# Smallest cluster size to report; also read by the agency-name clustering cell.
minsize = 5

# NOTE(review): the distinct values are taken from the `ds_full` stream, not
# from `df`, so the updates applied to `df` are not reflected here - confirm
# that clustering the raw values is intended.
values = ds_full.select(column).distinct()

# Levenshtein-based similarity constraint with a 0.9 threshold.
similarity = SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.9))
clusters = knn_clusters(values=values, sim=similarity, minsize=minsize)

print('{} clusters of size {} or greater'.format(len(clusters), minsize))

0 clusters of size 5 or greater


In [11]:
# Switch to another file; note this rebinds `file_name`, `datafile`,
# `ds_full` and `df`, replacing the frame loaded and cleaned earlier.
file_name = 'data-cityofnewyork-us.3rfa-3xsf.csv'
datafile = f'{data_dir}{file_name}'
ds_full = stream(datafile, encoding='utf-8')
df = ds_full.to_df()

In [20]:
# Remove the row cap so the complete listing is printed. This is a global
# pandas option and stays in effect for every later cell.
pd.set_option('display.max_rows', None)

# Count each distinct (Agency, Agency Name) combination in the loaded file.
agency_name = df.value_counts(subset=['Agency', 'Agency Name'])
print(agency_name)

Agency  Agency Name                                                                                
HPD     Department of Housing Preservation and Development                                             671543
DOT     Department of Transportation                                                                   333501
DEP     Department of Environmental Protection                                                         167866
NYPD    New York City Police Department                                                                158664
DOB     Department of Buildings                                                                        129288
DPR     Department of Parks and Recreation                                                              66862
DOHMH   Department of Health and Mental Hygiene                                                         46725
DCA     Department of Consumer Affairs                                                                  27299
TLC     Taxi and Lim

In [22]:
# Files from which the Agency / Agency Name pairs are collected.
file_list = [
    'data-cityofnewyork-us.3rfa-3xsf.csv',
    'data-cityofnewyork-us.aiww-p3af.csv',
    'data-cityofnewyork-us.cwy2-px8b.csv',
    'data-cityofnewyork-us.hy4q-igkk.csv',
    'data-cityofnewyork-us.xrwg-eczf.csv',
]

# NOTE: `column` held a single column name in earlier cells; from here on
# it is a list of two names.
column = ['Agency', 'Agency Name']

# Empty frame with the two target columns, used as the accumulator.
df_agency_map = pd.DataFrame(columns=column)

Unnamed: 0,Agency,Agency Name


In [29]:
# Collect the (Agency, Agency Name) columns of every file in `file_list`
# and combine them with a single concat.
#
# Why not DataFrame.append in the loop: it was deprecated in pandas 1.4 and
# removed in 2.0, and it copies the whole accumulated frame on every
# iteration. Building the result from a fresh list also makes this cell
# safe to re-run: it no longer stacks a second copy of every file on top of
# an already populated `df_agency_map`.
agency_frames = []
for file_name in file_list:
    datafile = data_dir + file_name
    df = dataset(datafile, encoding='utf-8')
    agency_name = df[['Agency', 'Agency Name']]
    agency_frames.append(agency_name)

if agency_frames:
    # Row labels of each file are kept as-is, matching what append() produced.
    df_agency_map = pd.concat(agency_frames)
else:
    # pd.concat raises on an empty list; fall back to an empty frame.
    df_agency_map = pd.DataFrame(columns=['Agency', 'Agency Name'])

print(df_agency_map.count())

Agency         5471311
Agency Name    5471311
dtype: int64


In [30]:
# Occurrences of each distinct (Agency, Agency Name) row across all files.
agency_pair_counts = df_agency_map.value_counts()
print(agency_pair_counts)

Agency      Agency Name                                                                                
HPD         Department of Housing Preservation and Development                                             2015506
DOT         Department of Transportation                                                                    965161
DEP         Department of Environmental Protection                                                          642485
NYPD        New York City Police Department                                                                 548551
DOB         Department of Buildings                                                                         381210
DPR         Department of Parks and Recreation                                                              165511
DOHMH       Department of Health and Mental Hygiene                                                         134952
DCA         Department of Consumer Affairs                                                 

In [40]:
# Flatten the pair counts into a regular three-column frame:
# the two index levels become columns and the counts are named 'count'.
group = df_agency_map.value_counts().reset_index(name='count')

In [41]:
# Print the `group` frame (columns: Agency, Agency Name, count).
print(group)

         Agency                                        Agency Name    count
0           HPD  Department of Housing Preservation and Develop...  2015506
1           DOT                       Department of Transportation   965161
2           DEP             Department of Environmental Protection   642485
3          NYPD                    New York City Police Department   548551
4           DOB                            Department of Buildings   381210
5           DPR                 Department of Parks and Recreation   165511
6         DOHMH            Department of Health and Mental Hygiene   134952
7           DCA                     Department of Consumer Affairs    85726
8           TLC                      Taxi and Limousine Commission    68364
9          DSNY                               BCC - Brooklyn South    50959
10         DSNY                               BCC - Brooklyn North    41187
11         DSNY                                  BCC - Queens East    32383
12         D

In [45]:
# Keep only the code/name pair; the 'count' column of `group` is dropped.
intermediate_df = group[['Agency', 'Agency Name']]

# index=False: without it pandas writes the RangeIndex as an extra unnamed
# first column, which would pollute the reference file. Cells that read the
# file back select 'Agency' / 'Agency Name' by name, so they are unaffected.
intermediate_df.to_csv('../reference_data/agency_reference_data.csv', index=False)

In [52]:
from openclean.pipeline import stream


from openclean.cluster.knn import knn_collision_clusters

# Stream the reference file written earlier and collect the distinct agency names.
agency_dataset = '../reference_data/agency_reference_data.csv'
ds_stream = stream(agency_dataset, encoding='utf-8')

agency_name_stream = ds_stream.select('Agency Name').distinct()

# Same similarity constraint as the borough clustering (Levenshtein, > 0.9).
# `minsize` is not set in this cell; it is the value assigned in the
# earlier clustering cell.
name_similarity = SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.9))
clusters = knn_collision_clusters(
    values=agency_name_stream,
    sim=name_similarity,
    minsize=minsize
)

def print_cluster(cnumber, cluster):
    """Print a cluster as a header, one line per member, and the suggested value.

    Parameters
    ----------
    cnumber:
        Number shown in the header line.
    cluster:
        Object supporting ``len()``, ``.items()`` yielding ``(value, count)``
        pairs, and ``.suggestion()``.
    """
    header = 'Cluster {} (of size {})\n'.format(cnumber, len(cluster))
    print(header)
    for entry in cluster.items():
        # Each entry is a (value, count) pair.
        print('{} ({})'.format(*entry))
    footer = '\nSuggested value: {}\n\n'.format(cluster.suggestion())
    print(footer)

# Largest clusters first (sorted in place), then print them numbered from 1.
clusters.sort(key=len, reverse=True)

for cluster_number, members in enumerate(clusters, start=1):
    print_cluster(cluster_number, members)


Cluster 1 (of size 7)

School - PS 134 (1)
School - PS 214 (1)
School - PS 224 (1)
School - PS 231 (1)
School - PS 24 (1)
School - PS 254 (1)
School - PS 234 (1)

Suggested value: School - PS 134


Cluster 2 (of size 7)

School - PS 214 (1)
School - PS 22 (1)
School - PS 224 (1)
School - PS 234 (1)
School - PS 245 (1)
School - PS 254 (1)
School - PS 24 (1)

Suggested value: School - PS 214


Cluster 3 (of size 7)

School - PS 214 (1)
School - PS 224 (1)
School - PS 234 (1)
School - PS 24 (1)
School - PS 255 (1)
School - PS 256 (1)
School - PS 254 (1)

Suggested value: School - PS 214


Cluster 4 (of size 6)

School - PS 123 (1)
School - PS 130 (1)
School - PS 134 (1)
School - PS 138 (1)
School - PS 173 (1)
School - PS 133 (1)

Suggested value: School - PS 123


Cluster 5 (of size 6)

School - PS 130 (1)
School - PS 133 (1)
School - PS 134 (1)
School - PS 188 (1)
School - PS 198 (1)
School - PS 138 (1)

Suggested value: School - PS 130


Cluster 6 (of size 6)

School - PS 211 (1)
School

In [75]:
# Final clean-up of the agency reference file: read it back, tidy it, and
# overwrite it in place.
ds_stream = stream('../reference_data/agency_reference_data.csv', encoding='utf-8')
df_agency = ds_stream.to_df()[['Agency', 'Agency Name']]

# Sort by agency code and renumber the rows from 0.
df_sort = df_agency.sort_values('Agency').reset_index(drop=True)

# Rows whose name is literally the same as the code carry no information.
# The comparison runs before upper-casing, as in the original.
df_dup = df_sort[df_sort['Agency'] == df_sort['Agency Name']]
df_sort = df_sort.drop(df_dup.index)

df_sort['Agency Name'] = df_sort['Agency Name'].str.upper()

# Upper-casing can turn names that differed only in letter case into
# identical rows; keep a single copy of each (Agency, Agency Name) pair.
df_sort = df_sort.drop_duplicates()

print(df_sort)
# index=False keeps the row labels out of the CSV.
df_sort.to_csv('../reference_data/agency_reference_data.csv', index=False)

         Agency                                        Agency Name
0         3-1-1                              311 QUALITY ASSURANCE
1         3-1-1                                  3-1-1 CALL CENTER
2           BIC                      BUSINESS INTEGRITY COMMISSION
3          CCHR                         COMMISSION ON HUMAN RIGHTS
4          CCRB                    CIVILIAN COMPLAINT REVIEW BOARD
5           DCA                     DEPARTMENT OF CONSUMER AFFAIRS
6          DCAS     DEPARTMENT OF CITYWIDE ADMINISTRATIVE SERVICES
7           DCP                        DEPARTMENT OF CITY PLANNING
8           DEP             DEPARTMENT OF ENVIRONMENTAL PROTECTION
9          DFTA                           DEPARTMENT FOR THE AGING
10          DHS                    DEPARTMENT OF HOMELESS SERVICES
11          DOB                            DEPARTMENT OF BUILDINGS
12          DOE                    SCHOOL - PS 123 MAHALIA JACKSON
13          DOE                          SCHOOL - PS 5 ELLEN L