## Check Data Availability


In [17]:
from openclean.data.source.socrata import Socrata

# List every dataset in the NYC Open Data catalog whose name mentions both
# search phrases. The name is lower-cased once for a case-insensitive match,
# so the phrases compared against it must be lower-case as well.
for dataset in Socrata().catalog(domain='data.cityofnewyork.us'):
    name = dataset.name.lower()
    if 'oath hearings' in name and 'division case status' in name:
        print(f'{dataset.identifier}\t{dataset.domain}\t{dataset.name}')

## Download Data

In [18]:
import gzip
import humanfriendly
import os

# Fetch the dataset with Socrata identifier 'jz4z-kudi' into a local
# gzip-compressed file, unless that file already exists.
dataset = Socrata().dataset('jz4z-kudi')
datafile = './jz4z-kudi.tsv.gz'

if not os.path.isfile(datafile):
    # Write to a temporary file and rename it only after the download has
    # finished. Writing to `datafile` directly would leave a truncated archive
    # behind when the download is interrupted, and the isfile() check above
    # would then skip the download on every later run.
    tmpfile = datafile + '.part'
    try:
        with gzip.open(tmpfile, 'wb') as f:
            print('Downloading ...\n')
            dataset.write(f)
        os.replace(tmpfile, datafile)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.isfile(tmpfile):
            os.remove(tmpfile)

# Report the on-disk (compressed) size in human-readable form.
fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print("Using '{}' in file {} of size {}".format(dataset.name, datafile, fsize))

Using 'OATH Hearings Division Case Status' in file ./jz4z-kudi.tsv.gz of size 478.98 MB


## Read Data as a Data Stream

In [20]:

from openclean.pipeline import stream
# Open the file written by the download step. Reusing `datafile` keeps the
# path that is read identical to the path that was downloaded.
df = stream(datafile)

In [None]:
# Collect the distinct values of the borough column.
# NOTE(review): presumably a Counter-like mapping from value to frequency,
# since its values() are summed below as the total -- confirm.
vlb = df.select('violation_location_borough').distinct()

n_distinct = len(vlb)
n_total = sum(vlb.values())
print('{} distinct boroughs (for {} total values)'.format(n_distinct, n_total))
from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan
# Value passed to knn_clusters as `minsize` and reported in the summary below.
minsize = 5

# Two values count as similar when LevenshteinDistance() yields a score
# greater than 0.9 for the pair.
similarity = SimilarityConstraint(
    func=LevenshteinDistance(),
    pred=GreaterThan(0.9),
)
clusters = knn_clusters(values=vlb, sim=similarity, minsize=minsize)

print('{} clusters of size {} or greater'.format(len(clusters), minsize))
def print_cluster(cnumber, cluster):
    """Print a report for a single cluster.

    The report has a header with the cluster number and its size, one line
    per ``(value, count)`` pair from ``cluster.items()``, and a footer with
    the value returned by ``cluster.suggestion()``.

    Parameters
    ----------
    cnumber:
        Number shown in the header line.
    cluster:
        Sized object that provides ``items()`` and ``suggestion()``.
    """
    header = 'Cluster {} (of size {})\n'.format(cnumber, len(cluster))
    print(header)

    for entry in cluster.items():
        print('{} ({})'.format(*entry))

    footer = '\nSuggested value: {}\n\n'.format(cluster.suggestion())
    print(footer)


# Largest clusters first; the sort is in place.
clusters.sort(key=len, reverse=True)

# Number the clusters starting at 1 in the printed report.
for position, cluster in enumerate(clusters, start=1):
    print_cluster(position, cluster)

