In [4]:
from openclean.data.source.socrata import Socrata

# Walk the Socrata catalog for the data.cityofnewyork.us domain and list every
# dataset whose name contains both 'citywide' and 'payroll' (case-insensitive).
# Each match is printed as: identifier <TAB> domain <TAB> name.
for dataset in Socrata().catalog(domain='data.cityofnewyork.us'):
    title = dataset.name.lower()
    if 'citywide' not in title or 'payroll' not in title:
        continue
    print(f'{dataset.identifier}\t{dataset.domain}\t{dataset.name}')

k397-673e	data.cityofnewyork.us	Citywide Payroll Data (Fiscal Year)


In [5]:
import gzip
import humanfriendly
import os

# Handle for the payroll dataset found in the catalog search, and the local
# gzip-compressed file that serves as a download cache.
dataset = Socrata().dataset('k397-673e')
datafile = './k397-673e.tsv.gz'

if not os.path.isfile(datafile):
    # Download into a temporary sibling file and move it into place only after
    # the write has finished. Opening `datafile` directly would create it
    # before any data arrives, so a failed or interrupted download would leave
    # a truncated file that the isfile() check above treats as a valid cache
    # on every later run.
    partial = datafile + '.part'
    try:
        with gzip.open(partial, 'wb') as f:
            print('Downloading ...\n')
            dataset.write(f)
        # Same directory, so the rename replaces the target in one step.
        os.replace(partial, datafile)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt): drop the partial
        # file so the next run starts a fresh download, then re-raise.
        if os.path.exists(partial):
            os.remove(partial)
        raise

# Report which dataset is in use and the on-disk (compressed) file size.
fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print("Using '{}' in file {} of size {}".format(dataset.name, datafile, fsize))

Downloading ...

Using 'Citywide Payroll Data (Fiscal Year)' in file ./k397-673e.tsv.gz of size 89.62 MB


In [6]:
from openclean.pipeline import stream

# Open the downloaded file as an openclean data stream. All later cells derive
# from this object.
# NOTE(review): presumably rows are read lazily, only when an operation such as
# count() or head() runs — confirm against the openclean documentation.
ds_full = stream(datafile)

In [7]:
# Count the rows in the full stream and report the total with thousands
# separators.
total_rows = ds_full.count()
print(f'{total_rows:,} rows.')

3,923,290 rows.


In [8]:
# Preview the first rows of the stream to inspect column names and typical
# values (the rendered output shows ten rows).
ds_full.head()

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Mid Init,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,Pay Basis,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay
0,2020,17,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,84698.21,0.0,0.0,0.0
1,2020,17,OFFICE OF EMERGENCY MANAGEMENT,GEAGER,VERONICA,M,09/12/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,84698.21,0.0,0.0,0.0
2,2020,17,OFFICE OF EMERGENCY MANAGEMENT,RAMANI,SHRADDHA,,02/22/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,84698.21,0.0,0.0,0.0
3,2020,17,OFFICE OF EMERGENCY MANAGEMENT,ROTTA,JONATHAN,D,09/16/2013,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,84698.21,0.0,0.0,0.0
4,2020,17,OFFICE OF EMERGENCY MANAGEMENT,WILSON II,ROBERT,P,04/30/2018,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,84698.21,0.0,0.0,0.0
5,2020,17,OFFICE OF EMERGENCY MANAGEMENT,WASHINGTON,MORIAH,A,03/18/2019,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,87900.95,0.0,0.0,-3202.74
6,2020,17,OFFICE OF EMERGENCY MANAGEMENT,VAZQUEZ,MARGARET,,09/29/2008,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,94415.0,per Annum,1820,84312.72,0.0,0.0,0.0
7,2020,17,OFFICE OF EMERGENCY MANAGEMENT,KRAWCZYK,AMANDA,N,05/15/2017,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,83976.54,0.0,0.0,0.0
8,2020,17,OFFICE OF EMERGENCY MANAGEMENT,MURRELL,JALEESA,S,12/01/2014,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,83877.36,0.0,0.0,0.0
9,2020,17,OFFICE OF EMERGENCY MANAGEMENT,DE LOS SANTOS,JANIRA,,06/05/2017,BROOKLYN,EMERGENCY PREPAREDNESS SPECIALIST,ACTIVE,67676.0,per Annum,1820,66647.77,348.5,16572.64,144.15


In [9]:
# Columns to keep, in their original order; grouped by theme for readability.
COLUMNS = [
    # Reporting period and agency
    "Fiscal Year", "Payroll Number", "Agency Name",
    # Employee name
    "Last Name", "First Name", "Mid Init",
    # Employment details
    "Agency Start Date", "Work Location Borough", "Title Description",
    "Leave Status as of June 30",
    # Pay and hours
    "Base Salary", "Pay Basis", "Regular Hours", "Regular Gross Paid",
    "OT Hours", "Total OT Paid", "Total Other Pay",
]

# Restrict the full stream to exactly the columns listed above.
ds = ds_full.select(columns=COLUMNS)

In [10]:
from openclean.profiling.column import DefaultColumnProfiler

# Profile the selected columns, using DefaultColumnProfiler as the profiler for
# each column. The result is queried by the following cells (stats, per-column
# datatypes, min/max).
# NOTE(review): this presumably makes a full pass over the ~3.9M-row file and
# may take a while — confirm.
profiles = ds.profile(default_profiler=DefaultColumnProfiler)

In [12]:
# Show the per-column summary table from the profiling results: total and
# empty cell counts, number of distinct values, uniqueness and entropy.
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Fiscal Year,3923290,0,7,2e-06,2.805614
Payroll Number,3923290,1745440,157,7.2e-05,4.286506
Agency Name,3923290,0,165,4.2e-05,4.365925
Last Name,3923290,2031,157080,0.040059,14.264455
First Name,3923290,2033,88232,0.022501,11.611521
Mid Init,3923290,1596166,43,1.8e-05,4.073274
Agency Start Date,3923290,63,14933,0.003806,11.097847
Work Location Borough,3923290,506226,22,6e-06,1.507244
Title Description,3923290,84,1802,0.000459,6.207524
Leave Status as of June 30,3923290,0,5,1e-06,0.710495


In [13]:
# Print a compact schema: for every column, the datatype label that ranks
# first among that column's distinct values according to the profiler.
print('Schema\n------')
for col in ds.columns:
    type_counts = profiles.column(col)['datatypes']['distinct']
    dominant_type = type_counts.most_common(1)[0][0]
    print("  '{}' ({})".format(col, dominant_type))

Schema
------
  'Fiscal Year' (int)
  'Payroll Number' (int)
  'Agency Name' (str)
  'Last Name' (str)
  'First Name' (str)
  'Mid Init' (str)
  'Agency Start Date' (date)
  'Work Location Borough' (str)
  'Title Description' (str)
  'Leave Status as of June 30' (str)
  'Base Salary' (float)
  'Pay Basis' (str)
  'Regular Hours' (float)
  'Regular Gross Paid' (float)
  'OT Hours' (float)
  'Total OT Paid' (float)
  'Total Other Pay' (float)


In [14]:
# Show the smallest and largest values profiled for 'Agency Start Date'.
# NOTE(review): the rendered output reports a maximum of 9999-12-31, which
# looks like a placeholder rather than a real start date — such values
# probably need cleaning before any date-based analysis.
profiles.minmax('Agency Start Date')

Unnamed: 0,min,max
date,1901-01-01,9999-12-31 00:00:00
