# Data Science with Python and Dask
## Chapter 5: Cleaning and Transforming DataFrames

In [1]:
# Before beginning, set your working directory to where the data resides
import os
os.chdir('/Users/Abba/Documents/data-science-python-dask')

### Intro Section

In [3]:
# Listing 5.1
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import numpy as np

dtypes = {
 'Date First Observed': 'str',
 'Days Parking In Effect    ': 'str',
 'Double Parking Violation': 'str',
 'Feet From Curb': np.float32,
 'From Hours In Effect': 'str',
 'House Number': 'str',
 'Hydrant Violation': 'str',
 'Intersecting Street': 'str',
 'Issue Date': 'str',
 'Issuer Code': np.float32,
 'Issuer Command': 'str',
 'Issuer Precinct': np.float32,
 'Issuer Squad': 'str',
 'Issuing Agency': 'str',
 'Law Section': np.float32,
 'Meter Number': 'str',
 'No Standing or Stopping Violation': 'str',
 'Plate ID': 'str',
 'Plate Type': 'str',
 'Registration State': 'str',
 'Street Code1': np.uint32,
 'Street Code2': np.uint32,
 'Street Code3': np.uint32,
 'Street Name': 'str',
 'Sub Division': 'str',
 'Summons Number': np.uint32,
 'Time First Observed': 'str',
 'To Hours In Effect': 'str',
 'Unregistered Vehicle?': 'str',
 'Vehicle Body Type': 'str',
 'Vehicle Color': 'str',
 'Vehicle Expiration Date': 'str',
 'Vehicle Make': 'str',
 'Vehicle Year': np.float32,
 'Violation Code': np.uint16,
 'Violation County': 'str',
 'Violation Description': 'str',
 'Violation In Front Of Or Opposite': 'str',
 'Violation Legal Code': 'str',
 'Violation Location': 'str',
 'Violation Post Code': 'str',
 'Violation Precinct': np.float32,
 'Violation Time': 'str'
}

nyc_data_raw = dd.read_csv('nyc-parking-tickets/*.csv', dtype=dtypes, usecols=dtypes.keys())

### Section 5.1.1

In [4]:
# Listing 5.2
with ProgressBar():
    display(nyc_data_raw['Plate ID'].head())

[########################################] | 100% Completed | 1.45 ss


0    GBB9093
1    62416MB
2    78755JZ
3    63009MA
4    91648MC
Name: Plate ID, dtype: string

In [5]:
# Listing 5.3
with ProgressBar():
    display(nyc_data_raw[['Plate ID', 'Registration State']].head())

[########################################] | 100% Completed | 1.61 ss


Unnamed: 0,Plate ID,Registration State
0,GBB9093,NY
1,62416MB,NY
2,78755JZ,NY
3,63009MA,NY
4,91648MC,NY


In [6]:
# Listing 5.4
columns_to_select = ['Plate ID', 'Registration State']

with ProgressBar():
    display(nyc_data_raw[columns_to_select].head())

[########################################] | 100% Completed | 1.68 ss


Unnamed: 0,Plate ID,Registration State
0,GBB9093,NY
1,62416MB,NY
2,78755JZ,NY
3,63009MA,NY
4,91648MC,NY


### Section 5.1.2

In [7]:
# Listing 5.5
with ProgressBar():
    display(nyc_data_raw.drop('Violation Code', axis=1).head())

[########################################] | 100% Completed | 1.63 ss


Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,Street Code2,...,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
0,1283294138,GBB9093,NY,PAS,08/04/2013,SUBN,AUDI,P,37250,13610,...,GY,0,2013.0,-,0.0,,,,,
1,1283294151,62416MB,NY,COM,08/04/2013,VAN,FORD,P,37290,40404,...,WH,0,2012.0,-,0.0,,,,,
2,1283294163,78755JZ,NY,COM,08/05/2013,P-U,CHEVR,P,37030,31190,...,,0,0.0,-,0.0,,,,,
3,1283294175,63009MA,NY,COM,08/05/2013,VAN,FORD,P,37270,11710,...,WH,0,2010.0,-,0.0,,,,,
4,1283294187,91648MC,NY,COM,08/08/2013,TRLR,GMC,P,37240,12010,...,BR,0,2012.0,-,0.0,,,,,


In [8]:
# Listing 5.6
violationColumnNames = list(filter(lambda columnName: 'Violation' in columnName, nyc_data_raw.columns))

with ProgressBar():
    display(nyc_data_raw.drop(violationColumnNames, axis=1).head())

[########################################] | 100% Completed | 1.58 ss


Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,Street Code2,...,Law Section,Sub Division,Days Parking In Effect,From Hours In Effect,To Hours In Effect,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb
0,1283294138,GBB9093,NY,PAS,08/04/2013,SUBN,AUDI,P,37250,13610,...,408.0,F1,BBBBBBB,ALL,ALL,GY,0,2013.0,-,0.0
1,1283294151,62416MB,NY,COM,08/04/2013,VAN,FORD,P,37290,40404,...,408.0,C,BBBBBBB,ALL,ALL,WH,0,2012.0,-,0.0
2,1283294163,78755JZ,NY,COM,08/05/2013,P-U,CHEVR,P,37030,31190,...,408.0,F7,BBBBBBB,ALL,ALL,,0,0.0,-,0.0
3,1283294175,63009MA,NY,COM,08/05/2013,VAN,FORD,P,37270,11710,...,408.0,F1,BBBBBBB,ALL,ALL,WH,0,2010.0,-,0.0
4,1283294187,91648MC,NY,COM,08/08/2013,TRLR,GMC,P,37240,12010,...,408.0,E1,BBBBBBB,ALL,ALL,BR,0,2012.0,-,0.0


### Section 5.1.3

In [9]:
# Listing 5.7
nyc_data_renamed = nyc_data_raw.rename(columns={'Plate ID':'License Plate'})
nyc_data_renamed

Unnamed: 0_level_0,Summons Number,License Plate,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,Street Code2,Street Code3,Vehicle Expiration Date,Violation Location,Violation Precinct,Issuer Precinct,Issuer Code,Issuer Command,Issuer Squad,Violation Time,Time First Observed,Violation County,Violation In Front Of Or Opposite,House Number,Street Name,Intersecting Street,Date First Observed,Law Section,Sub Division,Violation Legal Code,Days Parking In Effect,From Hours In Effect,To Hours In Effect,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
npartitions=138,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
,uint32,string,string,string,string,uint16,string,string,string,uint32,uint32,uint32,string,string,float32,float32,float32,string,string,string,string,string,string,string,string,string,string,float32,string,string,string,string,string,string,string,float32,string,float32,string,string,string,string,string
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


### Section 5.1.4

In [10]:
# Listing 5.8
with ProgressBar():
    display(nyc_data_raw.loc[56].head(1))

[########################################] | 100% Completed | 2.36 ss


Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
56,1293090530,GES3519,NY,PAS,07/07/2013,40,SDN,HONDA,F,70630,...,BLACK,0,1997.0,-,0.0,,,,,


In [11]:
# Listing 5.9
with ProgressBar():
    display(nyc_data_raw.loc[100:200].head(100))

[########################################] | 100% Completed | 2.37 ss


Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
100,1294727205,XBD7628,VA,PAS,08/04/2013,17,SUBN,JEEP,P,14510,...,GRY,0,0.0,-,0.0,,,,,
101,1294727461,R613159,IL,PAS,07/17/2013,17,SDN,VOLKS,P,14510,...,BLUE,0,0.0,-,0.0,,,,,
102,1294727473,6TCX735,CA,PAS,07/18/2013,17,SDN,MAZDA,P,14510,...,BLACK,0,0.0,-,0.0,,,,,
103,1294727497,ZWZ43K,NJ,PAS,08/10/2013,17,SUBN,LINCO,P,14510,...,,0,0.0,-,0.0,,,,,
104,1295357240,T624858C,NY,PAS,07/22/2013,21,SUBN,TOYOT,X,28790,...,SILVE,0,2012.0,-,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1307953700,481CKW,NJ,PAS,07/27/2013,24,SUBN,NISSA,K,0,...,PURPL,0,0.0,-,0.0,,,,,
196,1307953724,W122KP,NY,PAS,07/28/2013,20,SDN,TOYOT,K,0,...,MAROO,0,1998.0,-,0.0,,,,,
197,1307953761,GAW6458,NY,PAS,08/17/2013,20,SDN,TOYOT,K,48502,...,BLACK,0,2001.0,-,0.0,,,,,
198,1307953852,GFP6251,NY,PAS,08/24/2013,27,SDN,NISSA,K,0,...,G/Y,0,2006.0,-,0.0,,,,,


In [12]:
# Listing 5.10
with ProgressBar():
    some_rows = nyc_data_raw.loc[100:200].head(100)
some_rows.drop(range(100, 200, 2))

[########################################] | 100% Completed | 2.51 ss


Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
101,1294727461,R613159,IL,PAS,07/17/2013,17,SDN,VOLKS,P,14510,...,BLUE,0,0.0,-,0.0,,,,,
103,1294727497,ZWZ43K,NJ,PAS,08/10/2013,17,SUBN,LINCO,P,14510,...,,0,0.0,-,0.0,,,,,
105,1295546486,42909JM,NY,COM,07/17/2013,19,DELV,MERCU,P,58590,...,BROWN,0,1996.0,-,0.0,,,,,
107,1302446484,452WW4,MA,PAS,06/24/2013,46,SDN,CHEVR,C,75530,...,YELLO,0,2013.0,-,0.0,,,,,
109,1302453774,GEX5296,NY,PAS,07/26/2013,14,SDN,ME/BE,C,36420,...,BLACK,0,0.0,-,0.0,,,,,
111,1302453798,867ZFX,CT,PAS,07/15/2013,17,SDN,ACURA,C,0,...,WHITE,0,0.0,-,0.0,,,,,
113,1302456593,GEZ1408,NY,PAS,07/15/2013,17,SUBN,HYUND,X,77730,...,BLK,0,2013.0,-,0.0,,,,,
115,1302456738,UCSLIM,NY,PAS,08/07/2013,17,SUBN,FORD,X,77730,...,BLK,0,2003.0,-,0.0,,,,,
117,1302466203,HPM2636,PA,PAS,08/08/2013,40,SUBN,CADIL,X,24245,...,SILVR,0,0.0,-,0.0,,,,,
119,1302466227,TWODAMAX,NY,SRF,08/05/2013,17,SDN,HYUND,X,53630,...,GRAY,0,2012.0,-,0.0,,,,,


### Section 5.2.1

In [13]:
# Listing 5.11
missing_values = nyc_data_raw.isnull().sum()
with ProgressBar():
    percent_missing = ((missing_values / nyc_data_raw.index.size) * 100).compute()
percent_missing

[########################################] | 100% Completed | 385.05 s


Summons Number                        0.000000
Plate ID                              0.020867
Registration State                    0.000000
Plate Type                            0.000000
Issue Date                            0.000000
Violation Code                        0.000000
Vehicle Body Type                     0.564922
Vehicle Make                          0.650526
Issuing Agency                        0.000000
Street Code1                          0.000000
Street Code2                          0.000000
Street Code3                          0.000000
Vehicle Expiration Date               0.000002
Violation Location                   15.142846
Violation Precinct                    0.000002
Issuer Precinct                       0.000002
Issuer Code                           0.000002
Issuer Command                       15.018851
Issuer Squad                         15.022566
Violation Time                        0.019207
Time First Observed                  90.040886
Violation Cou

### Section 5.2.2

In [14]:
# Listing 5.12
columns_to_drop = list(percent_missing[percent_missing >= 50].index)
nyc_data_clean_stage1 = nyc_data_raw.drop(columns_to_drop, axis=1)

In [15]:
print(columns_to_drop)

['Time First Observed', 'Intersecting Street', 'Violation Legal Code', 'Unregistered Vehicle?', 'Meter Number', 'No Standing or Stopping Violation', 'Hydrant Violation', 'Double Parking Violation']


### Section 5.2.3

In [16]:
# Listing 5.13
with ProgressBar():
    count_of_vehicle_colors = nyc_data_clean_stage1['Vehicle Color'].value_counts().compute()
most_common_color = count_of_vehicle_colors.sort_values(ascending=False).index[0]

# Fill missing vehicle color with the most common color
nyc_data_clean_stage2 = nyc_data_clean_stage1.fillna({'Vehicle Color': most_common_color})

[########################################] | 100% Completed | 286.30 s


### Section 5.2.4

In [17]:
# Listing 5.14

# Updated to compensate for bug identified in https://github.com/dask/dask/issues/5854

# Old code:
rows_to_drop = list(percent_missing[(percent_missing > 0) & (percent_missing < 5)].index)
nyc_data_clean_stage3 = nyc_data_clean_stage2.dropna(subset=rows_to_drop)

# New code splits the rows to drop into two separate lists and chains the dropna methods to drop all the columns we want
# from the bug report, less than 9 rows can work; issue maybe was resolved by 2022.
#rows_to_drop1 =['Plate ID', 'Vehicle Body Type', 'Vehicle Make', 'Vehicle Expiration Date', 'Violation Precinct', 'Issuer Precinct', 'Issuer Code', 'Violation Time', 'Street Name']
#rows_to_drop2 =['Date First Observed', 'Law Section', 'Sub Division', 'Vehicle Color', 'Vehicle Year', 'Feet From Curb']
#nyc_data_clean_stage3 = nyc_data_clean_stage2.dropna(subset=rows_to_drop1).dropna(subset=rows_to_drop2)

### Section 5.2.5

In [18]:
# Listing 5.15
remaining_columns_to_clean = list(percent_missing[(percent_missing >= 5) & (percent_missing < 50)].index)
nyc_data_raw.dtypes[remaining_columns_to_clean]

Violation Location                   string[pyarrow]
Issuer Command                       string[pyarrow]
Issuer Squad                         string[pyarrow]
Violation County                     string[pyarrow]
Violation In Front Of Or Opposite    string[pyarrow]
House Number                         string[pyarrow]
Days Parking In Effect               string[pyarrow]
From Hours In Effect                 string[pyarrow]
To Hours In Effect                   string[pyarrow]
Violation Post Code                  string[pyarrow]
Violation Description                string[pyarrow]
dtype: object

In [19]:
# Listing 5.16
unknown_default_dict = dict(map(lambda columnName: (columnName, 'Unknown'), remaining_columns_to_clean))

In [20]:
# Listing 5.17
nyc_data_clean_stage4 = nyc_data_clean_stage3.fillna(unknown_default_dict)

In [21]:
# Listing 5.18
with ProgressBar():
    print(nyc_data_clean_stage4.isnull().sum().compute())
    nyc_data_clean_stage4.persist()

[########################################] | 100% Completed | 378.84 s
Summons Number                       0
Plate ID                             0
Registration State                   0
Plate Type                           0
Issue Date                           0
Violation Code                       0
Vehicle Body Type                    0
Vehicle Make                         0
Issuing Agency                       0
Street Code1                         0
Street Code2                         0
Street Code3                         0
Vehicle Expiration Date              0
Violation Location                   0
Violation Precinct                   0
Issuer Precinct                      0
Issuer Code                          0
Issuer Command                       0
Issuer Squad                         0
Violation Time                       0
Violation County                     0
Violation In Front Of Or Opposite    0
House Number                         0
Street Name                     

### Section 5.3

In [22]:
# Listing 5.19
with ProgressBar():
    license_plate_types = nyc_data_clean_stage4['Plate Type'].value_counts().compute()
license_plate_types

[########################################] | 100% Completed | 336.01 s


Plate Type
RGC       860
THC       641
OMS    368952
SEM      1764
OMV       416
        ...  
VPL       129
ITP     12111
HAM       648
MCL      5105
STG       286
Name: count, Length: 90, dtype: int64[pyarrow]

In [23]:
# Listing 5.20
condition = nyc_data_clean_stage4['Plate Type'].isin(['PAS', 'COM'])
plate_type_masked = nyc_data_clean_stage4['Plate Type'].where(condition, 'Other')
nyc_data_recode_stage1 = nyc_data_clean_stage4.drop('Plate Type', axis=1)
nyc_data_recode_stage2 = nyc_data_recode_stage1.assign(PlateType=plate_type_masked)
nyc_data_recode_stage3 = nyc_data_recode_stage2.rename(columns={'PlateType':'Plate Type'})

In [24]:
# Listing 5.21
with ProgressBar():
    display(nyc_data_recode_stage3['Plate Type'].value_counts().compute())

[########################################] | 100% Completed | 321.00 s


Plate Type
PAS      30452502
COM       7966914
Other     3418586
Name: count, dtype: int64[pyarrow]

In [25]:
# Listing 5.22
single_color = list(count_of_vehicle_colors[count_of_vehicle_colors == 1].index)
condition = nyc_data_clean_stage4['Vehicle Color'].isin(single_color)
vehicle_color_masked = nyc_data_clean_stage4['Vehicle Color'].mask(condition, 'Other')
nyc_data_recode_stage4 = nyc_data_recode_stage3.drop('Vehicle Color', axis=1)
nyc_data_recode_stage5 = nyc_data_recode_stage4.assign(VehicleColor=vehicle_color_masked)
nyc_data_recode_stage6 = nyc_data_recode_stage5.rename(columns={'VehicleColor':'Vehicle Color'})

### Section 5.4

In [26]:
# Listing 5.23
from datetime import datetime
issue_date_parsed = nyc_data_recode_stage6['Issue Date'].apply(lambda x: datetime.strptime(x, "%m/%d/%Y"), meta=datetime)
nyc_data_derived_stage1 = nyc_data_recode_stage6.drop('Issue Date', axis=1)
nyc_data_derived_stage2 = nyc_data_derived_stage1.assign(IssueDate=issue_date_parsed)
nyc_data_derived_stage3 = nyc_data_derived_stage2.rename(columns={'IssueDate':'Issue Date'})

In [27]:
# Listing 5.24
with ProgressBar():
    display(nyc_data_derived_stage3['Issue Date'].head())

[########################################] | 100% Completed | 2.48 ss


0   2013-08-04
1   2013-08-04
2   2013-08-05
3   2013-08-05
4   2013-08-08
Name: Issue Date, dtype: datetime64[ns]

In [28]:
# Listing 5.25
issue_date_month_year = nyc_data_derived_stage3['Issue Date'].apply(lambda dt: dt.strftime("%Y%m"), meta=str)
nyc_data_derived_stage4 = nyc_data_derived_stage3.assign(IssueMonthYear=issue_date_month_year)
nyc_data_derived_stage5 = nyc_data_derived_stage4.rename(columns={'IssueMonthYear':'Citation Issued Month Year'})

In [29]:
# Listing 5.26
with ProgressBar():
    display(nyc_data_derived_stage5['Citation Issued Month Year'].head())

[########################################] | 100% Completed | 2.37 ss


0    201308
1    201308
2    201308
3    201308
4    201308
Name: Citation Issued Month Year, dtype: object

### Section 5.5.1

In [30]:
# Listing 5.27
months = ['201310','201410','201510','201610','201710']
condition = nyc_data_derived_stage5['Citation Issued Month Year'].isin(months)
october_citations = nyc_data_derived_stage5[condition]

with ProgressBar():
    display(october_citations.head())

[########################################] | 100% Completed | 6.58 ss


Unnamed: 0,Summons Number,Plate ID,Registration State,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,Street Code2,Street Code3,...,From Hours In Effect,To Hours In Effect,Vehicle Year,Feet From Curb,Violation Post Code,Violation Description,Plate Type,Vehicle Color,Issue Date,Citation Issued Month Year
1609,1340313923,GEK8055,NY,40,SUBN,HONDA,P,79630,40404,40404,...,ALL,ALL,2013.0,5.0,Unknown,Unknown,PAS,BROWN,2013-10-23,201310
23367,1351679867,XE726658,DE,20,P-U,DODGE,P,90980,0,0,...,ALL,ALL,2002.0,0.0,Unknown,Unknown,PAS,RED,2013-10-21,201310
24172,1351805253,42067JM,NY,14,DELV,FRUEH,P,25630,13610,24985,...,ALL,ALL,1999.0,0.0,Unknown,Unknown,COM,WHITE,2013-10-17,201310
32902,1355051060,76254JY,NY,46,DELV,FRUEH,P,68020,26760,66120,...,ALL,ALL,2007.0,0.0,Unknown,Unknown,COM,WHITE,2013-10-10,201310
32903,1355051071,44125MC,NY,46,VAN,FORD,P,68020,26490,26520,...,ALL,ALL,2011.0,0.0,Unknown,Unknown,COM,WHITE,2013-10-10,201310


In [31]:
# Listing 5.28
bound_date = '2016-4-25'
condition = nyc_data_derived_stage5['Issue Date'] > bound_date
citations_after_bound = nyc_data_derived_stage5[condition]

with ProgressBar():
    display(citations_after_bound.head())

[########################################] | 100% Completed | 6.71 ss


Unnamed: 0,Summons Number,Plate ID,Registration State,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,Street Code2,Street Code3,...,From Hours In Effect,To Hours In Effect,Vehicle Year,Feet From Curb,Violation Post Code,Violation Description,Plate Type,Vehicle Color,Issue Date,Citation Issued Month Year
3741,1346495701,GCJ8613,NY,20,SDN,DODGE,X,10210,19210,19250,...,ALL,ALL,2010.0,0.0,Unknown,Unknown,PAS,BLK,2017-08-04,201708
3748,1346551819,GHJ2373,NY,20,SUBN,FORD,X,0,0,0,...,ALL,ALL,2010.0,0.0,Unknown,Unknown,PAS,BLK,2019-07-22,201907
6905,1348077426,66811MB,NY,78,IR,FRUEH,P,25680,46320,8120,...,0900P,0500A,2007.0,0.0,Unknown,Unknown,COM,WHT,2053-08-02,205308
12967,1353305650,91609MC,NY,46,VAN,FORD,T,24890,18670,18690,...,ALL,ALL,2012.0,0.0,Unknown,Unknown,COM,BR,2016-08-07,201608
17144,1354617988,54015JV,NY,78,VAN,FORD,P,38590,50150,52290,...,0900P,0500A,2007.0,0.0,Unknown,Unknown,COM,WHITE,2032-07-28,203207


### Section 5.5.1

In [32]:
# Listing 5.29
with ProgressBar():
    condition = (nyc_data_derived_stage5['Issue Date'] > '2014-01-01') & (nyc_data_derived_stage5['Issue Date'] <= '2017-12-31')
    nyc_data_filtered = nyc_data_derived_stage5[condition]
    nyc_data_new_index = nyc_data_filtered.set_index('Citation Issued Month Year')

In [33]:
# Listing 5.30
years = ['2014', '2015', '2016', '2017']
months = ['01','02','03','04','05','06','07','08','09','10','11','12']
divisions = [year + month for year in years for month in months]

with ProgressBar():
    nyc_data_new_index.repartition(divisions=divisions).to_parquet('nyc_data_date_index', compression='snappy')
    
nyc_data_new_index = dd.read_parquet('nyc_data_date_index')

[########################################] | 100% Completed | 15m 39s
[#####################################   ] | 92% Completed | 17m 42ss


ValueError: Failed to convert partition to expected pyarrow schema:
    `ArrowTypeError('Expected a string or bytes dtype, got datetime64[ns]', 'Conversion failed for column Issue Date with type datetime64[ns]')`

Expected partition schema:
    Summons Number: uint32
    Plate ID: large_string
    Registration State: large_string
    Violation Code: uint16
    Vehicle Body Type: large_string
    Vehicle Make: large_string
    Issuing Agency: large_string
    Street Code1: uint32
    Street Code2: uint32
    Street Code3: uint32
    Vehicle Expiration Date: large_string
    Violation Location: large_string
    Violation Precinct: float
    Issuer Precinct: float
    Issuer Code: float
    Issuer Command: large_string
    Issuer Squad: large_string
    Violation Time: large_string
    Violation County: large_string
    Violation In Front Of Or Opposite: large_string
    House Number: large_string
    Street Name: large_string
    Date First Observed: large_string
    Law Section: float
    Sub Division: large_string
    Days Parking In Effect    : large_string
    From Hours In Effect: large_string
    To Hours In Effect: large_string
    Vehicle Year: float
    Feet From Curb: float
    Violation Post Code: large_string
    Violation Description: large_string
    Plate Type: large_string
    Vehicle Color: large_string
    Issue Date: string
    Citation Issued Month Year: string

Received partition schema:
    Summons Number: uint32
    Plate ID: large_string
    Registration State: large_string
    Violation Code: uint16
    Vehicle Body Type: large_string
    Vehicle Make: large_string
    Issuing Agency: large_string
    Street Code1: uint32
    Street Code2: uint32
    Street Code3: uint32
    Vehicle Expiration Date: large_string
    Violation Location: large_string
    Violation Precinct: float
    Issuer Precinct: float
    Issuer Code: float
    Issuer Command: large_string
    Issuer Squad: large_string
    Violation Time: large_string
    Violation County: large_string
    Violation In Front Of Or Opposite: large_string
    House Number: large_string
    Street Name: large_string
    Date First Observed: large_string
    Law Section: float
    Sub Division: large_string
    Days Parking In Effect    : large_string
    From Hours In Effect: large_string
    To Hours In Effect: large_string
    Vehicle Year: float
    Feet From Curb: float
    Violation Post Code: large_string
    Violation Description: large_string
    Plate Type: large_string
    Vehicle Color: large_string
    Issue Date: timestamp[ns]
    Citation Issued Month Year: string

This error *may* be resolved by passing in schema information for
the mismatched column(s) using the `schema` keyword in `to_parquet`.

In [None]:
with ProgressBar():
    display(nyc_data_new_index['Issue Date'].head())

[#######                                 ] | 19% Completed | 195.52 s

### Section 5.6.1

In [None]:
# Listing 5.31
import pandas as pd
nyc_temps = pd.read_csv('nyc-temp-data.csv')

# Filtered out only the relevant months from the temperature data to accelerate the join
nyc_temps_filtered = nyc_temps[nyc_temps.monthYear.isin(divisions)]

nyc_temps_indexed = nyc_temps_filtered.set_index(nyc_temps_filtered.monthYear.astype(str))
nyc_data_with_temps = nyc_data_new_index.join(nyc_temps_indexed, how='inner')

with ProgressBar():
    display(nyc_data_with_temps.head(15))

### Section 5.6.2

In [None]:
# Listing 5.32
fy16 = dd.read_csv('nyc-parking-tickets/Parking_Violations_Issued_-_Fiscal_Year_2016.csv', dtype=dtypes, usecols=dtypes.keys())
fy17 = dd.read_csv('nyc-parking-tickets/Parking_Violations_Issued_-_Fiscal_Year_2017.csv', dtype=dtypes, usecols=dtypes.keys())

fy1617 = fy16.append(fy17)

with ProgressBar():
    print(fy16['Summons Number'].count().compute())

with ProgressBar():
    print(fy17['Summons Number'].count().compute())

with ProgressBar():
    print(fy1617['Summons Number'].count().compute())


### Section 5.7.1

In [None]:
# Listing 5.33
with ProgressBar():
    if not os.path.exists('nyc-final-csv'):
        os.makedirs('nyc-final-csv') 
    nyc_data_with_temps.repartition(npartitions=1).to_csv('nyc-final-csv/part*.csv')

In [None]:
# Listing 5.33
with ProgressBar():
    if not os.path.exists('nyc-final-csv-compressed'):
        os.makedirs('nyc-final-csv-compressed')
    nyc_data_with_temps.to_csv(
        filename='nyc-final-csv-compressed/*', 
        compression='gzip', 
        sep='|', 
        na_rep='NULL', 
        header=False, 
        index=False)

### Listing 5.7.2

In [None]:
# Listing 5.35
# Added reset_index as later versions of Dask raise an error stating the index column can't be found
with ProgressBar():
    nyc_data_with_temps.reset_index(drop=True).to_parquet('nyc_final', compression='snappy')