## Load Libraries and Data

In [1]:
import numpy as np
import pandas as pd
import modules.cleaning_utils as cut

In [2]:
# Location of the raw NYPD historic arrests CSV export; the path is relative,
# so it resolves against the kernel's working directory.
arrest_data_csv = '../data/raw/NYPD_Arrests_Data__Historic__20241014.csv'

In [3]:
# Read the entire raw CSV into memory using pandas' default type inference
df = pd.read_csv(arrest_data_csv)
# A bare frame as the last expression renders pandas' truncated head/tail preview
df

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
0,220756993,11/19/2020,155.0,RAPE 2,104.0,RAPE,PL 1303001,F,B,41,0.0,18-24,M,BLACK,1013232.0,236725.0,40.816392,-73.895296,POINT (-73.89529641399997 40.816391847000034)
1,221995093,12/18/2020,177.0,SEXUAL ABUSE,116.0,SEX CRIMES,PL 1306503,F,K,84,0.0,45-64,M,WHITE,989013.0,192652.0,40.695469,-73.982825,POINT (-73.98282507899995 40.69546894100006)
2,221558560,12/09/2020,157.0,RAPE 1,104.0,RAPE,PL 1303501,F,S,120,0.0,<18,M,BLACK,962748.0,174174.0,40.644726,-74.077483,POINT (-74.07748315899995 40.64472613100002)
3,217890363,09/15/2020,,,,,PL 2650022,M,B,44,0.0,25-44,M,BLACK,1009412.0,245306.0,40.839956,-73.909065,POINT (-73.90906496999997 40.83995593800007)
4,219517698,10/21/2020,153.0,RAPE 3,104.0,RAPE,PL 1302502,F,Q,112,0.0,25-44,M,WHITE HISPANIC,1025420.0,202485.0,40.722364,-73.851474,POINT (-73.85147389399998 40.72236368700004)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5725517,59094011,02/25/2009,490.0,"STOLEN PROPERTY 3,POSSESSION",232.0,POSSESSION OF STOLEN PROPERTY 5,PL 1654000,M,Q,110,0.0,18-24,M,WHITE HISPANIC,1019751.0,206764.0,40.734133,-73.871903,POINT (-73.87190313999997 40.73413305400004)
5725518,57260078,02/13/2009,511.0,"CONTROLLED SUBSTANCE, POSSESSION 7",235.0,DANGEROUS DRUGS,PL 2200300,M,K,79,0.0,45-64,M,BLACK,998273.0,193029.0,40.696494,-73.949431,POINT (-73.94943081499997 40.69649389200004)
5725519,58924345,02/19/2009,397.0,"ROBBERY,UNCLASSIFIED,OPEN AREAS",105.0,ROBBERY,PL 1601504,F,B,44,0.0,<18,M,BLACK,1006032.0,243764.0,40.835733,-73.921285,POINT (-73.92128542599994 40.83573258000007)
5725520,59093573,02/25/2009,779.0,"PUBLIC ADMINISTRATION,UNCLASSIFIED FELONY",126.0,MISCELLANEOUS PENAL LAW,PL 215510B,F,M,30,0.0,25-44,F,WHITE,999110.0,239415.0,40.823810,-73.946310,POINT (-73.946309849 40.82381017900008)


In [4]:
# Convert the literal placeholder string '(null)' into NaN so pandas treats it as missing
df = df.replace('(null)', np.nan)

# Capture the original row count and memory usage for the before/after comparison later.
# NOTE(review): memory_usage() defaults to deep=False, so object (string) columns are
# counted by pointer size only and this baseline understates the real footprint.
original_length = len(df.index)
original_memory_usage = df.memory_usage().sum()

In [5]:
# Inspect column dtypes and the reported (shallow) memory usage of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5725522 entries, 0 to 5725521
Data columns (total 19 columns):
 #   Column             Dtype  
---  ------             -----  
 0   ARREST_KEY         int64  
 1   ARREST_DATE        object 
 2   PD_CD              float64
 3   PD_DESC            object 
 4   KY_CD              float64
 5   OFNS_DESC          object 
 6   LAW_CODE           object 
 7   LAW_CAT_CD         object 
 8   ARREST_BORO        object 
 9   ARREST_PRECINCT    int64  
 10  JURISDICTION_CODE  float64
 11  AGE_GROUP          object 
 12  PERP_SEX           object 
 13  PERP_RACE          object 
 14  X_COORD_CD         float64
 15  Y_COORD_CD         float64
 16  Latitude           float64
 17  Longitude          float64
 18  Lon_Lat            object 
dtypes: float64(7), int64(2), object(10)
memory usage: 830.0+ MB


In [6]:
# X/Y Coord and Lon_Lat are redundant because Latitude and Longitude are already present.
# Rebind to a new frame instead of mutating in place: `inplace=True` offers no
# performance benefit and makes cell state harder to reason about on re-runs.
df = df.drop(columns=['Y_COORD_CD', 'X_COORD_CD', 'Lon_Lat'])

## Missing Values

In [7]:
# Count missing values per column
df.isnull().sum()

ARREST_KEY               0
ARREST_DATE              0
PD_CD                  876
PD_DESC               9756
KY_CD                 9756
OFNS_DESC             9756
LAW_CODE               198
LAW_CAT_CD           23602
ARREST_BORO              8
ARREST_PRECINCT          0
JURISDICTION_CODE       10
AGE_GROUP               17
PERP_SEX                 0
PERP_RACE                0
Latitude                 1
Longitude                1
dtype: int64

In [8]:
# Bucket columns by their fraction of missing values.
# Intended buckets (the thresholds live in cleaning_utils, not shown here — confirm there):
# Highly missing: > 90% missingness
# Moderately missing: 1-90% missingness
# Low missing: < 1% missingness
highly_missing, moderately_missing, low_missing = cut.calculate_missing_ratios(df)

# No columns with high missingness
highly_missing

Series([], dtype: float64)

In [9]:
# No columns fall into the moderate-missingness bucket
moderately_missing

Series([], dtype: float64)

In [10]:
# Columns in the low-missingness bucket; the values shown are fractions of rows, not percentages.
# Rather than introduce biases via imputation, we will drop these records
low_missing

PD_CD                1.529992e-04
PD_DESC              1.703949e-03
KY_CD                1.703949e-03
OFNS_DESC            1.703949e-03
LAW_CODE             3.458200e-05
LAW_CAT_CD           4.122244e-03
ARREST_BORO          1.397253e-06
JURISDICTION_CODE    1.746566e-06
AGE_GROUP            2.969162e-06
Latitude             1.746566e-07
Longitude            1.746566e-07
dtype: float64

In [11]:
# The remaining missing values represent tiny amounts of the overall data (< 1% missingness),
# so drop every row that is missing a value in any of the low-missingness columns.
low_missing_index = low_missing.index.tolist()

# A single dropna pass replaces the per-column filter loop: same surviving rows and
# same index labels, without building an intermediate copy of the frame per column.
df = df.dropna(subset=low_missing_index)

In [12]:
# Confirm that no missing values remain in any column
df.isnull().sum()

ARREST_KEY           0
ARREST_DATE          0
PD_CD                0
PD_DESC              0
KY_CD                0
OFNS_DESC            0
LAW_CODE             0
LAW_CAT_CD           0
ARREST_BORO          0
ARREST_PRECINCT      0
JURISDICTION_CODE    0
AGE_GROUP            0
PERP_SEX             0
PERP_RACE            0
Latitude             0
Longitude            0
dtype: int64

In [13]:
# Percentage of the original rows removed by the cleaning steps so far
print(f'Data Loss: {(100 * (original_length - len(df.index)) / original_length):.3f} %')

Data Loss: 0.580 %


## Data Validation and Type Conversion

In [14]:
# Data runs from 2006 to 2023 according to the official documentation
# Validate ARREST_DATE and convert it to datetime
# (the exact validity rules are defined in cleaning_utils — not visible here)
df = cut.validate_dates_and_times(df, ['ARREST_DATE'])

# According to maps, NY longitudes should be between -74.27 and -73.68, and latitudes between 40.49 and 40.92
# NOTE(review): presumably validate_coordinates enforces these bounds — confirm in cleaning_utils
df = cut.validate_coordinates(df)

In [15]:
# A number of offense codes (KY_CD) carry more than one description (OFNS_DESC),
# often a truncated or reworded variant of the same label
multi_ky_ofns = cut.multiple_descriptions(df, 'KY_CD', 'OFNS_DESC')
multi_ky_ofns

{101.0: ['MURDER & NON-NEGL. MANSLAUGHTE', 'MURDER & NON-NEGL. MANSLAUGHTER'],
 103.0: ['HOMICIDE-NEGLIGENT,UNCLASSIFIE', 'HOMICIDE-NEGLIGENT,UNCLASSIFIED'],
 116.0: ['FELONY SEX CRIMES', 'SEX CRIMES'],
 120.0: ['CHILD ABANDONMENT/NON SUPPORT',
  'CHILD ABANDONMENT/NON SUPPORT 1',
  'ENDAN WELFARE INCOMP'],
 121.0: ['CRIMINAL MISCHIEF & RELATED OF',
  'CRIMINAL MISCHIEF & RELATED OFFENSES'],
 124.0: ['KIDNAPPING',
  'KIDNAPPING & RELATED OFFENSES',
  'KIDNAPPING AND RELATED OFFENSES'],
 125.0: ['NYS LAWS-UNCLASSIFIED FELONY',
  'OTHER STATE LAWS',
  'OTHER STATE LAWS (NON PENAL LA'],
 232.0: ['POSSESSION OF STOLEN PROPERTY', 'POSSESSION OF STOLEN PROPERTY 5'],
 233.0: ['FORCIBLE TOUCHING', 'SEX CRIMES'],
 343.0: ['OTHER OFFENSES RELATED TO THEF',
  'OTHER OFFENSES RELATED TO THEFT',
  'THEFT OF SERVICES'],
 345.0: ['ENDAN WELFARE INCOMP', 'OFFENSES RELATED TO CHILDREN'],
 347.0: ['INTOXICATED & IMPAIRED DRIVING', 'INTOXICATED/IMPAIRED DRIVING'],
 349.0: ['DISRUPTION OF A RELIGIOUS SERV

In [16]:
# Replace the descriptions using a self-chosen mapping.
# Presumably index_map[i] is the position, within the i-th code's description list in
# multi_ky_ofns, of the description to keep — confirm in cleaning_utils.
# NOTE(review): if so, the mapping is positional and depends on the dict's key order.
index_map = [
    1, 1, 0, 0, 1,
    2, 0, 0, 0, 1,
    1, 1, 1, 1, 0,
    1, 0, 0, 1, 1,
    1, 1
]
# The return value is discarded — presumably df is modified in place (TODO confirm)
cut.replace_description(df, multi_ky_ofns, index_map, 'KY_CD', 'OFNS_DESC')

In [17]:
# Run the same check for PD_CD and PD_DESC: find codes that carry more than one description
multi_pd_desc = cut.multiple_descriptions(df, 'PD_CD', 'PD_DESC')
multi_pd_desc

{104.0: ['VEHICULAR ASSAULT (INTOX DRIVE', 'VEHICULAR ASSAULT (INTOX DRIVER)'],
 106.0: ['ASSAULT 2,1,PEACE OFFICER', 'ASSAULT POLICE/PEACE OFFICER'],
 107.0: ['END WELFARE VULNERABLE ELDERLY PERSON',
  'ENDANGERING VULNERABLE ELDERLY'],
 112.0: ['MENACING 1ST DEGREE (VICT NOT',
  'MENACING 1ST DEGREE (VICT NOT PEACE OFFICER)'],
 122.0: ['HOMICIDE, NEGLIGENT, VEHICLE,',
  'HOMICIDE, NEGLIGENT, VEHICLE, INTOX DRIVER'],
 125.0: ['HOMICIDE,NEGLIGENT,UNCLASSIFIE', 'HOMICIDE,NEGLIGENT,UNCLASSIFIED'],
 129.0: ['MANSLAUGHTER,UNCLASSIFIED - NO',
  'MANSLAUGHTER,UNCLASSIFIED - NON NEGLIGENT'],
 178.0: ['FAC. SEXUAL OFFENSE W/CONTROLL',
  'FAC. SEXUAL OFFENSE W/CONTROLLED SUBSTANCE'],
 179.0: ['AGGRAVATED SEXUAL ASBUSE', 'SEXUAL ABUSE 1'],
 180.0: ['COURSE OF SEXUAL CONDUCT AGAIN',
  'COURSE OF SEXUAL CONDUCT AGAINST A CHILD'],
 201.0: ['TRESPASS 4,CRIMINAL', 'TRESPASS 4,CRIMINAL SUB 2'],
 244.0: ['BURGLARY,UNCLASSIFIED,UNKNOWN',
  'BURGLARY,UNCLASSIFIED,UNKNOWN TIME'],
 248.0: ['RADIO DEVICES,U

In [18]:
# Self-chosen mapping for the PD_CD descriptions.
# Presumably index_map[i] picks which entry of the i-th code's description list in
# multi_pd_desc to keep — confirm in cleaning_utils.
# NOTE(review): this rebinds the `index_map` name already used for the KY_CD mapping.
index_map = [
    1, 1, 1, 1, 1,
    1, 1, 1, 0, 1,
    0, 1, 1, 1, 1,
    0, 1, 1, 1, 1,
    1, 0, 1, 1, 1,
    2, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    0, 1, 1, 0, 0,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    0, 1, 1, 1, 0,
    0, 0, 0, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 0, 1, 0, 1,
    1, 1, 1, 0, 0,
    0, 1, 1, 1, 0
]
# The return value is discarded — presumably df is modified in place (TODO confirm)
cut.replace_description(df, multi_pd_desc, index_map, 'PD_CD', 'PD_DESC')

In [19]:
# List the distinct law codes present in the data
df.LAW_CODE.unique()

array(['PL 1303001', 'PL 1306503', 'PL 1303501', ..., 'ECLH080700',
       'ABC0079000', 'AC 2652100'], dtype=object)

In [20]:
# Distinct offense-level categories.
# F = Felony, M = Misdemeanor, V = Violation, I = Traffic Infraction; the meaning of '9' is unclear
df.LAW_CAT_CD.unique()

array(['F', 'M', 'V', 'I', '9'], dtype=object)

In [21]:
# Show the records whose offense level is the unexplained '9' category
df.loc[df['LAW_CAT_CD'] == '9']

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,Latitude,Longitude
1149345,261415958,2023-01-05,849.0,"NY STATE LAWS,UNCLASSIFIED VIOLATION",677.0,OTHER STATE LAWS,CPL5700600,9,Q,102,0.0,25-44,M,WHITE,40.712206,-73.825952
1150243,261786919,2023-01-12,849.0,"NY STATE LAWS,UNCLASSIFIED VIOLATION",677.0,OTHER STATE LAWS,CPL5700600,9,Q,113,3.0,25-44,M,BLACK,40.679981,-73.776234
1151925,262820315,2023-02-01,849.0,"NY STATE LAWS,UNCLASSIFIED VIOLATION",677.0,OTHER STATE LAWS,CPL5700600,9,Q,102,0.0,25-44,F,WHITE,40.712206,-73.825952
1155242,264051045,2023-02-22,849.0,"NY STATE LAWS,UNCLASSIFIED VIOLATION",677.0,OTHER STATE LAWS,CPL5700600,9,Q,102,0.0,25-44,M,BLACK,40.712206,-73.825952
1155487,264430516,2023-03-02,849.0,"NY STATE LAWS,UNCLASSIFIED VIOLATION",677.0,OTHER STATE LAWS,CPL5700600,9,Q,102,0.0,25-44,F,BLACK,40.713859,-73.830768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1391245,279504937,2023-12-26,849.0,"NY STATE LAWS,UNCLASSIFIED VIOLATION",677.0,OTHER STATE LAWS,CPL5700600,9,Q,113,3.0,25-44,F,ASIAN / PACIFIC ISLANDER,40.679981,-73.776234
1391590,278897966,2023-12-12,849.0,"NY STATE LAWS,UNCLASSIFIED VIOLATION",677.0,OTHER STATE LAWS,CPL5700600,9,Q,113,3.0,25-44,M,WHITE HISPANIC,40.679981,-73.776234
1391655,277114485,2023-11-07,849.0,"NY STATE LAWS,UNCLASSIFIED VIOLATION",677.0,OTHER STATE LAWS,CPL5700600,9,M,5,0.0,25-44,F,BLACK,40.714957,-74.005654
1391824,278821522,2023-12-11,849.0,"NY STATE LAWS,UNCLASSIFIED VIOLATION",677.0,OTHER STATE LAWS,CPL5700600,9,Q,113,3.0,25-44,M,WHITE HISPANIC,40.679981,-73.776234


In [22]:
# Looking closer, category '9' covers unrelated offense descriptions, so what it denotes is unclear
# For example, cannabis possession is now legal in NYC and not defined as a crime
df[df.LAW_CAT_CD.isin(['9'])].PD_DESC.unique()

array(['NY STATE LAWS,UNCLASSIFIED VIOLATION', 'WEAPONS, POSSESSION, ETC',
       'CANNABIS POSSESSION'], dtype=object)

In [23]:
# Category '9' accounts for only 611 records, so keep everything except those rows
df = df.loc[df['LAW_CAT_CD'] != '9']

In [25]:
# Validate age, sex, and race columns, converting invalid values to UNKNOWN or U
# (per the original note; the actual replacement rules live in cleaning_utils — not visible here)
df = cut.validate_age(df, ['AGE_GROUP'])
df = cut.validate_sex(df, ['PERP_SEX'])
df = cut.validate_race(df, ['PERP_RACE'])

In [26]:
# Define columns for type conversions: numeric code columns and string (object) columns
numeric_col = ['ARREST_KEY', 'PD_CD', 'KY_CD', 'ARREST_PRECINCT', 'JURISDICTION_CODE']
object_col = ['PD_DESC', 'OFNS_DESC', 'LAW_CODE', 'LAW_CAT_CD', 'ARREST_BORO',
              'AGE_GROUP', 'PERP_RACE', 'PERP_SEX']

# Perform type conversions; the following df.info() shows both groups ending up as `category`.
# NOTE(review): ARREST_KEY looks like a per-record identifier; if it is (near-)unique,
# a categorical encoding saves nothing — TODO confirm it should be converted.
df = cut.convert_to_categorical(df, numeric_col, object_col)

In [27]:
# Inspect dtypes and memory usage after the type conversions
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5684630 entries, 0 to 5725521
Data columns (total 16 columns):
 #   Column             Dtype         
---  ------             -----         
 0   ARREST_KEY         category      
 1   ARREST_DATE        datetime64[ns]
 2   PD_CD              category      
 3   PD_DESC            category      
 4   KY_CD              category      
 5   OFNS_DESC          category      
 6   LAW_CODE           category      
 7   LAW_CAT_CD         category      
 8   ARREST_BORO        category      
 9   ARREST_PRECINCT    category      
 10  JURISDICTION_CODE  category      
 11  AGE_GROUP          category      
 12  PERP_SEX           category      
 13  PERP_RACE          category      
 14  Latitude           float64       
 15  Longitude          float64       
dtypes: category(13), datetime64[ns](1), float64(2)
memory usage: 449.0 MB


In [28]:
# Compare loss of data and memory saved against the baselines captured right after loading.
# NOTE(review): both memory figures come from memory_usage() with its default deep=False,
# which counts object (string) columns by pointer size only, so the original frame's
# footprint — and therefore the saving — is likely understated.
print(f'Total Data Loss: {(100 * (original_length - len(df.index)) / original_length):.3f} %')
print(f'Total Memory Saved: {(original_memory_usage - df.memory_usage().sum()) / (1024 ** 2):.2f} MB')

Total Data Loss: 0.714 %
Total Memory Saved: 380.99 MB


In [29]:
# Export the cleaned data as two gzip-compressed parquet files of (nearly) equal
# row count, to abide by GitHub's file size limits.
split_at = len(df.index) // 2

# First half: rows [0, split_at)
first_half = df.iloc[:split_at]
first_half.to_parquet('../data/cleaned/arrest_data_historic_cleaned_1.parquet.gz',
                      index=False, compression='gzip')

# Second half: rows [split_at, end)
second_half = df.iloc[split_at:]
second_half.to_parquet('../data/cleaned/arrest_data_historic_cleaned_2.parquet.gz',
                       index=False, compression='gzip')