## Load Libraries and Data

In [1]:
import numpy as np
import pandas as pd
import modules.cleaning_utils as cut

In [2]:
# Path to the raw NYPD shooting-incident CSV, given relative to the notebook's working directory
shooting_data_csv = '../data/raw/NYPD_Shooting_Incident_Data__Historic__20241014.csv'

In [3]:
# Load the raw CSV into a DataFrame using pandas' default parsing options
df = pd.read_csv(shooting_data_csv)
# Bare last expression: renders the frame as the cell output for a quick visual check
df

Unnamed: 0,INCIDENT_KEY,OCCUR_DATE,OCCUR_TIME,BORO,LOC_OF_OCCUR_DESC,PRECINCT,JURISDICTION_CODE,LOC_CLASSFCTN_DESC,LOCATION_DESC,STATISTICAL_MURDER_FLAG,...,PERP_SEX,PERP_RACE,VIC_AGE_GROUP,VIC_SEX,VIC_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
0,244608249,05/05/2022,00:10:00,MANHATTAN,INSIDE,14,0.0,COMMERCIAL,VIDEO STORE,True,...,M,BLACK,25-44,M,BLACK,986050.000,214231.000000,40.754692,-73.993500,POINT (-73.9935 40.754692)
1,247542571,07/04/2022,22:20:00,BRONX,OUTSIDE,48,0.0,STREET,(null),True,...,(null),(null),18-24,M,BLACK,1016802.000,250581.000000,40.854402,-73.882330,POINT (-73.88233 40.854402)
2,84967535,05/27/2012,19:35:00,QUEENS,,103,0.0,,,False,...,,,18-24,M,BLACK,1048632.000,198262.000000,40.710634,-73.767773,POINT (-73.76777349199995 40.71063412500007)
3,202853370,09/24/2019,21:00:00,BRONX,,42,0.0,,,False,...,M,UNKNOWN,25-44,M,BLACK,1014493.000,242565.000000,40.832417,-73.890714,POINT (-73.89071440599997 40.832416753000075)
4,27078636,02/25/2007,21:00:00,BROOKLYN,,83,0.0,,,False,...,M,BLACK,25-44,M,BLACK,1009149.375,190104.703125,40.688443,-73.910219,POINT (-73.91021857399994 40.68844345900004)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28557,265354835,03/19/2023,23:48:00,BRONX,INSIDE,47,0.0,COMMERCIAL,GROCERY/BODEGA,True,...,M,BLACK,18-24,M,BLACK,1025687.000,268586.000000,40.903785,-73.850098,POINT (-73.850098 40.903785)
28558,272968931,08/16/2023,02:46:00,BRONX,OUTSIDE,41,0.0,STREET,(null),False,...,F,BLACK,45-64,M,BLACK,1014639.000,240066.000000,40.825549,-73.890195,POINT (-73.890195 40.825549)
28559,270489846,06/27/2023,12:27:00,BRONX,INSIDE,41,0.0,DWELLING,MULTI DWELL - APT BUILD,True,...,M,BLACK,25-44,M,BLACK,1012221.000,238552.000000,40.821404,-73.898938,POINT (-73.898938 40.821404)
28560,271021661,07/08/2023,11:27:00,QUEENS,OUTSIDE,102,0.0,STREET,BEAUTY/NAIL SALON,False,...,M,WHITE HISPANIC,65+,M,ASIAN / PACIFIC ISLANDER,1028856.000,192785.000000,40.695717,-73.839138,POINT (-73.839138 40.695717)


In [4]:
# Replace the literal placeholder string '(null)' with NaN so pandas treats those cells as missing
df = df.replace('(null)', np.nan)

# Capture the original row count and memory usage; both are compared against the
# cleaned frame at the end of the notebook
# NOTE(review): memory_usage() is called with its default deep=False, which per the pandas
# docs does not introspect object-dtype columns, so string columns are likely undercounted
original_length = len(df.index)
original_memory_usage = df.memory_usage().sum()

In [5]:
# Summarise dtypes and non-null counts per column after the '(null)' replacement
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28562 entries, 0 to 28561
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   INCIDENT_KEY             28562 non-null  int64  
 1   OCCUR_DATE               28562 non-null  object 
 2   OCCUR_TIME               28562 non-null  object 
 3   BORO                     28562 non-null  object 
 4   LOC_OF_OCCUR_DESC        2966 non-null   object 
 5   PRECINCT                 28562 non-null  int64  
 6   JURISDICTION_CODE        28560 non-null  float64
 7   LOC_CLASSFCTN_DESC       2964 non-null   object 
 8   LOCATION_DESC            11874 non-null  object 
 9   STATISTICAL_MURDER_FLAG  28562 non-null  bool   
 10  PERP_AGE_GROUP           18077 non-null  object 
 11  PERP_SEX                 18111 non-null  object 
 12  PERP_RACE                18111 non-null  object 
 13  VIC_AGE_GROUP            28562 non-null  object 
 14  VIC_SEX               

In [6]:
# Drop columns of little interest:
# - the location-description columns are incomplete and not very useful
# - X/Y coordinates and Lon_Lat are redundant with the Latitude and Longitude columns
# Reassign rather than use inplace=True, matching the `df = ...` style of the other cells
df = df.drop(columns=['LOC_OF_OCCUR_DESC', 'LOC_CLASSFCTN_DESC', 'LOCATION_DESC',
                      'Y_COORD_CD', 'X_COORD_CD', 'Lon_Lat'])

## Missing Values

In [7]:
# Count missing values per remaining column
df.isnull().sum()

INCIDENT_KEY                   0
OCCUR_DATE                     0
OCCUR_TIME                     0
BORO                           0
PRECINCT                       0
JURISDICTION_CODE              2
STATISTICAL_MURDER_FLAG        0
PERP_AGE_GROUP             10485
PERP_SEX                   10451
PERP_RACE                  10451
VIC_AGE_GROUP                  0
VIC_SEX                        0
VIC_RACE                       0
Latitude                      59
Longitude                     59
dtype: int64

In [8]:
# Bucket columns by their share of missing values
# Highly missing: > 90% missingness
# Moderately missing: 1-90% missingness
# Low missing: < 1% missingness
# NOTE(review): the thresholds above are as stated in this notebook; they are applied
# inside the helper, which is not visible here
highly_missing, moderately_missing, low_missing = cut.calculate_missing_ratios(df)

# No columns with high missingness
highly_missing

Series([], dtype: float64)

In [9]:
# Some columns have moderate missingness
# All of them are perpetrator age, sex and race columns
moderately_missing

PERP_AGE_GROUP    0.367096
PERP_SEX          0.365906
PERP_RACE         0.365906
dtype: float64

In [10]:
# Run the age, sex and race validators, converting invalid values to UNKNOWN or U
# Victim columns are validated as well, not only the perpetrator columns
df = (
    df
    .pipe(cut.validate_age, ['PERP_AGE_GROUP', 'VIC_AGE_GROUP'])
    .pipe(cut.validate_sex, ['PERP_SEX', 'VIC_SEX'])
    .pipe(cut.validate_race, ['PERP_RACE', 'VIC_RACE'])
)

In [11]:
# Columns with low missingness (< 1%)
# Rather than introduce biases via imputation, the rows missing these values will be dropped
low_missing

JURISDICTION_CODE    0.000070
Latitude             0.002066
Longitude            0.002066
dtype: float64

In [12]:
# The rest of the missing values represent tiny amounts of the overall data (< 1% missingness)
# Drop every row that is missing a value in any of the low-missingness columns
low_missing_index = low_missing.index.tolist()

# A single dropna over the column subset replaces filtering the frame once per column
df = df.dropna(subset=low_missing_index)

In [13]:
# Re-check missing values per column after validation and the row drops
df.isnull().sum()

INCIDENT_KEY               0
OCCUR_DATE                 0
OCCUR_TIME                 0
BORO                       0
PRECINCT                   0
JURISDICTION_CODE          0
STATISTICAL_MURDER_FLAG    0
PERP_AGE_GROUP             0
PERP_SEX                   0
PERP_RACE                  0
VIC_AGE_GROUP              0
VIC_SEX                    0
VIC_RACE                   0
Latitude                   0
Longitude                  0
dtype: int64

In [14]:
# Data loss after all above cleaning, as a percentage of the row count captured after loading
print(f'Data Loss: {(100 * (original_length - len(df.index)) / original_length):.3f} %')

Data Loss: 0.214 %


## Data Validation and Type Conversion

In [15]:
df = (
    df
    # Data runs from 2006 to 2023 according to the official documentation;
    # validate the dates and convert them to datetime
    .pipe(cut.validate_dates_and_times, ['OCCUR_DATE'])
    # According to maps, NY longitudes should be between -74.27 and -73.68,
    # and latitudes between 40.49 and 40.92
    .pipe(cut.validate_coordinates)
)

In [16]:
# Define columns for type conversions (two groups, passed separately to the helper)
# NOTE(review): PRECINCT is listed in both groups — confirm which one is intended and
# how the helper treats a column that appears in both
numeric_col = ['INCIDENT_KEY', 'PRECINCT']
object_col = ['BORO', 'PRECINCT', 'PERP_AGE_GROUP', 'PERP_RACE', 'PERP_SEX',
              'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX']

# Perform type conversions
df = cut.convert_to_categorical(df, numeric_col, object_col)

In [17]:
# Inspect dtypes and non-null counts after cleaning and type conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28501 entries, 0 to 28561
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   INCIDENT_KEY             28501 non-null  category      
 1   OCCUR_DATE               28501 non-null  datetime64[ns]
 2   OCCUR_TIME               28501 non-null  object        
 3   BORO                     28501 non-null  category      
 4   PRECINCT                 28501 non-null  category      
 5   JURISDICTION_CODE        28501 non-null  float64       
 6   STATISTICAL_MURDER_FLAG  28501 non-null  bool          
 7   PERP_AGE_GROUP           28501 non-null  category      
 8   PERP_SEX                 28501 non-null  category      
 9   PERP_RACE                28501 non-null  category      
 10  VIC_AGE_GROUP            28501 non-null  category      
 11  VIC_SEX                  28501 non-null  category      
 12  VIC_RACE                 28501 non-nu

In [18]:
# Compare loss of data and memory saved against the values captured right after loading
# NOTE(review): memory_usage() defaults to deep=False, which per the pandas docs does not
# introspect object-dtype columns; the saving printed here is therefore a shallow
# estimate — confirm whether deep=True is wanted for both measurements
print(f'Total Data Loss: {(100 * (original_length - len(df.index)) / original_length):.3f} %')
print(f'Total Memory Saved: {(original_memory_usage - df.memory_usage().sum()) / (1024 ** 2):.2f} MB')

Total Data Loss: 0.214 %
Total Memory Saved: 2.10 MB


In [19]:
# Write cleaned data to parquet file (the DataFrame index is not written)
# NOTE(review): compression='gzip' selects the codec used inside the Parquet file; the
# output is still a Parquet file rather than a gzip archive, so the '.gz' suffix may
# mislead — confirm with downstream readers before renaming
df.to_parquet('../data/cleaned/shooting_data_historic_cleaned.parquet.gz', index = False, compression = 'gzip')