# Preprocessing

This notebook preprocesses the data in chunks because the original dataset is too large to be loaded and processed in one go with pandas. The combined output is then pickled and stored next to the dataset.

In [3]:
import pandas as pd
import geopandas as gpd

%matplotlib inline

Column Names:  
1. g
1. ID
1. Case Number
1. Date
1. Block
1. IUCR
1. Primary Type
1. Description
1. Location Description
1. Arrest
1. Domestic
1. Beat
1. District
1. Ward
1. Community Area
1. FBI Code
1. X Coordinate
1. Y Coordinate
1. Year
1. Updated On

In [2]:
# RUNS IN 20 MINS
# Stream the line-delimited JSON export in fixed-size chunks, clean each chunk
# on its own and collect the cleaned chunks in `dfChunks_list` so they can be
# concatenated afterwards.

# rows lacking either of these values are discarded
COORDINATE_COLUMNS = ['X Coordinate', 'Y Coordinate']
# columns dropped from every chunk to save space
UNUSED_COLUMNS = ['Beat', 'Ward', 'Updated On', 'IUCR', 'Case Number',
                  'X Coordinate', 'Y Coordinate', 'Community Area', 'Block', 'Year']
# columns converted to the pandas category dtype
CATEGORICAL_COLUMNS = ['Primary Type', 'Description', 'Location Description', 'District', 'FBI Code']
# string flag -> integer; any value that is not one of these keys becomes <NA>
FLAG_TO_INT = {'true': 1, 'false': 0}

dfChunks_list = []
# using the reader as a context manager releases the file handle even when a
# chunk fails half-way through the (long) run
with pd.read_json("../data/chicagoCrimes/Chicago_Crimes.json", lines=True, chunksize=50_000) as chunkReader:
    for i, dfChunk in enumerate(chunkReader):
        # preprocessing each chunk
        print(f'processing chunk #{(i+1):03d}..')

        # remove entries without coordinates to avoid plotting problems, then
        # remove unused columns to save space (new frame instead of inplace edits)
        dfChunk = dfChunk.dropna(subset=COORDINATE_COLUMNS).drop(columns=UNUSED_COLUMNS)

        # convert binary columns from string to int8 (maybe switch to bool later?)
        dfChunk['Arrest'] = dfChunk['Arrest'].map(FLAG_TO_INT).astype("Int8")
        dfChunk['Domestic'] = dfChunk['Domestic'].map(FLAG_TO_INT).astype("Int8")

        # assign a score to each crime: 27 minus the numeric prefix (first two
        # characters) of the FBI code
        # NOTE(review): presumably lower FBI codes mean more serious offences and
        # 27 is one above the highest code -- TODO confirm
        dfChunk['Score Crime'] = 27 - pd.to_numeric(dfChunk['FBI Code'].str[:2]).astype("Int8")
        # an arrest contributes 2 points, no arrest 0
        dfChunk['Score Arrest'] = (dfChunk['Arrest'] * 2).astype("Int8")

        # convert numerical datatypes to fitting containers to save space
        dfChunk['District'] = dfChunk['District'].astype('Int8')
        dfChunk['ID'] = dfChunk['ID'].astype('Int32')

        # convert string types to categorical to save space
        dfChunk[CATEGORICAL_COLUMNS] = dfChunk[CATEGORICAL_COLUMNS].astype("category")

        # parse the WKT strings in 'g' into geometry objects
        dfChunk['g'] = gpd.GeoSeries.from_wkt(dfChunk['g'])

        # convert dates to datetime objects
        dfChunk['Date'] = pd.to_datetime(dfChunk['Date'])

        dfChunks_list.append(dfChunk)

# drop the loop variable; guarded because the name is never bound when the
# reader yields no chunks at all
if dfChunks_list:
    del dfChunk


processing chunk #001..
processing chunk #002..
processing chunk #003..
processing chunk #004..
processing chunk #005..
processing chunk #006..
processing chunk #007..
processing chunk #008..
processing chunk #009..
processing chunk #010..
processing chunk #011..
processing chunk #012..
processing chunk #013..
processing chunk #014..
processing chunk #015..
processing chunk #016..
processing chunk #017..
processing chunk #018..
processing chunk #019..
processing chunk #020..
processing chunk #021..
processing chunk #022..
processing chunk #023..
processing chunk #024..
processing chunk #025..
processing chunk #026..
processing chunk #027..
processing chunk #028..
processing chunk #029..
processing chunk #030..
processing chunk #031..
processing chunk #032..
processing chunk #033..
processing chunk #034..
processing chunk #035..
processing chunk #036..
processing chunk #037..
processing chunk #038..
processing chunk #039..
processing chunk #040..
processing chunk #041..
processing chunk

Combine the processed chunks into a single dataframe

In [37]:
# stack every cleaned chunk row-wise into one frame; row labels of the chunks are kept
df = pd.concat(objs=dfChunks_list)

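Re-apply the category dtype to specific columns (when the chunks' categories differ, `pd.concat` falls back to the `object` dtype for those columns)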

In [38]:
# columns that should be stored as pandas categoricals in the combined frame
categorical_columns = [
    'Primary Type',
    'Description',
    'Location Description',
    'District',
    'FBI Code',
]
df[categorical_columns] = df[categorical_columns].astype("category")

Get information about the data

In [45]:
# print column dtypes together with the deep memory footprint of the frame
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7078918 entries, 0 to 7147876
Data columns (total 12 columns):
 #   Column                Dtype         
---  ------                -----         
 0   g                     geometry      
 1   ID                    Int32         
 2   Date                  datetime64[ns]
 3   Primary Type          category      
 4   Description           category      
 5   Location Description  category      
 6   Arrest                Int8          
 7   Domestic              Int8          
 8   District              category      
 9   FBI Code              category      
 10  Score Crime           Int8          
 11  Score Arrest          Int8          
dtypes: Int32(1), Int8(4), category(5), datetime64[ns](1), geometry(1)
memory usage: 297.1 MB


Pickle the data

In [39]:
# destination of the cleaned dataset, stored next to the raw export
file_name = "../data/chicagoCrimes/Chicago_Crimes_cleaned.pkl"
# serialise the combined frame to that path
df.to_pickle(path=file_name)

The code needed to load the dataframe in another notebook

In [46]:
# Loading snippet meant to be copied into other notebooks (left commented out here).
# NOTE: `file_name` is the path assigned in the pickling cell of this notebook;
# a different notebook has to define that path itself.
# Only unpickle files you trust -- loading a pickle can execute arbitrary code.
# Wrapping the frame in a GeoDataFrame with geometry='g' turns the plain
# DataFrame back into a GeoDataFrame that uses column 'g' as its geometry.
# dff = pd.read_pickle(file_name)
# dff = gpd.GeoDataFrame(dff, geometry='g')
# dff.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 7078918 entries, 0 to 7147876
Data columns (total 12 columns):
 #   Column                Dtype         
---  ------                -----         
 0   g                     geometry      
 1   ID                    Int32         
 2   Date                  datetime64[ns]
 3   Primary Type          category      
 4   Description           category      
 5   Location Description  category      
 6   Arrest                Int8          
 7   Domestic              Int8          
 8   District              category      
 9   FBI Code              category      
 10  Score Crime           Int8          
 11  Score Arrest          Int8          
dtypes: Int32(1), Int8(4), category(5), datetime64[ns](1), geometry(1)
memory usage: 297.0 MB
