# PA Individual Chicago Crime Prediction Project - Data Acquisition

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from shapely.geometry import Point
import os
from datetime import datetime, timedelta
import warnings
import pickle

# Set Some Parameters

In [19]:
# Project folders used throughout the notebook
DATA_DIR = "data/"
PROCESSED_DIR = "data/processed/"
MODELS_DIR = "models/"

# Create each folder up front; exist_ok makes re-running this cell harmless
for directory in (DATA_DIR, PROCESSED_DIR, MODELS_DIR):
    os.makedirs(name=directory, exist_ok=True)

# Some Useful Functions

In [21]:
## Check Data Quality
def assess_data_quality(df, date_format='%m/%d/%Y %I:%M:%S %p'):
    """Build a dictionary describing the basic quality of a crime DataFrame.

    Side effect: when a 'Date' column is present it is parsed with
    ``pd.to_datetime`` and written back into ``df``, so the caller's frame
    is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Only the columns that are present are checked.
    date_format : str, optional
        ``strftime`` pattern passed to ``pd.to_datetime`` for the 'Date'
        column. Defaults to the ``MM/DD/YYYY hh:mm:ss AM/PM`` layout.

    Returns
    -------
    dict
        Always contains:
          * 'shape'          - ``df.shape``
          * 'missing_values' - DataFrame with 'Count' and 'Percentage' of
                               nulls per column, sorted by 'Percentage'
                               descending
          * 'dtypes'         - ``df.dtypes`` as seen before the 'Date'
                               conversion
        And, depending on the columns available:
          * 'date_range'              - min_date, max_date, time_span_days
          * 'geo_bounds'              - lat/lon minimum and maximum
          * 'crime_type_distribution' - ten most frequent 'Primary Type'
          * 'arrests_percentage'      - mean of 'Arrest' times 100
          * 'domestic_percentage'     - mean of 'Domestic' times 100

    Raises
    ------
    Whatever ``pd.to_datetime`` raises when a 'Date' value does not match
    ``date_format``.
    """
    quality_report = {}

    # Null Check
    quality_report['shape'] = df.shape
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / len(df)) * 100
    quality_report['missing_values'] = pd.DataFrame({
        'Count': missing_values,
        'Percentage': missing_percentage
    }).sort_values('Percentage', ascending=False)

    # Captured before the Date conversion below, so 'Date' is reported with
    # the dtype it had on entry.
    quality_report['dtypes'] = df.dtypes

    # Time Check
    if 'Date' in df.columns:
        # Deliberately written back to df: the caller's frame ends up with a
        # parsed 'Date' column.
        df['Date'] = pd.to_datetime(df['Date'], format=date_format)
        min_date = df['Date'].min()
        max_date = df['Date'].max()
        quality_report['date_range'] = {
            'min_date': min_date,
            'max_date': max_date,
            'time_span_days': (max_date - min_date).days
        }

    # Geo Check
    if 'Latitude' in df.columns and 'Longitude' in df.columns:
        quality_report['geo_bounds'] = {
            'lat_min': df['Latitude'].min(),
            'lat_max': df['Latitude'].max(),
            'lon_min': df['Longitude'].min(),
            'lon_max': df['Longitude'].max()
        }

    # Distribution Check
    if 'Primary Type' in df.columns:
        quality_report['crime_type_distribution'] = df['Primary Type'].value_counts().head(10)

    # Each flag is reported on its own so that one missing column does not
    # hide the statistic for the other.
    if 'Arrest' in df.columns:
        quality_report['arrests_percentage'] = df['Arrest'].mean() * 100
    if 'Domestic' in df.columns:
        quality_report['domestic_percentage'] = df['Domestic'].mean() * 100

    return quality_report

def memory_optimization(df):
    """Return a copy of ``df`` whose columns use smaller dtypes where possible.

    The input frame is not modified. Three passes are applied to the copy:

    * Columns selected by ``select_dtypes(include=['int'])`` are cast to the
      narrowest of uint8/uint16/uint32 (when the minimum is non-negative) or
      int8/int16/int32 (otherwise) that can hold both the column minimum and
      maximum. Columns that fit none of these keep their dtype.
    * Columns selected by ``select_dtypes(include=['float'])`` are cast to
      float32. This is lossy: float32 carries roughly 7 significant decimal
      digits, so high-precision values are rounded.
    * Object columns whose number of distinct values is below half the row
      count are cast to 'category'.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and (possibly) smaller dtypes.
    """
    result = df.copy()

    unsigned_candidates = (np.uint8, np.uint16, np.uint32)
    signed_candidates = (np.int8, np.int16, np.int32)

    for col in result.select_dtypes(include=['int']).columns:
        c_min = result[col].min()
        c_max = result[col].max()

        # An empty column yields NaN here; every comparison below is then
        # False and the column is left untouched.
        candidates = unsigned_candidates if c_min >= 0 else signed_candidates

        for candidate in candidates:
            limits = np.iinfo(candidate)
            # Inclusive bounds: a value equal to the dtype's limit still fits.
            if limits.min <= c_min and c_max <= limits.max:
                result[col] = result[col].astype(candidate)
                break

    for col in result.select_dtypes(include=['float']).columns:
        result[col] = result[col].astype(np.float32)

    for col in result.select_dtypes(include=['object']).columns:
        # Only worthwhile when values repeat a lot
        if result[col].nunique() < 0.5 * len(result):
            result[col] = result[col].astype('category')

    return result

def save_processed_data(df, filename, info=None):
    """Write ``df`` into PROCESSED_DIR, optionally with a JSON info sidecar.

    The output format is chosen from the end of ``filename``:
    '.csv' -> ``DataFrame.to_csv``, '.parquet' -> ``DataFrame.to_parquet``
    (both without the index), '.pkl' -> pickle. Any other name is pickled
    to ``<filename>.pkl``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to persist.
    filename : str
        File name relative to PROCESSED_DIR.
    info : dict, optional
        If truthy, it is serialised with ``pd.Series(info).to_json`` to
        ``<filename without its last extension>_info.json`` in the same
        folder.
    """
    filepath = os.path.join(PROCESSED_DIR, filename)

    # Save Data
    if filename.endswith('.csv'):
        df.to_csv(filepath, index=False)
    elif filename.endswith('.parquet'):
        df.to_parquet(filepath, index=False)
    elif filename.endswith('.pkl'):
        with open(filepath, 'wb') as f:
            pickle.dump(df, f)
    else:
        # Unknown extension: fall back to pickle under an added '.pkl' suffix
        with open(filepath + '.pkl', 'wb') as f:
            pickle.dump(df, f)

    if info:
        # splitext only inspects the final path component, so a dot inside a
        # directory name can never be mistaken for the file extension.
        info_filepath = os.path.splitext(filepath)[0] + '_info.json'
        pd.Series(info).to_json(info_filepath)

# Check Data

In [35]:
# Raw 2012-2017 crime export, read with pandas' default parsing
raw_csv_path = "data/Chicago_Crimes_2012_to_2017.csv"
df = pd.read_csv(raw_csv_path)

In [36]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,3,10508693,HZ250496,05/03/2016 11:40:00 PM,013XX S SAWYER AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,...,24.0,29.0,08B,1154907.0,1893681.0,2016,05/10/2016 03:56:50 PM,41.864073,-87.706819,"(41.864073157, -87.706818608)"
1,89,10508695,HZ250409,05/03/2016 09:40:00 PM,061XX S DREXEL AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,...,20.0,42.0,08B,1183066.0,1864330.0,2016,05/10/2016 03:56:50 PM,41.782922,-87.604363,"(41.782921527, -87.60436317)"
2,197,10508697,HZ250503,05/03/2016 11:31:00 PM,053XX W CHICAGO AVE,470,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,STREET,False,...,37.0,25.0,24,1140789.0,1904819.0,2016,05/10/2016 03:56:50 PM,41.894908,-87.758372,"(41.894908283, -87.758371958)"
3,673,10508698,HZ250424,05/03/2016 10:10:00 PM,049XX W FULTON ST,460,BATTERY,SIMPLE,SIDEWALK,False,...,28.0,25.0,08B,1143223.0,1901475.0,2016,05/10/2016 03:56:50 PM,41.885687,-87.749516,"(41.885686845, -87.749515983)"
4,911,10508699,HZ250455,05/03/2016 10:00:00 PM,003XX N LOTUS AVE,820,THEFT,$500 AND UNDER,RESIDENCE,False,...,28.0,25.0,06,1139890.0,1901675.0,2016,05/10/2016 03:56:50 PM,41.886297,-87.761751,"(41.886297242, -87.761750709)"


In [37]:
# Assess data quality first: this call also parses df['Date'] in place,
# so it must run before the frame is optimised and saved.
quality_report = assess_data_quality(df)

print(f"Data shape: {quality_report['shape']}")
print(f"Date range: {quality_report['date_range']['min_date']} to {quality_report['date_range']['max_date']} ({quality_report['date_range']['time_span_days']} days)")
print(f"Most common crime types:\n{quality_report['crime_type_distribution']}")
print(f"Arrest rate: {quality_report['arrests_percentage']:.2f}%")
print(f"Domestic incidents: {quality_report['domestic_percentage']:.2f}%")

# Measure the footprint, shrink the dtypes, then measure again
bytes_per_mb = 1024 * 1024

memory_before = df.memory_usage(deep=True).sum() / bytes_per_mb
print(f"Memory usage before optimization: {memory_before:.2f} MB")

df_optimized = memory_optimization(df)

memory_after = df_optimized.memory_usage(deep=True).sum() / bytes_per_mb
print(f"Memory usage after optimization: {memory_after:.2f} MB")
print(f"Memory reduction: {(1 - memory_after/memory_before) * 100:.2f}%")

# Persist the optimised frame together with a small provenance record
processing_info = {
    'date_processed': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'data_shape': df_optimized.shape,
    'date_range': str(quality_report['date_range']),
    'memory_usage_mb': memory_after
}
save_processed_data(df_optimized, 'chicago_crimes_raw.parquet', processing_info)


Data shape: (1456714, 23)
Date range: 2012-01-01 00:00:00 to 2017-01-18 23:49:00 (1844 days)
Most common crime types:
Primary Type
THEFT                  329460
BATTERY                263700
CRIMINAL DAMAGE        155455
NARCOTICS              135240
ASSAULT                 91289
OTHER OFFENSE           87874
BURGLARY                83397
DECEPTIVE PRACTICE      75495
MOTOR VEHICLE THEFT     61138
ROBBERY                 57313
Name: count, dtype: int64
Arrest rate: 25.91%
Domestic incidents: 15.11%
Memory usage before optimization: 915.62 MB
Memory usage after optimization: 212.32 MB
Memory reduction: 76.81%
