# Analyzing Chicago Crime

In [3]:
# import gdown

In [4]:
# Google Drive file ID (presumably of the dataset archive — confirm).
# NOTE: `id` shadows the built-in id(); the name is kept so any later cell that
# reads it keeps working.
id = "1daS25XdX4s7fn7b2GEEme7iDBeN4c9E_"
# Build the share link from the ID so the two values cannot drift apart.
url = f"https://drive.google.com/file/d/{id}/view?usp=drive_link"

## Dataset Setup and Loading

In [5]:
#Importing the necessary libraries

#libraries for data wrangling
import pandas as pd
import numpy as np

#libraries for visualization
import matplotlib.pyplot as plt
# import seaborn as sns

#libraries for time
from datetime import datetime, timedelta

#ignore warnings
# NOTE: with no category given, this filter hides every warning raised from this
# point on (including pandas deprecation / dtype warnings), not just noisy ones.
import warnings
warnings.filterwarnings('ignore')

#set up views
# Show all columns of a frame and allow up to 1000 characters per printed line.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [6]:
#Loading the data

# Reads the entire zipped CSV with pandas' inferred dtypes.
# NOTE(review): the original remark here was "Couldn't load due to low memory",
# yet an output is recorded below — confirm whether this cell is still needed
# now that a typed loader (load_chicago_crime_data) is defined further down.
data_path = r"archive.zip"
df = pd.read_csv(data_path, low_memory=True, compression='zip') # compression='zip' reads the CSV straight from the archive; low_memory=True is the pandas default
df.tail()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
7784659,12847575,JF420478,09/01/2022 05:00:00 AM,005XX W SURF ST,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,True,1934,19.0,44.0,6.0,26,1172497.0,1919410.0,2022,01/03/2023 03:46:28 PM,41.934305,-87.641485,"(41.934304581, -87.641484982)"
7784660,12847801,JF420319,07/08/2022 12:00:00 AM,114XX S PRAIRIE AVE,1130,DECEPTIVE PRACTICE,FRAUD OR CONFIDENCE GAME,STREET,False,False,531,5.0,9.0,49.0,11,1179966.0,1828818.0,2022,01/03/2023 03:46:28 PM,41.685544,-87.616813,"(41.685543881, -87.616812541)"
7784661,12847324,JF420102,09/27/2022 11:00:00 AM,023XX E 70TH ST,0810,THEFT,OVER $500,RESIDENCE,False,False,331,3.0,5.0,43.0,6,1193181.0,1859005.0,2022,01/03/2023 03:46:28 PM,41.768068,-87.567453,"(41.768068052, -87.567452932)"
7784662,12847570,JF420427,09/03/2022 10:25:00 AM,052XX W CARMEN AVE,2021,NARCOTICS,POSSESS - BARBITURATES,RESIDENCE - YARD (FRONT / BACK),True,False,1623,16.0,45.0,11.0,18,1140553.0,1933418.0,2022,01/03/2023 03:46:28 PM,41.973391,-87.758535,"(41.973391184, -87.758534512)"
7784663,12840464,JF411839,09/26/2022 07:20:00 PM,0000X N MASON AVE,143A,WEAPONS VIOLATION,UNLAWFUL POSSESSION - HANDGUN,SIDEWALK,True,False,1513,15.0,29.0,25.0,15,1136773.0,1899652.0,2022,01/03/2023 03:46:28 PM,41.880802,-87.773246,"(41.880802263, -87.773245737)"


In [9]:
# Writing a function to load the full dataset or just specified samples

def load_chicago_crime_data(data_path, sample_size=None):
    """
    Load the Chicago crime CSV with explicit, memory-saving column dtypes.

    Parameters
    ----------
    data_path : str, path-like or file-like
        Forwarded unchanged to ``pd.read_csv`` (compression is inferred by pandas).
    sample_size : int, optional
        Number of rows to read from the TOP of the file. This is not a random
        sample: it is passed to ``read_csv`` as ``nrows``. ``None`` (default)
        or ``0`` loads the whole file.

    Returns
    -------
    pandas.DataFrame
        The loaded records, with ``Date`` and ``Updated On`` parsed as datetimes.
    """
    # Explicit dtypes reduce memory usage. Nullable ``Int*`` types are used so
    # integer columns can still hold missing values.
    dtype_dict = {
        "ID": "Int32",
        "Case Number": "string",
        "IUCR": "category",
        'Description': 'category',
        'Location Description': 'category',
        'Arrest': 'bool',
        'Domestic': 'bool',
        'Beat': 'Int16',
        'District': 'Int8',
        'Ward': 'Int8',
        'Community Area': 'Int8',
        'FBI Code': 'category',
        'X Coordinate': 'float32',
        'Y Coordinate': 'float32',
        'Latitude': 'float32',
        'Longitude': 'float32'
    }

    # Columns converted to datetimes while the file is being read.
    date_cols = ['Date', 'Updated On']

    # ``nrows=None`` means "read everything", so one call covers both cases.
    # Any falsy sample_size (None or 0) keeps the original "load the full file"
    # behaviour.
    row_limit = sample_size if sample_size else None

    return pd.read_csv(data_path,
                       dtype=dtype_dict,
                       parse_dates=date_cols,
                       low_memory=False, keep_default_na=True,
                       nrows=row_limit)

In [None]:
# Reload the first 100,000 rows with the memory-optimised dtypes. The outputs
# recorded below (100,000 records, nullable Int / category columns) come from
# this call, so it must run for Restart & Run All to reproduce them.
df = load_chicago_crime_data('archive.zip', sample_size=100000)

In [12]:
df.tail()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
99995,10215495,HY401397,2015-08-28 22:10:00,031XX W LAWRENCE AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,PARKING LOT/GARAGE(NON.RESID.),False,True,1713,17,33,14,08B,1154394.0,1931719.0,2015,2018-02-10 15:50:01,41.968464,-87.70768,"(41.968462913, -87.707683157)"
99996,10215496,HY401545,2015-08-29 01:36:00,044XX N SAWYER AVE,460,BATTERY,SIMPLE,STREET,False,False,1724,17,33,14,08B,1153919.0,1929328.0,2015,2018-02-10 15:50:01,41.96191,-87.709496,"(41.961911365, -87.709493728)"
99997,10215497,HY401614,2015-08-29 03:52:00,037XX W LELAND AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,False,1723,17,39,14,14,1150607.0,1930973.0,2015,2018-02-10 15:50:01,41.966492,-87.721626,"(41.96649088, -87.721627447)"
99998,10215498,HY401663,2015-08-29 05:10:00,032XX N KIMBALL AVE,880,THEFT,PURSE-SNATCHING,RESTAURANT,False,False,1732,17,35,21,06,1153154.0,1921288.0,2015,2018-02-10 15:50:01,41.939865,-87.712517,"(41.939864288, -87.712520442)"
99999,10215499,HY401688,2015-08-29 01:00:00,011XX W GARFIELD BLVD,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,712,7,16,68,07,1169805.0,1868195.0,2015,2018-02-10 15:50:01,41.793827,-87.65287,"(41.793825753, -87.652869806)"


In [13]:
#Simulating the dataset
import numpy as np  # already imported in the setup cell above; this repeat is redundant but harmless

#Setting seed for reproducibility
# Seeds NumPy's global legacy generator. The simulation function defined below
# draws from that global state, so any other np.random call made between this
# cell and the simulation changes the generated values.
np.random.seed(234) #this initializes a pseudorandom number generator


In [20]:
#Writing a function to create a simulated dataset
def simulated_crime_data(n_records = 10000, seed=None):
    """
    Build a synthetic crime DataFrame with a Chicago-like schema.

    Parameters
    ----------
    n_records : int
        Number of rows to generate.
    seed : int, optional
        When given, all random draws come from a private generator seeded with
        this value, so the result is reproducible and NumPy's global random
        state is left untouched. When ``None`` (default) the draws come from
        the global ``np.random`` state, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns: ID, Date, Primary Type, Location Description, Arrest, Domestic,
        District, Ward, Community Area, Latitude, Longitude.
    """
    # RandomState(seed) yields the same stream as np.random.seed(seed) followed
    # by global draws, so seeded and globally-seeded calls agree.
    rng = np.random if seed is None else np.random.RandomState(seed)

    #Crime types based on actual Chicago data
    crime_types = ['THEFT', 'BATTERY', 'CRIMINAL DAMAGE', 'NARCOTICS',
                   'ASSAULT', 'BURGLARY', 'MOTOR VEHICLE THEFT', 'ROBBERY',
                   'DECEPTIVE PRACTICE', 'CRIMINAL TRESPASS']
    # Sampling weight for each entry of crime_types (same order; sums to 1).
    crime_type_probs = [0.25, 0.15, 0.12, 0.10, 0.08, 0.08, 0.07, 0.05, 0.05, 0.05]

    locations = ['STREET', 'RESIDENCE', 'APARTMENT', 'SIDEWALK', 'PARKING LOT',
                'RETAIL STORE', 'SCHOOL', 'RESTAURANT', 'VEHICLE', 'OFFICE']

    # Generate sample data. The order of the random draws below is significant:
    # changing it changes the values produced for a given seed.
    data = {
        'ID': range(1, n_records + 1),
        # Evenly spaced timestamps from 2020-01-01 to 2024-12-31 inclusive.
        'Date': pd.date_range('2020-01-01', '2024-12-31', periods=n_records),
        'Primary Type': rng.choice(crime_types, n_records, p=crime_type_probs),
        'Location Description': rng.choice(locations, n_records),
        'Arrest': rng.choice([True, False], n_records, p =[0.2, 0.8]),
        'Domestic': rng.choice([True, False], n_records, p=[0.1, 0.9]),
        # randint's upper bound is exclusive: districts 1-25, wards 1-50, areas 1-77.
        'District': rng.randint(1,26, n_records),
        'Ward': rng.randint(1,51, n_records),
        'Community Area': rng.randint(1,78, n_records),
        'Latitude': rng.uniform(41.6, 42.1, n_records), #numbers are uniformly generated from the interval provided
        'Longitude': rng.uniform(-87.9, -87.5, n_records)
        }
    return pd.DataFrame(data)

In [22]:
# Generate a 50,000-row simulated frame and compare its size with the loaded frame `df`.
simulated = simulated_crime_data(50000)
print(f"This dataset has {len(simulated)} records")  # row count of the simulated data
print(f"This dataset has {len(df)} records")  # row count of the frame loaded from archive.zip
simulated.head()

This dataset has 50000 records
This dataset has 100000 records


Unnamed: 0,ID,Date,Primary Type,Location Description,Arrest,Domestic,District,Ward,Community Area,Latitude,Longitude
0,1,2020-01-01 00:00:00.000000000,MOTOR VEHICLE THEFT,RESTAURANT,False,False,19,26,30,41.8217,-87.709252
1,2,2020-01-01 00:52:35.391107822,THEFT,RETAIL STORE,False,False,2,21,36,41.791732,-87.883408
2,3,2020-01-01 01:45:10.782215644,BATTERY,SIDEWALK,True,False,20,17,74,42.076569,-87.865984
3,4,2020-01-01 02:37:46.173323466,NARCOTICS,SCHOOL,False,False,17,31,31,41.736011,-87.855017
4,5,2020-01-01 03:30:21.564431288,BATTERY,RESIDENCE,False,False,21,28,20,41.895187,-87.824708


## Preliminary Data Analysis (PDA)

In [26]:
#Using the original dataset df

print("=== DATASET OVERVIEW")
print(f"Dataset Shape: {df.shape}")
# `.2f` rounds to two decimals. The previous `2f` spec only set a minimum field
# width of 2 and therefore printed the default six decimals.
print(f"Memory Usage: {df.memory_usage(deep=True).sum()/1024**2:.2f} MB")

print(f"\n=== COLUMN INFORMATION ===")
print(f"List of columns: {df.columns}\n")




=== DATASET OVERVIEW
Dataset Shape: (100000, 22)
Memory Usage: 30.820737 MB

=== COLUMN INFORMATION ===
List of columns: Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location'], dtype='object')



In [27]:
# df.info() writes its report directly to stdout and returns None, so it must be
# called on its own line; embedding it in an f-string printed the report first
# and then the text "Column details:None".
print("Column details:")
df.info()
print("\n=== FIRST AND LAST 5 RECORDS ===")
print(df.head())
print("")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   ID                    100000 non-null  Int32         
 1   Case Number           100000 non-null  string        
 2   Date                  100000 non-null  datetime64[ns]
 3   Block                 100000 non-null  object        
 4   IUCR                  100000 non-null  category      
 5   Primary Type          100000 non-null  object        
 6   Description           100000 non-null  category      
 7   Location Description  99729 non-null   category      
 8   Arrest                100000 non-null  bool          
 9   Domestic              100000 non-null  bool          
 10  Beat                  100000 non-null  Int16         
 11  District              100000 non-null  Int8          
 12  Ward                  99996 non-null   Int8          
 13  

In [28]:
# Last five rows, printed as plain text.
print(df.tail())

print("\n=== BASIC STATISTICS ===")
# include='all' summarises every column, not only the numeric ones.
print(df.describe(include='all'))

             ID Case Number                Date                  Block  IUCR         Primary Type              Description            Location Description  Arrest  Domestic  Beat  District  Ward  Community Area FBI Code  X Coordinate  Y Coordinate  Year          Updated On   Latitude  Longitude                       Location
99995  10215495    HY401397 2015-08-28 22:10:00   031XX W LAWRENCE AVE  0486              BATTERY  DOMESTIC BATTERY SIMPLE  PARKING LOT/GARAGE(NON.RESID.)   False      True  1713        17    33              14      08B     1154394.0     1931719.0  2015 2018-02-10 15:50:01  41.968464 -87.707680  (41.968462913, -87.707683157)
99996  10215496    HY401545 2015-08-29 01:36:00     044XX N SAWYER AVE  0460              BATTERY                   SIMPLE                          STREET   False     False  1724        17    33              14      08B     1153919.0     1929328.0  2015 2018-02-10 15:50:01  41.961910 -87.709496  (41.961911365, -87.709493728)
99997  10215497    

### **Missing Data Analysis**

In [30]:
def analyze_missing_data(df):
    """Analyzed missing data patterns"""

    missing_data = pd.DataFrame({
        'Column': df.columns,
        'Missing_Count': df.isnull().sum(),
        'Missing_Percentage': (df.isnull().sum()/len(df))*100,
        'Data_Type': df.types
    })