In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import calendar

# Load data

In [None]:
!pip install gdown

import gdown

# 1. Specify the file ID and the file name to be saved in the Google Drive share link
file_id = '1UiEn8ssAfzhZCrqYAVA_cmW63nv9RrtY' # The ID extracted from the link
output_filename = 'Crimes_data.csv' # The file name saved locally after downloading

# 2. Build the download link
download_url = f'https://drive.google.com/uc?id={file_id}'

# 3. Download the file using gdown
try:
    gdown.download(download_url, output_filename, quiet=False)
    print(f"The file has been successfully downloaded and saved as {output_filename}")

    # 4. Use pandas to read the downloaded CSV file
    data = pd.read_csv(output_filename)

    # Print the first few lines of the data to confirm the successful reading
    print("\nData preview：")
    print(data.head())

except Exception as e:
    print(f"An error occurred when downloading or reading files：{e}")
    print("Please check whether the file ID is correct and whether the sharing permission of the file is set to 'Anyone with the link'。")

In [None]:
# View basic data information: info() reports column dtypes, non-null counts and memory usage
data.info()
# First five rows, rendered as the cell's output
data.head()

### Summary of the Chicago Crime Dataset

The Chicago Crime Dataset contains **1,179,152** records with **22** features, documenting crime incidents in Chicago from **2020-03-20 to 2025-03-20**. This dataset, sourced from the Chicago Police Department's CLEAR system, includes key details about crime types, locations, and timestamps.

**Key Features Overview**

**1.Incident Information**

- **ID:** Unique identifier for each crime record

- **Case Number:** Case reference number

- **Date:** Date and time of the crime (currently in string format, requiring conversion to datetime)

- **Updated On:** Last update timestamp

**2.Crime Classification**

- **IUCR:** Crime classification code

- **Primary Type:** Major crime category (e.g., theft, narcotics, assault)

- **Description:** Specific crime details

- **FBI Code:** Federal classification of the crime

**3.Location Details**

- **Block:** Approximate address where the crime occurred

- **Location Description:** Specific place (e.g., residence, sidewalk, parking lot)

- **Beat / District / Ward / Community Area:** Administrative region identifiers

- **Latitude / Longitude:** Geographical coordinates (some missing values)

**4.Case Attributes**

- **Arrest:** Whether an arrest was made (Boolean)

- **Domestic:** Whether the crime was classified as domestic violence (Boolean)

**5.Geospatial Information**

- **X Coordinate / Y Coordinate:** Projected spatial coordinates

- **Location:** Combined latitude and longitude in tuple format

**6.Temporal Attributes**

- **Year:** Year in which the crime occurred

- **Date:** Needs to be converted to datetime format for extracting hour, day of the week, and month

**Data Quality Issues**

**1.Missing Values:**

Location Description, Ward, Community Area, X Coordinate, Y Coordinate, Latitude, and Longitude contain missing values.

**2.Data Format Issues:**

Date is in string format and must be converted to datetime for temporal analysis.

**3.Potential Data Cleaning:**

Case Number may not be necessary for analysis and could be removed.

# Preprocessing

## 1. Data cleaning and normalization

1. Duplicate Handling

In [None]:
# Keep only the first occurrence of every fully identical row
data = data[~data.duplicated()]

2. Invalid Coordinate Removal (from Spatial Distribution Study)

In [None]:
# Drop rows whose X or Y coordinate is exactly 0 (treated as invalid).
# Rows with a missing coordinate pass this filter, because NaN != 0 evaluates to True.
data = data[data['X Coordinate'].ne(0) & data['Y Coordinate'].ne(0)]

3. Outlier Handling (from Spatial Analysis)

In [None]:
# Trim spatial outliers: keep rows whose X and Y both fall inside the
# 1st-99th percentile band (bounds inclusive).
x_low, x_high = data['X Coordinate'].quantile([0.01, 0.99]).tolist()
y_low, y_high = data['Y Coordinate'].quantile([0.01, 0.99]).tolist()

# Comparisons against NaN are False, so rows with a missing coordinate are dropped here too.
data = data[
    data['X Coordinate'].ge(x_low) & data['X Coordinate'].le(x_high)
    & data['Y Coordinate'].ge(y_low) & data['Y Coordinate'].le(y_high)
]

4. Coordinate Normalization (from Spatial Distribution)

In [None]:
# Normalize coordinates to 0-1 range for modeling
def _min_max_scale(values):
    """Linearly rescale a numeric Series so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    values : pd.Series
        Numeric values to rescale.

    Returns
    -------
    pd.Series
        Rescaled values on the same index. A constant column maps to 0.0
        instead of producing 0/0 = NaN.
    """
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        return values - lowest
    return (values - lowest) / span


# `data` is the result of boolean filtering in the earlier cells; assigning columns on it
# in place can raise SettingWithCopyWarning on pandas versions without copy-on-write.
# assign() builds a fresh frame instead.
data = data.assign(
    x_norm=_min_max_scale(data['X Coordinate']),
    y_norm=_min_max_scale(data['Y Coordinate']),
)

5. Boolean Field Standardization

In [None]:
# Give both checkbox-style flag columns a real bool dtype
for flag_col in ('Arrest', 'Domestic'):
    data[flag_col] = data[flag_col].astype(bool)

6. Handle high-cardinality categorical variables --> group the minority values into 'Other'

In [None]:
# Candidate categorical columns: free-text fields plus numeric codes that act as labels
cat_cols = [
    'Block',
    'IUCR',
    'Primary Type',
    'Description',
    'Location Description',
    'District',
    'Ward',
    'Community Area',
    'FBI Code',
]

# A column counts as high-cardinality when it has more than 50 distinct non-null values
high_card_cols = list(filter(lambda col: data[col].nunique() > 50, cat_cols))
high_card_cols


In [None]:
def consolidate_rare_categories_fast(column, threshold=0.01, new_label="Other"):
    """Collapse infrequent values of a Series into a single label.

    Parameters
    ----------
    column : pd.Series
        Values to consolidate. Missing values are ignored when computing
        shares and are left untouched in the result.
    threshold : float, default 0.01
        Values whose share of the non-null entries is strictly below this
        fraction are replaced.
    new_label : default "Other"
        Replacement written in place of every rare value.

    Returns
    -------
    pd.Series
        A new Series on the same index; the input is not modified.
    """
    # Share of each distinct value among the non-null entries
    shares = column.value_counts(normalize=True)
    rare_values = shares[shares < threshold].index
    keep = ~column.isin(rare_values)
    return column.where(keep, new_label)

In [None]:
# Replace each high-cardinality column with its consolidated version (rare values -> "Other")
data = data.assign(
    **{col: consolidate_rare_categories_fast(data[col]) for col in high_card_cols}
)

In [None]:
# Report the number of distinct non-null values per categorical column
for col, unique_count in data[cat_cols].nunique().items():
    print(f"{col}: {unique_count}'s unique values")

7. Column Name Normalization

In [None]:
# Snake-case every column label: lowercase it and turn spaces into underscores
data = data.rename(columns=lambda name: name.lower().replace(' ', '_'))

## 2. Missing value handling

In [None]:
# Percentage of missing entries per column, largest first; only columns with gaps are printed
missing_ratio = data.isna().mean().sort_values(ascending=False).mul(100)
print("Proportion of missing values in each column：\n", missing_ratio.loc[missing_ratio.gt(0)])

In [None]:
def _fill_with_group_mode(values, fallback):
    """Fill the missing entries of one group with that group's most frequent value.

    Parameters
    ----------
    values : pd.Series
        Target values of a single group, as handed over by ``GroupBy.transform``.
    fallback : scalar
        Value used when the group contains no non-null entry at all.

    Returns
    -------
    pd.Series
        ``values`` with every NaN replaced.
    """
    modes = values.mode()  # computed once per group; mode() ignores NaN and can be empty
    fill_value = modes.iloc[0] if not modes.empty else fallback
    return values.fillna(fill_value)


# location_description (0.41% missing): Marked as unknown category
data['location_description'] = data['location_description'].fillna('Unknown')

# community_area (0.000089% missing, only 1 item): Fill with the mode of the same block.
# This runs BEFORE the ward fill on purpose: groupby drops rows whose key is NaN, so
# grouping ward by a still-incomplete community_area would turn the ward of those rows
# into NaN instead of filling it.
# NOTE(review): assumes `block` itself has no missing values - confirm with the missing-value report.
data['community_area'] = data.groupby('block')['community_area'].transform(
    lambda group: _fill_with_group_mode(group, fallback='Unknown')
)

# ward (0.000358% missing, only 4 items): Fill with the mode of the same community
data['ward'] = data.groupby('community_area')['ward'].transform(
    lambda group: _fill_with_group_mode(group, fallback=0)  # 0 means unknown
)

In [None]:
# Verify: count the remaining missing values per column
data.isna().sum()

In [None]:
# Second pass over "ward": fill any value that is still missing with the most
# common ward of the same community_area
def _fill_ward_gaps(ward_group):
    """Replace NaN wards of one community with its most common ward ("0" indicates unknown)."""
    ward_modes = ward_group.mode()
    if ward_modes.empty:
        return ward_group.fillna(0)
    return ward_group.fillna(ward_modes[0])


data['ward'] = data.groupby('community_area')['ward'].transform(_fill_ward_gaps)

In [None]:
# Verify again: remaining missing values per column after the second ward fill
data.isna().sum()

## 3. Temporal/Spatial Aggregation

### 3.1 Temporal Aggregation

(1) Crime trends by year and month

In [None]:
# Parse the timestamp strings; values that cannot be parsed become NaT (errors='coerce')
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Monthly period (year + month) used as the time bucket
data['year_month'] = data['date'].dt.to_period('M')

# Incident counts per month and crime type: rows = months, columns = primary types.
# Rows whose date could not be parsed have no year_month and are left out by groupby.
monthly_crimes = data.groupby(['year_month', 'primary_type']).size().unstack(fill_value=0)

# Visualize popular crime types on an explicit, labeled Axes so the figure stands alone
fig, ax = plt.subplots(figsize=(12, 6))
monthly_crimes[['THEFT', 'BATTERY', 'ASSAULT']].plot(ax=ax)
ax.set(
    title='Monthly incidents by crime type',
    xlabel='Month',
    ylabel='Number of incidents',
);

The line chart shows the trend of different crime types over time.

(2) Analyze patterns by week and hour

In [None]:
# Extract time features: weekday name and hour of day
data['day_of_week'] = data['date'].dt.day_name()
data['hour'] = data['date'].dt.hour

# Incident counts by weekday (rows) and hour (columns)
hourly_pattern = data.pivot_table(
    index='day_of_week',
    columns='hour',
    values='id',
    aggfunc='count'
)

# pivot_table sorts the index alphabetically (Friday, Monday, Saturday, ...);
# restore calendar order so neighbouring heatmap rows are neighbouring days.
# Spelled out in English to match the names dt.day_name() produces by default.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
hourly_pattern = hourly_pattern.reindex(weekday_order)

# Heat map by weekday and hour
fig, ax = plt.subplots(figsize=(12, 5))
sns.heatmap(hourly_pattern, cmap='YlOrRd', ax=ax)
ax.set(
    title='Incidents by day of week and hour',
    xlabel='Hour of day',
    ylabel='Day of week',
);

Business Value: Identifying peak crime times (e.g., Friday nights)

### 3.2 Spatial Aggregation

(1) Statistics by administrative district

In [None]:
# Crime counts per police district (rows) and primary type (columns)
district_crimes = (
    data.groupby(['district', 'primary_type'])
    .size()
    .unstack(fill_value=0)
)

# Label each district with the crime type that has the highest count there
district_crimes = district_crimes.assign(top_crime=district_crimes.idxmax(axis=1))

(2) Geographic mesh aggregation (500m x 500m)

In [None]:
# Convert coordinates to grid cell numbers.
# NOTE(review): the dataset's X/Y coordinates are State Plane Illinois East (NAD 1983)
# values, which are expressed in feet - confirm against the data portal's column notes.
# Dividing by 500 directly would therefore give ~152 m cells, not the intended 500 m.
GRID_CELL_METERS = 500
METERS_PER_FOOT = 0.3048
grid_cell_size = GRID_CELL_METERS / METERS_PER_FOOT  # cell edge in coordinate units (feet)

# Cell index = whole number of cells between the point and the minimum coordinate
data['grid_x'] = ((data['x_coordinate'] - data['x_coordinate'].min()) // grid_cell_size).astype(int)
data['grid_y'] = ((data['y_coordinate'] - data['y_coordinate'].min()) // grid_cell_size).astype(int)

# Count the number of crimes in each grid cell
grid_counts = data.groupby(['grid_x', 'grid_y']).size().reset_index(name='crime_count')

(3) Community level hot spot analysis

In [None]:
# List the column labels currently present in the frame (sanity check before aggregating)
print(data.columns)

In [None]:
# Aggregated by community + crime type
community_crimes = data.groupby([ 'community_area', 'primary_type']).agg({
    'id': 'count',
    'arrest': 'mean'  # Arrest rate
}).unstack()

In [None]:
# Preview the processed frame; a bare last expression renders as a rich table,
# whereas print() would fall back to wrapped plain text
data.head()

In [None]:
# Final check of column dtypes and non-null counts before saving
data.info()

## Save the preprocessed data to the local machine.

In [None]:
# Write the preprocessed frame to the working directory, without the index column
data.to_csv(path_or_buf='preprocessed_data.csv', index=False)
print("The data has been successfully saved locally：preprocessed_data.csv")
