In [2]:
from dotenv import load_dotenv
import os, pandas as pd, sqlalchemy as sa 

# Load the PG* connection settings from a local .env file into the environment.
load_dotenv()

# Build the connection URL from its components rather than by string
# interpolation: URL.create escapes special characters in the credentials
# (e.g. '@', ':' or '/' in the password), and a variable that is not set is
# passed as None (component omitted) instead of being embedded as the
# literal text "None".
engine = sa.create_engine(
    sa.engine.URL.create(
        drivername="postgresql+psycopg",
        username=os.getenv('PGUSER'),
        password=os.getenv('PGPASSWORD'),
        host=os.getenv('PGHOST'),
        # URL.create expects an integer port; environment values are strings.
        port=int(os.getenv('PGPORT')) if os.getenv('PGPORT') else None,
        database=os.getenv('PGDATABASE'),
    )
)

def q(sql, params=None):
    """Run a SQL query on the notebook-level ``engine`` and return a DataFrame.

    Parameters
    ----------
    sql : str
        The query text to execute.
    params : list, tuple or mapping, optional
        Values to bind to placeholders in ``sql``; forwarded unchanged to
        ``pandas.read_sql``. Prefer this over formatting values into the
        query string. The accepted placeholder syntax depends on the
        database driver -- confirm against the pandas/driver documentation.
        The default (``None``) runs the query exactly as written.

    Returns
    -------
    pandas.DataFrame
        The result set of the query.
    """
    return pd.read_sql(sql, engine, params=params)

In [3]:
# Load the base dataset: the five modelling columns of every staged crime
# record dated 2025-01-01 or later.
base_sql = """
SELECT
    id,
    date,
    primary_type,
    location_description,
    arrest
FROM
       public_stg.stg_chicago_crimes
WHERE date >= '2025-01-01'
"""
df = q(base_sql)

# Preview the first rows as a sanity check on the load.
df.head()

Unnamed: 0,id,date,primary_type,location_description,arrest
0,13948166,2025-08-27,CRIMINAL SEXUAL ASSAULT,HOSPITAL BUILDING / GROUNDS,False
1,13950808,2025-08-27,BATTERY,SCHOOL - PRIVATE GROUNDS,False
2,13948325,2025-08-27,BATTERY,GROCERY FOOD STORE,True
3,13947866,2025-08-27,THEFT,PARK PROPERTY,False
4,13950885,2025-08-27,SEX OFFENSE,APARTMENT,False


In [4]:
# Clean categorical features
# Missing location descriptions become the explicit category 'UNKNOWN'.
df = df.fillna({'location_description': 'UNKNOWN'})

# Keep the 15 most frequent locations as their own categories ...
top_locs = df['location_description'].value_counts().head(15).index

# ... and collapse every other location into a single 'OTHER' bucket.
df['location_grouped'] = df['location_description'].mask(
    ~df['location_description'].isin(top_locs), 'OTHER'
)

In [5]:
# Extract temporal features
# `date` is parsed to datetime first; each later entry reads the parsed column.
# NOTE(review): the preview printed after the load cell shows date-only values.
# If the staging column carries no time-of-day, `hour` will be 0 on every
# row -- confirm against the source table.
df = df.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    year=lambda frame: frame['date'].dt.year,
    month=lambda frame: frame['date'].dt.month,
    dow=lambda frame: frame['date'].dt.dayofweek,  # 0=Mon, 6=Sun
    hour=lambda frame: frame['date'].dt.hour,
)

In [6]:
# Encode target variable
# Cast only the `arrest` column to integer (True/False -> 1/0); all other
# columns keep their current dtype.
df = df.astype({'arrest': int})

In [10]:
# Save processed dataset
from pathlib import Path

# Resolve <working directory>/Data/processed and create it, including any
# missing parents; an already existing directory is not an error.
DATA_DIR = Path.cwd().joinpath("Data", "processed")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Write the feature table as CSV without the DataFrame index column.
out_path = DATA_DIR.joinpath("arrest_features.csv")
df.to_csv(out_path, index=False)
print(f"Saved to {out_path}, shape={df.shape}")

Saved to /Volumes/easystore/Projects/chicago-crime-pipeline/notebooks/eda/Data/processed/arrest_features.csv, shape=(5694, 10)
