# Notebook for Data Setup for D3 Chicago Crime Evolution Visualization
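Run all cells in order, top to bottom. This notebook prepares the data needed for the D3 visualization of Chicago crime evolution over time: it downloads the Chicago crime records, combines them with the Airbnb listings (expected at `data/airbnb_data/listings.csv.gz`, or the uncompressed `listings.csv`), and writes `data/chicago_timeseries.json` and `data/chicago_neighborhoods.geojson` (both prefixed with `ROOT`).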

In [None]:
# run once if these packages are not yet installed in the venv
# %pip install pandas gdown requests

In [3]:
ROOT = '' # prefix prepended to the output paths built in later cells, e.g. 'src/'

In [None]:
import gdown, os

# Google Drive file id of the crime CSV and the local filename it is saved under.
# `output_file` is reused by a later cell as the crime data path.
file_id = '1Nl7eEWJFA8709eAy9EbdAXi7ZORRul0n'
output_file = 'crime.csv'
# Download only when no local copy exists yet, so re-running this cell is cheap.
if not os.path.exists(output_file):
    gdown.download(f'https://drive.google.com/uc?id={file_id}', output_file, quiet=False)

In [None]:
import pandas as pd
import json
import os

# vprint() prints only while VERBOSE is truthy.
VERBOSE = True
# Crime CSV: the file name set by the download cell.
CRIME_PATH = output_file
# NOTE(review): unlike OUTPUT_PATH, this input path is not prefixed with ROOT — confirm that is intended.
LISTINGS_PATH = 'data/airbnb_data/listings.csv.gz'
# Destination of the JSON written by generate_crime_data().
OUTPUT_PATH = f'{ROOT}data/chicago_timeseries.json'
    
def vprint(*args, **kwargs):
    """Forward the call to print() unchanged, but only while VERBOSE is truthy.

    Accepts exactly the positional and keyword arguments that print() accepts.
    """
    if not VERBOSE:
        return
    print(*args, **kwargs)

def _load_listings(listings_path):
    """Read the AirBnB listings CSV, trying gzip first and an uncompressed read second."""
    listings_path = str(listings_path)
    try:
        return pd.read_csv(listings_path, compression='gzip')
    except Exception as exc:
        # Best-effort fallback as before, but reported instead of silently swallowed.
        # A '.gz' path falls back to the same path without the suffix; any other
        # path is simply re-read as plain CSV.
        fallback_path = listings_path[:-3] if listings_path.endswith('.gz') else listings_path
        vprint(f"\t    gzip read of {listings_path} failed ({exc!r}); trying {fallback_path}")
        return pd.read_csv(fallback_path)


def generate_crime_data(crime_path=None, listings_path=None, output_path=None,
                        selected_crimes=('HOMICIDE', 'BATTERY', 'ASSAULT', 'ROBBERY', 'BURGLARY'),
                        max_crimes_per_period=3_000):
    """Build the quarterly crime / AirBnB time-series JSON.

    Loads the listings and crime CSVs, keeps only crimes whose 'Primary Type' is
    in `selected_crimes` and that have coordinates, buckets them into quarterly
    periods, down-samples each period to at most `max_crimes_per_period` points
    and writes everything as one JSON document.

    Parameters
    ----------
    crime_path : path-like, optional
        Crime CSV with 'Date', 'Primary Type', 'Latitude' and 'Longitude'
        columns. Defaults to the module-level CRIME_PATH.
    listings_path : path-like, optional
        AirBnB listings CSV (gzip-compressed or plain). Defaults to the
        module-level LISTINGS_PATH.
    output_path : path-like, optional
        Destination JSON file; its parent directory is created when needed.
        Defaults to the module-level OUTPUT_PATH.
    selected_crimes : iterable of str
        'Primary Type' values to keep; also exported as "crime_types".
    max_crimes_per_period : int
        Cap on exported points per period (limits points for browser performance).

    Returns
    -------
    None
        The result is written to `output_path`.

    Raises
    ------
    Whatever pandas raises when the crime CSV (or both listings variants) cannot
    be read, or when a 'Date' value does not match '%m/%d/%Y %I:%M:%S %p'.
    """
    # Module-level defaults are resolved at call time, as in the original.
    crime_path = CRIME_PATH if crime_path is None else crime_path
    listings_path = LISTINGS_PATH if listings_path is None else listings_path
    output_path = OUTPUT_PATH if output_path is None else output_path
    selected_crimes = list(selected_crimes)

    vprint("1 Loading data")
    # os.makedirs('') raises, so only create a directory when the path has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    vprint("\t1.1 Loading AirBnB data")
    df_listings = _load_listings(listings_path)

    vprint("\t1.2 Loading crime data")
    crime_cols = ['Date', 'Primary Type', 'Latitude', 'Longitude']
    df_crime = (
        pd.read_csv(crime_path, usecols=crime_cols)  # subset
        .rename(columns={'Latitude': 'latitude', 'Longitude': 'longitude'})
        .dropna(subset=['latitude', 'longitude'])
    )

    vprint("3 Filtering data")
    df_crime_filtered = df_crime[df_crime['Primary Type'].isin(selected_crimes)].copy()
    df_crime_filtered['Date'] = pd.to_datetime(df_crime_filtered['Date'], format='%m/%d/%Y %I:%M:%S %p')

    # quarterly periods (aggregated)
    df_crime_filtered['period'] = df_crime_filtered['Date'].dt.to_period('Q').astype(str)
    periods = sorted(df_crime_filtered['period'].unique())

    vprint("4 Structuring data")

    airbnb_data = {
        "count": len(df_listings),
        # fillna(0) keeps NaN out of the records before they are dumped as JSON.
        "locations": df_listings[['latitude', 'longitude', 'price', 'room_type', 'neighbourhood']].rename(
            columns={'latitude': 'lat', 'longitude': 'lon', 'neighbourhood': 'neighborhood'}
        ).fillna(0).to_dict('records')
    }

    output_data = {
        "periods": periods,
        "crime_types": selected_crimes,
        "airbnb": airbnb_data,
        "crimes": []
    }

    # One groupby pass instead of re-filtering the whole frame for every period.
    # Groups come out in sorted key order with rows in their original order, so
    # the fixed-seed sample picks the same rows as a boolean-mask filter would.
    for period, period_data in df_crime_filtered.groupby('period', sort=True):
        # sample if too many
        if len(period_data) > max_crimes_per_period:
            period_data = period_data.sample(n=max_crimes_per_period, random_state=999)

        output_data["crimes"].append({
            "period": period,
            "count": int(len(period_data)),
            "locations": period_data[['latitude', 'longitude', 'Primary Type']].rename(
                columns={'latitude': 'lat', 'longitude': 'lon', 'Primary Type': 'type'}
            ).to_dict('records')
        })

    vprint(f"5 Exporting {len(periods)} periods to {output_path}")
    with open(output_path, 'w') as f:
        json.dump(output_data, f)

    vprint("6 Done - LOL")


# Run the export when this cell executes; the JSON is written to OUTPUT_PATH.
generate_crime_data()

1 Loading data
	1.1 Loading AirBnB data
	1.2 Loading crime data
3 Filtering data
4 Structuring data
5 Exporting 93 periods to data/chicago_timeseries.json
6 Done - LOL


In [5]:
import requests
import json

# Source of the Chicago neighborhood boundaries (GeoJSON).
URL = "https://raw.githubusercontent.com/blackmad/neighborhoods/master/chicago.geojson"
# NOTE: this rebinds the OUTPUT_PATH that the crime-data cell set to the timeseries JSON path.
# generate_crime_data() reads OUTPUT_PATH when it is called, so calling it again after this
# cell (without re-running its own cell first) would write to the geojson path instead.
OUTPUT_PATH = f'{ROOT}data/chicago_neighborhoods.geojson'

def generate_chicago_map(url=None, output_path=None, timeout=30):
    """Download the neighborhood GeoJSON and save it to disk.

    Parameters
    ----------
    url : str, optional
        Address of the GeoJSON document. Defaults to the module-level URL.
    output_path : path-like, optional
        Destination file; its parent directory is created when needed.
        Defaults to the module-level OUTPUT_PATH.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        requests.get() can block indefinitely on an unresponsive host.

    Returns
    -------
    None
        Best-effort: any failure is reported through vprint() and not raised.
    """
    import os  # local import: this cell itself only imports requests and json

    # Module-level defaults are resolved at call time, as in the original.
    url = URL if url is None else url
    output_path = OUTPUT_PATH if output_path is None else output_path

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        geojson_data = response.json()

        # Do not rely on an earlier cell having created the output directory.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_path, 'w') as f:
            json.dump(geojson_data, f)
        vprint(f"features: {len(geojson_data['features'])} neighborhoods")

    except Exception as e:
        vprint(f"Could not download geojson: {e}")

# Fetch and save the neighborhood boundaries; failures are reported via vprint, not raised.
generate_chicago_map()

features: 98 neighborhoods
