# Dependencies

In [None]:
import pkg_resources
import sys
import subprocess

# List of required packages (use package names as recognized by pip)
required = {
    'geopandas',
    'osmnx',
    'contextily',
    'libpysal',
    'esda',
    'pointpats',
    'matplotlib',
    'seaborn',
    'scikit-learn'
}

# Get the set of installed packages
installed = {pkg.key for pkg in pkg_resources.working_set}
# Determine which packages are missing
missing = required - installed

if missing:
    print(f"Installing missing packages: {missing}")
    subprocess.check_call([sys.executable, "-m", "pip", "install", *missing])
else:
    print("All required packages are already installed.")


2.0.2
Installing missing packages: {'pointpats', 'libpysal', 'osmnx', 'contextily', 'esda', 'geopandas'}
2.0.2


# Python imports

In [19]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import osmnx as ox
from shapely.geometry import Point
from libpysal.weights import Queen
from esda import Moran, Moran_Local
from sklearn.cluster import DBSCAN, KMeans
import os

# Crime data collection

In [11]:
data_path = "./data/NYPD_Complaint_Data_Historic_20250403.csv"
url = "https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD"

# columns to load
cols = ["CMPLNT_FR_DT", "LAW_CAT_CD", "BORO_NM", "ADDR_PCT_CD", "Latitude", "Longitude"]

# Check if the file exists locally
if os.path.exists(data_path):
    print("Loading data from local file...")
    crime_df = pd.read_csv(data_path, usecols=cols)
else:
    print("Downloading data from API...")
    crime_df = pd.read_csv(url, usecols=cols)
    # Save the fetched data to a local CSV file for future use
    crime_df.to_csv(data_path, index=False)

# Convert dates to datetime. Parse errors will set value to NaT
crime_df["CMPLNT_FR_DT"] = pd.to_datetime(crime_df["CMPLNT_FR_DT"], format="%m/%d/%Y", errors='coerce')

# Filter for year 2019
crime_df = crime_df[crime_df["CMPLNT_FR_DT"].dt.year == 2019]

# Drop records with missing or invalid coordinates
crime_df = crime_df.dropna(subset=["Latitude", "Longitude"])
crime_df = crime_df[crime_df["Latitude"] != 0]

print(f"Total records in 2019: {len(crime_df)}")
crime_df.head(3)


Loading data from local file...


KeyboardInterrupt: 

# Amenities data collection

In [30]:
data_path_amenities = "./data/NYC_Amenities.csv"

if os.path.exists(data_path_amenities):
    print("Loading amenities data from local file...")
    amenities_df = pd.read_csv(data_path_amenities)
else:
    print("Querying OSM for amenities data...")
    # Define a dictionary of tags for the amenities you're interested in:
    # - "amenity": bars and restaurants
    # - "leisure": parks
    # - "railway": stations
    tags = {
        "amenity": ["bar", "restaurant"],
        "leisure": "park",
        "railway": "station"
    }
    
    # Use OSMnx to query OSM for these features in New York City.
    # You can adjust the place name to suit your area of interest.
    amenities_gdf = ox.geometries_from_place("New York City, USA", tags)
    
    # Optional: Convert the geometry column to WKT (text) format so it can be saved to CSV
    amenities_gdf['geometry'] = amenities_gdf['geometry'].apply(lambda geom: geom.wkt if geom is not None else None)
    
    # Save the queried data to a CSV file for future use
    amenities_gdf.to_csv(data_path_amenities, index=False)
    amenities_df = amenities_gdf
    amenities_df.head(3)

Querying OSM for amenities data...


AttributeError: module 'osmnx' has no attribute 'geometries_from_place'