# Dependencies

In [None]:
import pkg_resources
import sys
import subprocess

# List of required packages (use package names as recognized by pip)
required = {
    'geopandas',
    'osmnx',
    'contextily',
    'libpysal',
    'esda',
    'pointpats',
    'matplotlib',
    'seaborn',
    'scikit-learn',
    'geodatasets',
    'folium'
}

# Get the set of installed packages
installed = {pkg.key for pkg in pkg_resources.working_set}
# Determine which packages are missing
missing = required - installed

if missing:
    print(f"Installing missing packages: {missing}")
    subprocess.check_call([sys.executable, "-m", "pip", "install", *missing])
else:
    print("All required packages are already installed.")

# Python imports

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import osmnx as ox
from shapely.geometry import Point, Polygon, shape
from shapely.wkt import loads
from libpysal.weights import Queen
from esda import Moran, Moran_Local
from sklearn.cluster import DBSCAN, KMeans
import os
import warnings
from scipy.stats import gaussian_kde
from sklearn.preprocessing import StandardScaler
import contextily as ctx
import libpysal
import esda

# Data collection

In [None]:
# use this to get data from cityofnewyork.us
def load_data(url, filename, usecols=None):
    """
    Downloads a CSV file from a given URL and loads it into a DataFrame.
    """
    if not os.path.exists(filename):
        print(f"Downloading {filename}...")
        df = pd.read_csv(url, usecols=usecols)
        df.to_csv(filename, index=False)
    else:
        print(f"Loading {filename} from local file...")
        df = pd.read_csv(filename, usecols=usecols)
    return df

## Crime data

In [None]:
crime_data_path = "./data/NYPD_Complaint_Data_Historic.csv"
crime_data_url = "https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD"
crime_cols = ["CMPLNT_FR_DT", "LAW_CAT_CD", "BORO_NM", "ADDR_PCT_CD", "Latitude", "Longitude"]

crime_df = load_data(crime_data_url, crime_data_path, usecols=crime_cols)

# Convert dates to datetime. Parse errors will set value to NaT
crime_df["CMPLNT_FR_DT"] = pd.to_datetime(crime_df["CMPLNT_FR_DT"], format="%m/%d/%Y", errors='coerce')

# Filter for year 2019
crime_df = crime_df[crime_df["CMPLNT_FR_DT"].dt.year == 2019]

# Drop records with missing or invalid coordinates
crime_df = crime_df.dropna(subset=["Latitude", "Longitude"])
crime_df = crime_df[crime_df["Latitude"] != 0]

# convert to geodataframe
crime_gdf = gpd.GeoDataFrame(
    crime_df,
    geometry=gpd.points_from_xy(crime_df["Longitude"], crime_df["Latitude"]),
    crs="EPSG:4326"
).to_crs(epsg=3857)

print(f"Total records in 2019: {len(crime_df)}")
crime_gdf.head(3)

## Population by Neighbourhood tabulation areas (NTA) data

In [65]:
import geopandas as gpd
import pandas as pd
import re
from shapely.wkt import loads
from difflib import SequenceMatcher
from IPython.display import Markdown, display

# ——————————————————————————————————————————————————————————————
# 0) Assumes `load_data(url, path)` is defined elsewhere
# ——————————————————————————————————————————————————————————————

# 1) Load NTA polygons
nta_polys_data_path = "./data/NYC_NTA_Polygons.csv"
nta_polys_data_url  = "https://data.cityofnewyork.us/api/views/9nt8-h7nd/rows.csv?accessType=DOWNLOAD"
nta_polys_df = load_data(nta_polys_data_url, nta_polys_data_path)

# 2) Convert to GeoDataFrame (EPSG:4326 → EPSG:3857)
nta_polys_gdf = gpd.GeoDataFrame(
    nta_polys_df,
    geometry=nta_polys_df["the_geom"].apply(loads),
    crs="EPSG:4326"
).to_crs(epsg=3857)

# 3) Load NTA population table
nta_pop_data_path = "./data/NYC_NTA.csv"
nta_pop_data_url  = "https://data.cityofnewyork.us/api/views/swpk-hqdp/rows.csv?accessType=DOWNLOAD"
nta_pop_df = load_data(nta_pop_data_url, nta_pop_data_path)

# 4) Normalize population DataFrame columns, keep its Borough
nta_pop_df = nta_pop_df.rename(columns={
    "NTA Code":   "NTA2020",
    "Population": "population",
    "NTA Name":   "NTAName_pop",
    "Borough":    "Borough_pop",
})

# 5) Merge population *and* Borough_pop onto the polys GDF
nta_polys_gdf = nta_polys_gdf.merge(
    nta_pop_df[["NTA2020", "population", "Borough_pop"]],
    on="NTA2020",
    how="left"
)

# 6) Rename the polygons’ own borough column for clarity
nta_polys_gdf = nta_polys_gdf.rename(columns={
    "BoroName": "BoroName_poly"
})

# 7) Remove any NTA whose name ends with "Park" or "park"
nta_polys_gdf = nta_polys_gdf[
    ~nta_polys_gdf["NTAName"].str.lower().str.endswith("park")
].copy()

# 8) Pre‐compute lookup maps
poly_boro_map = nta_polys_gdf.set_index("NTAName")["BoroName_poly"].to_dict()
pop_lookup    = nta_pop_df.set_index("NTAName_pop")[["population","Borough_pop"]].to_dict()

# 9) Exact‐name fallback for population *only if boroughs match*
nta_pop_df["NTAName_lc"]    = nta_pop_df["NTAName_pop"].str.lower()
nta_polys_gdf["NTAName_lc"] = nta_polys_gdf["NTAName"].str.lower()
mask_missing = nta_polys_gdf["population"].isna()

for idx in nta_polys_gdf[mask_missing].index:
    raw      = nta_polys_gdf.at[idx, "NTAName"]
    key      = nta_polys_gdf.at[idx, "NTAName_lc"]
    raw_boro = poly_boro_map.get(raw)
    if key in pop_lookup and pop_lookup[key]["Borough_pop"] == raw_boro:
        nta_polys_gdf.at[idx, "population"]  = pop_lookup[key]["population"]
        nta_polys_gdf.at[idx, "Borough_pop"] = pop_lookup[key]["Borough_pop"]

nta_polys_gdf = nta_polys_gdf.drop(columns="NTAName_lc")

# 10) Record initial unmatched set
initial_unmatched = nta_polys_gdf[nta_polys_gdf["population"].isna()].copy()
initial_count     = len(initial_unmatched)

# 11) Helpers for fuzzy matching
def normalize_string(s: str) -> str:
    cleaned   = re.sub(r'[^A-Za-z0-9 ]+', ' ', s)
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip().lower()

def common_prefix_len(a: str, b: str) -> int:
    for i in range(min(len(a), len(b))):
        if a[i] != b[i]:
            return i
    return min(len(a), len(b))

def substring_match_len(a: str, b: str) -> int:
    matcher = SequenceMatcher(None, a, b)
    match   = matcher.find_longest_match(0, len(a), 0, len(b))
    return match.size

pop_names    = nta_pop_df["NTAName_pop"].tolist()
name_to_pop  = nta_pop_df.set_index("NTAName_pop")["population"].to_dict()
name_to_boro = nta_pop_df.set_index("NTAName_pop")["Borough_pop"].to_dict()

# 12) Compute best matches & confidences *within same borough only*
records = []
for raw in initial_unmatched["NTAName"]:
    raw_boro = poly_boro_map.get(raw)
    eligible = [cand for cand in pop_names if name_to_boro.get(cand) == raw_boro]

    best_pref, best_sub, best_glob = None, None, None
    best_rev_pref, best_rev_sub       = None, None
    score_pref, score_sub, score_glob = -1.0, -1.0, -1.0
    score_rev_pref, score_rev_sub     = -1.0, -1.0

    norm_raw = normalize_string(raw)

    if eligible:
        for cand in eligible:
            norm_cand = normalize_string(cand)

            # prefix (A prefix of B)
            p = common_prefix_len(norm_raw, norm_cand) / (len(norm_cand) or 1)
            if p > score_pref:
                score_pref, best_pref = p, cand

            # reverse prefix (B prefix of A)
            rev_p = common_prefix_len(norm_raw, norm_cand) / (len(norm_raw) or 1)
            if rev_p > score_rev_pref:
                score_rev_pref, best_rev_pref = rev_p, cand

            # substring (A in B)
            match_len = substring_match_len(norm_raw, norm_cand)
            s = match_len / (len(norm_cand) or 1)
            if s > score_sub:
                score_sub, best_sub = s, cand

            # reverse substring (B contains A)
            rev_s = match_len / (len(norm_raw) or 1)
            if rev_s > score_rev_sub:
                score_rev_sub, best_rev_sub = rev_s, cand

            # overall fuzzy ratio
            g = SequenceMatcher(None, norm_raw, norm_cand).ratio()
            if g > score_glob:
                score_glob, best_glob = g, cand

    records.append({
        "Unmatched NTA Name":             raw,
        "Poly Borough":                   raw_boro,

        "Prefix Match":                   best_pref,
        "Prefix Confidence":              score_pref,
        "Prefix Population":              name_to_pop.get(best_pref),
        "Prefix Borough":                 name_to_boro.get(best_pref),

        "Reverse Prefix Match":           best_rev_pref,
        "Reverse Prefix Confidence":      score_rev_pref,
        "Reverse Prefix Population":      name_to_pop.get(best_rev_pref),
        "Reverse Prefix Borough":         name_to_boro.get(best_rev_pref),

        "Substring Match":                best_sub,
        "Substring Confidence":           score_sub,
        "Substring Population":           name_to_pop.get(best_sub),
        "Substring Borough":              name_to_boro.get(best_sub),

        "Reverse Substring Match":        best_rev_sub,
        "Reverse Substring Confidence":   score_rev_sub,
        "Reverse Substring Population":   name_to_pop.get(best_rev_sub),
        "Reverse Substring Borough":      name_to_boro.get(best_rev_sub),

        "Overall Match":                  best_glob,
        "Overall Confidence":             score_glob,
        "Overall Population":             name_to_pop.get(best_glob),
        "Overall Borough":                name_to_boro.get(best_glob),
    })

matches_df = pd.DataFrame(records)

# 13) Prefill any where all five metrics agree
filled_by_all_metrics = set()
for _, r in matches_df.iterrows():
    nm         = r["Unmatched NTA Name"]
    idxs       = nta_polys_gdf[nta_polys_gdf["NTAName"] == nm].index
    candidates = [
        r["Prefix Match"],
        r["Reverse Prefix Match"],
        r["Substring Match"],
        r["Reverse Substring Match"],
        r["Overall Match"]
    ]
    if candidates[0] and all(c == candidates[0] for c in candidates[1:]):
        nta_polys_gdf.loc[idxs, ["population","Borough_pop"]] = [
            r["Overall Population"], r["Overall Borough"]
        ]
        filled_by_all_metrics.add(nm)

# 14) Prefill any 100% matches (make substring independent)
filled_by_prefix         = set()
filled_by_reverse_prefix = set()
filled_by_substring      = set()
filled_by_overall        = set()

for _, r in matches_df.iterrows():
    nm   = r["Unmatched NTA Name"]
    idxs = nta_polys_gdf[nta_polys_gdf["NTAName"] == nm].index

    # prefix
    if r["Prefix Confidence"] == 1.0:
        nta_polys_gdf.loc[idxs, ["population","Borough_pop"]] = [
            r["Prefix Population"], r["Prefix Borough"]
        ]
        filled_by_prefix.add(nm)

    # reverse prefix
    if r["Reverse Prefix Confidence"] == 1.0:
        nta_polys_gdf.loc[idxs, ["population","Borough_pop"]] = [
            r["Reverse Prefix Population"], r["Reverse Prefix Borough"]
        ]
        filled_by_reverse_prefix.add(nm)

    # substring (now independent!)
    if r["Substring Confidence"] == 1.0:
        nta_polys_gdf.loc[idxs, ["population","Borough_pop"]] = [
            r["Substring Population"], r["Substring Borough"]
        ]
        filled_by_substring.add(nm)

    # overall
    if r["Overall Confidence"] > 0.9:
        nta_polys_gdf.loc[idxs, ["population","Borough_pop"]] = [
            r["Overall Population"], r["Overall Borough"]
        ]
        filled_by_overall.add(nm)

# 15) Prefill by global≥0.80 & (prefix or substring)≥0.45 same candidate
filled_by_fallback = set()
for _, r in matches_df.iterrows():
    nm   = r["Unmatched NTA Name"]
    idxs = nta_polys_gdf[nta_polys_gdf["NTAName"] == nm].index
    if nta_polys_gdf.loc[idxs, "population"].isna().all():
        cond = (
            r["Overall Confidence"] >= 0.80
            and (
                (r["Overall Match"] == r["Prefix Match"]    and r["Prefix Confidence"]    >= 0.45)
             or (r["Overall Match"] == r["Substring Match"] and r["Substring Confidence"] >= 0.45)
            )
        )
        if cond:
            nta_polys_gdf.loc[idxs, ["population","Borough_pop"]] = [
                r["Overall Population"], r["Overall Borough"]
            ]
            filled_by_fallback.add(nm)

# 16) Final still‐missing & display
still_unmatched = nta_polys_gdf[nta_polys_gdf["population"].isna()]
final_count     = len(still_unmatched)

if final_count:
    disp = matches_df.loc[
        matches_df["Unmatched NTA Name"].isin(still_unmatched["NTAName"])
    ].copy()
    disp = disp.sort_values(by="Substring Confidence", ascending=False)
    for c in ["Unmatched NTA Name","Prefix Match","Substring Match","Overall Match"]:
        disp[c] = disp[c].str.replace(r'[^A-Za-z0-9 ]+', ' ', regex=True).str.strip()
    display(Markdown(
        "## NTAs Still Missing Population (no same‐borough candidate)\n\n"
        + disp[[
            "Unmatched NTA Name","Poly Borough",
            "Prefix Match","Prefix Confidence",
            "Substring Match","Substring Confidence",
            "Overall Match","Overall Confidence"
        ]].to_markdown(index=False)
    ))

# 17) Print summary
print(f"Total NTAs processed:               {len(nta_polys_gdf)}")
print(f"NTAs initially missing population:   {initial_count}")
print(f"  • Filled by all-metrics agree:     {len(filled_by_all_metrics)}")
print(f"  • Filled by 100% prefix match:     {len(filled_by_prefix)}")
print(f"  • Filled by 100% reverse prefix:   {len(filled_by_reverse_prefix)}")
print(f"  • Filled by 100% substring match:  {len(filled_by_substring)}")
print(f"  • Filled by 90% overall match:     {len(filled_by_overall)}")
print(f"  • Filled by fallback (80%+ & 45%+): {len(filled_by_fallback)}")
print(f"NTAs still missing population:       {final_count}")
if final_count == 0:
    print("✅ All NTAs now have a population value.")


Loading ./data/NYC_NTA_Polygons.csv from local file...


Loading ./data/NYC_NTA.csv from local file...


## NTAs Still Missing Population (no same‐borough candidate)

| Unmatched NTA Name                                       | Poly Borough   | Prefix Match                                        |   Prefix Confidence | Substring Match                          |   Substring Confidence | Overall Match                                       |   Overall Confidence |
|:---------------------------------------------------------|:---------------|:----------------------------------------------------|--------------------:|:-----------------------------------------|-----------------------:|:----------------------------------------------------|---------------------:|
| Rockaway Beach Arverne Edgemere                          | Queens         | Rosedale                                            |           0.25      | Hammels Arverne Edgemere                 |               0.708333 | Hammels Arverne Edgemere                            |             0.690909 |
| Oakwood Richmondtown                                     | Staten Island  | Oakwood Oakwood Beach                               |           0.380952  | Port Richmond                            |               0.692308 | Port Richmond                                       |             0.606061 |
| Greenwich Village                                        | Manhattan      | Gramercy                                            |           0.25      | East Village                             |               0.666667 | East Village                                        |             0.62069  |
| Highland Park Cypress Hills Cemeteries  South            | Brooklyn       | Homecrest                                           |           0.111111  | Cypress Hills City Line                  |               0.652174 | Cypress Hills City Line                             |             0.477612 |
| Bushwick  West                                           | Brooklyn       | Bushwick North                                      |           0.642857  | Bushwick North                           |               0.642857 | Bushwick South                                      |             0.814815 |
| Bushwick  East                                           | Brooklyn       | Bushwick North                                      |           0.642857  | Bushwick North                           |               0.642857 | Bushwick South                                      |             0.814815 |
| Westchester Square                                       | Bronx          | Westchester Unionport                               |           0.571429  | Parkchester                              |               0.636364 | Westchester Unionport                               |             0.717949 |
| Tompkinsville Stapleton Clifton Fox Hills                | Staten Island  | Todt Hill Emersn Hill Heartland Villg Lighthse Hill |           0.0392157 | Grymes Hill Clifton Fox Hills            |               0.62069  | Grymes Hill Clifton Fox Hills                       |             0.657143 |
| Midtown South Flatiron Union Square                      | Manhattan      | Midtown Midtown South                               |           0.380952  | Midtown Midtown South                    |               0.619048 | Hudson Yards Chelsea Flat Iron Union Square         |             0.717949 |
| Fordham Heights                                          | Bronx          | Fordham South                                       |           0.615385  | Fordham South                            |               0.615385 | Fordham South                                       |             0.642857 |
| Todt Hill Emerson Hill Lighthouse Hill Manor Heights     | Staten Island  | Todt Hill Emersn Hill Heartland Villg Lighthse Hill |           0.294118  | Arden Heights                            |               0.615385 | Todt Hill Emersn Hill Heartland Villg Lighthse Hill |             0.679612 |
| Pelham Gardens                                           | Bronx          | Pelham Parkway                                      |           0.5       | Allerton Pelham Gardens                  |               0.608696 | Allerton Pelham Gardens                             |             0.756757 |
| Mount Eden Claremont  West                               | Bronx          | Mount Hope                                          |           0.6       | Mount Hope                               |               0.6      | Claremont Bathgate                                  |             0.511628 |
| Marine Park Mill Basin Bergen Beach                      | Brooklyn       | Madison                                             |           0.285714  | Bath Beach                               |               0.6      | Georgetown Marine Park Bergen Beach Mill Basin      |             0.592593 |
| Sheepshead Bay Manhattan Beach Gerritsen Beach           | Brooklyn       | Sheepshead Bay Gerritsen Beach Manhattn Bch         |           0.348837  | Bath Beach                               |               0.6      | Sheepshead Bay Gerritsen Beach Manhattn Bch         |             0.674157 |
| Coney Island Sea Gate                                    | Brooklyn       | Canarsie                                            |           0.125     | Seagate Coney Island                     |               0.6      | Seagate Coney Island                                |             0.585366 |
| Tremont                                                  | Bronx          | Claremont Bathgate                                  |           0         | East Tremont                             |               0.583333 | East Tremont                                        |             0.736842 |
| Woodlawn Cemetery                                        | Bronx          | Woodlawn Wakefield                                  |           0.5       | Norwood                                  |               0.571429 | Woodlawn Wakefield                                  |             0.628571 |
| Claremont Village Claremont  East                        | Bronx          | Claremont Bathgate                                  |           0.555556  | Belmont                                  |               0.571429 | Claremont Bathgate                                  |             0.52     |
| Green Wood Cemetery                                      | Brooklyn       | Greenpoint                                          |           0.5       | Midwood                                  |               0.571429 | Greenpoint                                          |             0.482759 |
| SoHo Little Italy Hudson Square                          | Manhattan      | SoHo TriBeCa Civic Center Little Italy              |           0.131579  | Lincoln Square                           |               0.571429 | SoHo TriBeCa Civic Center Little Italy              |             0.492754 |
| Wakefield Woodlawn                                       | Bronx          | West Concourse                                      |           0.0714286 | Norwood                                  |               0.571429 | Woodlawn Wakefield                                  |             0.5      |
| Soundview Clason Point                                   | Bronx          | Soundview Bruckner                                  |           0.555556  | Soundview Bruckner                       |               0.555556 | Soundview Castle Hill Clason Point Harding Park     |             0.637681 |
| Jacob Riis Park Fort Tilden Breezy Point Tip             | Queens         | Jamaica                                             |           0.285714  | Rego Park                                |               0.555556 | College Point                                       |             0.385965 |
| Highland Park Cypress Hills Cemeteries  North            | Queens         | Hollis                                              |           0.166667  | Rego Park                                |               0.555556 | Springfield Gardens North                           |             0.434783 |
| Ferry Point Park St  Raymond Cemetery                    | Bronx          | Fordham South                                       |           0.0769231 | Hunts Point                              |               0.545455 | Van Nest Morris Park Westchester Square             |             0.453333 |
| Riverdale Spuyten Duyvil                                 | Bronx          | Rikers Island                                       |           0.153846  | Spuyten Duyvil Kingsbridge               |               0.538462 | Spuyten Duyvil Kingsbridge                          |             0.56     |
| Hart Island                                              | Bronx          | Highbridge                                          |           0.1       | Rikers Island                            |               0.538462 | Rikers Island                                       |             0.666667 |
| North   South Brother Islands                            | Bronx          | Norwood                                             |           0.428571  | Rikers Island                            |               0.538462 | Rikers Island                                       |             0.5      |
| Tribeca Civic Center                                     | Manhattan      | Turtle Bay East Midtown                             |           0.0434783 | SoHo TriBeCa Civic Center Little Italy   |               0.526316 | SoHo TriBeCa Civic Center Little Italy              |             0.689655 |
| East Midtown Turtle Bay                                  | Manhattan      | East Village                                        |           0.416667  | Turtle Bay East Midtown                  |               0.521739 | Turtle Bay East Midtown                             |             0.521739 |
| Howard Beach Lindenwood                                  | Queens         | Hollis                                              |           0.333333  | Lindenwood Howard Beach                  |               0.521739 | Lindenwood Howard Beach                             |             0.521739 |
| Financial District Battery Park City                     | Manhattan      | Marble Hill Inwood                                  |           0         | Battery Park City Lower Manhattan        |               0.515152 | Battery Park City Lower Manhattan                   |             0.492754 |
| Sunnyside Yards  South                                   | Queens         | Steinway                                            |           0.125     | Woodside                                 |               0.5      | Springfield Gardens North                           |             0.565217 |
| Sunnyside                                                | Queens         | Steinway                                            |           0.125     | Woodside                                 |               0.5      | Woodside                                            |             0.470588 |
| St  George New Brighton                                  | Staten Island  | Stapleton Rosebank                                  |           0.111111  | New Brighton Silver Lake                 |               0.5      | New Brighton Silver Lake                            |             0.521739 |
| Sunnyside Yards  North                                   | Queens         | Steinway                                            |           0.125     | Woodside                                 |               0.5      | Springfield Gardens North                           |             0.652174 |
| Mount Hebron   Cedar Grove Cemeteries                    | Queens         | Maspeth                                             |           0.142857  | Corona                                   |               0.5      | park cemetery etc Queens                            |             0.40678  |
| Midtown Times Square                                     | Manhattan      | Midtown Midtown South                               |           0.380952  | Lincoln Square                           |               0.5      | Lincoln Square                                      |             0.588235 |
| Downtown Brooklyn DUMBO Boerum Hill                      | Brooklyn       | Dyker Heights                                       |           0.0769231 | Ocean Hill                               |               0.5      | DUMBO Vinegar Hill Downtown Brklyn Boerum Hill      |             0.666667 |
| The Evergreens Cemetery                                  | Brooklyn       | Brooklyn Heights Cobble Hill                        |           0         | Greenpoint                               |               0.5      | Fort Greene                                         |             0.470588 |
| Carroll Gardens Cobble Hill Gowanus Red Hook             | Brooklyn       | Carroll Gardens Columbia Street Red Hook            |           0.45      | Ocean Hill                               |               0.5      | Carroll Gardens Columbia Street Red Hook            |             0.690476 |
| Castle Hill Unionport                                    | Bronx          | Co Op City                                          |           0.1       | Westchester Unionport                    |               0.47619  | Westchester Unionport                               |             0.619048 |
| Long Island City Hunters Point                           | Queens         | Laurelton                                           |           0.111111  | College Point                            |               0.461538 | Queensbridge Ravenswood Long Island City            |             0.457143 |
| Freshkills Park  North                                   | Staten Island  | Annadale Huguenot Prince s Bay Eltingville          |           0         | Great Kills                              |               0.454545 | Great Kills                                         |             0.4375   |
| Freshkills Park  South                                   | Staten Island  | Annadale Huguenot Prince s Bay Eltingville          |           0         | Great Kills                              |               0.454545 | Great Kills                                         |             0.4375   |
| Pomonok Electchester Hillcrest                           | Queens         | Pomonok Flushing Heights Hillcrest                  |           0.235294  | Murray Hill                              |               0.454545 | Pomonok Flushing Heights Hillcrest                  |             0.6875   |
| Rosebank Shore Acres Park Hill                           | Staten Island  | Rossville Woodrow                                   |           0.176471  | Stapleton Rosebank                       |               0.444444 | Grasmere Arrochar Ft  Wadsworth                     |             0.4      |
| Marine Park Plumb Island                                 | Brooklyn       | Madison                                             |           0.285714  | Flatlands                                |               0.444444 | Georgetown Marine Park Bergen Beach Mill Basin      |             0.457143 |
| Barren Island Floyd Bennett Field                        | Brooklyn       | Bay Ridge                                           |           0.222222  | Flatlands                                |               0.444444 | Seagate Coney Island                                |             0.377358 |
| Jamaica Bay  West                                        | Brooklyn       | Brooklyn Heights Cobble Hill                        |           0         | Bay Ridge                                |               0.444444 | Homecrest                                           |             0.4      |
| Mount Olivet   All Faiths Cemeteries                     | Queens         | Maspeth                                             |           0.142857  | St  Albans                               |               0.444444 | Pomonok Flushing Heights Hillcrest                  |             0.382353 |
| Fort Totten                                              | Queens         | Forest Hills                                        |           0.25      | Airport                                  |               0.428571 | North Corona                                        |             0.521739 |
| St  John Cemetery                                        | Queens         | St  Albans                                          |           0.333333  | park cemetery etc Queens                 |               0.375    | park cemetery etc Queens                            |             0.45     |
| Montefiore Cemetery                                      | Queens         | Maspeth                                             |           0.142857  | park cemetery etc Queens                 |               0.375    | park cemetery etc Queens                            |             0.465116 |
| St  Michael s Cemetery                                   | Queens         | St  Albans                                          |           0.333333  | park cemetery etc Queens                 |               0.375    | Hammels Arverne Edgemere                            |             0.444444 |
| Fort Wadsworth                                           | Staten Island  | Annadale Huguenot Prince s Bay Eltingville          |           0         | Grasmere Arrochar Ft  Wadsworth          |               0.366667 | Grasmere Arrochar Ft  Wadsworth                     |             0.545455 |
| Tottenville Charleston                                   | Staten Island  | Todt Hill Emersn Hill Heartland Villg Lighthse Hill |           0.0392157 | Rossville Woodrow                        |               0.352941 | Rossville Woodrow                                   |             0.461538 |
| Holy Cross Cemetery                                      | Brooklyn       | Homecrest                                           |           0.222222  | park cemetery etc Brooklyn               |               0.346154 | park cemetery etc Brooklyn                          |             0.444444 |
| Calvary   Mount Zion Cemeteries                          | Queens         | Corona                                              |           0.166667  | Corona                                   |               0.333333 | park cemetery etc Queens                            |             0.45283  |
| United Nations                                           | Manhattan      | Upper West Side                                     |           0.0666667 | Chinatown                                |               0.333333 | Chinatown                                           |             0.521739 |
| Throgs Neck Schuylerville                                | Bronx          | Claremont Bathgate                                  |           0         | Schuylerville Throgs Neck Edgewater Park |               0.325    | Schuylerville Throgs Neck Edgewater Park            |             0.4      |
| Brooklyn Navy Yard                                       | Brooklyn       | Brooklyn Heights Cobble Hill                        |           0.321429  | Brooklyn Heights Cobble Hill             |               0.321429 | Bay Ridge                                           |             0.444444 |
| Hutchinson Metro Center                                  | Bronx          | Hunts Point                                         |           0.181818  | Belmont                                  |               0.285714 | Soundview Bruckner                                  |             0.439024 |
| Chelsea Hudson Yards                                     | Manhattan      | Chinatown                                           |           0.222222  | Clinton                                  |               0.285714 | Hudson Yards Chelsea Flat Iron Union Square         |             0.380952 |
| The Battery Governors Island Ellis Island Liberty Island | Manhattan      | Turtle Bay East Midtown                             |           0.0434783 | Clinton                                  |               0.285714 | Battery Park City Lower Manhattan                   |             0.382022 |
| Miller Field                                             | Staten Island  | Mariner s Harbor Arlington Port Ivory Graniteville  |           0.02      | Great Kills                              |               0.272727 | New Springville Bloomfield Travis                   |             0.444444 |
| Randall s Island                                         | Manhattan      | Marble Hill Inwood                                  |           0         | Lenox Hill Roosevelt Island              |               0.259259 | Lenox Hill Roosevelt Island                         |             0.55814  |
| McGuire Fields                                           | Brooklyn       | Midwood                                             |           0.142857  | Canarsie                                 |               0.25     | Homecrest                                           |             0.434783 |
| Hoffman   Swinburne Islands                              | Staten Island  | Annadale Huguenot Prince s Bay Eltingville          |           0         | park cemetery etc Staten Island          |               0.225806 | New Brighton Silver Lake                            |             0.408163 |
| Hell s Kitchen                                           | Manhattan      | Hamilton Heights                                    |           0.0625    | Chinatown                                |               0.222222 | Lenox Hill Roosevelt Island                         |             0.390244 |
| Snug Harbor                                              | Staten Island  | Stapleton Rosebank                                  |           0.0555556 | Port Richmond                            |               0.153846 | Grasmere Arrochar Ft  Wadsworth                     |             0.341463 |

Total NTAs processed:               230
NTAs initially missing population:   230
  • Filled by all-metrics agree:     108
  • Filled by 100% prefix match:     118
  • Filled by 100% reverse prefix:   91
  • Filled by 100% substring match:  130
  • Filled by 90% overall match:     85
  • Filled by fallback (80%+ & 45%+): 3
NTAs still missing population:       72


## PROBLEM MED OVENSTÅENDE STRING MATCH

![image-2.png](attachment:image-2.png)

- Nogle har ikke et godt match.

- Nogle får også et match grundet "BlaBlaBlaBla-South" får en høj prefix confidence på "BlaBlaBlaBla-North"


![image-3.png](attachment:image-3.png)

![image.png](attachment:image.png)

polygon data har parker. Man kan ikke bo i en park, bortset fra dem man godt kan bo i, så nogle mangler fra pop data. Så vi har fjernet dem som slutter på ordet "park" såsom Central park. Dog fjerner det også legit steder

## Amenities data

In [None]:
amenities_data_path = "./data/NYC_Amenities.csv"

# Function to extract coordinates from Point objects or calculate centroid for Polygons
def extract_coordinates(geometry):
    if isinstance(geometry, Point):
        return geometry.x, geometry.y
    elif isinstance(geometry, Polygon):
        centroid = geometry.centroid
        return centroid.x, centroid.y
    return None, None

amenities_df = None
amenities_gdf = None

# Apply the function to the dataset
if os.path.exists(amenities_data_path):
    print("Loading amenities data from local file...")
    amenities_df = pd.read_csv(amenities_data_path, low_memory=False)
    # convert to geodataframe
    amenities_gdf = gpd.GeoDataFrame(
        amenities_df,
        geometry=amenities_df["geometry"].apply(loads),
        crs="EPSG:4326"
    ).to_crs(epsg=3857)
else:
    print("Querying OSM for amenities data...")
    # Define a dictionary of tags for the amenities you're interested in
    tags = {
        "amenity": ["bar", "restaurant"],
        "leisure": "park",
        "railway": "station"
    }

    # Use OSMnx to query OSM for these features in New York City
    amenities_gdf = ox.features.features_from_place("New York City, USA", tags)

    # Extract coordinates or calculate centroid
    amenities_gdf[['Longitude', 'Latitude']] = amenities_gdf['geometry'].apply(lambda x: pd.Series(extract_coordinates(x)))

    # Save the queried data to a CSV file for future use
    amenities_gdf.to_csv(amenities_data_path, index=False)
    amenities_df = amenities_gdf

# Create a new column 'category' to combine 'leisure' and 'amenity'
amenities_gdf['category'] = amenities_gdf['leisure'].combine_first(amenities_gdf['amenity']).combine_first(amenities_gdf['railway'])

# 5) Filter to exactly the four types you want
keep = ["bar", "restaurant", "park", "station"]
amenities_gdf = amenities_gdf[amenities_gdf["category"].isin(keep)]

for amenity in amenities_gdf['category'].unique():
    print(f"Number of {amenity}: {len(amenities_gdf[amenities_gdf['category'] == amenity])}")
    


# VISUALIZATION

## TABLES

In [None]:
from IPython.display import display, Markdown
import numpy as np

def preview_gdf(
    gdf, 
    name, 
    groupby=None, 
    max_groups=None, 
    cols=None, 
    n=5, 
    random_state=42
):
    """
    Preview a GeoDataFrame in Markdown:
    - Reprojects to WGS84 (lat/lon).
    - Computes 'Latitude'/'Longitude' from centroids.
    - Always shows the 'geometry' column.
    - If `groupby` is provided, samples 1 row per group (up to max_groups).
      Otherwise, shows the first n rows.
    - You can pass `cols` to include extra columns before geometry/lat/lon.
    """
    # 1) Copy and reproject
    df = gdf.copy().to_crs(epsg=4326)
    # 2) Compute lat/lon
    centroids = df.geometry.centroid
    df['Latitude']  = centroids.y
    df['Longitude'] = centroids.x

    # 3) Decide rows to show
    if groupby:
        groups = df[groupby].dropna().unique()
        if max_groups and len(groups) > max_groups:
            rng = np.random.default_rng(random_state)
            groups = rng.choice(groups, size=max_groups, replace=False)
        df = (
            df[df[groupby].isin(groups)]
            .groupby(groupby, group_keys=False)
            .apply(lambda sub: sub.sample(n=1, random_state=random_state))
            .reset_index(drop=True)
        )
        title = f"{name} — Representative (1 per '{groupby}')"
    else:
        df = df.head(n)
        title = f"{name} — Sample ({n} rows)"

    # 4) Build column list: user cols + geometry + lat/lon
    display_cols = []
    if cols:
        display_cols += cols
    # always include geometry
    display_cols.append('geometry')
    # then lat/lon
    display_cols += ['Latitude', 'Longitude']

    # 5) Render Markdown
    md = df[display_cols].to_markdown(index=False)
    display(Markdown(f"## {title}\n\n{md}"))

def preview_df(
    df, 
    name, 
    n=5
):
    """
    Preview a regular DataFrame in Markdown:
    - Schema (column names & dtypes)
    - First n rows
    """
    schema = df.dtypes.reset_index()
    schema.columns = ['column', 'dtype']
    md_schema = schema.to_markdown(index=False)
    md_sample = df.head(n).to_markdown(index=False)
    display(Markdown(f"## {name} — Schema\n\n{md_schema}"))
    display(Markdown(f"## {name} — Sample ({n} rows)\n\n{md_sample}"))

# === Usage ===

# 1) Crime: one per category
crime_wgs = crime_gdf.to_crs(epsg=4326)
preview_gdf(
    crime_wgs,
    "Crime Data (2019)",
    groupby='LAW_CAT_CD',
    max_groups=3,
    cols=['CMPLNT_FR_DT', 'LAW_CAT_CD', 'BORO_NM']
)

# 2) Amenities: one per type
amen_wgs = amenities_gdf.to_crs(epsg=4326)
preview_gdf(
    amen_wgs,
    "Amenities Data",
    groupby='category',
    max_groups=4,
    cols=['category', 'name']
)

# 3) NTA Polygons (with Population) — only sample from those with a real population
nta_wgs = nta_polys_gdf.to_crs(epsg=4326)

# filter out any NTAs still missing population
nta_with_pop = nta_wgs[nta_wgs['population'].notna()]

# pick exactly one representative row (with a real population)
preview_gdf(
    nta_with_pop,
    "NTA Polygons (with Population)",
    groupby='NTA2020',
    max_groups=1,
    cols=['NTA2020', 'NTAName', 'BoroName', 'population']
)

# 4) NTA population table (regular DataFrame)
preview_df(
    nta_pop_df,
    "NTA Population Data",
    n=5
)


## Interactive map

In [None]:
import folium
import geopandas as gpd
import numpy as np
from shapely.ops import unary_union
from folium.plugins import HeatMap, MarkerCluster, GroupedLayerControl

# === CONFIGURATION ===
USE_SAMPLE      = True      # If True, sample a fraction of points for speed
SAMPLE_FRACTION = 0.03      # Fraction to sample when USE_SAMPLE=True
RANDOM_STATE    = 42

# === 1) Prepare GeoDataFrames ===
# Assumes crime_gdf, amenities_gdf, nta_polys_gdf are pre-loaded

# Reproject all to WGS84 (latitude/longitude) for mapping
crime     = crime_gdf.to_crs(epsg=4326).copy()
amenities = amenities_gdf.to_crs(epsg=4326).copy()
ntas      = nta_polys_gdf.to_crs(epsg=4326).copy()

# Also prepare Web Mercator for distance-in-meters computations
crime_m     = crime.to_crs(epsg=3857).copy()
amenities_m = amenities.to_crs(epsg=3857).copy()
ntas_m      = ntas.to_crs(epsg=3857).copy()

# === 2) (Optional) Sample for performance ===
if USE_SAMPLE:
    crime     = crime.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_STATE)
    crime_m   = crime_m.loc[crime.index]
    amenities = amenities.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_STATE)
    amenities_m = amenities_m.loc[amenities.index]

# === 3) Extract latitude and longitude for Folium ===
crime['latitude']   = crime.geometry.y
crime['longitude']  = crime.geometry.x
amen_centroids      = amenities.geometry.centroid
amenities['latitude']  = amen_centroids.y
amenities['longitude'] = amen_centroids.x

# === 4) Count crimes and amenities per NTA ===
crime_ntas = gpd.sjoin(
    crime[['geometry']],
    ntas[['NTA2020','population','geometry']],
    how='inner', predicate='within'
)
crime_counts = crime_ntas.groupby('NTA2020').size().rename('crime_count')

amen_ntas = gpd.sjoin(
    amenities[['geometry']],
    ntas[['NTA2020','population','geometry']],
    how='inner', predicate='within'
)
amenity_counts = amen_ntas.groupby('NTA2020').size().rename('amenity_count')

ntas = (
    ntas.set_index('NTA2020')
        .join(crime_counts, how='left')
        .join(amenity_counts, how='left')
        .fillna({'crime_count': 0, 'amenity_count': 0})
        .reset_index()
)
ntas['crime_rate']             = ntas['crime_count']            / ntas['population']
ntas['amenity_per_capita']     = ntas['amenity_count']          / ntas['population']
ntas['crime_to_amenity_ratio'] = ntas['crime_count']            / ntas['amenity_count'].replace({0: np.nan})

# === 5) Aggregate metrics per borough ===
boroughs = (
    ntas[['BoroName','population','crime_count','amenity_count','geometry']]
    .dissolve(by='BoroName', aggfunc='sum')
    .reset_index()
)
boroughs['density']           = boroughs['population'] / boroughs.geometry.to_crs(epsg=3857).area
boroughs['crime_rate_boro']   = boroughs['crime_count']   / boroughs['population']
boroughs['amenity_rate_boro'] = boroughs['amenity_count'] / boroughs['population']

# === 6) Pre‐compute MultiPoint unions for each amenity type ===
unions = {}
for amen_type in ['bar','restaurant','park','station']:
    pts = amenities_m[amenities_m['category'] == amen_type].geometry
    unions[amen_type] = unary_union(pts)

# === 7) Compute nearest‐amenity distance for each crime point ===
for amen_type, union_geom in unions.items():
    # distance in meters to the single closest amenity of that type
    crime_m[f'dist_to_{amen_type}'] = crime_m.geometry.distance(union_geom)
    crime[f'dist_to_{amen_type}']   = crime_m.loc[crime.index, f'dist_to_{amen_type}']

# === 8) Compute average distances per NTA and per borough ===
distance_fields = ['dist_to_bar','dist_to_restaurant','dist_to_park','dist_to_station']

# Average per NTA
c2n = gpd.sjoin(
    crime[['geometry'] + distance_fields],
    ntas[['NTA2020','geometry']],
    how='inner', predicate='within'
)
dist_ntas = (
    c2n.groupby('NTA2020')[distance_fields]
       .mean()
       .rename(columns={f: f'avg_{f}' for f in distance_fields})
)
ntas = ntas.set_index('NTA2020').join(dist_ntas).reset_index()

# Average per borough
c2b = gpd.sjoin(
    crime[['geometry'] + distance_fields],
    boroughs[['BoroName','geometry']],
    how='inner', predicate='within'
)
dist_boros = (
    c2b.groupby('BoroName')[distance_fields]
       .mean()
       .rename(columns={f: f'avg_{f}' for f in distance_fields})
)
boroughs = boroughs.set_index('BoroName').join(dist_boros).reset_index()

# === 9) Initialize Folium map ===
center = [crime['latitude'].mean(), crime['longitude'].mean()]
m = folium.Map(location=center, zoom_start=11, tiles=None)
folium.TileLayer('CartoDB Positron', name='Basemap', control=True).add_to(m)

# === 10) Crime point and cluster layers ===
crime_point_layers = []
crime_cluster_layers = []
for category, color in [('Felony','crimson'),('Misdemeanor','orange'),('Violation','blue')]:
    # Individual points
    fg_pts = folium.FeatureGroup(name=f"Crime: {category}", show=False)
    subset = crime[crime['LAW_CAT_CD'].str.title() == category]
    for _, r in subset.iterrows():
        folium.CircleMarker(
            [r['latitude'], r['longitude']],
            radius=3, color=color, fill=True, fill_opacity=0.6
        ).add_to(fg_pts)
    fg_pts.add_to(m)
    crime_point_layers.append(fg_pts)

    # Cluster markers
    fg_cl = MarkerCluster(name=f"Crime Clusters: {category}", show=False)
    for _, r in subset.iterrows():
        folium.Marker([r['latitude'], r['longitude']]).add_to(fg_cl)
    fg_cl.add_to(m)
    crime_cluster_layers.append(fg_cl)

# === 11) Crime heatmap ===
crime_heat = folium.FeatureGroup(name="Crime Heatmap", show=False)
HeatMap(
    list(zip(crime['latitude'], crime['longitude'])),
    radius=15, blur=10, min_opacity=0.3
).add_to(crime_heat)
crime_heat.add_to(m)

# === 12) Amenity point and cluster layers ===
amenity_point_layers = []
amenity_cluster_layers = []
for amen, color in [('Bar','purple'),('Restaurant','darkgreen'),('Park','green'),('Station','cadetblue')]:
    # Small circle markers
    fg_pts = folium.FeatureGroup(name=f"Amenity: {amen}", show=False)
    mc = MarkerCluster().add_to(fg_pts)
    subset = amenities[amenities['category'].str.title() == amen]
    for _, r in subset.iterrows():
        folium.CircleMarker(
            [r['latitude'], r['longitude']],
            radius=3, color=color, fill=True, fill_opacity=0.6
        ).add_to(mc)
    fg_pts.add_to(m)
    amenity_point_layers.append(fg_pts)

    # Clustered raw markers
    fg_cl = MarkerCluster(name=f"Amenity Clusters: {amen}", show=False)
    for _, r in subset.iterrows():
        folium.Marker([r['latitude'], r['longitude']]).add_to(fg_cl)
    fg_cl.add_to(m)
    amenity_cluster_layers.append(fg_cl)

# === 13) Amenity heatmap ===
amen_heat = folium.FeatureGroup(name="Amenity Heatmap (All)", show=False)
HeatMap(
    list(zip(amenities['latitude'], amenities['longitude'])),
    radius=15, blur=10, min_opacity=0.3
).add_to(amen_heat)
amen_heat.add_to(m)

# === 14) Choropleth style helper ===
def make_choro_style(series):
    cuts = series.quantile([0.2,0.4,0.6,0.8]).values
    def style_fn(feature):
        v = feature['properties'].get(series.name)
        if v is None or np.isnan(v):
            c = 'grey'
        elif v > cuts[3]:
            c = 'red'
        elif v > cuts[2]:
            c = 'orange'
        elif v > cuts[1]:
            c = 'yellow'
        elif v > cuts[0]:
            c = 'lightgreen'
        else:
            c = 'green'
        return {'fillColor': c, 'color': 'black', 'weight': 1, 'fillOpacity': 0.6}
    return style_fn

# === 15) NTA metric choropleths ===
nta_layers = []
for title, field, aliases in [
    ("Crime Rate",         'crime_rate',             ['Neighborhood','Crime Rate']),
    ("Amenities/Capita",   'amenity_per_capita',     ['Neighborhood','Amenities/Capita']),
    ("Crime/Amenity Ratio",'crime_to_amenity_ratio',['Neighborhood','Crime/Amenity'])
]:
    fg = folium.FeatureGroup(name=f"NTA: {title}", show=False)
    folium.GeoJson(
        ntas,
        style_function=make_choro_style(ntas[field]),
        tooltip=folium.GeoJsonTooltip(fields=['NTAName', field], aliases=aliases)
    ).add_to(fg)
    fg.add_to(m)
    nta_layers.append(fg)

# === 16) Borough metric choropleths ===
boro_layers = []
for title, field, aliases in [
    ("Population Density",'density',           ['Borough','Density']),
    ("Crime Rate",       'crime_rate_boro',   ['Borough','Crime Rate']),
    ("Amenities Rate",   'amenity_rate_boro', ['Borough','Amenities Rate'])
]:
    fg = folium.FeatureGroup(name=f"Borough: {title}", show=False)
    folium.GeoJson(
        boroughs,
        style_function=make_choro_style(boroughs[field]),
        tooltip=folium.GeoJsonTooltip(fields=['BoroName', field], aliases=aliases)
    ).add_to(fg)
    # Label boroughs
    for _, r in boroughs.iterrows():
        folium.map.Marker(
            [r.geometry.centroid.y, r.geometry.centroid.x],
            icon=folium.DivIcon(html=f"<div style='font-size:12px'><b>{r['BoroName']}</b></div>")
        ).add_to(fg)
    fg.add_to(m)
    boro_layers.append(fg)

# === 17) NTA average distance to each amenity type ===
distance_layers = []
for amen_type in ['bar','restaurant','park','station']:
    field = f'avg_dist_to_{amen_type}'
    fg = folium.FeatureGroup(name=f"NTA Avg Distance to {amen_type.title()} (m)", show=False)
    folium.GeoJson(
        ntas,
        style_function=make_choro_style(ntas[field]),
        tooltip=folium.GeoJsonTooltip(
            fields=['NTAName', field],
            aliases=['Neighborhood', f'Avg m to {amen_type.title()}']
        )
    ).add_to(fg)
    # Numeric labels
    for _, r in ntas.iterrows():
        val = r[field]
        if not np.isnan(val):
            folium.map.Marker(
                [r.geometry.centroid.y, r.geometry.centroid.x],
                icon=folium.DivIcon(html=f"<div style='font-size:8px;color:black'>{int(val)}</div>")
            ).add_to(fg)
    fg.add_to(m)
    distance_layers.append(fg)

# === 18) Crime→Amenity Distances by Type ===
combo_layers = []
for crime_cat in ['Felony','Misdemeanor','Violation']:
    for amen_type in ['bar','restaurant','park','station']:
        field = f'avg_dist_{crime_cat.lower()}_to_{amen_type}'
        # Compute mean nearest distance per NTA
        subset = crime[crime['LAW_CAT_CD'].str.title() == crime_cat]
        c2n_cat = gpd.sjoin(
            subset[['geometry', f'dist_to_{amen_type}']],
            ntas[['NTA2020','geometry']],
            how='inner', predicate='within'
        )
        avg_dist = c2n_cat.groupby('NTA2020')[f'dist_to_{amen_type}'].mean()
        ntas[field] = ntas['NTA2020'].map(avg_dist)

        explanation = (
            f"Average over all {crime_cat.lower()} incidents of the "
            f"distance (in meters) to the single closest {amen_type}"
        )
        layer_name = (
            f"NTA {crime_cat}→{amen_type.title()} Distance "
            f"(*{explanation}*)"
        )
        fg = folium.FeatureGroup(name=layer_name, show=False)
        folium.GeoJson(
            ntas,
            style_function=make_choro_style(ntas[field]),
            tooltip=folium.GeoJsonTooltip(
                fields=['NTAName', field],
                aliases=['Neighborhood', f'{crime_cat}→{amen_type.title()} (m)']
            )
        ).add_to(fg)
        # Value labels
        for _, r in ntas.iterrows():
            val = r[field]
            if not np.isnan(val):
                folium.map.Marker(
                    [r.geometry.centroid.y, r.geometry.centroid.x],
                    icon=folium.DivIcon(html=f"<div style='font-size:7px'>{int(val)}</div>")
                ).add_to(fg)
        fg.add_to(m)
        combo_layers.append(fg)

# === 19) Grouped Layer Control ===
GroupedLayerControl(
    groups={
        'Crime Points':                    crime_point_layers,
        'Crime Clusters':                  crime_cluster_layers,
        'Crime Heatmap':                   [crime_heat],
        'Amenities Points':                amenity_point_layers,
        'Amenity Clusters':                amenity_cluster_layers,
        'Amenity Heatmap':                 [amen_heat],
        'NTA Metrics':                     nta_layers,
        'Borough Metrics':                 boro_layers,
        'NTA Distances to Amenities':      distance_layers,
        'Crime→Amenity Distances by Type': combo_layers
    },
    exclusive_groups=[],    # allow multiple on/off
    collapse=False,
    position='topright'
).add_to(m)

# === 20) Display the map ===
m


# Brugbar tekst

## god kilde

Urban crime patterns have been studied from both social and spatial perspectives. Criminology theories such as Broken Windows (Wilson & Kelling, 1982) suggest that the environment plays a role in crime prevalence – for example, disorder in the physical environment might encourage criminal behavior. On the other hand, urbanist Jane Jacobs (1961) argued that active streetscapes with plenty of “eyes on the street” can deter crime, implying that amenities attracting people (cafés, bars, etc.) might enhance safety through informal surveillance.

# Crime Hotspots (Clustering Analysis) - DBSCAN


In [None]:
# --- DBSCAN clustering to identify hotspots ---
coords = np.column_stack([crime_gdf.geometry.x, crime_gdf.geometry.y])
db = DBSCAN(eps=500, min_samples=30).fit(coords)
crime_gdf["cluster"] = db.labels_

# Identify top 5 largest clusters (exclude noise)
labels = crime_gdf["cluster"].values
clusters = [lab for lab in set(labels) if lab != -1]
clusters.sort(key=lambda c: (labels == c).sum(), reverse=True)
top5 = clusters[:5]
print("Top 5 clusters and sizes:", {c: int((labels == c).sum()) for c in top5})

# Build convex hulls for these clusters
hotspot_hulls = []
for c in top5:
    pts = crime_gdf[crime_gdf["cluster"] == c]
    hull = pts.unary_union.convex_hull
    hotspot_hulls.append({
        "cluster": c,
        "count": int((labels == c).sum()),
        "geometry": hull
    })
hotspots_gdf = gpd.GeoDataFrame(hotspot_hulls, crs=crime_gdf.crs)

# Ensure Web Mercator for contextily
hotspots_gdf = hotspots_gdf.to_crs(epsg=3857)

# --- Plotting ---
fig, ax = plt.subplots(figsize=(10, 10))
# sample crime points for context
crime_gdf.sample(frac=0.1).plot(ax=ax, color="grey", alpha=0.1, markersize=1)

# plot hull boundaries and fills
hotspots_gdf.boundary.plot(
    ax=ax, color='red', linewidth=2, linestyle='--'
)
hotspots_gdf.plot(
    ax=ax, column='cluster', alpha=0.3, cmap='Set1'
)

# add basemap with fallback
try:
    ctx.add_basemap(
        ax,
        source=ctx.providers.OpenStreetMap.Mapnik,
        crs=crime_gdf.crs
    )
except Exception as e:
    print("⚠️ Basemap failed:", e)

ax.set_title(
    "Figure 2. Top 5 Crime Hotspots in NYC (2019)", fontsize=15
)
ax.axis("off")
plt.tight_layout()
plt.show()


In [None]:
# AGAIN USE NTA's instead of boroughs

# 1) Ensure boroughs has the necessary columns
boroughs['boro_name'] = boroughs.get('boro_name', boroughs['BoroName'])
boroughs['crime_count'] = boroughs['crime_count'].fillna(0).astype(int)

# 2) Build Queen contiguity weights and row-standardize
w = libpysal.weights.Queen.from_dataframe(boroughs)
w.transform = 'R'

# 3) Extract crime counts and compute spatial lag (neighbor average)
counts = boroughs['crime_count'].values
lag = libpysal.weights.lag_spatial(w, counts)

# 4) Print a summary table
print(f"{'Borough':<20} {'Crime':>6} {'Neighbor Avg':>14}")
print("-" * 42)
for name, cnt, nbr_avg in zip(boroughs['boro_name'], counts, lag):
    print(f"{name:<20} {cnt:6d} {nbr_avg:14.1f}")

# 5) Compute Global Moran's I
mi = esda.Moran(counts, w)
print(f"\nGlobal Moran's I: {mi.I:.3f}")
print(f"p-value:           {mi.p_sim:.3f}")

# 6) Add the spatial lag back to the GeoDataFrame
boroughs['spatial_lag'] = lag

# 7) Choropleth maps side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 7))

boroughs.plot(
    column='crime_count',
    cmap='Reds',
    legend=True,
    ax=axes[0],
    edgecolor='black'
)
axes[0].set_title('Crime Count by Borough (2019)')
axes[0].axis('off')

boroughs.plot(
    column='spatial_lag',
    cmap='Blues',
    legend=True,
    ax=axes[1],
    edgecolor='black'
)
axes[1].set_title('Spatial Lag of Crime Count')
axes[1].axis('off')

plt.tight_layout()
plt.show()

# 8) Moran’s I scatterplot
y     = (counts      - counts.mean())      / counts.std()
y_lag = (lag - lag.mean()) / lag.std()

fig, ax = plt.subplots(figsize=(6, 6))
ax.axhline(0, color='gray', linewidth=1)
ax.axvline(0, color='gray', linewidth=1)
ax.scatter(y, y_lag, s=100, color='steelblue')

for i, name in enumerate(boroughs['boro_name']):
    ax.text(y[i] + 0.02, y_lag[i] + 0.02, name, fontsize=9)

ax.set_xlabel('Standardized Crime Count')
ax.set_ylabel('Standardized Spatial Lag')
ax.set_title("Moran's I Scatterplot")
plt.tight_layout()
plt.show()


# Spatial Clustering

In [None]:
# 1) Count restaurants per borough via spatial join
rests = amenities_gdf[amenities_gdf["category"] == "restaurant"]
rests_in_boro = gpd.sjoin(
    rests,
    boroughs[["boro_name", "geometry"]],
    how="inner",
    predicate="within"
)
rest_counts = rests_in_boro.groupby("boro_name").size().to_dict()

# 2) Add restaurant counts to boroughs GeoDataFrame
boroughs["rest_count"] = boroughs["boro_name"].map(rest_counts).fillna(0).astype(int)

# 3) Build feature matrix [crime_count, rest_count] and normalize
features = boroughs[["crime_count", "rest_count"]].values
scaler = StandardScaler()
X = scaler.fit_transform(features)

# 4) Fit KMeans (2 clusters) and attach labels
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
boroughs["cluster"] = kmeans.labels_

# 5) Print results
for _, row in boroughs.iterrows():
    print(
        f"{row.boro_name}: Crime={row.crime_count}, "
        f"Restaurants={row.rest_count}, Cluster={row.cluster}"
    )


In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

# 1. Prepare coordinates of all crime points (in Web Mercator meters)
coords = np.column_stack([crime_gdf.geometry.x, crime_gdf.geometry.y])

# 2. Compute nearest‐neighbor distances (skip self–distance)
nn = NearestNeighbors(n_neighbors=2, metric='euclidean').fit(coords)
distances, _ = nn.kneighbors(coords)
nn_distances = distances[:, 1]  # true nearest neighbor

# 3. Calculate observed Average Nearest Neighbor (ANN)
ann = nn_distances.mean()

# 4. Calculate expected ANN under Complete Spatial Randomness (CSR)
area = boroughs.geometry.unary_union.area     # total study area in m²
n_points = len(crime_gdf)                     # number of crime incidents
expected_ann = 0.5 * np.sqrt(area / n_points)

# 5. Calculate the R ratio
R = ann / expected_ann

# 6. Beginner‐friendly printout
print("=== Average Nearest Neighbor (ANN) Analysis ===")
print(f"Observed ANN: {ann:.2f} m")
print(f"Expected ANN (CSR): {expected_ann:.2f} m")
print(f"R ratio: {R:.3f}\n")
print("Interpretation:")
if R < 1:
    print(f"  • R = {R:.3f} < 1 → points are clustered (closer than random).")
elif R > 1:
    print(f"  • R = {R:.3f} > 1 → points are dispersed (farther than random).")
else:
    print(f"  • R = {R:.3f} ≈ 1 → random spatial pattern.")
print(f"On average, each crime is {ann:.2f} m from its nearest neighbor,")
print(f"much smaller than the {expected_ann:.2f} m we'd expect if crimes were random.\n")

# 7. Visualization: histogram of nearest‐neighbor distances
plt.figure(figsize=(8, 6))
plt.hist(nn_distances, bins=50, edgecolor='black')
plt.axvline(ann, color='red', linestyle='--', linewidth=2, label=f'Observed ANN ({ann:.2f} m)')
plt.axvline(expected_ann, color='blue', linestyle=':', linewidth=2, label=f'Expected ANN ({expected_ann:.2f} m)')
plt.title("Histogram of Nearest‐Neighbor Distances (Crime Incidents)")
plt.xlabel("Distance to Nearest Crime (meters)")
plt.ylabel("Frequency")
plt.legend()
plt.tight_layout()
plt.show()
