In [None]:
# import and build geolocation from tracts for Travis
import itertools
from pathlib import Path

import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Single place to repoint the notebook at a different copy of the data.
DATA_DIR = Path(r"C:\Users\nicol\OneDrive\DAT490")

crime_df = pd.read_csv(DATA_DIR / "RQ2_Crime_Final_Pct.csv")

# Attribute table only: geometry is skipped (ignore_geometry=True), so this is
# used purely for tract-ID lookups, not for mapping.
tracts = gpd.read_file(
    DATA_DIR / "cb_2023_us_tract_500k" / "cb_2023_us_tract_500k.shp",
    ignore_geometry=True
)

# Keep state FIPS 48 / county FIPS 453 (the Travis subset used below).
# .copy() makes this an independent frame, so later column assignments on it
# do not trigger SettingWithCopyWarning or write through to `tracts`.
tracts_tx_travis = tracts[
    (tracts["STATEFP"] == "48") &
    (tracts["COUNTYFP"] == "453")
].copy()

# Build a string join key, left-padded with zeros to 11 characters.
# NOTE(review): assumes tract_geoid is an integer-like column with no missing
# values; a float column would stringify as "...0" and NaN as "nan" - confirm.
crime_df["tract_geoid_str"] = (
    crime_df["tract_geoid"]
    .astype(str)
    .str.zfill(11)
)

# Quick sanity checks
print("Crime data – tract IDs (first 5 rows):")
print(crime_df[["tract_geoid", "tract_geoid_str"]].head(), "\n")

print("Travis County tract subset shape (rows, cols):")
print(tracts_tx_travis.shape, "\n")

print("Travis County tract IDs sample:")
print(tracts_tx_travis[["GEOID", "NAME", "NAMELSAD"]].head())

In [None]:
# Count rows of crime_df per (tract, crime category), then pivot the
# categories into columns; combinations that never occur become 0.
incident_counts = crime_df.groupby(["tract_geoid_str", "Crime_Category"]).size()
crime_counts = incident_counts.unstack(fill_value=0).reset_index()

# Row-wise total across every category column (everything except the key).
category_counts = crime_counts.drop(columns=["tract_geoid_str"])
crime_counts["total_crime"] = category_counts.sum(axis=1)

# One row per tract in the Travis subset, keyed like the crime table.
tracts_travis_simple = tracts_tx_travis[["GEOID"]].copy()
tracts_travis_simple = tracts_travis_simple.rename(
    columns={"GEOID": "tract_geoid_str"}
)

# Left join keeps tracts with no matching crime rows; their counts become 0.
tracts_crime = pd.merge(
    tracts_travis_simple,
    crime_counts,
    on="tract_geoid_str",
    how="left",
).fillna(0)

print("Crime counts by tract (first 10 rows):")
print(tracts_crime.head(10))

print("\nColumns in tract-level crime summary:")
print(tracts_crime.columns.tolist())

In [None]:
# Flag, per crime category, the tracts at or above the chosen quantile of
# that category's tract-level counts.
HOTSPOT_QUANTILE = 0.90

hotspots = tracts_crime.copy()

# Every column except the key and the overall total is a crime category.
crime_cols = [c for c in hotspots.columns
              if c not in ["tract_geoid_str", "total_crime"]]

hotspot_thresholds = {
    col: hotspots[col].quantile(HOTSPOT_QUANTILE)
    for col in crime_cols
}

for col in crime_cols:
    cutoff = hotspot_thresholds[col]
    # Require a positive count as well: if a category's cutoff is 0, a bare
    # ">= cutoff" would label every tract (including zero-crime tracts) a hotspot.
    hotspots[f"{col}_hotspot"] = (hotspots[col] >= cutoff) & (hotspots[col] > 0)

# Number of categories in which each tract is flagged.
hotspot_flag_cols = [f"{col}_hotspot" for col in crime_cols]
hotspots["multi_hotspot_count"] = hotspots[hotspot_flag_cols].sum(axis=1)

print("Hotspot thresholds (90th percentile) per crime category:")
for col in crime_cols:
    print(f"  {col}: {hotspot_thresholds[col]}")

print("\nSample of hotspot-labeled tracts:")
print(hotspots.head(12))

In [None]:
# Summarise the hotspot flags: per-category counts, how many categories each
# tract is flagged in, and which combinations of categories occur together.
hot = hotspots.copy()

crime_cols = ["Administrative", "Drug", "Other", "Property", "Public Order", "Violent"]
hotspot_flag_cols = [f"{col}_hotspot" for col in crime_cols]

# Number of flagged tracts per category, keyed by category name.
category_hotspot_counts = dict(
    zip(crime_cols, (hot[flag].sum() for flag in hotspot_flag_cols))
)

print("=== Hotspot count per crime category ===")
for category, n_tracts in category_hotspot_counts.items():
    print(f"{category:15s}: {n_tracts} tracts")
print("\n")

multi_dist = hot["multi_hotspot_count"].value_counts().sort_index()

print("=== Number of crime types per hotspot tract (distribution) ===")
print(multi_dist)
print("\n")

# For each tract, the tuple of category names whose flag is set
# (in crime_cols order; an empty tuple when none is set).
flag_rows = hot[hotspot_flag_cols].to_numpy()
hot["hotspot_combo"] = pd.Series(
    [
        tuple(name for name, flagged in zip(crime_cols, row_flags) if flagged)
        for row_flags in flag_rows
    ],
    index=hot.index,
    dtype=object,
)

combo_counts = hot["hotspot_combo"].value_counts()

print("=== Most common hotspot combinations (top 10) ===")
print(combo_counts.head(10))
print("\n")

print("Sample tracts with hotspot combinations:")
print(hot[["tract_geoid_str", "hotspot_combo", "multi_hotspot_count"]].head(12))

In [None]:
# Build the numeric feature matrix used for clustering: drop the listed
# identifier / incident columns, keep numeric columns that are less than 40%
# missing, and fill the remaining gaps with each column's median.
exclude_cols = [
    "year_occurred", "Time", "Highest Offense Description",
    "Crime_Category", "Census Block Group",
    "tract_geoid", "tract_geoid_str"
]

# errors='ignore' tolerates excluded names that are not present in crime_df.
demo_df = crime_df.copy().drop(columns=exclude_cols, errors='ignore')

demo_num = demo_df.select_dtypes(include="number")

# Fraction of missing values per numeric column.
missing_pct = demo_num.isna().mean()
keep_cols = [name for name, frac in missing_pct.items() if frac < 0.40]

demo_num = demo_num.loc[:, keep_cols]

column_medians = demo_num.median()
demo_num = demo_num.fillna(column_medians)

print("Demographic feature set shape:")
print(demo_num.shape)

print("\nFirst 5 rows of cleaned demographic features:")
print(demo_num.head())

In [None]:
# Standardize the demographic features and run K-Means clustering on them.
# StandardScaler and KMeans are imported in the notebook's top import cell.
#
# NOTE(review): demo_num is derived from crime_df without dropping rows, so
# one label is produced per crime_df row. If crime_df holds one row per
# incident, tracts with more rows carry more weight in the fit - confirm
# that this is intended.

scaler = StandardScaler()
demo_scaled = scaler.fit_transform(demo_num)

print("Scaled demographic matrix shape:", demo_scaled.shape)

k = 5  # number of clusters
kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
cluster_labels = kmeans.fit_predict(demo_scaled)

# Labels are attached to tract IDs purely by row position, so the two must
# line up one-to-one; fail loudly here rather than mislabel tracts.
assert len(cluster_labels) == len(crime_df), (
    "cluster labels and crime_df rows are misaligned"
)

demo_clusters = pd.DataFrame({
    "tract_geoid_str": crime_df["tract_geoid_str"].values,
    "demo_cluster": cluster_labels
})

print("\nCluster label counts:")
print(demo_clusters["demo_cluster"].value_counts().sort_index())

print("\nFirst 10 cluster assignments:")
print(demo_clusters.head(10))

In [None]:
# Collapse demographic clusters to one row per tract.
# Use the most frequent label among a tract's rows instead of whichever row
# happens to come first: if rows of the same tract received different labels,
# "first" would depend on row order. Series.mode() returns its values sorted,
# so ties resolve to the smallest label and the result is deterministic.
demo_clusters_unique = (
    demo_clusters
    .groupby("tract_geoid_str", as_index=False)
    .agg({"demo_cluster": lambda labels: labels.mode().iloc[0]})
)

print("Unique demographic cluster rows:", demo_clusters_unique.shape)
print(demo_clusters_unique.head())

In [None]:
# `travis_geoids` was not defined anywhere in the notebook, so this cell
# raised NameError on a fresh kernel. Derive it from the Travis tract table.
travis_geoids = set(tracts_tx_travis["GEOID"].astype(str))

# Keep only cluster assignments whose tract ID is in the Travis subset.
demo_clusters_travis = demo_clusters_unique[
    demo_clusters_unique["tract_geoid_str"].isin(travis_geoids)
].copy()

print("Filtered to Travis County:", demo_clusters_travis.shape)

In [None]:
# Attach each tract's demographic cluster to its hotspot flags.
# The original referenced `crime_hotspots`, which is never defined in this
# notebook; `hotspots` is the frame that carries the *_hotspot flags and
# multi_hotspot_count that the following cells read from crime_demo.
crime_demo = hotspots.merge(
    demo_clusters_travis,
    on="tract_geoid_str",
    how="left",
    # One cluster row per tract is expected; raise instead of silently
    # duplicating tract rows if that ever stops being true.
    validate="many_to_one"
)

print("Merged shape:", crime_demo.shape)
# Tracts with no cluster assignment keep NaN in demo_cluster after the left join.
print("Missing clusters:", crime_demo["demo_cluster"].isna().sum())
print(crime_demo.head())

In [None]:
# Per-cluster summary tables: share of tracts flagged in each category, and
# how many tracts fall at each multi-hotspot level.
hotspot_cols = [name for name in crime_demo.columns if name.endswith("_hotspot")]

print("Hotspot columns:", hotspot_cols)

by_cluster = crime_demo.groupby("demo_cluster")

# Mean of a boolean flag = fraction of the cluster's tracts that are flagged.
hotspot_rates = by_cluster[hotspot_cols].mean().sort_index()

print("\n=== Hotspot Rate by Cluster (Table) ===")
print(hotspot_rates)

# Rows: cluster, columns: multi_hotspot_count value, cells: number of tracts.
multi_counts = (
    by_cluster["multi_hotspot_count"]
    .value_counts()
    .unstack(fill_value=0)
    .sort_index()
)

# Make sure every level 0..6 has a column, even if no tract reached it.
missing_levels = [level for level in range(7) if level not in multi_counts.columns]
for level in missing_levels:
    multi_counts[level] = 0

ordered_levels = sorted(multi_counts.columns)
multi_counts = multi_counts[ordered_levels]

print("\n=== Multi-Hotspot Count Frequencies by Cluster (Table) ===")
print(multi_counts)


def plot_heatmap(data, title, cmap="Blues"):
    """Draw a 2-D table as a heatmap with labelled axes and show it.

    Parameters
    ----------
    data : table whose values are drawn with ``imshow``; its ``columns`` label
        the x ticks and its ``index`` labels the y ticks.
    title : text placed above the plot.
    cmap : matplotlib colormap name (default ``"Blues"``).

    The colorbar is labelled "Rate" when the largest value in ``data`` is at
    most 1, otherwise "Count". Nothing is returned; the figure is shown.
    """
    n_rows, n_cols = data.shape

    fig, ax = plt.subplots(figsize=(10, 6))
    image = ax.imshow(data, cmap=cmap)

    # One tick per column / row, labelled from the table itself.
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticklabels(data.columns)
    ax.set_yticklabels(data.index)

    # Slant the x labels so longer names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

    if data.max().max() <= 1:
        colorbar_label = "Rate"
    else:
        colorbar_label = "Count"

    cbar = plt.colorbar(image, ax=ax)
    cbar.ax.set_ylabel(colorbar_label, rotation=-90, va="bottom")

    ax.set_title(title, fontsize=14)
    plt.tight_layout()
    plt.show()

# Share of tracts flagged per crime category, by demographic cluster.
plot_heatmap(hotspot_rates, title="Hotspot Rate by Cluster (Crime Types)", cmap="Blues")

# Number of tracts at each multi-hotspot level, by demographic cluster.
plot_heatmap(
    multi_counts,
    title="Multi-Hotspot Severity by Cluster (0=none, 6=super-hotspot)",
    cmap="Oranges",
)

In [None]:
# Co-occurrence cell: how often two crime categories are hotspots in the
# same tract.

hotspot_cols = [
    "Administrative_hotspot",
    "Drug_hotspot",
    "Other_hotspot",
    "Property_hotspot",
    "Public Order_hotspot",
    "Violent_hotspot"
]

# Co-occurrence matrix. With the flags as a 0/1 matrix F (tracts x categories),
# F.T @ F counts, for every pair of categories, the tracts where both flags
# are set; the diagonal is each category's own hotspot count.
flag_values = crime_demo[hotspot_cols].to_numpy(dtype=int)
co_matrix = pd.DataFrame(
    flag_values.T @ flag_values,
    index=hotspot_cols,
    columns=hotspot_cols
)

print(co_matrix)

# Top actual co-occurrences. The matrix is symmetric, so list each unordered
# pair once; iterating over all ordered pairs would show every pair twice
# (A-B and B-A) and halve the useful content of the top-10 listing.
pairs = [
    (A, B, co_matrix.loc[A, B])
    for A, B in itertools.combinations(hotspot_cols, 2)
]

df_pairs = (
    pd.DataFrame(pairs, columns=["A", "B", "co_occurrence"])
    .sort_values("co_occurrence", ascending=False)
)

print("\n=== Top Crime-Type Co-Occurrences ===")
print(df_pairs.head(10))

In [None]:
# Preview join of the Travis tract table with the multi-hotspot counts.
# The join keys are cast to str on temporary copies (assign / astype) rather
# than by writing into tracts_tx_travis and hotspots: assigning into
# tracts_tx_travis (a filtered subset of `tracts`) can raise
# SettingWithCopyWarning, and mutating frames from earlier cells makes
# re-running this cell depend on hidden state.
# NOTE: tracts_tx_travis was read with ignore_geometry=True, so this result
# has no geometry column and cannot be mapped directly.
gdf_map = tracts_tx_travis.assign(
    GEOID=tracts_tx_travis["GEOID"].astype(str)
).merge(
    hotspots[["tract_geoid_str", "multi_hotspot_count"]].astype(
        {"tract_geoid_str": str}
    ),
    left_on="GEOID",
    right_on="tract_geoid_str",
    how="left"
)

print("Merged map shape:", gdf_map.shape)
gdf_map.head()

In [None]:
# Load the tract shapefile with geometry and keep state FIPS 48 /
# county FIPS 453 (the Travis subset).
tracts_geo = gpd.read_file(
    r"C:\Users\nicol\OneDrive\DAT490\cb_2023_us_tract_500k\cb_2023_us_tract_500k.shp"
)

in_travis = tracts_geo["STATEFP"].eq("48") & tracts_geo["COUNTYFP"].eq("453")
tracts_tx_geo = tracts_geo.loc[in_travis].copy()

print("Travis tract geodata shape:", tracts_tx_geo.shape)

# Make sure the join keys are strings on all three tables.
tracts_tx_geo["GEOID"] = tracts_tx_geo["GEOID"].astype(str)
hotspots["tract_geoid_str"] = hotspots["tract_geoid_str"].astype(str)
crime_df["tract_geoid_str"] = crime_df["tract_geoid"].astype(str).str.zfill(11)

# One row per tract: first non-null value of each field within the tract.
demo_fields = ["poverty_all_people_pct", "renter_occupied_pct"]
tract_demo = (
    crime_df.groupby("tract_geoid_str")[demo_fields]
    .first()
    .reset_index()
)

# Geometry + multi-hotspot count; tracts without a match get a count of 0.
hotspot_counts = hotspots[["tract_geoid_str", "multi_hotspot_count"]]
gdf_map = tracts_tx_geo.merge(
    hotspot_counts,
    left_on="GEOID",
    right_on="tract_geoid_str",
    how="left",
)
gdf_map["multi_hotspot_count"] = gdf_map["multi_hotspot_count"].fillna(0)

# Add the two demographic fields for each tract.
gdf_map = gdf_map.merge(tract_demo, on="tract_geoid_str", how="left")

print("Merged map shape:", gdf_map.shape)

# Tracts flagged as a hotspot in at least one crime category.
hot = gdf_map.loc[gdf_map["multi_hotspot_count"].gt(0)].copy()

# Rows of the hotspot tracts with the largest poverty / renter percentages.
max_pov = hot.loc[hot["poverty_all_people_pct"].idxmax()]
max_rent = hot.loc[hot["renter_occupied_pct"].idxmax()]

# Print the same four fields for both tracts.
report_fields = [
    "GEOID",
    "multi_hotspot_count",
    "poverty_all_people_pct",
    "renter_occupied_pct",
]
reports = [
    ("\n=== Highest Poverty Among Hotspot Tracts ===", max_pov),
    ("\n=== Highest Renter % Among Hotspot Tracts ===", max_rent),
]
for heading, tract_row in reports:
    print(heading)
    for field in report_fields:
        print(f"{field}: {tract_row[field]}")

# Map: all tracts in light grey, hotspot tracts coloured by how many crime
# categories they are flagged in, and two highlighted tract outlines.
fig, ax = plt.subplots(figsize=(10, 10))

# Base layer: every tract.
gdf_map.plot(ax=ax, color="#f0f0f0", edgecolor="lightgrey", linewidth=0.2)

# Hotspot layer, drawn on top of the base layer.
hot.plot(
    ax=ax,
    column="multi_hotspot_count",
    cmap="inferno",
    edgecolor="black",
    linewidth=0.5,
    legend=True,
    legend_kwds={"label": "# of crime categories that are hotspots"},
)

# Outline the max-poverty and max-renter hotspot tracts, in that order.
highlights = [
    (max_pov, "cyan", "Max poverty hotspot"),
    (max_rent, "lime", "Max renter hotspot"),
]
for tract_row, outline_color, legend_label in highlights:
    outline = gdf_map[gdf_map["GEOID"] == tract_row["GEOID"]].boundary
    outline.plot(ax=ax, color=outline_color, linewidth=2, label=legend_label)

ax.set_title(
    "Crime-Type Co-Occurrence Hotspot Intensity\n"
    "(Travis County Census Tracts)",
    fontsize=16,
)

ax.set_axis_off()
ax.legend(loc="lower left")

plt.tight_layout()
plt.show()