In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("CrimePrediction_Week3") \
    .config("spark.driver.memory", "6g") \
    .config("spark.executor.memory", "6g") \
    .config("spark.driver.maxResultSize", "3g") \
    .config("spark.ui.port", "4050") \
    .getOrCreate()

print("✅ Spark session initialized for Week 3!")

✅ Spark session initialized for Week 3!


In [24]:
print("="*70)
print("WEEK 3: GEOSPATIAL ANALYSIS & HOTSPOT DETECTION")
print("Team: Kiran Ghumare, Neethu Satravada, Sajitha Mathi")
print("="*70)

# Loading the saved data
df_geo = spark.read.parquet("../data/processed/integrated_crime_data.parquet")

print(f"✅ Loaded {df_geo.count():,} records")

# Checking column names to confirm
print("\nAvailable columns:")
print(df_geo.columns)

# Converting to Pandas
print("\nConverting to Pandas for geospatial analysis...")
df_pandas = df_geo.select(
    "date_ts",
    "crime_type",
    "latitude",
    "longitude",
    "community_area",
    "temp_mean",
    "year",
    "month",
    "hour",
    "season"
).toPandas()

print(f"✅ Converted {len(df_pandas):,} records to Pandas")
print("\n✅ Data ready for geospatial analysis!")

# sample
print("\nSample data:")
print(df_pandas.head())

WEEK 3: GEOSPATIAL ANALYSIS & HOTSPOT DETECTION
Team: Kiran Ghumare, Neethu Satravada, Sajitha Mathi
✅ Loaded 1,970,206 records

Available columns:
['date_ts', 'crime_type', 'latitude', 'longitude', 'community_area', 'temp_mean', 'precipitation', 'wind_speed', 'year', 'month', 'hour', 'dayofweek', 'season', 'per_capita_income']

Converting to Pandas for geospatial analysis...
✅ Converted 1,970,206 records to Pandas

✅ Data ready for geospatial analysis!

Sample data:
     date_ts               crime_type   latitude  longitude  community_area  \
0 2025-11-09            OTHER OFFENSE  41.765286 -87.577086            43.0   
1 2025-11-09  CRIMINAL SEXUAL ASSAULT  41.936336 -87.650710             6.0   
2 2025-11-09       DECEPTIVE PRACTICE  41.904817 -87.689930            24.0   
3 2025-11-09  CRIMINAL SEXUAL ASSAULT  41.877609 -87.667595            28.0   
4 2025-11-09              SEX OFFENSE  41.860902 -87.707037            29.0   

   temp_mean  year  month  hour season  
0        NaN

In [None]:
import h3
import numpy as np

print("="*70)
print("H3 HEXAGONAL SPATIAL INDEXING")
print("="*70)

# Checking h3 version
print(f"h3 version: {h3.__version__}")

# Adding H3 index to each crime (resolution 8 = ~0.46 km² hexagons)
print("\nGenerating H3 hexagonal grid indices...")

# Using the correct function based on h3 version
def get_h3_index(lat, lon, resolution=8):
    try:
        # Trying new API (h3 v4+)
        return h3.latlng_to_cell(lat, lon, resolution)
    except AttributeError:
        # Falling back to old API (h3 v3)
        return h3.geo_to_h3(lat, lon, resolution)

df_pandas['h3_index'] = df_pandas.apply(
    lambda row: get_h3_index(row['latitude'], row['longitude'], 8),
    axis=1
)

print(f"✅ Added H3 indices to {len(df_pandas):,} records")

# Aggregating crimes by hexagon
hex_crimes = df_pandas.groupby('h3_index').agg({
    'crime_type': 'count',
    'latitude': 'mean',
    'longitude': 'mean'
}).reset_index()

hex_crimes.columns = ['h3_index', 'crime_count', 'lat_center', 'lon_center']

print(f"✓ Created {len(hex_crimes):,} hexagonal cells")
print(f"✓ Average crimes per hex: {hex_crimes['crime_count'].mean():.1f}")

# Showing top crime hotspot hexagons
print("\nTop 10 Crime Hotspot Hexagons:")
print(hex_crimes.nlargest(10, 'crime_count')[['h3_index', 'crime_count', 'lat_center', 'lon_center']])

print("\n✅ H3 spatial indexing complete!")

H3 HEXAGONAL SPATIAL INDEXING
h3 version: 4.3.1

Generating H3 hexagonal grid indices...
✅ Added H3 indices to 1,970,206 records
✓ Created 893 hexagonal cells
✓ Average crimes per hex: 2206.3

Top 10 Crime Hotspot Hexagons:
            h3_index  crime_count  lat_center  lon_center
224  882664c1a9fffff        30767   41.881964  -87.628032
234  882664c1e1fffff        26457   41.894715  -87.625989
237  882664c1e7fffff        16714   41.889090  -87.631855
235  882664c1e3fffff        14558   41.887668  -87.622796
240  882664c1edfffff        10835   41.903049  -87.630709
673  882664ceb5fffff        10554   41.755311  -87.560330
303  882664c8cbfffff        10138   41.861043  -87.712294
383  882664caa7fffff         9543   41.875766  -87.723370
551  882664cce1fffff         9426   41.745956  -87.605168
248  882664c811fffff         9403   41.877779  -87.745298

✅ H3 spatial indexing complete!


In [26]:
%pip install shapely geopandas

from shapely.geometry import Polygon
import geopandas as gpd

print("\nConverting H3 indices to polygon boundaries...")

def h3_to_poly(h3_index):
    boundary = h3.cell_to_boundary(h3_index)
    return Polygon([(p[1], p[0]) for p in boundary])  # (lon, lat)

hex_crimes["geometry"] = hex_crimes["h3_index"].apply(h3_to_poly)

gdf_hex = gpd.GeoDataFrame(hex_crimes, geometry="geometry", crs="EPSG:4326")

print("✅ Created GeoDataFrame with hex polygons")


Note: you may need to restart the kernel to use updated packages.

Converting H3 indices to polygon boundaries...
✅ Created GeoDataFrame with hex polygons


In [None]:
import folium
import branca.colormap as cm

print("\nCreating hexagonal hotspot map...")

# Creating a Folium map
m = folium.Map(location=[41.88, -87.63], zoom_start=10)

# Creating color scale based on crime counts
max_count = gdf_hex["crime_count"].max()
colormap = cm.linear.RdBu_09.scale(0, max_count)

# Adding each hexagon as a GeoJson overlay
for _, row in gdf_hex.iterrows():
    geo_json = folium.GeoJson(
        row["geometry"].__geo_interface__,
        style_function=lambda feature, count=row["crime_count"]: {
            "fillColor": colormap(count),
            "color": "black",
            "weight": 0.5,
            "fillOpacity": 0.6 
        },
        tooltip=folium.Tooltip(f"""
            Crimes in Hex: {row['crime_count']}<br>
            Lat: {row['lat_center']:.4f}<br>
            Lon: {row['lon_center']:.4f}
        """)
    )
    geo_json.add_to(m)

colormap.caption = "Crime Density per H3 Hexagon"
colormap.add_to(m)

# Savig the map
m.save("../outputs/h3_hex_hotspots.html")

print("✅ Hexagon hotspot map saved!")



Creating hexagonal hotspot map...
✅ Hexagon hotspot map saved!


In [28]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

print("\n" + "="*70)
print("DBSCAN CLUSTERING FOR HOTSPOT DETECTION")
print("="*70)

# Preparing data for clustering (using hexagon centers)
X = hex_crimes[['lat_center', 'lon_center', 'crime_count']].values

# Standardizing features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Applying DBSCAN
# eps=0.5 (distance threshold), min_samples=5 (minimum cluster size)
print("\nApplying DBSCAN clustering...")
dbscan = DBSCAN(eps=0.5, min_samples=5)
hex_crimes['cluster'] = dbscan.fit_predict(X_scaled)

# Analyzing clusters
n_clusters = len(set(hex_crimes['cluster'])) - (1 if -1 in hex_crimes['cluster'] else 0)
n_noise = list(hex_crimes['cluster']).count(-1)

print(f"✅ Found {n_clusters} crime hotspot clusters")
print(f"✅ Noise points (not in clusters): {n_noise}")

# Showing cluster statistics
cluster_stats = hex_crimes[hex_crimes['cluster'] != -1].groupby('cluster').agg({
    'crime_count': ['sum', 'mean', 'count'],
    'lat_center': 'mean',
    'lon_center': 'mean'
}).round(2)

print("\nCluster Statistics:")
print(cluster_stats.head(10))

print("\n✅ DBSCAN clustering complete!")


DBSCAN CLUSTERING FOR HOTSPOT DETECTION

Applying DBSCAN clustering...
✅ Found 2 crime hotspot clusters
✅ Noise points (not in clusters): 8

Cluster Statistics:
        crime_count                lat_center lon_center
                sum     mean count       mean       mean
cluster                                                 
0           1854912  2095.95   885      41.84     -87.69

✅ DBSCAN clustering complete!


In [None]:
from folium.plugins import HeatMap

print("\n" + "="*70)
print("CREATING INTERACTIVE CRIME MAP")
print("="*70)

# Sampling data for visualization (using 50k points for performance)
df_sample = df_pandas.sample(n=min(10000, len(df_pandas)), random_state=42)
print(f"Using {len(df_sample):,} crime points for visualization...")

# Creating base map centered on Chicago
chicago_center = [41.8781, -87.6298]
crime_map = folium.Map(
    location=chicago_center,
    zoom_start=11,
    tiles='OpenStreetMap'
)

# Adding heatmap layer
heat_data = [[row['latitude'], row['longitude']] for idx, row in df_sample.iterrows()]

HeatMap(
    heat_data,
    radius=7,
    blur=10,
    min_opacity=0.3,
    max_zoom=12,
    name='Crime Heatmap'
).add_to(crime_map)

# Adding cluster markers for top hotspots
top_clusters = hex_crimes[hex_crimes['cluster'] != -1].nlargest(20, 'crime_count')

for idx, row in top_clusters.iterrows():
    folium.CircleMarker(
        location=[row['lat_center'], row['lon_center']],
        radius=8,
        popup=f"Cluster {row['cluster']}<br>Crimes: {row['crime_count']}",
        color='red',
        fill=True,
        fillColor='red',
        fillOpacity=0.6
    ).add_to(crime_map)

# Saving map
map_path = '../outputs/chicago_crime_heatmap.html'
crime_map.save(map_path)

print(f"✅ Interactive map saved to: {map_path}")
print("✅ Open this file in a browser to view the interactive map!")

print("\n✅ Interactive crime map created!")


CREATING INTERACTIVE CRIME MAP
Using 10,000 crime points for visualization...
✅ Interactive map saved to: ../outputs/chicago_crime_heatmap.html
✅ Open this file in a browser to view the interactive map!

✅ Interactive crime map created!


In [None]:
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt

print("\n" + "="*70)
print("KERNEL DENSITY ESTIMATION")
print("="*70)

# Sampling for KDE (10k points for performance)
kde_sample = df_pandas.sample(n=min(10000, len(df_pandas)), random_state=42)

print(f"Computing KDE with {len(kde_sample):,} points...")

# Preparing data
x = kde_sample['longitude'].values
y = kde_sample['latitude'].values

# Creating KDE
xy = np.vstack([x, y])
kde = gaussian_kde(xy)

# Creating grid for density plot
lon_min, lon_max = x.min(), x.max()
lat_min, lat_max = y.min(), y.max()

xx, yy = np.mgrid[lon_min:lon_max:100j, lat_min:lat_max:100j]
positions = np.vstack([xx.ravel(), yy.ravel()])
density = np.reshape(kde(positions).T, xx.shape)

# Plot
fig, ax = plt.subplots(figsize=(14, 10))
ax.imshow(
    np.rot90(density),
    cmap='hot',
    extent=[lon_min, lon_max, lat_min, lat_max],
    aspect='auto'
)
ax.scatter(x, y, c='blue', s=1, alpha=0.1)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Chicago Crime Density Map (Kernel Density Estimation)', fontsize=16)

plt.tight_layout()
plt.savefig('../outputs/crime_density_kde.png', dpi=150, bbox_inches='tight')
print("✅ KDE plot saved to: ../outputs/crime_density_kde.png")

plt.show()

print("\n✅ Kernel Density Estimation complete!")


KERNEL DENSITY ESTIMATION
Computing KDE with 10,000 points...
✅ KDE plot saved to: ../outputs/crime_density_kde.png

✅ Kernel Density Estimation complete!


  plt.show()


In [None]:
# Installing using pip
import subprocess
import sys

# Installing scikit-learn
subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn", "xgboost"])

print("✅ Installation complete!")

✅ Installation complete!


In [32]:
# !brew install libomp

In [33]:
# Saving the data
df_ml.to_parquet("../data/processed/integrated_crime_data.parquet", index=False)

IsADirectoryError: [Errno 21] Failed to open local file '../data/processed/integrated_crime_data.parquet'. Detail: [errno 21] Is a directory