In [1]:
# Install libraries needed for Week 3
!pip install h3 folium scikit-learn matplotlib seaborn geopandas
!pip install pysal esda libpysal

print("✓ All Week 3 libraries installed!")

✓ All Week 3 libraries installed!


In [3]:
# Reinitialize Spark and imports
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder \
    .appName("CrimePrediction_Week3") \
    .config("spark.driver.memory", "6g") \
    .config("spark.executor.memory", "6g") \
    .config("spark.driver.maxResultSize", "3g") \
    .getOrCreate()

print("✓ Spark session initialized for Week 3!")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/10 16:15:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


✓ Spark session initialized for Week 3!


In [4]:
print("="*70)
print("WEEK 3: GEOSPATIAL ANALYSIS & HOTSPOT DETECTION")
print("Team: Kiran Ghumare, Neethu Satravada, Sajitha Mathi")
print("="*70)

# Load the saved data
df_geo = spark.read.parquet("../data/processed/integrated_crime_data.parquet")

print(f"✓ Loaded {df_geo.count():,} records")

# Show column names to confirm
print("\nAvailable columns:")
print(df_geo.columns)

# Convert to Pandas - use correct column names (already lowercase)
print("\nConverting to Pandas for geospatial analysis...")
df_pandas = df_geo.select(
    "date_ts",
    "crime_type",      # Already renamed
    "latitude",        # Already renamed
    "longitude",       # Already renamed
    "community_area",  # Already renamed
    "temp_mean",
    "year",
    "month",
    "hour",
    "season"
).toPandas()

print(f"✓ Converted {len(df_pandas):,} records to Pandas")
print("\n✅ Data ready for geospatial analysis!")

# Show sample
print("\nSample data:")
print(df_pandas.head())

WEEK 3: GEOSPATIAL ANALYSIS & HOTSPOT DETECTION
Team: Kiran Ghumare, Neethu Satravada, Sajitha Mathi


                                                                                

✓ Loaded 1,971,196 records

Available columns:
['date_ts', 'crime_type', 'latitude', 'longitude', 'community_area', 'temp_mean', 'precipitation', 'wind_speed', 'year', 'month', 'hour', 'dayofweek', 'season', 'per_capita_income']

Converting to Pandas for geospatial analysis...


                                                                                

✓ Converted 1,971,196 records to Pandas

✅ Data ready for geospatial analysis!

Sample data:
              date_ts       crime_type   latitude  longitude  community_area  \
0 2020-08-10 09:45:00          ROBBERY  41.908418 -87.677407            24.0   
1 2023-09-06 17:00:00  CRIMINAL DAMAGE  41.886018 -87.633938            32.0   
2 2023-09-06 11:00:00            THEFT  41.871835 -87.626151            32.0   
3 2019-05-21 08:20:00         BURGLARY  41.856547 -87.695605            29.0   
4 2021-07-07 10:30:00      SEX OFFENSE  41.655116 -87.594883            54.0   

   temp_mean  year  month  hour  season  
0       75.2  2020      8     9  summer  
1       75.7  2023      9    17    fall  
2       75.7  2023      9    11    fall  
3       49.1  2019      5     8  spring  
4       75.2  2021      7    10  summer  


In [5]:
import h3
import pandas as pd
import numpy as np

print("="*70)
print("H3 HEXAGONAL SPATIAL INDEXING")
print("="*70)

# Check h3 version
print(f"h3 version: {h3.__version__}")

# Add H3 index to each crime (resolution 8 = ~0.46 km² hexagons)
print("\nGenerating H3 hexagonal grid indices...")

# Use the correct function based on h3 version
def get_h3_index(lat, lon, resolution=8):
    try:
        # Try new API (h3 v4+)
        return h3.latlng_to_cell(lat, lon, resolution)
    except AttributeError:
        # Fall back to old API (h3 v3)
        return h3.geo_to_h3(lat, lon, resolution)

df_pandas['h3_index'] = df_pandas.apply(
    lambda row: get_h3_index(row['latitude'], row['longitude'], 8),
    axis=1
)

print(f"✓ Added H3 indices to {len(df_pandas):,} records")

# Aggregate crimes by hexagon
hex_crimes = df_pandas.groupby('h3_index').agg({
    'crime_type': 'count',
    'latitude': 'mean',
    'longitude': 'mean'
}).reset_index()

hex_crimes.columns = ['h3_index', 'crime_count', 'lat_center', 'lon_center']

print(f"✓ Created {len(hex_crimes):,} hexagonal cells")
print(f"✓ Average crimes per hex: {hex_crimes['crime_count'].mean():.1f}")

# Show top crime hotspot hexagons
print("\nTop 10 Crime Hotspot Hexagons:")
print(hex_crimes.nlargest(10, 'crime_count')[['h3_index', 'crime_count', 'lat_center', 'lon_center']])

print("\n✅ H3 spatial indexing complete!")

H3 HEXAGONAL SPATIAL INDEXING
h3 version: 4.3.1

Generating H3 hexagonal grid indices...
✓ Added H3 indices to 1,971,196 records
✓ Created 875 hexagonal cells
✓ Average crimes per hex: 2252.8

Top 10 Crime Hotspot Hexagons:
            h3_index  crime_count  lat_center  lon_center
227  882664c1a9fffff        20797   41.881854  -87.628327
237  882664c1e1fffff        16086   41.895184  -87.626084
204  882664c133fffff        11777   41.904132  -87.643033
245  882664c803fffff        10321   41.885757  -87.761423
239  882664c1e5fffff        10063   41.897175  -87.637900
676  882664ceb5fffff        10062   41.755089  -87.560269
468  882664cc0bfffff         9822   41.781201  -87.607870
243  882664c1edfffff         9775   41.902827  -87.630532
240  882664c1e7fffff         9387   41.888957  -87.632076
372  882664ca87fffff         9243   41.879556  -87.691480

✅ H3 spatial indexing complete!


In [6]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

print("\n" + "="*70)
print("DBSCAN CLUSTERING FOR HOTSPOT DETECTION")
print("="*70)

# Prepare data for clustering (use hexagon centers)
X = hex_crimes[['lat_center', 'lon_center', 'crime_count']].values

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply DBSCAN
# eps=0.5 (distance threshold), min_samples=5 (minimum cluster size)
print("\nApplying DBSCAN clustering...")
dbscan = DBSCAN(eps=0.5, min_samples=5)
hex_crimes['cluster'] = dbscan.fit_predict(X_scaled)

# Analyze clusters
n_clusters = len(set(hex_crimes['cluster'])) - (1 if -1 in hex_crimes['cluster'] else 0)
n_noise = list(hex_crimes['cluster']).count(-1)

print(f"✓ Found {n_clusters} crime hotspot clusters")
print(f"✓ Noise points (not in clusters): {n_noise}")

# Show cluster statistics
cluster_stats = hex_crimes[hex_crimes['cluster'] != -1].groupby('cluster').agg({
    'crime_count': ['sum', 'mean', 'count'],
    'lat_center': 'mean',
    'lon_center': 'mean'
}).round(2)

print("\nCluster Statistics:")
print(cluster_stats.head(10))

print("\n✅ DBSCAN clustering complete!")


DBSCAN CLUSTERING FOR HOTSPOT DETECTION

Applying DBSCAN clustering...
✓ Found 2 crime hotspot clusters
✓ Noise points (not in clusters): 8

Cluster Statistics:
        crime_count                lat_center lon_center
                sum     mean count       mean       mean
cluster                                                 
0           1888188  2177.84   867      41.84     -87.69

✅ DBSCAN clustering complete!


25/11/10 20:28:03 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 822577 ms exceeds timeout 120000 ms
25/11/10 20:28:03 WARN SparkContext: Killing executors is not supported by current scheduler.
25/11/10 20:28:04 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

In [7]:
import folium
from folium.plugins import HeatMap

print("\n" + "="*70)
print("CREATING INTERACTIVE CRIME MAP")
print("="*70)

# Sample data for visualization (use 50k points for performance)
df_sample = df_pandas.sample(n=min(50000, len(df_pandas)), random_state=42)

print(f"Using {len(df_sample):,} crime points for visualization...")

# Create base map centered on Chicago
chicago_center = [41.8781, -87.6298]
crime_map = folium.Map(
    location=chicago_center,
    zoom_start=11,
    tiles='OpenStreetMap'
)

# Add heatmap layer
heat_data = [[row['latitude'], row['longitude']] for idx, row in df_sample.iterrows()]

HeatMap(
    heat_data,
    radius=10,
    blur=15,
    max_zoom=13,
    name='Crime Heatmap'
).add_to(crime_map)

# Add cluster markers for top hotspots
top_clusters = hex_crimes[hex_crimes['cluster'] != -1].nlargest(20, 'crime_count')

for idx, row in top_clusters.iterrows():
    folium.CircleMarker(
        location=[row['lat_center'], row['lon_center']],
        radius=8,
        popup=f"Cluster {row['cluster']}<br>Crimes: {row['crime_count']}",
        color='red',
        fill=True,
        fillColor='red',
        fillOpacity=0.6
    ).add_to(crime_map)

# Save map
map_path = '../outputs/chicago_crime_heatmap.html'
crime_map.save(map_path)

print(f"✓ Interactive map saved to: {map_path}")
print("✓ Open this file in a browser to view the interactive map!")

print("\n✅ Interactive crime map created!")


CREATING INTERACTIVE CRIME MAP
Using 50,000 crime points for visualization...
✓ Interactive map saved to: ../outputs/chicago_crime_heatmap.html
✓ Open this file in a browser to view the interactive map!

✅ Interactive crime map created!


In [10]:
# Set matplotlib backend
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt

print("✓ Matplotlib backend configured")

✓ Matplotlib backend configured


In [11]:
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
import seaborn as sns

print("\n" + "="*70)
print("KERNEL DENSITY ESTIMATION")
print("="*70)

# Sample for KDE (10k points for performance)
kde_sample = df_pandas.sample(n=min(10000, len(df_pandas)), random_state=42)

print(f"Computing KDE with {len(kde_sample):,} points...")

# Prepare data
x = kde_sample['longitude'].values
y = kde_sample['latitude'].values

# Create KDE
xy = np.vstack([x, y])
kde = gaussian_kde(xy)

# Create grid for density plot
lon_min, lon_max = x.min(), x.max()
lat_min, lat_max = y.min(), y.max()

xx, yy = np.mgrid[lon_min:lon_max:100j, lat_min:lat_max:100j]
positions = np.vstack([xx.ravel(), yy.ravel()])
density = np.reshape(kde(positions).T, xx.shape)

# Plot
fig, ax = plt.subplots(figsize=(14, 10))
ax.imshow(
    np.rot90(density),
    cmap='hot',
    extent=[lon_min, lon_max, lat_min, lat_max],
    aspect='auto'
)
ax.scatter(x, y, c='blue', s=1, alpha=0.1)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Chicago Crime Density Map (Kernel Density Estimation)', fontsize=16)

plt.tight_layout()
plt.savefig('../outputs/crime_density_kde.png', dpi=150, bbox_inches='tight')
print("✓ KDE plot saved to: ../outputs/crime_density_kde.png")

plt.show()

print("\n✅ Kernel Density Estimation complete!")


KERNEL DENSITY ESTIMATION
Computing KDE with 10,000 points...
✓ KDE plot saved to: ../outputs/crime_density_kde.png

✅ Kernel Density Estimation complete!


  plt.show()


In [19]:
# Install using pip (not the broken venv python)
import subprocess
import sys

# Install scikit-learn
subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn", "xgboost"])

print("✓ Installation complete!")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/neethusatravada/Documents/DATABASE/crime_prediction_project/.venv/bin/python'

In [16]:
print("="*70)
print("WEEK 4: MACHINE LEARNING MODEL DEVELOPMENT")
print("Team: Kiran Ghumare, Neethu Satravada, Sajitha Mathi")
print("="*70)

# Import ML libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

print("✓ ML libraries imported")

# Load data from Week 3
print("\nLoading integrated crime data...")
df_ml = df_pandas.copy()

print(f"✓ Loaded {len(df_ml):,} records")
print(f"✓ Features available: {df_ml.columns.tolist()}")

# Check for missing values
print(f"\nMissing values:")
print(df_ml.isnull().sum())

# Drop rows with missing values
df_ml = df_ml.dropna(subset=['latitude', 'longitude', 'crime_type', 'community_area'])
print(f"\n✓ Clean dataset: {len(df_ml):,} records")

WEEK 4: MACHINE LEARNING MODEL DEVELOPMENT
Team: Kiran Ghumare, Neethu Satravada, Sajitha Mathi


ModuleNotFoundError: No module named 'sklearn.ensemble'