In [None]:
import numpy as np
import pandas as pd
import bokeh
from bokeh.io import output_file, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Select, CustomJS, Slider
from bokeh.layouts import column, row
from bokeh.models.tiles import WMTSTileSource
from bokeh.models import HoverTool
# Write the finished plot to a standalone HTML file, which show() then opens in a web browser
output_file("crime_viz.html")

# Basemap tiles from OpenStreetMap.
# WMTSTileSource (Web Map Tile Source) fetches the small square images that make up the full map.
# {Z}, {X} and {Y} are placeholders for the zoom level and tile coordinates of each requested tile.
# The URL uses the canonical tile.openstreetmap.org host instead of a single lettered subdomain, and
# the attribution string supplies the credit that the OpenStreetMap tile usage policy asks for.
tile_source = WMTSTileSource(
    url="https://tile.openstreetmap.org/{Z}/{X}/{Y}.png",
    attribution='&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors',
)

In [1]:
# Load the incident export and derive calendar fields from the 'Date' column
crime_data = pd.read_csv('./crime_data_final.csv')

crime_data['Date'] = pd.to_datetime(crime_data['Date'])
crime_data['Year'] = crime_data['Date'].dt.year
crime_data['Month'] = crime_data['Date'].dt.month

# Row filters, each a boolean Series aligned with crime_data:
#   - only vehicle thefts
#   - years 2003 to 2024 inclusive, leaving out 2025 because it is not complete yet
#   - outlier coordinates removed
is_vehicle_theft = crime_data['Category'] == 'VEHICLE THEFT'
in_year_range = crime_data['Year'].between(2003, 2024)
in_bounds = (crime_data['X'] < -121) & (crime_data['Y'] < 38.5)

# .copy() makes df an independent frame rather than a slice of crime_data, so that
# columns can be added to df later without pandas raising SettingWithCopyWarning.
df = crime_data[is_vehicle_theft & in_year_range & in_bounds].copy()

df.head()

NameError: name 'pd' is not defined

In [None]:
# In the dataset, we have latitude and longitude coordinates, but they won't work with the OpenStreetMap tiles.
# We need to convert them to web mercator coordinates, which is a projection used by most web mapping applications.
def latlon_to_mercator(lon, lat):
    """Project longitude/latitude in degrees to Web Mercator x/y in metres.

    Both arguments are passed straight to numpy functions, so scalars and
    array-likes (for example pandas Series) are accepted.

    Returns:
        A tuple ``(x, y)`` with the same shape as the inputs.
    """
    earth_radius = 6378137.0  # Earth's radius in meters
    lon_rad = np.radians(lon)
    lat_rad = np.radians(lat)
    easting = earth_radius * lon_rad
    northing = earth_radius * np.log(np.tan(np.pi / 4 + lat_rad / 2))
    return easting, northing

# Web Mercator coordinates of the approximate centre of San Francisco
sf_x, sf_y = latlon_to_mercator(lon=-122.4, lat=37.75)

# Project every row: column X is passed as the longitude and column Y as the latitude,
# and the results are stored as two new columns on df
merc_x, merc_y = latlon_to_mercator(lon=df['X'], lat=df['Y'])
df['x_merc'] = merc_x
df['y_merc'] = merc_y

In [None]:
# Distinct years present in the data, in ascending order; the map opens on the earliest one
years = sorted(df['Year'].unique())
initial_year = min(years)
filtered_df = df[df['Year'] == initial_year]

print("The length of the dataset is ", len(df))

# Boolean mask with one entry per row of df: True where the row belongs to the initial year.
# For example, if df['Year'] is [2003, 2003, 2004, 2006, 2008] and the initial year is 2003,
# the mask is [True, True, False, False, False].
mask = (df['Year'] == initial_year)


def _initial_year_only(values):
    """Return a list the same length as ``values``: the value where ``mask`` is True, otherwise None."""
    return [value if keep else None for value, keep in zip(values, mask)]


# ColumnDataSource is the data structure Bokeh uses to store data. It is essentially a dataframe that is compatible with Bokeh.
source = ColumnDataSource(data={
    # Columns that are drawn INITIALLY: rows from the initial year keep their values,
    # every other row is None, so all columns keep the full dataset length
    'x_merc': _initial_year_only(df['x_merc']),
    'y_merc': _initial_year_only(df['y_merc']),
    'Year': _initial_year_only(df['Year']),
    # The full, unmasked data, stored so the JavaScript callback can re-filter when the slider moves
    'x_merc_all': df['x_merc'],
    'y_merc_all': df['y_merc'],
    'Year_all': df['Year'],
})

# Report the length of every column held by the source
for col_name, col_values in source.data.items():
    print(f"{col_name}: {len(col_values)}")

print(source.data.keys())

# --- Slider Widget ---
# Covers the first to the last year in the data, one year per step, starting on the initial year
year_slider = Slider(start=initial_year, end=max(years), value=initial_year, step=1, title="Year")

The length of the dataset is  174840
x_merc: 174840
y_merc: 174840
Year: 174840
x_merc_all: 174840
y_merc_all: 174840
Year_all: 174840
dict_keys(['x_merc', 'y_merc', 'Year', 'x_merc_all', 'y_merc_all', 'Year_all'])


In [None]:
# Half the side length of the initial viewport, in Web Mercator units
view_half_span = 10000

# Create the plot area of the map.
# sf_x and sf_y are the coordinates of the approximate centre of San Francisco; the ranges below
# open the view on a square reaching view_half_span either side of that centre
# (20 000 Web Mercator units wide and tall).
p = figure(
    x_range=(sf_x - view_half_span, sf_x + view_half_span),
    y_range=(sf_y - view_half_span, sf_y + view_half_span),
    # Tell Bokeh that both axes use the Web Mercator projection
    x_axis_type='mercator',
    y_axis_type='mercator',
    title='Vehicle Theft Density in San Francisco',
)

# Add the OpenStreetMap basemap to the plot
p.add_tile(tile_source)

# Draw one red dot per vehicle theft.
# The glyph reads the x_merc / y_merc columns of `source`, which at this point hold only
# the initial year's coordinates (every other row is None).
# Those columns are what the slider's JavaScript callback rewrites when the year changes.
p.scatter(x='x_merc', y='y_merc', source=source, size=5, alpha=0.2, color="red", marker="circle")

In [None]:
# Sanity check: every column in the source should report the same length
column_lengths = {name: len(values) for name, values in source.data.items()}
print("Data lengths:", column_lengths)

# Minimum and maximum of the raw and the projected coordinates
range_reports = [
    ("X range:", 'X'),
    ("Y range:", 'Y'),
    ("Web Mercator x range:", 'x_merc'),
    ("Web Mercator y range:", 'y_merc'),
]
for label, col_name in range_reports:
    print(label, df[col_name].min(), df[col_name].max())

Data lengths: {'x_merc': 174840, 'y_merc': 174840, 'Year': 174840, 'x_merc_all': 174840, 'y_merc_all': 174840, 'Year_all': 174840}
X range: -122.51364206426544 -122.36374276695295
Y range: 37.7079199575616 37.82999075468863
Web Mercator x range: -13638156.24982341 -13621469.536376316
Web Mercator y range: 4538246.470944002 4555436.968484607


In [None]:
# Hover tool for debugging: shows the value of the source's Year column for the point under the cursor
year_tooltips = [("Year", "@Year")]
hover = HoverTool(tooltips=year_tooltips)
p.add_tools(hover)

In [None]:
# --- CustomJS Callback (Filter by year only) ---
# Builds a JavaScript callback; `source` and `year_slider` are handed to the script under those names.
# The script:
#   1. reads the slider's current value as an integer year,
#   2. walks the x_merc_all / y_merc_all / Year_all columns row by row,
#   3. builds three new arrays of the same length, keeping a row's values when its year matches
#      and pushing null otherwise,
#   4. logs the three array lengths to the browser console,
#   5. stores the arrays in the x_merc / y_merc / Year columns and calls source.change.emit().
# This statement only creates the callback object; it does not attach it to any widget.
callback = CustomJS(args=dict(source=source, year_slider=year_slider), code="""
    // Retrieve the selected year from the slider
    const year = parseInt(year_slider.value);
    
    // Below data from source.data grabs the entire dataset from ColumnDataSource 
    // That means x_merc, y_merc, year all the way down to year_all
    const data = source.data;
    const x_all = data['x_merc_all'];
    const y_all = data['y_merc_all'];
    const year_all = data['Year_all'];

    const x = [], y = [], yr = [];

    // For loop to go through all the data points in the dataset
    // If the year of that data point matches the selected year, then add it to the x and y arrays
    // In order to keep the length of the ColumnDataSource the same, we add null values for the other years
    for (let i = 0; i < x_all.length; i++) {
    if (Number(year_all[i]) === year) {
        x.push(x_all[i]);
        y.push(y_all[i]);
        yr.push(year_all[i]);
    } else {
        x.push(null);  // Keep the same length, but make invisible
        y.push(null);
        yr.push(null);
    }
}

    console.log("Lengths:", x.length, y.length, yr.length);

    // This is where the filtered data is pushed to the plot
    data['x_merc'] = x;
    data['y_merc'] = y;
    data['Year'] = yr;

    source.change.emit();
""")

In [None]:
# Run the JavaScript callback every time the slider's 'value' property changes
year_slider.js_on_change('value', callback)

# --- Show layout ---
# Stack the slider above the map, then render the result
stacked_widgets = [year_slider, p]
layout = column(*stacked_widgets)
show(layout)