# Crashspot — Week 1 Starter Notebook


1. Verifying Python environment.
2. Installing/confirming the required libraries.
3. Creating a clean project folder structure.
4. Loading accident and road datasets.
5. Checking and aligning CRS (coordinate reference systems).
6. Making **first quick plots** with GeoPandas/Matplotlib.
7. Building a **simple interactive web map** with Folium.
8. Exporting GeoJSON for the web, and saving outputs.




## 1) Environment Check
- Imports the required libraries and prints helpful info.

In [1]:
import sys

import importlib

print("Python version:", sys.version)
print("Environment OK — now checking imports...")

# Labels of packages that failed to import, in first-failure order.
# Read after all checks to build a single install hint.
missing = []


def try_import(name, import_as=None):
    """Try to import a module and print whether it succeeded.

    Args:
        name: Label shown in the status line and recorded in ``missing`` on
            failure. Also the module that gets imported when ``import_as``
            is not given.
        import_as: Module name to import instead of ``name``, for packages
            whose import name differs from the label.

    Returns:
        None. Failures are reported by printing and by appending ``name``
        to the module-level ``missing`` list (at most once per label).
    """
    module_name = name if import_as is None else import_as
    try:
        importlib.import_module(module_name)
    except Exception as e:
        # Deliberately broad: a package with a broken native dependency can
        # raise something other than ImportError, and one bad package should
        # not stop the remaining checks.
        print(f"✖ Could not import {name}: {e}")
        if name not in missing:
            missing.append(name)
    else:
        print(f"✔ {name} imported")

# Check each required package in turn; failures accumulate in `missing`.
try_import("pandas")
try_import("numpy")
try_import("matplotlib")
# pyplot is checked separately because it pulls in a plotting backend that
# can fail even when the top-level matplotlib package imports fine.
try:
    import matplotlib.pyplot as plt
except Exception as e:
    print("✖ Could not import matplotlib.pyplot:", e)
    missing.append("matplotlib")
else:
    print("✔ matplotlib.pyplot imported")

try_import("geopandas")
try_import("shapely")
try_import("rasterio")
try_import("folium")
try_import("sklearn", import_as="sklearn")

if missing:
    # Import names are not always pip distribution names: the `sklearn`
    # module is installed by the `scikit-learn` distribution.
    pip_names = {"sklearn": "scikit-learn"}
    # dict.fromkeys removes duplicates (matplotlib can be recorded twice)
    # while keeping first-failure order.
    to_install = list(dict.fromkeys(pip_names.get(pkg, pkg) for pkg in missing))
    print("\nSome packages are missing. Inside your activated environment, run:")
    print("  pip install " + " ".join(to_install))
else:
    print("\nAll required packages imported successfully!")


Python version: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
Environment OK — now checking imports...
✔ pandas imported
✔ numpy imported
✔ matplotlib imported
✔ matplotlib.pyplot imported
✔ geopandas imported
✔ shapely imported
✔ rasterio imported
✔ folium imported
✔ sklearn imported

All required packages imported successfully!


## 2) Create Project Folders
- Creates the recommended folder structure under the project root, which the code takes to be the **parent of the current working directory**.


In [8]:
from pathlib import Path

# Project root is the parent of the directory the notebook is run from.
PROJECT_ROOT = Path.cwd().parent

# Folder layout as path parts relative to PROJECT_ROOT; the empty tuple
# stands for the root folder itself.
folder_parts = (
    (),
    ("data_raw",),
    ("data_clean",),
    ("outputs", "maps"),
    ("outputs", "figures"),
    ("scripts",),
    ("docs",),
)

for parts in folder_parts:
    folder = PROJECT_ROOT.joinpath(*parts)
    # parents/exist_ok make the cell safe to re-run on an existing project.
    folder.mkdir(parents=True, exist_ok=True)
    print("Created/exists:", folder)

print("\nProject root is:", PROJECT_ROOT.resolve())


Created/exists: /Users/himalranabhat/Desktop/Crashspot
Created/exists: /Users/himalranabhat/Desktop/Crashspot/data_raw
Created/exists: /Users/himalranabhat/Desktop/Crashspot/data_clean
Created/exists: /Users/himalranabhat/Desktop/Crashspot/outputs/maps
Created/exists: /Users/himalranabhat/Desktop/Crashspot/outputs/figures
Created/exists: /Users/himalranabhat/Desktop/Crashspot/scripts
Created/exists: /Users/himalranabhat/Desktop/Crashspot/docs

Project root is: /Users/himalranabhat/Desktop/Crashspot


## 3) Putting Data in `data_raw/`

- `data_raw/accident_2022.csv` and `data_raw/accident_2023.csv` (the FARS accident tables that Section 4 reads)
- `data_raw/roads_osm.gpkg` (or `.shp`)


## 4) Load Data (Accidents & Roads)
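- Loads the 2022 and 2023 FARS accident CSVs, filters them to Louisiana and to Ouachita Parish (Monroe), and saves GeoJSON copies in `data_clean/`.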


In [9]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# Paths to the downloaded FARS accident tables (one CSV per year).
# NOTE(review): the "Rebuild Monroe file" cell reads these files as
# ACCIDENT_2022.csv / ACCIDENT_2023.csv. The two spellings only resolve to
# the same file on a case-insensitive filesystem — confirm the on-disk name
# and use one spelling in both cells.
acc_2022_path = PROJECT_ROOT / "data_raw" / "accident_2022.csv"
acc_2023_path = PROJECT_ROOT / "data_raw" / "accident_2023.csv"

# Load both years
acc_2022 = pd.read_csv(acc_2022_path)
acc_2023 = pd.read_csv(acc_2023_path)

# Tag each table with its year before stacking them
acc_2022["YEAR"] = 2022
acc_2023["YEAR"] = 2023

# ignore_index gives the combined table unique row labels instead of two
# overlapping 0..n ranges (same as the rebuild cell does).
acc_all = pd.concat([acc_2022, acc_2023], ignore_index=True)
print("Total crashes (2022+2023):", len(acc_all))

# Filter to Louisiana (STATE == 22). `.copy()` makes the subset an
# independent frame rather than a slice of `acc_all`.
la_df = acc_all[acc_all["STATE"] == 22].copy()
print("Louisiana crashes:", len(la_df))

# Filter to Ouachita Parish (Monroe) (COUNTY == 73)
monroe_df = la_df[la_df["COUNTY"] == 73].copy()
print("Monroe crashes:", len(monroe_df))


def _points_gdf(df):
    """Return `df` as a WGS84 GeoDataFrame of points built from LONGITUD/LATITUDE."""
    return gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df["LONGITUD"], df["LATITUDE"]),
        crs="EPSG:4326",
    )


# Coordinates are used as-is here (no validity filtering); the
# "Rebuild Monroe file" cell writes the cleaned Monroe copy.
monroe_gdf = _points_gdf(monroe_df)
la_gdf = _points_gdf(la_df)

# Save outputs
out_la = PROJECT_ROOT / "data_clean" / "fars_la_2022_2023.geojson"
out_monroe = PROJECT_ROOT / "data_clean" / "fars_monroe_2022_2023.geojson"

la_gdf.to_file(out_la, driver="GeoJSON")
monroe_gdf.to_file(out_monroe, driver="GeoJSON")

print("Saved statewide file:", out_la)
print("Saved Monroe file:", out_monroe)


Total crashes (2022+2023): 77076
Louisiana crashes: 1607
Monroe crashes: 60
Saved statewide file: /Users/himalranabhat/Desktop/Crashspot/data_clean/fars_la_2022_2023.geojson
Saved Monroe file: /Users/himalranabhat/Desktop/Crashspot/data_clean/fars_monroe_2022_2023.geojson


## Rebuild Monroe file with only valid points

In [11]:
import pandas as pd, geopandas as gpd
from shapely.geometry import Point
from pathlib import Path

# Re-derive the project root so this cell can run on its own: it is the
# parent of the directory the notebook is run from.
PROJECT_ROOT = Path.cwd().parent

# Reload raw CSVs (2022 & 2023).
# NOTE(review): the Section 4 cell reads these as accident_2022.csv /
# accident_2023.csv (lower-case). Both spellings only work on a
# case-insensitive filesystem — confirm the on-disk name and align.
acc_2022 = pd.read_csv(PROJECT_ROOT / "data_raw" / "ACCIDENT_2022.csv")
acc_2023 = pd.read_csv(PROJECT_ROOT / "data_raw" / "ACCIDENT_2023.csv")

# Add YEAR and combine into one table with unique row labels
acc_2022["YEAR"] = 2022
acc_2023["YEAR"] = 2023
acc_all = pd.concat([acc_2022, acc_2023], ignore_index=True)

# Coerce coordinates to numeric; anything unparseable becomes NaN and is
# dropped by the validity mask below.
for c in ["LATITUDE", "LONGITUD"]:
    acc_all[c] = pd.to_numeric(acc_all[c], errors="coerce")

# Louisiana only (STATE == 22)
la_df = acc_all[acc_all["STATE"] == 22].copy()

lat = la_df["LATITUDE"]
lon = la_df["LONGITUD"]

# Placeholder codes FARS uses for unreported/unknown locations.
# NOTE(review): values taken from the FARS Analytical User's Manual —
# confirm against the manual for these data years.
FARS_UNKNOWN_LAT = (77.7777, 88.8888, 99.9999)
FARS_UNKNOWN_LON = (777.7777, 888.8888, 999.9999)

# Keep a row only if both coordinates are present, non-zero, inside the
# valid WGS84 range, and not one of the placeholder codes.
valid_coords = (
    lat.notna() & lon.notna()
    & (lat != 0) & (lon != 0)
    & lat.between(-90, 90) & lon.between(-180, 180)
    & ~lat.isin(FARS_UNKNOWN_LAT) & ~lon.isin(FARS_UNKNOWN_LON)
)
la_df = la_df[valid_coords]

# Ouachita Parish (Monroe) only (COUNTY == 73)
monroe_df = la_df[la_df["COUNTY"] == 73].copy()

# Build GeoDataFrame in WGS84
monroe_gdf = gpd.GeoDataFrame(
    monroe_df,
    geometry=gpd.points_from_xy(monroe_df["LONGITUD"], monroe_df["LATITUDE"]),
    crs="EPSG:4326"
)

# Double-check geometry validity: drop missing or empty geometries
monroe_gdf = monroe_gdf[monroe_gdf.geometry.notna() & ~monroe_gdf.geometry.is_empty]

# Save a CLEAN file (separate name, so the Section 4 output is not overwritten)
out_monroe_clean = PROJECT_ROOT / "data_clean" / "fars_monroe_2022_2023_clean.geojson"
monroe_gdf.to_file(out_monroe_clean, driver="GeoJSON")
print(len(monroe_gdf), "valid Monroe crashes saved to:", out_monroe_clean)


60 valid Monroe crashes saved to: /Users/himalranabhat/Desktop/Crashspot/data_clean/fars_monroe_2022_2023_clean.geojson


## 5) Check & Align CRS
- Aligning both layers to **WGS84 (EPSG:4326)**


In [None]:
def ensure_epsg4326(gdf):
    """Return `gdf` expressed in EPSG:4326 (WGS84).

    - ``None`` is passed through unchanged.
    - A layer without a CRS is labelled as EPSG:4326 (with a printed
      warning); its coordinates are not transformed.
    - A layer whose CRS string is not "EPSG:4326" is reprojected.
    - A layer already in EPSG:4326 is returned as-is.
    """
    if gdf is None:
        return None

    target = "EPSG:4326"
    current = gdf.crs

    if current is None:
        print("Warning: CRS missing; assuming EPSG:4326. Adjust if incorrect.")
        return gdf.set_crs(target)

    if current.to_string() == target:
        return gdf

    return gdf.to_crs(target)

# The loading cells in this notebook bind the crash points as `monroe_gdf`
# and never bind `accidents_gdf` or `roads_gdf`, so referencing those names
# directly raises NameError. Look them up in the notebook namespace instead:
# use an existing `accidents_gdf` if there is one, otherwise fall back to
# `monroe_gdf`; a layer that was never loaded stays None.
_ns = globals()
accidents_gdf = ensure_epsg4326(_ns.get("accidents_gdf", _ns.get("monroe_gdf")))
roads_gdf = ensure_epsg4326(_ns.get("roads_gdf"))

if accidents_gdf is not None:
    print("Accidents CRS:", accidents_gdf.crs)
if roads_gdf is not None:
    print("Roads CRS:", roads_gdf.crs)


## 6) Quick Static Plots (Matplotlib)
- Checking if data lines up.


In [None]:
import matplotlib.pyplot as plt

# Resolve the layers from the notebook namespace so that a layer which was
# never loaded reads as None instead of raising NameError before the
# "Load data first" message can be shown. The crash points are bound as
# `monroe_gdf` by the loading cells, so that is the fallback accident layer.
_ns = globals()
roads_layer = _ns.get("roads_gdf")
accidents_layer = _ns.get("accidents_gdf", _ns.get("monroe_gdf"))

if roads_layer is None and accidents_layer is None:
    print("Load data first (Section 4).")
else:
    # One explicit figure/axes, so the size is the same whether or not a
    # roads layer exists and both layers are drawn on the same axes.
    fig, ax = plt.subplots(figsize=(8, 8))
    if roads_layer is not None:
        roads_layer.plot(ax=ax)
    if accidents_layer is not None:
        # Drawn after the roads so the points sit on top.
        accidents_layer.plot(ax=ax, markersize=3)
    ax.set_title("Roads + Accidents (quick look)")
    plt.show()


## 7) Simple Interactive Map (Folium)
- Centers on the average accident location (if available), else a default location.
- Adds the accident points as a clustered GeoJSON layer with popups; no roads layer is added yet because no roads dataset is loaded.



In [12]:
import folium
from folium.plugins import MarkerCluster
import geopandas as gpd
from pathlib import Path

# Re-derive the project root so this cell can run on its own.
PROJECT_ROOT = Path.cwd().parent
monroe_path = PROJECT_ROOT / "data_clean" / "fars_monroe_2022_2023_clean.geojson"

# Load the cleaned file and make sure no missing/empty geometries remain
monroe_g = gpd.read_file(monroe_path)
monroe_g = monroe_g[monroe_g.geometry.notna() & ~monroe_g.geometry.is_empty]

# Centre on the average accident location when there are points; otherwise
# use the fixed default (presumably central Monroe — confirm).
DEFAULT_CENTER = [32.5093, -92.1193]
if monroe_g.empty:
    map_center = DEFAULT_CENTER
else:
    # folium expects [lat, lon], i.e. [y, x] of the point geometries.
    map_center = [
        float(monroe_g.geometry.y.mean()),
        float(monroe_g.geometry.x.mean()),
    ]

m = folium.Map(location=map_center, zoom_start=11)

# The cluster is the layer attached to the map, so it is the one that needs
# a readable name for the layer control.
layer_name = "Monroe FARS (2022–2023)"
cluster = MarkerCluster(name=layer_name).add_to(m)

folium.GeoJson(
    data=monroe_g,   # pass the GeoDataFrame (not just the path)
    name=layer_name,
    popup=folium.GeoJsonPopup(
        fields=["YEAR","MONTH","DAY","HOUR","HARM_EV","MAN_COLL"],
        aliases=["Year","Month","Day","Hour","Harmful Event","Manner of Collision"]
    )
).add_to(cluster)

folium.LayerControl().add_to(m)
out_html = PROJECT_ROOT / "outputs" / "maps" / "monroe_fars_2022_2023.html"
m.save(str(out_html))
# Last expression of the cell: displays the saved path in the notebook.
out_html


PosixPath('/Users/himalranabhat/Desktop/Crashspot/outputs/maps/monroe_fars_2022_2023.html')

## 8) Save Cleaned Copies
- Checks that the cleaned **GeoJSON** files written in the earlier sections exist and can be read back.


In [15]:
from pathlib import Path
import geopandas as gpd

# Output files the earlier cells should have written, keyed by display label.
expected_files = {
    "Louisiana (2022–2023)": PROJECT_ROOT / "data_clean" / "fars_la_2022_2023.geojson",
    "Monroe (2022–2023)": PROJECT_ROOT / "data_clean" / "fars_monroe_2022_2023_clean.geojson"
}

# Report each file as found (with its record count), unreadable, or missing.
for label, path in expected_files.items():
    if not path.exists():
        print(f"❌ {label} file not found at {path}")
        continue
    try:
        loaded = gpd.read_file(path)
        print(f"✅ {label} file found at {path} with {len(loaded)} records")
    except Exception as err:
        # Any read failure is reported rather than raised, so the remaining
        # files are still checked.
        print(f"⚠️ {label} file exists but could not be read: {err}")


✅ Louisiana (2022–2023) file found at /Users/himalranabhat/Desktop/Crashspot/data_clean/fars_la_2022_2023.geojson with 1607 records
✅ Monroe (2022–2023) file found at /Users/himalranabhat/Desktop/Crashspot/data_clean/fars_monroe_2022_2023_clean.geojson with 60 records
