In [None]:
# Install required libraries (run once)
%pip install geopandas pandas

# Import libraries
import geopandas as gpd
import pandas as pd
import os

In [None]:
# Location of the NFDB point shapefile (edit to match where your copy lives)
shapefile_path = "./NFDB_point_20250519.shp"

# Only attempt the read when the path exists; otherwise stop with a clear error
if os.path.exists(shapefile_path):
    print("Loading NFDB shapefile...")
    gdf = gpd.read_file(shapefile_path)
    print(f"Raw data loaded. Shape: {gdf.shape}")
else:
    raise FileNotFoundError(f"Shapefile not found at {shapefile_path}")

# List the available columns, then show the first rows as the cell output
print("Columns:", gdf.columns)
gdf.head()

In [None]:
# Normalize column labels: lowercase them, then turn spaces into underscores
gdf.columns = gdf.columns.str.lower().str.replace(" ", "_")

print("Standardized columns:", gdf.columns)

In [None]:
# Columns carried forward into the cleaned dataset. Anything not listed here
# (including "geometry") is not selected.
keep_cols = [
    "nfdbfireid",
    "src_agency",
    "fire_id",
    "latitude",
    "longitude",
    "year",
    "month",
    "day",
    "rep_date",
    "out_date",
    "size_ha",
    "cause",
    "fire_type",
    "prescribed",
]

# Report expected columns that are absent from the source rather than failing here
missing_cols = [col for col in keep_cols if col not in gdf.columns]
if missing_cols:
    print(f"Warning: Missing columns -> {missing_cols}")

# Take an explicit copy of the column subset so that later `gdf[col] = ...`
# assignments operate on an independent frame instead of a slice of the
# original (avoids pandas' SettingWithCopyWarning).
gdf = gdf[[col for col in keep_cols if col in gdf.columns]].copy()
gdf.head()

In [None]:
# Parse the date columns that are present and keep only the calendar date;
# values that cannot be parsed are coerced to missing.
date_cols = ["rep_date", "out_date"]
for col in [name for name in date_cols if name in gdf.columns]:
    parsed = pd.to_datetime(gdf[col], errors="coerce")
    gdf[col] = parsed.dt.date

# Coerce the numeric columns that are present; non-numeric values become NaN
for col in [name for name in ("size_ha", "year") if name in gdf.columns]:
    gdf[col] = pd.to_numeric(gdf[col], errors="coerce")

gdf.head()

In [None]:
# Keep only rows where latitude, longitude and year are all present
gdf = gdf.dropna(
    axis=0,
    how="any",
    subset=["latitude", "longitude", "year"],
)
print(f"Data after dropping missing critical values. Shape: {gdf.shape}")

In [None]:
# Destination for the cleaned dataset
output_path = "../1_datasets/all_fires.csv"

# Create the destination folder when it does not exist yet; otherwise to_csv
# fails with an OSError on a fresh checkout.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write without the index so the CSV contains only the data columns
gdf.to_csv(output_path, index=False)
print(f"Cleaned data saved to {output_path}")