In [1]:
# Install required libraries (run once)
%pip install geopandas pandas

# Import libraries
import geopandas as gpd
import pandas as pd
import os


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Location of the NFDB point shapefile, relative to the working directory
# (edit this if the file lives elsewhere).
shapefile_path = "./NFDB_point_20250519.shp"

# Only attempt the read when the path exists; otherwise stop with an
# explicit error that names the path that was tried.
if os.path.exists(shapefile_path):
    print("Loading NFDB shapefile...")
    gdf = gpd.read_file(shapefile_path)
else:
    raise FileNotFoundError(f"Shapefile not found at {shapefile_path}")
print(f"Raw data loaded. Shape: {gdf.shape}")

# Quick look at what was loaded: the column labels, then the first rows
print("Columns:", gdf.columns)
gdf.head()


Loading NFDB shapefile...
Raw data loaded. Shape: (442403, 27)
Columns: Index(['NFDBFIREID', 'SRC_AGENCY', 'NAT_PARK', 'FIRE_ID', 'FIRENAME',
       'LATITUDE', 'LONGITUDE', 'YEAR', 'MONTH', 'DAY', 'REP_DATE',
       'ATTK_DATE', 'OUT_DATE', 'SIZE_HA', 'CAUSE', 'CAUSE2', 'FIRE_TYPE',
       'RESPONSE', 'PROTZONE', 'PRESCRIBED', 'MORE_INFO', 'CFS_NOTE1',
       'CFS_NOTE2', 'ACQ_DATE', 'layer', 'omit', 'geometry'],
      dtype='object')


Unnamed: 0,NFDBFIREID,SRC_AGENCY,NAT_PARK,FIRE_ID,FIRENAME,LATITUDE,LONGITUDE,YEAR,MONTH,DAY,...,RESPONSE,PROTZONE,PRESCRIBED,MORE_INFO,CFS_NOTE1,CFS_NOTE2,ACQ_DATE,layer,omit,geometry
0,AB-2024-CWF-001-2024,AB,,CWF-001-2024,,50.066333,-114.154883,2024,1,2,...,,,,Calgary Forest Area,,,2025-04-07,AB_NFDB_2024,,POINT Z (-1345400.786 322485.621 0)
1,AB-2024-HWF-001-2024,AB,,HWF-001-2024,,57.912833,-116.33405,2024,1,5,...,,,,High Level Forest Area,,,2025-04-07,AB_NFDB_2024,,POINT Z (-1210041.546 1182823.657 0)
2,AB-2024-SWF-001-2024,AB,,SWF-001-2024,,56.57555,-115.216533,2024,1,17,...,,,,Slave Lake Forest Area,,,2025-04-07,AB_NFDB_2024,,POINT Z (-1194304.684 1023994.495 0)
3,AB-2024-LWF-001-2024,AB,,LWF-001-2024,,55.957361,-110.709667,2024,1,9,...,,,,Lac La Biche Forest Area,,,2025-04-07,AB_NFDB_2024,,POINT Z (-950764.968 883261.867 0)
4,AB-2024-LWF-002-2024,AB,,LWF-002-2024,,55.957361,-110.709667,2024,1,9,...,,,,Lac La Biche Forest Area,,,2025-04-07,AB_NFDB_2024,,POINT Z (-950764.968 883261.867 0)


In [3]:
# Normalize column labels in one pass: lowercase every name, then swap
# any spaces for underscores.
gdf.columns = gdf.columns.str.lower().str.replace(' ', '_')

print("Standardized columns:", gdf.columns)


Standardized columns: Index(['nfdbfireid', 'src_agency', 'nat_park', 'fire_id', 'firename',
       'latitude', 'longitude', 'year', 'month', 'day', 'rep_date',
       'attk_date', 'out_date', 'size_ha', 'cause', 'cause2', 'fire_type',
       'response', 'protzone', 'prescribed', 'more_info', 'cfs_note1',
       'cfs_note2', 'acq_date', 'layer', 'omit', 'geometry'],
      dtype='object')


In [4]:
# Columns carried forward into the cleaned dataset. Anything not listed here
# (including `geometry`) is dropped; the point coordinates stay available
# through `latitude` / `longitude`.
keep_cols = [
    'nfdbfireid', 'src_agency', 'fire_id', 'latitude', 'longitude',
    'year', 'month', 'day', 'rep_date', 'out_date',
    'size_ha', 'cause', 'fire_type', 'prescribed'
]

# Report any expected columns that the loaded file does not provide
missing_cols = [col for col in keep_cols if col not in gdf.columns]
if missing_cols:
    print(f"Warning: Missing columns -> {missing_cols}")

# Keep only the expected columns that are actually present. The explicit
# .copy() makes the result an independent frame, so the column assignments
# in later cells modify it directly instead of triggering
# SettingWithCopyWarning on a derived selection.
gdf = gdf[[col for col in keep_cols if col in gdf.columns]].copy()
gdf.head()


Unnamed: 0,nfdbfireid,src_agency,fire_id,latitude,longitude,year,month,day,rep_date,out_date,size_ha,cause,fire_type,prescribed
0,AB-2024-CWF-001-2024,AB,CWF-001-2024,50.066333,-114.154883,2024,1,2,2024-01-02,NaT,0.05,H,Wildfire,
1,AB-2024-HWF-001-2024,AB,HWF-001-2024,57.912833,-116.33405,2024,1,5,2024-01-05,NaT,0.4,H,Wildfire,
2,AB-2024-SWF-001-2024,AB,SWF-001-2024,56.57555,-115.216533,2024,1,17,2024-01-17,NaT,0.01,H,Wildfire,
3,AB-2024-LWF-001-2024,AB,LWF-001-2024,55.957361,-110.709667,2024,1,9,2024-01-09,NaT,0.01,H,Wildfire,
4,AB-2024-LWF-002-2024,AB,LWF-002-2024,55.957361,-110.709667,2024,1,9,2024-01-09,NaT,0.01,H,Wildfire,


In [5]:
# Parse the date columns and keep only the calendar date; values that
# cannot be parsed are coerced to NaT instead of raising.
date_cols = ['rep_date', 'out_date']
for col in date_cols:
    if col not in gdf.columns:
        continue
    parsed = pd.to_datetime(gdf[col], errors='coerce')
    gdf[col] = parsed.dt.date

# Force the numeric fields to numeric dtype; unparseable entries become NaN.
for numeric_col in ('size_ha', 'year'):
    if numeric_col in gdf.columns:
        gdf[numeric_col] = pd.to_numeric(gdf[numeric_col], errors='coerce')

gdf.head()


Unnamed: 0,nfdbfireid,src_agency,fire_id,latitude,longitude,year,month,day,rep_date,out_date,size_ha,cause,fire_type,prescribed
0,AB-2024-CWF-001-2024,AB,CWF-001-2024,50.066333,-114.154883,2024,1,2,2024-01-02,NaT,0.05,H,Wildfire,
1,AB-2024-HWF-001-2024,AB,HWF-001-2024,57.912833,-116.33405,2024,1,5,2024-01-05,NaT,0.4,H,Wildfire,
2,AB-2024-SWF-001-2024,AB,SWF-001-2024,56.57555,-115.216533,2024,1,17,2024-01-17,NaT,0.01,H,Wildfire,
3,AB-2024-LWF-001-2024,AB,LWF-001-2024,55.957361,-110.709667,2024,1,9,2024-01-09,NaT,0.01,H,Wildfire,
4,AB-2024-LWF-002-2024,AB,LWF-002-2024,55.957361,-110.709667,2024,1,9,2024-01-09,NaT,0.01,H,Wildfire,


In [6]:
# Discard every row in which any of the essential fields is missing,
# then report the resulting shape.
required_fields = ['latitude', 'longitude', 'year']
gdf = gdf.dropna(axis='index', how='any', subset=required_fields)
print(f"Data after dropping missing critical values. Shape: {gdf.shape}")


Data after dropping missing critical values. Shape: (442403, 14)


In [7]:
# Destination for the cleaned table, relative to the working directory.
output_path = "../1_datasets/all_fires.csv"

# to_csv does not create missing folders, so make sure the parent directory
# exists first. The guard skips this when output_path is a bare filename,
# because os.makedirs("") would raise.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write without the index so the CSV contains only the data columns.
gdf.to_csv(output_path, index=False)
print(f"Cleaned data saved to {output_path}")

Cleaned data saved to ../1_datasets/all_fires.csv
