## Installs

In [8]:
# Operational Packages
import numpy as np
import pandas as pd
from pathlib import Path
import sys
import os
import zipfile
import shutil
import geopandas as gpd
from shapely.geometry import Polygon

# Directories
nb_dir = Path.cwd()
REPO_ROOT = nb_dir.parent
data_dir = REPO_ROOT / 'data/'
processed_dir = data_dir / 'processed/'
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))



In [9]:
# Unzip all Shapefiles
def unzip_to_folder(zip_path, extract_to):
    """
    Unzips a ZIP archive into a specified directory.
    """
    extract_to = Path(extract_to)
    extract_to.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)                  # Read SHP to destination folder

    macosx_folder = extract_to / '__MACOSX'
    if macosx_folder.exists():
        shutil.rmtree(macosx_folder)

In [10]:
zippaths = Path(data_dir/'raw/zips')                    # ZIPs folders
zipfolders = list(zippaths.glob('*.zip'))           # Select all .ZIP

for folder in zipfolders:
    unzip_to_folder(folder,processed_dir/'shapefiles'/str(folder.name)[:-4])    # Retain name indentity
    print(f'NBAC Wildfires Year: {(str(folder.name)[5:9])} Shapefiles opened')

print(f'Target folder destination: {processed_dir/'shapefiles'}')


NBAC Wildfires Year: 2021 Shapefiles opened
NBAC Wildfires Year: 2020 Shapefiles opened
NBAC Wildfires Year: 2018 Shapefiles opened
NBAC Wildfires Year: 2019 Shapefiles opened
NBAC Wildfires Year: 2022 Shapefiles opened
NBAC Wildfires Year: 2023 Shapefiles opened
NBAC Wildfires Year: 2024 Shapefiles opened
Target folder destination: /Users/mitchellpalmer/Projects/wildfire-risk-analysis/data/processed/shapefiles


In [None]:
print('Open all shapefiles')
# Open all Shapefiles
all_gdfs_dct = {}   # Store in dictionary

for folder in (processed_dir / "shapefiles").iterdir():
    if folder.is_dir():
        shp = next(folder.glob("*.shp"), None)
        if shp:
            name = folder.name
            all_gdfs_dct[name] = gpd.read_file(shp)

# All dictionary keys
print(f'Appended into singular dictionary.')


Open all shapefiles
Appended into singular dictionary.


In [None]:
# Assess each shapefile's (key) column structure

# 1. Get union of all column names
all_cols = sorted(
    {col for gdf in all_gdfs_dct.values() for col in gdf.columns}
)

# 2. Create a DataFrame of False
presence = pd.DataFrame(
    False,
    index=all_gdfs_dct.keys(),
    columns=all_cols,
)

# 3. Mark columns that exist in each GeoDataFrame
for name, gdf in all_gdfs_dct.items():
    presence.loc[name, gdf.columns] = True

presence


Unnamed: 0,ADJ_FLAG,ADJ_HA,ADMIN_AREA,AG_EDATE,AG_SDATE,BASRC,CAPDATE,FIRECAUS,FIREMAPM,FIREMAPS,...,HS_SDATE,NATPARK,NFIREID,POLY_HA,PRESCRIBED,Shape_Area,Shape_Leng,VERSION,YEAR,geometry
NBAC_2024_20250506,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
NBAC_2023_20250506,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
NBAC_2022_20250506,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
NBAC_2019_20250506,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
NBAC_2018_20250506,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
NBAC_2020_20250506,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
NBAC_2021_20250506,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [13]:
# Assess column structure
missing_per_col = (presence == False).sum(axis=0)      # or (~presence).sum(axis=0)
missing_per_col # Sum False values i.e different structured columns

ADJ_FLAG      0
ADJ_HA        0
ADMIN_AREA    0
AG_EDATE      0
AG_SDATE      0
BASRC         0
CAPDATE       0
FIRECAUS      0
FIREMAPM      0
FIREMAPS      0
GID           0
HS_EDATE      0
HS_SDATE      0
NATPARK       0
NFIREID       0
POLY_HA       0
PRESCRIBED    0
Shape_Area    0
Shape_Leng    0
VERSION       0
YEAR          0
geometry      0
dtype: int64

# Singular GDF Location

In [14]:
# columns that exist in *all* years
common_cols = presence.columns[presence.all(axis=0)]
common_cols


Index(['ADJ_FLAG', 'ADJ_HA', 'ADMIN_AREA', 'AG_EDATE', 'AG_SDATE', 'BASRC',
       'CAPDATE', 'FIRECAUS', 'FIREMAPM', 'FIREMAPS', 'GID', 'HS_EDATE',
       'HS_SDATE', 'NATPARK', 'NFIREID', 'POLY_HA', 'PRESCRIBED', 'Shape_Area',
       'Shape_Leng', 'VERSION', 'YEAR', 'geometry'],
      dtype='object')

In [None]:

# Pick a reference CRS from the first GeoDataFrame
first_gdf = next(iter(all_gdfs_dct.values()))
target_crs = first_gdf.crs

gdfs_to_concat = []

for name, gdf in all_gdfs_dct.items():
    # reproject if needed
    if gdf.crs != target_crs:
        gdf = gdf.to_crs(target_crs)
    
    # either keep only columns shared by all:
    # gdf = gdf[common_cols]

    # or, if you’re okay with missing columns being NaN, skip that line
    gdfs_to_concat.append(gdf)

# Stack them vertically
fires_all_years = gpd.GeoDataFrame(
    pd.concat(gdfs_to_concat, ignore_index=True),
    crs=target_crs
)


In [34]:
combined = {"NBAC_all_years": fires_all_years}
all_gdf_df = combined["NBAC_all_years"]

In [35]:
all_gdf_df

Unnamed: 0,YEAR,NFIREID,BASRC,FIREMAPS,FIREMAPM,FIRECAUS,HS_SDATE,HS_EDATE,AG_SDATE,AG_EDATE,...,ADJ_HA,ADJ_FLAG,ADMIN_AREA,NATPARK,PRESCRIBED,VERSION,GID,Shape_Leng,Shape_Area,geometry
0,2024.0,1.0,MAFiMS,Landsat,Processed imagery,Natural,2024-08-12,2024-09-02,2024-07-29,2024-10-04,...,625.258308,,NT,,,20250506,2024_1,38513.122912,5.906397e+06,"POLYGON ((-1120513.931 1401415.528, -1120640.6..."
1,2024.0,2.0,MAFiMS,Sentinel-2,Processed imagery,Natural,2024-06-29,2024-08-12,2024-06-28,2024-10-04,...,185.715925,,NT,,,20250506,2024_2,16857.668554,1.754145e+06,"MULTIPOLYGON (((-1245606.340 1450579.000, -124..."
2,2024.0,3.0,MAFiMS,Sentinel-2,Processed imagery,Natural,2024-07-17,2024-07-24,2024-07-17,2024-10-04,...,1188.878170,,NT,,,20250506,2024_3,51547.334192,1.120790e+07,"MULTIPOLYGON (((-846230.854 1422552.091, -8462..."
3,2024.0,4.0,MAFiMS,Landsat,Processed imagery,Natural,2024-07-17,2024-08-13,2024-07-17,2024-10-04,...,3862.626905,,NT,,,20250506,2024_4,132691.511657,3.640874e+07,"MULTIPOLYGON (((-869281.585 1439913.862, -8692..."
4,2024.0,5.0,MAFiMS,Sentinel-2,Processed imagery,Natural,2024-07-28,2024-08-12,2024-07-31,2024-10-04,...,26.552612,,NT,,,20250506,2024_5,2833.473229,2.507250e+05,"POLYGON ((-1229068.442 1459363.850, -1229071.1..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10673,2021.0,3012.0,Agency,Undefined,Buffered points,Human,,,2021-07-14,,...,1.370586,true,MB,,,20250506,2021_3012,370.034560,1.086572e+04,"POLYGON ((-25063.257 92639.363, -25073.265 926..."
10674,2021.0,3013.0,Agency,Undefined,Undefined,Human,2021-10-03,2021-10-03,2021-05-11,,...,775.198749,true,MB,,,20250506,2021_3013,13953.627354,7.771992e+06,"MULTIPOLYGON (((-379481.043 431792.146, -37948..."
10675,2021.0,3014.0,Agency,Undefined,Undefined,Human,,,2021-07-11,,...,12.005618,true,MB,,,20250506,2021_3014,1550.406556,1.002230e+05,"MULTIPOLYGON (((-388861.854 760502.553, -38887..."
10676,2021.0,3015.0,Agency,Undefined,Buffered points,Human,,,2021-06-16,,...,0.001635,true,PC,JA,,20250506,2021_3015,11.049791,9.691564e+00,"POLYGON ((-1498027.987 706035.643, -1498028.29..."


In [36]:
print(f'All fire geometries dataframe shape: {all_gdf_df.shape} \n')

print(f'NBAC Wildfire value count from years: {all_gdf_df['YEAR'].value_counts().sort_index()}\n')
      


All fire geometries dataframe shape: (10678, 22) 

NBAC Wildfire value count from years: YEAR
2018.0    1698
2019.0     900
2020.0     704
2021.0    1721
2022.0    1479
2023.0    2216
2024.0    1960
Name: count, dtype: int64



In [37]:
print(f'Null Values: \n{all_gdf_df.isnull().sum()}\n')

Null Values: 
YEAR              0
NFIREID           0
BASRC             0
FIREMAPS          0
FIREMAPM          0
FIRECAUS          0
HS_SDATE       6236
HS_EDATE       6236
AG_SDATE        495
AG_EDATE       5195
CAPDATE        3393
POLY_HA           0
ADJ_HA            0
ADJ_FLAG       4559
ADMIN_AREA        0
NATPARK       10228
PRESCRIBED    10618
VERSION           0
GID               0
Shape_Leng        0
Shape_Area        0
geometry          0
dtype: int64



## Geodataframe Cleaning

In [38]:
all_gdf_df.dtypes

YEAR           float64
NFIREID        float64
BASRC           object
FIREMAPS        object
FIREMAPM        object
FIRECAUS        object
HS_SDATE        object
HS_EDATE        object
AG_SDATE        object
AG_EDATE        object
CAPDATE         object
POLY_HA        float64
ADJ_HA         float64
ADJ_FLAG        object
ADMIN_AREA      object
NATPARK         object
PRESCRIBED      object
VERSION         object
GID             object
Shape_Leng     float64
Shape_Area     float64
geometry      geometry
dtype: object

In [49]:
all_gdf_df['capdate']

0        2024-09-15
1        2024-08-22
2        2024-08-08
3        2024-08-30
4        2024-08-17
            ...    
10673          None
10674          None
10675          None
10676          None
10677          None
Name: capdate, Length: 10678, dtype: object

In [48]:
all_gdf_df.dtypes

year                     int64
fireid                   int64
basrc                   object
firemaps                object
firemapm                object
cause                   object
hs_sdate        datetime64[ns]
hs_edate                object
ag_sdate                object
ag_edate                object
capdate                 object
poly_ha                float64
adj_ha                 float64
adj_flag                object
admin_area              object
natpark                 object
prescribed              object
version                 object
gid                     object
shape_length           float64
shape_area             float64
geometry              geometry
dtype: object

In [50]:
all_gdf_df = all_gdf_df.copy()

all_gdf_df['YEAR'] = all_gdf_df['YEAR'].astype(int)
all_gdf_df['NFIREID'] = all_gdf_df['NFIREID'].astype(int)
all_gdf_df['hs_sdate'] = all_gdf_df['hs_sdate'].astype('datetime64[ns]')
all_gdf_df['hs_edate'] = all_gdf_df['hs_edate'].astype('datetime64[ns]')
all_gdf_df['ag_sdate'] = all_gdf_df['ag_sdate'].astype('datetime64[ns]')
all_gdf_df['ag_edate'] = all_gdf_df['ag_edate'].astype('datetime64[ns]')
all_gdf_df['capdate'] = all_gdf_df['capdate'].astype('datetime64[ns]')

col_names = {
    'Shape_Leng' : "shape_length",
    "FIRECAUS": "CAUSE",
    "NFIREID": "FIREID",

}
all_gdf_df = all_gdf_df.rename(columns=col_names)

# lower-case all column names in one line
all_gdf_df.columns = all_gdf_df.columns.str.lower()

KeyError: 'YEAR'

## All Fire analysis

In [51]:
all_gdf_df['cause'].value_counts()

cause
Natural         6807
Human           3224
Undetermined     647
Name: count, dtype: int64

In [57]:
all_gdf_df['cause'].value_counts(normalize=True).round(4)*100

cause
Natural         63.75
Human           30.19
Undetermined     6.06
Name: proportion, dtype: float64

### BC Fires

In [59]:
BC_gdfs = all_gdf_df[all_gdf_df['admin_area']=='BC']
print(f'BC Wildfires Shape: {BC_gdfs.shape}\n')
BC_gdfs.head(2)

BC Wildfires Shape: (2753, 22)



Unnamed: 0,year,fireid,basrc,firemaps,firemapm,cause,hs_sdate,hs_edate,ag_sdate,ag_edate,...,adj_ha,adj_flag,admin_area,natpark,prescribed,version,gid,shape_length,shape_area,geometry
162,2024,159,MAFiMS,Landsat,Processed imagery,Natural,2024-05-09,2024-07-19,2024-05-10,,...,12260.052718,,BC,,,20250506,2024_159,800166.4,116433800.0,"MULTIPOLYGON (((-1488758.227 1319811.062, -148..."
163,2024,160,MAFiMS,Sentinel-2,Processed imagery,Natural,2024-04-11,2024-10-16,2023-06-28,,...,450940.307996,,BC,,,20250506,2024_160,15994710.0,4271401000.0,"MULTIPOLYGON (((-1421109.650 1317743.467, -142..."


#### BC Fires Analysis

In [24]:
BC_gdfs['cause'].value_counts()

cause
Natural         1916
Human            795
Undetermined      42
Name: count, dtype: int64

In [25]:
BC_gdfs['cause'].value_counts(normalize=True)*100

cause
Natural         69.596803
Human           28.877588
Undetermined     1.525608
Name: proportion, dtype: float64

#### BC Fires Export

In [None]:
# Keep only columns you actually need to keep the file small
cols = ['gid', 'fireid', 'year', 'admin_area', 'adj_ha', 'geometry']
BC_simple = BC_gdfs[cols].copy()

# 
BC_year_min = str(BC_simple['year'].min())
BC_year_max = str(BC_simple['year'].max())

# Reproject to WGS84 for GeoJSON
BC_4326 = BC_simple.to_crs(epsg=4326)


# Make sure output folder exists
out_dir = processed_dir / "BC_fires"
out_dir.mkdir(parents=True, exist_ok=True)

# GeoJSON output 
bc_path_geojson = processed_dir/ 'BC_fires'/ f"BC_fires_{BC_year_min}_{BC_year_max}.geojson"
BC_4326.to_file(bc_path_geojson, driver="GeoJSON")

print(f'BC Fires extracted between years {BC_year_min} - {BC_year_max}')
print("Saved BC_Fires GeoJson File", bc_path_geojson)

# Shapefile output
bc_path_shp = processed_dir/ 'BC_fires'/f"BC_fires_{BC_year_min}_{BC_year_max}_shp"
BC_4326.to_file(bc_path_shp) 

print("Saved BC_Fires Shapefile", bc_path_geojson)

BC Fires extracted between years 2018 - 2024
Saved BC_Fires GeoJson File /Users/mitchellpalmer/Projects/wildfire-risk-analysis/data/processed/BC_fires/BC_fires_2018_2024.geojson
Saved BC_Fires Shapefile /Users/mitchellpalmer/Projects/wildfire-risk-analysis/data/processed/BC_fires/BC_fires_2018_2024.geojson


### Avalanche Canada Region Fires (BC + AB)

In [67]:
AvCan_gdfs = all_gdf_df[all_gdf_df['admin_area'].isin(['BC','AB'])]
print(f'Avalanche Canada Wildfires dataframe shape: {AvCan_gdfs.shape}\n')
AvCan_gdfs.head(2)

Avalanche Canada Wildfires dataframe shape: (3967, 22)



Unnamed: 0,year,fireid,basrc,firemaps,firemapm,cause,hs_sdate,hs_edate,ag_sdate,ag_edate,...,adj_ha,adj_flag,admin_area,natpark,prescribed,version,gid,shape_length,shape_area,geometry
23,2024,24,MAFiMS,Landsat,Processed imagery,Natural,2024-05-06,2024-08-13,2024-06-01,2024-10-04,...,81133.988043,,AB,,,20250506,2024_24,3721022.0,766917100.0,"MULTIPOLYGON (((-1265565.543 1444031.010, -126..."
57,2024,56,MAFiMS,Sentinel-2,Processed imagery,Natural,2024-04-20,2024-09-17,2024-05-24,2024-11-14,...,24362.005793,,AB,,,20250506,2024_56,1558421.0,230208200.0,"MULTIPOLYGON (((-1184092.940 1413865.989, -118..."


#### Avalanche Canada 