## Imports

In [1]:
# Operational Packages
import numpy as np
import pandas as pd
from pathlib import Path
import sys
import os
import textwrap

# Directories
nb_dir = Path.cwd()
REPO_ROOT = nb_dir.parent
data_dir = REPO_ROOT / 'data/'
processed_dir = data_dir / 'processed/'
raw_dir = data_dir / "raw"

if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

## Load Data

In [2]:
# Find all matching NBAC formatted summarystats_*.xlsx files
summary_files = sorted(raw_dir.glob("NBAC_summarystats_*.xlsx"))

if not summary_files:
    raise FileNotFoundError("No NBAC_summarystats_*.xlsx found in data/raw")

# If there's more than one, take the last (alphabetically = usually latest)
data_path = summary_files[-1]

print(f"Using summary stats file: {data_path.name}")

NBAC = pd.read_excel(data_path, sheet_name=None)
print(f'NBCA Summary Statistics read. \n Latest statistics update: {data_path.name[-13:-5]}')


Using summary stats file: NBAC_summarystats_1972to2024_20250506.xlsx
NBCA Summary Statistics read. 
 Latest statistics update: 20250506


In [3]:
print(NBAC.keys())

NBAC_admin = NBAC['sumstats_admin']
NBAC_meta = NBAC['metadata']
NBAC_admin2 = NBAC['sumstats_admin2']
NBAC_parks = NBAC['sumstats_natpark']
NBAC_years = NBAC['NBAC_1972_2024_20250506']


dict_keys(['sumstats_admin', 'sumstats_natpark', 'sumstats_admin2', 'NBAC_1972_2024_20250506', 'metadata'])


## Clean Data

### Metadata

In [4]:
NBAC_meta = NBAC_meta['National Burned Area Composite - Metadata']

NBAC_meta

0      Source: https://cwfis.cfs.nrcan.gc.ca/datamart...
1      View complete metadata - https://cwfis.cfs.nrc...
2                                                    NaN
3                                                Summary
4      The NBAC is a national, geospatial burned area...
                             ...                        
155    GH-Gwaii Haanas National Park Reserve and Haid...
156    PRESCRIBED identifies a prescribed fire as rep...
157       VERSION identifies the annual dataset version.
158    GID is a Global Identifier that concatenates t...
159    useful for selecting unique fire records merge...
Name: National Burned Area Composite - Metadata, Length: 160, dtype: object

#### Meta Summary

In [5]:
meta_summary_list = []
for row in NBAC_meta[4:11]:
    meta_summary_list.append(row)
meta_summary = "".join(meta_summary_list)

meta_summary

"The NBAC is a national, geospatial burned area product compiled annually since 1972 forannual estimates of carbon emissions. A rule-based decision process is used to select the bestburned area perimeters from a number of available data providers. // La CNSB est un outil decartographie nationale et géospatiale des superficies brûlées compilé chaque année depuis1972 afin d'en calculer annuellement les émissions de carbone. Un processus décisionnel axésur les règles permet de sélectionner les meilleures fourniseurs de données disponibles pourreprésenter une situation de feu donnée."

#### Meta Description

In [6]:
meta_description = " ".join(NBAC_meta.iloc[np.r_[13:67]].dropna().astype(str))
meta_description

"The National Burned Area Composite (NBAC) is a product created as a component of the Fire Monitoring, Accounting and Reporting System (FireMARS), jointly developed by the Canada Centre for Mapping and Earth Observation (formerly the Canada Centre for Remote Sensing) of Natural Resources Canadaand the Canadian Forest Service. FireMARS was initially developed with funding support from the Canadian Space Agency Government Related Initiatives Program (http://www4.asc-csa.gc.ca/auot-eoau/eng/grip/about.aspx) through a collaboration of those in fire research (http://www.nrcan.gc.ca/forests/fire/13143), forest carbon accounting (http://www.nrcan.gc.ca/forests/climate-change/13087) and remote sensing. NBAC is a national product compiled annually since 1972 by the FireMARS system which tracks forest fires for annual estimates of carbon emissions and to help identify National Forest Inventory plots that may have been disturbed by fire.See the FireMARS website at http://www.nrcan.gc.ca/forests/f

#### Meta Fields & Attributes

In [7]:
# 1. Take just the “fields” section of the metadata
meta_lines = (
    NBAC_meta.iloc[67:]      # start at row 67
    .dropna()                # drop blank lines
    .astype(str).str.strip() # make sure they're strings, strip spaces
)

# (optional) drop the header line if it exists
meta_lines = meta_lines[meta_lines != "Fields / Attributes"]

# 2. Split each line into first word + rest
split = meta_lines.str.split(n=1, expand=True)

# 3. Name the two resulting columns
split.columns = ["Field", "Attribute"]

meta_fields = split.reset_index(drop=True)
meta_fields


Unnamed: 0,Field,Attribute
0,YEAR,is the fire year
1,NFIREID,is a uniquely assigned ID to each fire event o...
2,specific,year. It is the common ID used to link a fire ...
3,"territorial,",or national park boundary for a specific year.
4,BASRC,identifies the burned area product from a data...
...,...,...
87,GH-Gwaii,Haanas National Park Reserve and Haida Heritag...
88,PRESCRIBED,identifies a prescribed fire as reported by th...
89,VERSION,identifies the annual dataset version.
90,GID,is a Global Identifier that concatenates the f...


In [8]:
# Function to merge identifed rows split across description
def merge_meta_rows(df, main_idx, extra_idxs):
    parts = [str(df.loc[main_idx, "Attribute"]).strip()]
    for i in extra_idxs:
        field_i = str(df.loc[i, "Field"]).strip()
        attr_i  = str(df.loc[i, "Attribute"]).strip()
        parts.append(f"{field_i} {attr_i}")
    df.loc[main_idx, "Attribute"] = " ".join(parts)
    return df.drop(extra_idxs).reset_index(drop=True)

# Example: NFIREID + its two continuation lines
meta_fields = merge_meta_rows(meta_fields, main_idx=1, extra_idxs=[2, 3])
meta_fields = merge_meta_rows(meta_fields, main_idx=6, extra_idxs=[7])
meta_fields = merge_meta_rows(meta_fields, main_idx=8, extra_idxs=[9])
meta_fields = merge_meta_rows(meta_fields, main_idx=10, extra_idxs=[11,12,13])
meta_fields = merge_meta_rows(meta_fields, main_idx=11, extra_idxs=[12,13])
meta_fields = merge_meta_rows(meta_fields, main_idx=12, extra_idxs=[13])
meta_fields = merge_meta_rows(meta_fields, main_idx=15, extra_idxs=[16])
meta_fields = merge_meta_rows(meta_fields, main_idx=79, extra_idxs=[80])
meta_fields = merge_meta_rows(meta_fields, main_idx=13, extra_idxs=[14])


In [9]:
# Function to split Provonical/Territory abbreviations from descriptions
def split_field_dash(df, field_col="Field", attr_col="Attribute"):
    # 1. rows where the Field contains a dash
    mask = df[field_col].astype(str).str.contains("-", na=False)

    # 2. split on the first '-' into two parts
    split = df.loc[mask, field_col].astype(str).str.split("-", n=1, expand=True)
    split.columns = ["abbr", "name_part"]

    # 3. clean up pieces
    abbr = split["abbr"].str.strip()
    name_part = split["name_part"].fillna("").str.strip()   # text after '-'
    existing_attr = df.loc[mask, attr_col].fillna("").str.strip()

    # 4. combine name_part + existing_attr (add a space only if both non-empty)
    glue = np.where((name_part != "") & (existing_attr != ""), " ", "")
    combined_attr = (name_part + glue + existing_attr).str.strip()

    # 5. write back into the original dataframe
    df.loc[mask, field_col] = abbr
    df.loc[mask, attr_col] = combined_attr

    return df

NBAC_meta_fields_clean = split_field_dash(meta_fields)


#### Provincial/Territory Summary Statistics

In [10]:
# 1. Save the description from the first column name
admin_description = NBAC_admin.columns[0]

# 2. Drop the first row (all NaN / banner)
NBAC_summary_stats = NBAC_admin.iloc[1:].copy()

# 3. Use the next row as header
NBAC_summary_stats.columns = NBAC_summary_stats.iloc[0]

# 4. Drop that header row from the data and reset index
NBAC_summary_stats_clean = NBAC_summary_stats.iloc[1:].reset_index(drop=True)

NBAC_summary_stats_clean.attrs['description'] = admin_description

NBAC_summary_stats_clean.head(5)

1,YEAR,AB,BC,MB,NB,NL,NS,NT,NU,ON,PC,PE,QC,SK,YT,CANADA
0,2024,719604.355673,911524.79932,233958.983969,7938.424448,60385.50932,46.177906,1647294.00149,42.072809,79874.038884,92589.440537,21.051528,208466.217566,760954.259178,181700.081212,4904399.41384
1,2023,1950086.59448,2209978.837971,142526.442937,2277.911368,18934.132477,21809.410837,3487336.802179,3784.6894,338450.972125,774388.558646,,4266657.231382,1086507.599438,333095.330548,14635834.513788
2,2022,119898.638877,114453.97964,135861.741668,,20082.396889,2845.138552,579870.789403,987.072408,3312.651189,34323.808935,,28199.857015,224706.437495,194283.416626,1458825.928697
3,2021,56143.805131,785897.164377,1139281.668036,,,184.381283,154455.192913,247.075691,687831.085672,43060.129092,,49363.933784,867993.573556,133945.834348,3918403.843883
4,2020,2830.611904,13761.809879,43146.398874,,2720.240278,672.3788,18156.350975,49.16431,13185.115149,2280.475027,,51483.340908,42739.985624,15939.710809,206965.582537


In [11]:
dataset_meta = {
    "name": "NBAC burned area by administrative area",
    "description": admin_description,
    "source": "https://cwfis.cfs.nrcan.gc.ca",  # example
    "units": "adjusted hectares",
}


#### National Parks

In [12]:

# 1. Save the description (first non-NaN banner row)
parks_description = NBAC_parks.iloc[1, 0]   # "Sum of SUM_ADJ_HA"

# 2. Drop top two rows; row 2 becomes header
NBAC_parks_clean = NBAC_parks.iloc[2:].copy()

# set header from the first remaining row
NBAC_parks_clean.columns = NBAC_parks_clean.iloc[0]

# 3. Drop that header row from the data and reset index
NBAC_parks_clean = NBAC_parks_clean.iloc[1:-1].reset_index(drop=True)

# 4. Rename the first column from "Row Labels" to YEAR (they’re years)
NBAC_parks_clean = NBAC_parks_clean.rename(columns={"Row Labels": "YEAR"})

# (optional) convert YEAR to integer
NBAC_parks_clean["YEAR"] = NBAC_parks_clean["YEAR"].astype(int)

# 5. Attach the description as metadata (like alt text)
NBAC_parks_clean.attrs["description"] = parks_description

#### Individual Fires

In [13]:

# 1. Capture the description from the first few rows of the first column
NBAC_years_description = (
    NBAC_years.iloc[0:3, 0]      # rows 0–2, first column
    .dropna()
    .astype(str)
    .str.strip()
    .str.join(" ")               # join into one sentence; use "\n".join(...) for line breaks
)

# 2. Drop the first 3 rows; row index 3 becomes the header row
NBAC_years_clean = NBAC_years.iloc[3:].copy()

# set header from the first remaining row
NBAC_years_clean.columns = NBAC_years_clean.iloc[0]

# 3. Drop that header row from the data and reset index
NBAC_years_clean = NBAC_years_clean.iloc[1:].reset_index(drop=True)

# optional: attach description as metadata (like "alt text")
NBAC_years_clean.attrs["description"] = NBAC_years_description

# optional: make YEAR numeric
NBAC_years_clean["YEAR"] = NBAC_years_clean["YEAR"].astype(int)


#### Summary Statistics Two

In [14]:
print(NBAC_admin2.isnull().sum())

print(NBAC_admin2.dtypes)

YEAR          0
ADMIN_AREA    0
SUM_ADJ_HA    0
dtype: int64
YEAR            int64
ADMIN_AREA     object
SUM_ADJ_HA    float64
dtype: object


In [15]:
NBAC_summary_stats_two_clean = NBAC_admin2.copy()

### Export Files

In [16]:
output_path = processed_dir / 'NBAC_Summary_Stats_Cleaned.xlsx'

with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    NBAC_summary_stats_clean.to_excel(writer, sheet_name='Admin_ProvTerr',index=False)
    NBAC_summary_stats_two_clean.to_excel(writer, sheet_name='Admin_ProvTerr_Summary',index=False)
    NBAC_parks_clean.to_excel(writer,sheet_name='Parks',index=False)
    NBAC_years_clean.to_excel(writer,sheet_name='Fires_Yearly',index=False)
    NBAC_meta_fields_clean.to_excel(writer,sheet_name='Metafields',index=False)

In [17]:

output_path_txt = processed_dir / 'NBAC_README.txt'
width = 80  # characters per line

wrapped_summary = textwrap.fill(meta_summary, width=width,initial_indent="  ",subsequent_indent="  ")
wrapped_description = textwrap.fill(meta_description, width=width,initial_indent="  ",subsequent_indent="  ")

with open(output_path_txt, 'w') as file:
    file.write("README\n\n")
    file.write("Meta Summary\n")
    file.write(wrapped_summary + "\n\n")
    file.write("Meta Description\n")
    file.write(wrapped_description + "\n")