## BLOCK 0: Imports & Settings

In [30]:
import re
import numpy as np
import pandas as pd
from pathlib import Path


In [31]:
from pathlib import Path
print("CWD:", Path.cwd())
print("Files in CWD (first 30):")
for p in sorted(Path.cwd().glob("*"))[:30]:
    print(" -", p.name)


CWD: C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS\02_Data_Preprocessing
Files in CWD (first 30):
 - 01_data_preprocessing.ipynb


## BLOCK 1 — Robust project root & folders

In [32]:
# ==========================================================
# BLOCK 1 — Robust Project Root + Paths
# Finds the folder that contains "01_Raw_Data"
# ==========================================================
PROJECT_ROOT = Path.cwd()

while not (PROJECT_ROOT / "01_Raw_Data").exists():
    if PROJECT_ROOT.parent == PROJECT_ROOT:
        raise RuntimeError("Could not find PROJECT_ROOT (folder containing 01_Raw_Data).")
    PROJECT_ROOT = PROJECT_ROOT.parent

RAW_ROOT = PROJECT_ROOT / "01_Raw_Data" / "Gross Regional Domestic Product"
WITH_DIR = RAW_ROOT / "By Industry (with NIR, 2025)"
WITHOUT_DIR = RAW_ROOT / "By Industry (without NIR, 2021)"

OUT_DIR = PROJECT_ROOT / "03_Cleaned_Data"
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("WITH_DIR:", WITH_DIR, "| exists:", WITH_DIR.exists())
print("WITHOUT_DIR:", WITHOUT_DIR, "| exists:", WITHOUT_DIR.exists())
print("OUT_DIR:", OUT_DIR)


PROJECT_ROOT: C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS
WITH_DIR: C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS\01_Raw_Data\Gross Regional Domestic Product\By Industry (with NIR, 2025) | exists: True
WITHOUT_DIR: C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS\01_Raw_Data\Gross Regional Domestic Product\By Industry (without NIR, 2021) | exists: False
OUT_DIR: C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS\03_Cleaned_Data


## Block 2: List Excel Files


In [33]:
# ==========================================================
# BLOCK 1 — Robust Project Root + Paths
# Finds the folder that contains "01_Raw_Data"
# ==========================================================
PROJECT_ROOT = Path.cwd()

while not (PROJECT_ROOT / "01_Raw_Data").exists():
    if PROJECT_ROOT.parent == PROJECT_ROOT:
        raise RuntimeError("Could not find PROJECT_ROOT (folder containing 01_Raw_Data).")
    PROJECT_ROOT = PROJECT_ROOT.parent

RAW_ROOT = PROJECT_ROOT / "01_Raw_Data" / "Gross Regional Domestic Product"
WITH_DIR = RAW_ROOT / "By Industry (with NIR, 2025)"
# Auto-detect "without NIR" folder
WITHOUT_DIR = next(
    d for d in RAW_ROOT.iterdir()
    if d.is_dir() and "without" in d.name.lower()
)

print("WITHOUT_DIR FOUND:", WITHOUT_DIR)

OUT_DIR = PROJECT_ROOT / "03_Cleaned_Data"
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("WITH_DIR:", WITH_DIR, "| exists:", WITH_DIR.exists())
print("WITHOUT_DIR:", WITHOUT_DIR, "| exists:", WITHOUT_DIR.exists())
print("OUT_DIR:", OUT_DIR)


WITHOUT_DIR FOUND: C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS\01_Raw_Data\Gross Regional Domestic Product\By Industry (without NIR, 2024)
PROJECT_ROOT: C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS
WITH_DIR: C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS\01_Raw_Data\Gross Regional Domestic Product\By Industry (with NIR, 2025) | exists: True
WITHOUT_DIR: C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS\01_Raw_Data\Gross Regional Domestic Product\By Industry (without NIR, 2024) | exists: True
OUT_DIR: C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS\03_Cleaned_Data


## Block 3: Text Cleaner

In [34]:
# ==========================================================
# BLOCK 3 — Text Cleaning Utility
# ==========================================================
def clean_text(x):
    if pd.isna(x):
        return np.nan
    s = str(x).replace("\u00a0", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s


## Block 4: Header Detection

In [35]:
# ==========================================================
# BLOCK 4 — Header Row Detection (PSA tables often start lower)
# Detects a header row by:
#  - keyword presence OR
#  - many year cells in same row
# ==========================================================
HEADER_KEYWORDS = [
    "industry", "sector", "economic activity",
    "kind of economic activity", "major industry"
]
YEAR_CELL = re.compile(r"^(19\d{2}|20\d{2})$")

def find_header_row(df_preview: pd.DataFrame, max_scan_rows=120) -> int | None:
    for r in range(min(max_scan_rows, len(df_preview))):
        row = df_preview.iloc[r].astype(str).fillna("")
        low = row.str.lower()

        joined = " | ".join(low.tolist())
        if any(k in joined for k in HEADER_KEYWORDS):
            return r

        year_hits = sum(bool(YEAR_CELL.match(v.strip())) for v in row.tolist())
        if year_hits >= 3:
            return r

    return None


## Block 5: Read best sheet

In [36]:
# ==========================================================
# BLOCK 5 — Read Best Sheet from an Excel File
# Tries each sheet, finds header row, then reads with that header.
# ==========================================================
def read_best_sheet(path: Path, preview_rows=80):
    xls = pd.ExcelFile(path, engine="openpyxl")

    for sheet in xls.sheet_names:
        prev = pd.read_excel(path, sheet_name=sheet, engine="openpyxl", header=None, nrows=preview_rows)
        hdr = find_header_row(prev)

        if hdr is not None:
            df = pd.read_excel(path, sheet_name=sheet, engine="openpyxl", header=hdr)
            df.columns = [clean_text(c) for c in df.columns]
            df = df.dropna(axis=1, how="all").copy()
            return sheet, df

    # fallback if nothing detected
    df = pd.read_excel(path, sheet_name=0, engine="openpyxl")
    df.columns = [clean_text(c) for c in df.columns]
    df = df.dropna(axis=1, how="all").copy()
    return xls.sheet_names[0], df


## Block 6: Infer Year + Price_Type from column header


In [37]:
# ==========================================================
# BLOCK 6 — Infer Year and Price_Type from column names
# Works for headers like:
#   "At Current Prices 2000"
#   "2000 At Constant 2018 Prices"
# ==========================================================
YEAR_RE = re.compile(r"(19\d{2}|20\d{2})")

def infer_year_and_price(col_name: str):
    c = clean_text(col_name).lower()
    year_match = YEAR_RE.search(c)
    year = int(year_match.group(1)) if year_match else None

    if "current" in c:
        price_type = "At Current Prices"
    elif "constant" in c:
        price_type = "At Constant 2018 Prices"
    else:
        price_type = None

    return year, price_type


## Block 7: Process ONE file into long format

In [38]:
def process_one_file(file_path: Path, regime_label: str) -> pd.DataFrame:
    region_name = clean_text(file_path.stem.split(",")[0])

    # --- read sheet raw (no header) ---
    sheet, _ = read_best_sheet(file_path)  # reuse your sheet chooser
    raw = pd.read_excel(file_path, sheet_name=sheet, engine="openpyxl", header=None)

    # --- find the row that contains BOTH "At Current" and "At Constant" (price header row) ---
    def row_has_price_headers(row):
        s = row.astype(str).str.lower().fillna("")
        joined = " | ".join(s.tolist())
        return ("at current" in joined) and ("at constant" in joined)

    price_row = None
    for r in range(min(60, len(raw))):
        if row_has_price_headers(raw.iloc[r]):
            price_row = r
            break
    if price_row is None:
        raise ValueError(f"Could not find price header row (sheet={sheet}, file={file_path.name}).")

    year_row = price_row + 1  # in your preview, years are directly below

    # --- build column -> Price_Type map from the price_row ---
    price_hdr = raw.iloc[price_row].astype(str).str.strip()
    price_type_by_col = pd.Series(index=raw.columns, dtype="object")

    lower = price_hdr.str.lower()
    price_type_by_col[lower.str.contains("at current")] = "At Current Prices"
    price_type_by_col[lower.str.contains("at constant")] = "At Constant 2018 Prices"
    price_type_by_col = price_type_by_col.ffill()  # fill across unnamed columns in each block

    # --- build column -> Year map from year_row ---
    years = raw.iloc[year_row]
    year_by_col = pd.Series(index=raw.columns, dtype="float64")
    year_by_col[:] = pd.to_numeric(years, errors="coerce")  # 2022, 2023, 2024...

    # --- value columns are those that have BOTH a price type and a year ---
    value_cols = [
        c for c in raw.columns
        if pd.notna(price_type_by_col.get(c)) and pd.notna(year_by_col.get(c))
    ]
    if len(value_cols) == 0:
        raise ValueError(f"No value columns found after mapping (sheet={sheet}, file={file_path.name}).")

    # --- industry rows start after year_row ---
    data = raw.iloc[year_row + 1:].copy()

    # first column that contains industry labels is the one where industries appear
    # in your preview, industry text appears in column 0 (but sometimes it may shift)
    # choose the column with the most non-null text entries
    best_label_col = None
    best_score = -1
    for c in raw.columns[:3]:  # usually in first few cols
        col = data[c].astype(str)
        score = col.str.contains(r"[A-Za-z]", regex=True).sum()
        if score > best_score:
            best_score = score
            best_label_col = c

    label_col = best_label_col if best_label_col is not None else 0
    data = data.rename(columns={label_col: "Industry"})

    data["Industry"] = data["Industry"].apply(clean_text)

    # drop rows that are empty labels
    data = data[data["Industry"].notna() & (data["Industry"] != "")].copy()

    # --- clean leading dots like "..Industry" / "....Manufacturing" ---
    data["Industry"] = data["Industry"].str.replace(r"^\.+\s*", "", regex=True).str.strip()

    # --- melt ---
    long_df = data.melt(
        id_vars=["Industry"],
        value_vars=value_cols,
        var_name="col_id",
        value_name="GRDP"
    )

    # attach Price_Type and Year from mapping
    long_df["Price_Type"] = long_df["col_id"].map(price_type_by_col)
    long_df["Year"] = long_df["col_id"].map(year_by_col).astype("Int64")

    # numeric cleanup
    long_df["GRDP"] = pd.to_numeric(
        long_df["GRDP"].astype(str).str.replace(",", "", regex=False),
        errors="coerce"
    )

    long_df = long_df.dropna(subset=["GRDP", "Year", "Price_Type", "Industry"]).copy()

    # add metadata
    long_df["Region"] = region_name
    long_df["GRDP_Regime"] = regime_label
    long_df["Source_File"] = file_path.name

    # final columns
    long_df = long_df[["Region", "Industry", "Year", "Price_Type", "GRDP", "GRDP_Regime", "Source_File"]].copy()

    return long_df


## Block 8: Process an entire folder + write error logs to 03_Cleaned_Data

In [39]:
# ==========================================================
# BLOCK 8 — Process Folder + Error Log
# Saves errors to 03_Cleaned_Data so you can always find it.
# ==========================================================
def process_folder(folder: Path, regime_label: str, error_log_name: str) -> pd.DataFrame:
    files = list_excel_files(folder)
    print(f"[INFO] {regime_label}: Found {len(files)} Excel files in {folder}")

    if len(files) == 0:
        raise RuntimeError(f"No Excel files found in {folder}")

    all_parts = []
    errors = []

    for fp in files:
        try:
            part = process_one_file(fp, regime_label)
            all_parts.append(part)
        except Exception as e:
            errors.append({"file": fp.name, "error": repr(e)})

    # Always write error log if anything failed
    if errors:
        err_df = pd.DataFrame(errors)
        err_path = OUT_DIR / error_log_name
        err_df.to_csv(err_path, index=False)
        print(f"[WARN] {len(errors)} files failed for {regime_label}. Logged to: {err_path}")
        print(err_df.head(5).to_string(index=False))

    if not all_parts:
        # Print one-file debug hint
        print("\n[DEBUG] Trying to inspect first file quickly...")
        sample = files[0]
        print("Sample file:", sample)
        try:
            sheet, df = read_best_sheet(sample)
            print("Chosen sheet:", sheet)
            print("Columns:", df.columns.tolist()[:20])
            print(df.head(5).to_string(index=False))
        except Exception as e:
            print("Even sample debug failed:", repr(e))

        raise RuntimeError(f"No files successfully processed in {folder.resolve()}")

    return pd.concat(all_parts, ignore_index=True)


In [41]:
from pathlib import Path

def list_excel_files(folder: Path) -> list[Path]:
    folder = Path(folder)
    files = []
    files += list(folder.rglob("*.xlsx"))
    files += list(folder.rglob("*.xlsm"))
    files += list(folder.rglob("*.xls"))   # safe addition
    files = sorted(set(files), key=lambda p: p.name.lower())
    return [f for f in files if f.is_file()]


## Block 9: Run ingestion (creates combined long table)

In [42]:
# ==========================================================
# BLOCK 9 — RUN INGESTION
# ==========================================================
with_long = process_folder(WITH_DIR, "With NIR", "_preprocessing_errors_with_nir.csv")
without_long = process_folder(WITHOUT_DIR, "Without NIR", "_preprocessing_errors_without_nir.csv")

combined = pd.concat([with_long, without_long], ignore_index=True)

print("combined shape:", combined.shape)
print("Price types:\n", combined["Price_Type"].value_counts())
print("Year range:", combined["Year"].min(), "to", combined["Year"].max())
combined.head()


[INFO] With NIR: Found 16 Excel files in C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS\01_Raw_Data\Gross Regional Domestic Product\By Industry (with NIR, 2025)
[INFO] Without NIR: Found 15 Excel files in C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS\01_Raw_Data\Gross Regional Domestic Product\By Industry (without NIR, 2024)
combined shape: (15504, 7)
Price types:
 Price_Type
At Current Prices          7752
At Constant 2018 Prices    7752
Name: count, dtype: int64
Year range: 2000 to 2024


Unnamed: 0,Region,Industry,Year,Price_Type,GRDP,GRDP_Regime,Source_File
0,Bicol Region,"Agriculture, forestry, and fishing",2022,At Current Prices,100710958.0,With NIR,"Bicol Region, Gross Regional Domestic Product,..."
1,Bicol Region,Industry,2022,At Current Prices,222926775.0,With NIR,"Bicol Region, Gross Regional Domestic Product,..."
2,Bicol Region,Mining and quarrying,2022,At Current Prices,13179419.0,With NIR,"Bicol Region, Gross Regional Domestic Product,..."
3,Bicol Region,Manufacturing,2022,At Current Prices,43289696.0,With NIR,"Bicol Region, Gross Regional Domestic Product,..."
4,Bicol Region,"Electricity, steam, water and waste management",2022,At Current Prices,38587933.0,With NIR,"Bicol Region, Gross Regional Domestic Product,..."


### Block 9.1: Quick sanity checks (make sure data is valid)

In [43]:
# ==========================================================
# BLOCK 9.1 — Sanity Checks (combined long table)
# ==========================================================
print("WITH rows:", len(with_long))
print("WITHOUT rows:", len(without_long))
print("COMBINED rows:", len(combined))

print("\nPrice_Type distribution:")
print(combined["Price_Type"].value_counts(dropna=False))

print("\nYear range:", combined["Year"].min(), "to", combined["Year"].max())

print("\nSample rows:")
display(combined.head(10)) if "display" in globals() else print(combined.head(10).to_string(index=False))


WITH rows: 1824
WITHOUT rows: 13680
COMBINED rows: 15504

Price_Type distribution:
Price_Type
At Current Prices          7752
At Constant 2018 Prices    7752
Name: count, dtype: int64

Year range: 2000 to 2024

Sample rows:
      Region                                                             Industry  Year        Price_Type        GRDP GRDP_Regime                                                     Source_File
Bicol Region                                   Agriculture, forestry, and fishing  2022 At Current Prices 100710958.0    With NIR Bicol Region, Gross Regional Domestic Product, by Industry.xlsm
Bicol Region                                                             Industry  2022 At Current Prices 222926775.0    With NIR Bicol Region, Gross Regional Domestic Product, by Industry.xlsm
Bicol Region                                                 Mining and quarrying  2022 At Current Prices  13179419.0    With NIR Bicol Region, Gross Regional Domestic Product, by Industry.xlsm


### Block 9.2: Clean "Industry" Labels

In [44]:
# ==========================================================
# BLOCK 9.2 — Clean Industry Labels
# ==========================================================
combined["Industry"] = combined["Industry"].astype(str).str.strip()
combined["Industry"] = combined["Industry"].str.replace(r"^\.+\s*", "", regex=True)
combined["Industry"] = combined["Industry"].str.replace(r"\s+", " ", regex=True).str.strip()

# Remove obvious blanks
combined = combined[combined["Industry"].notna() & (combined["Industry"] != "")].copy()


## Create Dataset 1 / 2 / 3 (CLEANED)

In [45]:
# ==========================================================
# BLOCK 10 — Build Dataset 1/2/3 (CLEANED)
# ==========================================================

def map_main_sector(industry: str):
    s = str(industry).lower().strip()

    if s == "gross domestic product":
        return "Gross Domestic Product"
    if "agriculture" in s and ("forestry" in s or "fishing" in s or "agriculture" in s):
        return "Agriculture, forestry, and fishing"
    if s == "industry" or s.endswith("industry"):
        return "Industry"
    if s == "services" or s.endswith("services"):
        return "Services"
    return None

combined["Main_Sector"] = combined["Industry"].apply(map_main_sector)

# --------------------------
# Dataset 3
# --------------------------
df3 = (
    combined[combined["Main_Sector"].notna()].copy()
    .groupby(["Region", "Main_Sector", "Year", "Price_Type"], as_index=False)["GRDP"]
    .sum()
    .rename(columns={"GRDP": "Sector_GRDP"})
)

# --------------------------
# Dataset 2 (exclude GDP total)
# --------------------------
df2 = (
    df3[df3["Main_Sector"].ne("Gross Domestic Product")].copy()
    .groupby(["Main_Sector", "Year", "Price_Type"], as_index=False)["Sector_GRDP"]
    .sum()
    .rename(columns={"Sector_GRDP": "Total_GRDP"})
)

# --------------------------
# Dataset 1 (prefer GDP total row if present)
# --------------------------
gdp_rows = combined[combined["Industry"].astype(str).str.strip().eq("Gross Domestic Product")].copy()

if len(gdp_rows) > 0:
    df1 = (
        gdp_rows.groupby(["Region", "Year", "Price_Type"], as_index=False)["GRDP"]
        .sum()
        .rename(columns={"GRDP": "Total_GRDP"})
    )
else:
    df1 = (
        df3[df3["Main_Sector"].ne("Gross Domestic Product")].copy()
        .groupby(["Region", "Year", "Price_Type"], as_index=False)["Sector_GRDP"]
        .sum()
        .rename(columns={"Sector_GRDP": "Total_GRDP"})
    )

print("DF1 shape:", df1.shape)
print("DF2 shape:", df2.shape)
print("DF3 shape:", df3.shape)

print("\nDF1 Price_Type:")
print(df1["Price_Type"].value_counts())

print("\nDF2 Main_Sector:")
print(df2["Main_Sector"].value_counts())


DF1 shape: (760, 4)
DF2 shape: (150, 4)
DF3 shape: (3040, 5)

DF1 Price_Type:
Price_Type
At Constant 2018 Prices    380
At Current Prices          380
Name: count, dtype: int64

DF2 Main_Sector:
Main_Sector
Agriculture, forestry, and fishing    50
Industry                              50
Services                              50
Name: count, dtype: int64


### Block 10.1: Validate that Dataset 2 has NO "Gross Domestic Product"

In [169]:
# ==========================================================
# BLOCK 10.1 — Validate Dataset 2 (must not include GDP total)
# ==========================================================
assert "Gross Domestic Product" not in df2["Main_Sector"].unique()
print("✅ Dataset 2 is clean (no GDP total row).")


✅ Dataset 2 is clean (no GDP total row).


In [170]:
# ==========================================================
# BLOCK 11 — Save Outputs (CLEANED CSVs)
# ==========================================================
df1.to_csv(OUT_DIR / "Dataset 1 Regional_Economic_Magnitude (CLEANED).csv", index=False)
df2.to_csv(OUT_DIR / "Dataset 2 Sector_Economic_Structure (CLEANED).csv", index=False)
df3.to_csv(OUT_DIR / "Dataset 3 Region_Sector_Structure (CLEANED).csv", index=False)

print("✅ Saved to:", OUT_DIR)
print(" - Dataset 1 Regional_Economic_Magnitude (CLEANED).csv")
print(" - Dataset 2 Sector_Economic_Structure (CLEANED).csv")
print(" - Dataset 3 Region_Sector_Structure (CLEANED).csv")


✅ Saved to: C:\Users\Anthony Llena\OneDrive\Documents\SLU\DATA SCI PRELIMS\9345-Prelim-Summative-Activity-TeamWPS\03_Cleaned_Data
 - Dataset 1 Regional_Economic_Magnitude (CLEANED).csv
 - Dataset 2 Sector_Economic_Structure (CLEANED).csv
 - Dataset 3 Region_Sector_Structure (CLEANED).csv


In [171]:
# ==========================================================
# BLOCK 12 — Final Quick Checks
# ==========================================================
print("\nDF1 preview:")
print(df1.head(5).to_string(index=False))

print("\nDF2 preview:")
print(df2.head(5).to_string(index=False))

print("\nDF3 preview:")
print(df3.head(5).to_string(index=False))



DF1 preview:
                                         Region  Year              Price_Type  Total_GRDP
Bangsamoro Autonomous Region in Muslim Mindanao  2000 At Constant 2018 Prices  98017851.0
Bangsamoro Autonomous Region in Muslim Mindanao  2000       At Current Prices  45671649.0
Bangsamoro Autonomous Region in Muslim Mindanao  2001 At Constant 2018 Prices  94136902.0
Bangsamoro Autonomous Region in Muslim Mindanao  2001       At Current Prices  44751362.0
Bangsamoro Autonomous Region in Muslim Mindanao  2002 At Constant 2018 Prices 105745055.0

DF2 preview:
                       Main_Sector  Year              Price_Type   Total_GRDP
Agriculture, forestry, and fishing  2000 At Constant 2018 Prices 1045259507.0
Agriculture, forestry, and fishing  2000       At Current Prices  505134510.0
Agriculture, forestry, and fishing  2001 At Constant 2018 Prices 1084420004.0
Agriculture, forestry, and fishing  2001       At Current Prices  525368838.0
Agriculture, forestry, and fishing  2002 A

In [172]:
regions = sorted(combined["Region"].dropna().unique())
print("Total regions:", len(regions))
print("Regions (sample):", regions[:30])

# search for likely BARMM strings
hits = [r for r in regions if "barmm" in r.lower() or "bangsamoro" in r.lower() or "muslim mindanao" in r.lower()]
print("BARMM-like hits:", hits)


Total regions: 18
Regions (sample): ['Bangsamoro Autonomous Region in Muslim Mindanao', 'Bicol Region', 'CALABARZON', 'Cagayan Valley', 'Caraga', 'Central Luzon', 'Central Visayas', 'Cordillera Administrative Region', 'Davao Region', 'Eastern Visayas', 'Ilocos Region', 'MIMAROPA', 'MIMAROPA Region', 'National Capital Region', 'Northern Mindanao', 'SOCCSKSARGEN', 'Western Visayas', 'Zamboanga Peninsula']
BARMM-like hits: ['Bangsamoro Autonomous Region in Muslim Mindanao']


In [173]:
files = list_excel_files(WITH_DIR)
print("Total files:", len(files))

barmm_files = [
    f.name for f in files
    if ("barmm" in f.name.lower())
    or ("bangsamoro" in f.name.lower())
    or ("armm" in f.name.lower())
    or ("muslim mindanao" in f.name.lower())
]

print("BARMM-like files:", barmm_files)

Total files: 17
BARMM-like files: ['Bangsamoro Autonomous Region in Muslim Mindanao, Gross Regional Domestic Product, by Industry.xlsm']
