In [1]:
# Install tabula-py (if not already installed)  
%pip install tabula-py  
  
import tabula  
import pandas as pd  
  
# Define the PDF file and the pages we want to extract the table from  
pdf_path = "annual_report_2017_508_0.pdf"  
pages = "9-10"  # pages 9 and 10  
  
# Use tabula to read the table from the specified pages  
# This will return a list of DataFrames  
tables = tabula.read_pdf(pdf_path, pages=pages, multiple_tables=True)  
  
# Let's see how many tables were extracted and show a preview of the first few rows of each  
print("Number of tables detected:", len(tables))  
for idx, table in enumerate(tables):  
    print(f"\nTable {idx+1} preview:")  
    print(table.head())  
  
print("\nDone")  

Note: you may need to restart the kernel to use updated packages.


Failed to import jpype dependencies. Fallback to subprocess.
No module named 'jpype'


Number of tables detected: 0

Done


In [6]:
import tabula  
import pandas as pd  
  
pdf_path = "annual_report_2017_508_0.pdf"  
pages = "7-12"  
  
# Use tabula with lattice mode enabled  
tables = tabula.read_pdf(pdf_path, pages=pages, multiple_tables=True, lattice=True)  
  
print("Number of tables detected with lattice=True:", len(tables))  
for idx, table in enumerate(tables):  
    print(f"\nTable {idx+1} preview:")  
    print(table.head())  
  
print("\nDone")  

Note: you may need to restart the kernel to use updated packages.
Number of tables detected with lattice=True: 12

Table 1 preview:
Empty DataFrame
5]lifornia that would begin in early November and continue into December. Precipitation inmer
Index: []

Table 2 preview:
Empty DataFrame
6]sidences,4,002minorstructures,229commercialstructuresand10mixed,065five more than inrage).ts of
Index: []

Table 3 preview:
  commercial/residentialstructures.Thisiswellabovetheannualaverageof1,545\rresidences, 1,236 minor structures, and 55 commercial structures destroyed by wildfire\rwith this year ranking 1st in total structures lost (data from 1999 to present). California\raccounted for the highest number of structures lost in one state in 2017: 7,778 residences,\r178 commercial structures and 3,056 minor structures. Florida was second with 44\rresidences, 1 mixed commercial and residential and 282 minor structures.\r\rRequests for firefighting resources placed to the National Interagency Coordinati

In [6]:
import fitz  # PyMuPDF  
import re  
import pandas as pd  
import os  
  
# Define file paths  
pdf_path = "annual_report_2017_508_0.pdf"  
csv_path = "combined_incidents_clean_final_updated_2021_v6.csv"  
output_csv = "combined_incidents_clean_final_updated_manual.csv"  
  
# Function to normalize incident names by stripping, lowercasing, and removing extra spaces  
def normalize_name(name):  
    return re.sub(r'\\s+', ' ', name.strip().lower())  
  
# Extract text from pages 7 to 13  
document = fitz.open(pdf_path)  
extracted_text = ""  
print("Extracting text from pages 7 to 13...")  
for page_num in range(6, 13):  # pages 7-13 (0-indexed)  
    page = document.load_page(page_num)  
    page_text = page.get_text()  
    print("Page", page_num+1, "preview (first 500 chars):")  
    print(page_text[:500])  
    extracted_text += "\n" + page_text  
document.close()  
print("Text extraction complete.\n")  
  
# Use regex to locate lines that likely contain incident names and cost information.  
# For example, we search for lines with a cost number pattern (e.g. 3,200,000).   
# We assume the cost is a number which might include commas, and that the incident name appears before it.  
#  
# This is a rough approach and might need adjustments based on your PDF's table layout.  
#  
# Example pattern: (incident name) ... (cost)  
cost_pattern = re.compile(r'(?P<incident>[\w\s\.\-]+?)\\s+(?P<cost>\\d{1,3}(?:,\\d{3})+(?:\\.\\d+)?)(?!\\S)', re.MULTILINE)  
  
matches = cost_pattern.finditer(extracted_text)  
cost_lookup = {}  
print("Identified cost lines:")  
for match in matches:  
    incident_name = match.group("incident")  
    cost_val = match.group("cost")  
    normalized_incident = normalize_name(incident_name)  
    cost_lookup[normalized_incident] = cost_val  
    print("Incident:", incident_name, "-> Cost:", cost_val)  
  
if not cost_lookup:  
    print("No cost lines found. You may need to adjust the regex pattern.")  
  
# Load the CSV file and update cost values for 2017 incidents where cost is 'NR'  
print("\nLoading CSV file...")  
df = pd.read_csv(csv_path)  
print("CSV loaded. Showing incidents from 2017 with 'NR' cost:")  
df_2017_nr = df[(df['Year'] == 2017) & (df['Cost'] == 'NR')]  
print(df_2017_nr[['Name', 'Cost']])  
  
update_count = 0  
for idx, row in df[df['Year'] == 2017].iterrows():  
    normalized_incident = normalize_name(row['Name'])  
    # Try to find a cost value by matching incident names  
    if normalized_incident in cost_lookup:  
        cost = cost_lookup[normalized_incident]  
        # Only update if cost is not empty and not already set  
        if str(cost).upper() != 'NR' and str(cost).strip() != '':  
            df.at[idx, 'Cost'] = cost  
            update_count += 1  
            print("Updated incident:", row['Name'], "with cost:", cost)  
  
print("\nTotal incidents updated:", update_count)  
  
# Save the updated CSV  
df.to_csv(output_csv, index=False)  
print("Updated CSV saved to:", output_csv)  

Extracting text from pages 7 to 13...
Page 7 preview (first 500 chars):
5 
 
In the East, precipitation was generally within 25 percent of normal for the summer 
months. One exception was along the Gulf Coast of Texas and Louisiana in late August 
where Hurricane Harvey, a rapidly intensifying Category 4 hurricane, made landfall near 
Rockport, Texas, on August 25. The storm was among the wettest ever. An area of 3,643 
square miles between the Louisiana state line with Texas and Houston recorded at least 
40 inches of rain between August 23 and August 30. The highe
Page 8 preview (first 500 chars):
6 
 
fell across the western states but did not extend into central and southern portions of 
California. Fuels returned to a critically dry state rapidly and significant North Wind events 
across northern and central California produced several large fires in early November. A 
month later, strong Santa Ana winds led to the development of multiple large fires in 
central and southern Calif

In [7]:
# Install pdfplumber if not already installed  
%pip install pdfplumber  
  
import pdfplumber  
import pandas as pd  
import re  
import os  
  
# Define file paths  
pdf_path = "annual_report_2017_508_0.pdf"  
csv_path = "combined_incidents_clean_final_updated_2021_v6.csv"  
output_csv = "combined_incidents_clean_final_updated_manual.csv"  
  
# Function to normalize incident names by stripping, lowercasing, and removing extra spaces  
def normalize_name(name):  
    return re.sub(r'\s+', ' ', str(name).strip().lower())  
  
# Attempt to extract tables using pdfplumber on pages 7-13  
all_tables = []  
print("Extracting tables from pages 7 to 13 using pdfplumber...")  
with pdfplumber.open(pdf_path) as pdf:  
    for i in range(6, 13):  # pages 7 to 13 (0-indexed)  
        if i < len(pdf.pages):  
            page = pdf.pages[i]  
            print("\n--- Page", i+1, "---")  
            tables = page.extract_tables()  
            print("Found", len(tables), "tables on this page.")  
            for idx, table in enumerate(tables):  
                df_temp = pd.DataFrame(table[1:], columns=table[0])  
                print("\nTable", idx+1, "preview (first 5 rows):")  
                print(df_temp.head())  
                all_tables.append(df_temp)  
        else:  
            print("Page", i+1, "does not exist in this PDF.")  
              
# If multiple tables were extracted, you'll need to confirm which one has the cost data.  
# For demonstration, we assume that one of the tables has columns for incident name and estimated cost.  
# To help us build a lookup, we loop through each table and search for a column header containing 'Name'  
# and one containing 'Cost'. Adjust these conditions as needed.  
cost_lookup = {}  
for table in all_tables:  
    col_names = [col.lower() for col in table.columns if isinstance(col, str)]  
    if any('name' in col for col in col_names) and any('cost' in col for col in col_names):  
        # Identify columns; this is a crude check; you may need to adjust based on exact column names.  
        name_col = [col for col in table.columns if isinstance(col, str) and 'name' in col.lower()][0]  
        cost_col = [col for col in table.columns if isinstance(col, str) and 'cost' in col.lower()][0]  
        print("\nUsing table with name column:", name_col, "and cost column:", cost_col)  
          
        # Build lookup dictionary from normalized incident name to cost  
        for idx, row in table.iterrows():  
            incident = row[name_col]  
            cost = row[cost_col]  
            if incident and cost and (str(cost).strip().upper() != 'NR'):  
                normalized_incident = normalize_name(incident)  
                cost_lookup[normalized_incident] = str(cost).strip()  
        break  
  
print("\nFound", len(cost_lookup), "incident cost entries from the PDF.")  
  
# OPTIONALLY: print first few entries in the lookup for inspection  
print("\nFirst 10 entries in incident->cost lookup:")  
count = 0  
for k, v in cost_lookup.items():  
    print(k, "->", v)  
    count += 1  
    if count >= 10:  
        break  
  
# Load the CSV file and update cost values for 2017 incidents where cost is 'NR'  
print("\nLoading CSV file...")  
df = pd.read_csv(csv_path)  
print("CSV loaded. Showing incidents from 2017 with 'NR' cost:")  
df_2017_nr = df[(df['Year'] == 2017) & (df['Cost'] == 'NR')]  
print(df_2017_nr[['Name', 'Cost']])  
  
update_count = 0  
for idx, row in df[df['Year'] == 2017].iterrows():  
    normalized_incident = normalize_name(row['Name'])  
    # Try to find a cost value by matching incident names  
    if normalized_incident in cost_lookup:  
        cost = cost_lookup[normalized_incident]  
        # Only update if cost is not empty and not already set to an actual value (not 'NR')  
        if str(cost).upper() != 'NR' and str(cost).strip() != '':  
            df.at[idx, 'Cost'] = cost  
            update_count += 1  
            print("Updated incident:", row['Name'], "with cost:", cost)  
  
print("\nTotal incidents updated:", update_count)  
  
# Save the updated CSV  
df.to_csv(output_csv, index=False)  
print("Updated CSV saved to:", output_csv)  


Extracting tables from pages 7 to 13 using pdfplumber...

--- Page 7 ---
Found 0 tables on this page.

--- Page 8 ---
Found 0 tables on this page.

--- Page 9 ---
Found 1 tables on this page.

Table 1 preview (first 5 rows):
                                                    
0  Military: On July 22nd, one MAFFS unit was act...
1  (RFA) to the Department of Defense. This reque...
2  152nd Airlift Wing and was positioned at Fresn...
3  On July 30th one MAFFS from the 153rd Airlift ...
4  Wing was positioned at Fresno, CA. On Septembe...

--- Page 10 ---
Found 2 tables on this page.

Table 1 preview (first 5 rows):
  On September 11th two hundred and forty-five soldiers from the 23rd Brigade Engineer
0  Battalion and 1-23 Infantry Battalion out of F...                                  
1  support of the Umpqua North Complex. They were...                                  
2                                    September 26th.                                  

Table 2 preview (first 5 row

In [8]:
import pdfplumber  
import pandas as pd  
import re  
  
# Define file paths  
pdf_path = "annual_report_2008_508.pdf"  
csv_path = "combined_incidents_clean_final_updated_2021_v6.csv"  # Adjust if needed  
output_csv = "combined_incidents_clean_final_updated_manual_2008.csv"  
  
# Function to normalize incident names (lowercase, strip, and reduce whitespace)  
def normalize_name(name):  
    return re.sub(r'\s+', ' ', str(name).strip().lower())  
  
# Extract tables from the PDF pages  
all_tables = []  
print("Extracting tables from the 2008 report using pdfplumber...\n")  
with pdfplumber.open(pdf_path) as pdf:  
    total_pages = len(pdf.pages)  
    print("Total pages in PDF:", total_pages)  
    # Loop through all pages (or adjust the range if you know where the tables are)  
    for i in range(total_pages):  
        page = pdf.pages[i]  
        print("\n----- Page", i+1, "-----")  
        tables = page.extract_tables()  
        print("Found", len(tables), "tables on this page.")  
        for idx, table in enumerate(tables):  
            if table and len(table) > 1:  
                df_temp = pd.DataFrame(table[1:], columns=table[0])  
                print("\nTable", idx+1, "preview (first 5 rows):")  
                print(df_temp.head())  
                all_tables.append(df_temp)  
            else:  
                print("Table", idx+1, "is empty or incomplete.")  
  
# -------------------------------------------------------------------------  
# At this point, inspect the output cells to determine which table(s)  
# contain the incident names and cost values. Adjust the following  
# code to select the appropriate table. For demonstration, we assume  
# that one of the tables (e.g., the first candidate) contains the cost data.  
# -------------------------------------------------------------------------  
# For example, let’s assume the desired table is the first candidate with a column header  
# containing 'Cost' or 'Estimat'. You can adjust these conditions as needed.  
cost_table = None  
for table in all_tables:  
    for col in table.columns:  
        if isinstance(col, str) and ("cost" in col.lower() or "estim" in col.lower()):  
            cost_table = table.copy()  
            break  
    if cost_table is not None:  
        break  
  
if cost_table is None:  
    print("\nNo table with cost information detected. Please manually inspect the printed tables.")  
else:  
    print("\nSelected table for cost extraction:")  
    print(cost_table.head())  
  
    # You might need to adjust the column names.  
    # For example, assume the table has columns 'Name' and 'Estimated Cost'  
    # If these differ, update the variables below accordingly.  
    name_col = None  
    cost_col = None  
    for col in cost_table.columns:  
        if isinstance(col, str):  
            if "name" in col.lower():  
                name_col = col  
            if "cost" in col.lower() or "estim" in col.lower():  
                cost_col = col  
  
    if name_col is None or cost_col is None:  
        print("\nUnable to find the appropriate columns. Please update the column selection manually.")  
    else:  
        print("\nUsing column", name_col, "for incident names and", cost_col, "for cost values.")  
  
        # Build a lookup dictionary from incident names to cost values  
        cost_lookup = {}  
        for idx, row in cost_table.iterrows():  
            incident = row[name_col]  
            cost = row[cost_col]  
            if incident and cost and (str(cost).strip().upper() != 'NR'):  
                normalized_incident = normalize_name(incident)  
                cost_lookup[normalized_incident] = str(cost).strip()  
  
        print("\nExtracted", len(cost_lookup), "incident cost entries from the PDF.")  
        print("\nFirst 10 entries in incident -> cost lookup:")  
        count = 0  
        for k, v in cost_lookup.items():  
            print(k, "->", v)  
            count += 1  
            if count >= 10:  
                break  
  
        # -------------------------------------------------------------------------  
        # Now load your CSV, update the cost values for incidents (e.g., for 2008 if that's the case)  
        # Make sure to adjust the 'Year' filter if necessary.  
        # -------------------------------------------------------------------------  
        print("\nLoading CSV file to update missing cost values...")  
        df = pd.read_csv(csv_path)  
        # Adjust the filter below appropriately. For example, if the CSV file contains the year 2008:  
        df_target = df[df['Year'] == 2008]  
        print("Incident records from 2008 with missing cost values:")  
        print(df_target[df_target['Cost'] == 'NR'][['Name', 'Cost']])  
  
        update_count = 0  
        for idx, row in df_target.iterrows():  
            normalized_incident = normalize_name(row['Name'])  
            if normalized_incident in cost_lookup:  
                cost = cost_lookup[normalized_incident]  
                if str(cost).upper() != 'NR' and str(cost).strip() != '':  
                    df.at[idx, 'Cost'] = cost  
                    update_count += 1  
                    print("Updated incident:", row['Name'], "with cost:", cost)  
  
        print("\nTotal incidents updated:", update_count)  
  
        # Save the updated CSV  
        df.to_csv(output_csv, index=False)  
        print("Updated CSV saved to:", output_csv)  

Extracting tables from the 2008 report using pdfplumber...

Total pages in PDF: 76

----- Page 1 -----
Found 0 tables on this page.

----- Page 2 -----
Found 0 tables on this page.

----- Page 3 -----
Found 0 tables on this page.

----- Page 4 -----
Found 0 tables on this page.

----- Page 5 -----
Found 0 tables on this page.

----- Page 6 -----
Found 0 tables on this page.

----- Page 7 -----
Found 0 tables on this page.

----- Page 8 -----
Found 0 tables on this page.

----- Page 9 -----
Found 0 tables on this page.

----- Page 10 -----
Found 0 tables on this page.

----- Page 11 -----
Found 0 tables on this page.

----- Page 12 -----
Found 0 tables on this page.

----- Page 13 -----
Found 0 tables on this page.

----- Page 14 -----
Found 0 tables on this page.

----- Page 15 -----
Found 1 tables on this page.

Table 1 preview (first 5 rows):
                     Name Inc.\nType GACC State Start Date  \
0              Glass Fire         WF   SA    TX  2/25/2008   
1         Klamath T

In [10]:
import pdfplumber  
import pandas as pd  
import re  
  
# Define file paths  
pdf_path = "annual_report.2022.pdf"  
csv_path = "combined_incidents_clean_final_updated_2021_v6.csv"  
output_csv = "combined_incidents_clean_final_updated_manual_2022.csv"  
  
# Function to normalize incident names (convert to string, strip, lower, and reduce whitespace)  
def normalize_name(name):  
    return re.sub(r'\\s+', ' ', str(name).strip().lower())  
  
# Extract the table that includes the estimated cost.  
# You mentioned that the preview output showed two pages with 1 table each,  
# one of which appears to have cost data with a column 'Estimated Cost'.  
# We'll scan through all pages and pick the first table where that column seems to be present.  
cost_lookup = {}  
  
print("Extracting cost data from the 2022 report...")  
with pdfplumber.open(pdf_path) as pdf:  
    for i, page in enumerate(pdf.pages):  
        tables = page.extract_tables()  
        if not tables:  
            continue  
        for table in tables:  
            # Convert to DataFrame if table has headers and at least 2 rows.  
            if len(table) < 2:  
                continue  
            df_temp = pd.DataFrame(table[1:], columns=table[0])  
            # Use lower-case column names for matching.  
            df_temp.columns = [str(col).strip().lower() for col in df_temp.columns]  
            # Check if table has an 'estimated cost' column.  
            possible_cost_cols = [col for col in df_temp.columns if 'cost' in col]  
            if possible_cost_cols:  
                cost_col = possible_cost_cols[0]  # choose the first matching column  
                # We assume the incident name is in a column that includes 'name'  
                possible_name_cols = [col for col in df_temp.columns if 'name' in col]  
                if not possible_name_cols:  
                    continue  
                name_col = possible_name_cols[0]  
                  
                # Build lookup dictionary from normalized incident name to cleaned cost.  
                for idx, row in df_temp.iterrows():  
                    incident = row[name_col]  
                    cost = row[cost_col]  
                    # Clean cost: remove currency symbols and commas, then reformat if desired  
                    if incident and cost:  
                        # Only add if cost looks like a number (has digits)  
                        if any(char.isdigit() for char in str(cost)):  
                            cleaned_cost = str(cost).replace('$', '').replace(',', '').strip()  
                            norm_incident = normalize_name(incident)  
                            cost_lookup[norm_incident] = cleaned_cost  
                print("Extracted", len(cost_lookup), "cost entries from page", i+1)  
                # Uncomment the following if you only expect one table to contain cost info:  
                # break  
        if cost_lookup:  
            break  
  
if not cost_lookup:  
    print("No cost data found in the PDF. Please adjust the extraction logic and column matching.")  
else:  
    print("First few lookup entries:")  
    count = 0  
    for key, value in cost_lookup.items():  
        print(key, "->", value)  
        count += 1  
        if count >= 10:  
            break  
  
# Load the CSV file and update cost values for 2022 records with missing costs (, NR, or empty)  
print("\nLoading CSV file...")  
df = pd.read_csv(csv_path)  
  
# For 2022, we found missing cost values (they are , check output above).  
df_2022 = df[df['Year'] == 2022]  
print("Found", len(df_2022), "records for 2022.")  
missing_cost = df_2022[(df_2022['Cost'].isna()) | (df_2022['Cost'] == 'NR') | (df_2022['Cost'] == '')]  
print("Records with missing costs:", len(missing_cost))  
  
update_count = 0  
for idx, row in df_2022.iterrows():  
    norm_name = normalize_name(row['Name'])  
    if norm_name in cost_lookup:  
        cost = cost_lookup[norm_name]  
        # Update the main CSV DataFrame using index in original df.  
        df.at[row.name, 'Cost'] = cost  
        update_count += 1  
        print("Updated:", row['Name'], "with cost:", cost)  
    else:  
        print("No match for:", row['Name'])  
  
print("\nTotal incidents updated:", update_count)  
  
# Save updated CSV  
df.to_csv(output_csv, index=False)  
print("Updated CSV saved to:", output_csv)  

Extracting cost data from the 2022 report...
Extracted 31 cost entries from page 9
First few lookup entries:
lime complex -> 12726992
hermits peak -> 330100293
black -> 60190000
paradise complex -> 2782896
tatlawiksuk -> 75965
nc hpai 2022 -> 484012
bean complex -> 11656365
hog butte -> 628717
double creek -> 39500000
east fork -> 6150000

Loading CSV file...
Found 40 records for 2022.
Records with missing costs: 40
Updated: Lime Complex with cost: 12726992
Updated: Hermits Peak with cost: 330100293
Updated: Black with cost: 60190000
Updated: Paradise Complex with cost: 2782896
Updated: Tatlawiksuk with cost: 75965
Updated: NC HPAI 2022 with cost: 484012
Updated: Bean Complex with cost: 11656365
No match for: Koktuli River
Updated: Hog Butte with cost: 628717
Updated: Double Creek with cost: 39500000
Updated: East Fork with cost: 6150000
No match for: Poorman Complex
Updated: Moose with cost: 98200000
Updated: Cedar Creek with cost: 133750000
Updated: Aghaluk Mountain with cost: 127269

In [11]:
# Now let's extract the cost data from the 2022 annual report and update our CSV
import pdfplumber
import pandas as pd
import re

# Define file paths
pdf_path = "annual_report.2022.pdf"
csv_path = "combined_incidents_clean_final_updated_manual_2022.csv"
output_csv = "combined_incidents_clean_final_updated_2022_with_costs.csv"

# Function to normalize incident names (convert to string, strip, lower, and reduce whitespace)
def normalize_name(name):
    return re.sub(r'\s+', ' ', str(name).strip().lower())

# Function to clean cost values (remove $ and commas, convert to numeric)
def clean_cost(cost):
    if pd.isna(cost) or cost == '' or str(cost).strip() == '':
        return 'NR'
    cost_str = str(cost).strip()
    if cost_str.upper() == 'NR':
        return 'NR'
    # Remove $ and commas, then add $ back
    cost_str = cost_str.replace('$', '').replace(',', '')
    return '$' + cost_str

# Extract tables from the PDF
cost_lookup = {}
print("Extracting cost data from the 2022 report...")

with pdfplumber.open(pdf_path) as pdf:
    for i, page in enumerate(pdf.pages):
        tables = page.extract_tables()
        if not tables:
            continue
            
        for table_idx, table in enumerate(tables):
            if len(table) < 2:  # Skip tables with no data rows
                continue
                
            # Convert to DataFrame
            df_table = pd.DataFrame(table[1:], columns=table[0])
            
            # Check if this table has cost data
            cost_col = None
            name_col = None
            
            for col in df_table.columns:
                if col and 'cost' in str(col).lower():
                    cost_col = col
                if col and 'name' in str(col).lower():
                    name_col = col
            
            if cost_col and name_col:
                print(f"Found cost data in table {table_idx+1} on page {i+1}")
                print(f"Name column: {name_col}, Cost column: {cost_col}")
                
                # Build lookup dictionary
                for _, row in df_table.iterrows():
                    incident_name = row[name_col]
                    cost_value = row[cost_col]
                    
                    if pd.notna(incident_name) and pd.notna(cost_value) and str(incident_name).strip() != '':
                        norm_name = normalize_name(incident_name)
                        clean_cost_val = clean_cost(cost_value)
                        cost_lookup[norm_name] = clean_cost_val
                        
print(f"Extracted {len(cost_lookup)} incident costs from the PDF")
print("\
Sample of extracted costs:")
count = 0
for name, cost in cost_lookup.items():
    print(f"{name}: {cost}")
    count += 1
    if count >= 10:
        break

# Load the CSV file
df = pd.read_csv(csv_path)
print(f"\
Loaded CSV with {len(df)} records")

# Update costs for 2022 records
update_count = 0
df_2022 = df[df['Year'] == 2022]
print(f"Found {len(df_2022)} records for 2022")

for idx, row in df_2022.iterrows():
    norm_name = normalize_name(row['Name'])
    if norm_name in cost_lookup:
        cost = cost_lookup[norm_name]
        if cost != 'NR':  # Only update if we have a real cost value
            df.at[idx, 'Cost'] = cost
            update_count += 1
            print(f"Updated: {row['Name']} with cost: {cost}")

print(f"\
Total incidents updated with actual costs: {update_count}")

# Save the updated CSV
df.to_csv(output_csv, index=False)
print(f"Updated CSV saved to: {output_csv}")

# Show a sample of the updated 2022 records
print("\
Sample of updated 2022 records:")
updated_df = pd.read_csv(output_csv)
updated_2022 = updated_df[updated_df['Year'] == 2022]
print(updated_2022[['Name', 'Cost']].head(10))

Extracting cost data from the 2022 report...
Found cost data in table 1 on page 9
Name column: Name, Cost column: Estimated Cost
Found cost data in table 1 on page 10
Name column: Name, Cost column: Estimated Cost
Extracted 45 incident costs from the PDF
Sample of extracted costs:
lime complex: $12726992
hermits peak: $330100293
black: $60190000
paradise complex: $2782896
tatlawiksuk: $75965
nc hpai 2022: $484012
bean complex: $11656365
koktuli river: NR
hog butte: $628717
double creek: $39500000
Loaded CSV with 470 records
Found 40 records for 2022
Updated: Lime Complex with cost: $12726992
Updated: Hermits Peak with cost: $330100293
Updated: Black with cost: $60190000
Updated: Paradise Complex with cost: $2782896
Updated: Tatlawiksuk with cost: $75965
Updated: NC HPAI 2022 with cost: $484012
Updated: Bean Complex with cost: $11656365
Updated: Hog Butte with cost: $628717
Updated: Double Creek with cost: $39500000
Updated: East Fork with cost: $6150000
Updated: Moose with cost: $98200

In [13]:
import pdfplumber  
import pandas as pd  
import re  
  
# Define file paths for the 2016 report and input CSV (make sure to use the CSV that contains all the years)  
pdf_path_2016 = "annual_report_2016_508.pdf"  
csv_path = "combined_incidents_clean_final_updated_manual_2022.csv"  # assuming this file contains multiple years  
output_csv = "combined_incidents_clean_final_updated_2016.csv"  
  
# Function to normalize incident names (convert to string, strip, lower, and reduce whitespace)  
def normalize_name(name):  
    return re.sub(r'\s+', ' ', str(name).strip().lower())  
  
# Function to clean cost values (remove $ and commas then add $ back; if blank return 'NR')  
def clean_cost(cost):  
    if pd.isna(cost) or str(cost).strip() == '':  
        return 'NR'  
    cost_str = str(cost).strip()  
    if cost_str.upper() == 'NR':  
        return 'NR'  
    # Remove $ and commas then add $ back  
    clean_val = cost_str.replace('$', '').replace(',', '')  
    return '$' + clean_val  
  
# Extract the cost table from the 2016 PDF  
cost_lookup_2016 = {}  
print("Extracting cost data from the 2016 annual report...")  
with pdfplumber.open(pdf_path_2016) as pdf:  
    for i, page in enumerate(pdf.pages):  
        tables = page.extract_tables()  
        if not tables:  
            continue       
        for table_idx, table in enumerate(tables):  
            if len(table) < 2:  
                continue  
            df_table = pd.DataFrame(table[1:], columns=table[0])  
            # Identify columns for incident names and cost by searching header names (lowercased)  
            cost_col = None  
            name_col = None  
            for col in df_table.columns:  
                if col and 'cost' in str(col).lower():  
                    cost_col = col  
                if col and 'name' in str(col).lower():  
                    name_col = col  
            if cost_col and name_col:  
                print("Found cost data in table " + str(table_idx+1) + " on page " + str(i+1))  
                print("Name column: " + str(name_col) + ", Cost column: " + str(cost_col))  
                # Build lookup dictionary  
                for _, row in df_table.iterrows():  
                    incident_name = row[name_col]  
                    cost_value = row[cost_col]  
                    if pd.notna(incident_name) and pd.notna(cost_value) and str(incident_name).strip() != '':  
                        norm_name = normalize_name(incident_name)  
                        clean_cost_val = clean_cost(cost_value)  
                        cost_lookup_2016[norm_name] = clean_cost_val  
  
print("Extracted", len(cost_lookup_2016), "incident cost entries from the 2016 report.")  
print("\nSample of extracted costs:")  
count = 0  
for name, cost in cost_lookup_2016.items():  
    print(name + ": " + cost)  
    count += 1  
    if count >= 10:  
        break  
  
# Load the CSV file containing records for all years  
df = pd.read_csv(csv_path)  
print("\nLoaded CSV with", len(df), "records.")  
  
# Update cost values for records with Year == 2016 using the lookup dictionary  
update_count = 0  
df_2016 = df[df['Year'] == 2016]  
print("Found", len(df_2016), "records for Year 2016.")  
for idx, row in df_2016.iterrows():  
    norm_name = normalize_name(row['Name'])  
    if norm_name in cost_lookup_2016:  
        cost = cost_lookup_2016[norm_name]  
        if cost != 'NR':  # Update only if valid cost exists  
            df.at[idx, 'Cost'] = cost  
            update_count += 1  
            print("Updated:", row['Name'], "with cost:", cost)  
              
print("\nTotal incidents updated with actual costs for 2016:", update_count)  
  
# After updating, ensure every record has a cost entry: if blank or missing, fill with 'NR'  
def format_cost(cost):  
    if pd.isna(cost) or str(cost).strip() == '':  
        return 'NR'  
    cost_str = str(cost).strip()  
    if cost_str.upper() == 'NR':  
        return 'NR'  
    if not cost_str.startswith('$'):  
        return '$' + cost_str  
    return cost_str  
  
df['Cost'] = df['Cost'].apply(format_cost)  
  
# Save the updated CSV file  
df.to_csv(output_csv, index=False)  
print("Updated CSV saved to:", output_csv)  
  
# Display sample 2016 records  
df_updated = pd.read_csv(output_csv)  
df_2016_updated = df_updated[df_updated['Year'] == 2016]  
print("\nSample of updated 2016 records:")  
print(df_2016_updated[['Name', 'Cost']].head(10))  

Extracting cost data from the 2016 annual report...
Found cost data in table 1 on page 11
Name column: Name, Cost column: Estimated
Cost
Extracted 19 incident cost entries from the 2016 report.

Sample of extracted costs:
anderson creek: $1750000
pioneer: $90000000
range 12: $1700000
soberanes: $262500000
hot pot: $3402259
alatna complex: $1040018
virginia mountains complex: $8000200
dulbi flats: NR
350 complex: $900000
henrys creek: $4320000

Loaded CSV with 470 records.
Found 19 records for Year 2016.
Updated: Anderson Creek with cost: $1750000
Updated: Pioneer with cost: $90000000
Updated: Range 12 with cost: $1700000
Updated: Soberanes with cost: $262500000
Updated: Hot Pot with cost: $3402259
Updated: Alatna Complex with cost: $1040018
Updated: Virginia Mountains Complex with cost: $8000200
Updated: 350 Complex with cost: $900000
Updated: Henrys Creek with cost: $4320000
Updated: Erskine with cost: $23000000
Updated: Chimney with cost: $78300000
Updated: Cedar with cost: $14000000

In [14]:
import pdfplumber
import pandas as pd
import re

# Define file paths for the 2016 report and input CSV
pdf_path_2016 = "annual_report_2016_508.pdf"
csv_path = "combined_incidents_clean_final_updated_2022_with_costs.csv"  # Use the most recent updated CSV
output_csv = "combined_incidents_clean_final_updated_2016_2020.csv"

# Function to normalize incident names (convert to string, strip, lower, and reduce whitespace)
def normalize_name(name):
    return re.sub(r'\s+', ' ', str(name).strip().lower())

# Function to clean cost values (remove $ and commas then add $ back; if blank return 'NR')
def clean_cost(cost):
    if pd.isna(cost) or str(cost).strip() == '':
        return 'NR'
    cost_str = str(cost).strip()
    if cost_str.upper() == 'NR':
        return 'NR'
    # Remove $ and commas then add $ back
    clean_val = cost_str.replace('$', '').replace(',', '')
    return '$' + clean_val

# Extract the cost table from the 2016 PDF
cost_lookup_2016 = {}
print("Extracting cost data from the 2016 annual report...")
with pdfplumber.open(pdf_path_2016) as pdf:
    for i, page in enumerate(pdf.pages):
        tables = page.extract_tables()
        if not tables:
            continue     
        for table_idx, table in enumerate(tables):
            if len(table) < 2:
                continue
            df_table = pd.DataFrame(table[1:], columns=table[0])
            
            # Identify columns for incident names and cost by searching header names
            cost_col = None
            name_col = None
            
            for col in df_table.columns:
                if col and 'cost' in str(col).lower():
                    cost_col = col
                if col and 'name' in str(col).lower():
                    name_col = col
            
            if cost_col and name_col:
                print(f"Found cost data in table {table_idx+1} on page {i+1}")
                print(f"Name column: {name_col}, Cost column: {cost_col}")
                
                # Build lookup dictionary
                for _, row in df_table.iterrows():
                    incident_name = row[name_col]
                    cost_value = row[cost_col]
                    
                    if pd.notna(incident_name) and pd.notna(cost_value) and str(incident_name).strip() != '':
                        norm_name = normalize_name(incident_name)
                        clean_cost_val = clean_cost(cost_value)
                        cost_lookup_2016[norm_name] = clean_cost_val

print(f"Extracted {len(cost_lookup_2016)} incident costs from the 2016 report.")
print("\
Sample of extracted costs from 2016:")
count = 0
for name, cost in cost_lookup_2016.items():
    print(f"{name}: {cost}")
    count += 1
    if count >= 10:
        break

# Now extract from the 2020 PDF
pdf_path_2020 = "annual_report_2020.pdf"
cost_lookup_2020 = {}

print("\
Extracting cost data from the 2020 annual report...")
with pdfplumber.open(pdf_path_2020) as pdf:
    for i, page in enumerate(pdf.pages):
        tables = page.extract_tables()
        if not tables:
            continue     
        for table_idx, table in enumerate(tables):
            if len(table) < 2:
                continue
            df_table = pd.DataFrame(table[1:], columns=table[0])
            
            # Identify columns for incident names and cost by searching header names
            cost_col = None
            name_col = None
            
            for col in df_table.columns:
                if col and 'cost' in str(col).lower():
                    cost_col = col
                if col and 'name' in str(col).lower():
                    name_col = col
            
            if cost_col and name_col:
                print(f"Found cost data in table {table_idx+1} on page {i+1}")
                print(f"Name column: {name_col}, Cost column: {cost_col}")
                
                # Build lookup dictionary
                for _, row in df_table.iterrows():
                    incident_name = row[name_col]
                    cost_value = row[cost_col]
                    
                    if pd.notna(incident_name) and pd.notna(cost_value) and str(incident_name).strip() != '':
                        norm_name = normalize_name(incident_name)
                        clean_cost_val = clean_cost(cost_value)
                        cost_lookup_2020[norm_name] = clean_cost_val

print(f"Extracted {len(cost_lookup_2020)} incident costs from the 2020 report.")
print("\
Sample of extracted costs from 2020:")
count = 0
for name, cost in cost_lookup_2020.items():
    print(f"{name}: {cost}")
    count += 1
    if count >= 10:
        break

# Load the CSV file containing records for all years
df = pd.read_csv(csv_path)
print(f"\
Loaded CSV with {len(df)} records.")

# Update cost values for records with Year == 2016 using the lookup dictionary
update_count_2016 = 0
df_2016 = df[df['Year'] == 2016]
print(f"Found {len(df_2016)} records for Year 2016.")
for idx, row in df_2016.iterrows():
    norm_name = normalize_name(row['Name'])
    if norm_name in cost_lookup_2016:
        cost = cost_lookup_2016[norm_name]
        if cost != 'NR':  # Update only if valid cost exists
            df.at[idx, 'Cost'] = cost
            update_count_2016 += 1
            print(f"Updated 2016: {row['Name']} with cost: {cost}")
            
print(f"\
Total incidents updated with actual costs for 2016: {update_count_2016}")

# Update cost values for records with Year == 2020 using the lookup dictionary
update_count_2020 = 0
df_2020 = df[df['Year'] == 2020]
print(f"Found {len(df_2020)} records for Year 2020.")
for idx, row in df_2020.iterrows():
    norm_name = normalize_name(row['Name'])
    if norm_name in cost_lookup_2020:
        cost = cost_lookup_2020[norm_name]
        if cost != 'NR':  # Update only if valid cost exists
            df.at[idx, 'Cost'] = cost
            update_count_2020 += 1
            print(f"Updated 2020: {row['Name']} with cost: {cost}")
            
print(f"\
Total incidents updated with actual costs for 2020: {update_count_2020}")

# After updating, ensure every record has a cost entry: if blank or missing, fill with 'NR'
def format_cost(cost):
    if pd.isna(cost) or str(cost).strip() == '':
        return 'NR'
    cost_str = str(cost).strip()
    if cost_str.upper() == 'NR':
        return 'NR'
    if not cost_str.startswith('$'):
        return '$' + cost_str
    return cost_str

df['Cost'] = df['Cost'].apply(format_cost)

# Save the updated CSV file
df.to_csv(output_csv, index=False)
print(f"Updated CSV saved to: {output_csv}")

# Display sample of updated records
df_updated = pd.read_csv(output_csv)
print("\
Sample of updated 2016 records:")
df_2016_updated = df_updated[df_updated['Year'] == 2016]
print(df_2016_updated[['Name', 'Cost']].head(10))

print("\
Sample of updated 2020 records:")
df_2020_updated = df_updated[df_updated['Year'] == 2020]
print(df_2020_updated[['Name', 'Cost']].head(10))

Extracting cost data from the 2016 annual report...
Found cost data in table 1 on page 11
Name column: Name, Cost column: Estimated
Cost
Extracted 19 incident costs from the 2016 report.
Sample of extracted costs from 2016:
anderson creek: $1750000
pioneer: $90000000
range 12: $1700000
soberanes: $262500000
hot pot: $3402259
alatna complex: $1040018
virginia mountains complex: $8000200
dulbi flats: NR
350 complex: $900000
henrys creek: $4320000
Extracting cost data from the 2020 annual report...
Found cost data in table 1 on page 10
Name column: Name, Cost column: Estimated
Cost
Found cost data in table 1 on page 11
Name column: Name, Cost column: Estimated
Cost
Extracted 50 incident costs from the 2020 report.
Sample of extracted costs from 2020:
august complex: $115511218
scu lightning complex: $69412351
shf elkhorn: NR
creek: $193000000
lnu lightning complex: $94646381
north complex: $112711950
pearl hill: $4241353
cameron peak: $133300000
lionshead: $65440000
east troublesome: $156

In [15]:
import pdfplumber
import pandas as pd
import re

# Define file paths for the two new reports
pdf_path_2023 = "annual_report_2023_508_0.pdf"
pdf_path_2018 = "annual_report_ 2018_508.pdf"  # Note the space in filename

# Use the latest CSV that we have been updating
csv_path = "combined_incidents_clean_final_updated_2016_2020.csv"
output_csv = "combined_incidents_clean_final_updated_all_years.csv"

# Define helper functions

def normalize_name(name):
    return re.sub(r'\s+', ' ', str(name).strip().lower())


def clean_cost(cost):
    if pd.isna(cost) or str(cost).strip() == '':
        return 'NR'
    cost_str = str(cost).strip()
    if cost_str.upper() == 'NR':
        return 'NR'
    clean_val = cost_str.replace('$', '').replace(',', '')
    return '$' + clean_val

# Function to extract cost lookup from a given PDF

def extract_cost_lookup(pdf_path):
    cost_lookup = {}
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                tables = page.extract_tables()
                if not tables:
                    continue
                for table_idx, table in enumerate(tables):
                    # skip if table has less than 2 rows
                    if len(table) < 2:
                        continue
                    df_table = pd.DataFrame(table[1:], columns=table[0])
                    cost_col = None
                    name_col = None
                    for col in df_table.columns:
                        if col and 'cost' in str(col).lower():
                            cost_col = col
                        if col and 'name' in str(col).lower():
                            name_col = col
                    if cost_col and name_col:
                        # Debug print for which table found
                        print("Found cost data in table " + str(table_idx+1) + " on page " + str(i+1) + " in " + pdf_path)
                        for _, row in df_table.iterrows():
                            incident_name = row[name_col]
                            cost_value = row[cost_col]
                            if pd.notna(incident_name) and pd.notna(cost_value) and str(incident_name).strip() != "":
                                norm_name = normalize_name(incident_name)
                                clean_cost_val = clean_cost(cost_value)
                                cost_lookup[norm_name] = clean_cost_val
    except Exception as e:
        print("Error processing " + pdf_path + ": " + str(e))
    return cost_lookup

# Extract cost data from the 2023 report
print("\
Extracting cost data from the 2023 annual report...")
cost_lookup_2023 = extract_cost_lookup(pdf_path_2023)
print("Extracted " + str(len(cost_lookup_2023)) + " incident costs from the 2023 report.")
print("\
Sample of extracted costs from 2023:")
count = 0
for name, cost in cost_lookup_2023.items():
    print(name + ": " + cost)
    count += 1
    if count >= 10:
        break

# Extract cost data from the 2018 report
print("\
Extracting cost data from the 2018 annual report...")
cost_lookup_2018 = extract_cost_lookup(pdf_path_2018)
print("Extracted " + str(len(cost_lookup_2018)) + " incident costs from the 2018 report.")
print("\
Sample of extracted costs from 2018:")
count = 0
for name, cost in cost_lookup_2018.items():
    print(name + ": " + cost)
    count += 1
    if count >= 10:
        break

# Load CSV file containing records for all years
print("\
Loading CSV from " + csv_path)
df = pd.read_csv(csv_path)
print("Loaded CSV with " + str(len(df)) + " records.")

# Update cost values based on lookup for 2023 and 2018
update_count_2023 = 0
update_count_2018 = 0

# Update records for year 2023
df_2023 = df[df['Year'] == 2023]
print("Found " + str(len(df_2023)) + " records for Year 2023.")
for idx, row in df_2023.iterrows():
    norm_name = normalize_name(row['Name'])
    if norm_name in cost_lookup_2023:
        cost = cost_lookup_2023[norm_name]
        if cost != 'NR':
            df.at[idx, 'Cost'] = cost
            update_count_2023 += 1
            print("Updated 2023: " + row['Name'] + " with cost: " + cost)

# Update records for year 2018
df_2018 = df[df['Year'] == 2018]
print("Found " + str(len(df_2018)) + " records for Year 2018.")
for idx, row in df_2018.iterrows():
    norm_name = normalize_name(row['Name'])
    if norm_name in cost_lookup_2018:
        cost = cost_lookup_2018[norm_name]
        if cost != 'NR':
            df.at[idx, 'Cost'] = cost
            update_count_2018 += 1
            print("Updated 2018: " + row['Name'] + " with cost: " + cost)

print("\
Total incidents updated with actual costs for 2023: " + str(update_count_2023))
print("Total incidents updated with actual costs for 2018: " + str(update_count_2018))

# Ensure every record has a cost entry: fill blank/missing with 'NR'
def format_cost(cost):
    if pd.isna(cost) or str(cost).strip() == '' :
        return 'NR'
    cost_str = str(cost).strip()
    if cost_str.upper() == 'NR':
        return 'NR'
    if not cost_str.startswith('$'):
        return '$' + cost_str
    return cost_str

df['Cost'] = df['Cost'].apply(format_cost)

# Save the updated CSV

df.to_csv(output_csv, index=False)
print("\
Updated CSV saved to: " + output_csv)

# Display sample of updated 2023 and 2018 records
print("\
Sample of updated 2023 records:")
df_updated = pd.read_csv(output_csv)
df_2023_updated = df_updated[df_updated['Year'] == 2023]
print(df_2023_updated[['Name', 'Cost']].head(10))

print("\
Sample of updated 2018 records:")
df_2018_updated = df_updated[df_updated['Year'] == 2018]
print(df_2018_updated[['Name', 'Cost']].head(10))

print("\
Processing complete.")

Extracting cost data from the 2023 annual report...
Extracted 0 incident costs from the 2023 report.
Sample of extracted costs from 2023:
Extracting cost data from the 2018 annual report...
Found cost data in table 1 on page 9 in annual_report_ 2018_508.pdf
Found cost data in table 1 on page 10 in annual_report_ 2018_508.pdf
Extracted 47 incident costs from the 2018 report.
Sample of extracted costs from 2018:
mendocino complex: $220000000
martin: $10000000
rhea: $3800000
south sugarloaf: $20000000
carr: $162289294
klondike: $104500000
camp: $120000000
goose creek: $7850000
spring creek: $35000000
pole creek: $29100000
Loading CSV from combined_incidents_clean_final_updated_2016_2020.csv
Loaded CSV with 470 records.
Found 10 records for Year 2023.
Found 24 records for Year 2018.
Updated 2018: Mendocino Complex with cost: $220000000
Updated 2018: Martin with cost: $10000000
Updated 2018: Rhea with cost: $3800000
Updated 2018: South Sugarloaf with cost: $20000000
Updated 2018: Carr with 

In [16]:
import pdfplumber
import pandas as pd
import re

# Define file paths for the three new reports
pdf_path_2019 = "annual_report_2019_508.pdf"
pdf_path_2021 = "annual_report_2021.pdf"
pdf_path_2024 = "annual_report_2024.pdf"

# Use the latest CSV that we have been updating
csv_path = "combined_incidents_clean_final_updated_all_years.csv"
output_csv = "combined_incidents_clean_final_updated_2019_2021_2024.csv"

# Define helper functions
def normalize_name(name):
    return re.sub(r'\s+', ' ', str(name).strip().lower())

def clean_cost(cost):
    if pd.isna(cost) or str(cost).strip() == '':
        return 'NR'
    cost_str = str(cost).strip()
    if cost_str.upper() == 'NR':
        return 'NR'
    clean_val = cost_str.replace('$', '').replace(',', '')
    return '$' + clean_val

# Function to extract cost lookup from a given PDF
def extract_cost_lookup(pdf_path):
    cost_lookup = {}
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                tables = page.extract_tables()
                if not tables:
                    continue
                for table_idx, table in enumerate(tables):
                    # skip if table has less than 2 rows
                    if len(table) < 2:
                        continue
                    df_table = pd.DataFrame(table[1:], columns=table[0])
                    # Clean column names
                    df_table.columns = [str(c).strip() for c in df_table.columns]
                    
                    cost_col = None
                    name_col = None
                    
                    # Look for columns with cost and name
                    for col in df_table.columns:
                        if col and 'cost' in str(col).lower():
                            cost_col = col
                        if col and 'name' in str(col).lower():
                            name_col = col
                    
                    if cost_col and name_col:
                        # Debug print for which table found
                        print(f"Found cost data in table {table_idx+1} on page {i+1} in {pdf_path}")
                        print(f"Name column: {name_col}, Cost column: {cost_col}")
                        
                        # Build lookup dictionary
                        for _, row in df_table.iterrows():
                            incident_name = row[name_col]
                            cost_value = row[cost_col]
                            
                            if pd.notna(incident_name) and pd.notna(cost_value) and str(incident_name).strip() != "":
                                norm_name = normalize_name(incident_name)
                                clean_cost_val = clean_cost(cost_value)
                                cost_lookup[norm_name] = clean_cost_val
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
    return cost_lookup

# Extract cost data from the 2019 report
print("\
Extracting cost data from the 2019 annual report...")
cost_lookup_2019 = extract_cost_lookup(pdf_path_2019)
print(f"Extracted {len(cost_lookup_2019)} incident costs from the 2019 report.")
print("\
Sample of extracted costs from 2019:")
count = 0
for name, cost in cost_lookup_2019.items():
    print(f"{name}: {cost}")
    count += 1
    if count >= 10:
        break

# Extract cost data from the 2021 report
print("\
Extracting cost data from the 2021 annual report...")
cost_lookup_2021 = extract_cost_lookup(pdf_path_2021)
print(f"Extracted {len(cost_lookup_2021)} incident costs from the 2021 report.")
print("\
Sample of extracted costs from 2021:")
count = 0
for name, cost in cost_lookup_2021.items():
    print(f"{name}: {cost}")
    count += 1
    if count >= 10:
        break

# Extract cost data from the 2024 report
print("\
Extracting cost data from the 2024 annual report...")
cost_lookup_2024 = extract_cost_lookup(pdf_path_2024)
print(f"Extracted {len(cost_lookup_2024)} incident costs from the 2024 report.")
print("\
Sample of extracted costs from 2024:")
count = 0
for name, cost in cost_lookup_2024.items():
    print(f"{name}: {cost}")
    count += 1
    if count >= 10:
        break

# Load CSV file containing records for all years
print(f"\
Loading CSV from {csv_path}")
df = pd.read_csv(csv_path)
print(f"Loaded CSV with {len(df)} records.")

# Update cost values based on lookup for 2019, 2021, and 2024
update_count_2019 = 0
update_count_2021 = 0
update_count_2024 = 0

# Update records for year 2019
df_2019 = df[df['Year'] == 2019]
print(f"Found {len(df_2019)} records for Year 2019.")
for idx, row in df_2019.iterrows():
    norm_name = normalize_name(row['Name'])
    if norm_name in cost_lookup_2019:
        cost = cost_lookup_2019[norm_name]
        if cost != 'NR':
            df.at[idx, 'Cost'] = cost
            update_count_2019 += 1
            print(f"Updated 2019: {row['Name']} with cost: {cost}")

# Update records for year 2021
df_2021 = df[df['Year'] == 2021]
print(f"Found {len(df_2021)} records for Year 2021.")
for idx, row in df_2021.iterrows():
    norm_name = normalize_name(row['Name'])
    if norm_name in cost_lookup_2021:
        cost = cost_lookup_2021[norm_name]
        if cost != 'NR':
            df.at[idx, 'Cost'] = cost
            update_count_2021 += 1
            print(f"Updated 2021: {row['Name']} with cost: {cost}")

# Update records for year 2024
df_2024 = df[df['Year'] == 2024]
print(f"Found {len(df_2024)} records for Year 2024.")
for idx, row in df_2024.iterrows():
    norm_name = normalize_name(row['Name'])
    if norm_name in cost_lookup_2024:
        cost = cost_lookup_2024[norm_name]
        if cost != 'NR':
            df.at[idx, 'Cost'] = cost
            update_count_2024 += 1
            print(f"Updated 2024: {row['Name']} with cost: {cost}")

print(f"\
Total incidents updated with actual costs for 2019: {update_count_2019}")
print(f"Total incidents updated with actual costs for 2021: {update_count_2021}")
print(f"Total incidents updated with actual costs for 2024: {update_count_2024}")

# Ensure every record has a cost entry: fill blank/missing with 'NR'
def format_cost(cost):
    if pd.isna(cost) or str(cost).strip() == '':
        return 'NR'
    cost_str = str(cost).strip()
    if cost_str.upper() == 'NR':
        return 'NR'
    if not cost_str.startswith('$'):
        return '$' + cost_str
    return cost_str

df['Cost'] = df['Cost'].apply(format_cost)

# Save the updated CSV
df.to_csv(output_csv, index=False)
print(f"\
Updated CSV saved to: {output_csv}")

# Display sample of updated records
df_updated = pd.read_csv(output_csv)

print("\
Sample of updated 2019 records:")
df_2019_updated = df_updated[df_updated['Year'] == 2019]
if len(df_2019_updated) > 0:
    print(df_2019_updated[['Name', 'Cost']].head(10))
else:
    print("No records found for 2019")

print("\
Sample of updated 2021 records:")
df_2021_updated = df_updated[df_updated['Year'] == 2021]
if len(df_2021_updated) > 0:
    print(df_2021_updated[['Name', 'Cost']].head(10))
else:
    print("No records found for 2021")

print("\
Sample of updated 2024 records:")
df_2024_updated = df_updated[df_updated['Year'] == 2024]
if len(df_2024_updated) > 0:
    print(df_2024_updated[['Name', 'Cost']].head(10))
else:
    print("No records found for 2024")

print("\
Processing complete.")

Extracting cost data from the 2019 annual report...
Found cost data in table 1 on page 10 in annual_report_2019_508.pdf
Name column: Name, Cost column: Estimated
Cost
Found cost data in table 1 on page 11 in annual_report_2019_508.pdf
Name column: Name, Cost column: Estimated
Cost
Extracted 27 incident costs from the 2019 report.
Sample of extracted costs from 2019:
old grouch top: $61000
frozen calf: $4332806
hess creek: $3005369
swan lake: $48101094
bearnose hill: $2108024
woodbury: $20000000
sheep: $710000
black river: $30000
north river: $40000
tractor trail 2: $461188
Extracting cost data from the 2021 annual report...
Found cost data in table 1 on page 11 in annual_report_2021.pdf
Name column: Name, Cost column: Estimated
Cost
Found cost data in table 1 on page 12 in annual_report_2021.pdf
Name column: Name, Cost column: Estimated
Cost
Extracted 37 incident costs from the 2021 report.
Sample of extracted costs from 2021:
dixie: $30000000
bootleg: $100900000
monument: $163739291
c