In [3]:
# Let's extract the date information from incidents_2021.json and update our CSV
import pandas as pd
import json
from datetime import datetime

# Load the JSON file
json_file = 'incidents_2021.json'
with open(json_file, 'r') as f:
    incidents_2021 = json.load(f)

# Load the CSV file
csv_file = 'combined_incidents_clean_final_updated.csv'
df = pd.read_csv(csv_file)
print(f"Loaded {csv_file} with {len(df)} rows")

# Check how many 2021 incidents are in our CSV
incidents_2021_count = len(df[df['Year'] == 2021])
print(f"Number of 2021 incidents in CSV: {incidents_2021_count}")

# Create a dictionary to map incident names to their dates from the JSON
name_to_date = {}
for incident in incidents_2021:
    # The field name has spaces in it
    name = incident.get('Name', '').strip()
    date_field = 'Contain or Last_Report Date'  # This is the actual field name in the JSON
    date_value = incident.get(date_field, '').strip()
    
    if name and date_value:
        name_to_date[name.lower()] = date_value

print(f"Extracted {len(name_to_date)} date values from {json_file}")
print("Sample of extracted dates:")
sample_items = list(name_to_date.items())[:5]
for name, date in sample_items:
    print(f"{name}: {date}")

# Function to format date with year
def format_date_with_year(date_str, year):
    if not date_str or pd.isna(date_str):
        return None
    
    date_str = str(date_str).strip()
    
    # Handle format like "10/23" by adding the year
    if '/' in date_str and len(date_str) <= 5:
        date_str = f"{date_str}/{year}"
    
    try:
        date_obj = datetime.strptime(date_str, '%m/%d/%Y')
        return date_obj.strftime('%m/%d/%Y')
    except ValueError:
        try:
            # Try other formats
            formats = ['%m/%d/%y', '%d-%b-%y', '%d-%b-%Y']
            for fmt in formats:
                try:
                    date_obj = datetime.strptime(date_str, fmt)
                    return date_obj.strftime('%m/%d/%Y')
                except ValueError:
                    continue
            
            # If all formats fail, try pandas to_datetime
            date_obj = pd.to_datetime(date_str)
            return date_obj.strftime('%m/%d/%Y')
        except:
            print(f"Could not parse date: {date_str}")
            return None

# Update the Last_Report_Date column for 2021 incidents
updated_count = 0
for idx, row in df[df['Year'] == 2021].iterrows():
    name = row['Name'].lower().strip() if isinstance(row['Name'], str) else ''
    
    if name in name_to_date:
        date_value = name_to_date[name]
        formatted_date = format_date_with_year(date_value, 2021)
        
        if formatted_date and (pd.isna(row['Last_Report_Date']) or row['Last_Report_Date'] == ''):
            df.at[idx, 'Last_Report_Date'] = formatted_date
            updated_count += 1

print(f"Updated {updated_count} rows with dates from {json_file}")

# Count remaining null values in Last_Report_Date
null_last_report = df['Last_Report_Date'].isna().sum()
print(f"Remaining null values in Last_Report_Date: {null_last_report}")

# Save the updated CSV
output_file = 'combined_incidents_clean_final_updated_2021.csv'
df.to_csv(output_file, index=False)
print(f"Saved updated CSV to {output_file}")

# Show a sample of the updated 2021 incidents
print("Sample of updated 2021 incidents:")
sample = df[df['Year'] == 2021].head(5)
print(sample[['Name', 'Year', 'Start_Date', 'Last_Report_Date']])

print("Done")

Loaded combined_incidents_clean_final_updated.csv with 470 rows
Number of 2021 incidents in CSV: 24
Extracted 24 date values from incidents_2021.json
Sample of extracted dates:
dixie: 10/23
bootleg: 8/13
monument: 10/25
caldor: 10/20
river complex: 10/24
Updated 24 rows with dates from incidents_2021.json
Remaining null values in Last_Report_Date: 0
Saved updated CSV to combined_incidents_clean_final_updated_2021.csv
Sample of updated 2021 incidents:
              Name    Year  Start_Date Last_Report_Date
315          Dixie  2021.0  07/13/2021       10/23/2021
316        Bootleg  2021.0  07/06/2021       08/13/2021
317       Monument  2021.0  07/31/2021       10/25/2021
318         Caldor  2021.0  08/14/2021       10/20/2021
319  River Complex  2021.0  07/30/2021       10/24/2021
Done


In [5]:
# Convert Last_Report_Date into mm/dd/year format
import pandas as pd
from datetime import datetime

# Load the updated CSV file
csv_file = 'combined_incidents_clean_final_updated_2021.csv'
df = pd.read_csv(csv_file)
print(f"Loaded {csv_file} with {len(df)} rows")

# Define a function to convert a Last_Report_Date value to mm/dd/yyyy
# If the value is just a month name, assume it's the first day of that month and year from Start_Date

def convert_last_report_date(lr_date, start_date):
    # Strip and check value
    if pd.isna(lr_date) or lr_date == '':
        return None
    lr_date_str = str(lr_date).strip()
    
    # Try known date formats
    for fmt in ['%m/%d/%Y', '%m/%d/%y', '%b %d, %Y', '%B %d, %Y']:
        try:
            dt = datetime.strptime(lr_date_str, fmt)
            return dt.strftime('%m/%d/%Y')
        except:
            continue
    
    # Check if the string is just a month name or month abbreviation (e.g., 'August' or 'Aug')
    try:
        # Try to parse with full month name
        dt = datetime.strptime(lr_date_str, '%B')
    except:
        try:
            dt = datetime.strptime(lr_date_str, '%b')
        except Exception as e:
            # If it fails, then return the original value
            return lr_date_str
    
    # If we got the month, then use the year from start_date
    if pd.isna(start_date) or start_date == '':
        year = datetime.now().year
    else:
        # Expect start_date to be in mm/dd/yyyy or similar format.
        try:
            dt_start = datetime.strptime(start_date, '%m/%d/%Y')
            year = dt_start.year
        except Exception as e:
            year = datetime.now().year
    
    # Use the month from dt, day = 1, and the year from start_date
    new_date = datetime(year, dt.month, 1)
    return new_date.strftime('%m/%d/%Y')

# Let's apply this function on our dataframe and update Last_Report_Date

updated_dates = 0
for idx, row in df.iterrows():
    original_lr = row['Last_Report_Date']
    start_date = row['Start_Date']
    new_lr = convert_last_report_date(original_lr, start_date)
    # if conversion returns a different value, update
    if new_lr and new_lr != original_lr:
        df.at[idx, 'Last_Report_Date'] = new_lr
        updated_dates += 1

print(f"Updated Last_Report_Date for {updated_dates} rows")

# Save the updated CSV
output_file = 'combined_incidents_clean_final_updated_2021_v2.csv'
df.to_csv(output_file, index=False)
print(f"Saved updated CSV to {output_file}")

# Show a sample of the updated data
print("Sample rows after converting Last_Report_Date:")
print(df[['Name', 'Start_Date', 'Last_Report_Date']].head(10))

print("Done")

Loaded combined_incidents_clean_final_updated_2021.csv with 470 rows
Updated Last_Report_Date for 10 rows
Saved updated CSV to combined_incidents_clean_final_updated_2021_v2.csv
Sample rows after converting Last_Report_Date:
                 Name  Start_Date Last_Report_Date
0     Buzzard Complex  07/14/2014       09/11/2014
1     Carlton Complex  07/14/2014       08/28/2014
2         Funny River  05/19/2014       08/14/2014
3  Happy Camp Complex  08/14/2014       12/04/2014
4                King  09/13/2014       10/09/2014
5               Skunk  04/19/2014       06/26/2014
6          Big Cougar  08/02/2014       08/22/2014
7        July Complex  08/03/2014       09/25/2014
8       Shaniko Butte  07/12/2014       09/25/2014
9          Glass Fire  02/25/2008       03/02/2008
Done


In [9]:
# Check the current date formats in the Last_Report_Date column
import pandas as pd
import re
from datetime import datetime

# Load the CSV file
csv_file = 'combined_incidents_clean_final_updated_2021_v2.csv'
df = pd.read_csv(csv_file)
print(f"Loaded {csv_file} with {len(df)} rows")

# Check unique formats in Last_Report_Date
unique_formats = set()
for date_str in df['Last_Report_Date'].dropna().unique():
    # Try to identify the format
    if re.match(r'\d{1,2}/\d{1,2}/\d{4}', str(date_str)):
        unique_formats.add('mm/dd/yyyy')
    elif re.match(r'\d{1,2}/\d{1,2}/\d{2}', str(date_str)):
        unique_formats.add('mm/dd/yy')
    elif re.match(r'\d{1,2}-[A-Za-z]{3}-\d{2}', str(date_str)):
        unique_formats.add('dd-MMM-yy')
    elif re.match(r'\d{1,2}-[A-Za-z]{3}-\d{4}', str(date_str)):
        unique_formats.add('dd-MMM-yyyy')
    elif re.match(r'[A-Za-z]{3,9} \d{1,2}, \d{4}', str(date_str)):
        unique_formats.add('Month dd, yyyy')
    else:
        unique_formats.add(f'Other: {date_str}')

print(f"Found {len(unique_formats)} unique date formats:")
for fmt in unique_formats:
    print(f"- {fmt}")

# Sample of dates that might not be in mm/dd/yyyy format
non_standard_dates = []
for idx, row in df.iterrows():
    date_str = str(row['Last_Report_Date'])
    if not pd.isna(row['Last_Report_Date']) and not re.match(r'\d{1,2}/\d{1,2}/\d{4}', date_str):
        non_standard_dates.append((idx, row['Name'], date_str))
        if len(non_standard_dates) >= 10:
            break

print(f"\
Found {len(non_standard_dates)} non-standard date formats (showing up to 10):")
for idx, name, date_str in non_standard_dates:
    print(f"Row {idx}: {name} - {date_str}")

print("\
Done")

Loaded combined_incidents_clean_final_updated_2021_v2.csv with 470 rows
Found 2 unique date formats:
- dd-MMM-yy
- mm/dd/yyyy
Found 10 non-standard date formats (showing up to 10):
Row 122: Long Butte - 3-Sep-10
Row 123: Toklat - 4-Jun-10
Row 124: Jefferson - 17-Jul-10
Row 125: Turquoise Lake - 7-Sep-10
Row 126: Big Mountain - 2-Sep-10
Row 127: Pat Creek - 30-Dec-10
Row 128: Lone Mountain - 22-Dec-10
Row 129: Twitchell Canyon - 18-Nov-10
Row 130: Bull Creek - 25-Jun-10
Row 402: Wallow - 8-Jul-11
Done


In [11]:
# Convert all dates in Last_Report_Date column to mm/dd/yyyy format using pandas to_datetime
import pandas as pd
from datetime import datetime

# Load the CSV file
csv_file = 'combined_incidents_clean_final_updated_2021_v2.csv'
df = pd.read_csv(csv_file)
print(f"Loaded {csv_file} with {len(df)} rows")

# Function to convert a date string to mm/dd/yyyy using pd.to_datetime

def convert_to_mmddyyyy(date_str):
    try:
        # Some dates are in non-standard format, let pandas infer
        dt = pd.to_datetime(date_str, infer_datetime_format=True, errors='coerce')
        if pd.isna(dt):
            return date_str
        return dt.strftime('%m/%d/%Y')
    except Exception as e:
        return date_str

# Apply conversion to Last_Report_Date column
# We'll update the column using the conversion function for every non-null entry

df['Last_Report_Date'] = df['Last_Report_Date'].apply(lambda x: convert_to_mmddyyyy(x) if pd.notna(x) and x != '' else x)

# Verify conversion by printing some rows with non-standard values
print("Sample of converted Last_Report_Date values:")
sample_non_standard = df.loc[df['Last_Report_Date'].apply(lambda x: not pd.isna(x) and not pd.Series(x).str.match(r'\d{1,2}/\d{1,2}/\d{4}').bool()), ['Name', 'Last_Report_Date']].head(10)
print(sample_non_standard.to_string(index=False))

# Save the updated CSV
output_file = 'combined_incidents_clean_final_updated_2021_v3.csv'
df.to_csv(output_file, index=False)
print(f"\
Saved updated CSV to {output_file}")

print("Done")

Loaded combined_incidents_clean_final_updated_2021_v2.csv with 470 rows
Sample of converted Last_Report_Date values:


  dt = pd.to_datetime(date_str, infer_datetime_format=True, errors='coerce')
  sample_non_standard = df.loc[df['Last_Report_Date'].apply(lambda x: not pd.isna(x) and not pd.Series(x).str.match(r'\d{1,2}/\d{1,2}/\d{4}').bool()), ['Name', 'Last_Report_Date']].head(10)


Empty DataFrame
Columns: [Name, Last_Report_Date]
Index: []
Saved updated CSV to combined_incidents_clean_final_updated_2021_v3.csv
Done


In [13]:
# Update the CSV with cost information from incidents_2017.json
import json
import pandas as pd

# Load the incidents_2017.json file
json_file = 'incidents_2017.json'
with open(json_file, 'r') as f:
    incidents_2017 = json.load(f)

# Load the CSV file
csv_file = 'combined_incidents_clean_final_updated_2021_v4.csv'
df = pd.read_csv(csv_file)

# Create a dictionary mapping incident names to their estimated costs
cost_dict = {}
for incident in incidents_2017:
    name = incident.get('Name', '').strip()
    # Handle the case where name might have newlines or other whitespace
    name = ' '.join(name.split())
    # Replace spaces with underscores to match CSV format
    name = name.replace(' ', '_')
    
    estimated_cost = incident.get('Estimated Cost', '')
    if estimated_cost and estimated_cost != 'NR':
        cost_dict[name] = estimated_cost

print(f"Found cost information for {len(cost_dict)} incidents in {json_file}")
print("Sample of cost information:")
for i, (name, cost) in enumerate(list(cost_dict.items())[:5]):
    print(f"  {name}: {cost}")

# Update the Cost column in the CSV for 2017 incidents
updated_count = 0
for name, cost in cost_dict.items():
    # Find matching rows in the dataframe (2017 incidents with matching name)
    mask = (df['Year'] == 2017) & (df['Name'] == name)
    if mask.any():
        df.loc[mask, 'Cost'] = cost
        updated_count += 1
    else:
        # Try a more flexible match (in case of slight name differences)
        for csv_name in df.loc[df['Year'] == 2017, 'Name'].unique():
            if name.lower() in csv_name.lower() or csv_name.lower() in name.lower():
                mask = (df['Year'] == 2017) & (df['Name'] == csv_name)
                df.loc[mask, 'Cost'] = cost
                updated_count += 1
                print(f"  Matched {name} to {csv_name}")
                break

print(f"Updated cost information for {updated_count} incidents in the CSV")

# Check how many 2017 incidents still have null Cost
df_2017 = df[df['Year'] == 2017]
null_cost_count = df_2017['Cost'].isna().sum()
print(f"Number of 2017 incidents with null Cost after update: {null_cost_count}")

# Show the updated 2017 incidents from the CSV
print("\
Sample of updated 2017 incidents from CSV:")
print(df_2017[['Name', 'Year', 'Cost']].head(10))

# Save the updated CSV
output_file = 'combined_incidents_clean_final_updated_2021_v5.csv'
df.to_csv(output_file, index=False)
print(f"\
Saved updated CSV to {output_file}")

print("\
Done")

Found cost information for 27 incidents in incidents_2017.json
Sample of cost information:
  NW_Oklahoma_Complex: 3,200,000
  Thomas: 123,836,000
  Lodgepole_Complex: 9,800,000
  Roosters_Comb: 4,000,000
  Chetco_Bar: 72,000,000
Updated cost information for 7 incidents in the CSV
Number of 2017 incidents with null Cost after update: 23
Sample of updated 2017 incidents from CSV:
                     Name    Year         Cost
32   NW Oklahoma\nComplex  2017.0          NaN
33               Perryton  2017.0          NaN
34                 Thomas  2017.0  123,836,000
35     Lodgepole\nComplex  2017.0          NaN
36          Roosters Comb  2017.0          NaN
37             Chetco Bar  2017.0          NaN
38             Rice Ridge  2017.0          NaN
39  Four Seasons\nComplex  2017.0          NaN
40              West Mims  2017.0          NaN
41            Lefors East  2017.0          NaN
Saved updated CSV to combined_incidents_clean_final_updated_2021_v5.csv
Done


Collecting tabula-py
  Downloading tabula_py-2.10.0-py3-none-any.whl.metadata (7.6 kB)
Downloading tabula_py-2.10.0-py3-none-any.whl (12.0 MB)
   ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/12.0 MB 656.4 kB/s eta 0:00:19
   - -------------------------------------- 0.3/12.0 MB 2.1 MB/s eta 0:00:06
   -- ------------------------------------- 0.8/12.0 MB 4.4 MB/s eta 0:00:03
   --------- ------------------------------ 2.8/12.0 MB 11.8 MB/s eta 0:00:01
   -------------------- ------------------- 6.2/12.0 MB 21.9 MB/s eta 0:00:01
   ---------------------------------- ----- 10.3/12.0 MB 38.5 MB/s eta 0:00:01
   ---------------------------------------  12.0/12.0 MB 81.8 MB/s eta 0:00:01
   ---------------------------------------  12.0/12.0 MB 81.8 MB/s eta 0:00:01
   ------------------

FileNotFoundError: [Errno 2] No such file or directory: 'annual_report_2017_508_0.pdf'