In [1]:
# Let's examine the original Last_Report_Date column to understand the format
import pandas as pd
import numpy as np

# Reload the file
df = pd.read_csv('combined_incidents.csv', encoding='utf-8')

# Look at some examples of Last_Report_Date
print("Sample values from Last_Report_Date column:")
print(df['Last_Report_Date'].dropna().head(10).tolist())

# Check if there are any patterns in the date format
print("\
Unique formats in Last_Report_Date column:")
formats = df['Last_Report_Date'].dropna().unique()
print(formats[:20])  # Show first 20 unique values

Sample values from Last_Report_Date column:
['9/11', '8/28', '8/14', '12/4', '10/9', '6/26', '8/22', '9/25', '9/25', '3/24']
Unique formats in Last_Report_Date column:
['9/11' '8/28' '8/14' '12/4' '10/9' '6/26' '8/22' '9/25' '3/24' '3/13'
 'Active\ninto\n2018' '8/11' '7/24' '10/26' '10/17' '7/25' '6/29' '3/10'
 '10/5' '10/10']


In [3]:
# Let's check the first row where we have a Last_Report_Date but Last_Report_Date_Clean is null
import pandas as pd
import numpy as np
from datetime import datetime

# Reload the file
df = pd.read_csv('combined_incidents.csv', encoding='utf-8')

# Create a clean dataframe with the original columns
df_clean = df.copy()

# Let's look at the first few rows to understand the issue
print("Original Last_Report_Date values:")
print(df['Last_Report_Date'].head(10))

# The issue is that dates like '9/11' don't have a year component
# We need to add the year from the 'Year' column

# Function to parse and standardize dates
def parse_date(date_str, year=None):
    if pd.isnull(date_str) or date_str == 'None':
        return np.nan
    
    # Handle special cases
    if 'Active' in str(date_str):
        return np.nan
    
    try:
        # Try to parse with various formats
        date_str = str(date_str).strip()
        
        # Check if it's just month/day format (e.g., '9/11')
        if '/' in date_str and len(date_str.split('/')) == 2:
            month, day = date_str.split('/')
            if year is not None:
                # Create a date with the provided year
                return f"{day.zfill(2)}/{month.zfill(2)}/{int(year)}"
        
        # Try to parse with datetime to standardize
        try:
            # Try different formats
            for fmt in ['%d-%b-%y', '%d/%m/%Y', '%d/%m/%y', '%d-%m-%Y', '%Y-%m-%d']:
                try:
                    dt = datetime.strptime(date_str, fmt)
                    return f"{dt.day:02d}/{dt.month:02d}/{dt.year}"
                except:
                    continue
        except:
            pass
        
        # If we couldn't parse it, return as is
        return date_str
    except:
        return date_str

# Apply the function to Start_Date and Last_Report_Date
df_clean['Start_Date_Clean'] = df.apply(
    lambda row: parse_date(row['Start_Date'], row['Year']), axis=1
)

df_clean['Last_Report_Date_Clean'] = df.apply(
    lambda row: parse_date(row['Last_Report_Date'], row['Year']), axis=1
)

# For Contain_Control_Date, also use the Year column
df_clean['Contain_Control_Date_Clean'] = df.apply(
    lambda row: parse_date(row['Contain_Control_Date'], row['Year']), axis=1
)

# Clean the Size_Acres column
def clean_size(x):
    try:
        if pd.isnull(x):
            return np.nan
        # Remove commas
        x = str(x).replace(',', '')
        return float(x)
    except:
        return np.nan

df_clean['Size_Acres_Clean'] = df['Size_Acres'].apply(clean_size)

# Clean the Cost columns
def clean_cost(x):
    try:
        if pd.isnull(x) or x in ['NR', 'None']:
            return np.nan
        # Remove $ and commas
        x = str(x).replace('$', '').replace(',', '')
        return float(x)
    except:
        return np.nan

df_clean['Cost_Clean'] = df['Cost'].apply(clean_cost)
df_clean['Estimated_Cost_Clean'] = df['Estimated_Cost'].apply(clean_cost)

# Show the results
print("\
Cleaned date columns (first 10 rows):")
print("Start_Date_Clean:")
print(df_clean['Start_Date_Clean'].head(10))
print("\
Last_Report_Date_Clean:")
print(df_clean['Last_Report_Date_Clean'].head(10))
print("\
Contain_Control_Date_Clean:")
print(df_clean['Contain_Control_Date_Clean'].head(10))

Original Last_Report_Date values:
0    9/11
1    8/28
2    8/14
3    12/4
4    10/9
5    6/26
6    8/22
7    9/25
8    9/25
9     NaN
Name: Last_Report_Date, dtype: object
Cleaned date columns (first 10 rows):
Start_Date_Clean:
0    14/07/2014
1    14/07/2014
2    19/05/2014
3    14/08/2014
4    13/09/2014
5    19/04/2014
6    02/08/2014
7    03/08/2014
8    12/07/2014
9     2/25/2008
Name: Start_Date_Clean, dtype: object
Last_Report_Date_Clean:
0    11/09/2014
1    28/08/2014
2    14/08/2014
3    04/12/2014
4    09/10/2014
5    26/06/2014
6    22/08/2014
7    25/09/2014
8    25/09/2014
9           NaN
Name: Last_Report_Date_Clean, dtype: object
Contain_Control_Date_Clean:
0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
5           NaN
6           NaN
7           NaN
8           NaN
9    03/02/2008
Name: Contain_Control_Date_Clean, dtype: object


In [8]:
# Reload the original CSV and convert the dates to mm/dd/yyyy
import pandas as pd
import numpy as np
from datetime import datetime

# Reload the original file
df_original = pd.read_csv('combined_incidents.csv', encoding='utf-8')

# Function to parse and standardize dates in mm/dd/yyyy format

def parse_date_mmddyyyy(date_str, year=None):
    if pd.isnull(date_str) or date_str == 'None':
        return np.nan
    
    # Remove extra spaces
    date_str = str(date_str).strip()
    
    # Handle special case for 'Active' present content
    if 'Active' in date_str:
        if year is not None:
            return f"12/31/{int(year)}"  # placeholder end of year
        return np.nan
    
    try:
        # Check if it's just month/day format (e.g., '9/11')
        if '/' in date_str and len(date_str.split('/')) == 2:
            month, day = date_str.split('/')
            if year is not None:
                return f"{month.zfill(2)}/{day.zfill(2)}/{int(year)}"
            else:
                return f"{month.zfill(2)}/{day.zfill(2)}/1900"
        
        # Handle dash-separated format with a month abbreviation, e.g., "14-Jul-14"
        if '-' in date_str:
            try:
                dt = datetime.strptime(date_str, '%d-%b-%y')
                return f"{dt.month:02d}/{dt.day:02d}/{dt.year}"
            except:
                pass
        
        # Try parsing with common formats
        for fmt in ['%m/%d/%Y','%m/%d/%y', '%d/%m/%Y', '%d/%m/%y','%m-%d-%Y','%Y-%m-%d']:
            try:
                dt = datetime.strptime(date_str, fmt)
                return f"{dt.month:02d}/{dt.day:02d}/{dt.year}"
            except Exception as e:
                continue
        
        # If not parsed, try to manually combine with year if available
        if year is not None and '/' in date_str:
            parts = date_str.split('/')
            if len(parts)==2:
                month, day = parts
                return f"{month.zfill(2)}/{day.zfill(2)}/{int(year)}"
        
        return date_str
    except Exception as e:
        return date_str

# Create a new dataframe to store the date converted results, starting from original
# Keeping the original columns
df_mmdd = df_original.copy()

# Convert date columns using the new function with mm/dd/yyyy formatting

# For Start_Date
if 'Start_Date' in df_mmdd.columns:
    df_mmdd['Start_Date'] = df_mmdd.apply(lambda row: parse_date_mmddyyyy(row['Start_Date'], row['Year'] if 'Year' in row else None), axis=1)

# For Last_Report_Date
if 'Last_Report_Date' in df_mmdd.columns:
    df_mmdd['Last_Report_Date'] = df_mmdd.apply(lambda row: parse_date_mmddyyyy(row['Last_Report_Date'], row['Year'] if 'Year' in row else None), axis=1)

# For 'Contain_Control_Date' if present
if 'Contain_Control_Date' in df_mmdd.columns:
    df_mmdd['Contain_Control_Date'] = df_mmdd.apply(lambda row: parse_date_mmddyyyy(row['Contain_Control_Date'], row['Year'] if 'Year' in row else None), axis=1)

# Show a preview of the modified dataframe
print('Modified original combined_incidents.csv with dates in mm/dd/yyyy format (first 10 rows):')
print(df_mmdd.head(10))

# Save the modified dataframe to a new CSV file
filename = 'combined_incidents_mmdd_format.csv'
df_mmdd.to_csv(filename, index=False)
print('\
Modified file saved as ' + filename)

Modified original combined_incidents.csv with dates in mm/dd/yyyy format (first 10 rows):
                 Name GACC State  Start_Date Last_Report_Date Size_Acres  \
0     Buzzard Complex   NW    OR  07/14/2014       09/11/2014    395,747   
1     Carlton Complex   NW    WA  07/14/2014       08/28/2014    256,108   
2         Funny River   AK    AK  05/19/2014       08/14/2014    195,858   
3  Happy Camp Complex   NO    CA  08/14/2014       12/04/2014    134,056   
4                King   NO    CA  09/13/2014       10/09/2014     97,717   
5               Skunk   SW    AZ  04/19/2014       06/26/2014     73,622   
6          Big Cougar   NR    ID  08/02/2014       08/22/2014     65,227   
7        July Complex   NO    CA  08/03/2014       09/25/2014     50,042   
8       Shaniko Butte   NW    OR  07/12/2014       09/25/2014     42,044   
9          Glass Fire   SA    TX  02/25/2008              NaN    219,556   

  Cause          Cost Inc_Type Contain_Control_Date    Year Estimated_Cos

In [10]:
# Let's fix the issue with the 2024 rows by moving the data from the extra columns to the standard columns
import pandas as pd
import numpy as np

# Load the file
df = pd.read_csv('combined_incidents_mmdd_format.csv')

# Function to convert date format for 2024 rows
def convert_date_format(date_str, year):
    if pd.isnull(date_str) or date_str == 'None':
        return np.nan
    
    date_str = str(date_str).strip()
    
    # Check if it's just month/day format (e.g., '2/26')
    if '/' in date_str and len(date_str.split('/')) == 2:
        month, day = date_str.split('/')
        return f"{month.zfill(2)}/{day.zfill(2)}/{int(year)}"
    
    return date_str

# Function to clean size acres
def clean_size_acres(size_str):
    if pd.isnull(size_str) or size_str == 'None':
        return np.nan
    
    # Remove commas and convert to float
    try:
        return float(str(size_str).replace(',', ''))
    except:
        return np.nan

# Process 2024 rows
mask_2024 = df['Year'] == 2024

# Move data from extra columns to standard columns for 2024 rows
df.loc[mask_2024, 'Start_Date'] = df.loc[mask_2024].apply(
    lambda row: convert_date_format(row['Start Date'], row['Year']), axis=1
)

df.loc[mask_2024, 'Last_Report_Date'] = df.loc[mask_2024].apply(
    lambda row: convert_date_format(row['Last Report Date'], row['Year']), axis=1
)

df.loc[mask_2024, 'Size_Acres'] = df.loc[mask_2024]['Size In Acres'].apply(clean_size_acres)

# Show the updated 2024 rows
print("Updated 2024 rows (first 10):")
print(df.loc[mask_2024].head(10))

# Save the updated dataframe
df.to_csv('combined_incidents_fixed.csv', index=False)
print("\
Updated data saved to 'combined_incidents_fixed.csv'")

# Let's also create a clean version with only the essential columns
df_clean = df[['Name', 'GACC', 'State', 'Start_Date', 'Last_Report_Date', 'Size_Acres', 'Cause', 'Cost', 'Inc_Type', 'Contain_Control_Date', 'Year']]

# Save the clean version
df_clean.to_csv('combined_incidents_clean.csv', index=False)
print("\
Clean version with essential columns saved to 'combined_incidents_clean.csv'")

# Show the clean version
print("\
Clean version with essential columns (first 10 rows):")
print(df_clean.head(10))

Updated 2024 rows (first 10):
                   Name GACC State  Start_Date Last_Report_Date Size_Acres  \
339         Betty's Way   RM    NE  02/26/2024       03/11/2024    69810.0   
340    Smokehouse Creek   SA    TX  02/26/2024       03/17/2024  1054153.0   
341             Catesby   SA    OK  02/27/2024       03/15/2024    89688.0   
342            McDonald   AK    AK  06/08/2024       07/20/2024   152227.0   
343            Midnight   AK    AK  06/19/2024       07/09/2024    52550.0   
344  Grapefruit Complex   AK    AK  06/28/2024       07/13/2024    89011.0   
345               Falls   NW    OR  07/10/2024       08/21/2024   151689.0   
346          Cow Valley   NW    OR  07/11/2024       08/09/2024   133490.0   
347           Lone Rock   NW    OR  07/13/2024       08/31/2024   137222.0   
348            Boneyard   NW    OR  07/17/2024       07/25/2024    49716.0   

    Cause Cost Inc_Type Contain_Control_Date    Year Estimated_Cost  \
339     U  NaN      NaN                 

Loaded combined_incidents_clean.csv
Error loading incidents_2012_updated.json: File incidents_2012_updated.json does not exist


FileNotFoundError: File incidents_2012_updated.json does not exist