In [31]:
import os
import numpy as np
import pandas as pd
import re

In [165]:
# Directory path where the Excel files are located
directory = 'Resources\CAL_FireStats'
pattern = r"\b\d{4}-wildfire-activity-stats\.xlsx$"
#pattern = r"2021-wildfire-activity-stats\.xlsx$"

# Initialize an empty DataFrame to store the extracted data
extracted_data = []
# Loop through each file in the directory
for filename in os.listdir(directory):
    if re.match(pattern, filename):
        # Extract the year from the filename
        year = int(filename[:4])

        # Process the file
        #print(f"Processing file: {filename}")
        
        # Construct the full file path
        file_path = os.path.join(directory, filename)

        # Read the Excel file into a DataFrame
        xls = pd.ExcelFile(file_path)
        
        # Loop through each tab in the Excel file
        for sheet_name in xls.sheet_names:
            tab = pd.read_excel(xls, sheet_name,header=None).astype(str)
            #print(f"Processing tab: {sheet_name}")
                
            # Check if the DataFrame is not empty
            if not tab.empty:
                # Check for the conditions before proceeding with extraction
               first_row = tab.iloc[0, 2:].values.tolist()
               #print(first_row)

            # Check for the presence of the dollar sign ($) character in the tab
            if any('$' in item for row in tab.values for item in row):
                # Check for the presence of "Arson", "Campfire", "Lightning" in the non-empty values starting from the third one
                valid_values = ["Arson", "Campfire", "Lightning", "Total","DebrisBurning","Equip.Use","Ltng.","Misc.","Power-line","P-W-F","Rail-road","Smoking","Undet.", "Vehicle"]
                
                # Preprocess the values in first_row by removing whitespace characters
                first_row_cleaned = [value.strip() for value in first_row]

                # Define the desired column index in data_to_append for the dollar values
                dollar_target_column_index = 1  # For example, placing the dollar values in the second column of data_to_append

                if any(value in valid_values for value in first_row_cleaned):
                     dollar_column = tab.columns[tab.apply(lambda col: col.astype(str).str.contains('\$', na=False).any())]
                     # Convert the NumPy array to a Pandas Series
                     dollar_column_series = pd.Series(dollar_column)
                     # Ensure the data in dollar_column_series is treated as strings
                     dollar_column_series = dollar_column_series.astype(str)
                     
                     # Repeat the corresponding dollar values for each split value in the county_column
                     dollar_values_split = dollar_column_series.str.split('\n').explode()

                     # Reset the index of dollar_values_split and county_column_split
                     dollar_values_split = dollar_values_split.reset_index(drop=True)
    
                     # Extract the county column from the tab DataFrame
                     county_column = tab.iloc[:, 0]

                     # Split multi-line values in the county_column
                     county_column_split = county_column.str.split('\n').explode()

                     # Reset the index of dollar_values_split and county_column_split
                     county_column_split = county_column_split.reset_index(drop=True)

                     # Create a DataFrame with the split values
                     data_to_append = pd.concat([dollar_values_split.to_frame(name='DollarAmount'),\
                                                  county_column_split.to_frame(name='County'), pd.Series(year, name='year')], axis=1)
                     # Reassign the dollar values to the desired column index in data_to_append
                     data_to_append.insert(dollar_target_column_index, 'DollarAmount', data_to_append.pop('DollarAmount'))

                     # Check if the required columns are found
                     if dollar_column.size > 0:
                        # Append the data to the extracted_data list
                        extracted_data.append(data_to_append)
                #       print("Data extracted and appended successfully.")
                    #else:
                #        print("Required columns not found in the tab.")
                #else:
               #     print("Non-empty values starting from the third one do not match the specified list.")
            #else:
              #  print("Dollar sign ($) character not found in the tab.")
            
# Check if data is being appended to the extracted_data list
if extracted_data:
    # Concatenate the extracted data into a DataFrame
    extracted_data_df = pd.concat(extracted_data, ignore_index=True)

    # Rename the columns

    # check column names
    #print(extracted_data_df.columns)
    extracted_data_df.rename(columns={0: 'County', 'DollarAmount': 'Tot_Damage', 'year': 'Year'}, inplace=True)

    # List of county names to filter
    county_names = ["Alameda","Alpine", "Amador", "Butte","Calaveras","Colusa","Contra Costa","Del Norte","El Dorado","Fresno",\
    "Glenn","Humboldt","Imperial","Inyo","Kern","Kings","Lake","Lassen","Los Angeles","Madera","Marin",\
        "Mariposa","Mendocino","Merced","Modoc","Mono","Monterey","Napa","Nevada","Orange","Placer","Plumas",\
            "Riverside","Sacramento","San Benito","San Bernardino","San Diego","San Francisco","San Joaquin",\
            "San Luis Obispo","San Mateo","Santa Barbara","Santa Clara","Santa Cruz","Shasta","Sierra","Siskiyou",\
            "Solano","Sonoma","Stanislaus","Sutter","Tehama","Trinity","Tulare","Tuolumne","Ventura","Yolo","Yuba"]

    # Filter the county column
    filtered_data = extracted_data_df[extracted_data_df['County'].isin(county_names)]
    #Save the values from the county column that do not match any of the ones in the county names list:
    # Save the unmatched county values
    unmatched_data = extracted_data_df[~extracted_data_df['County'].isin(county_names)]

    # Convert the Tot_Damage column to numeric values, handling errors with 'coerce'
    filtered_data['Tot_Damage'] = pd.to_numeric(filtered_data['Tot_Damage'].str.replace('[\$,]', '', regex=True), errors='coerce')
    filtered_data.sort_values(['Year','County','Tot_Damage'])
    filtered_data = filtered_data.drop_duplicates()
    unmatched_data['Tot_Damage'] = pd.to_numeric(unmatched_data['Tot_Damage'].str.replace('[\$,]', '', regex=True), errors='coerce')
    
    summary_unmatched = unmatched_data.groupby('County').agg({'Tot_Damage': 'sum', 'County': 'count'}).\
        rename(columns={'County': 'Frequency'}).sort_values(['Tot_Damage', 'Frequency'], ascending=[False, False])
    print(summary_unmatched)
    summary_unmatched.to_csv('Outputs\\unmatched_DD_rows.csv', index=True)
    
    # Display the extracted data
    #print("Extracted Data:")
    #print(extracted_data_df.tail(5))

    # Export the extracted data DataFrame to a CSV file
    filtered_data.to_csv('Outputs\extracted_data.csv', index=False)

    summary_matched = filtered_data.groupby('Year').agg({'County': 'count'}).\
        rename(columns={'County': 'Frequency'}).sort_values(['Year'])
    print(summary_matched)
else:
    print("No data extracted. Check the conditions for data extraction.")

           

                                    Tot_Damage  Frequency
County                                                   
nan                                     3891.0        262
E. San Joaquin                           120.0          8
E. Trinity                               117.0          8
Shasta-Trinity                           113.0          9
Sonoma-Lake-Napa                          91.0          8
...                                        ...        ...
NORTHERN REGION TOTAL                      0.0          1
SOUTHERN REGION TOTAL                      0.0          1
San Mateo-                                 0.0          1
Shaded areas represent unit totals         0.0          1
Sonoma-Lake-                               0.0          1

[61 rows x 2 columns]
Empty DataFrame
Columns: [Frequency]
Index: []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Tot_Damage'] = pd.to_numeric(filtered_data['Tot_Damage'].str.replace('[\$,]', '', regex=True), errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_data['Tot_Damage'] = pd.to_numeric(unmatched_data['Tot_Damage'].str.replace('[\$,]', '', regex=True), errors='coerce')
