In [3]:
import os
import numpy as np
import pandas as pd
import re

In [6]:
# Directory path where the Excel files are located, and filename pattern
directory = 'Resources\CAL_FireStats'
pattern = r"\b\d{4}-wildfire-activity-stats\.xlsx$"

# Empty DataFrame to store the extracted data
extracted_data = []

# Process each year's data in the directory
for filename in os.listdir(directory):
    if re.match(pattern, filename):
        # Extract the year from the filename
        year = int(filename[:4])

        # Construct the full file path and read file into a DataFrame
        file_path = os.path.join(directory, filename)
        xls = pd.ExcelFile(file_path)
        
        # Process tabs in the Excel file, looking for the 1st sheet with "$" character, which is where the Damages data are located
        for sheet_name in xls.sheet_names:
            tab = pd.read_excel(xls, sheet_name,header=None).astype(str)
                            
            # Check if the DataFrame is not empty
            if not tab.empty:
                first_row = tab.iloc[0, 2:].values.tolist()

            # Check for the presence of the dollar sign ($) character in the tab
            if any('$' in item for row in tab.values for item in row):
                # Look for the indicators of the tabs containing by-county damages data
                valid_values = ["Arson", "Campfire", "Lightning", "Total","DebrisBurning","Equip.Use","Ltng.","Misc.","Power-line","P-W-F","Rail-road","Smoking","Undet.", "Vehicle"]
                first_row_cleaned = [value.strip() for value in first_row]

                # Define the desired column index in data_to_append for the dollar values
                dollar_target_column_index = 1  # ()

                if any(value in valid_values for value in first_row_cleaned):
                    # Find the column with the dollar sign ($)
                    dollar_column = tab.columns[tab.apply(lambda col: col.astype(str).str.contains('\$', na=False).any())]
                    print("Dollar column:", dollar_column)
                
                    # Extract data from the tab
                    # Convert the Index objects to numpy arrays before indexing
                    dollar_column = tab.columns.to_numpy()[tab.apply(lambda col: col.astype(str).str.contains('\$', na=False).any())]
                    print("$ columns np: ", dollar_column)

                    # Extract the county column from the tab DataFrame
                    county_column = tab.iloc[:, 0]

                    # Create a DataFrame with the extracted columns and the assigned year
                    data_to_append = pd.concat([tab.loc[:, dollar_column[0]].to_frame(), county_column.to_frame(), pd.Series(year, index=tab.index, name='year')], axis=1)
                    
                    # Reassign the dollar values to the desired column index in data_to_append
                    data_to_append.insert(dollar_target_column_index, 'DollarAmount', data_to_append.pop(dollar_column[0]))

                    # Check if the required columns are found
                    if dollar_column.size > 0:
                        # Append the data to the extracted_data list
                        extracted_data.append(data_to_append)
# DEBUGGING CODE PRESERVED FOR FUTURE CONVENIENCE
                 #       print("Data extracted and appended successfully.")
                    #else:
                #        print("Required columns not found in the tab.")
                #else:
               #     print("Non-empty values starting from the third one do not match the specified list.")
            #else:
              #  print("Dollar sign ($) character not found in the tab.")
            
# Ensure extracted data is appended to the extracted_data list
if extracted_data:
    # Concatenate the extracted data into a DataFrame
    extracted_data_df = pd.concat(extracted_data, ignore_index=True)

    # check column names
    print(extracted_data_df.columns)
    extracted_data_df.rename(columns={0: 'County', 'DollarAmount': 'Tot_Damage', 'year': 'Year'}, inplace=True)

    # List of county names to filter
    county_names = ["Alameda","Alpine", "Amador", "Butte","Calaveras","Colusa","Contra Costa","Del Norte","El Dorado","Fresno",\
    "Glenn","Humboldt","Imperial","Inyo","Kern","Kings","Lake","Lassen","Los Angeles","Madera","Marin",\
        "Mariposa","Mendocino","Merced","Modoc","Mono","Monterey","Napa","Nevada","Orange","Placer","Plumas",\
            "Riverside","Sacramento","San Benito","San Bernardino","San Diego","San Francisco","San Joaquin",\
            "San Luis Obispo","San Mateo","Santa Barbara","Santa Clara","Santa Cruz","Shasta","Sierra","Siskiyou",\
            "Solano","Sonoma","Stanislaus","Sutter","Tehama","Trinity","Tulare","Tuolumne","Ventura","Yolo","Yuba"]

    # Filter the county column to remove 'Units' (i.e. aggregrate regions comprised of multiple counties)
    filtered_data = extracted_data_df[extracted_data_df['County'].isin(county_names)]

    # Process the unmatched values from the county column
    # (for analysis to ensure none are unintentionally excluded)
    unmatched_data = extracted_data_df[~extracted_data_df['County'].isin(county_names)]
    unmatched_data['Tot_Damage'] = pd.to_numeric(unmatched_data['Tot_Damage'].str.replace('[\$,]', '', regex=True), errors='coerce')
    summary_unmatched = unmatched_data.groupby('County').agg({'Tot_Damage': 'sum', 'County': 'count'}).\
        rename(columns={'County': 'Frequency'}).sort_values(['Tot_Damage', 'Frequency'], ascending=[False, False])
    print(summary_unmatched)
    summary_unmatched.to_csv('Outputs\\unmatched_DD_rows.csv', index=True)

    # Final data transformation and export steps 
    # Convert the Tot_Damage column to numeric values to facilitate calculations
    filtered_data['Tot_Damage'] = filtered_data['Tot_Damage'].str.replace('[\$,]', '', regex=True).astype(float)
    filtered_data.sort_values(['Year','County','Tot_Damage'])
    filtered_data = filtered_data.drop_duplicates()    
    filtered_data.to_csv('Outputs\extracted_data.csv', index=False)
else:
    print("No data extracted. Check the conditions for data extraction.")

Dollar column: Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')
$ columns np:  [ 1  2  3  4  5  6  7  8  9 10 11 12 13]
Dollar column: Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')
$ columns np:  [ 1  2  3  4  5  6  7  8  9 10 11 12 13]
Dollar column: Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')
$ columns np:  [ 1  2  3  4  5  6  7  8  9 10 11 12 13]
Dollar column: Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')
$ columns np:  [ 1  2  3  4  5  6  7  8  9 10 11 12 13]
Dollar column: Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')
$ columns np:  [ 1  2  3  4  5  6  7  8  9 10 11 12 13]
Dollar column: Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')
$ columns np:  [ 1  2  3  4  5  6  7  8  9 10 11 12 13]
Dollar column: Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')
$ columns np:  [ 1  2  3  4  5  6  7  8  9 10 11 12 13]
Dollar column: Index([ 3,  4,  6,  7,  9,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_data['Tot_Damage'] = pd.to_numeric(unmatched_data['Tot_Damage'].str.replace('[\$,]', '', regex=True), errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Tot_Damage'] = filtered_data['Tot_Damage'].str.replace('[\$,]', '', regex=True).astype(float)
