In [36]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

import os, glob

In [42]:
# Define a function to pre-process the files
def process_file(file_path):    
    # df = pd.read_csv("aemo_data/High_Impact_Outages_20210830.csv", encoding='latin1')
    df = pd.read_csv(file_path, encoding='latin1')

    # certain columns have \n and \r, so they are removed
    df = df.replace({r"\r|\n": " "}, regex=True)
    df.columns = df.columns.str.replace(r"[\r\n]", " ", regex=True)

    # drop the columns which are Unnamed and have no values (total of 7 columns)
    df = df.iloc[:, :-7]

    # pre-processing on the columns
    # extract information from the 'start' and 'finish' columns and separate into separate columns
    pattern = r"(\d{2}/\d{2}/\d{4})\s+(\d{2}:\d{2})\s+(\w+)"
    df[["Start_Date", "Start_Time", "Start_Day"]] = df["Start"].str.extract(pattern)
    df[["Finish_Date", "Finish_Time", "Finish_Day"]] = df["Finish"].str.extract(pattern)
    df['Start_Date'] = pd.to_datetime(df['Start_Date'], dayfirst=True)
    df['Start_Time'] = pd.to_datetime(df['Start_Time']).dt.strftime('%H:%M:%S')
    df['Finish_Date'] = pd.to_datetime(df['Finish_Date'], dayfirst=True)
    df['Finish_Time'] = pd.to_datetime(df['Finish_Time']).dt.strftime('%H:%M:%S')

    # extract information from 'recall'
    pattern = r"Day:\s*(\d+)\s*hr?s?(?:-Night:\s*(\d+)\s*hr?s?)?"
    df[["Recall_Day_Hours", "Recall_Night_Hours"]] = df["Recall"].str.extract(pattern)
    df["Recall_Day_Hours"] = pd.to_numeric(df["Recall_Day_Hours"], errors="coerce")
    df["Recall_Night_Hours"] = pd.to_numeric(df["Recall_Night_Hours"], errors="coerce")

    # extract information from 'reason'
    df["Reason"] = df["Reason and  Duration"].str.extract(r"([a-zA-Z\s]+)")

    # extract information from 'duration'
    pattern = r"([\d\.]+)\s*(Days?|Hours?|Minutes?)"
    df[["Value", "Unit"]] = df["Duration"].str.extract(pattern)
    df["Value"] = df["Value"].astype(float)
    df["Duration_Hours"] = df.apply(
        lambda row: row["Value"] * 24 if pd.notnull(row["Unit"]) and "Day" in row["Unit"]  # Convert Days to Hours
        else row["Value"] if pd.notnull(row["Unit"]) and "Hour" in row["Unit"]  # Keep Hours as is
        else row["Value"] / 60 if pd.notnull(row["Unit"]) and "Minute" in row["Unit"]  # Convert Minutes to Hours
        else None, axis=1  # Handle any unexpected cases
    )

    # split information from 'status'
    df[["Status_Description", "Status_Code"]] = df["Status"].str.split(" - ", expand=True)
    # Split information from 'status and market notice'
    df[['Status_Description_Market', 'Market_Notice_Code']] = df['Status and  Market Notice'].str.split(" - ", expand=True)

    # List of boolean columns
    bool_cols = ['Project Work?', 'Unplanned?', 'Generator Aware?', 'DNSP Aware?', 'Inter-Regional']
    # Replace 'T' with 1 and NaN with 0
    df[bool_cols] = df[bool_cols].applymap(lambda x: 1 if x == 'T' else 0)

    # drop non-needed columns now:
    df.drop(columns=['Start', 'Finish', 'Status', 'Status and  Market Notice', 'Duration', 'Value', 'Unit',
                    'Reason and  Duration', 'Recall', 'Start', 'Finish'], inplace=True)

    # re-order columns
    cols = list(df.columns)
    new_col_order = ['Region', 'NSP', 
                    'Start_Date', 'Start_Time', 'Start_Day', # Start time information
                    'Finish_Date', 'Finish_Time', 'Finish_Day', # End time information
                    'Network Asset', # identifying information
                    'Recall_Day_Hours', 'Recall_Night_Hours', # recall information
                    'Project Work?', 'Unplanned?', 'DNSP Aware?', 'Generator Aware?', 'Inter-Regional', # boolean terms
                    'Status_Description', 'Status_Code', 'Status_Description_Market', 'Market_Notice_Code', # status information
                    'Reason', 'Duration_Hours', # reason and duration information
                    'Impact'
                    ]
    new_col_order = list(dict.fromkeys(new_col_order))
    df = df[new_col_order]

    return df

In [43]:
# List of CSV files to process
csv_files = glob.glob("aemo_data/*.csv")

# List to store processed dataframes
processed_dfs = []

# Loop through each file, apply the processing function and collect the results
for file in csv_files:
    processed_df = process_file(file)
    processed_dfs.append(processed_df)

  df['Start_Time'] = pd.to_datetime(df['Start_Time']).dt.strftime('%H:%M:%S')
  df['Finish_Time'] = pd.to_datetime(df['Finish_Time']).dt.strftime('%H:%M:%S')


KeyError: 'Reason and  Duration'