In [6]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

import os, glob, re

In [10]:
# Define a function to pre-process the files
def process_file(file_path):    
    df = pd.read_csv("aemo_data/High_Impact_Outages_20210830.csv", encoding='latin1')
    # df = pd.read_csv(file_path, encoding='latin1')

    # add a column to keep track of the csv file in which each entry came from
    base_name = os.path.basename(file_path)
    date_match = re.search(r'(\d{8})', base_name)
    if date_match:
        file_date = pd.to_datetime(date_match.group(1), format='%Y%m%d').date()
    else:
        file_date = None  # fallback if no date in filename
    df["file_name"] = file_date

    # drop the 'Vs Old' column, it has no meaning
    if 'Vs Old' in df.columns:
            df.drop(columns=['Vs Old'], inplace=True)

    # certain columns have \n and \r, so they are removed
    df = df.replace({r"\r|\n": " "}, regex=True)
    df.columns = df.columns.str.replace(r"[\r\n]", " ", regex=True)

    # drop the columns which are Unnamed and have no values (total of 7 columns)
    # df = df.iloc[:, :-7]

    # pre-processing on the columns
    # extract information from the 'start' and 'finish' columns and separate into separate columns
    df['Start'] = df['Start'].replace({r'\r|\n': ' '}, regex=True)
    df['Finish'] = df['Finish'].replace({r'\r|\n': ' '}, regex=True)
    df['Start'] = pd.to_datetime(df['Start'], dayfirst=True, format='%d/%m/%Y %H:%M %A', errors='coerce')
    df['Finish'] = pd.to_datetime(df['Finish'], dayfirst=True, format='%d/%m/%Y %H:%M %A', errors='coerce')

    # extract information from 'recall'
    pattern = r"Day:\s*(\d+)\s*hr?s?(?:-Night:\s*(\d+)\s*hr?s?)?"
    df[["Recall_Day_Hours", "Recall_Night_Hours"]] = df["Recall"].str.extract(pattern)
    df["Recall_Day_Hours"] = pd.to_numeric(df["Recall_Day_Hours"], errors="coerce")
    df['Recall_Day_Hours'] = df['Recall_Day_Hours'].fillna(0)
    df["Recall_Night_Hours"] = pd.to_numeric(df["Recall_Night_Hours"], errors="coerce")
    df['Recall_Night_Hours'] = df['Recall_Night_Hours'].fillna(0)

    # extract information for 'Impact region, reason and duration'
    # this column only appears as of 28/03/2022
    if "Impact Region, Reason and Duration" in df.columns:
        impact_reason_pattern = r"^([\w\s/]+?)\s+([a-zA-Z\s]+?)\s+([\d.]+)\s*(Days?|Hours?|Minutes?)$"
        temp_cols = df["Impact Region, Reason and Duration"].str.extract(impact_reason_pattern)
        temp_cols.columns = ['Impact_Region', 'Reason', 'Value', 'Unit']
        temp_cols["Value"] = temp_cols["Value"].astype(float)
        df["Impact_Region"] = temp_cols["Impact_Region"].str.strip()
        df["Reason"] = temp_cols["Reason"].str.strip()
        df["Duration_Hours"] = temp_cols.apply(
            lambda row: row["Value"] * 24 if pd.notnull(row["Unit"]) and "Day" in row["Unit"]
            else row["Value"] if pd.notnull(row["Unit"]) and "Hour" in row["Unit"]
            else row["Value"] / 60 if pd.notnull(row["Unit"]) and "Minute" in row["Unit"]
            else None, axis=1)
    else:
        # if the column does not exist, then make let the impact region and region be the same
        df["Reason"] = df["Reason and  Duration"].str.extract(r"([a-zA-Z\s]+)")
        duration_pattern = r"([\d\.]+)\s*(Days?|Hours?|Minutes?)"
        df[["Value", "Unit"]] = df["Duration"].str.extract(duration_pattern)
        df["Value"] = df["Value"].astype(float)
        df["Duration_Hours"] = df.apply(
            lambda row: row["Value"] * 24 if pd.notnull(row["Unit"]) and "Day" in row["Unit"]
            else row["Value"] if pd.notnull(row["Unit"]) and "Hour" in row["Unit"]
            else row["Value"] / 60 if pd.notnull(row["Unit"]) and "Minute" in row["Unit"]
            else None, axis=1)
        df["Impact_Region"] = df["Region"]

    # split information from 'status'
    df[["Status_Description", "Status_Code"]] = df["Status"].str.split(" - ", expand=True)
    # Split information from 'status and market notice'
    df[['Status_Description_Market', 'Market_Notice_Code']] = df['Status and  Market Notice'].str.split(" - ", expand=True)

    # List of boolean columns
    bool_cols = ['Project Work?', 'Unplanned?', 'Generator Aware?', 'DNSP Aware?', 'Inter-Regional']
    # Replace 'T' with 1 and NaN with 0
    df[bool_cols] = df[bool_cols].apply(lambda col: col.map(lambda x: 1 if x == 'T' else 0))

    # drop non-needed columns now:
    df.drop(columns=['Status', 'Status and  Market Notice', 'Duration', 'Value', 'Unit',
                    'Reason and  Duration', 'Recall'], inplace=True)

    # re-order columns
    cols = list(df.columns)
    new_col_order = ['Region', 'NSP', 
                    'Start', 'Finish', # Start and Finish information
                    'Network Asset', # identifying information
                    'Recall_Day_Hours', 'Recall_Night_Hours', # recall information
                    'Project Work?', 'Unplanned?', 'DNSP Aware?', 'Generator Aware?', 'Inter-Regional', # boolean terms
                    'Status_Description', 'Status_Code', 'Status_Description_Market', 'Market_Notice_Code', # status information
                    'Impact_Region', 'Reason', 'Duration_Hours', # reason and duration information
                    'Impact', 'file_name'
                    ]
    new_col_order = list(dict.fromkeys(new_col_order))
    df = df[new_col_order]

    # remove rows where 'Region' is empty
    df = df[df['Region'].notna() & (df['Region'].str.strip() != '')]

    return df


In [13]:
# List of CSV files to process
csv_files = glob.glob("aemo_data/*.csv")

# List to store processed dataframes
processed_dfs = []

# Loop through each file, apply the processing function and collect the results
for file in csv_files:
    processed_df = process_file(file)
    processed_dfs.append(processed_df)

In [None]:
# concatenate all dataframes and export to csv
full_df = pd.concat(processed_dfs, ignore_index=True)
full_df.info()
full_df.to_csv("processed_high_impact_outages.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13156 entries, 0 to 13155
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Region                     13156 non-null  object        
 1   NSP                        13156 non-null  object        
 2   Start                      13156 non-null  datetime64[ns]
 3   Finish                     13156 non-null  datetime64[ns]
 4   Network Asset              13156 non-null  object        
 5   Recall_Day_Hours           13156 non-null  float64       
 6   Recall_Night_Hours         13156 non-null  float64       
 7   Project Work?              13156 non-null  int64         
 8   Unplanned?                 13156 non-null  int64         
 9   DNSP Aware?                13156 non-null  int64         
 10  Generator Aware?           13156 non-null  int64         
 11  Inter-Regional             13156 non-null  int64         
 12  Stat