In [31]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

import os, glob

In [None]:
df = pd.read_csv("aemo_data/High_Impact_Outages_20210830.csv", encoding='latin1')
# df.info()

# certain columns have \n and \r, so they are removed
df = df.replace({r"\r|\n": " "}, regex=True)
df.columns = df.columns.str.replace(r"[\r\n]", " ", regex=True)

# drop the columns which are Unnamed and have no values (total of 7 columns)
df = df.iloc[:, :-7]

# pre-processing on the columns
# extract information from the 'start' and 'finish' columns and separate into separate columns
pattern = r"(\d{2}/\d{2}/\d{4})\s+(\d{2}:\d{2})\s+(\w+)"
df[["Start_Date", "Start_Time", "Start_Day"]] = df["Start"].str.extract(pattern)
df[["Finish_Date", "Finish_Time", "Finish_Day"]] = df["Finish"].str.extract(pattern)
df['Start_Date'] = pd.to_datetime(df['Start_Date'], dayfirst=True)
df['Start_Time'] = pd.to_datetime(df['Start_Time']).dt.strftime('%H:%M:%S')
df['Finish_Date'] = pd.to_datetime(df['Finish_Date'], dayfirst=True)
df['Finish_Time'] = pd.to_datetime(df['Finish_Time']).dt.strftime('%H:%M:%S')

# extract information from 'recall'
pattern = r"Day:\s*(\d+)\s*hr?s?(?:-Night:\s*(\d+)\s*hr?s?)?"
df[["Recall_Day_Hours", "Recall_Night_Hours"]] = df["Recall"].str.extract(pattern)
df["Recall_Day_Hours"] = pd.to_numeric(df["Recall_Day_Hours"], errors="coerce")
df["Recall_Night_Hours"] = pd.to_numeric(df["Recall_Night_Hours"], errors="coerce")

# extract information from 'reason'
df["Reason"] = df["Reason and  Duration"].str.extract(r"([a-zA-Z\s]+)")

# extract information from 'duration'
pattern = r"([\d\.]+)\s*(Days?|Hours?|Minutes?)"
df[["Value", "Unit"]] = df["Duration"].str.extract(pattern)
df["Value"] = df["Value"].astype(float)
df["Duration_Hours"] = df.apply(
    lambda row: row["Value"] * 24 if pd.notnull(row["Unit"]) and "Day" in row["Unit"]  # Convert Days to Hours
    else row["Value"] if pd.notnull(row["Unit"]) and "Hour" in row["Unit"]  # Keep Hours as is
    else row["Value"] / 60 if pd.notnull(row["Unit"]) and "Minute" in row["Unit"]  # Convert Minutes to Hours
    else None, axis=1  # Handle any unexpected cases
)

# split information from 'status'
df[["Status_Description", "Status_Code"]] = df["Status"].str.split(" - ", expand=True)
# Split information from 'status and market notice'
df[['Status_Description_Market', 'Market_Notice_Code']] = df['Status and  Market Notice'].str.split(" - ", expand=True)

# List of boolean columns
bool_cols = ['Project Work?', 'Unplanned?', 'Generator Aware?', 'DNSP Aware?', 'Inter-Regional']
# Replace 'T' with 1 and NaN with 0
df[bool_cols] = df[bool_cols].applymap(lambda x: 1 if x == 'T' else 0)

# drop non-needed columns now:
df.drop(columns=['Start', 'Finish', 'Status', 'Status and  Market Notice', 'Duration', 'Value', 'Unit',
                 'Reason and  Duration', 'Recall', 'Start', 'Finish'], inplace=True)

# re-order columns
cols = list(df.columns)
new_col_order = ['Region', 'NSP', 
                 'Start_Date', 'Start_Time', 'Start_Day', # Start time information
                 'Finish_Date', 'Finish_Time', 'Finish_Day', # End time information
                 'Network Asset', # identifying information
                 'Recall_Day_Hours', 'Recall_Night_Hours', # recall information
                 'Project Work?', 'Unplanned?', 'DNSP Aware?', 'Generator Aware?', 'Inter-Regional', # boolean terms
                 'Status_Description', 'Status_Code', 'Status_Description_Market', 'Market_Notice_Code', # status information
                 'Reason', 'Duration_Hours', # reason and duration information
                 'Impact'
                 ]
new_col_order = list(dict.fromkeys(new_col_order))
df = df[new_col_order]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Region                     92 non-null     object        
 1   NSP                        92 non-null     object        
 2   Start_Date                 92 non-null     datetime64[ns]
 3   Start_Time                 92 non-null     object        
 4   Start_Day                  92 non-null     object        
 5   Finish_Date                92 non-null     datetime64[ns]
 6   Finish_Time                92 non-null     object        
 7   Finish_Day                 92 non-null     object        
 8   Network Asset              92 non-null     object        
 9   Recall_Day_Hours           84 non-null     float64       
 10  Recall_Night_Hours         27 non-null     float64       
 11  Project Work?              117 non-null    int64         
 12  Unplanne

  df['Start_Time'] = pd.to_datetime(df['Start_Time']).dt.strftime('%H:%M:%S')
  df['Finish_Time'] = pd.to_datetime(df['Finish_Time']).dt.strftime('%H:%M:%S')
  df[bool_cols] = df[bool_cols].applymap(lambda x: 1 if x == 'T' else 0)


Unnamed: 0,Region,NSP,Start_Date,Start_Time,Start_Day,Finish_Date,Finish_Time,Finish_Day,Network Asset,Recall_Day_Hours,Recall_Night_Hours,Project Work?,Unplanned?,DNSP Aware?,Generator Aware?,Inter-Regional,Status_Description,Status_Code,Status_Description_Market,Market_Notice_Code,Reason,Duration_Hours,Impact
0,NSW,Transgrid,2021-08-30,06:05:00,Monday,2021-09-01,12:00:00,Wednesday,Armidale - Dumaresq (8C) 330 kV Line,3.0,4.0,0,0,0,0,1,In Progress,PTP,In Progress,PTP,Commissioning,52.8,<<UPDATED since the last notification>> A credible contingency event during this planned outage could cause:  Synchronous separation of the Queensland region from the rest of the NEM
1,QLD,Powerlink,2021-09-01,08:30:00,Wednesday,2021-09-01,12:00:00,Wednesday,Ross No.4 288/138/19 kV Transformer,1.0,,0,0,1,0,0,Planned,MTLTP,Planned,MTLTP,Maintenance,3.5,This is a high impact outage because the recall time is greater than 30 minutes. A credible contingency event during this planned outage may require market intervention through issuing of directions.
2,NSW,Transgrid,2021-09-04,06:00:00,Saturday,2021-09-05,17:00:00,Sunday,Liddell to Muswellbrook (83) 330 kV Line,4.0,4.0,0,0,1,0,1,Planned,MTLTP,Planned,MTLTP,Commissioning,36.0,A credible contingency event during this planned outage could cause:  Synchronous separation of the Queensland region from the rest of the NEM
3,SA,ElectraNet,2021-09-06,08:00:00,Monday,2021-09-12,08:15:00,Sunday,Heywood - South East No.1 275 kV Line,159.0,159.0,0,0,0,0,1,Planned,SUBMIT,Planned,SUBMIT,Commissioning,144.0,<<NEW since the last notification>> A credible contingency event during this planned outage could cause synchronous separation of the South Australia region from the rest of the NEM. During this planned outage:  Power transfer will be restricted across the Victoria - South Australia interconnector (Heywood interconnector).  Post contingent FCAS will be sourced within SA following Separation event.
4,QLD,Powerlink,2021-09-06,08:30:00,Monday,2021-09-10,16:00:00,Friday,Ross No.3 288/138/19 kV Transformer,4.0,6.0,0,0,1,0,0,Planned,MTLTP,Planned,MTLTP,Maintenance,103.2,This is a high impact outage because the recall time is greater than 30 minutes. A credible contingency event during this planned outage may require market intervention through issuing of directions.
