In [23]:
import pandas as pd
# Lift pandas' display limits (value None) so wide text columns and every column
# are shown in full when a frame is rendered.
for display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(display_option, None)

import os

In [27]:
# Load the raw high-impact-outage export.
# NOTE(review): the file is decoded as latin1 - confirm that is its real encoding.
df = pd.read_csv("aemo_data/High_Impact_Outages_20210830.csv", encoding='latin1')

# Cell values and column headers contain embedded \r / \n characters. Each one is
# replaced by a single space (so a "\r\n" pair becomes two spaces), which is why
# later lookups use header names with doubled spaces.
df = df.replace({r"\r|\n": " "}, regex=True)
df.columns = df.columns.str.replace(r"[\r\n]", " ", regex=True)

# Drop the placeholder "Unnamed: N" columns that hold no values (the original
# notebook noted seven of them in this file). They are selected by name and
# emptiness rather than by a hard-coded count, so a file with a different number
# of placeholder columns neither loses real columns nor keeps empty ones.
is_empty_unnamed = df.columns.str.startswith("Unnamed") & df.isna().all().to_numpy()
df = df.drop(columns=df.columns[is_empty_unnamed])

# Split the 'Start' and 'Finish' text columns into date / time / weekday parts.
# The regex captures, in order: dd/mm/yyyy, hh:mm and the following word.
datetime_pattern = r"(\d{2}/\d{2}/\d{4})\s+(\d{2}:\d{2})\s+(\w+)"
split_targets = {
    "Start": ["Start_Date", "Start_Time", "Start_Day"],
    "Finish": ["Finish_Date", "Finish_Time", "Finish_Day"],
}
for source_col, part_cols in split_targets.items():
    df[part_cols] = df[source_col].str.extract(datetime_pattern)

# The combined source columns are redundant once their parts are extracted.
df.drop(columns=list(split_targets), inplace=True)

# Parse the recall times out of 'Recall' text such as "Day: 3 hrs-Night: 4 hrs"
# or "Day: 1 hr-Night: NA" into numeric Day_Hours / Night_Hours columns.
# Day and Night are extracted independently: with a single combined pattern a
# non-numeric Day part (e.g. "Day: NA-Night: 4 hrs") made the whole match fail
# and silently discarded a valid Night value. Fractional hours are accepted too.
recall_hours = r"(\d+(?:\.\d+)?)\s*hr?s?"
for recall_label, hours_col in (("Day", "Day_Hours"), ("Night", "Night_Hours")):
    recall_text = df["Recall"].str.extract(rf"{recall_label}:\s*{recall_hours}", expand=False)
    # Rows without a numeric value for this label (e.g. "NA") become NaN.
    df[hours_col] = pd.to_numeric(recall_text, errors="coerce")
# The raw 'Recall' column is deliberately kept next to the parsed values.

# Extract the reason from the combined column (e.g. "Commissioning 2.2 Days").
# The character class also matches the whitespace that precedes the number, so the
# capture is stripped; otherwise labels carry a trailing space ("Commissioning ")
# and do not compare equal to the clean text. expand=False returns a Series, which
# is what a single-column assignment expects.
df["Reason"] = (
    df["Reason and  Duration"]
    .str.extract(r"([a-zA-Z\s]+)", expand=False)
    .str.strip()
    .replace("", pd.NA)  # a whitespace-only capture means no reason text was present
)

# Convert the free-text 'Duration' column (a number followed by Days / Hours /
# Minutes) into a numeric Duration_Hours column.
# (?i) makes the unit match case-insensitive, so "days" parses like "Days".
pattern = r"(?i)([\d\.]+)\s*(Days?|Hours?|Minutes?)"
duration_parts = df["Duration"].str.extract(pattern)

# Coerce rather than astype(float): [\d\.]+ can also capture text such as "." or
# "1.2.3", which should become NaN instead of aborting the whole cell.
duration_value = pd.to_numeric(duration_parts[0], errors="coerce")

# Normalise the unit to its singular lower-case stem: "Days" -> "day", "Hour" -> "hour".
duration_unit = duration_parts[1].str.lower().str.rstrip("s")

# hours = value * multiplier / divisor. Minutes are divided by 60 (not multiplied
# by 1/60) so results are bit-identical to the previous row-wise computation.
# Rows with no recognised unit map to NaN and therefore yield NaN.
multiplier = duration_unit.map({"day": 24.0, "hour": 1.0, "minute": 1.0})
divisor = duration_unit.map({"day": 1.0, "hour": 1.0, "minute": 60.0})
df["Duration_Hours"] = duration_value * multiplier / divisor

# The raw text column is superseded by Duration_Hours.
df = df.drop(columns=["Duration"])


# # re-order columns
# cols = list(df.columns)
# new_col_order = cols[:2] + ["Start_Date", "Start_Time", "Start_Day", "Finish_Date", "Finish_Time", "Finish_Day"] + cols[2:]
# new_col_order = list(dict.fromkeys(new_col_order))
# df = df[new_col_order]

# Sanity-check the transformed frame: print per-column non-null counts and dtypes,
# then display the first rows as the cell's output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Region                    92 non-null     object 
 1   NSP                       92 non-null     object 
 2   Network Asset             92 non-null     object 
 3   Impact                    92 non-null     object 
 4   Recall                    92 non-null     object 
 5   Status                    92 non-null     object 
 6   Status and Market Notice  117 non-null    object 
 7   Project Work?             2 non-null      object 
 8   Unplanned?                0 non-null      float64
 9   DNSP Aware?               34 non-null     object 
 10  Generator Aware?          23 non-null     object 
 11  Inter-Regional            35 non-null     object 
 12  Reason and  Duration      92 non-null     object 
 13  Start_Date                92 non-null     object 
 14  Start_Time

Unnamed: 0,Region,NSP,Network Asset,Impact,Recall,Status,Status and Market Notice,Project Work?,Unplanned?,DNSP Aware?,Generator Aware?,Inter-Regional,Reason and Duration,Start_Date,Start_Time,Start_Day,Finish_Date,Finish_Time,Finish_Day,Day_Hours,Night_Hours,Reason,Duration_Hours
0,NSW,Transgrid,Armidale - Dumaresq (8C) 330 kV Line,<<UPDATED since the last notification>> A credible contingency event during this planned outage could cause:  Synchronous separation of the Queensland region from the rest of the NEM,Day: 3 hrs-Night: 4 hrs,In Progress - PTP,In Progress - PTP,,,,,T,Commissioning 2.2 Days,30/08/2021,06:05,Monday,01/09/2021,12:00,Wednesday,3.0,4.0,Commissioning,52.8
1,QLD,Powerlink,Ross No.4 288/138/19 kV Transformer,This is a high impact outage because the recall time is greater than 30 minutes. A credible contingency event during this planned outage may require market intervention through issuing of directions.,Day: 1 hr-Night: NA,Planned - MTLTP,Planned - MTLTP,,,T,,,Maintenance 3.5 Hours,01/09/2021,08:30,Wednesday,01/09/2021,12:00,Wednesday,1.0,,Maintenance,3.5
2,NSW,Transgrid,Liddell to Muswellbrook (83) 330 kV Line,A credible contingency event during this planned outage could cause:  Synchronous separation of the Queensland region from the rest of the NEM,Day: 4 hrs-Night: 4 hrs,Planned - MTLTP,Planned - MTLTP,,,T,,T,Commissioning 1.5 Days,04/09/2021,06:00,Saturday,05/09/2021,17:00,Sunday,4.0,4.0,Commissioning,36.0
3,SA,ElectraNet,Heywood - South East No.1 275 kV Line,<<NEW since the last notification>> A credible contingency event during this planned outage could cause synchronous separation of the South Australia region from the rest of the NEM. During this planned outage:  Power transfer will be restricted across the Victoria - South Australia interconnector (Heywood interconnector).  Post contingent FCAS will be sourced within SA following Separation event.,Day: 159 hrs-Night: 159 hrs,Planned - SUBMIT,Planned - SUBMIT,,,,,T,Commissioning 6 Days,06/09/2021,08:00,Monday,12/09/2021,08:15,Sunday,159.0,159.0,Commissioning,144.0
4,QLD,Powerlink,Ross No.3 288/138/19 kV Transformer,This is a high impact outage because the recall time is greater than 30 minutes. A credible contingency event during this planned outage may require market intervention through issuing of directions.,Day: 4 hrs-Night: 6 hrs,Planned - MTLTP,Planned - MTLTP,,,T,,,Maintenance 4.3 Days,06/09/2021,08:30,Monday,10/09/2021,16:00,Friday,4.0,6.0,Maintenance,103.2
