In [1]:
import pandas as pd

In [2]:
# List of 12 monthly dataset filenames
dataset_filenames = [
    '202101.CSV',
    '202102.CSV',
    '202103.CSV',
    '202104.CSV',
    '202105.CSV',
    '202106.CSV',
    '202107.CSV',
    '202108.CSV',
    '202109.CSV',
    '202110.CSV',
    '202111.CSV',
    '202112.CSV'
]

In [3]:
# List to store the monthly dataframes
monthly_dataframes = []

In [4]:
# Iterate over each dataset filename
for filename in dataset_filenames:
    # Open the monthly dataset
    dataset = pd.read_csv(filename)
    
    # Drop the original headers in the dataset, use the first row as the new headers
    dataset = dataset.rename(columns=dataset.iloc[0]).drop(dataset.index[0]).reset_index(drop=True)
    
    # Drop the last row ('END OF REPORT') in the dataset
    dataset = dataset.iloc[:-1]
    
    # Drop the last 3 columns in the dataset
    dataset = dataset.iloc[:, :-3]
    
    # Drop the 1st, 2nd, 3rd, and 4th columns
    dataset = dataset.iloc[:, 4:]
    
    # Change SCADAVALUE to numeric
    dataset['SCADAVALUE'] = pd.to_numeric(dataset['SCADAVALUE'], errors='coerce')
    
    # Change SETTLEMENTDATE to datetime
    dataset['SETTLEMENTDATE'] = pd.to_datetime(dataset['SETTLEMENTDATE'])
    
    # Add a new column 'Month'
    dataset['Month'] = int(filename[4:6])  # Extract the month from the filename
    
    # Append the monthly dataset to the list
    monthly_dataframes.append(dataset)

  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)


In [5]:
# Concatenate the monthly dataframes vertically to get the yearly electricity generation dataframe
yearly_df = pd.concat(monthly_dataframes, ignore_index=True)

yearly_df

Unnamed: 0,SETTLEMENTDATE,DUID,SCADAVALUE,Month
0,2021-01-01 00:55:00,BARCSF1,0.100000,1
1,2021-01-01 00:55:00,BUTLERSG,9.899999,1
2,2021-01-01 00:55:00,CALL_A_4,0.000000,1
3,2021-01-01 00:55:00,CAPTL_WF,31.332001,1
4,2021-01-01 00:55:00,CATHROCK,37.730000,1
...,...,...,...,...
40161129,2022-01-01 00:00:00,YENDWF1,17.010000,12
40161130,2022-01-01 00:00:00,YWPS1,380.450620,12
40161131,2022-01-01 00:00:00,YWPS2,0.000000,12
40161132,2022-01-01 00:00:00,YWPS3,351.767090,12


In [6]:
# 2021 yearly elecrtricity generation aggragation by 'Month' and'DUID' 
# Group by 'Month' and 'DUID', and sum the 'SCADAVALUE' - i.e. the capacity of generator
aggregated_df = yearly_df.groupby(['Month', 'DUID'])['SCADAVALUE'].sum().reset_index()
aggregated_df

Unnamed: 0,Month,DUID,SCADAVALUE
0,1,AGLHAL,3.425169e+04
1,1,AGLSOM,9.407800e+03
2,1,ANGAST1,2.476500e+03
3,1,ARWF1,7.323133e+05
4,1,BALBG1,9.052110e+03
...,...,...,...
4601,12,YSWF1,6.866740e+04
4602,12,YWPS1,2.127056e+06
4603,12,YWPS2,1.818258e+03
4604,12,YWPS3,2.583632e+06


In [7]:
aggregated_df.isnull().sum()

Month         0
DUID          0
SCADAVALUE    0
dtype: int64

In [8]:
#Save the aggregated sheet as the yearly aggregated dataset
aggregated_df.to_csv('aggregatedDUID_2021.csv', index=False)

In [9]:
#Open the dataset of the generator information - 'Generators.csv'

file_generators = 'Generators.csv'

generators = pd.read_csv(file_generators)

generators

Unnamed: 0,DUID,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW),Unnamed: 6,Unnamed: 7
0,ADPBA1G,Adelaide Desalination Plant,SA,Generator,Battery storage,5.45,,
1,ADPMH1,Adelaide Desalination Plant,SA,Generator,Hydro,0.72,,
2,ADPPV3,Adelaide Desalination Plant,SA,Generator,Solar,0.02,,
3,ADPPV2,Adelaide Desalination Plant,SA,Generator,Solar,0.1,,
4,ADPPV1,Adelaide Desalination Plant,SA,Generator,Solar,2.75,,
...,...,...,...,...,...,...,...,...
509,VBBL1,Victorian Big Battery,VIC,Load,,1.179,,
510,WALGRVL1,Wallgrove BESS 1,NSW,Load,,1.306,,
511,WANDBL1,Wandoan Battery Energy Storage System (BESS),QLD,Load,,2.94,,
512,PUMP1,Wivenhoe Power Station,QLD,Load,-,240,,


In [10]:
#Drop'Unnamed' columns with null values
generators.drop(generators.columns[6:8], axis=1, inplace=True)
generators

Unnamed: 0,DUID,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,ADPBA1G,Adelaide Desalination Plant,SA,Generator,Battery storage,5.45
1,ADPMH1,Adelaide Desalination Plant,SA,Generator,Hydro,0.72
2,ADPPV3,Adelaide Desalination Plant,SA,Generator,Solar,0.02
3,ADPPV2,Adelaide Desalination Plant,SA,Generator,Solar,0.1
4,ADPPV1,Adelaide Desalination Plant,SA,Generator,Solar,2.75
...,...,...,...,...,...,...
509,VBBL1,Victorian Big Battery,VIC,Load,,1.179
510,WALGRVL1,Wallgrove BESS 1,NSW,Load,,1.306
511,WANDBL1,Wandoan Battery Energy Storage System (BESS),QLD,Load,,2.94
512,PUMP1,Wivenhoe Power Station,QLD,Load,-,240


In [11]:
#Merge aggregated_df and generators using LEFT JOIN

merged_df = aggregated_df.merge(generators, on='DUID', how='left')
merged_df

Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,1,AGLHAL,3.425169e+04,Hallett Power Station,SA,Generator,Fossil,228.6
1,1,AGLSOM,9.407800e+03,Somerton Power Station,VIC,Generator,Fossil,42.5
2,1,ANGAST1,2.476500e+03,Angaston Power Station,SA,Generator,Fossil,1.667
3,1,ARWF1,7.323133e+05,Ararat Wind Farm,VIC,Generator,Wind,9.03
4,1,BALBG1,9.052110e+03,Ballarat Battery Energy Storage System,VIC,Generator,Battery storage,1.764
...,...,...,...,...,...,...,...,...
4613,12,YSWF1,6.866740e+04,Yaloak South Wind Farm,VIC,Generator,Wind,2.05
4614,12,YWPS1,2.127056e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
4615,12,YWPS2,1.818258e+03,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
4616,12,YWPS3,2.583632e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,380


In [12]:
merged_df.isnull().sum()

Month                      0
DUID                       0
SCADAVALUE                 0
Station Name             484
Region                   484
Dispatch Type            484
Fuel Source - Primary    505
Unit Size (MW)           484
dtype: int64

In [13]:
merged_df.head(20)

Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,1,AGLHAL,34251.68538,Hallett Power Station,SA,Generator,Fossil,228.6
1,1,AGLSOM,9407.80003,Somerton Power Station,VIC,Generator,Fossil,42.5
2,1,ANGAST1,2476.5,Angaston Power Station,SA,Generator,Fossil,1.667
3,1,ARWF1,732313.27191,Ararat Wind Farm,VIC,Generator,Wind,9.03
4,1,BALBG1,9052.11,Ballarat Battery Energy Storage System,VIC,Generator,Battery storage,1.764
5,1,BALBL1,11716.08,Ballarat Battery Energy Storage System,VIC,Load,-,1.764
6,1,BALDHWF1,398483.22074,Bald Hills Wind Farm,VIC,Generator,Wind,2.05
7,1,BANGOWF1,997.2591,Bango 973 Wind Farm,NSW,Generator,Wind,5.16
8,1,BANN1,282794.78253,Bannerton Solar Park,VIC,Generator,Solar,2.5
9,1,BARCALDN,1209.08,Barcaldine Power Station,QLD,Generator,Fossil,37.0


In [14]:
#Drop the rows of which the value in the 'SCADAVALUE' equals or is less than 0
#Drop the rows of which the value in the 'Dispatch Type' equals 'Load' - i.e. they are not generators

temp_df = merged_df[(merged_df['SCADAVALUE'] > 0) & (merged_df['Dispatch Type'] !='Load')]
temp_df

Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,1,AGLHAL,3.425169e+04,Hallett Power Station,SA,Generator,Fossil,228.6
1,1,AGLSOM,9.407800e+03,Somerton Power Station,VIC,Generator,Fossil,42.5
2,1,ANGAST1,2.476500e+03,Angaston Power Station,SA,Generator,Fossil,1.667
3,1,ARWF1,7.323133e+05,Ararat Wind Farm,VIC,Generator,Wind,9.03
4,1,BALBG1,9.052110e+03,Ballarat Battery Energy Storage System,VIC,Generator,Battery storage,1.764
...,...,...,...,...,...,...,...,...
4613,12,YSWF1,6.866740e+04,Yaloak South Wind Farm,VIC,Generator,Wind,2.05
4614,12,YWPS1,2.127056e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
4615,12,YWPS2,1.818258e+03,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
4616,12,YWPS3,2.583632e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,380


In [17]:
temp_df.isnull().sum()

Month                            0
DUID                             0
SCADAVALUE                       0
Station Name                    67
Region                          67
Dispatch Type                   67
Fuel Source - Primary           67
Unit Size (MW)                  67
Electricity Generation (MWh)     0
dtype: int64

In [15]:
#Add a new column 'Electricity Generation(MWh)'
temp_df['Electricity Generation (MWh)'] = round(temp_df['SCADAVALUE'] * (5/60), 2)
temp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Electricity Generation (MWh)'] = round(temp_df['SCADAVALUE'] * (5/60), 2)


Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW),Electricity Generation (MWh)
0,1,AGLHAL,3.425169e+04,Hallett Power Station,SA,Generator,Fossil,228.6,2854.31
1,1,AGLSOM,9.407800e+03,Somerton Power Station,VIC,Generator,Fossil,42.5,783.98
2,1,ANGAST1,2.476500e+03,Angaston Power Station,SA,Generator,Fossil,1.667,206.38
3,1,ARWF1,7.323133e+05,Ararat Wind Farm,VIC,Generator,Wind,9.03,61026.11
4,1,BALBG1,9.052110e+03,Ballarat Battery Energy Storage System,VIC,Generator,Battery storage,1.764,754.34
...,...,...,...,...,...,...,...,...,...
4613,12,YSWF1,6.866740e+04,Yaloak South Wind Farm,VIC,Generator,Wind,2.05,5722.28
4614,12,YWPS1,2.127056e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360,177254.66
4615,12,YWPS2,1.818258e+03,Yallourn 'W' Power Station,VIC,Generator,Fossil,360,151.52
4616,12,YWPS3,2.583632e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,380,215302.66


In [16]:
temp_df.to_csv('temp_2021.csv', index=False)