In [1]:
import pandas as pd

In [2]:
# List of 12 monthly dataset filenames
dataset_filenames = [
    '201901.CSV',
    '201902.CSV',
    '201903.CSV',
    '201904.CSV',
    '201905.CSV',
    '201906.CSV',
    '201907.CSV',
    '201908.CSV',
    '201909.CSV',
    '201910.CSV',
    '201911.CSV',
    '201912.CSV'
]

In [3]:
# List to store the monthly dataframes
monthly_dataframes = []

In [4]:
# Iterate over each dataset filename
for filename in dataset_filenames:
    # Open the monthly dataset
    dataset = pd.read_csv(filename)
    
    # Drop the original headers in the dataset, use the first row as the new headers
    dataset = dataset.rename(columns=dataset.iloc[0]).drop(dataset.index[0]).reset_index(drop=True)
    
    # Drop the last row ('END OF REPORT') in the dataset
    dataset = dataset.iloc[:-1]
    
    # Drop the last 3 columns in the dataset
    dataset = dataset.iloc[:, :-3]
    
    # Drop the 1st, 2nd, 3rd, and 4th columns
    dataset = dataset.iloc[:, 4:]
    
    # Change SCADAVALUE to numeric
    dataset['SCADAVALUE'] = pd.to_numeric(dataset['SCADAVALUE'], errors='coerce')
    
    # Change SETTLEMENTDATE to datetime
    dataset['SETTLEMENTDATE'] = pd.to_datetime(dataset['SETTLEMENTDATE'])
    
    # Add a new column 'Month'
    dataset['Month'] = int(filename[4:6])  # Extract the month from the filename
    
    # Append the monthly dataset to the list
    monthly_dataframes.append(dataset)

  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)


In [5]:
# Concatenate the monthly dataframes vertically to get the yearly electricity generation dataframe
yearly_df = pd.concat(monthly_dataframes, ignore_index=True)

yearly_df

Unnamed: 0,SETTLEMENTDATE,DUID,SCADAVALUE,Month
0,2019-01-01 00:05:00,BARCSF1,0.500000,1
1,2019-01-01 00:05:00,BUTLERSG,10.299998,1
2,2019-01-01 00:05:00,CALL_A_4,0.000000,1
3,2019-01-01 00:05:00,CAPTL_WF,0.000000,1
4,2019-01-01 00:05:00,CATHROCK,2.050000,1
...,...,...,...,...
34466487,2020-01-01 00:00:00,YENDWF1,6.680000,12
34466488,2020-01-01 00:00:00,YWPS1,331.075960,12
34466489,2020-01-01 00:00:00,YWPS2,339.848110,12
34466490,2020-01-01 00:00:00,YWPS3,369.832920,12


In [6]:
# 2019 yearly elecrtricity generation aggragation by 'Month' and'DUID' 
# Group by 'Month' and 'DUID', and sum the 'SCADAVALUE' - i.e. the capacity of generator
aggregated_df = yearly_df.groupby(['Month', 'DUID'])['SCADAVALUE'].sum().reset_index()
aggregated_df

Unnamed: 0,Month,DUID,SCADAVALUE
0,1,AGLHAL,7.228478e+04
1,1,AGLSOM,1.345085e+05
2,1,ANGAST1,4.560454e+03
3,1,ARWF1,6.119055e+05
4,1,BALBG1,2.609080e+03
...,...,...,...
3939,12,YSWF1,8.414720e+04
3940,12,YWPS1,2.819144e+06
3941,12,YWPS2,2.903466e+06
3942,12,YWPS3,2.053961e+06


In [7]:
aggregated_df.isnull().sum()

Month         0
DUID          0
SCADAVALUE    0
dtype: int64

In [8]:
#Save the aggregated sheet as the yearly aggregated dataset
aggregated_df.to_csv('aggregatedDUID_2019.csv', index=False)

In [9]:
#Open the dataset of the generator information - 'Generators.csv'

file_generators = 'Generators.csv'
generators = pd.read_csv(file_generators)
generators

Unnamed: 0,DUID,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW),Unnamed: 6,Unnamed: 7
0,ADPBA1G,Adelaide Desalination Plant,SA,Generator,Battery storage,5.45,,
1,ADPMH1,Adelaide Desalination Plant,SA,Generator,Hydro,0.72,,
2,ADPPV3,Adelaide Desalination Plant,SA,Generator,Solar,0.02,,
3,ADPPV2,Adelaide Desalination Plant,SA,Generator,Solar,0.1,,
4,ADPPV1,Adelaide Desalination Plant,SA,Generator,Solar,2.75,,
...,...,...,...,...,...,...,...,...
511,WANDBL1,Wandoan Battery Energy Storage System (BESS),QLD,Load,,2.94,,
512,PUMP1,Wivenhoe Power Station,QLD,Load,-,240,,
513,PUMP2,Wivenhoe Power Station,QLD,Load,-,240,,
514,MACKAYGT,Mackay Gas Turbine,QLD,Generator,Fossil,34,,


In [10]:
#Drop'Unnamed' columns with null values
generators.drop(generators.columns[6:8], axis=1, inplace=True)
generators

Unnamed: 0,DUID,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,ADPBA1G,Adelaide Desalination Plant,SA,Generator,Battery storage,5.45
1,ADPMH1,Adelaide Desalination Plant,SA,Generator,Hydro,0.72
2,ADPPV3,Adelaide Desalination Plant,SA,Generator,Solar,0.02
3,ADPPV2,Adelaide Desalination Plant,SA,Generator,Solar,0.1
4,ADPPV1,Adelaide Desalination Plant,SA,Generator,Solar,2.75
...,...,...,...,...,...,...
511,WANDBL1,Wandoan Battery Energy Storage System (BESS),QLD,Load,,2.94
512,PUMP1,Wivenhoe Power Station,QLD,Load,-,240
513,PUMP2,Wivenhoe Power Station,QLD,Load,-,240
514,MACKAYGT,Mackay Gas Turbine,QLD,Generator,Fossil,34


In [11]:
#Merge aggregated_df and generators using LEFT JOIN
merged_df = aggregated_df.merge(generators, on='DUID', how='left')
merged_df

Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,1,AGLHAL,7.228478e+04,Hallett Power Station,SA,Generator,Fossil,228.6
1,1,AGLSOM,1.345085e+05,Somerton Power Station,VIC,Generator,Fossil,42.5
2,1,ANGAST1,4.560454e+03,Angaston Power Station,SA,Generator,Fossil,1.667
3,1,ARWF1,6.119055e+05,Ararat Wind Farm,VIC,Generator,Wind,9.03
4,1,BALBG1,2.609080e+03,Ballarat Battery Energy Storage System,VIC,Generator,Battery storage,1.764
...,...,...,...,...,...,...,...,...
3951,12,YSWF1,8.414720e+04,Yaloak South Wind Farm,VIC,Generator,Wind,2.05
3952,12,YWPS1,2.819144e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
3953,12,YWPS2,2.903466e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
3954,12,YWPS3,2.053961e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,380


In [12]:
merged_df.isnull().sum()

Month                      0
DUID                       0
SCADAVALUE                 0
Station Name             444
Region                   444
Dispatch Type            444
Fuel Source - Primary    456
Unit Size (MW)           444
dtype: int64

In [13]:
merged_df.head()

Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,1,AGLHAL,72284.77589,Hallett Power Station,SA,Generator,Fossil,228.6
1,1,AGLSOM,134508.50038,Somerton Power Station,VIC,Generator,Fossil,42.5
2,1,ANGAST1,4560.45442,Angaston Power Station,SA,Generator,Fossil,1.667
3,1,ARWF1,611905.49552,Ararat Wind Farm,VIC,Generator,Wind,9.03
4,1,BALBG1,2609.08,Ballarat Battery Energy Storage System,VIC,Generator,Battery storage,1.764


In [14]:
#Drop the rows of which the value in the 'SCADAVALUE' equals or is less than 0
#Drop the rows of which the value in the 'Dispatch Type' equals 'Load' - i.e. they are not generators

temp_df = merged_df[(merged_df['SCADAVALUE'] > 0) & (merged_df['Dispatch Type'] !='Load')]
temp_df

Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,1,AGLHAL,7.228478e+04,Hallett Power Station,SA,Generator,Fossil,228.6
1,1,AGLSOM,1.345085e+05,Somerton Power Station,VIC,Generator,Fossil,42.5
2,1,ANGAST1,4.560454e+03,Angaston Power Station,SA,Generator,Fossil,1.667
3,1,ARWF1,6.119055e+05,Ararat Wind Farm,VIC,Generator,Wind,9.03
4,1,BALBG1,2.609080e+03,Ballarat Battery Energy Storage System,VIC,Generator,Battery storage,1.764
...,...,...,...,...,...,...,...,...
3951,12,YSWF1,8.414720e+04,Yaloak South Wind Farm,VIC,Generator,Wind,2.05
3952,12,YWPS1,2.819144e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
3953,12,YWPS2,2.903466e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
3954,12,YWPS3,2.053961e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,380


In [15]:
temp_df.isnull().sum()

Month                     0
DUID                      0
SCADAVALUE                0
Station Name             70
Region                   70
Dispatch Type            70
Fuel Source - Primary    70
Unit Size (MW)           70
dtype: int64

In [16]:
#Add a new column 'Electricity Generation(MWh)'
temp_df['Electricity Generation (MWh)'] = round(temp_df['SCADAVALUE'] * (5/60), 2)
temp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Electricity Generation (MWh)'] = round(temp_df['SCADAVALUE'] * (5/60), 2)


Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW),Electricity Generation (MWh)
0,1,AGLHAL,7.228478e+04,Hallett Power Station,SA,Generator,Fossil,228.6,6023.73
1,1,AGLSOM,1.345085e+05,Somerton Power Station,VIC,Generator,Fossil,42.5,11209.04
2,1,ANGAST1,4.560454e+03,Angaston Power Station,SA,Generator,Fossil,1.667,380.04
3,1,ARWF1,6.119055e+05,Ararat Wind Farm,VIC,Generator,Wind,9.03,50992.12
4,1,BALBG1,2.609080e+03,Ballarat Battery Energy Storage System,VIC,Generator,Battery storage,1.764,217.42
...,...,...,...,...,...,...,...,...,...
3951,12,YSWF1,8.414720e+04,Yaloak South Wind Farm,VIC,Generator,Wind,2.05,7012.27
3952,12,YWPS1,2.819144e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360,234928.66
3953,12,YWPS2,2.903466e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360,241955.48
3954,12,YWPS3,2.053961e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,380,171163.38


In [17]:
temp_df.to_csv('temp_2019.csv', index=False)