In [1]:
import pandas as pd

In [2]:
# List of 12 monthly dataset filenames
dataset_filenames = [
    '202001.CSV',
    '202002.CSV',
    '202003.CSV',
    '202004.CSV',
    '202005.CSV',
    '202006.CSV',
    '202007.CSV',
    '202008.CSV',
    '202009.CSV',
    '202010.CSV',
    '202011.CSV',
    '202012.CSV'
]

In [3]:
# List to store the monthly dataframes
monthly_dataframes = []

In [4]:
# Iterate over each dataset filename
for filename in dataset_filenames:
    # Open the monthly dataset
    dataset = pd.read_csv(filename)
    
    # Drop the original headers in the dataset, use the first row as the new headers
    dataset = dataset.rename(columns=dataset.iloc[0]).drop(dataset.index[0]).reset_index(drop=True)
    
    # Drop the last row ('END OF REPORT') in the dataset
    dataset = dataset.iloc[:-1]
    
    # Drop the last 3 columns in the dataset
    dataset = dataset.iloc[:, :-3]
    
    # Drop the 1st, 2nd, 3rd, and 4th columns
    dataset = dataset.iloc[:, 4:]
    
    # Change SCADAVALUE to numeric
    dataset['SCADAVALUE'] = pd.to_numeric(dataset['SCADAVALUE'], errors='coerce')
    
    # Change SETTLEMENTDATE to datetime
    dataset['SETTLEMENTDATE'] = pd.to_datetime(dataset['SETTLEMENTDATE'])
    
    # Add a new column 'Month'
    dataset['Month'] = int(filename[4:6])  # Extract the month from the filename
    
    # Append the monthly dataset to the list
    monthly_dataframes.append(dataset)

  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)


In [5]:
# Concatenate the monthly dataframes vertically to get the yearly electricity generation dataframe
yearly_df = pd.concat(monthly_dataframes, ignore_index=True)

yearly_df

Unnamed: 0,SETTLEMENTDATE,DUID,SCADAVALUE,Month
0,2020-01-01 00:05:00,BARCSF1,0.100000,1
1,2020-01-01 00:05:00,BUTLERSG,11.199999,1
2,2020-01-01 00:05:00,CALL_A_4,0.000000,1
3,2020-01-01 00:05:00,CAPTL_WF,70.840004,1
4,2020-01-01 00:05:00,CATHROCK,0.000000,1
...,...,...,...,...
36882689,2021-01-01 00:00:00,YARANSF1,0.000000,12
36882690,2021-01-01 00:00:00,YARWUN_1,114.350000,12
36882691,2021-01-01 00:00:00,YATSF1,0.000000,12
36882692,2021-01-01 00:00:00,YENDWF1,13.880000,12


In [6]:
# 2020 yearly elecrtricity generation aggragation by 'Month' and'DUID' 
# Group by 'Month' and 'DUID', and sum the 'SCADAVALUE' - i.e. the capacity of generator
aggregated_df = yearly_df.groupby(['Month', 'DUID'])['SCADAVALUE'].sum().reset_index()
aggregated_df

Unnamed: 0,Month,DUID,SCADAVALUE
0,1,AGLHAL,1.780565e+04
1,1,AGLSOM,7.314150e+04
2,1,ANGAST1,2.335100e+03
3,1,ARWF1,6.474134e+05
4,1,BALBG1,4.586680e+03
...,...,...,...
4208,12,YSWF1,7.911740e+04
4209,12,YWPS1,2.558718e+06
4210,12,YWPS2,2.098455e+06
4211,12,YWPS3,2.220697e+06


In [7]:
aggregated_df.isnull().sum()

Month         0
DUID          0
SCADAVALUE    0
dtype: int64

In [8]:
#Save the aggregated sheet as the yearly aggregated dataset
aggregated_df.to_csv('aggregatedDUID_2020.csv', index=False)

In [9]:
#Open the dataset of the generator information - 'Generators.csv'
file_generators = 'Generators.csv'
generators = pd.read_csv(file_generators)
generators

Unnamed: 0,DUID,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW),Unnamed: 6,Unnamed: 7
0,ADPBA1G,Adelaide Desalination Plant,SA,Generator,Battery storage,5.45,,
1,ADPMH1,Adelaide Desalination Plant,SA,Generator,Hydro,0.72,,
2,ADPPV3,Adelaide Desalination Plant,SA,Generator,Solar,0.02,,
3,ADPPV2,Adelaide Desalination Plant,SA,Generator,Solar,0.1,,
4,ADPPV1,Adelaide Desalination Plant,SA,Generator,Solar,2.75,,
...,...,...,...,...,...,...,...,...
511,WANDBL1,Wandoan Battery Energy Storage System (BESS),QLD,Load,,2.94,,
512,PUMP1,Wivenhoe Power Station,QLD,Load,-,240,,
513,PUMP2,Wivenhoe Power Station,QLD,Load,-,240,,
514,MACKAYGT,Mackay Gas Turbine,QLD,Generator,Fossil,34,,


In [10]:
#Drop'Unnamed' columns with null values
generators.drop(generators.columns[6:8], axis=1, inplace=True)
generators

Unnamed: 0,DUID,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,ADPBA1G,Adelaide Desalination Plant,SA,Generator,Battery storage,5.45
1,ADPMH1,Adelaide Desalination Plant,SA,Generator,Hydro,0.72
2,ADPPV3,Adelaide Desalination Plant,SA,Generator,Solar,0.02
3,ADPPV2,Adelaide Desalination Plant,SA,Generator,Solar,0.1
4,ADPPV1,Adelaide Desalination Plant,SA,Generator,Solar,2.75
...,...,...,...,...,...,...
511,WANDBL1,Wandoan Battery Energy Storage System (BESS),QLD,Load,,2.94
512,PUMP1,Wivenhoe Power Station,QLD,Load,-,240
513,PUMP2,Wivenhoe Power Station,QLD,Load,-,240
514,MACKAYGT,Mackay Gas Turbine,QLD,Generator,Fossil,34


In [11]:
#Merge aggregated_df and generators using LEFT JOIN
merged_df = aggregated_df.merge(generators, on='DUID', how='left')
merged_df

Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,1,AGLHAL,1.780565e+04,Hallett Power Station,SA,Generator,Fossil,228.6
1,1,AGLSOM,7.314150e+04,Somerton Power Station,VIC,Generator,Fossil,42.5
2,1,ANGAST1,2.335100e+03,Angaston Power Station,SA,Generator,Fossil,1.667
3,1,ARWF1,6.474134e+05,Ararat Wind Farm,VIC,Generator,Wind,9.03
4,1,BALBG1,4.586680e+03,Ballarat Battery Energy Storage System,VIC,Generator,Battery storage,1.764
...,...,...,...,...,...,...,...,...
4220,12,YSWF1,7.911740e+04,Yaloak South Wind Farm,VIC,Generator,Wind,2.05
4221,12,YWPS1,2.558718e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
4222,12,YWPS2,2.098455e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
4223,12,YWPS3,2.220697e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,380


In [12]:
merged_df.isnull().sum()

Month                      0
DUID                       0
SCADAVALUE                 0
Station Name             438
Region                   438
Dispatch Type            438
Fuel Source - Primary    450
Unit Size (MW)           438
dtype: int64

In [13]:
merged_df.head()

Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,1,AGLHAL,17805.64937,Hallett Power Station,SA,Generator,Fossil,228.6
1,1,AGLSOM,73141.49987,Somerton Power Station,VIC,Generator,Fossil,42.5
2,1,ANGAST1,2335.1,Angaston Power Station,SA,Generator,Fossil,1.667
3,1,ARWF1,647413.40034,Ararat Wind Farm,VIC,Generator,Wind,9.03
4,1,BALBG1,4586.68,Ballarat Battery Energy Storage System,VIC,Generator,Battery storage,1.764


In [14]:
#Drop the rows of which the value in the 'SCADAVALUE' equals or is less than 0
#Drop the rows of which the value in the 'Dispatch Type' equals 'Load' - i.e. they are not generators

temp_df = merged_df[(merged_df['SCADAVALUE'] > 0) & (merged_df['Dispatch Type'] !='Load')]
temp_df

Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,1,AGLHAL,1.780565e+04,Hallett Power Station,SA,Generator,Fossil,228.6
1,1,AGLSOM,7.314150e+04,Somerton Power Station,VIC,Generator,Fossil,42.5
2,1,ANGAST1,2.335100e+03,Angaston Power Station,SA,Generator,Fossil,1.667
3,1,ARWF1,6.474134e+05,Ararat Wind Farm,VIC,Generator,Wind,9.03
4,1,BALBG1,4.586680e+03,Ballarat Battery Energy Storage System,VIC,Generator,Battery storage,1.764
...,...,...,...,...,...,...,...,...
4220,12,YSWF1,7.911740e+04,Yaloak South Wind Farm,VIC,Generator,Wind,2.05
4221,12,YWPS1,2.558718e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
4222,12,YWPS2,2.098455e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
4223,12,YWPS3,2.220697e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,380


In [15]:
temp_df.isnull().sum()

Month                     0
DUID                      0
SCADAVALUE                0
Station Name             40
Region                   40
Dispatch Type            40
Fuel Source - Primary    40
Unit Size (MW)           40
dtype: int64

In [16]:
#Add a new column 'Electricity Generation(MWh)'
temp_df['Electricity Generation (MWh)'] = round(temp_df['SCADAVALUE'] * (5/60), 2)
temp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Electricity Generation (MWh)'] = round(temp_df['SCADAVALUE'] * (5/60), 2)


Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW),Electricity Generation (MWh)
0,1,AGLHAL,1.780565e+04,Hallett Power Station,SA,Generator,Fossil,228.6,1483.80
1,1,AGLSOM,7.314150e+04,Somerton Power Station,VIC,Generator,Fossil,42.5,6095.12
2,1,ANGAST1,2.335100e+03,Angaston Power Station,SA,Generator,Fossil,1.667,194.59
3,1,ARWF1,6.474134e+05,Ararat Wind Farm,VIC,Generator,Wind,9.03,53951.12
4,1,BALBG1,4.586680e+03,Ballarat Battery Energy Storage System,VIC,Generator,Battery storage,1.764,382.22
...,...,...,...,...,...,...,...,...,...
4220,12,YSWF1,7.911740e+04,Yaloak South Wind Farm,VIC,Generator,Wind,2.05,6593.12
4221,12,YWPS1,2.558718e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360,213226.48
4222,12,YWPS2,2.098455e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360,174871.29
4223,12,YWPS3,2.220697e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,380,185058.05


In [17]:
temp_df.to_csv('temp_2020.csv', index=False)