In [1]:
import pandas as pd

In [2]:
# List of 12 monthly dataset filenames
dataset_filenames = [
    '201801.CSV',
    '201802.CSV',
    '201803.CSV',
    '201804.CSV',
    '201805.CSV',
    '201806.CSV',
    '201807.CSV',
    '201808.CSV',
    '201809.CSV',
    '201810.CSV',
    '201811.CSV',
    '201812.CSV'
]

In [3]:
# List to store the monthly dataframes
monthly_dataframes = []

In [4]:
# Iterate over each dataset filename
for filename in dataset_filenames:
    # Open the monthly dataset
    dataset = pd.read_csv(filename)
    
    # Drop the original headers in the dataset, use the first row as the new headers
    dataset = dataset.rename(columns=dataset.iloc[0]).drop(dataset.index[0]).reset_index(drop=True)
    
    # Drop the last row ('END OF REPORT') in the dataset
    dataset = dataset.iloc[:-1]
    
    # Drop the last 3 columns in the dataset
    dataset = dataset.iloc[:, :-3]
    
    # Drop the 1st, 2nd, 3rd, and 4th columns
    dataset = dataset.iloc[:, 4:]
    
    # Change SCADAVALUE to numeric
    dataset['SCADAVALUE'] = pd.to_numeric(dataset['SCADAVALUE'], errors='coerce')
    
    # Change SETTLEMENTDATE to datetime
    dataset['SETTLEMENTDATE'] = pd.to_datetime(dataset['SETTLEMENTDATE'])
    
    # Add a new column 'Month'
    dataset['Month'] = int(filename[4:6])  # Extract the month from the filename
    
    # Append the monthly dataset to the list
    monthly_dataframes.append(dataset)

  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)
  dataset = pd.read_csv(filename)


In [5]:
# Concatenate the monthly dataframes vertically to get the yearly electricity generation dataframe
yearly_df = pd.concat(monthly_dataframes, ignore_index=True)
yearly_df

Unnamed: 0,SETTLEMENTDATE,DUID,SCADAVALUE,Month
0,2018-01-01 00:05:00,BARCSF1,1.300000,1
1,2018-01-01 00:05:00,BUTLERSG,9.599998,1
2,2018-01-01 00:05:00,CALL_A_4,0.000000,1
3,2018-01-01 00:05:00,CAPTL_WF,0.000000,1
4,2018-01-01 00:05:00,CATHROCK,4.270000,1
...,...,...,...,...
31254680,2019-01-01 00:00:00,YARWUN_1,140.360000,12
31254681,2019-01-01 00:00:00,YWPS1,366.665830,12
31254682,2019-01-01 00:00:00,YWPS2,374.686070,12
31254683,2019-01-01 00:00:00,YWPS3,0.000000,12


In [6]:
# 2018 yearly elecrtricity generation aggragation by 'Month' and'DUID' 
# Group by 'Month' and 'DUID', and sum the 'SCADAVALUE' - i.e. the capacity of generator
aggregated_df = yearly_df.groupby(['Month', 'DUID'])['SCADAVALUE'].sum().reset_index()
aggregated_df

Unnamed: 0,Month,DUID,SCADAVALUE
0,1,AGLHAL,5.993667e+04
1,1,AGLSOM,3.920060e+04
2,1,ANGAST1,1.468031e+04
3,1,ARWF1,6.437637e+05
4,1,BALDHWF1,2.749672e+05
...,...,...,...
3585,12,YSWF1,5.909640e+04
3586,12,YWPS1,2.514930e+06
3587,12,YWPS2,2.912853e+06
3588,12,YWPS3,0.000000e+00


In [7]:
aggregated_df.isnull().sum()

Month         0
DUID          0
SCADAVALUE    0
dtype: int64

In [8]:
#Save the aggregated sheet as the yearly aggregated dataset
aggregated_df.to_csv('aggregatedDUID_2018.csv', index=False)

In [9]:
#Open the dataset of the generator information - 'Generators.csv'

file_generators = 'Generators.csv'
generators = pd.read_csv(file_generators)
generators

Unnamed: 0,DUID,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW),Unnamed: 6,Unnamed: 7
0,ADPBA1G,Adelaide Desalination Plant,SA,Generator,Battery storage,5.45,,
1,ADPMH1,Adelaide Desalination Plant,SA,Generator,Hydro,0.72,,
2,ADPPV3,Adelaide Desalination Plant,SA,Generator,Solar,0.02,,
3,ADPPV2,Adelaide Desalination Plant,SA,Generator,Solar,0.1,,
4,ADPPV1,Adelaide Desalination Plant,SA,Generator,Solar,2.75,,
...,...,...,...,...,...,...,...,...
512,PUMP1,Wivenhoe Power Station,QLD,Load,-,240,,
513,PUMP2,Wivenhoe Power Station,QLD,Load,-,240,,
514,MACKAYGT,Mackay Gas Turbine,QLD,Generator,Fossil,34,,
515,LD03,Liddell Power Station,NSW,Generator,Fossil,30,,


In [10]:
#Drop'Unnamed' columns with null values
generators.drop(generators.columns[6:8], axis=1, inplace=True)
generators

Unnamed: 0,DUID,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,ADPBA1G,Adelaide Desalination Plant,SA,Generator,Battery storage,5.45
1,ADPMH1,Adelaide Desalination Plant,SA,Generator,Hydro,0.72
2,ADPPV3,Adelaide Desalination Plant,SA,Generator,Solar,0.02
3,ADPPV2,Adelaide Desalination Plant,SA,Generator,Solar,0.1
4,ADPPV1,Adelaide Desalination Plant,SA,Generator,Solar,2.75
...,...,...,...,...,...,...
512,PUMP1,Wivenhoe Power Station,QLD,Load,-,240
513,PUMP2,Wivenhoe Power Station,QLD,Load,-,240
514,MACKAYGT,Mackay Gas Turbine,QLD,Generator,Fossil,34
515,LD03,Liddell Power Station,NSW,Generator,Fossil,30


In [11]:
#Merge aggregated_df and generators using LEFT JOIN
merged_df = aggregated_df.merge(generators, on='DUID', how='left')
merged_df

Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,1,AGLHAL,5.993667e+04,Hallett Power Station,SA,Generator,Fossil,228.6
1,1,AGLSOM,3.920060e+04,Somerton Power Station,VIC,Generator,Fossil,42.5
2,1,ANGAST1,1.468031e+04,Angaston Power Station,SA,Generator,Fossil,1.667
3,1,ARWF1,6.437637e+05,Ararat Wind Farm,VIC,Generator,Wind,9.03
4,1,BALDHWF1,2.749672e+05,Bald Hills Wind Farm,VIC,Generator,Wind,2.05
...,...,...,...,...,...,...,...,...
3597,12,YSWF1,5.909640e+04,Yaloak South Wind Farm,VIC,Generator,Wind,2.05
3598,12,YWPS1,2.514930e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
3599,12,YWPS2,2.912853e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
3600,12,YWPS3,0.000000e+00,Yallourn 'W' Power Station,VIC,Generator,Fossil,380


In [12]:
merged_df.isnull().sum()

Month                      0
DUID                       0
SCADAVALUE                 0
Station Name             432
Region                   432
Dispatch Type            432
Fuel Source - Primary    444
Unit Size (MW)           432
dtype: int64

In [14]:
#Drop the rows of which the value in the 'SCADAVALUE' equals or is less than 0
#Drop the rows of which the value in the 'Dispatch Type' equals 'Load' - i.e. they are not generators

temp_df = merged_df[(merged_df['SCADAVALUE'] > 0) & (merged_df['Dispatch Type'] !='Load')]
temp_df

Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW)
0,1,AGLHAL,5.993667e+04,Hallett Power Station,SA,Generator,Fossil,228.6
1,1,AGLSOM,3.920060e+04,Somerton Power Station,VIC,Generator,Fossil,42.5
2,1,ANGAST1,1.468031e+04,Angaston Power Station,SA,Generator,Fossil,1.667
3,1,ARWF1,6.437637e+05,Ararat Wind Farm,VIC,Generator,Wind,9.03
4,1,BALDHWF1,2.749672e+05,Bald Hills Wind Farm,VIC,Generator,Wind,2.05
...,...,...,...,...,...,...,...,...
3596,12,YARWUN_1,1.174888e+06,Yarwun Power Station,QLD,Generator,Fossil,154
3597,12,YSWF1,5.909640e+04,Yaloak South Wind Farm,VIC,Generator,Wind,2.05
3598,12,YWPS1,2.514930e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360
3599,12,YWPS2,2.912853e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360


In [15]:
temp_df.isnull().sum()

Month                     0
DUID                      0
SCADAVALUE                0
Station Name             53
Region                   53
Dispatch Type            53
Fuel Source - Primary    53
Unit Size (MW)           53
dtype: int64

In [16]:
#Add a new column 'Electricity Generation(MWh)'
temp_df['Electricity Generation (MWh)'] = round(temp_df['SCADAVALUE'] * (5/60), 2)
temp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Electricity Generation (MWh)'] = round(temp_df['SCADAVALUE'] * (5/60), 2)


Unnamed: 0,Month,DUID,SCADAVALUE,Station Name,Region,Dispatch Type,Fuel Source - Primary,Unit Size (MW),Electricity Generation (MWh)
0,1,AGLHAL,5.993667e+04,Hallett Power Station,SA,Generator,Fossil,228.6,4994.72
1,1,AGLSOM,3.920060e+04,Somerton Power Station,VIC,Generator,Fossil,42.5,3266.72
2,1,ANGAST1,1.468031e+04,Angaston Power Station,SA,Generator,Fossil,1.667,1223.36
3,1,ARWF1,6.437637e+05,Ararat Wind Farm,VIC,Generator,Wind,9.03,53646.98
4,1,BALDHWF1,2.749672e+05,Bald Hills Wind Farm,VIC,Generator,Wind,2.05,22913.93
...,...,...,...,...,...,...,...,...,...
3596,12,YARWUN_1,1.174888e+06,Yarwun Power Station,QLD,Generator,Fossil,154,97907.33
3597,12,YSWF1,5.909640e+04,Yaloak South Wind Farm,VIC,Generator,Wind,2.05,4924.70
3598,12,YWPS1,2.514930e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360,209577.51
3599,12,YWPS2,2.912853e+06,Yallourn 'W' Power Station,VIC,Generator,Fossil,360,242737.74


In [17]:
temp_df.to_csv('temp_2018.csv', index=False)