In [1]:
import pandas as pd

In [2]:
# Names of the 12 monthly input files for 2019, '201901-PV.CSV' .. '201912-PV.CSV',
# generated from the month number instead of being typed out one by one
dataset_filenames = [f'2019{month_number:02d}-PV.CSV' for month_number in range(1, 13)]

In [3]:
# Accumulator for the cleaned per-month dataframes (starts empty)
monthly_dataframes = list()

In [4]:
def _load_monthly_pv(filename):
    """Load one monthly rooftop-PV CSV and return its cleaned regional measurements.

    Parameters
    ----------
    filename : str
        Name of a monthly file such as '201901-PV.CSV'. Characters 4-5 of the
        name are parsed as the month number, so a bare filename (no directory
        prefix) is expected.

    Returns
    -------
    pandas.DataFrame
        One row per kept record with the columns INTERVAL_DATETIME (datetime),
        REGIONID (trailing '1' removed), POWER (numeric; unparseable values
        become NaN), TYPE and Month.
    """
    raw = pd.read_csv(filename)

    # Drop the original headers in the dataset, use the first row as the new headers
    body = raw.rename(columns=raw.iloc[0]).drop(raw.index[0]).reset_index(drop=True)

    # Drop the last row ('END OF REPORT') in the dataset
    body = body.iloc[:-1]

    # Drop the 1st-4th, 8th and 10th columns
    # (irrelevant columns: 'I', 'ROOFTOP', 'ACTUAL', '2', 'QI', 'LASTCHANGED').
    # They are selected by position rather than by label because the labels
    # come from a data row and are not guaranteed to all be strings.
    body = body.drop(columns=body.columns[[0, 1, 2, 3, 7, 9]])

    # Keep all rows with 'REGIONID' as: NSW1, QLD1, SA1, VIC1, TAS1;
    # Reason: all these Region IDs are the sum of other IDs, such as QLDC, QLDS, QLDN, etc
    in_kept_region = body['REGIONID'].isin(['NSW1', 'QLD1', 'VIC1', 'TAS1', 'SA1'])

    # Keep all records with 'TYPE' as: 'MEASUREMENT';
    # Reason: 'MEASUREMENT' is the actual measured results of 'POWER',
    # 'SATELLITE' is the estimated results of 'POWER'
    is_measurement = body['TYPE'] == 'MEASUREMENT'

    # Take an explicit copy of the filtered rows: the column assignments below
    # would otherwise write to a slice of `body` (SettingWithCopyWarning).
    measured = body[in_kept_region & is_measurement].copy()

    # Clean the 'REGIONID' column: e.g. 'NSW1' to 'NSW'
    measured['REGIONID'] = measured['REGIONID'].replace(
        {'NSW1': 'NSW', 'QLD1': 'QLD', 'VIC1': 'VIC', 'SA1': 'SA', 'TAS1': 'TAS'}
    )

    # Change POWER to numeric
    measured['POWER'] = pd.to_numeric(measured['POWER'], errors='coerce')

    # Change INTERVAL_DATETIME to datetime
    measured['INTERVAL_DATETIME'] = pd.to_datetime(measured['INTERVAL_DATETIME'])

    # Add a new column 'Month', extracted from the filename (not from the timestamps)
    measured['Month'] = int(filename[4:6])

    return measured


# Build the list from scratch here instead of appending to a list created in
# another cell: re-running this cell then cannot add the same month twice.
monthly_dataframes = [_load_monthly_pv(filename) for filename in dataset_filenames]

In [5]:
# Stack the monthly dataframes on top of each other into one yearly dataframe,
# renumbering the rows from 0 so the per-month indices do not repeat
yearly_df = pd.concat(objs=monthly_dataframes, axis=0, ignore_index=True)
yearly_df

Unnamed: 0,INTERVAL_DATETIME,REGIONID,POWER,TYPE,Month
0,2019-01-01 00:30:00,NSW,0.0,MEASUREMENT,1
1,2019-01-01 00:30:00,QLD,0.0,MEASUREMENT,1
2,2019-01-01 00:30:00,SA,0.0,MEASUREMENT,1
3,2019-01-01 00:30:00,TAS,0.0,MEASUREMENT,1
4,2019-01-01 00:30:00,VIC,0.0,MEASUREMENT,1
...,...,...,...,...,...
88315,2020-01-02 00:00:00,NSW,0.0,MEASUREMENT,12
88316,2020-01-02 00:00:00,QLD,0.0,MEASUREMENT,12
88317,2020-01-02 00:00:00,SA,0.0,MEASUREMENT,12
88318,2020-01-02 00:00:00,TAS,0.0,MEASUREMENT,12


In [6]:
# Keep only the rows whose 'INTERVAL_DATETIME' does not fall in calendar year 2020
yearly_df = yearly_df.loc[yearly_df['INTERVAL_DATETIME'].dt.year != 2020]
yearly_df

Unnamed: 0,INTERVAL_DATETIME,REGIONID,POWER,TYPE,Month
0,2019-01-01 00:30:00,NSW,0.0,MEASUREMENT,1
1,2019-01-01 00:30:00,QLD,0.0,MEASUREMENT,1
2,2019-01-01 00:30:00,SA,0.0,MEASUREMENT,1
3,2019-01-01 00:30:00,TAS,0.0,MEASUREMENT,1
4,2019-01-01 00:30:00,VIC,0.0,MEASUREMENT,1
...,...,...,...,...,...
88070,2019-12-31 23:30:00,NSW,0.0,MEASUREMENT,12
88071,2019-12-31 23:30:00,QLD,0.0,MEASUREMENT,12
88072,2019-12-31 23:30:00,SA,0.0,MEASUREMENT,12
88073,2019-12-31 23:30:00,TAS,0.0,MEASUREMENT,12


In [7]:
# 2019 yearly rooftop PV electricity generation, aggregated by 'Month' and 'REGIONID':
# total the 'POWER' readings within each (Month, REGIONID) group and keep the
# grouping keys as ordinary columns
aggregated_df = yearly_df.groupby(['Month', 'REGIONID'], as_index=False)['POWER'].sum()
aggregated_df

Unnamed: 0,Month,REGIONID,POWER
0,1,NSW,529016.461
1,1,QLD,692482.212
2,1,SA,330449.519
3,1,TAS,45592.918
4,1,VIC,472643.335
5,2,NSW,460096.405
6,2,QLD,565912.253
7,2,SA,286275.677
8,2,TAS,33731.029
9,2,VIC,389092.132


In [8]:
# Count the missing values in each column of the aggregated table
aggregated_df.isna().sum()

Month       0
REGIONID    0
POWER       0
dtype: int64

In [9]:
# Add a new column 'Electricity Generation (MWh)': the summed 'POWER' scaled by
# 30/60 and rounded to 2 decimals.
# NOTE(review): the 30/60 factor assumes each POWER reading is in MW and covers a
# 30-minute interval - confirm against the data source.
aggregated_df['Electricity Generation (MWh)'] = aggregated_df['POWER'].mul(30 / 60).round(2)
aggregated_df

Unnamed: 0,Month,REGIONID,POWER,Electricity Generation (MWh)
0,1,NSW,529016.461,264508.23
1,1,QLD,692482.212,346241.11
2,1,SA,330449.519,165224.76
3,1,TAS,45592.918,22796.46
4,1,VIC,472643.335,236321.67
5,2,NSW,460096.405,230048.2
6,2,QLD,565912.253,282956.13
7,2,SA,286275.677,143137.84
8,2,TAS,33731.029,16865.51
9,2,VIC,389092.132,194546.07


In [10]:
# Write the aggregated table to disk as the yearly aggregated dataset
# (the dataframe index is not written to the file)
aggregated_df.to_csv(path_or_buf='aggregatedREGIONID_PV_2019.csv', index=False)