In [1]:
import pandas as pd

In [2]:
# Filenames of the twelve monthly rooftop-PV extracts for 2020,
# i.e. '202001-PV.CSV' through '202012-PV.CSV'.
dataset_filenames = [f'2020{month_number:02d}-PV.CSV' for month_number in range(1, 13)]

In [3]:
# Accumulator for the cleaned per-month dataframes (filled by the loading loop).
monthly_dataframes = list()

In [4]:
# Load, clean and collect every monthly dataset listed in dataset_filenames.
# Each cleaned frame ends up with the columns INTERVAL_DATETIME, REGIONID,
# POWER, TYPE and Month, and is appended to monthly_dataframes.

# Region IDs to keep. Reason: these IDs are the sum of the other IDs
# (such as QLDC, QLDS, QLDN, etc.), so keeping both would double count.
regions_to_keep = ['NSW1', 'QLD1', 'VIC1', 'TAS1', 'SA1']

# Mapping used to clean the 'REGIONID' column, e.g. 'NSW1' -> 'NSW'.
region_labels = {'NSW1': 'NSW', 'QLD1': 'QLD', 'VIC1': 'VIC', 'SA1': 'SA', 'TAS1': 'TAS'}

for filename in dataset_filenames:
    # Open the monthly dataset
    raw = pd.read_csv(filename)

    # The header parsed by read_csv is not the real one: promote the first
    # data row to be the column names, then drop that row.
    dataset = raw.rename(columns=raw.iloc[0]).drop(raw.index[0]).reset_index(drop=True)

    # Drop the last row ('END OF REPORT') in the dataset
    dataset = dataset.iloc[:-1]

    # Drop the 1st-4th, 8th and 10th columns
    # (irrelevant columns: 'I', 'ROOFTOP', 'ACTUAL', '2', 'QI', 'LASTCHANGED')
    columns_to_drop = dataset.columns[[0, 1, 2, 3, 7, 9]]
    dataset = dataset.drop(columns_to_drop, axis=1)

    # Keep only the aggregate regions and only 'MEASUREMENT' records.
    # Reason: 'MEASUREMENT' is the actual measured 'POWER', whereas
    # 'SATELLITE' is an estimated result.
    is_wanted_row = dataset['REGIONID'].isin(regions_to_keep) & (dataset['TYPE'] == 'MEASUREMENT')

    # Take an explicit copy: assigning columns on a boolean-filtered frame
    # otherwise triggers pandas' SettingWithCopyWarning.
    dataset = dataset.loc[is_wanted_row].copy()

    # Clean the 'REGIONID' column: e.g. 'NSW1' to 'NSW'
    dataset['REGIONID'] = dataset['REGIONID'].replace(region_labels)

    # Change POWER to numeric; values that cannot be parsed become NaN
    dataset['POWER'] = pd.to_numeric(dataset['POWER'], errors='coerce')

    # Change INTERVAL_DATETIME to datetime
    dataset['INTERVAL_DATETIME'] = pd.to_datetime(dataset['INTERVAL_DATETIME'])

    # Add a new column 'Month', taken from the 'YYYYMM' filename prefix.
    # NOTE(review): this is the month of the file, not of INTERVAL_DATETIME,
    # and it assumes filename is a bare name without a directory part.
    dataset['Month'] = int(filename[4:6])

    # Append the monthly dataset to the list
    monthly_dataframes.append(dataset)

In [5]:
# Stack the monthly frames on top of each other (fresh 0..n-1 index) to obtain
# the yearly electricity generation dataframe.
yearly_df = pd.concat(objs=monthly_dataframes, axis=0, ignore_index=True)
yearly_df

Unnamed: 0,INTERVAL_DATETIME,REGIONID,POWER,TYPE,Month
0,2020-01-01 00:30:00,NSW,0.0,MEASUREMENT,1
1,2020-01-01 00:30:00,QLD,0.0,MEASUREMENT,1
2,2020-01-01 00:30:00,SA,0.0,MEASUREMENT,1
3,2020-01-01 00:30:00,TAS,0.0,MEASUREMENT,1
4,2020-01-01 00:30:00,VIC,0.0,MEASUREMENT,1
...,...,...,...,...,...
90715,2021-01-02 00:00:00,NSW,0.0,MEASUREMENT,12
90716,2021-01-02 00:00:00,QLD,0.0,MEASUREMENT,12
90717,2021-01-02 00:00:00,SA,0.0,MEASUREMENT,12
90718,2021-01-02 00:00:00,TAS,0.0,MEASUREMENT,12


In [9]:
# Remove every row whose 'INTERVAL_DATETIME' falls in calendar year 2021.
interval_year = yearly_df['INTERVAL_DATETIME'].dt.year
yearly_df = yearly_df[interval_year.ne(2021)]
yearly_df

Unnamed: 0,INTERVAL_DATETIME,REGIONID,POWER,TYPE,Month
0,2020-01-01 00:30:00,NSW,0.0,MEASUREMENT,1
1,2020-01-01 00:30:00,QLD,0.0,MEASUREMENT,1
2,2020-01-01 00:30:00,SA,0.0,MEASUREMENT,1
3,2020-01-01 00:30:00,TAS,0.0,MEASUREMENT,1
4,2020-01-01 00:30:00,VIC,0.0,MEASUREMENT,1
...,...,...,...,...,...
90470,2020-12-31 23:30:00,NSW,0.0,MEASUREMENT,12
90471,2020-12-31 23:30:00,QLD,0.0,MEASUREMENT,12
90472,2020-12-31 23:30:00,SA,0.0,MEASUREMENT,12
90473,2020-12-31 23:30:00,TAS,0.0,MEASUREMENT,12


In [10]:
# 2020 rooftop PV electricity generation, aggregated by 'Month' and 'REGIONID':
# one row per (Month, REGIONID) pair holding the total of the 'POWER' readings.
aggregated_df = yearly_df.groupby(['Month', 'REGIONID'], as_index=False)['POWER'].sum()
aggregated_df

Unnamed: 0,Month,REGIONID,POWER
0,1,NSW,667420.844
1,1,QLD,839180.411
2,1,SA,390383.278
3,1,TAS,44923.17
4,1,VIC,560725.575
5,2,NSW,585165.391
6,2,QLD,683965.864
7,2,SA,348489.586
8,2,TAS,39975.783
9,2,VIC,492445.765


In [11]:
# Count the missing values in each column of the aggregated table.
aggregated_df.isna().sum()

Month       0
REGIONID    0
POWER       0
dtype: int64

In [13]:
# Add a new column 'Electricity Generation (MWh)': the summed POWER scaled by
# 30/60 and rounded to two decimals.
# NOTE(review): the 30/60 factor presumably converts MW readings taken at
# 30-minute intervals into MWh - confirm against the data source.
aggregated_df['Electricity Generation (MWh)'] = aggregated_df['POWER'].mul(30 / 60).round(2)
aggregated_df

Unnamed: 0,Month,REGIONID,POWER,Electricity Generation (MWh)
0,1,NSW,667420.844,333710.42
1,1,QLD,839180.411,419590.21
2,1,SA,390383.278,195191.64
3,1,TAS,44923.17,22461.58
4,1,VIC,560725.575,280362.79
5,2,NSW,585165.391,292582.7
6,2,QLD,683965.864,341982.93
7,2,SA,348489.586,174244.79
8,2,TAS,39975.783,19987.89
9,2,VIC,492445.765,246222.88


In [14]:
# Write the aggregated table to disk as the yearly aggregated dataset
# (the dataframe index is not written).
aggregated_df.to_csv(path_or_buf='aggregatedREGIONID_PV_2020.csv', index=False)