In [1]:
import pandas as pd

In [2]:
# Names of the twelve monthly source files for 2021, January through December,
# each following the pattern '2021MM-PV.CSV' with a zero-padded month number
dataset_filenames = [f'2021{month_number:02d}-PV.CSV' for month_number in range(1, 13)]

In [3]:
# Empty accumulator that will hold one cleaned dataframe per monthly file
monthly_dataframes = list()

In [4]:
# Region-level IDs to keep, mapped to their short names (e.g. 'NSW1' -> 'NSW').
# Reason for keeping only these: each of them is the sum of other IDs
# (such as QLDC, QLDS, QLDN, etc.), so keeping the others too would double count.
region_names = {'NSW1': 'NSW', 'QLD1': 'QLD', 'VIC1': 'VIC', 'SA1': 'SA', 'TAS1': 'TAS'}


def load_monthly_pv(filename):
    """Read one monthly rooftop-PV CSV file and return its cleaned measurement rows.

    Parameters
    ----------
    filename : str
        Name of the monthly file, e.g. '202101-PV.CSV'. Characters 4-5 of this
        string are read as the month number, so a directory prefix is not
        supported here.

    Returns
    -------
    pandas.DataFrame
        An independent frame with the columns 'INTERVAL_DATETIME' (datetime),
        'REGIONID' (short region name), 'POWER' (numeric; values that cannot be
        parsed become NaN), 'TYPE' (always 'MEASUREMENT') and 'Month' (int).
        The row index is the one left over from filtering; it is not reset.

    Raises
    ------
    ValueError
        If characters 4-5 of ``filename`` are not a number.
    """
    raw = pd.read_csv(filename)

    # The real column names sit in the first data row, not in the line pandas
    # used as header: promote that row to the header and drop it from the data.
    dataset = raw.rename(columns=raw.iloc[0]).drop(raw.index[0]).reset_index(drop=True)

    # Drop the last row ('END OF REPORT') in the dataset
    dataset = dataset.iloc[:-1]

    # Drop the 1st-4th, 8th and 10th columns
    # (irrelevant columns: 'I', 'ROOFTOP', 'ACTUAL', '2', 'QI', 'LASTCHANGED')
    columns_to_drop = dataset.columns[[0, 1, 2, 3, 7, 9]]
    dataset = dataset.drop(columns_to_drop, axis=1)

    # Keep only the region-level rows of TYPE 'MEASUREMENT'.
    # Reason: 'MEASUREMENT' is the actual measured 'POWER', whereas 'SATELLITE'
    # is an estimate of it.
    is_region_total = dataset['REGIONID'].isin(list(region_names))
    is_measurement = dataset['TYPE'] == 'MEASUREMENT'
    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below do not write into a slice of the parent frame
    # (which is what raises pandas' SettingWithCopyWarning).
    dataset = dataset[is_region_total & is_measurement].copy()

    # Clean the 'REGIONID' column: e.g. 'NSW1' to 'NSW'
    dataset['REGIONID'] = dataset['REGIONID'].replace(region_names)

    # Change POWER to numeric (unparseable values become NaN rather than raising)
    dataset['POWER'] = pd.to_numeric(dataset['POWER'], errors='coerce')

    # Change INTERVAL_DATETIME to datetime
    dataset['INTERVAL_DATETIME'] = pd.to_datetime(dataset['INTERVAL_DATETIME'])

    # Add a new column 'Month', taken from the filename rather than from the
    # timestamps: every row of a file gets that file's month, including rows
    # whose INTERVAL_DATETIME already falls in the following month.
    dataset['Month'] = int(filename[4:6])

    return dataset


# Rebuild the list from scratch so that re-running this cell does not append
# every month a second time.
monthly_dataframes = []
for filename in dataset_filenames:
    dataset = load_monthly_pv(filename)
    monthly_dataframes.append(dataset)

In [5]:
# Stack the cleaned monthly frames on top of each other into one yearly
# electricity generation dataframe, renumbering the rows from 0
yearly_df = pd.concat(objs=monthly_dataframes, axis=0, ignore_index=True)
yearly_df

Unnamed: 0,INTERVAL_DATETIME,REGIONID,POWER,TYPE,Month
0,2021-01-01 00:30:00,NSW,0.0,MEASUREMENT,1
1,2021-01-01 00:30:00,QLD,0.0,MEASUREMENT,1
2,2021-01-01 00:30:00,SA,0.0,MEASUREMENT,1
3,2021-01-01 00:30:00,TAS,0.0,MEASUREMENT,1
4,2021-01-01 00:30:00,VIC,0.0,MEASUREMENT,1
...,...,...,...,...,...
90475,2022-01-02 00:00:00,NSW,0.0,MEASUREMENT,12
90476,2022-01-02 00:00:00,QLD,0.0,MEASUREMENT,12
90477,2022-01-02 00:00:00,SA,0.0,MEASUREMENT,12
90478,2022-01-02 00:00:00,TAS,0.0,MEASUREMENT,12


In [9]:
# Keep every row whose 'INTERVAL_DATETIME' does not fall in calendar year 2022
interval_year = yearly_df['INTERVAL_DATETIME'].dt.year
yearly_df = yearly_df[interval_year != 2022]
yearly_df

Unnamed: 0,INTERVAL_DATETIME,REGIONID,POWER,TYPE,Month
0,2021-01-01 00:30:00,NSW,0.0,MEASUREMENT,1
1,2021-01-01 00:30:00,QLD,0.0,MEASUREMENT,1
2,2021-01-01 00:30:00,SA,0.0,MEASUREMENT,1
3,2021-01-01 00:30:00,TAS,0.0,MEASUREMENT,1
4,2021-01-01 00:30:00,VIC,0.0,MEASUREMENT,1
...,...,...,...,...,...
90230,2021-12-31 23:30:00,NSW,0.0,MEASUREMENT,12
90231,2021-12-31 23:30:00,QLD,0.0,MEASUREMENT,12
90232,2021-12-31 23:30:00,SA,0.0,MEASUREMENT,12
90233,2021-12-31 23:30:00,TAS,0.0,MEASUREMENT,12


In [10]:
# 2021 rooftop PV electricity generation, aggregated by 'Month' and 'REGIONID':
# first sum 'POWER' within each (Month, REGIONID) group, then turn the group
# keys back into ordinary columns
power_by_month_region = yearly_df.groupby(['Month', 'REGIONID'])['POWER'].sum()
aggregated_df = power_by_month_region.reset_index()
aggregated_df

Unnamed: 0,Month,REGIONID,POWER
0,1,NSW,988500.651
1,1,QLD,995889.624
2,1,SA,517207.833
3,1,TAS,54599.012
4,1,VIC,717042.946
5,2,NSW,833484.738
6,2,QLD,855279.177
7,2,SA,420196.076
8,2,TAS,46969.914
9,2,VIC,622064.339


In [11]:
# Count the missing values in each column of the aggregated frame
aggregated_df.isna().sum()

Month       0
REGIONID    0
POWER       0
dtype: int64

In [13]:
# Add a new column 'Electricity Generation (MWh)': the summed 'POWER' scaled by
# 30/60 (30 minutes expressed in hours) and rounded to 2 decimal places.
# NOTE(review): this presumes every reading covers a half-hour interval - confirm against the source data.
hours_per_interval = 30 / 60
generation_mwh = (aggregated_df['POWER'] * hours_per_interval).round(2)
aggregated_df['Electricity Generation (MWh)'] = generation_mwh
aggregated_df

Unnamed: 0,Month,REGIONID,POWER,Electricity Generation (MWh)
0,1,NSW,988500.651,494250.33
1,1,QLD,995889.624,497944.81
2,1,SA,517207.833,258603.92
3,1,TAS,54599.012,27299.51
4,1,VIC,717042.946,358521.47
5,2,NSW,833484.738,416742.37
6,2,QLD,855279.177,427639.59
7,2,SA,420196.076,210098.04
8,2,TAS,46969.914,23484.96
9,2,VIC,622064.339,311032.17


In [14]:
# Write the aggregated table to disk as the yearly aggregated dataset,
# leaving the row index out of the file
output_csv = 'aggregatedREGIONID_PV_2021.csv'
aggregated_df.to_csv(output_csv, index=False)