In [1]:
import os

import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell

# Show the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"
# Never truncate the column list when rendering wide DataFrames
pd.set_option("display.max_columns", None)

# Data Cleaning
## Import the data
Start by finding a list of all data files (ensuring that the found path is relative to where everything's run from).

Then combine all separate data files into single CSVs!

**Note that in the future this code may be simplified by using glob instead of os.listdir**

In [38]:
# os.listdir returns names in arbitrary order, so sort them to keep the row
# order of the concatenated frames reproducible between runs and machines.
energy_locations = sorted(os.listdir("../Data/Energy"))
temperature_locations = sorted(os.listdir("../Data/Temperature"))

# Every file in the energy folder is read as a CSV.
energy_CSVs = [pd.read_csv(os.path.join("../Data/Energy", location)) for location in energy_locations]
# Only files with "Data" in their name are read here; other files in the
# temperature folder (e.g. the station details file) are skipped.
temperature_CSVs = [
    pd.read_csv(os.path.join("../Data/Temperature", location))
    for location in temperature_locations
    if "Data" in location
]

In [39]:
# Stack the per-file frames into one frame per dataset, renumbering rows from 0
energy_data = pd.concat(energy_CSVs, ignore_index=True)
temperature_data = pd.concat(temperature_CSVs, ignore_index=True)

# Note some excess data is shaved off here for simplicity:
# the first 5 lines are skipped and only columns 1-20 are kept, named directly
# in the notebook's standard column style.
stations_data = pd.read_csv(
    "../Data/Temperature/HM01X_StnDet_999999999743964.txt",
    skiprows=5,
    header=None,
    usecols=range(1, 21),
    names=[
        "StationNumber",
        "RainfulDistrict",
        "Station",
        "StationOpenDate",
        "StationCloseDate",
        "Latitude",
        "Longitude",
        "LocationMethod",
        "State",
        "StationHeight",
        "BarometerHeight",
        "WMO",
        "FirstYear",
        "LastYear",
        "CompletionPercent",
        "YQualityPercent",
        "NQualityPercent",
        "WQualityPercent",
        "SQualityPercent",
        "IQualityPercent",
    ],
)

## Column standardising
Start by renaming all columns in a standardised way (written in simplest form with no spaces and first letters capitalised).
*Note this standardisation happens whilst importing the station details (so it is unnecessary here)!*

In [40]:
# Inspect the column names and inferred dtypes of each frame before renaming
# (all three results are displayed because ast_node_interactivity is "all")
energy_data.dtypes
temperature_data.dtypes
stations_data.dtypes

REGION             object
SETTLEMENTDATE     object
TOTALDEMAND       float64
RRP               float64
PERIODTYPE         object
dtype: object

hm                                                         object
Station Number                                              int64
Year Month Day Hour Minutes in YYYY                         int64
MM                                                          int64
DD                                                          int64
HH24                                                        int64
MI format in Local time                                     int64
Year Month Day Hour Minutes in YYYY.1                       int64
MM.1                                                        int64
DD.1                                                        int64
HH24.1                                                      int64
MI format in Local standard time                            int64
Precipitation since 9am local time in mm                   object
Quality of precipitation since 9am local time              object
Air Temperature in degrees C                               object
Quality of

StationNumber          int64
RainfulDistrict       object
Station               object
StationOpenDate       object
StationCloseDate      object
Latitude             float64
Longitude            float64
LocationMethod        object
State                 object
StationHeight        float64
BarometerHeight      float64
WMO                    int64
FirstYear              int64
LastYear               int64
CompletionPercent      int64
YQualityPercent        int64
NQualityPercent        int64
WQualityPercent        int64
SQualityPercent        int64
IQualityPercent        int64
dtype: object

In [41]:
# Columns are renamed by position, so each list must match the order of the
# columns in the loaded frame.
energy_data.columns = [
    "Region",
    "Date",
    "TotalDemand",
    "RRP",
    "PeriodType",
]
temperature_data.columns = [
    "HM", "StationNumber",
    # First date block (local time) gets the "1" suffix
    "Year1", "Month1", "Day1", "Hour1", "Minute1",
    # Second date block (local standard time) keeps the plain names
    "Year", "Month", "Day", "Hour", "Minute",
    # Each measurement is followed by its quality column
    "Precipitation", "PrecipitationQuality",
    "AirTemperature", "AirTemperatureQuality",
    "WetBulbTemperature", "WetBulbTemperatureQuality",
    "DewTemperature", "DewTemperatureQuality",
    "RelativeHumidity", "RelativeHumidityQuality",
    "WindSpeed", "WindSpeedQuality",
    "WindDirection", "WindDirectionQuality",
    "WindgustSpeed", "WindgustSpeedQuality",
    "SeaPressure", "SeaPressureQuality",
    "StationPressure", "StationPressureQuality",
    "AWSFlag", "#",
]

## Remove any easily found unnecessary data
Remove any obvious duplicated data or data which always has the same value (or is just meaningless to analysis).
Briefly taking a look at the data gives an overview of what may easily be removed!

**Start by simply getting an overview of the data, then look at unique values and basic descriptive statistics**.

*Note manual filtering is used for exceptional data*

In [42]:
# Overview of the three frames after renaming (pandas shows only the first
# and last rows of the long frames)
energy_data
temperature_data
stations_data

Unnamed: 0,Region,Date,TotalDemand,RRP,PeriodType
0,VIC1,2000/04/01 00:30,5132.32333,26.01,TRADE
1,VIC1,2000/04/01 01:00,4928.59500,15.58,TRADE
2,VIC1,2000/04/01 01:30,5155.45500,17.45,TRADE
3,VIC1,2000/04/01 02:00,4991.13833,15.12,TRADE
4,VIC1,2000/04/01 02:30,4777.00167,17.69,TRADE
...,...,...,...,...,...
1658960,TAS1,2018/05/31 22:00:00,1297.12000,78.32,TRADE
1658961,TAS1,2018/05/31 22:30:00,1239.41000,78.33,TRADE
1658962,TAS1,2018/05/31 23:00:00,1194.16000,78.32,TRADE
1658963,TAS1,2018/05/31 23:30:00,1154.50000,78.32,TRADE


Unnamed: 0,HM,StationNumber,Year1,Month1,Day1,Hour1,Minute1,Year,Month,Day,Hour,Minute,Precipitation,PrecipitationQuality,AirTemperature,AirTemperatureQuality,WetBulbTemperature,WetBulbTemperatureQuality,DewTemperature,DewTemperatureQuality,RelativeHumidity,RelativeHumidityQuality,WindSpeed,WindSpeedQuality,WindDirection,WindDirectionQuality,WindgustSpeed,WindgustSpeedQuality,SeaPressure,SeaPressureQuality,StationPressure,StationPressureQuality,AWSFlag,#
0,hm,94029,2000,1,1,2,0,2000,1,1,1,0,0.8,N,10.4,N,7.3,N,3.3,N,61,N,11.2,N,220,N,13.0,N,1019.3,N,1013.0,N,,#
1,hm,94029,2000,1,1,2,30,2000,1,1,1,30,0.8,N,10.3,N,7.1,N,2.9,N,60,N,9.4,N,240,N,11.2,N,1019.1,N,1012.8,N,,#
2,hm,94029,2000,1,1,3,0,2000,1,1,2,0,0.8,N,10.3,N,6.9,N,2.3,N,58,N,13.0,N,240,N,18.4,N,1018.9,N,1012.6,N,,#
3,hm,94029,2000,1,1,3,30,2000,1,1,2,30,0.8,N,10.3,N,6.9,N,2.3,N,58,N,13.0,N,240,N,18.4,N,1018.7,N,1012.4,N,,#
4,hm,94029,2000,1,1,4,0,2000,1,1,3,0,0.8,N,10.1,N,6.9,N,2.6,N,60,N,11.2,N,260,N,13.0,N,1018.5,N,1012.2,N,,#
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1826239,hm,23090,2020,1,20,7,30,2020,1,20,6,30,9.4,N,15.1,N,14.6,N,14.2,N,94,N,7.6,N,240,N,14.8,N,1010.3,N,1004.4,N,1,#
1826240,hm,23090,2020,1,20,8,0,2020,1,20,7,0,9.4,N,15.5,N,14.7,N,14.1,N,91,N,9.4,N,230,N,16.6,N,1010.4,N,1004.5,N,1,#
1826241,hm,23090,2020,1,20,8,30,2020,1,20,7,30,9.4,N,15.7,N,14.8,N,14.1,N,90,N,11.2,N,210,N,29.5,N,1010.7,N,1004.8,N,1,#
1826242,hm,23090,2020,1,20,9,0,2020,1,20,8,0,9.6,N,15.5,N,14.8,N,14.3,N,93,N,9.4,N,220,N,16.6,N,1011,N,1005.1,N,1,#


Unnamed: 0,StationNumber,RainfulDistrict,Station,StationOpenDate,StationCloseDate,Latitude,Longitude,LocationMethod,State,StationHeight,BarometerHeight,WMO,FirstYear,LastYear,CompletionPercent,YQualityPercent,NQualityPercent,WQualityPercent,SQualityPercent,IQualityPercent
0,86338,86,MELBOURNE (OLYMPIC PARK),05/2013,,-37.8255,144.9816,SURVEY,VIC,7.5,7.5,95936,2013,2020,103,0,100,0,0,0
1,86071,86,MELBOURNE REGIONAL OFFICE,01/1908,01/2015,-37.8075,144.97,GPS,VIC,31.2,32.2,94868,2000,2015,99,0,100,0,0,0
2,23090,23A,ADELAIDE (KENT TOWN),01/1977,,-34.9211,138.6216,GPS,SA,48.0,51.0,94675,2000,2020,101,0,100,0,0,0
3,66062,66,SYDNEY (OBSERVATORY HILL),01/1858,,-33.8607,151.205,GPS,NSW,39.0,40.2,94768,2000,2020,99,0,100,0,0,0
4,94029,94,HOBART (ELLERSLIE ROAD),01/1882,,-42.8897,147.3278,GPS,TAS,50.5,51.4,94970,2000,2020,110,0,100,0,0,0
5,40913,40,BRISBANE,12/1999,,-27.4808,153.0389,GPS,QLD,8.1,8.3,94576,2000,2020,99,0,100,0,0,0


In [43]:
def remove_non_uniques(dataframe: pd.DataFrame, filter = ()):
    """Drop, in place, every column that holds at most two distinct values.

    Missing values count as a distinct value, exactly as ``Series.unique`` reports them.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame to prune. It is modified in place.
    filter : str or iterable of str, optional
        Column name(s) that must be kept even if they hold at most two distinct values.
        A single name may be given as a bare string.

    Returns
    -------
    list
        Names of the columns that were removed, in column order.
    """
    # Normalise to a tuple of whole names: testing `name in "SomeString"` would
    # match any column whose name is a substring of the protected name.
    if filter is None:
        protected = ()
    elif isinstance(filter, str):
        protected = (filter,)
    else:
        protected = tuple(filter)

    remove = [
        name
        for name, series in dataframe.items()
        if len(series.unique()) <= 2 and name not in protected
    ]
    dataframe.drop(remove, axis=1, inplace=True)
    return remove

print("Removed:")
remove_non_uniques(energy_data)
remove_non_uniques(temperature_data)
# Pass the protected column as a list so membership is tested against whole
# column names.
remove_non_uniques(stations_data, ["LocationMethod"])

# Manually remove extra columns
stations_data.drop(columns="StationOpenDate", inplace=True)

Removed:


['PeriodType']

['HM',
 'PrecipitationQuality',
 'AirTemperatureQuality',
 'WetBulbTemperatureQuality',
 'DewTemperatureQuality',
 'RelativeHumidityQuality',
 'WindSpeedQuality',
 'WindDirectionQuality',
 'WindgustSpeedQuality',
 'SeaPressureQuality',
 'StationPressureQuality',
 '#']

['StationCloseDate',
 'FirstYear',
 'LastYear',
 'YQualityPercent',
 'NQualityPercent',
 'WQualityPercent',
 'SQualityPercent',
 'IQualityPercent']

In [44]:
# Confirm which columns remain in each frame after the removals
energy_data.columns
temperature_data.columns
stations_data.columns

Index(['Region', 'Date', 'TotalDemand', 'RRP'], dtype='object')

Index(['StationNumber', 'Year1', 'Month1', 'Day1', 'Hour1', 'Minute1', 'Year',
       'Month', 'Day', 'Hour', 'Minute', 'Precipitation', 'AirTemperature',
       'WetBulbTemperature', 'DewTemperature', 'RelativeHumidity', 'WindSpeed',
       'WindDirection', 'WindgustSpeed', 'SeaPressure', 'StationPressure',
       'AWSFlag'],
      dtype='object')

Index(['StationNumber', 'RainfulDistrict', 'Station', 'Latitude', 'Longitude',
       'LocationMethod', 'State', 'StationHeight', 'BarometerHeight', 'WMO',
       'CompletionPercent'],
      dtype='object')

## Deal with multiple dates
Find out what the difference between the dates is.
Then ensure all dates are combined into one column.

Provided notes dictate that the first set of dates is in local time and the second is in local standard time.
Whilst generally identical, local time incorporates daylight saving (shifting the clock back/forward by an hour).
For simplicity, local standard time will be used.
*Note that previously the local standard columns were named without a 1, and the others with a 1, to avoid renaming here*!

In [45]:
# Reformat dates into Pandas' datetime64 objects:
# the local standard time parts are assembled into one "Date" column, and the
# energy timestamps are parsed in place (replacing the old string format)
temperature_data["Date"] = pd.to_datetime(temperature_data[["Year", "Month", "Day", "Hour", "Minute"]])
energy_data["Date"] = pd.to_datetime(energy_data["Date"])

# Remove the extra (local time) date parts together with the standard time
# parts that "Date" has just replaced
temperature_data.drop(
    columns=[
        "Year1", "Month1", "Day1", "Hour1", "Minute1",
        "Year", "Month", "Day", "Hour", "Minute",
    ],
    inplace=True,
)

## Check for duplicated rows
Note that this process works on *all columns* (as a specific choice hasn't been provided)

To check which rows are removed, Pandas' `duplicated` function can be used beforehand.
This has been excluded from analysis for brevity (especially since there are **no found duplicates**)!

In [46]:
# Rows are compared on all columns (subset=None) and the first occurrence of
# each duplicate is the one kept; both are the pandas defaults, spelled out here.
energy_data.drop_duplicates(subset=None, keep="first", inplace=True)
temperature_data.drop_duplicates(subset=None, keep="first", inplace=True)
stations_data.drop_duplicates(subset=None, keep="first", inplace=True)

## Converting datatypes
To make sure we can create visualisations/plots and mathematical/statistical models we need to use numerical datatypes (i.e. not objects)!
Note for AWSFlag, empty values are assumed to be manually recorded as this is the worst outcome.

In [47]:
def to_object_columns(lambda_function, dataframe=None):
    """Apply ``lambda_function`` to every object-dtype column and write the results back in place.

    Parameters
    ----------
    lambda_function : callable
        Called once per object-dtype column with that column (a Series); its
        return value replaces the column.
    dataframe : pd.DataFrame, optional
        Frame to transform in place. Defaults to the notebook-level
        ``temperature_data`` frame.

    Returns
    -------
    None
    """
    if dataframe is None:
        dataframe = temperature_data

    string_columns = dataframe.select_dtypes("object").columns
    # Nothing to transform (e.g. the cell was re-run after every column was converted)
    if len(string_columns) == 0:
        return

    dataframe[string_columns] = dataframe[string_columns].apply(lambda_function)

In [48]:
# Strip leading/trailing whitespace from every object (string) column of temperature_data
to_object_columns(lambda column: column.str.strip())

In [49]:
# Blank and missing AWSFlag entries are both mapped to 0 *before* the
# categorical conversion, and the result is assigned back to the frame.
# Calling fillna(..., inplace=True) on the selected column is chained
# assignment: it is deprecated, does nothing under pandas Copy-on-Write, and
# fails on a categorical that has no 0 category.
# NOTE(review): non-blank flags keep whatever type they were loaded as, so the
# categories may mix str and int - confirm what downstream code expects.
temperature_data["AWSFlag"] = temperature_data["AWSFlag"].replace("", 0).fillna(0).astype("category")
# "###" is replaced with NaN so the column can be converted to numbers later.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
temperature_data["RelativeHumidity"] = temperature_data["RelativeHumidity"].replace("###", np.nan)

In [50]:
# Parse every remaining object column of temperature_data as numbers
# (AWSFlag is categorical by now, so it is not selected)
to_object_columns(lambda column: pd.to_numeric(column))

In [51]:
# Verify that no object-dtype columns remain after the conversions
temperature_data.dtypes

StationNumber                  int64
Precipitation                float64
AirTemperature               float64
WetBulbTemperature           float64
DewTemperature               float64
RelativeHumidity             float64
WindSpeed                    float64
WindDirection                float64
WindgustSpeed                float64
SeaPressure                  float64
StationPressure              float64
AWSFlag                     category
Date                  datetime64[ns]
dtype: object

## Finding missing data
Note that the energy data has no null values, as shown by the empty dictionary below. For the temperature data, the dictionary gives the count of empty cells for each column that has any.

In [52]:
# Note that this function ignores columns with no missing values
def get_null_counts(dataframe: pd.DataFrame):
    """Count the missing values in each column that has at least one.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    dict
        Maps column name to its number of null cells (as a plain ``int``), in
        column order. Columns without nulls are omitted, so a frame with no
        missing data yields ``{}``.
    """
    # One pass over the frame instead of building a filtered copy twice per column
    null_counts = dataframe.isnull().sum()
    return {column: int(count) for column, count in null_counts.items() if count > 0}

In [53]:
# Null counts per column for each frame (an empty dict means no missing values)
get_null_counts(energy_data)
get_null_counts(temperature_data)

{}

{'Precipitation': 419882,
 'AirTemperature': 811651,
 'WetBulbTemperature': 20680,
 'DewTemperature': 685409,
 'RelativeHumidity': 685412,
 'WindSpeed': 973326,
 'WindDirection': 789494,
 'WindgustSpeed': 736311,
 'SeaPressure': 251529,
 'StationPressure': 20335}

## Saving the cleaned data

In [57]:
# Persist the three cleaned frames together as one pickled list, in the order
# [energy, temperature, stations]
pd.to_pickle(
    [energy_data, temperature_data, stations_data],
    "../Data/CleanedData.pickle",
)