Electrical Dataset

In [1]:
## Imports

import numpy as np
import pandas as pd


In [2]:
# Load the raw electricity production records from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='global_electricity_production_data.csv')

In [3]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,country_name,date,parameter,product,value,unit
0,Australia,12/1/2023,Net Electricity Production,Electricity,22646.1901,GWh
1,Australia,12/1/2023,Net Electricity Production,Total Combustible Fuels,13397.9356,GWh
2,Australia,12/1/2023,Net Electricity Production,"Coal, Peat and Manufactured Gases",9768.5223,GWh
3,Australia,12/1/2023,Net Electricity Production,Oil and Petroleum Products,289.5415,GWh
4,Australia,12/1/2023,Net Electricity Production,Natural Gas,3091.9272,GWh


In [4]:
# Parse the month/day/year strings (e.g. '12/1/2023') with an explicit format so
# the result does not depend on pandas' date-format inference, and so a row in
# any other layout raises instead of being silently misread.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')

In [5]:
# Establish the time span covered by the dataset.
date_column = df['date']
earliest_date, latest_date = date_column.min(), date_column.max()

for label, boundary in (('Earliest Date:', earliest_date), ('Latest Date:', latest_date)):
    print(label, boundary)

Earliest Date: 2010-01-01 00:00:00
Latest Date: 2023-12-01 00:00:00


In [7]:
# Count the missing entries in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

country_name     0
date             0
parameter        0
product          0
value           14
unit             0
dtype: int64


In [11]:
# Show the rows whose 'value' entry is missing.
value_is_missing = df['value'].isna()
df.loc[value_is_missing]

Unnamed: 0,country_name,date,parameter,product,value,unit
67,Chile,2023-12-01,Remarks,Data is estimated for this month,,GWh
94,Costa Rica,2023-12-01,Remarks,Data is estimated for this month,,GWh
285,Japan,2023-12-01,Remarks,Data is estimated for this month,,GWh
804,Costa Rica,2023-11-01,Remarks,Data is estimated for this month,,GWh
1517,Costa Rica,2023-10-01,Remarks,Data is estimated for this month,,GWh
2230,Costa Rica,2023-09-01,Remarks,Data is estimated for this month,,GWh
2944,Costa Rica,2023-08-01,Remarks,Data is estimated for this month,,GWh
3659,Costa Rica,2023-07-01,Remarks,Data is estimated for this month,,GWh
4375,Costa Rica,2023-06-01,Remarks,Data is estimated for this month,,GWh
5090,Costa Rica,2023-05-01,Remarks,Data is estimated for this month,,GWh


In [19]:
# Build the cleaned frame in one chain: discard rows without a 'value',
# drop 'unit' (all values are the same), and index the result by date.
df_clean = (
    df.dropna(subset=['value'])
    .drop(columns=['unit'])
    .set_index('date')
)
print(df_clean.isnull().sum())

country_name    0
parameter       0
product         0
value           0
dtype: int64


In [21]:
# One-hot encode the categorical columns (they are non-ordinal).
categorical_columns = ['country_name', 'parameter', 'product']
df_encoded = pd.get_dummies(df_clean, columns=categorical_columns)
df_encoded.head()


Unnamed: 0_level_0,value,country_name_Argentina,country_name_Australia,country_name_Austria,country_name_Belgium,country_name_Brazil,country_name_Bulgaria,country_name_Canada,country_name_Chile,country_name_China,...,product_Natural Gas,product_Not Specified,product_Nuclear,product_Oil and Petroleum Products,product_Other Combustible Non-Renewables,product_Other Renewables,product_Solar,product_Total Combustible Fuels,"product_Total Renewables (Hydro, Geo, Solar, Wind, Other)",product_Wind
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-12-01,22646.1901,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2023-12-01,13397.9356,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2023-12-01,9768.5223,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2023-12-01,289.5415,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2023-12-01,3091.9272,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [23]:
# Checking if data is stationary: Augmented Dickey-Fuller (ADF) test
from statsmodels.tsa.stattools import adfuller

# The ADF test is meant for ONE chronologically ordered series. df_clean stacks
# every country/parameter/product combination, and the displayed rows are ordered
# newest-first, so the frame is narrowed to a single series and sorted by date
# before testing. Change the three constants below to test a different series.
ADF_COUNTRY = 'Australia'
ADF_PARAMETER = 'Net Electricity Production'
ADF_PRODUCT = 'Electricity'

series_mask = (
    (df_clean['country_name'] == ADF_COUNTRY)
    & (df_clean['parameter'] == ADF_PARAMETER)
    & (df_clean['product'] == ADF_PRODUCT)
)
# df_clean is indexed by date, so sort_index() puts the observations in time order.
adf_series = df_clean.loc[series_mask, 'value'].sort_index()
if adf_series.empty:
    raise ValueError(
        f'No rows match {ADF_COUNTRY!r} / {ADF_PARAMETER!r} / {ADF_PRODUCT!r}; '
        'cannot run the ADF test on an empty series.'
    )

# Target = ['value']
result = adfuller(adf_series)
print('ADF Statistic:', result[0])
print('p-value', result[1])
print('Critical Values:')
# result[4] maps each significance level to its critical value.
for key, value in result[4].items():
    print(f'\t{key}:{value}')