In [1]:
# Import Necessary Libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [4]:
# Load in the dataset
df = pd.read_csv("world_energy_consumption.csv")

# Display basic information of the dataset
print(df.shape) # number of rows and columns
print(df.columns.tolist()) # View Columns
print(df.head()) # preview first few rows

(22012, 129)
['country', 'year', 'iso_code', 'population', 'gdp', 'biofuel_cons_change_pct', 'biofuel_cons_change_twh', 'biofuel_cons_per_capita', 'biofuel_consumption', 'biofuel_elec_per_capita', 'biofuel_electricity', 'biofuel_share_elec', 'biofuel_share_energy', 'carbon_intensity_elec', 'coal_cons_change_pct', 'coal_cons_change_twh', 'coal_cons_per_capita', 'coal_consumption', 'coal_elec_per_capita', 'coal_electricity', 'coal_prod_change_pct', 'coal_prod_change_twh', 'coal_prod_per_capita', 'coal_production', 'coal_share_elec', 'coal_share_energy', 'electricity_demand', 'electricity_generation', 'electricity_share_energy', 'energy_cons_change_pct', 'energy_cons_change_twh', 'energy_per_capita', 'energy_per_gdp', 'fossil_cons_change_pct', 'fossil_cons_change_twh', 'fossil_elec_per_capita', 'fossil_electricity', 'fossil_energy_per_capita', 'fossil_fuel_consumption', 'fossil_share_elec', 'fossil_share_energy', 'gas_cons_change_pct', 'gas_cons_change_twh', 'gas_consumption', 'gas_elec_p

In [6]:
# Select only rows with countries in Asia
# List of countries in Asia
asia_countries = [
    "Kazakhstan", "Kyrgyzstan", "Tajikistan", "Turkmenistan", "Uzbekistan",
    "China", "Hong Kong", "Japan", "Mongolia", "North Korea", "South Korea",
    "Brunei", "Cambodia", "Indonesia", "Laos", "Malaysia", "Myanmar", "Philippines",
    "Singapore", "Thailand", "East Timor", "Vietnam", "Afghanistan", "Bangladesh",
    "Bhutan", "India", "Iran", "Maldives", "Nepal", "Pakistan", "Sri Lanka",
    "Armenia", "Azerbaijan", "Bahrain", "Cyprus", "Georgia", "Iraq", "Israel",
    "Jordan", "Kuwait", "Lebanon", "Oman", "Qatar", "Saudi Arabia", "Palestine",
    "Syria", "Turkey", "United Arab Emirates", "Yemen"
]
# Assign countries in the list into asia_df
asia_df = df[df["country"].isin(asia_countries)]
print(asia_df["country"].unique())

['Afghanistan' 'Armenia' 'Azerbaijan' 'Bahrain' 'Bangladesh' 'Bhutan'
 'Brunei' 'Cambodia' 'China' 'Cyprus' 'East Timor' 'Georgia' 'Hong Kong'
 'India' 'Indonesia' 'Iran' 'Iraq' 'Israel' 'Japan' 'Jordan' 'Kazakhstan'
 'Kuwait' 'Kyrgyzstan' 'Laos' 'Lebanon' 'Malaysia' 'Maldives' 'Mongolia'
 'Myanmar' 'Nepal' 'North Korea' 'Oman' 'Pakistan' 'Palestine'
 'Philippines' 'Qatar' 'Saudi Arabia' 'Singapore' 'South Korea'
 'Sri Lanka' 'Syria' 'Tajikistan' 'Thailand' 'Turkey' 'Turkmenistan'
 'United Arab Emirates' 'Uzbekistan' 'Vietnam' 'Yemen']


In [32]:
# Select only rows starting from year 2000
asia_df = asia_df[asia_df["year"] >= 2000]
print(asia_df.shape)

(1095, 11)


In [10]:
# Select only relevant columns (relevant to electricity)
relevant_columns = [
    "country",
    "year",
    "population",
    "gdp",
    "electricity_generation",
    "electricity_demand",
    "per_capita_electricity",
    "net_elec_imports",
    "renewables_electricity",
    "renewables_share_elec",
    "electricity_share_energy",
    "carbon_intensity_elec"
]
asia_df = asia_df[relevant_columns]

In [12]:
# Check for duplicated rows
asia_df.duplicated().sum()

0

In [14]:
# Check for missing values
asia_df.isnull().sum()

country                       0
year                          0
population                    0
gdp                         255
electricity_generation        5
electricity_demand           15
per_capita_electricity        5
net_elec_imports             15
renewables_electricity        5
renewables_share_elec         6
electricity_share_energy    466
carbon_intensity_elec        16
dtype: int64

In [None]:
# Handle missing values
# drop rows with missing electricity demand (as it is dependant variable)
asia_df = asia_df.dropna(subset=["electricity_demand"])

# drop electricity_share_energy (not relevant & too many missing values)
asia_df.drop(columns=["electricity_share_energy"], inplace=True)

# fill missing values with median
fill_cols = [
    "gdp", "electricity_generation", "per_capita_electricity",
    "net_elec_imports", "renewables_electricity",
    "renewables_share_elec", "carbon_intensity_elec"
]

for col in fill_cols:
    asia_df[col] = asia_df[col].fillna(asia_df[col].median())

In [20]:
# Check column data types
asia_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1095 entries, 123 to 21673
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   country                 1095 non-null   object 
 1   year                    1095 non-null   int64  
 2   population              1095 non-null   float64
 3   gdp                     1095 non-null   float64
 4   electricity_generation  1095 non-null   float64
 5   electricity_demand      1095 non-null   float64
 6   per_capita_electricity  1095 non-null   float64
 7   net_elec_imports        1095 non-null   float64
 8   renewables_electricity  1095 non-null   float64
 9   renewables_share_elec   1095 non-null   float64
 10  carbon_intensity_elec   1095 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 102.7+ KB


In [22]:
# Perform one hot encoding on the 'country' column
from sklearn.preprocessing import OneHotEncoder

# Initialize encoder (drop='first' avoids dummy variable trap)
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit and transform the 'country' column
country_encoded = encoder.fit_transform(asia_df[['country']])

# Get encoded column names
country_encoded_cols = encoder.get_feature_names_out(['country'])

# Convert encoded data into a DataFrame
country_encoded_df = pd.DataFrame(country_encoded, columns=country_encoded_cols)

# Drop original 'country' column and combine encoded columns
X = pd.concat([
    asia_df.drop('country', axis=1).reset_index(drop=True),
    country_encoded_df.reset_index(drop=True)
], axis=1)

print("One-hot encoding successful!")
print(X.head())

One-hot encoding successful!
   year  population           gdp  electricity_generation  electricity_demand  \
0  2000  19542986.0  1.128379e+10                    0.47                0.57   
1  2001  19688634.0  1.102127e+10                    0.59                0.69   
2  2002  21000258.0  1.880487e+10                    0.69                0.79   
3  2003  22645136.0  2.107434e+10                    0.94                1.04   
4  2004  23553554.0  2.233257e+10                    0.89                0.99   

   per_capita_electricity  net_elec_imports  renewables_electricity  \
0                  24.050               0.1                    0.31   
1                  29.967               0.1                    0.50   
2                  32.857               0.1                    0.56   
3                  41.510               0.1                    0.63   
4                  37.786               0.1                    0.56   

   renewables_share_elec  carbon_intensity_elec  ...  cou

In [24]:
y = X["electricity_demand"]
X = X.drop("electricity_demand", axis=1) 

# Data Splitting
# chronological split data by year
cutoff_year = 2018
X_train = X[X["year"] <= cutoff_year]
X_test = X[X["year"] > cutoff_year]
y_train = y[X["year"] <= cutoff_year]
y_test = y[X["year"] > cutoff_year]

print("Training years:", X_train["year"].min(), "to", X_train["year"].max())
print("Testing years:", X_test["year"].min(), "to", X_test["year"].max())
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Training years: 2000 to 2018
Testing years: 2019 to 2022
Train shape: (928, 57)
Test shape: (167, 57)


In [26]:
# Feature Scaling
# Standard Scaler
scaler = StandardScaler()

cols_to_scale = ["population", "gdp", "electricity_generation", "per_capita_electricity",
    "net_elec_imports", "renewables_electricity", "renewables_share_elec",
    "carbon_intensity_elec"]

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Fit scaler on training data and apply it to both train and test
X_train_scaled[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test_scaled[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

# Add back the target column
X_train_scaled["electricity_demand"] = y_train.to_numpy()
X_test_scaled["electricity_demand"]  = y_test.to_numpy()

# Save as CSV files
X_train_scaled.to_csv("training_dataset.csv", index=False)
X_test_scaled.to_csv("testing_dataset.csv", index=False)

print("Datasets saved successfully!")

Datasets saved successfully!


In [34]:
# Import libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

# Load datasets
train = pd.read_csv("training_dataset.csv")
test = pd.read_csv("testing_dataset.csv")

# Separate features (X) and target (y)
y_train = train["electricity_demand"]
X_train = train.drop(columns=["electricity_demand"])
y_test = test["electricity_demand"]
X_test = test.drop(columns=["electricity_demand"])

# Drop feature highly correlated with the target
X_train = X_train.drop(columns=["electricity_generation"], errors="ignore")
X_test = X_test.drop(columns=["electricity_generation"], errors="ignore")

# Train the model
mlr = LinearRegression()
mlr.fit(X_train, y_train)

# Make predictions
pred_mlr = mlr.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, pred_mlr)
rmse = root_mean_squared_error(y_test, pred_mlr)
r2 = r2_score(y_test, pred_mlr)

print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R2: {r2:.4f}")


MAE: 367.3444
RMSE: 1093.5058
R2: 0.2426


In [38]:
# Save metrics
metrics = pd.DataFrame({
    "Model": ["Multiple Linear Regression"],
    "MAE": [mae],
    "RMSE": [rmse],
    "R2": [r2]
})
metrics.to_csv("mlr_results.csv", index=False)
print("Results saved as mlr_results.csv")

Results saved as mlr_results.csv


In [36]:
# Extract the actual country from one-hot columns
country_cols = [col for col in test.columns if col.startswith("country_")]
country_series = test[country_cols].idxmax(axis=1).str.replace("country_", "")

# Save predictions with readable country names
pred_df = pd.DataFrame({
    "country": country_series,
    "year": test["year"],
    "actual": y_test,
    "pred_mlr": pred_mlr
})

pred_df.to_csv("mlr_prediction_country.csv", index=False)

In [40]:
# Feedforward Neural Network (MLP)
# For Electricity Demand Prediction (Regression)

# Load libraries
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

train = pd.read_csv("training_dataset.csv")
test = pd.read_csv("testing_dataset.csv")

y_train = train["electricity_demand"]
X_train = train.drop(columns=["electricity_demand"])
y_test = test["electricity_demand"]
X_test = test.drop(columns=["electricity_demand"])

print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (928, 57) Test: (167, 57)


In [42]:
# Train MLP Model
mlp = MLPRegressor(
    hidden_layer_sizes=(64,32),
    activation="relu",
    solver="adam",
    learning_rate_init=0.001,
    alpha=0.0001,
    max_iter=1000,
    early_stopping=True,
    random_state=42
)
mlp.fit(X_train, y_train)
pred = mlp.predict(X_test)

mae = mean_absolute_error(y_test, pred)
rmse = root_mean_squared_error(y_test, pred)
r2 = r2_score(y_test, pred)

print(f"MAE: {mae:.4f} ")
print(f"RMSE: {rmse:.4f}")
print(f"R2: {r2:.4f}")

MAE: 386.5028 
RMSE: 1264.5021
R2: -0.0128


In [44]:
# Save metrics
metrics = pd.DataFrame({
    "Model": ["MLPRegressor (FNN)"],
    "MAE": [mae],
    "RMSE": [rmse],
    "R2": [r2]
})
metrics.to_csv("fnn_mlp_results.csv", index=False)
print("Results saved as fnn_mlp_results.csv")

Results saved as fnn_mlp_results.csv


In [48]:
country_cols = [col for col in test.columns if col.startswith("country_")]
country_series = test[country_cols].idxmax(axis=1).str.replace("country_", "")

pred_df = pd.DataFrame({
    "country": country_series,
    "year": test["year"],
    "actual": y_test,
    "pred_mlp": pred  # or pred_mlp, depending on your variable name
})
pred_df.to_csv("fnn_mlp_predictions.csv", index=False)

In [54]:
# Ensemble
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

# Load prediction files
mlr = pd.read_csv("mlr_prediction_country.csv")
mlp = pd.read_csv("fnn_mlp_predictions.csv")

# Merge on country and year (they match from your earlier saving step)
combined = mlr.merge(mlp, on=["country", "year"], suffixes=("_mlr", "_mlp"))

# Simple average of predictions
combined["pred_avg"] = (combined["pred_mlr"] + combined["pred_mlp"]) / 2

# Evaluate performance
mae = mean_absolute_error(combined["actual_mlr"], combined["pred_avg"])
rmse = root_mean_squared_error(combined["actual_mlr"], combined["pred_avg"])
r2 = r2_score(combined["actual_mlr"], combined["pred_avg"])

print(f"Combined MAE: {mae:.3f}")
print(f"Combined RMSE: {rmse:.3f}")
print(f"Combined R²: {r2:.3f}")

Combined MAE: 345.301
Combined RMSE: 1143.566
Combined R²: 0.144
