### MONTHLY DATA AT 6PM

This notebook predicts the monthly mean electricity use (kWh) at 6pm for UK households, comparing three models:

- Linear Regression
- Random Forest
- XGBoost

In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
#pip install plotly

In [None]:
# Plots
# ==============================================================================
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as poff
# Plot configuration: these assignments change library-wide defaults, so they
# affect every figure created after this cell runs.
pio.renderers.default = 'notebook'  # default plotly renderer
pio.templates.default = "seaborn"  # default plotly figure template
poff.init_notebook_mode(connected=True)  # initialise plotly's offline notebook mode
plt.style.use('seaborn-v0_8-darkgrid')  # matplotlib style sheet

In [None]:
# Read the merged monthly 6pm dataset from the working directory and
# report its (rows, columns) dimensions
dataset_file = "merged_monthly_6pm_geo.csv"
data = pd.read_csv(dataset_file)
print(data.shape)

(304367, 16)


In [None]:
# Columns removed before modelling
columns_to_drop = [
    "time",
    "id:1",
    "acorn_description",
    "acorn_type",  # removed: too many categories to turn into dummies
]
data = data.drop(labels=columns_to_drop, axis="columns")
print(data.shape)

(304367, 12)


In [None]:
# Month abbreviations in calendar order; the 1-based position is the month number
month_abbreviations = [
    'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
    'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
]
months_abbrev_to_numeric = {
    abbrev: number
    for number, abbrev in enumerate(month_abbreviations, start=1)
}

# Add a numeric companion to the text 'month' column
# (values absent from the mapping come out as NaN)
data['month_n'] = data['month'].map(months_abbrev_to_numeric)

In [None]:
#data.info()

In [None]:
# Preview the first five rows after the column drop and the month_n addition
data.head()

Unnamed: 0,id,month,year,mean_kwh,training_or_test,electricity_profile_class,fuel_types,acorn_category,acorn_group,uk_administrative_geography,grid_supply_point,has_electricity_meter,month_n
0,1,APR,9,0.209333,training,2,Dual,1.0,C,UKG,_B,0,4
1,1,APR,10,0.266333,training,2,Dual,1.0,C,UKG,_B,0,4
2,1,AUG,8,0.337667,training,2,Dual,1.0,C,UKG,_B,0,8
3,1,AUG,9,0.213871,training,2,Dual,1.0,C,UKG,_B,0,8
4,1,DEC,8,0.513462,training,2,Dual,1.0,C,UKG,_B,0,12


In [None]:
# Column the models are trained to predict
target = "mean_kwh"

# Categorical columns that are one-hot encoded before modelling
cat_variables = [
    "electricity_profile_class",
    "fuel_types",
    "acorn_category",
    "acorn_group",
    "uk_administrative_geography",
    "grid_supply_point",
    "has_electricity_meter",
]

In [None]:
# Treat empty strings and "--" placeholders as missing values (NaN).
# The frame is modified in place.
missing_markers = dict.fromkeys(["", "--"], np.nan)
data.replace(missing_markers, inplace=True)

In [None]:
#data.info()

In [None]:
# One-hot encode every categorical column; all levels are kept
# (no reference level is dropped)
data = pd.get_dummies(data=data, columns=cat_variables, drop_first=False)
#data.info()

In [None]:
# Disabled step kept for reference: removing the NA/unknown ACORN category dummy.
# NOTE(review): as written, drop() without columns=/axis=1 targets row labels,
# so this line would need columns=[...] before being re-enabled.
#data = data.drop("acorn_category_0.0") #NA/unknown column
# Preview the first ten rows of the one-hot encoded frame
data.head(10)

Unnamed: 0,id,month,year,mean_kwh,training_or_test,month_n,electricity_profile_class_1,electricity_profile_class_2,fuel_types_Dual,fuel_types_ElecOnly,...,grid_supply_point__B,grid_supply_point__C,grid_supply_point__E,grid_supply_point__H,grid_supply_point__J,grid_supply_point__K,grid_supply_point__N,grid_supply_point__P,has_electricity_meter_0,has_electricity_meter_1
0,1,APR,9,0.209333,training,4,False,True,True,False,...,True,False,False,False,False,False,False,False,True,False
1,1,APR,10,0.266333,training,4,False,True,True,False,...,True,False,False,False,False,False,False,False,True,False
2,1,AUG,8,0.337667,training,8,False,True,True,False,...,True,False,False,False,False,False,False,False,True,False
3,1,AUG,9,0.213871,training,8,False,True,True,False,...,True,False,False,False,False,False,False,False,True,False
4,1,DEC,8,0.513462,training,12,False,True,True,False,...,True,False,False,False,False,False,False,False,True,False
5,1,DEC,9,0.682581,training,12,False,True,True,False,...,True,False,False,False,False,False,False,False,True,False
6,1,FEB,9,0.738,training,2,False,True,True,False,...,True,False,False,False,False,False,False,False,True,False
7,1,FEB,10,0.845185,training,2,False,True,True,False,...,True,False,False,False,False,False,False,False,True,False
8,1,JAN,9,0.419355,training,1,False,True,True,False,...,True,False,False,False,False,False,False,False,True,False
9,1,JAN,10,0.583871,training,1,False,True,True,False,...,True,False,False,False,False,False,False,False,True,False


In [None]:
# Both splits are taken from the rows labelled "training" and separated by year
in_training_cohort = data["training_or_test"] == "training"

# Years 8 and 9 are used for fitting
training_data = data[in_training_cohort & data["year"].isin([8, 9])]

# Year 10 is held out for evaluation.
# NOTE(review): rows labelled "test" are not used anywhere; an earlier variant
# filtered on training_or_test == "test" for year 10 but was commented out.
# Confirm the within-cohort hold-out is intended.
test_data = data[in_training_cohort & (data["year"] == 10)]

In [None]:
# Split data into features and target variable.
# Columns kept out of the feature matrices:
#   - "id", "training_or_test" and the target column, as before
#   - "month": the raw text abbreviation ('APR', 'AUG', ...). Its numeric form
#     month_n stays in the features; the text column itself cannot be converted
#     to float by the scikit-learn estimators fitted below.
non_feature_columns = ["id", "month", target, "training_or_test"]

X_train = training_data.drop(columns=non_feature_columns)
y_train = training_data[target]
X_test = test_data.drop(columns=non_feature_columns)
y_test = test_data[target]

In [None]:
# Dimensions of the training features and target, one per line
print(X_train.shape, y_train.shape, sep="\n")

(147364, 50)
(147364,)


In [None]:
# Dimensions of the hold-out features and target, one per line
print(X_test.shape, y_test.shape, sep="\n")

(65319, 50)
(65319,)


In [None]:
#training_data.head()

In [None]:
#X_train.head()

In [None]:
#y_train.head()

In [None]:
#X_train.info()

### EDA & VISUALISATION

In [None]:
# Summarise the encoded frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304367 entries, 0 to 304366
Data columns (total 54 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   id                               304367 non-null  int64         
 1   month                            304367 non-null  object        
 2   year                             304367 non-null  int64         
 3   mean_kwh                         304367 non-null  float64       
 4   training_or_test                 304367 non-null  object        
 5   month_n                          304367 non-null  int64         
 6   electricity_profile_class_1      304367 non-null  bool          
 7   electricity_profile_class_2      304367 non-null  bool          
 8   fuel_types_Dual                  304367 non-null  bool          
 9   fuel_types_ElecOnly              304367 non-null  bool          
 10  acorn_category_0.0               304367 non-

### LINEAR REGRESSION

In [None]:
# Linear regression baseline, fitted on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# In-sample error: mean squared error on the rows the model was fitted on
y_train_pred = model.predict(X_train)
mse_train = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
print("Training MSE:", mse_train)

Training MSE: 0.07550592932711928


In [None]:
# Out-of-sample error: mean squared error on the hold-out split
y_test_pred = model.predict(X_test)
mse_test = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
print("Test MSE:", mse_test)

Test MSE: 0.08309158041357695


In [None]:

# R-squared of the linear model's predictions on the hold-out split
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)
print("R-squared:", r2)

R-squared: 0.020731321666662583


### RANDOM FOREST

In [None]:
# Random forest tuned by exhaustive grid search:
# 3 x 3 x 3 = 27 candidates, each evaluated with 5-fold cross-validation and
# ranked by negative mean absolute error.
rf = RandomForestRegressor(random_state=123)  # fixed seed so the forests are reproducible
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
# n_jobs=-1 runs the candidate/fold fits in parallel on all available cores;
# the recorded serial run of this search ended in a KeyboardInterrupt.
rf_grid_search = GridSearchCV(
    rf,
    rf_param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
rf_grid_search.fit(X_train, y_train)

# Score the best candidate on the hold-out split.
# mean_absolute_error is imported from sklearn.metrics in the imports cell.
rf_best = rf_grid_search.best_estimator_
rf_pred = rf_best.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_pred)

KeyboardInterrupt: 

In [None]:
# Report the tuned random forest's mean absolute error on the hold-out split
print("Random Forest MAE:", rf_mae)

### XGBoost

In [None]:
# 3. Gradient boosting regressor (XGBoost) tuned by exhaustive grid search:
# 3 x 3 x 3 = 27 candidates, each evaluated with 5-fold cross-validation and
# ranked by negative mean absolute error.
import xgboost as xgb

xgb_model = xgb.XGBRegressor(random_state=123)  # same fixed seed as the random forest
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
xgb_grid_search = GridSearchCV(
    xgb_model,
    xgb_param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)
xgb_grid_search.fit(X_train, y_train)

# Score the best candidate on the hold-out split.
# mean_absolute_error is imported from sklearn.metrics in the imports cell.
xgb_best = xgb_grid_search.best_estimator_
xgb_pred = xgb_best.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_pred)

In [None]:
# Report the tuned XGBoost model's mean absolute error on the hold-out split
print("XGBoost MAE:", xgb_mae)