# Feature Construction

In [150]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import  RandomForestRegressor

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score

In [151]:
# Location of the raw Great Britain electricity consumption export (2009-2024).
raw_csv_path = 'C:/Users/COMTECH COMPUTER/Desktop/UK energy conumptipon data/great_britian_electricity_consumption_2009_to_2024.csv'

# Read the whole file into a DataFrame; the timestamp column is converted in a later cell.
final_data = pd.read_csv(raw_csv_path)

In [152]:
# Parse 'settlement_date' into datetimes and promote it to the row index so the
# calendar accessors (.year, .hour, ...) can be read straight off the index.
final_data = (
    final_data
    .assign(settlement_date=lambda frame: pd.to_datetime(frame['settlement_date']))
    .set_index('settlement_date')
)

In [153]:
# Preview the first three rows to check that 'settlement_date' is now the index.
final_data.head(3)

Unnamed: 0_level_0,nd,embedded_wind_generation,embedded_solar_generation
settlement_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01 00:00:00,37910,54,0
2009-01-01 00:30:00,38047,53,0
2009-01-01 01:00:00,37380,53,0


In [154]:
# Calendar and combined-generation features derived from the datetime index.
timestamps = final_data.index

# Total embedded generation per row: wind plus solar.
final_data['wind + solar generation'] = final_data['embedded_wind_generation'].add(
    final_data['embedded_solar_generation']
)

final_data['year'] = timestamps.year
# Week-of-year and day-of-year are currently disabled:
# final_data['week']  = final_data.index.isocalendar().week
# final_data['day_of_year'] = final_data.index.dayofyear
final_data['month'] = timestamps.month
final_data['hour_of_day'] = timestamps.hour
final_data['day_of_week'] = timestamps.dayofweek  # pandas convention: Monday=0 ... Sunday=6

# Saturday (5) and Sunday (6) are flagged as the weekend.
final_data['weekend'] = final_data['day_of_week'] >= 5

In [155]:
# Inspect the first rows together with the newly added calendar columns.
final_data.head()

Unnamed: 0_level_0,nd,embedded_wind_generation,embedded_solar_generation,wind + solar generation,year,month,hour_of_day,day_of_week,weekend
settlement_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009-01-01 00:00:00,37910,54,0,54,2009,1,0,3,False
2009-01-01 00:30:00,38047,53,0,53,2009,1,0,3,False
2009-01-01 01:00:00,37380,53,0,53,2009,1,1,3,False
2009-01-01 01:30:00,36426,50,0,50,2009,1,1,3,False
2009-01-01 02:00:00,35687,50,0,50,2009,1,2,3,False


# Encoding cyclic values

In [156]:
# Bucket each hour into a coarse part-of-day label. Intervals are closed on the
# left (right=False): [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon,
# [18, 24) Evening.
hour_edges = [0, 6, 12, 18, 24]
part_of_day_labels = ['Night', 'Morning', 'Afternoon', 'Evening']

final_data['timeofday'] = pd.cut(
    final_data['hour_of_day'],
    bins=hour_edges,
    labels=part_of_day_labels,
    right=False,
)

In [157]:
# Sine/cosine encoding of the cyclic calendar features (hour of day, day of
# week, month of year). Each value is mapped to an angle on a circle whose full
# turn equals one period, so the last value of a cycle sits next to the first.
# Each entry: (output column prefix, source column, period length).
cyclic_features = (
    ('hour_of_day', 'hour_of_day', 24),
    ('week', 'day_of_week', 7),
    ('month', 'month', 12),
)

for prefix, source_column, period in cyclic_features:
    angle = np.radians((360 / period) * final_data[source_column])
    final_data[f"{prefix}_x"] = np.sin(angle)
    final_data[f"{prefix}_y"] = np.cos(angle)

In [158]:
def get_season(month):
    """Return the season name for a calendar month number.

    Months 12, 1, 2 map to 'Winter'; 3, 4, 5 to 'Spring'; 6, 7, 8 to 'Summer'.
    Every other value (including anything outside 1-12) falls through to
    'Autumn'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season_name, months in season_months:
        if month in months:
            return season_name
    return 'Autumn'

# Label every row with its season by passing the month number through get_season.
month_numbers = final_data['month']
final_data['season'] = month_numbers.map(get_season)

In [159]:
# Row-to-row change features.
#
# 'nd' is the series this notebook lags and rolls as the forecasting signal, so
# its first difference must not contain the current row: nd[t] - nd[t-1] plus
# the lag-1 value of 'nd' reconstructs nd[t] exactly (target leakage).
# Shifting by one row gives demand_diff[t] = nd[t-1] - nd[t-2], matching the
# "only data up to t-1" rule already applied to the rolling statistics.
final_data['demand_diff'] = final_data['nd'].diff().shift(1)

# Wind and solar changes stay on the current row.
# NOTE(review): this assumes current-row generation is known at prediction
# time (the undifferenced generation columns are kept as features too) - confirm.
final_data['wind_diff'] = final_data['embedded_wind_generation'].diff()
final_data['solar_diff'] = final_data['embedded_solar_generation'].diff()

In [160]:
# Trailing mean and standard deviation of 'nd' over 24-row and 48-row windows.
# Window sizes count rows, not hours. The .shift(1) moves each statistic down
# one row, so the value stored at row t summarises rows up to t-1 only and
# never includes nd[t] itself.
for window_rows in (24, 48):
    nd_window = final_data['nd'].rolling(window=window_rows)
    final_data[f'rolling_mean_{window_rows}'] = nd_window.mean().shift(1)
    final_data[f'rolling_std_{window_rows}'] = nd_window.std().shift(1)

# Creating Lag Values

In [161]:

# Lagged copies of 'nd': the six immediately preceding rows plus the value 48
# rows back. Rows without enough history are left as NaN by shift().
for lag_rows in (1, 2, 3, 4, 5, 6, 48):
    final_data[f'load_lag_{lag_rows}'] = final_data["nd"].shift(lag_rows)

In [162]:
# Show the first 24 rows; the earliest rows carry NaN in the lag and rolling
# columns because no prior history exists for them.
final_data.head(24)

Unnamed: 0_level_0,nd,embedded_wind_generation,embedded_solar_generation,wind + solar generation,year,month,hour_of_day,day_of_week,weekend,timeofday,...,rolling_std_24,rolling_mean_48,rolling_std_48,load_lag_1,load_lag_2,load_lag_3,load_lag_4,load_lag_5,load_lag_6,load_lag_48
settlement_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01-01 00:00:00,37910,54,0,54,2009,1,0,3,False,Night,...,,,,,,,,,,
2009-01-01 00:30:00,38047,53,0,53,2009,1,0,3,False,Night,...,,,,37910.0,,,,,,
2009-01-01 01:00:00,37380,53,0,53,2009,1,1,3,False,Night,...,,,,38047.0,37910.0,,,,,
2009-01-01 01:30:00,36426,50,0,50,2009,1,1,3,False,Night,...,,,,37380.0,38047.0,37910.0,,,,
2009-01-01 02:00:00,35687,50,0,50,2009,1,2,3,False,Night,...,,,,36426.0,37380.0,38047.0,37910.0,,,
2009-01-01 02:30:00,35408,43,0,43,2009,1,2,3,False,Night,...,,,,35687.0,36426.0,37380.0,38047.0,37910.0,,
2009-01-01 03:00:00,34322,43,0,43,2009,1,3,3,False,Night,...,,,,35408.0,35687.0,36426.0,37380.0,38047.0,37910.0,
2009-01-01 03:30:00,33076,56,0,56,2009,1,3,3,False,Night,...,,,,34322.0,35408.0,35687.0,36426.0,37380.0,38047.0,
2009-01-01 04:00:00,31970,56,0,56,2009,1,4,3,False,Night,...,,,,33076.0,34322.0,35408.0,35687.0,36426.0,37380.0,
2009-01-01 04:30:00,31270,56,0,56,2009,1,4,3,False,Night,...,,,,31970.0,33076.0,34322.0,35408.0,35687.0,36426.0,


In [163]:
# List every column now present in the engineered frame.
final_data.columns

Index(['nd', 'embedded_wind_generation', 'embedded_solar_generation',
       'wind + solar generation', 'year', 'month', 'hour_of_day',
       'day_of_week', 'weekend', 'timeofday', 'hour_of_day_x', 'hour_of_day_y',
       'week_x', 'week_y', 'month_x', 'month_y', 'season', 'demand_diff',
       'wind_diff', 'solar_diff', 'rolling_mean_24', 'rolling_std_24',
       'rolling_mean_48', 'rolling_std_48', 'load_lag_1', 'load_lag_2',
       'load_lag_3', 'load_lag_4', 'load_lag_5', 'load_lag_6', 'load_lag_48'],
      dtype='object')

In [164]:
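# Disabled: the raw 'hour_of_day' and 'day_of_week' columns are currently kept
# alongside their sine/cosine encodings. Uncomment the line below to drop them.
# final_data.drop(columns=['hour_of_day', 'day_of_week'], inplace=True)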

In [165]:
# Destination for the engineered feature table.
engineered_csv_path = 'C:/Users/COMTECH COMPUTER/Desktop/UK energy conumptipon data/FeatureEngineeringOnGreatBritianElectricityConsumption.csv'

# Write the frame (including its datetime index) to CSV.
final_data.to_csv(engineered_csv_path)