In [57]:
import numpy as np
import pandas as pd

In [34]:
# data = pd.read_csv('./Data/CSD Generation (Hourly) - 2020-01 to 2020-06/CSD Generation (Hourly) - 2020-01 to 2020-06.csv')
# renewable_data = data.loc[(data['Fuel Type'] == 'WIND') | (data['Fuel Type'] == 'SOLAR')]
# area_data = renewable_data.loc[renewable_data['Planning Area'] == 52]
# area_data["Fuel Type"].unique()
# area_data["Asset Name"].unique()

## Weather Data

In [58]:
# Read CSVs and drop any unnamed index column
def clean_df(file_path):
    df = pd.read_csv(file_path, encoding='unicode_escape')
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    return df

# Load cleaned CSVs
vauxhaul1 = clean_df('./Data/ACIS/Vauxhall-20200101-20200601.csv')
vauxhaul2 = clean_df('./Data/ACIS/Vauxhall-20200701-20201231.csv')
vauxhaul3 = clean_df('./Data/ACIS/Vauxhall-20210101-20210630.csv')
vauxhaul4 = clean_df('./Data/ACIS/Vauxhall-20210701-20211231.csv')

# Concatenate without extra index
vauxhal = pd.concat([vauxhaul1, vauxhaul2, vauxhaul3, vauxhaul4], ignore_index=True)

# Save to CSV without index
vauxhal.to_csv('./Data/ACIS/Vauxhall.csv', index=False)

## Generation Data

In [59]:
# Function to clean DataFrame by removing unnamed index columns
def clean_df(file_path):
    df = pd.read_csv(file_path, encoding='unicode_escape')
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]  # Remove unnamed columns
    return df

# Load cleaned CSVs for GenData
Gen1 = clean_df('./Data/CSD/CSD Generation (Hourly) - 2020-01 to 2020-06.csv')
Gen2 = clean_df('./Data/CSD/CSD Generation (Hourly) - 2020-07 to 2020-12.csv')
Gen3 = clean_df('./Data/CSD/CSD Generation (Hourly) - 2021-01 to 2021-06.csv')
Gen4 = clean_df('./Data/CSD/CSD Generation (Hourly) - 2021-07 to 2021-12.csv')

# Concatenate without extra index
GenData = pd.concat([Gen1, Gen2, Gen3, Gen4], ignore_index=True)

# Save to CSV without index
GenData.to_csv('./Data/CSD/VauxhallGen.csv', index=False)

In [60]:
vauxhaul_Gen = GenData.loc[GenData['Asset Name'] == 'VXH1 Vauxhall']

In [61]:
vauxhaul_Gen.head()

Unnamed: 0,Date (MST),Date (MPT),Asset Short Name,Asset Name,Asset Grouping,Volume,Maximum Capability,System Capability,Fuel Type,Sub Fuel Type,Planning Area,Region
177120,2020-02-19 01:00:00,2020-02-19 01:00:00,VXH1,VXH1 Vauxhall,VXH1,0.0,22.0,22.0,SOLAR,SOLAR,52,South
177121,2020-02-19 02:00:00,2020-02-19 02:00:00,VXH1,VXH1 Vauxhall,VXH1,0.0,22.0,22.0,SOLAR,SOLAR,52,South
177122,2020-02-19 03:00:00,2020-02-19 03:00:00,VXH1,VXH1 Vauxhall,VXH1,0.0,22.0,22.0,SOLAR,SOLAR,52,South
177123,2020-02-19 04:00:00,2020-02-19 04:00:00,VXH1,VXH1 Vauxhall,VXH1,0.0,22.0,22.0,SOLAR,SOLAR,52,South
177124,2020-02-19 05:00:00,2020-02-19 05:00:00,VXH1,VXH1 Vauxhall,VXH1,0.0,22.0,22.0,SOLAR,SOLAR,52,South


In [None]:
vauxhal.head()

Unnamed: 0,Station Name,Date (Local Standard Time),Air Temp. Inst. (°C),Air Temp. Inst. Source Flag,Air Temp. Inst. Comment,Humidity Inst. (%),Humidity Inst. Source Flag,Humidity Inst. Comment,Relative Humidity Avg. (%),Relative Humidity Avg. Source Flag,...,Wind Speed 10 m Syno. Comment,Wind Dir. 10 m Syno. (°),Wind Dir. 10 m Syno. Source Flag,Wind Dir. 10 m Syno. Comment,Wind Speed 10 m Avg. (km/h),Wind Speed 10 m Avg. Source Flag,Wind Speed 10 m Avg. Comment,Wind Dir. 10 m Avg. (°),Wind Dir. 10 m Avg. Source Flag,Wind Dir. 10 m Avg. Comment
0,Vauxhall CDA CS,01-January-2020 00:00,2.5,ACTUAL,,74.0,ACTUAL,,76.0,ACTUAL,...,,271.0,ACTUAL,,22.2,ACTUAL,,271.0,ACTUAL,
1,Vauxhall CDA CS,01-January-2020 01:00,2.5,ACTUAL,,75.0,ACTUAL,,75.0,ACTUAL,...,,272.0,ACTUAL,,22.8,ACTUAL,,272.0,ACTUAL,
2,Vauxhall CDA CS,01-January-2020 02:00,2.4,ACTUAL,,77.0,ACTUAL,,77.0,ACTUAL,...,,273.0,ACTUAL,,21.6,ACTUAL,,269.0,ACTUAL,
3,Vauxhall CDA CS,01-January-2020 03:00,2.8,ACTUAL,,78.0,ACTUAL,,78.0,ACTUAL,...,,277.0,ACTUAL,,18.7,ACTUAL,,275.0,ACTUAL,
4,Vauxhall CDA CS,01-January-2020 04:00,0.9,ACTUAL,,83.0,ACTUAL,,81.0,ACTUAL,...,,349.0,ACTUAL,,13.5,ACTUAL,,335.0,ACTUAL,


In [62]:
#make a copy
vauxhaul_Gen = vauxhaul_Gen.copy()
vauxhaul_Gen.loc[:, "timestamp"] = pd.to_datetime(vauxhaul_Gen["Date (MPT)"]).dt.date

# Drop columns 
vauxhaul_Gen.drop(columns=["Asset Short Name", "Asset Name", "Asset Grouping", "Planning Area", "Region"], inplace=True, errors="ignore")

# make a copy
vauxhal = vauxhal.copy()
vauxhal.loc[:, "timestamp"] = pd.to_datetime(vauxhal["Date (Local Standard Time)"]).dt.date

# Drop columns
vauxhal.drop(columns=["Date (Local Standard Time)", "Station Name"], inplace=True, errors="ignore")

# Save to CSV
vauxhaul_Gen.to_csv('./Data/CSD/VauxhallGen.csv', index=False)
vauxhal.to_csv('./Data/ACIS/Vauxhall.csv', index=False)

In [63]:
#merge gen and weather data
vauxhall_weather_data_frame = pd.read_csv("./Data/ACIS/Vauxhall.csv")  # CSD data
vauxhall_gen_data_frame =pd.read_csv("./Data/CSD/VauxhallGen.csv")  # Historical weather data


vauxhall_merged_df = pd.merge(vauxhall_weather_data_frame, vauxhall_gen_data_frame, on="timestamp", how="inner")

In [64]:
vauxhall_merged_df.to_csv('./Data/Merged/VauxhallMerged.csv') # export merged data

In [65]:
#sort dataframe by dateframe 

vauxhall_merged_df.sort_values(by=["timestamp"],inplace =True ,ascending = True)

In [56]:
vauxhall_merged_df.head(10)

Unnamed: 0,Air Temp. Inst. (°C),Air Temp. Inst. Source Flag,Air Temp. Inst. Comment,Humidity Inst. (%),Humidity Inst. Source Flag,Humidity Inst. Comment,Relative Humidity Avg. (%),Relative Humidity Avg. Source Flag,Relative Humidity Avg. Comment,Incoming Solar Rad. (W/m2),...,Wind Dir. 10 m Avg. Source Flag,Wind Dir. 10 m Avg. Comment,timestamp,Date (MST),Date (MPT),Volume,Maximum Capability,System Capability,Fuel Type,Sub Fuel Type
0,-19.5,ACTUAL,,83.0,ACTUAL,,83.0,ACTUAL,,0.0,...,ACTUAL,,2020-02-19,2020-02-19 01:00:00,2020-02-19 01:00:00,0.0,22.0,22.0,SOLAR,SOLAR
1,-19.5,ACTUAL,,83.0,ACTUAL,,83.0,ACTUAL,,0.0,...,ACTUAL,,2020-02-19,2020-02-19 02:00:00,2020-02-19 02:00:00,0.0,22.0,22.0,SOLAR,SOLAR
2,-19.5,ACTUAL,,83.0,ACTUAL,,83.0,ACTUAL,,0.0,...,ACTUAL,,2020-02-19,2020-02-19 03:00:00,2020-02-19 03:00:00,0.0,22.0,22.0,SOLAR,SOLAR
3,-19.5,ACTUAL,,83.0,ACTUAL,,83.0,ACTUAL,,0.0,...,ACTUAL,,2020-02-19,2020-02-19 04:00:00,2020-02-19 04:00:00,0.0,22.0,22.0,SOLAR,SOLAR
4,-19.5,ACTUAL,,83.0,ACTUAL,,83.0,ACTUAL,,0.0,...,ACTUAL,,2020-02-19,2020-02-19 05:00:00,2020-02-19 05:00:00,0.0,22.0,22.0,SOLAR,SOLAR
5,-19.5,ACTUAL,,83.0,ACTUAL,,83.0,ACTUAL,,0.0,...,ACTUAL,,2020-02-19,2020-02-19 06:00:00,2020-02-19 06:00:00,0.0,22.0,22.0,SOLAR,SOLAR
6,-19.5,ACTUAL,,83.0,ACTUAL,,83.0,ACTUAL,,0.0,...,ACTUAL,,2020-02-19,2020-02-19 07:00:00,2020-02-19 07:00:00,0.0,22.0,22.0,SOLAR,SOLAR
7,-19.5,ACTUAL,,83.0,ACTUAL,,83.0,ACTUAL,,0.0,...,ACTUAL,,2020-02-19,2020-02-19 08:00:00,2020-02-19 08:00:00,0.0,22.0,22.0,SOLAR,SOLAR
8,-19.5,ACTUAL,,83.0,ACTUAL,,83.0,ACTUAL,,0.0,...,ACTUAL,,2020-02-19,2020-02-19 09:00:00,2020-02-19 09:00:00,0.0,22.0,22.0,SOLAR,SOLAR
9,-19.5,ACTUAL,,83.0,ACTUAL,,83.0,ACTUAL,,0.0,...,ACTUAL,,2020-02-19,2020-02-19 10:00:00,2020-02-19 10:00:00,0.0,22.0,22.0,SOLAR,SOLAR


In [66]:
#make a copy of the main dataset

df_temp = vauxhall_merged_df.copy()

In [67]:
#enriching dataframe
#converting into day,month ,year 
type(df_temp.timestamp)

pandas.core.series.Series

In [68]:
df_temp["timestamp"] = pd.to_datetime(df_temp["timestamp"])

# Extract year, month, and day
df_temp["year"] = df_temp["timestamp"].dt.year
df_temp["month"] = df_temp["timestamp"].dt.month
df_temp["day"] = df_temp["timestamp"].dt.day

In [None]:
df_temp.drop(columns=["timestamp","Date (MST)","Date (MPT)"],inplace=True)

KeyError: "['timestamp', 'Date (MST)', 'Date (MPT)'] not found in axis"

In [72]:
df_temp.head(10)

Unnamed: 0,Air Temp. Inst. (°C),Air Temp. Inst. Source Flag,Air Temp. Inst. Comment,Humidity Inst. (%),Humidity Inst. Source Flag,Humidity Inst. Comment,Relative Humidity Avg. (%),Relative Humidity Avg. Source Flag,Relative Humidity Avg. Comment,Incoming Solar Rad. (W/m2),...,Wind Dir. 10 m Avg. Source Flag,Wind Dir. 10 m Avg. Comment,Volume,Maximum Capability,System Capability,Fuel Type,Sub Fuel Type,year,month,day
0,-19.5,ACTUAL,,83.0,ACTUAL,,83.0,ACTUAL,,0.0,...,ACTUAL,,0.0,22.0,22.0,SOLAR,SOLAR,2020,2,19
376,-3.8,ACTUAL,,72.0,ACTUAL,,72.0,ACTUAL,,304.2,...,ACTUAL,,0.0,22.0,22.0,SOLAR,SOLAR,2020,2,19
375,-3.8,ACTUAL,,72.0,ACTUAL,,72.0,ACTUAL,,304.2,...,ACTUAL,,0.0,22.0,22.0,SOLAR,SOLAR,2020,2,19
374,-3.8,ACTUAL,,72.0,ACTUAL,,72.0,ACTUAL,,304.2,...,ACTUAL,,0.0,22.0,22.0,SOLAR,SOLAR,2020,2,19
373,-3.8,ACTUAL,,72.0,ACTUAL,,72.0,ACTUAL,,304.2,...,ACTUAL,,0.0,22.0,22.0,SOLAR,SOLAR,2020,2,19
372,-3.8,ACTUAL,,72.0,ACTUAL,,72.0,ACTUAL,,304.2,...,ACTUAL,,0.0,22.0,22.0,SOLAR,SOLAR,2020,2,19
371,-3.8,ACTUAL,,72.0,ACTUAL,,72.0,ACTUAL,,304.2,...,ACTUAL,,0.0,22.0,22.0,SOLAR,SOLAR,2020,2,19
370,-3.8,ACTUAL,,72.0,ACTUAL,,72.0,ACTUAL,,304.2,...,ACTUAL,,0.0,22.0,22.0,SOLAR,SOLAR,2020,2,19
369,-3.8,ACTUAL,,72.0,ACTUAL,,72.0,ACTUAL,,304.2,...,ACTUAL,,0.0,22.0,22.0,SOLAR,SOLAR,2020,2,19
368,-3.8,ACTUAL,,72.0,ACTUAL,,72.0,ACTUAL,,304.2,...,ACTUAL,,0.0,22.0,22.0,SOLAR,SOLAR,2020,2,19


In [73]:
# turning data into numbers
#checking which columns are string 
for label,columns in df_temp.items():
	if pd.api.types.is_string_dtype(columns):
		print(columns)

0         ACTUAL
376       ACTUAL
375       ACTUAL
374       ACTUAL
373       ACTUAL
           ...  
375674    ACTUAL
375675    ACTUAL
375676    ACTUAL
375768    ACTUAL
376057    ACTUAL
Name: Air Temp. Inst. Source Flag, Length: 376058, dtype: object
0         ACTUAL
376       ACTUAL
375       ACTUAL
374       ACTUAL
373       ACTUAL
           ...  
375674    ACTUAL
375675    ACTUAL
375676    ACTUAL
375768    ACTUAL
376057    ACTUAL
Name: Humidity Inst. Source Flag, Length: 376058, dtype: object
0         ACTUAL
376       ACTUAL
375       ACTUAL
374       ACTUAL
373       ACTUAL
           ...  
375674    ACTUAL
375675    ACTUAL
375676    ACTUAL
375768    ACTUAL
376057    ACTUAL
Name: Relative Humidity Avg. Source Flag, Length: 376058, dtype: object
0         ESTIMATED
376       ESTIMATED
375       ESTIMATED
374       ESTIMATED
373       ESTIMATED
            ...    
375674    ESTIMATED
375675    ESTIMATED
375676    ESTIMATED
375768    ESTIMATED
376057    ESTIMATED
Name: Incoming Sol

In [74]:
#turn the string column into numbers 

for label,columns in df_temp.items():
	if pd.api.types.is_string_dtype(columns):
		df_temp[label]=columns.astype("category").cat.as_ordered()

In [76]:
print(df_temp.columns)  # Check available columns


Index(['Air Temp. Inst. (°C)', 'Air Temp. Inst. Source Flag',
       'Air Temp. Inst. Comment', 'Humidity Inst. (%)',
       'Humidity Inst. Source Flag', 'Humidity Inst. Comment',
       'Relative Humidity Avg. (%)', 'Relative Humidity Avg. Source Flag',
       'Relative Humidity Avg. Comment', 'Incoming Solar Rad. (W/m2)',
       'Incoming Solar Rad. Source Flag', 'Incoming Solar Rad. Comment',
       'Precip. (mm)', 'Precip. Source Flag', 'Precip. Comment',
       'Wind Speed 2 m Avg. (km/h)', 'Wind Speed 2 m Avg. Source Flag',
       'Wind Speed 2 m Avg. Comment', 'Wind Speed 10 m Syno. (km/h)',
       'Wind Speed 10 m Syno. Source Flag', 'Wind Speed 10 m Syno. Comment',
       'Wind Dir. 10 m Syno. (°)', 'Wind Dir. 10 m Syno. Source Flag',
       'Wind Dir. 10 m Syno. Comment', 'Wind Speed 10 m Avg. (km/h)',
       'Wind Speed 10 m Avg. Source Flag', 'Wind Speed 10 m Avg. Comment',
       'Wind Dir. 10 m Avg. (°)', 'Wind Dir. 10 m Avg. Source Flag',
       'Wind Dir. 10 m Avg. Com

In [84]:
# filling missing values 
for label, content in df_temp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            # Add a binary column which tells us if the data was missing or not
            df_temp[label+"_is_missing"] = pd.isnull(content)
            # Fill missing numeric values with median
            df_temp[label] = content.fillna(content.median())

    # Filled categorical missing data and turn categories into numbers
    if not pd.api.types.is_numeric_dtype(content):
        df_temp[label+"_is_missing"] = pd.isnull(content)
        # We add +1 to the category code because pandas encodes missing categories as -1
        df_temp[label] = pd.Categorical(content).codes+1


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [83]:
df_temp.tail(20).columns

Index(['Air Temp. Inst. (°C)', 'Air Temp. Inst. Source Flag',
       'Air Temp. Inst. Comment', 'Humidity Inst. (%)',
       'Humidity Inst. Source Flag', 'Humidity Inst. Comment',
       'Relative Humidity Avg. (%)', 'Relative Humidity Avg. Source Flag',
       'Relative Humidity Avg. Comment', 'Incoming Solar Rad. (W/m2)',
       'Incoming Solar Rad. Source Flag', 'Incoming Solar Rad. Comment',
       'Precip. (mm)', 'Precip. Source Flag', 'Precip. Comment',
       'Wind Speed 2 m Avg. (km/h)', 'Wind Speed 2 m Avg. Source Flag',
       'Wind Speed 2 m Avg. Comment', 'Wind Speed 10 m Syno. (km/h)',
       'Wind Speed 10 m Syno. Source Flag', 'Wind Speed 10 m Syno. Comment',
       'Wind Dir. 10 m Syno. (°)', 'Wind Dir. 10 m Syno. Source Flag',
       'Wind Dir. 10 m Syno. Comment', 'Wind Speed 10 m Avg. (km/h)',
       'Wind Speed 10 m Avg. Source Flag', 'Wind Speed 10 m Avg. Comment',
       'Wind Dir. 10 m Avg. (°)', 'Wind Dir. 10 m Avg. Source Flag',
       'Wind Dir. 10 m Avg. Com

In [20]:
# # Splitting train and validation data
# df_val = df_temp[df_temp.year == 2020]
# df_train = df_temp[df_temp.year != 2020]

# # Checking data sizes
# print(len(df_val), len(df_train))

# # Splitting into features (X) and target (y)
# X_train, y_train = df_train.drop(columns=["Volume"]), df_train["Volume"]
# X_valid, y_valid = df_val.drop(columns=["Volume"]), df_val["Volume"]

# # Checking shapes
# X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

In [85]:
from sklearn.model_selection import TimeSeriesSplit

# Assuming 'df_temp' is already sorted by 'year' (or by time column)
# Splitting into features (X) and target (y)
X = df_temp.drop(columns=["Volume"])
y = df_temp["Volume"]

# Checking the shape before splitting
print(X.shape, y.shape)

# Initialize TimeSeriesSplit with the desired number of splits
tscv = TimeSeriesSplit(n_splits=5)

# Loop through the splits and get training and validation data
for train_index, valid_index in tscv.split(X):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    # Check shapes of train and validation sets for each split
    print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)


(376058, 62) (376058,)
(62678, 62) (62678,) (62676, 62) (62676,)
(125354, 62) (125354,) (62676, 62) (62676,)
(188030, 62) (188030,) (62676, 62) (62676,)
(250706, 62) (250706,) (62676, 62) (62676,)
(313382, 62) (313382,) (62676, 62) (62676,)


In [86]:
%%time
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Different RandomForestClassifier hyperparameters
rf_grid = {
    "n_estimators": np.arange(10, 50, 10),  # Reduce complexity
    "max_depth": [5, 10, 15],  # Limit depth
    "min_samples_split": np.arange(5, 20, 5),  # Encourage splits on more data
    "min_samples_leaf": np.arange(5, 20, 5),  # Prevent overfitting on small samples
    "max_features": ["sqrt", 0.5],
    "max_samples": [None]  # Allow dynamic sampling
}

rs_model = RandomizedSearchCV(RandomForestRegressor(),
                              param_distributions=rf_grid,
                              n_iter=20,
                              cv=5,
                              verbose=True)

rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: total: 8min 45s
Wall time: 8min 52s


In [87]:
rs_model.best_params_

{'n_estimators': np.int64(20),
 'min_samples_split': np.int64(10),
 'min_samples_leaf': np.int64(5),
 'max_samples': None,
 'max_features': 0.5,
 'max_depth': 5}

In [88]:
%%time
# Most ideal hyperparameters
ideal_model = RandomForestRegressor(n_estimators=90,
                                    min_samples_leaf=1,
                                    min_samples_split=14,
                                    max_features=0.5,
                                    n_jobs=-1,
                                    max_samples=None)
ideal_model.fit(X_train, y_train)

CPU times: total: 1min 21s
Wall time: 13.4 s


In [89]:
def rmsle(y_test, y_preds):
    return np.sqrt(mean_squared_log_error(y_test, y_preds))
# Create function to evaluate our model
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
def show_scores(model):
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_valid)
    scores = {"Training MAE": mean_absolute_error(y_train, train_preds),
              "Valid MAE": mean_absolute_error(y_valid, val_preds),
              "Training RMSLE": rmsle(y_train, train_preds),
              "Valid RMSLE": rmsle(y_valid, val_preds),
              "Training R^2": model.score(X_train, y_train),
              "Valid R^2": model.score(X_valid, y_valid)}
    return scores

In [90]:
show_scores(ideal_model)

{'Training MAE': 4.317008117571929,
 'Valid MAE': 4.807101240960573,
 'Training RMSLE': np.float64(1.1680304665714298),
 'Valid RMSLE': np.float64(1.4806566227149753),
 'Training R^2': 0.2811729630308666,
 'Valid R^2': -0.18345195058558028}