In [7]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# Load the dataset
file_path = 'C:\\Users\\johan\\TDT4259_Data_Science\\consumption_temp.csv'  # Replace with your file path
df = pd.read_csv(file_path)


In [8]:
# Filter data for Oslo
df_oslo = df[df['location'] == 'oslo'].copy()

# Convert 'time' to datetime and sort
df_oslo['time'] = pd.to_datetime(df_oslo['time'])
df_oslo.sort_values('time', inplace=True)

print(df_oslo.head())


                  time location  consumption  temperature
1  2022-04-07 21:00:00     oslo     4.092830          1.0
6  2022-04-07 22:00:00     oslo     3.818095          0.4
11 2022-04-07 23:00:00     oslo     3.660981          0.3
16 2022-04-08 00:00:00     oslo     3.584066          0.2
21 2022-04-08 01:00:00     oslo     3.549536          0.4


In [9]:
# Generate time-based features
df_oslo['hour'] = df_oslo['time'].dt.hour
df_oslo['day_of_week'] = df_oslo['time'].dt.dayofweek
df_oslo['month'] = df_oslo['time'].dt.month

#generate lagged features
df_oslo['lag24_consumption'] = df_oslo['consumption'].shift(24)
df_oslo['lag24_temperature'] = df_oslo['temperature'].shift(48)
#generate rolling mean features
df_oslo['rolling_mean_consumption_1W'] = df_oslo['consumption'].rolling(window=168).mean()
df_oslo['rolling_mean_consumption_3D'] = df_oslo['consumption'].rolling(window=72).mean()
df_oslo['rolling_mean_temp_1W'] = df_oslo['temperature'].rolling(window=168).mean()
df_oslo['rolling_mean_temp_3D'] = df_oslo['temperature'].rolling(window=72).mean()
# Prepare features and target
X = df_oslo[['temperature', 'hour', 'day_of_week', 'month', 'lag24_consumption', 'lag24_temperature', 'rolling_mean_consumption_1W', 'rolling_mean_consumption_3D', 'rolling_mean_temp_1W', 'rolling_mean_temp_3D']]
y = df_oslo['consumption']

In [10]:
# Initialize some variables for the rolling forecast
window_size = 168*4  # Define the size of the test window (e.g., 30 days)
step_size = 168  # Define the step size for rolling (e.g., 7 days)
n_splits = int((len(X) - window_size) / step_size)

# Initialize the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)

# Store RMSE for each split
rmse_list = []
mae_list = []

# Rolling Forecast Origin loop
for i in range(n_splits):
    train_end = i * step_size
    test_start = train_end + 1
    test_end = test_start + window_size
    
    X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]
    X_test, y_test = X.iloc[test_start:test_end], y.iloc[test_start:test_end]
    
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mae_list.append(mae)
    print(f"Split {i+1}, MAE: {mae}")
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    rmse_list.append(rmse)
    print(f"Split {i+1}, RMSE: {rmse}")
    #print model feature importance
    print(xgb_model.feature_importances_)

# Final model training on the entire dataset
xgb_model.fit(X, y)
print(xgb_model.feature_importances_)

# Average RMSE
print("Average RMSE across all splits:", np.mean(rmse_list))
print("Average MAE across all splits:", np.mean(mae_list))


Split 1, MAE: 2.7857487529761906
Split 1, RMSE: 2.8248904450148102
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Split 2, MAE: 0.7701998853480703
Split 2, RMSE: 0.822801888444303
[1.4406640e-02 5.4135535e-02 9.6603651e-03 0.0000000e+00 7.1634822e-02
 6.6057080e-01 1.9341735e-05 1.1789722e-01 0.0000000e+00 7.1675271e-02]
Split 3, MAE: 0.16722669612818672
Split 3, RMSE: 0.2085335325580084
[0.00874398 0.01650904 0.01273543 0.         0.08108836 0.12771635
 0.7020414  0.03412217 0.00363168 0.01341155]
Split 4, MAE: 0.18555062686998505
Split 4, RMSE: 0.24120969314135504
[0.00818645 0.00886623 0.00944051 0.         0.04821219 0.03957866
 0.05296862 0.009804   0.81209326 0.01085004]
Split 5, MAE: 0.1784501669640768
Split 5, RMSE: 0.22819215145575827
[0.01311792 0.01734158 0.02223004 0.02937495 0.10054808 0.05434546
 0.20555858 0.0134128  0.5359719  0.00809866]
Split 6, MAE: 0.19623820133893832
Split 6, RMSE: 0.24892064889638194
[0.01972769 0.02155473 0.02560192 0.01951905 0.13577402 0.04469564
 0.26826167 

In [11]:
#using the model to predict the next 24 hours 5 days after the last data point

