In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# Load the dataset
file_path = 'C:\\Users\\johan\\TDT4259_Data_Science\\consumption_temp.csv'  # Replace with your file path
df = pd.read_csv(file_path)


In [2]:
# Filter data for Oslo
df_oslo = df[df['location'] == 'oslo'].copy()

# Convert 'time' to datetime and sort
df_oslo['time'] = pd.to_datetime(df_oslo['time'])
df_oslo.sort_values('time', inplace=True)

print(df_oslo.head())
print(df_oslo)

print(df_oslo['consumption'].mean())


                  time location  consumption  temperature
1  2022-04-07 21:00:00     oslo     4.092830          1.0
6  2022-04-07 22:00:00     oslo     3.818095          0.4
11 2022-04-07 23:00:00     oslo     3.660981          0.3
16 2022-04-08 00:00:00     oslo     3.584066          0.2
21 2022-04-08 01:00:00     oslo     3.549536          0.4
                     time location  consumption  temperature
1     2022-04-07 21:00:00     oslo     4.092830          1.0
6     2022-04-07 22:00:00     oslo     3.818095          0.4
11    2022-04-07 23:00:00     oslo     3.660981          0.3
16    2022-04-08 00:00:00     oslo     3.584066          0.2
21    2022-04-08 01:00:00     oslo     3.549536          0.4
...                   ...      ...          ...          ...
49466 2023-04-02 17:00:00     oslo    12.410225          5.5
49472 2023-04-02 18:00:00     oslo    12.856381          4.8
49478 2023-04-02 19:00:00     oslo    13.010338          0.6
49484 2023-04-02 20:00:00     oslo    12.7

In [3]:
# Generate time-based features
df_oslo['hour'] = df_oslo['time'].dt.hour
df_oslo['day_of_week'] = df_oslo['time'].dt.dayofweek
df_oslo['month'] = df_oslo['time'].dt.month

#generate lagged features
df_oslo['lag7D_consumption'] = df_oslo['consumption'].shift(24*7)
df_oslo['lag24_temperature'] = df_oslo['temperature'].shift(48)
#generate rolling mean features

df_oslo['rolling_mean_temp_1W'] = df_oslo['temperature'].rolling(window=168).mean()
df_oslo['rolling_mean_temp_3D'] = df_oslo['temperature'].rolling(window=72).mean()
#creating binary feature for working day
df_oslo['working_day'] = np.where(df_oslo['day_of_week'] <= 5, 1, 0)
# Prepare features and target
features = ['temperature', 'hour', 'day_of_week', 'month', 'lag7D_consumption',
              'lag24_temperature', 
              'rolling_mean_temp_1W', 'rolling_mean_temp_3D', 'working_day']

X = df_oslo[features]
print(X)
y = df_oslo['consumption']

       temperature  hour  day_of_week  month  lag7D_consumption  \
1              1.0    21            3      4                NaN   
6              0.4    22            3      4                NaN   
11             0.3    23            3      4                NaN   
16             0.2     0            4      4                NaN   
21             0.4     1            4      4                NaN   
...            ...   ...          ...    ...                ...   
49466          5.5    17            6      4          12.410225   
49472          4.8    18            6      4          12.856381   
49478          0.6    19            6      4          13.010338   
49484         -0.3    20            6      4          12.738356   
49490         -1.1    21            6      4          12.134655   

       lag24_temperature  rolling_mean_temp_1W  rolling_mean_temp_3D  \
1                    NaN                   NaN                   NaN   
6                    NaN                   NaN     

In [4]:
# Initialize some variables for the rolling forecast
window_size = 168  # Define the size of the test window (e.g., 30 days)
step_size = 168  # Define the step size for rolling (e.g., 7 days)
n_splits = int((len(X) - window_size) / step_size)

# Initialize the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)

# Store RMSE for each split
rmse_list = []
mae_list = []

# Rolling Forecast Origin loop
for i in range(n_splits):
    train_end = i * step_size
    test_start = train_end + 1
    test_end = test_start + window_size
    
    X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]
    X_test, y_test = X.iloc[test_start:test_end], y.iloc[test_start:test_end]
    
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mae_list.append(mae)
    print(f"Split {i+1}, MAE: {mae}")
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    rmse_list.append(rmse)
    print(f"Split {i+1}, RMSE: {rmse}")
    #print model feature importance
    featureImportances = []
    for i in range(len(xgb_model.feature_importances_)):
        featureImportances.append(features[i]  + str(xgb_model.feature_importances_[i]))
    print(featureImportances)
# Final model training on the entire dataset
xgb_model.fit(X, y)


# Average RMSE
print("Average RMSE across all splits:", np.mean(rmse_list))
print("Average MAE across all splits:", np.mean(mae_list))




Split 1, MAE: 3.3780590595238102
Split 1, RMSE: 3.395352215839491
['temperature0.0', 'hour0.0', 'day_of_week0.0', 'month0.0', 'lag7D_consumption0.0', 'lag24_temperature0.0', 'rolling_mean_temp_1W0.0', 'rolling_mean_temp_3D0.0', 'working_day0.0']
Split 2, MAE: 0.5208962004005795
Split 2, RMSE: 0.5881096014744248
['temperature0.026457788', 'hour0.11200199', 'day_of_week0.21688834', 'month0.0', 'lag7D_consumption0.0', 'lag24_temperature0.6023971', 'rolling_mean_temp_1W0.0016943846', 'rolling_mean_temp_3D0.040560436', 'working_day0.0']
Split 3, MAE: 0.23900609528823122
Split 3, RMSE: 0.295057204099426
['temperature0.009123523', 'hour0.029820535', 'day_of_week0.023058988', 'month0.0', 'lag7D_consumption0.007681533', 'lag24_temperature0.0697174', 'rolling_mean_temp_1W0.8463824', 'rolling_mean_temp_3D0.014215602', 'working_day0.0']
Split 4, MAE: 0.19106522957066127
Split 4, RMSE: 0.22717584209138397
['temperature0.018395681', 'hour0.047420766', 'day_of_week0.032663018', 'month0.0', 'lag7D_con

In [5]:
#using the model to predict the next 24 hours 5 days after the last data point

