In [7]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datetime import timedelta

In [8]:
# Load the combined observations. Forward slashes keep this relative path
# portable: a backslash-separated path only resolves on Windows, while '/'
# works on every platform Python supports.
df = pd.read_csv('data/unpred/combined_data.csv')


In [9]:
# Give the raw date/hour columns the lowercase names the later cells refer to.
column_renames = {'MN': 'month', 'DT': 'day', 'YEAR': 'year', 'HR': 'hr_code'}
df = df.rename(columns=column_renames)


In [10]:
# Lookup from the raw HR code to the hour value it is mapped to.
hr_code_to_hour = {0: 0, 12: 3, 24: 6, 36: 9, 48: 12, 60: 15, 72: 18, 84: 21}
valid_codes = list(hr_code_to_hour.keys())

# Filter to valid HR values only. The explicit .copy() makes the filtered
# result an independent frame, so the column assignment below does not raise
# SettingWithCopyWarning or risk writing to a temporary object.
df = df[df['hr_code'].isin(valid_codes)].copy()

# Map HR code to actual hour
df['hour'] = df['hr_code'].map(hr_code_to_hour)

In [11]:
# Combine the calendar date with the hour offset into a single timestamp.
calendar_date = pd.to_datetime(df[['year', 'month', 'day']])
hour_offset = pd.to_timedelta(df['hour'], unit='h')
df['DateTime'] = calendar_date + hour_offset

# Time-based features, each read from the matching .dt attribute.
# Note that 'month' is re-derived from the timestamp and overwrites the
# column of the same name.
stamp = df['DateTime'].dt
for feature_name in ('dayofyear', 'weekday', 'month'):
    df[feature_name] = getattr(stamp, feature_name)

In [12]:
# Column being predicted and the model input columns.
target = 'DBT'
features = ['DPT', 'WBT', 'hour', 'dayofyear', 'weekday', 'month']

# Keep only rows in which every input column and the target are present.
required_columns = [*features, target]
df = df.dropna(subset=required_columns)

In [13]:
# Split without shuffling, so the test set is the trailing 20% of rows in
# their current order.
# NOTE(review): this is only a time-based hold-out if df is already in
# chronological order — confirm against the input file.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# Train the XGBoost model on the leading 80% of rows.
model = xgb.XGBRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [14]:
# Evaluate on the held-out rows and report the root-mean-squared error.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE: {rmse:.2f}")

# Create future timestamps: 56 steps of 3 hours = 7 days, starting one step
# after the latest timestamp in the data.
# The step is passed as a Timedelta instead of the '3H' alias: the uppercase
# 'H' offset alias is deprecated in recent pandas and emits a FutureWarning,
# whereas a Timedelta frequency is accepted by every pandas version.
forecast_step = pd.Timedelta(hours=3)
last_dt = df['DateTime'].max()
future_times = pd.date_range(start=last_dt + forecast_step, periods=56, freq=forecast_step)


Test RMSE: 3.30


  future_times = pd.date_range(start=last_dt + timedelta(hours=3), periods=56, freq='3H')  # 3h steps × 7 days


In [15]:
# DPT and WBT are not known for the future rows, so each is held constant at
# its mean over the last 56 rows of df (scalars broadcast across all rows).
recent_rows = df.iloc[-56:]
mean_dpt = recent_rows['DPT'].mean()
mean_wbt = recent_rows['WBT'].mean()

future_df = pd.DataFrame({
    'DateTime': future_times,
    'DPT': mean_dpt,
    'WBT': mean_wbt,
})

# Add time-based features, read from the future timestamps.
future_stamp = future_df['DateTime'].dt
future_df['hour'] = future_stamp.hour
future_df['dayofyear'] = future_stamp.dayofyear
future_df['weekday'] = future_stamp.weekday
future_df['month'] = future_stamp.month

In [16]:
# Score the future rows using the same feature columns, in the same order,
# that the model was fitted on.
future_features = future_df.loc[:, features]
future_df['Predicted_DBT'] = model.predict(future_features)

# Show the forecast
forecast_view = future_df[['DateTime', 'Predicted_DBT']]
print("7-Day Forecast (3-hour intervals):")
print(forecast_view)

7-Day Forecast (3-hour intervals):
              DateTime  Predicted_DBT
0  2025-04-19 03:00:00      30.060165
1  2025-04-19 06:00:00      31.562016
2  2025-04-19 09:00:00      31.589542
3  2025-04-19 12:00:00      31.354803
4  2025-04-19 15:00:00      31.078085
5  2025-04-19 18:00:00      30.324373
6  2025-04-19 21:00:00      30.127539
7  2025-04-20 00:00:00      29.649000
8  2025-04-20 03:00:00      30.060165
9  2025-04-20 06:00:00      31.562016
10 2025-04-20 09:00:00      31.589542
11 2025-04-20 12:00:00      31.234037
12 2025-04-20 15:00:00      30.934669
13 2025-04-20 18:00:00      30.180958
14 2025-04-20 21:00:00      29.984123
15 2025-04-21 00:00:00      29.633413
16 2025-04-21 03:00:00      30.044579
17 2025-04-21 06:00:00      31.546429
18 2025-04-21 09:00:00      31.573956
19 2025-04-21 12:00:00      31.218451
20 2025-04-21 15:00:00      30.915705
21 2025-04-21 18:00:00      30.161993
22 2025-04-21 21:00:00      29.965158
23 2025-04-22 00:00:00      29.633413
24 2025-04-22 0

In [17]:
# Write the forecast to CSV and report the path that was actually written.
# The message is built from the same variable as the output path so the two
# cannot drift apart (previously the message named a different file).
forecast_path = 'data/pred/dbt_forecast_xgb.csv'
future_df[['DateTime', 'Predicted_DBT']].to_csv(forecast_path, index=False)
print(f"Forecast saved to {forecast_path}")

Forecast saved to dbt_7day_3hr_forecast.csv
