# Tree + Forecasted Trajectory Nearest Neighbor

In [8]:
import pandas as pd
import xgboost as xgb
import pandas as pd
import numpy as np
import random
from plotly.subplots import make_subplots
import plotly.graph_objs as go

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import mlflow

from tqdm import tqdm
import json

## Model and Cross-Validation Splitter Classes

In [12]:
class ForecastedTrajectoryNeighbors:
    """Blend a forecasting model's output with a nearest-neighbour estimate.

    ``model`` produces a first-pass forecast, ``knn`` is then queried with that
    forecast, and the two results are mixed with weight ``alpha``.

    Both estimators are duck-typed: ``model`` needs ``fit(endog, exog)`` and
    ``predict(y)``; ``knn`` needs ``fit(endog)`` and ``predict(forecast)``.
    They may be passed to the constructor or assigned to ``self.model`` /
    ``self.knn`` afterwards.
    """

    def __init__(self, model=None, knn=None):
        """Store the two estimators (both optional; may be assigned later)."""
        self.model = model
        self.knn = knn

    def _require_estimators(self):
        """Raise ``ValueError`` naming whichever estimator is still unset."""
        missing = [name for name in ("model", "knn") if getattr(self, name) is None]
        if missing:
            raise ValueError(
                "ForecastedTrajectoryNeighbors is missing estimator(s): "
                + ", ".join(missing)
            )

    def fit(self, endog, exog=None):
        """Fit ``model`` on ``(endog, exog)`` and ``knn`` on ``endog``.

        Raises:
            ValueError: if ``model`` or ``knn`` has not been set.
        """
        self._require_estimators()
        self.model.fit(endog, exog)
        self.knn.fit(endog)

    def predict(self, alpha=0.5, y=None):
        """Return ``alpha * knn_forecast + (1 - alpha) * model_forecast``.

        Args:
            alpha: weight given to the nearest-neighbour forecast; the model
                forecast gets ``1 - alpha``.
            y: input handed to ``model.predict``. It is required; it sits after
                ``alpha`` only so existing ``predict(alpha)`` calls keep their
                meaning.

        Raises:
            ValueError: if ``y`` is omitted or an estimator has not been set.
        """
        if y is None:
            raise ValueError("predict() needs the input `y` to forecast from")
        self._require_estimators()

        y_naught = self.model.predict(y)
        # NOTE(review): the original passed an unassigned `y_hat` here. Querying
        # the neighbours with the model's forecast is the presumed intent
        # ("forecasted trajectory" neighbours) -- confirm.
        y_hat = self.knn.predict(y_naught)
        return alpha * y_hat + (1 - alpha) * y_naught

In [6]:
# essentially https://goldinlocks.github.io/Time-Series-Cross-Validation/
class BlockingTimeSeriesSplit:
    """Cross-validation splitter over contiguous, non-overlapping blocks.

    The first ``n_splits * (len(X) // n_splits)`` samples are cut into
    ``n_splits`` equal blocks; any remainder at the end is not used. Within
    each block the leading part is the training fold and the trailing
    ``val_size`` fraction is the validation fold, so no fold shares samples
    with another block.

    ``split`` accepts ``groups`` and the class provides ``get_n_splits`` so an
    instance can be passed as ``cv=`` to scikit-learn's model-selection tools.
    """

    def __init__(self, n_splits: int = 5, val_size: float = 0.2):
        """Configure the splitter.

        Args:
            n_splits: number of blocks (and therefore folds); at least 1.
            val_size: fraction of each block used for validation, strictly
                between 0 and 1.

        Raises:
            ValueError: if either argument is out of range.
        """
        if n_splits < 1:
            raise ValueError(f"n_splits must be at least 1, got {n_splits!r}")
        if not 0 < val_size < 1:
            raise ValueError(
                f"val_size must be strictly between 0 and 1, got {val_size!r}"
            )
        self.n_splits = n_splits
        self.val_size = val_size

    def get_n_splits(self, X=None, y=None, groups=None) -> int:
        """Return the number of folds; the arguments are accepted and ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, val_indices)`` index arrays, one pair per block.

        Args:
            X: any sized collection; only ``len(X)`` is used.
            y: ignored, accepted for API compatibility.
            groups: ignored, accepted for API compatibility.

        Raises:
            ValueError: if ``X`` has fewer samples than ``n_splits`` (raised
                when iteration starts, since this is a generator).
        """
        n_samples = len(X)
        block_size = n_samples // self.n_splits
        if block_size == 0:
            raise ValueError(
                f"Cannot make {self.n_splits} blocks from {n_samples} sample(s)"
            )

        indices = np.arange(n_samples)
        # Every block has the same length, so the validation length is too.
        val_len = int(self.val_size * block_size)
        for i in range(self.n_splits):
            start = i * block_size
            stop = start + block_size
            split = stop - val_len
            yield indices[start:split], indices[split:stop]

## Prepare the Data

In [4]:
# Load the raw dataset, parsing the 'date' column as datetimes.
data = pd.read_csv("../../../data/dataset.csv", parse_dates=['date'])

# A column survives only if at least 90% of its rows are non-missing.
thresh = int(0.9 * len(data))
data = data.dropna(axis=1, thresh=thresh)

# Fill the remaining gaps: backward fill first, then forward fill for any
# trailing gaps that have no later value.
# NOTE(review): the backward fill runs before the train/test split, so a gap
# can be filled from a later row -- confirm this look-ahead is acceptable.
data = data.bfill().ffill()

# 'demand' is removed from the frame.
data = data.drop(columns='demand')
data

Unnamed: 0,date,net_generation,total_interchange,net_generation_coal,net_generation_natural_gas,net_generation_nuclear,net_generation_hydropower_and_pumped_storage,net_generation_solar,net_generation_wind,net_generation_other_fuel_sources,...,SNOW,SNWD,SX52,SX53,TMAX,TMIN,TOBS,WSFG,WT01,consumption
0,2016-01-01,67425.0,-3254.0,39962.0,20101.0,11610.0,746.0,0.0,3865.0,1122.0,...,5.0,96.0,17.0,17.0,-27.0,-77.0,-56.0,192.0,1.0,127103.0
1,2016-01-02,67906.0,-2904.0,39962.0,20101.0,11610.0,746.0,0.0,3865.0,1122.0,...,1.0,92.0,12.0,17.0,-17.0,-78.0,-53.0,192.0,1.0,127530.0
2,2016-01-03,67305.0,-3588.0,39962.0,20101.0,11610.0,746.0,0.0,3865.0,1122.0,...,3.0,85.0,12.0,17.0,3.0,-69.0,-35.0,192.0,1.0,127693.0
3,2016-01-04,77001.0,-2642.0,39962.0,20101.0,11610.0,746.0,0.0,3865.0,1122.0,...,17.0,92.0,12.0,17.0,-21.0,-84.0,-75.0,192.0,1.0,143012.0
4,2016-01-05,81083.0,-896.0,39962.0,20101.0,11610.0,746.0,0.0,3865.0,1122.0,...,1.0,90.0,8.0,17.0,-30.0,-126.0,-86.0,192.0,1.0,147147.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3179,2024-09-14,72728.0,-4453.0,21114.0,31613.0,11557.0,1049.0,1925.0,5164.0,306.0,...,0.0,0.0,217.0,217.0,282.0,136.0,169.0,156.0,1.0,145072.0
3180,2024-09-15,72659.0,-5173.0,21023.0,29004.0,11520.0,1053.0,1917.0,7857.0,283.0,...,0.0,0.0,217.0,217.0,284.0,144.0,176.0,156.0,1.0,145998.0
3181,2024-09-16,78197.0,-4749.0,22613.0,31772.0,11471.0,1118.0,2015.0,8937.0,272.0,...,0.0,0.0,217.0,217.0,295.0,141.0,163.0,156.0,1.0,155157.0
3182,2024-09-17,79791.0,-2634.0,22567.0,30658.0,11452.0,995.0,2400.0,11449.0,272.0,...,0.0,0.0,217.0,217.0,294.0,130.0,155.0,156.0,1.0,154206.0


## Splits

In [9]:
# Positional 85/15 hold-out: the first 85% of rows train, the rest test.
# (Assumes the rows of `data` are already in date order -- TODO confirm.)
split = int(len(data) * 0.85)
train = data.iloc[:split, :]
test = data.iloc[split:, :]  # DO NOT TOUCH THE TEST DATA

# Target (endog) is 'consumption'; features (exog) are every column except
# 'date' and 'consumption'.
endog_train = train['consumption']
exog_train = train.drop(columns=['date', 'consumption'])
endog_test = test['consumption']
exog_test = test.drop(columns=['date', 'consumption'])

## Baseline

## Hyper-Parameter Tuning and CV

## Evaluation