# Batch Training

In [1]:
import numpy
numpy.version.version

'2.0.0'

In [2]:
import yaml
import pandas as pd

# Path to the YAML configuration file
yaml_file_path = 'feature_store/config_v1.yaml'

# Write the configuration data to a YAML file
with open(yaml_file_path, 'r') as file:
    config = yaml.safe_load(file)

In [3]:
# Get Features
from scripts import feature_store
X_train = feature_store.fetch_data_from_store(yaml_file_path = yaml_file_path)
X_train = X_train.head(-7) # save some rows for later
X_train

Unnamed: 0_level_0,lag_1,lag_4,lag_5,lag_6,lag_11,lag_12,lag_13,rolling_mean_7,rolling_std_7
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-01-14,119134.0,129372.0,134119.0,132752.0,131883.0,128866.0,116406.0,127822.29,5524.62
2024-01-15,121604.0,129308.0,129372.0,134119.0,131606.0,131883.0,128866.0,128386.86,6264.72
2024-01-16,136704.0,128467.0,129308.0,129372.0,136960.0,131606.0,131883.0,130031.00,8953.19
2024-01-17,145628.0,119134.0,128467.0,129308.0,130213.0,136960.0,131606.0,133167.71,12008.71
2024-01-18,151329.0,121604.0,119134.0,128467.0,126933.0,130213.0,136960.0,135579.14,12774.85
...,...,...,...,...,...,...,...,...,...
2024-03-27,125493.0,122277.0,130956.0,129138.0,109879.0,119478.0,120293.0,125439.86,4217.72
2024-03-28,125867.0,118230.0,122277.0,130956.0,107680.0,109879.0,119478.0,124931.14,3900.13
2024-03-29,125577.0,126118.0,118230.0,122277.0,118592.0,107680.0,109879.0,123498.29,3072.37
2024-03-30,120926.0,125493.0,126118.0,118230.0,125016.0,118592.0,107680.0,122283.57,4819.72


In [4]:
# Get targets
from scripts import feature_processing

csv_file_path = 'data/energy_data_new.csv'
df = pd.read_csv(csv_file_path, parse_dates=['period'])
df.set_index('period', inplace=True)

targets_df = feature_processing.get_targets(df)
targets_df

Unnamed: 0_level_0,target_1d,target_2d,target_3d
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,128866.0,131883.0,131606.0
2024-01-02,131883.0,131606.0,136960.0
2024-01-03,131606.0,136960.0,130213.0
2024-01-04,136960.0,130213.0,126933.0
2024-01-05,130213.0,126933.0,132752.0
...,...,...,...
2024-03-31,119066.0,124079.0,128889.0
2024-04-01,124079.0,128889.0,127645.0
2024-04-02,128889.0,127645.0,124704.0
2024-04-03,127645.0,124704.0,115664.0


In [5]:
feature_store.update_feature_store(targets_df, yaml_file_path, targets = True)

'Targets updated in feature store with last date 2024-04-04 00:00:00'

In [6]:
feature_store.fetch_data_from_store(X_train.index.min(), yaml_file_path, targets = True)


Unnamed: 0_level_0,target_1d,target_2d,target_3d
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-14,136704.0,145628.0,151329.0
2024-01-15,145628.0,151329.0,146188.0
2024-01-16,151329.0,146188.0,147359.0
2024-01-17,146188.0,147359.0,144471.0
2024-01-18,147359.0,144471.0,143686.0
...,...,...,...
2024-03-31,119066.0,124079.0,128889.0
2024-04-01,124079.0,128889.0,127645.0
2024-04-02,128889.0,127645.0,124704.0
2024-04-03,127645.0,124704.0,115664.0


In [7]:
Y_train = feature_store.fetch_data_from_store(X_train.index.min(), yaml_file_path, targets = True)[:X_train.index.max()]
Y_train

Unnamed: 0_level_0,target_1d,target_2d,target_3d
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-14,136704.0,145628.0,151329.0
2024-01-15,145628.0,151329.0,146188.0
2024-01-16,151329.0,146188.0,147359.0
2024-01-17,146188.0,147359.0,144471.0
2024-01-18,147359.0,144471.0,143686.0
...,...,...,...
2024-03-27,125577.0,120926.0,113774.0
2024-03-28,120926.0,113774.0,107790.0
2024-03-29,113774.0,107790.0,119066.0
2024-03-30,107790.0,119066.0,124079.0


In [8]:
# turn into a list
y_train_ls= []
for target in Y_train.columns:
  y_train_ls.append(Y_train[target])

y_train_ls

[period
 2024-01-14    136704.0
 2024-01-15    145628.0
 2024-01-16    151329.0
 2024-01-17    146188.0
 2024-01-18    147359.0
                 ...   
 2024-03-27    125577.0
 2024-03-28    120926.0
 2024-03-29    113774.0
 2024-03-30    107790.0
 2024-03-31    119066.0
 Name: target_1d, Length: 78, dtype: float64,
 period
 2024-01-14    145628.0
 2024-01-15    151329.0
 2024-01-16    146188.0
 2024-01-17    147359.0
 2024-01-18    144471.0
                 ...   
 2024-03-27    120926.0
 2024-03-28    113774.0
 2024-03-29    107790.0
 2024-03-30    119066.0
 2024-03-31    124079.0
 Name: target_2d, Length: 78, dtype: float64,
 period
 2024-01-14    151329.0
 2024-01-15    146188.0
 2024-01-16    147359.0
 2024-01-17    144471.0
 2024-01-18    143686.0
                 ...   
 2024-03-27    113774.0
 2024-03-28    107790.0
 2024-03-29    119066.0
 2024-03-30    124079.0
 2024-03-31    128889.0
 Name: target_3d, Length: 78, dtype: float64]

In [9]:
# Install XGBoost model
%pip install xgboost


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [10]:
X_train

Unnamed: 0_level_0,lag_1,lag_4,lag_5,lag_6,lag_11,lag_12,lag_13,rolling_mean_7,rolling_std_7
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-01-14,119134.0,129372.0,134119.0,132752.0,131883.0,128866.0,116406.0,127822.29,5524.62
2024-01-15,121604.0,129308.0,129372.0,134119.0,131606.0,131883.0,128866.0,128386.86,6264.72
2024-01-16,136704.0,128467.0,129308.0,129372.0,136960.0,131606.0,131883.0,130031.00,8953.19
2024-01-17,145628.0,119134.0,128467.0,129308.0,130213.0,136960.0,131606.0,133167.71,12008.71
2024-01-18,151329.0,121604.0,119134.0,128467.0,126933.0,130213.0,136960.0,135579.14,12774.85
...,...,...,...,...,...,...,...,...,...
2024-03-27,125493.0,122277.0,130956.0,129138.0,109879.0,119478.0,120293.0,125439.86,4217.72
2024-03-28,125867.0,118230.0,122277.0,130956.0,107680.0,109879.0,119478.0,124931.14,3900.13
2024-03-29,125577.0,126118.0,118230.0,122277.0,118592.0,107680.0,109879.0,123498.29,3072.37
2024-03-30,120926.0,125493.0,126118.0,118230.0,125016.0,118592.0,107680.0,122283.57,4819.72


In [11]:
X_train.to_numpy().copy()

array([[119134.  , 129372.  , 134119.  , 132752.  , 131883.  , 128866.  ,
        116406.  , 127822.29,   5524.62],
       [121604.  , 129308.  , 129372.  , 134119.  , 131606.  , 131883.  ,
        128866.  , 128386.86,   6264.72],
       [136704.  , 128467.  , 129308.  , 129372.  , 136960.  , 131606.  ,
        131883.  , 130031.  ,   8953.19],
       [145628.  , 119134.  , 128467.  , 129308.  , 130213.  , 136960.  ,
        131606.  , 133167.71,  12008.71],
       [151329.  , 121604.  , 119134.  , 128467.  , 126933.  , 130213.  ,
        136960.  , 135579.14,  12774.85],
       [146188.  , 136704.  , 121604.  , 119134.  , 132752.  , 126933.  ,
        130213.  , 138278.  ,  13015.22],
       [147359.  , 145628.  , 136704.  , 121604.  , 134119.  , 132752.  ,
        126933.  , 141897.57,   9971.02],
       [144471.  , 151329.  , 145628.  , 136704.  , 129372.  , 134119.  ,
        132752.  , 145052.14,   4439.18],
       [143686.  , 146188.  , 151329.  , 145628.  , 129308.  , 129372.  

In [12]:
%pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:
import sklearn
import xgboost as xgb

In [14]:
# List to store the final models
final_models_ls = []

# Use the same params here you used in cross-validation
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse'
  }

# Iterate through each set of cross-validation results and corresponding training targets
    # Determine the optimal number of boosting rounds from cross-validation results
optimal_boost_rounds = 20

# Initialize the XGBoost regressor with determined parameters
final_model = xgb.XGBRegressor(
    n_estimators=optimal_boost_rounds,
    **params
)

# Train the model on the full training dataset
final_model.fit(X_train, y_train_ls[1])

# Append the trained model to the list
final_models_ls.append(final_model)

In [15]:
final_models_ls

[XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='rmse', feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=20, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)]

In [16]:
import xgboost as xgb

# Prepare the DMatrix which is required by XGBoost
dtrain = xgb.DMatrix(data=X_train, label=y_train_ls[1])

# Define XGBoost parameters
params = {
  #'max_depth': 6,
  #'min_child_weight': 1,
  #'eta': 0.5,
  #'subsample': 1,
  #'colsample_bytree': 1,
  'objective': 'reg:squarederror',
  'eval_metric': 'rmse'
}

# Perform cross-validation
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    nfold = 4,
    early_stopping_rounds=10,
    metrics='rmse',
    as_pandas=True,
    seed=123
)

ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.