In [5]:
import os
# Run the notebook from the short_term_forecast stack root so that the relative
# paths used further down (e.g. config/config.yaml, artifacts/...) resolve.
# NOTE(review): this is an absolute dev-container path; the cell fails on any
# machine where the repository is checked out somewhere else.
os.chdir('/workspaces/b2b-wf-experiments/stacks/short_term_forecast')

In [6]:
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

@dataclass(frozen=True)
class CrossValSplitConfig:
    """
    Immutable settings for the cross-validation split stage.

    Two factories build this object: the file-based (DVC) one fills every
    field from the YAML configuration, while the KFP one only knows the input
    dataset, time column and forecast horizon and passes ``None`` for the
    output location fields. Those fields are therefore optional.
    """

    # Directory the split files are written to; None when the caller supplies
    # explicit output paths instead (KFP component).
    root_dir: Optional[Path]
    # CSV file holding the full input data set. The KFP factory passes a plain
    # string path here rather than a Path object.
    input_dataset: Path
    # Name of the column that is parsed as datetime and used to cut the splits.
    time_column: str
    # Length of each test window, in days.
    forecast_horizon: int
    # File names (relative to root_dir) of the combined train / test CSVs;
    # None when root_dir is None.
    train_file_name: Optional[str]
    test_file_name: Optional[str]

In [19]:
from typing import List

from src.ShortTermForecast.constants import *
from src.ShortTermForecast.utils.common import read_yaml, create_directories
#from src.ShortTermForecast.entity.config_entity import CrossValSplitConfig


class ConfigurationManager:
    """
    Loads the project YAML files and builds the config objects for the
    cross-validation split stage.

    Attributes:
        config: Parsed main configuration, or ``None`` when no config file
            path was given.
        params: Parsed parameters file, or ``None`` when no params file path
            was given.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        """
        Read the configuration files that were requested.

        Args:
            config_filepath: Path of the main config YAML. Pass ``None`` to
                skip loading it (the KFP component does this). When it is
                loaded, the ``artifacts_root`` directory is created as well.
            params_filepath: Path of the params YAML, or ``None`` to skip it.
        """
        # Always define both attributes so that a manager created with
        # ``None`` paths has a well-defined state instead of missing
        # attributes.
        self.config = None
        self.params = None

        if config_filepath is not None:
            self.config = read_yaml(config_filepath)
            create_directories([self.config.artifacts_root])

        if params_filepath is not None:
            self.params = read_yaml(params_filepath)

    def get_cross_validation_split_dvc_config(self) -> CrossValSplitConfig:
        """
        Build the split configuration from the loaded YAML config.

        Creates the cross-validation output directory as a side effect.

        Returns:
            CrossValSplitConfig: Fully populated configuration.

        Raises:
            ValueError: If the manager was created without a config file.
        """
        if self.config is None:
            raise ValueError(
                "ConfigurationManager was created without a config file; "
                "cannot build the file-based cross-validation split config."
            )

        general_configs = self.config.general_setup
        data_ingestion_config = self.config.data_ingestion
        cross_validation_config = self.config.cross_validation

        create_directories([cross_validation_config.root_dir])

        return CrossValSplitConfig(
            root_dir=Path(cross_validation_config.root_dir),
            # The split stage reads the file written by the data ingestion stage.
            input_dataset=Path(data_ingestion_config.root_dir, data_ingestion_config.local_file_name),
            # NOTE(review): the time column is taken from
            # general_setup.target_column. TimeSeriesCV parses and filters on
            # config.time_column as dates, so confirm that this key really
            # names the timestamp column and is not a copy-paste slip.
            time_column=general_configs.target_column,
            forecast_horizon=general_configs.forecast_horizon,
            train_file_name=cross_validation_config.train_file_name,
            test_file_name=cross_validation_config.test_file_name
        )

    def get_cross_validation_split_kfp_config(
            self,
            input_dataset: str,
            time_column: str,
            forecast_horizon: int
    ) -> CrossValSplitConfig:
        """
        Build the split configuration from explicit arguments.

        No YAML file is consulted. The output location fields are left as
        ``None``, so the caller has to hand explicit output paths to
        ``TimeSeriesCV.save``.

        Args:
            input_dataset: Path of the input CSV file.
            time_column: Name of the datetime column used for splitting.
            forecast_horizon: Length of each test window in days.

        Returns:
            CrossValSplitConfig: Configuration without output locations.
        """
        return CrossValSplitConfig(
            root_dir=None,
            input_dataset=input_dataset,
            time_column=time_column,
            forecast_horizon=forecast_horizon,
            train_file_name=None,
            test_file_name=None
        )

In [22]:
import pandas as pd
from datetime import timedelta
from pathlib import Path

from src.ShortTermForecast import logger
#from src.ShortTermForecast.entity.config_entity import CrossValSplitConfig

class TimeSeriesCV:
    """
    Expanding-window cross-validation splitter for time series data.

    The input CSV is read once, its time column is parsed as datetime, and a
    list of splits is generated in which every split trains from a fixed start
    date up to a moving end date and tests on the ``forecast_horizon`` days
    that follow. The rows of every split are tagged with a ``split_index``
    column and concatenated into one training and one test frame.

    Typical call order: ``load()`` -> ``generate_splits()`` ->
    ``process_splits()`` -> ``save()``.

    Attributes:
        config (CrossValSplitConfig): Configuration object containing necessary parameters.
        data (pandas.DataFrame | None): Input data, set by ``load()``.
        splits (list | None): Split definitions, set by ``generate_splits()``.
        train (pandas.DataFrame | None): Combined training rows of all splits,
            set by ``process_splits()``.
        test (pandas.DataFrame | None): Combined test rows of all splits,
            set by ``process_splits()``.
    """

    def __init__(self, config: "CrossValSplitConfig") -> None:
        """
        Initialize the TimeSeriesCV class.

        Args:
            config (CrossValSplitConfig): Configuration object containing necessary parameters.
        """
        self.config = config
        self.data = None
        self.splits = None
        # Defined up front so save() can tell that process_splits() has not
        # run yet instead of failing on a missing attribute.
        self.train = None
        self.test = None
        logger.info("TimeSeriesCV instance initialized with provided configuration.")

    def load(self):
        """
        Load the input CSV named in the configuration and parse its time column.

        ``self.data`` is only assigned once both steps succeeded, so a failed
        load never leaves a half-converted frame behind.

        Raises:
            Exception: Whatever ``pandas`` raises while reading or parsing;
                the error is logged and re-raised unchanged.
        """
        logger.info(f"Reading input dataset from: {self.config.input_dataset}")
        try:
            data = pd.read_csv(self.config.input_dataset)
            # Ensure time column is properly converted to datetime
            data[self.config.time_column] = pd.to_datetime(data[self.config.time_column])
            self.data = data
            logger.info(f"Data loaded successfully. Shape: {self.data.shape}")
        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            raise

    def generate_splits(
        self,
        train_start: str = "2022-01-01",
        first_train_end: str = "2024-03-31",
        n_splits: int = 4,
    ):
        """
        Generate the expanding-window split definitions.

        Every split trains from ``train_start``. The first split trains up to
        ``first_train_end``; each test window covers the
        ``config.forecast_horizon`` days right after the training end, and the
        next split's training period ends where the previous test window
        ended. The result is stored in ``self.splits``.

        Args:
            train_start: First day of every training period (anything
                ``pandas.to_datetime`` accepts).
            first_train_end: Last training day of the first split.
            n_splits: Number of splits to generate.

        Raises:
            ValueError: If ``n_splits`` or the configured forecast horizon is
                smaller than 1.
        """
        horizon = self.config.forecast_horizon
        if horizon < 1:
            raise ValueError(f"forecast_horizon must be at least 1, got {horizon}")
        if n_splits < 1:
            raise ValueError(f"n_splits must be at least 1, got {n_splits}")

        logger.info("Generating time series cross-validation splits")

        train_start = pd.to_datetime(train_start)
        current_train_end = pd.to_datetime(first_train_end)
        splits = []

        for split_index in range(1, n_splits + 1):
            test_start = current_train_end + timedelta(days=1)
            # Both ends are inclusive, hence horizon - 1 days after the start.
            test_end = test_start + timedelta(days=horizon - 1)

            splits.append({
                "split_index": split_index,
                "train_start": train_start,
                "train_end": current_train_end,
                "test_start": test_start,
                "test_end": test_end
            })

            # The next split also trains on the window that was just tested.
            current_train_end = test_end

        self.splits = splits

        logger.info("Generated splits:")
        for s in splits:
            logger.info(f"Split {s['split_index']}:")
            logger.info(f"  Train: {s['train_start'].strftime('%Y-%m-%d')} to {s['train_end'].strftime('%Y-%m-%d')}")
            logger.info(f"  Test:  {s['test_start'].strftime('%Y-%m-%d')} to {s['test_end'].strftime('%Y-%m-%d')}")

    def process_splits(self):
        """
        Apply the generated splits to the loaded data.

        For every split the rows whose time column falls inside the training
        and test window (both ends inclusive) are copied and tagged with a
        ``split_index`` column. The per-split frames are concatenated and
        stored in ``self.train`` and ``self.test``.

        Returns:
            tuple: (combined_train, combined_test) DataFrames containing all splits

        Raises:
            ValueError: If ``generate_splits()`` or ``load()`` has not been
                called yet.
        """
        if self.splits is None:
            logger.error("Splits have not been generated. Call generate_splits() first.")
            raise ValueError("Splits not generated")
        if self.data is None:
            logger.error("Data has not been loaded. Call load() first.")
            raise ValueError("Data not loaded")

        # The time column does not change between splits; look it up once.
        timestamps = self.data[self.config.time_column]

        all_train_data = []
        all_test_data = []

        for s in self.splits:
            logger.info(f"\nProcessing split {s['split_index']}")

            train_mask = (timestamps >= s["train_start"]) & (timestamps <= s["train_end"])
            test_mask = (timestamps >= s["test_start"]) & (timestamps <= s["test_end"])

            train_df = self.data.loc[train_mask].copy()
            test_df = self.data.loc[test_mask].copy()

            train_df['split_index'] = s['split_index']
            test_df['split_index'] = s['split_index']

            all_train_data.append(train_df)
            all_test_data.append(test_df)

            logger.info(f"Split {s['split_index']} - Train shape: {train_df.shape}, Test shape: {test_df.shape}")
            if train_df.empty or test_df.empty:
                # Usually means the split dates lie outside the data range or
                # the configured time column is not the timestamp column.
                logger.warning(f"Split {s['split_index']} has an empty train or test set.")

        self.train = pd.concat(all_train_data, ignore_index=True)
        self.test = pd.concat(all_test_data, ignore_index=True)
        return self.train, self.test

    def _default_output_path(self, file_name, label):
        """Return ``config.root_dir / file_name`` or raise if either part is missing."""
        if self.config.root_dir is None or file_name is None:
            raise ValueError(
                f"No {label} output path was given and the configuration does not "
                f"define root_dir and a {label} file name."
            )
        return Path(self.config.root_dir, file_name)

    def save(self, save_train_path: str = None, save_test_path: str = None):
        """
        Save the combined training and test datasets to CSV files.

        Args:
            save_train_path: Destination of the training CSV. Defaults to
                ``config.root_dir / config.train_file_name``.
            save_test_path: Destination of the test CSV. Defaults to
                ``config.root_dir / config.test_file_name``.

        Raises:
            ValueError: If ``process_splits()`` has not been called, or if a
                path is omitted and the configuration has no default for it.
        """
        if self.train is None or self.test is None:
            logger.error("Splits have not been processed. Call process_splits() first.")
            raise ValueError("Splits not processed")

        if save_train_path is None:
            save_train_path = self._default_output_path(self.config.train_file_name, "train")
        if save_test_path is None:
            save_test_path = self._default_output_path(self.config.test_file_name, "test")

        logger.info(f"Saving combined training data (shape: {self.train.shape}) to {save_train_path}")
        self.train.to_csv(save_train_path, index=False)

        logger.info(f"Saving combined test data (shape: {self.test.shape}) to {save_test_path}")
        self.test.to_csv(save_test_path, index=False)


In [24]:
# Human-readable label of this stage; it appears in the "started" and
# "completed" log banners emitted when the stage is run.
STAGE_NAME = "Cross Validation Split"


class TimeSeriesCVPipeline:
    """Runs the cross-validation split stage end to end from the file-based configuration."""

    def __init__(self):
        """Nothing to set up; all objects are created inside ``main``."""

    def main(self):
        """
        Build the split configuration, then load the data, generate the
        splits, apply them and write the resulting CSV files.
        """
        manager = ConfigurationManager()
        splitter = TimeSeriesCV(manager.get_cross_validation_split_dvc_config())

        # The steps depend on each other and must run in exactly this order.
        for step_name in ("load", "generate_splits", "process_splits", "save"):
            getattr(splitter, step_name)()

# Entry point of the stage: run the pipeline and frame it with log banners.
if __name__ == '__main__':
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        obj = TimeSeriesCVPipeline()
        obj.main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<")
        logger.info("\nx" + "=" * 50 + "x")
    except Exception as e:
        # Log the full traceback, then let the failure propagate so the caller
        # (notebook, DVC, CI) still sees the stage as failed.
        logger.exception(e)
        # Bare raise re-raises the active exception without adding this
        # line as an extra traceback frame.
        raise

[2025-04-15 19:02:08,961: INFO: 3118263232] >>>>>> stage Cross Validation Split started <<<<<<
[2025-04-15 19:02:08,968: INFO: common] yaml file: config/config.yaml loaded successfully
[2025-04-15 19:02:08,970: INFO: common] Creating directory: artifacts
[2025-04-15 19:02:08,971: INFO: common] yaml file: params.yaml loaded successfully
[2025-04-15 19:02:08,972: INFO: common] Creating directory: cross_validation
[2025-04-15 19:02:08,972: INFO: 2254608862] TimeSeriesCV instance initialized with provided configuration.
[2025-04-15 19:02:08,973: INFO: 2254608862] Reading input dataset from: artifacts/data_ingestion/fwds_daily_data.csv
[2025-04-15 19:02:09,491: INFO: 2254608862] Data loaded successfully. Shape: (299313, 11)
[2025-04-15 19:02:09,492: INFO: 2254608862] Generating time series cross-validation splits
[2025-04-15 19:02:09,494: INFO: 2254608862] Generated splits:
[2025-04-15 19:02:09,494: INFO: 2254608862] Split 1:
[2025-04-15 19:02:09,495: INFO: 2254608862]   Train: 2022-01-01 t

In [None]:
from kfp.dsl import component, Input, Output, Dataset
from google_cloud_pipeline_components.v1.custom_job import create_custom_training_job_from_component

@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/b2b_ai/wf_pipeline/training:1.0.1-rc"
)
def generate_time_series_cv_op(
    input_dataset: Input[Dataset],
    time_column: str,
    forecast_horizon: int,
    output_train: Output[Dataset],
    output_test: Output[Dataset]
):
    """
    Pipeline component that writes the cross-validation train and test sets.

    Args:
        input_dataset: Artifact holding the input CSV.
        time_column: Name of the datetime column used for splitting.
        forecast_horizon: Length of each test window in days.
        output_train: Artifact the combined training CSV is written to.
        output_test: Artifact the combined test CSV is written to.
    """
    # Project imports stay inside the function body so they are resolved when
    # the component runs in the base image, not when the pipeline is defined.
    # NOTE(review): TimeSeriesCV is imported from components.data_ingestion,
    # which is an unexpected home for a cross-validation class - confirm the
    # module path.
    from src.ShortTermForecast.components.data_ingestion import TimeSeriesCV
    from src.ShortTermForecast.config.configuration import ConfigurationManager

    # No YAML files are read here; every setting arrives as a component input.
    split_config = ConfigurationManager(
        config_filepath=None,
        params_filepath=None
    ).get_cross_validation_split_kfp_config(
        input_dataset=input_dataset.path,
        time_column=time_column,
        forecast_horizon=forecast_horizon,
    )

    splitter = TimeSeriesCV(split_config)
    splitter.load()
    splitter.generate_splits()
    splitter.process_splits()

    # The config carries no output location, so the artifact paths are
    # passed explicitly.
    splitter.save(
        save_train_path=output_train.path,
        save_test_path=output_test.path
    )

# Wrap the component with create_custom_training_job_from_component so it can
# be used as a custom training job step, requesting an e2-standard-4 machine
# and the display name shown below.
generate_time_series_cv_job = create_custom_training_job_from_component(
    generate_time_series_cv_op,
    display_name='generate-timeseries-cv-job',
    machine_type='e2-standard-4'
)
