In [1]:
import os
# Switch to the stack's root directory so that relative paths used later
# (e.g. config/config.yaml, artifacts/) and the `src.ShortTermForecast`
# imports in the following cells resolve.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# fail on any environment where this directory does not exist.
os.chdir('/workspaces/b2b-wf-experiments/stacks/short_term_forecast')

In [2]:
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Immutable settings for the data ingestion stage.

    `root_dir` and `local_file_name` describe where the ingested data is
    written for local runs. They are optional because the Kubeflow code path
    builds this config with both set to None and passes the output path to
    `DataIngestion.save` explicitly instead.
    """

    # Local output directory; None when the caller supplies the save path itself.
    root_dir: Optional[Path]
    # File name of the local CSV inside root_dir; None in the same situation.
    local_file_name: Optional[str]
    # Project used both for the BigQuery client and to qualify the source table.
    project_id: str
    # Location passed to the BigQuery client.
    project_location: str
    # Dataset and table that together identify the source table.
    bq_dataset: str
    bq_source_table: str
    # Column used for date filtering and grouping.
    time_column: str
    # Column that is summed per group.
    target_column: str
    # Alias given to the generated series identifier column.
    series_identifier: str
    # Columns that are grouped on and concatenated into the series identifier.
    attribute_columns: List[str]

In [None]:
from src.ShortTermForecast.constants import *
from src.ShortTermForecast.utils.common import read_yaml, create_directories

In [None]:
from typing import List

from src.ShortTermForecast.constants import *
from src.ShortTermForecast.utils.common import read_yaml, create_directories
from src.ShortTermForecast.entity.config_entity import DataIngestionConfig


class ConfigurationManager:
    """
    Builds the configuration objects consumed by the pipeline stages.

    Two usage modes are supported:

    * File-backed (DVC/local runs): the YAML files are read on construction
      and `get_data_ingestion_dvc_config` derives the stage config from them.
    * File-less (Kubeflow runs): construct with both paths set to None and
      pass every value explicitly to `get_data_ingestion_kfp_config`.

    Attributes:
        config: Parsed content of the config file, or None if no config file
            path was given.
        params: Parsed content of the params file, or None if no params file
            path was given.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        """
        Load the configuration files that were requested.

        Args:
            config_filepath: Path of the config YAML file, or None to skip
                loading it (no artifacts directory is created in that case).
            params_filepath: Path of the params YAML file, or None to skip
                loading it.
        """
        # Define both attributes unconditionally: without this, a file-less
        # manager has no `config`/`params` attribute at all and any later
        # access fails with an opaque AttributeError.
        self.config = None
        self.params = None

        if config_filepath is not None:
            self.config = read_yaml(config_filepath)
            create_directories([self.config.artifacts_root])

        if params_filepath is not None:
            self.params = read_yaml(params_filepath)

    def get_data_ingestion_dvc_config(self) -> DataIngestionConfig:
        """
        Returns a DataIngestionConfig configured for DVC/local runs.

        The values come from the `general_setup` and `data_ingestion` sections
        of the loaded config file; the stage's root directory is created as a
        side effect.

        Raises:
            ValueError: If the manager was created without a config file.
        """
        if self.config is None:
            raise ValueError(
                "No configuration file was loaded (config_filepath=None); "
                "use get_data_ingestion_kfp_config() or provide a config file."
            )

        general_configs = self.config.general_setup
        data_ingestion_config = self.config.data_ingestion

        create_directories([data_ingestion_config.root_dir])

        return DataIngestionConfig(
            root_dir=Path(data_ingestion_config.root_dir),
            local_file_name=data_ingestion_config.local_file_name,
            project_id=general_configs.project_id,
            project_location=general_configs.project_location,
            bq_dataset=data_ingestion_config.bq_dataset,
            bq_source_table=data_ingestion_config.bq_source_table,
            time_column=general_configs.time_column,
            target_column=general_configs.target_column,
            series_identifier=general_configs.series_identifier,
            attribute_columns=general_configs.attribute_columns
        )

    def get_data_ingestion_kfp_config(
            self,
            project_id: str,
            project_location: str,
            bq_dataset: str,
            bq_source_table: str,
            time_column: str,
            target_column: str,
            series_identifier: str,
            attribute_columns: List[str]
    ) -> DataIngestionConfig:
        """
        Returns a DataIngestionConfig configured for Kubeflow Pipeline runs.

        Nothing is read from the YAML files: every value is taken from the
        arguments. `root_dir` and `local_file_name` are left as None, so the
        caller has to pass an explicit path when saving the ingested data.
        """

        return DataIngestionConfig(
            root_dir=None,
            local_file_name=None,
            project_id=project_id,
            project_location=project_location,
            bq_dataset=bq_dataset,
            bq_source_table=bq_source_table,
            time_column=time_column,
            target_column=target_column,
            series_identifier=series_identifier,
            attribute_columns=attribute_columns
        )

In [None]:
from pathlib import Path
from google.cloud import bigquery

from src.ShortTermForecast import logger

class DataIngestion:
    """
    Ingests data from BigQuery and saves it as a CSV file.

    The class builds an aggregation query from the configuration (including a
    generated series identifier column), runs it through a BigQuery client and
    writes the resulting DataFrame to disk.

    Attributes:
        config (DataIngestionConfig): Configuration object containing necessary parameters.
        cutoff_date (datetime.date): Latest value of the time column (inclusive)
            that the query keeps.
        client (bigquery.Client): BigQuery client; None until `load` is called.
        data (pandas.DataFrame): Ingested data; None until `load` succeeds.
    """

    # Cutoff that used to be hard-coded in the query. It stays the default so
    # existing callers keep getting exactly the same date filter.
    DEFAULT_CUTOFF_DATE = "2025-03-31"

    def __init__(self, config: DataIngestionConfig, cutoff_date=DEFAULT_CUTOFF_DATE):
        """
        Initialize the DataIngestion class.

        Args:
            config (DataIngestionConfig): Configuration object containing necessary parameters.
            cutoff_date (str | datetime.date, optional): Last date (inclusive)
                to ingest, as an ISO 'YYYY-MM-DD' string or a date object.
                Defaults to DEFAULT_CUTOFF_DATE.

        Raises:
            ValueError: If `cutoff_date` is a string that is not a valid ISO date.
            TypeError: If `cutoff_date` is neither a string nor a date.
        """
        # Function-scope import so the class does not rely on a new cell-level import.
        from datetime import date, datetime

        # Normalise to a date object: the value is interpolated into SQL, so
        # only something that parses as a real date may get through.
        if isinstance(cutoff_date, datetime):
            cutoff_date = cutoff_date.date()
        elif isinstance(cutoff_date, str):
            cutoff_date = date.fromisoformat(cutoff_date)
        elif not isinstance(cutoff_date, date):
            raise TypeError(
                "cutoff_date must be an ISO date string or a datetime.date, "
                f"got {type(cutoff_date).__name__}"
            )

        self.config = config
        self.cutoff_date = cutoff_date
        self.client = None
        self.data = None
        logger.info("DataIngestion instance initialized with provided configuration.")

    def _create_series_identifier(self) -> str:
        """
        Create the series identifier query field based on the attribute columns.

        This method generates a SQL CONCAT statement that joins the attribute
        columns with a single space, replacing NULL values with the string
        'None', and aliases the result with the configured series identifier.

        Example:
            attribute_columns = ['local', 'type']
            series identifier string = '<local> <type>' (e.g., 'vancouver 1')

        Returns:
            str: The CONCAT query to create the series identifiers column in the BigQuery view query.
        """
        # NOTE(review): COALESCE(column, 'None') presumably requires the
        # attribute columns to be STRING-typed in BigQuery; confirm for
        # non-string attributes.
        coalesce_parts = [f"COALESCE({column}, 'None')" for column in self.config.attribute_columns]
        concat_arguments = ", ' ', ".join(coalesce_parts)
        series_identifier = f"CONCAT({concat_arguments}) AS {self.config.series_identifier}"
        logger.debug(f"Created series identifier: {series_identifier}")
        return series_identifier

    def load(self):
        """
        Load data from BigQuery using the configured query.

        This method initializes the BigQuery client and executes the data ingestion query,
        storing the results in a pandas DataFrame (`self.data`).

        Raises:
            Exception: Any error raised while running the query or converting
                the result is logged and re-raised unchanged.
        """
        logger.info("Initializing BigQuery client and loading data.")
        self.client = bigquery.Client(
            project=self.config.project_id,
            location=self.config.project_location
        )

        try:
            self.data = self.client.query(self.data_ingestion_query).to_dataframe()
            logger.info(f"Data loaded successfully. Shape: {self.data.shape}")
        except Exception as e:
            logger.error(f"Error loading data from BigQuery: {str(e)}")
            raise

    def save(self, save_path: str = None):
        """
        Save the loaded data to a CSV file.

        Args:
            save_path (str, optional): The path where the CSV file will be saved.
                If not provided, it will use the default path from the configuration
                (`root_dir` / `local_file_name`).

        Raises:
            ValueError: If no data has been loaded yet, or if `save_path` is
                omitted and the configuration has no local output location.
        """
        # Fail with an explicit message instead of an AttributeError on None.
        if self.data is None:
            raise ValueError("No data to save: call load() before save().")

        if save_path is None:
            # Configs built for Kubeflow runs carry no local output location.
            if self.config.root_dir is None or self.config.local_file_name is None:
                raise ValueError(
                    "save_path is required because the configuration does not "
                    "define root_dir and local_file_name."
                )
            save_path = Path(self.config.root_dir, self.config.local_file_name)

        logger.info(f"Saving data to {save_path}")
        try:
            self.data.to_csv(save_path, index=False)
            logger.info("Data saved successfully.")
        except Exception as e:
            logger.error(f"Error saving data to CSV: {str(e)}")
            raise

    @property
    def data_ingestion_query(self) -> str:
        """
        Generate the BigQuery SQL query for data ingestion.

        The query sums the target column per time value and attribute
        combination for rows up to and including `cutoff_date`, then adds the
        series identifier column to the aggregated rows.

        Returns:
            str: The complete SQL query string for data ingestion.

        Raises:
            ValueError: If no attribute columns are configured (the generated
                SQL would be invalid).
        """
        if not self.config.attribute_columns:
            raise ValueError("attribute_columns must contain at least one column.")

        query = f"""
        WITH historical_table AS (
            SELECT
                {self.config.time_column},
                {self.attribute_string},
                SUM({self.config.target_column}) AS {self.config.target_column}
            FROM
                `{self.config.project_id}.{self.config.bq_dataset}.{self.config.bq_source_table}`
            WHERE
                {self.config.time_column} <= DATE('{self.cutoff_date.isoformat()}')
            GROUP BY
                {self.config.time_column},
                {self.attribute_string}
        )
        SELECT
            {self._create_series_identifier()},
            {self.config.time_column},
            {self.attribute_string},
            {self.config.target_column}
        FROM historical_table
        """
        logger.debug("Generated data ingestion query.")
        return query

    @property
    def attribute_string(self) -> str:
        """
        Generate a comma-separated string of attribute columns.

        Returns:
            str: A comma-separated string of attribute column names.
        """
        return ','.join(self.config.attribute_columns)

In [6]:
# Label used in the log banners that mark the start and end of this stage.
STAGE_NAME = "Data Ingestion"


class DataIngestionPipeline:
    """Runs the data ingestion stage with the file-based (DVC/local) configuration."""

    def __init__(self):
        """The pipeline keeps no state; everything is built inside `main`."""

    def main(self):
        """Build the stage configuration, load the data and save it to the configured path."""
        ingestion_config = ConfigurationManager().get_data_ingestion_dvc_config()
        ingestion = DataIngestion(ingestion_config)
        ingestion.load()
        ingestion.save()

if __name__ == '__main__':
    # Run the stage between a start and a completion banner; any failure is
    # logged with its traceback and then propagated to the caller.
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        obj = DataIngestionPipeline()
        obj.main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<")
        # Visual separator between stages in the log output.
        logger.info(f"\nx{'=' * 50}x")
    except Exception as exc:
        logger.exception(exc)
        raise exc

[2025-04-15 17:53:16,535: INFO: 1731669044] >>>>>> stage Data Ingestion started <<<<<<
[2025-04-15 17:53:16,541: INFO: common] yaml file: config/config.yaml loaded successfully
[2025-04-15 17:53:16,550: INFO: common] Creating directory: artifacts
[2025-04-15 17:53:16,552: INFO: common] yaml file: params.yaml loaded successfully
[2025-04-15 17:53:16,554: INFO: common] Creating directory: artifacts/data_ingestion
[2025-04-15 17:53:16,555: INFO: 3841732962] DataIngestion instance initialized with provided configuration.
[2025-04-15 17:53:16,556: INFO: 3841732962] Initializing BigQuery client and loading data.
[2025-04-15 17:54:34,478: INFO: 3841732962] Data loaded successfully. Shape: (299313, 11)
[2025-04-15 17:54:34,479: INFO: 3841732962] Saving data to artifacts/data_ingestion/fwds_daily_data.csv
[2025-04-15 17:54:37,538: INFO: 3841732962] Data saved successfully.
[2025-04-15 17:54:37,551: INFO: 1731669044] >>>>>> stage Data Ingestion completed <<<<<<
[2025-04-15 17:54:37,552: INFO: 17

In [None]:


# Data ingestion component: queries BigQuery and writes the result to the output Dataset artifact
# NOTE(review): `component`, `Output` and `Dataset` are not imported in any
# visible cell — presumably they come from the KFP SDK; confirm and import them.
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/bi-aaaie/images/kfp-preprocess-slim:2.0.1"
)
def data_ingestion_op(
    project_id: str,
    project_location: str,
    bq_dataset: str,
    bq_source_table: str,
    time_column: str,
    target_column: str,
    series_identifier: str,
    attribute_columns: List[str],
    output_dataset: Output[Dataset]
):
    """
    Pipeline component that ingests data from BigQuery into a Dataset artifact.

    Builds a DataIngestionConfig from the arguments (no YAML files are read),
    runs the ingestion query and saves the result as CSV at
    `output_dataset.path`.

    Args:
        project_id: Forwarded to the ingestion config as `project_id`.
        project_location: Forwarded to the ingestion config as `project_location`.
        bq_dataset: Forwarded to the ingestion config as `bq_dataset`.
        bq_source_table: Forwarded to the ingestion config as `bq_source_table`.
        time_column: Forwarded to the ingestion config as `time_column`.
        target_column: Forwarded to the ingestion config as `target_column`.
        series_identifier: Forwarded to the ingestion config as `series_identifier`.
        attribute_columns: Forwarded to the ingestion config as `attribute_columns`.
        output_dataset: Output artifact whose `path` receives the CSV file.
    """
    # import the custom models
    # NOTE(review): no import statements follow the comment above. If this
    # function runs as an isolated component inside the base image,
    # ConfigurationManager and DataIngestion presumably have to be imported
    # here, inside the function body — confirm the package paths available
    # in the image.

    # Both file paths are None so that no YAML file is read and no artifacts
    # directory is created.
    config = ConfigurationManager(
        config_filepath=None, 
        params_filepath=None
    )

    data_ingestion_config = config.get_data_ingestion_kfp_config(
        project_id=project_id,
        project_location=project_location,
        bq_dataset=bq_dataset,
        bq_source_table=bq_source_table,
        time_column=time_column,
        target_column=target_column,
        series_identifier=series_identifier,
        attribute_columns=attribute_columns
    )


    data_ingestion = DataIngestion(data_ingestion_config)

    data_ingestion.load()
    # The config built above has no local output location, so the artifact
    # path is passed explicitly.
    data_ingestion.save(output_dataset.path)

# Wrap the ingestion component as a custom job with the display name
# 'data-ingestion-job' and the machine type 'e2-standard-4'.
# NOTE(review): create_custom_training_job_from_component is not imported in
# any visible cell — presumably it comes from google_cloud_pipeline_components;
# confirm and add the import.
custom_data_ingestion_job = create_custom_training_job_from_component(
    data_ingestion_op,
    display_name='data-ingestion-job',
    machine_type='e2-standard-4'
)
