In [1]:
import os
import sys

In [2]:
os.chdir('../')

In [3]:
sys.path.append(os.path.join(os.getcwd(), "src"))

In [4]:
from dataclasses import dataclass
from pathlib import Path
from electron.utils.helpers import *
from electron.utils.exception import *
from electron.constants import *
from electron import logger

In [5]:
@dataclass
class DataTransformationConfig:
    root_dir: Path
    data_file: Path
    status_file: str
    label_encoder: Path
    preprocessor: Path
    x_transform: Path
    y_transform: Path
    train_features: Path
    test_features: Path
    train_target: Path
    test_target: Path
    input_seq_len: int
    step_size: int
    cutoff_date: str

In [None]:
class ConfigurationManager:
    def __init__(self, config_filepath=CONFIG_PATH,
                       params_filepath=PARAMS_PATH,
                       schema_filepath=SCHEMA_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        schema = self.schema
        params = self.params.transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=Path(config.root_dir),
            data_file=Path(config.data_file),
            status_file=Path(config.status_file),
            label_encoder=Path(config.label_encoder),
            preprocessor=Path(config.preprocessor),
            x_transform=Path(config.x_transform),
            y_transform=Path(config.y_transform),
            train_features=Path(config.train_features),
            test_features=Path(config.test_features),
            train_target=Path(config.train_target),
            test_target=Path(config.test_target),
            input_seq_len=params.input_seq_len,
            step_size=params.step_size,
            cutoff_date=params.cutoff_date
        )

        return data_transformation_config

In [7]:
import os
import sys
import tqdm
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def check_status(self):
        try:
            with open(self.config.status_file, 'r') as f:
                status_data = json.load(f)
            validation_status = status_data.get("Validation status", False)
            logger.info(f"Data validation status: {validation_status}")
            return validation_status
        
        except Exception as e:
            logger.error(f"Error reading validation status: {e}")
            return False

    def basic_preprocessing(self) -> pd.DataFrame:
        try:
            df = pd.read_csv(self.config.data_file)
            df = df[['period', 'subba', 'value', 'temperature_2m']]
            le = LabelEncoder()
            df['sub_region_code'] = le.fit_transform(df['subba'])

            df.rename(columns={
                'period': 'date',
                'subba': 'sub_region',
                'value': 'demand'
            }, inplace=True)

            df = df[['date', 'sub_region_code', 'demand', 'temperature_2m']]

            create_directories([os.path.dirname(self.config.label_encoder)])
            save_bin(le, self.config.label_encoder)

            logger.info("Basic preprocessing completed.")
            return df

        except Exception as e:
            raise CustomException(e, sys)

    def feature_engineering(self, df: pd.DataFrame) -> pd.DataFrame:
        try:
            df['date'] = pd.to_datetime(df['date'], errors='coerce', utc=True)

            df['hour'] = df['date'].dt.hour
            df['day_of_week'] = df['date'].dt.dayofweek
            df['month'] = df['date'].dt.month
            df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

            holidays = calendar().holidays(start=df['date'].min(), end=df['date'].max())
            df['is_holiday'] = df['date'].isin(holidays).astype(int)

            logger.info("Feature engineering completed.")
            return df

        except Exception as e:
            raise CustomException(e, sys)

    def train_test_splitting(self) -> tuple[pd.DataFrame, pd.DataFrame]:
        try:
            df = self.feature_engineering(self.basic_preprocessing())
            df.sort_values("date", inplace=True)

            cutoff = pd.to_datetime(self.config.cutoff_date, utc=True)

            train_df = df[df['date'] < cutoff].reset_index(drop=True)
            test_df = df[df['date'] >= cutoff].reset_index(drop=True)

            logger.info(f"Train size: {train_df.shape}, Test size: {test_df.shape}")
            return train_df, test_df

        except Exception as e:
            raise CustomException(e, sys)
        
    def _get_cutoff_indices(self, df: pd.DataFrame, input_seq_len: int, step_size: int):
        stop = len(df) - input_seq_len - 1
        return [(i, i + input_seq_len, i + input_seq_len + 1) for i in range(0, stop, step_size)]
        
    def transform_ts_data_into_features_and_target(self, ts_data: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
            
            assert set(['date', 'demand', 'sub_region_code', 'temperature_2m']).issubset(ts_data.columns)

            region_codes = ts_data['sub_region_code'].unique()
            features = pd.DataFrame()
            targets = pd.DataFrame()

            input_seq_len = self.config.input_seq_len
            step_size = self.config.step_size

            for code in tqdm.tqdm(region_codes, desc="Transforming TS Data"):
                ts_one = ts_data[ts_data['sub_region_code'] == code].sort_values(by='date')
                indices = self._get_cutoff_indices(ts_one, input_seq_len, step_size)

                x = np.zeros((len(indices), input_seq_len), dtype=np.float64)
                y = np.zeros((len(indices)), dtype=np.float64)
                date_hours, temps = [], []

                for i, (start, mid, end) in enumerate(indices):
                    x[i, :] = ts_one.iloc[start:mid]['demand'].values
                    y[i] = ts_one.iloc[mid]['demand']
                    date_hours.append(ts_one.iloc[mid]['date'])
                    temps.append(ts_one.iloc[mid]['temperature_2m'])

                features_one = pd.DataFrame(
                    x,
                    columns=[f'demand_prev_{i+1}_hr' for i in reversed(range(input_seq_len))]
                )
                features_one['date'] = date_hours
                features_one['sub_region_code'] = code
                features_one['temperature_2m'] = temps

                targets_one = pd.DataFrame(y, columns=['target_demand_next_hour'])

                features = pd.concat([features, features_one], ignore_index=True)
                targets = pd.concat([targets, targets_one], ignore_index=True)

            return features, targets['target_demand_next_hour']

    def preprocess_features(self, train_df: pd.DataFrame, test_df: pd.DataFrame):
        try:
            validation_status = self.check_status()
        
            if not validation_status:
                logger.error("Data validation failed. Skipping data cleaning.")
            logger.info(f"Validation Status : {validation_status}")

        except Exception as e:
            raise CustomException(e, sys)
        
        try:
        
            train_x, train_y = self.transform_ts_data_into_features_and_target(train_df)
            test_x, test_y = self.transform_ts_data_into_features_and_target(test_df)

            # Save numpy arrays
            np.save(self.config.x_transform, train_x.values)
            np.save(self.config.y_transform, train_y.values)

            # Save as CSV
            train_x.to_csv(self.config.train_features, index=False)
            train_y.to_csv(self.config.train_target, index=False)
            test_x.to_csv(self.config.test_features, index=False)
            test_y.to_csv(self.config.test_target, index=False)

            logger.info("Feature transformation and saving completed.")
            return (train_x, train_y), (test_x, test_y)

        except Exception as e:
            raise CustomException(e, sys)

In [None]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    train_df, test_df = data_transformation.train_test_splitting()
    (train_x, train_y), (test_x, test_y) = data_transformation.preprocess_features(train_df, test_df)

except Exception as e:
    raise CustomException(str(e), sys)

[2025-07-05 08:52:12,382: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-07-05 08:52:12,398: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-07-05 08:52:12,412: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-07-05 08:52:12,688: INFO: helpers: created directory at: artifacts\data_transformation]
[2025-07-05 08:52:12,707: INFO: helpers: binary file saved at: artifacts\data_transformation\label_encoder.pkl]
[2025-07-05 08:52:12,708: INFO: 364925033: Basic preprocessing completed.]
[2025-07-05 08:52:12,771: INFO: 364925033: Feature engineering completed.]
[2025-07-05 08:52:12,808: INFO: 364925033: Train size: (80245, 9), Test size: (20086, 9)]
[2025-07-05 08:52:12,824: ERROR: 364925033: Error reading validation status: Expecting value: line 1 column 1 (char 0)]
[2025-07-05 08:52:12,824: ERROR: 364925033: Data validation failed. Skipping data cleaning.]
[2025-07-05 08:52:12,824: INFO: 364925033: Validat

Transforming TS Data: 100%|██████████| 11/11 [00:20<00:00,  1.89s/it]
Transforming TS Data: 100%|██████████| 11/11 [00:03<00:00,  3.17it/s]


[2025-07-05 08:53:33,142: INFO: 364925033: Feature transformation and saving completed.]
