In [1]:
import os

In [2]:
import pandas as pd

In [3]:
# NOTE: everything below is wrapped in a triple-quoted string, so none of it
# executes; the cell only evaluates (and echoes) the string itself. It is an
# earlier, disabled way of loading the merged CSV directly from disk.
'''from pathlib import Path
import pandas as pd

# Start from notebook location, then go one level up
BASE_DIR = Path.cwd().parent

# Build the path to the CSV file
csv_path = BASE_DIR / "artifacts" / "data_ingestion" / "data" / "elec_wx_demand.csv"

df = pd.read_csv(csv_path)
df.dtypes'''

'from pathlib import Path\nimport pandas as pd\n\n# Start from notebook location, then go one level up\nBASE_DIR = Path.cwd().parent\n\n# Build the path to the CSV file\ncsv_path = BASE_DIR / "artifacts" / "data_ingestion" / "data" / "elec_wx_demand.csv"\n\ndf = pd.read_csv(csv_path)\ndf.dtypes'

In [4]:
import os
import sys
# Move the working directory one level up from wherever the kernel started.
# NOTE(review): this is relative to the *current* directory, so re-running the
# cell climbs one more level each time; restart the kernel before re-running.
os.chdir('../')
# Make the "src" directory under the new working directory importable, so the
# `WattPredictor` package imports in the following cells resolve.
sys.path.append(os.path.join(os.getcwd(), "src"))

In [5]:
import os
import sys
import json
import requests
import pandas as pd
from pathlib import Path
import openmeteo_requests
import requests_cache
from retry_requests import retry
from datetime import datetime, timedelta
from dotenv import load_dotenv
from WattPredictor.utils.logging import logger
from WattPredictor.utils.exception import CustomException
from WattPredictor.utils.helpers import create_directories, save_json, load_json
from WattPredictor.entity.config_entity import DataIngestionConfig
from WattPredictor.utils.feature import feature_store_instance

# HTTP session backed by an on-disk cache named '.cache'; entries expire after
# 3600 seconds.
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
# Wrap the cached session with retry behaviour (5 retries, backoff factor 0.2).
# This session is handed to the Open-Meteo client in `Ingestion.__init__`.
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
# Load variables from a .env file into the process environment.
load_dotenv()

class Ingestion:
    """Fetch hourly electricity demand and weather data, merge them and publish the result.

    The electricity data is requested one day at a time from ``config.elec_api``
    and the weather data in a single request from ``config.wx_api``. Raw
    responses are cached on disk under ``config.elec_raw_data`` and
    ``config.wx_raw_data`` so repeated runs do not hit the network again.
    """

    # (connect, read) timeout in seconds for the electricity API. Without a
    # timeout `requests` waits forever on a stalled connection, which would
    # hang the whole day-by-day download loop.
    REQUEST_TIMEOUT = (10, 60)

    def __init__(self, config: DataIngestionConfig):
        """Store the configuration and create the feature-store and weather clients."""
        self.config = config
        self.feature_store = feature_store_instance()
        self.openmeteo = openmeteo_requests.Client(session=retry_session)

    def _elec_get_api(self, year, month, day):
        """Return ``(url, params)`` for the electricity request covering one day.

        The requested window runs from the given date to the following date;
        rows are filtered to the "NYIS" parent facet and sorted by period,
        newest first.
        """
        return self.config.elec_api, {
            "frequency": "hourly",
            "data[0]": "value",
            "sort[0][column]": "period",
            "sort[0][direction]": "desc",
            "facets[parent][0]": "NYIS",
            "offset": 0,
            "length": 5000,
            "start": f"{year}-{month:02d}-{day:02d}",
            "end": (datetime(year, month, day) + timedelta(days=1)).strftime("%Y-%m-%d"),
            "api_key": self.config.elec_api_key
        }

    def _wx_get_api(self, start_date, end_date):
        """Return ``(url, params)`` for the hourly weather request over a date range.

        ``start_date`` and ``end_date`` must support ``strftime``.
        """
        return self.config.wx_api, {
            # NOTE(review): fixed coordinates, presumably New York City - confirm.
            "latitude": 40.7128,
            "longitude": -74.0060,
            "start_date": start_date.strftime("%Y-%m-%d"),
            "end_date": end_date.strftime("%Y-%m-%d"),
            # The order of this list must match the Variables(i) indices used
            # in `_fetch_weather_data`.
            "hourly": ["temperature_2m", "weather_code", "relative_humidity_2m", "wind_speed_10m"],
            "timeformat": "unixtime",
            "timezone": "America/New_York"
        }

    def _fetch_electricity_data(self, year, month, day):
        """Return one day of electricity rows as a DataFrame, using the disk cache when possible.

        A cached JSON file is used only if it contains ``response.data``;
        otherwise the API is queried, the raw JSON is saved, and its
        ``response.data`` rows are returned (an empty DataFrame if absent).

        Raises:
            requests.HTTPError: if the API answers with an error status.
            requests.Timeout: if the API does not answer within ``REQUEST_TIMEOUT``.
        """
        file_path = self.config.elec_raw_data / f"hourly_demand_{year}-{month:02d}-{day:02d}.json"
        if file_path.exists():
            cached = load_json(file_path)
            if 'response' in cached and 'data' in cached['response']:
                return pd.DataFrame(cached['response']['data'])

        url, params = self._elec_get_api(year, month, day)
        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()
        create_directories([self.config.elec_raw_data])
        save_json(file_path, data)
        if 'response' in data and 'data' in data['response']:
            return pd.DataFrame(data['response']['data'])
        return pd.DataFrame()

    def _fetch_weather_data(self, start_date, end_date):
        """Return hourly weather for the date range as a DataFrame, cached as CSV.

        When the cache file exists it is returned as read by ``pd.read_csv``
        (so ``date`` is a string column); otherwise the first response of the
        Open-Meteo client is converted to a frame with a UTC ``date`` column
        and written to the cache.
        """
        file_path = self.config.wx_raw_data / f"weather_data_{start_date.strftime('%Y-%m-%d')}_to_{end_date.strftime('%Y-%m-%d')}.csv"
        if file_path.exists():
            return pd.read_csv(file_path)

        url, params = self._wx_get_api(start_date, end_date)
        responses = self.openmeteo.weather_api(url, params=params)
        hourly = responses[0].Hourly()

        df = pd.DataFrame({
            # Rebuild the timestamp axis from the response's start, end and
            # interval; the end bound is excluded.
            "date": pd.date_range(
                start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
                end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
                freq=pd.Timedelta(seconds=hourly.Interval()),
                inclusive="left"
            ),
            "temperature_2m": hourly.Variables(0).ValuesAsNumpy(),
            "weather_code": hourly.Variables(1).ValuesAsNumpy(),
            "relative_humidity_2m": hourly.Variables(2).ValuesAsNumpy(),
            "wind_speed_10m": hourly.Variables(3).ValuesAsNumpy()
        })

        create_directories([self.config.wx_raw_data])
        df.to_csv(file_path, index=False)
        return df

    def _prepare_and_merge(self, elec_data_list, weather_df):
        """Concatenate the electricity frames and inner-join them with the weather frame on ``date``.

        Args:
            elec_data_list: non-empty list of DataFrames, each with a ``period`` column.
            weather_df: DataFrame with a ``date`` column; it is not modified.

        Returns:
            The merged DataFrame with normalised column names and an extra
            ``date_str`` column (``date`` formatted as ``%Y-%m-%dT%H:%M:%S``).
        """
        elec_df = pd.concat(elec_data_list, ignore_index=True)
        elec_df["date"] = pd.to_datetime(elec_df["period"], utc=True)
        # `assign` builds a new frame, so the caller's weather_df keeps its
        # original `date` column.
        weather_df = weather_df.assign(date=pd.to_datetime(weather_df["date"], utc=True))

        combined_df = pd.merge(elec_df, weather_df, on="date", how="inner")

        # Strip first: once spaces have become underscores there is nothing
        # left for strip() to trim.
        combined_df.columns = (
            combined_df.columns.str.strip()
            .str.lower()
            .str.replace("-", "_", regex=False)
            .str.replace(" ", "_", regex=False)
        )

        combined_df["date_str"] = combined_df["date"].dt.strftime("%Y-%m-%dT%H:%M:%S")
        return combined_df

    def download(self):
        """Run the full ingestion: fetch, merge, publish to the feature store and save a CSV.

        Returns:
            The merged DataFrame, or an empty DataFrame when no electricity
            data, no weather data, or no overlapping rows were found.

        Raises:
            CustomException: wrapping any exception raised along the way.
        """
        try:
            start = pd.to_datetime(self.config.start_date, utc=True)
            end = pd.to_datetime(self.config.end_date, utc=True)

            current = start
            elec_data = []

            # NOTE(review): each request spans [day, day + 1]; if the API treats
            # the end bound as inclusive, the boundary hour is returned by two
            # consecutive requests - verify and de-duplicate if so.
            while current <= end:
                year, month, day = current.year, current.month, current.day
                df = self._fetch_electricity_data(year, month, day)
                if not df.empty:
                    elec_data.append(df)
                current += timedelta(days=1)

            if not elec_data:
                logger.warning("No electricity data fetched.")
                return pd.DataFrame()

            wx_df = self._fetch_weather_data(start, end)
            if wx_df.empty:
                logger.warning("No weather data fetched.")
                return pd.DataFrame()

            combined_df = self._prepare_and_merge(elec_data, wx_df)

            if combined_df.empty:
                logger.warning("Merged DataFrame is empty after join.")
                return pd.DataFrame()

            self.feature_store.create_feature_group(
                name="elec_wx_demand",
                df=combined_df,
                primary_key=["date_str", "subba"], 
                event_time="date",
                description="Merged electricity demand and weather data for WattPredictor"
            )

            # Save locally
            create_directories([self.config.data_file.parent])
            combined_df.to_csv(self.config.data_file, index=False)
            logger.info(f"Saved combined dataset to {self.config.data_file}")

            return combined_df

        except Exception as e:
            logger.error(f"Error in DataIngestion.download: {str(e)}")
            # Chain the original exception so its traceback is preserved.
            raise CustomException(e, sys) from e

In [6]:
import os
import json
import pandas as pd
from WattPredictor.utils.logging import logger
from WattPredictor.entity.config_entity import DataValidationConfig
from WattPredictor.utils.helpers import create_directories
from WattPredictor.utils.exception import CustomException

class Validation:
    """Run schema and completeness checks on the ingested CSV and record the outcome.

    The checks are column presence, column dtypes and missing values. The
    combined verdict is written as JSON to ``config.status_file``.
    """

    def __init__(self, config: DataValidationConfig):
        """Keep the validation configuration for later use."""
        self.config = config

    def validate_data_types(self, data: pd.DataFrame, schema: dict):
        """Return True when every schema column present in ``data`` has an accepted dtype.

        Schema columns absent from ``data`` are skipped here. Checking stops
        at the first mismatch, which is logged as an error.
        """
        # Schema type name -> pandas dtype names accepted for it. A type name
        # not listed here must match the dtype name exactly.
        dtype_aliases = {
            'int': ['int64', 'int32'],
            'float': ['float64', 'float32'],
            'object': ['object'],
            'str': ['object'],
        }

        present = ((name, kind) for name, kind in schema.items() if name in data.columns)
        for name, kind in present:
            found = str(data[name].dtype)
            if found in dtype_aliases.get(kind, [kind]):
                continue
            logger.error(f"Column '{name}': Expected type '{kind}', got '{found}'")
            return False
        return True

    def validate_column_presence(self, data: pd.DataFrame, schema: dict):
        """Return True when every column named in ``schema`` exists in ``data``."""
        absent = set(schema.keys()) - set(data.columns)
        if not absent:
            return True
        logger.error(f"Missing columns: {absent}")
        return False

    def check_missing_values(self, data: pd.DataFrame) -> bool:
        """Return True when ``data`` contains no null values in any column."""
        null_counts = data.isnull().sum()
        if not null_counts.any():
            return True
        logger.error(f"Missing values detected:\n{null_counts[null_counts > 0]}")
        return False

    def validator(self):
        """Validate ``config.data_file`` against ``config.all_schema``.

        All three checks always run. Each result and the overall verdict are
        logged, and the verdict is written to ``config.status_file`` as
        ``{"validation_status": <bool>}``.

        Returns:
            True if every check passed, otherwise False.
        """
        frame = pd.read_csv(self.config.data_file)
        expected = self.config.all_schema

        logger.info(f"Starting validation for data with shape: {frame.shape}")

        outcomes = {
            'column_presence': self.validate_column_presence(frame, expected),
            'data_types': self.validate_data_types(frame, expected),
            'missing_values': self.check_missing_values(frame),
        }
        passed = all(outcomes.values())

        create_directories([os.path.dirname(self.config.status_file)])

        for label, ok in outcomes.items():
            logger.info(f"{label}: {'PASSED' if ok else 'FAILED'}")

        logger.info(f"Overall validation status: {'PASSED' if passed else 'FAILED'}")

        with open(self.config.status_file, 'w') as handle:
            json.dump({"validation_status": passed}, handle, indent=4)

        return passed

In [7]:
import os
import sys
import json
import joblib
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from WattPredictor.config.data_config import DataTransformationConfig
from WattPredictor.utils.feature import feature_store_instance
from WattPredictor.utils.helpers import create_directories, save_bin
from WattPredictor.utils.exception import CustomException
from WattPredictor.utils.logging import logger


class Engineering:
    """Turn the ingested feature group into engineered, model-ready features.

    Reads the "elec_wx_demand" feature group, label-encodes the sub-region,
    adds calendar features, and publishes a feature group, a feature view
    and a training dataset through the feature-store wrapper.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the configuration and create the feature-store wrapper."""
        self.config = config
        self.feature_store = feature_store_instance()

    def check_status(self):
        """Return the ``validation_status`` flag stored in ``config.status_file``.

        Any failure to read or parse the file is logged as a warning and
        treated as "not validated" (False).
        """
        try:
            with open(self.config.status_file, 'r') as f:
                status_data = json.load(f)
            return status_data.get("validation_status", False)
        except Exception as e:
            logger.warning(f"Validation status check failed: {e}")
            return False

    def basic_preprocessing(self) -> pd.DataFrame:
        """Read the raw feature group, encode the sub-region and keep the modelling columns.

        The fitted label encoder is saved to ``config.label_encoder`` and
        uploaded as "label_encoder.pkl".

        Returns:
            A new DataFrame with columns ``date_str``, ``date``,
            ``sub_region_code``, ``demand`` and ``temperature_2m``.

        Raises:
            CustomException: wrapping any exception raised along the way.
        """
        try:
            # NOTE(review): the notebook run recorded with this code failed on
            # this read ("Could not read data using Hopsworks Query Service")
            # right after the materialization job for this group was launched;
            # the read may need to wait for that job - confirm.
            fg = self.feature_store.feature_store.get_feature_group(name="elec_wx_demand", version=1)
            df = fg.read()
            le = LabelEncoder()
            df['sub_region_code'] = le.fit_transform(df['subba'])
            df = df.rename(columns={'subba': 'sub_region', 'value': 'demand'})
            # Copy the selection so later column assignments write to an
            # independent frame rather than to a slice of `df`.
            df = df[['date_str', 'date', 'sub_region_code', 'demand', 'temperature_2m']].copy()

            create_directories([os.path.dirname(self.config.label_encoder)])
            save_bin(le, self.config.label_encoder)
            self.feature_store.upload_file_safely(self.config.label_encoder, "label_encoder.pkl")

            logger.info("Label encoding and preprocessing complete.")
            return df
        except Exception as e:
            raise CustomException(e, sys) from e

    def feature_engineering(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add calendar features to ``df`` and publish it as the "elec_wx_features" group.

        Adds ``hour``, ``day_of_week``, ``month``, ``is_weekend`` and
        ``is_holiday`` (US federal holidays). ``df`` is modified in place and
        also returned. All calendar features are derived from the UTC
        timestamp in ``date``.

        Raises:
            CustomException: wrapping any exception raised along the way.
        """
        try:
            df['date'] = pd.to_datetime(df['date'], utc=True)

            df['hour'] = df['date'].dt.hour
            df['day_of_week'] = df['date'].dt.dayofweek
            df['month'] = df['date'].dt.month
            df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

            # Holidays are calendar days (midnight timestamps), so match on the
            # day of each row rather than on the full hourly timestamp;
            # otherwise only a 00:00 row could ever be flagged. Both sides are
            # made timezone-naive so the comparison is like-for-like.
            calendar_day = df['date'].dt.tz_localize(None).dt.normalize()
            holidays = calendar().holidays(start=calendar_day.min(), end=calendar_day.max())
            df['is_holiday'] = calendar_day.isin(holidays).astype(int)

            self.feature_store.create_feature_group(
                name="elec_wx_features",
                df=df,
                primary_key=["date_str","sub_region_code"],
                event_time="date",
                description="Engineered electricity demand features",
                online_enabled=True
            )

            logger.info("Feature group created and feature engineering complete.")
            return df
        except Exception as e:
            raise CustomException(e, sys) from e

    def transform(self):
        """Run preprocessing and feature engineering, then create the view and training dataset.

        Returns:
            The engineered DataFrame sorted by ``date``.

        Raises:
            CustomException: if the validation status is not True, or wrapping
                any exception raised during the transformation.
        """
        if not self.check_status():
            raise CustomException("Validation failed. Aborting transformation.", sys)
        try:
            df = self.feature_engineering(self.basic_preprocessing())
            df.sort_values("date", inplace=True)

            self.feature_store.create_feature_view(
                name="elec_wx_features_view",
                feature_group_name="elec_wx_features",
                features=[
                    "date", "sub_region_code", "demand", "temperature_2m",
                    "hour", "day_of_week", "month", "is_weekend", "is_holiday"
                ]
            )

            self.feature_store.save_training_dataset(
                feature_view_name="elec_wx_features_view",
                version_description="initial training dataset with all features",
                output_format="csv"
            )

            logger.info("Feature view + training dataset saved successfully.")
            return df
        except Exception as e:
            raise CustomException(e, sys) from e

In [9]:
# NOTE: these imports rebind Ingestion / Validation / Engineering to the
# versions inside the installed `WattPredictor` package, replacing the classes
# defined in the earlier cells of this notebook. The traceback recorded below
# this cell accordingly points into the package's engineering.py.
from WattPredictor.config.data_config import DataConfigurationManager
from WattPredictor.components.data.ingestion import Ingestion
from WattPredictor.components.data.validation import Validation
from WattPredictor.components.features.engineering import Engineering


# Single configuration manager that hands out the per-stage configs.
data_config = DataConfigurationManager()

# Stage 1: download the raw data, merge it and publish it.
ingestion_config = data_config.get_data_ingestion_config()
ingestion = Ingestion(config=ingestion_config)
raw_data = ingestion.download()

# Stage 2: validate the saved dataset. The returned flag is not used here; the
# Engineering class shown earlier in this notebook reads the verdict back from
# the status file instead.
data_validation_config = data_config.get_data_validation_config()
data_validation = Validation(data_validation_config)
data_validation.validator()

# Stage 3: feature engineering and training-dataset creation.
engineering_config = data_config.get_data_transformation_config()
transformation = Engineering(config=engineering_config)
transformed_data = transformation.transform()

# Display the result as the cell output.
transformed_data

[2025-07-16 12:03:34,492: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-07-16 12:03:34,509: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-07-16 12:03:34,533: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-07-16 12:03:34,534: INFO: helpers: created directory at: artifacts]
[2025-07-16 12:03:34,536: INFO: helpers: created directory at: data]
[2025-07-16 12:03:34,543: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-07-16 12:03:34,546: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-07-16 12:03:34,546: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-07-16 12:03:34,546: INFO: external: Initializing external client]
[2025-07-16 12:03:34,558: INFO: external: Base URL: https://c.app.hopsworks.ai:443]
[2025-07-16 12:03:41,448: INFO: python: Python Engine initialized.]

Logged in to project, explore it here https://c.ap

Uploading Dataframe: 100.00% |██████████| Rows 41184/41184 | Elapsed Time: 00:17 | Remaining Time: 00:00


Launching job: elec_wx_demand_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1240214/jobs/named/elec_wx_demand_1_offline_fg_materialization/executions
[2025-07-16 12:04:25,075: INFO: feature_store: Feature Group 'elec_wx_demand' v1 created and data inserted.]
[2025-07-16 12:04:25,080: INFO: helpers: created directory at: data\processed]
[2025-07-16 12:04:25,462: INFO: ingestion: Saved combined dataset to data\processed\elec_wx_demand.csv]
[2025-07-16 12:04:25,462: INFO: helpers: created directory at: artifacts/data_validation]
[2025-07-16 12:04:25,578: INFO: validation: Starting validation for data with shape: (41184, 13)]
[2025-07-16 12:04:25,598: INFO: helpers: created directory at: artifacts/data_validation]
[2025-07-16 12:04:25,598: INFO: validation: column_presence: PASSED]
[2025-07-16 12:04:25,598: INFO: validation: data_types: PASSED]
[2025-07-16 12:04:25,598: INFO: validation: missing_values: PASSED]
[2025

CustomException: Exception in F:\WattPredictor\src\WattPredictor\components\features\engineering.py, line 80: Exception in F:\WattPredictor\src\WattPredictor\components\features\engineering.py, line 34: Could not read data using Hopsworks Query Service.