In [1]:
import os
import sys
# Move up to the parent directory so relative config/data paths resolve, but only
# when the current directory does not already contain "src". Without this guard,
# every re-run of the cell would climb one directory higher.
# NOTE(review): assumes the project root is the directory that holds "src" - confirm.
if not os.path.isdir(os.path.join(os.getcwd(), "src")):
    os.chdir('../')

# Make the package under src/ importable without adding duplicate entries on re-run.
src_path = os.path.join(os.getcwd(), "src")
if src_path not in sys.path:
    sys.path.append(src_path)

In [2]:
from WattPredictor.utils.helpers import *
from WattPredictor.constants import *
from WattPredictor.utils.exception import *
from WattPredictor.utils.logging import logger

In [3]:
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path

In [4]:
@dataclass(frozen=True)
class FeatureStoreConfig:
    """Immutable connection settings used to log in to the Hopsworks project."""

    # Project name passed to the Hopsworks login call.
    hopsworks_project_name: str
    # API key passed to the Hopsworks login call. Excluded from the generated
    # repr so that displaying or logging the config does not expose the secret.
    hopsworks_api_key: str = field(repr=False)

In [5]:
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for downloading and merging electricity and weather data."""

    # Root directory for this stage; created when the config object is built.
    root_dir: Path
    # Directory holding the per-day electricity JSON payloads.
    elec_raw_data: Path
    # Directory holding the weather CSV for the requested date range.
    wx_raw_data: Path
    # URL of the electricity demand API.
    elec_api: str
    # URL of the weather API.
    wx_api: str
    # Key sent as the "api_key" query parameter of the electricity API. Excluded
    # from the generated repr so the secret is not exposed when the config is shown.
    elec_api_key: str = field(repr=False)
    # CSV path where the merged dataset is written.
    data_file: Path
    # Inclusive bounds of the download range, parsed with pandas.to_datetime.
    start_date: str
    end_date: str

In [6]:
class ConfigurationManager:
    """Reads the project's YAML files and builds typed config objects from them."""

    def __init__(self,
                 config_filepath=CONFIG_PATH,
                 params_filepath=PARAMS_PATH,
                 schema_filepath=SCHEMA_PATH):
        """Load config, params and (when present) schema, then create the artifacts root.

        Args:
            config_filepath: Path to the main configuration YAML.
            params_filepath: Path to the parameters YAML.
            schema_filepath: Path to the schema YAML. The file is optional.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Always define the attribute: a missing schema file yields None instead of
        # leaving `self.schema` unset (which would raise AttributeError on access).
        # Wrapping in Path() also accepts plain string paths.
        schema_available = schema_filepath is not None and Path(schema_filepath).exists()
        self.schema = read_yaml(schema_filepath) if schema_available else None

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings and create the stage's root directory.

        Paths come from the `data_ingestion` section of the config file and the date
        range from the `dates` section of the params file. API endpoints and the API
        key are read from environment variables.

        Raises:
            KeyError: If `elec_api`, `wx_api` or `elec_api_key` is not set in the
                environment.
        """
        config = self.config.data_ingestion
        params = self.params.dates

        create_directories([config.root_dir])

        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            elec_raw_data=Path(config.elec_raw_data),
            wx_raw_data=Path(config.wx_raw_data),
            elec_api=os.environ['elec_api'],
            wx_api=os.environ['wx_api'],
            elec_api_key=os.environ['elec_api_key'],
            data_file=Path(config.data_file),
            start_date=params.start_date,
            end_date=params.end_date
        )

    def get_feature_store_config(self) -> FeatureStoreConfig:
        """Build the feature-store settings.

        The project name comes from the `feature_store` section of the config file
        and the API key from the `hopsworks_api_key` environment variable.

        Raises:
            KeyError: If `hopsworks_api_key` is not set in the environment.
        """
        config = self.config.feature_store

        return FeatureStoreConfig(
            hopsworks_project_name=config.hopsworks_project_name,
            hopsworks_api_key=os.environ['hopsworks_api_key'],
        )

In [None]:
import hopsworks
import pandas as pd
import sys
import os
from WattPredictor.config.feature_config import FeatureStoreConfig
from WattPredictor.utils.exception import CustomException

class FeatureStore:
    """Wrapper around a Hopsworks project's feature store and dataset API.

    Connects on construction and exposes helpers for feature groups, feature views,
    training data, file uploads and online feature lookups. Every public method
    re-raises failures as `CustomException`.
    """

    def __init__(self, config: "FeatureStoreConfig"):
        """Store the config and connect to Hopsworks immediately.

        Raises:
            CustomException: If the connection attempt fails.
        """
        try:
            self.config = config
            self.connect()
        except Exception as e:
            raise CustomException(e, sys)

    def connect(self):
        """Log in to Hopsworks and keep handles to the feature store and dataset API.

        Raises:
            CustomException: If login or handle retrieval fails.
        """
        try:
            self.project = hopsworks.login(
                project=self.config.hopsworks_project_name,
                api_key_value=self.config.hopsworks_api_key
            )
            self.feature_store = self.project.get_feature_store()
            self.dataset_api = self.project.get_dataset_api()
            logger.info(f"Connected to Hopsworks Feature Store: {self.config.hopsworks_project_name}")
        except Exception as e:
            raise CustomException(e, sys)

    def _create_and_save_feature_group(self, name, df, primary_key, event_time,
                                       description, online_enabled, version):
        """Create feature group `name` at `version`, save `df` into it and return it."""
        fg = self.feature_store.create_feature_group(
            name=name,
            version=version,
            primary_key=primary_key,
            event_time=event_time,
            description=description,
            online_enabled=online_enabled
        )
        fg.save(df)
        logger.info(f"Successfully created Feature Group '{name}' v{version}")
        return fg

    def create_feature_group(self, name, df, primary_key, event_time, description, online_enabled=True, version=1):
        """
        Create or update a feature group with proper error handling for metadata inconsistencies

        Flow:
          1. If the group exists at `version`, insert `df` into it and return it.
          2. If that insert fails, try to delete the group and fall through to creation.
          3. Create the group at `version`. If creation fails with an "already exists"
             error, retry once at `version + 1`.

        Args:
            name: Feature group name.
            df: DataFrame to insert or save.
            primary_key: Primary-key column names.
            event_time: Event-time column name.
            description: Feature group description.
            online_enabled: Passed through to the feature store on creation.
            version: Version to look up or create. May be bumped by one, see step 3.

        Returns:
            The feature group the data was written to. Its version can differ from
            the requested `version` when the retry in step 3 was needed.

        Raises:
            CustomException: If the data could not be written.
        """
        try:
            try:
                fg = self.feature_store.get_feature_group(name=name, version=version)
            except Exception as get_error:
                logger.info(f"Feature Group '{name}' v{version} does not exist or is corrupted: {get_error}")
                fg = None

            if fg is not None:
                logger.info(f"Feature Group '{name}' v{version} exists. Attempting to insert data.")
                try:
                    fg.insert(df)
                    logger.info(f"Successfully inserted data into existing Feature Group '{name}' v{version}")
                    return fg
                except Exception as insert_error:
                    logger.warning(f"Insert failed: {insert_error}. Attempting to delete and recreate.")
                    try:
                        fg.delete()
                        logger.info(f"Deleted corrupted Feature Group '{name}' v{version}")
                    except Exception as delete_error:
                        # Best effort: creation below is still attempted.
                        logger.warning(f"Delete failed: {delete_error}")

            # Create new feature group
            logger.info(f"Creating new Feature Group '{name}' v{version}")
            try:
                return self._create_and_save_feature_group(
                    name, df, primary_key, event_time, description, online_enabled, version
                )
            except Exception as create_error:
                if "already exists" not in str(create_error):
                    raise
                logger.error("Hive table exists but metadata is corrupted. Manual cleanup required.")
                version += 1
                logger.info(f"Attempting to create with version {version}")
                return self._create_and_save_feature_group(
                    name, df, primary_key, event_time, description, online_enabled, version
                )

        except Exception as e:
            logger.error(f"Failed to create/update Feature Group '{name}': {str(e)}")
            raise CustomException(e, sys)

    def create_feature_view(self, name: str, feature_group_name: str, features: list,
                            version: int = 1, feature_group_version: int = 1):
        """Get or create a feature view selecting `features` from a feature group.

        Args:
            name: Feature view name.
            feature_group_name: Name of the source feature group.
            features: Feature names to select from the group.
            version: Feature view version (default 1).
            feature_group_version: Source feature group version (default 1). Pass the
                actual version when `create_feature_group` had to bump it.

        Returns:
            The feature view returned by the feature store.

        Raises:
            CustomException: If the group lookup or view creation fails.
        """
        try:
            fg = self.feature_store.get_feature_group(name=feature_group_name, version=feature_group_version)
            fv = self.feature_store.get_or_create_feature_view(
                name=name,
                version=version,
                query=fg.select(features),
                description=f"Feature View for {name}"
            )
            logger.info(f"Feature View '{name}' created successfully")
            return fv
        except Exception as e:
            raise CustomException(e, sys)

    def save_training_dataset(self, feature_view_name, version_description, output_format="csv", version=1):
        """Create a training dataset from a feature view, waiting for the job to finish.

        Args:
            feature_view_name: Name of the feature view.
            version_description: Description stored with the training dataset.
            output_format: Data format passed to the feature view (default "csv").
            version: Feature view version (default 1).

        Returns:
            Whatever the feature view's `create_training_data` call returns.

        Raises:
            CustomException: If the lookup or dataset creation fails.
        """
        try:
            fv = self.feature_store.get_feature_view(name=feature_view_name, version=version)
            td = fv.create_training_data(
                description=version_description,
                data_format=output_format,
                write_options={"wait_for_job": True}
            )
            logger.info(f"Training dataset created for Feature View '{feature_view_name}'.")
            return td
        except Exception as e:
            raise CustomException(e, sys)

    def load_latest_training_dataset(self, feature_view_name, version=1):
        """Return the result of `training_data()` for the given feature view.

        Args:
            feature_view_name: Name of the feature view.
            version: Feature view version (default 1).

        Raises:
            CustomException: If the lookup or retrieval fails.
        """
        try:
            fv = self.feature_store.get_feature_view(name=feature_view_name, version=version)
            return fv.training_data()
        except Exception as e:
            raise CustomException(e, sys)

    def upload_file_safely(self, local_path: str, target_name: str):
        """Upload a local file to `Resources/wattpredictor_artifacts/<target_name>`.

        An existing file at the target is overwritten.

        Raises:
            CustomException: If the upload fails.
        """
        try:
            self.dataset_api.upload(
                local_path,
                f"Resources/wattpredictor_artifacts/{target_name}",
                overwrite=True
            )
            logger.info(f"Uploaded file to Feature Store: {target_name}")
        except Exception as e:
            raise CustomException(e, sys)

    def get_training_data(self, feature_view_name: str, version: int = 1):
        """Fetch training data from a feature view as an `(X, y)` pair.

        Args:
            feature_view_name: Name of the feature view.
            version: Feature view version (default 1).

        Raises:
            CustomException: If the lookup or retrieval fails.
        """
        try:
            fv = self.feature_store.get_feature_view(name=feature_view_name, version=version)
            X, y = fv.training_data()
            logger.info(f"Retrieved training data from Feature View '{feature_view_name}'")
            return X, y
        except Exception as e:
            raise CustomException(e, sys)

    def get_online_features(self, feature_view_name, key_dict: dict, version=1):
        """Fetch one online feature vector for the given primary-key values.

        `get_feature_vector(key_dict)` is tried first. If it raises, the lookup falls
        back to `get_serving_vector` with the values of `date_str` and
        `sub_region_code` taken from `key_dict`.

        Args:
            feature_view_name: Name of the feature view.
            key_dict: Mapping of primary-key name to value.
            version: Feature view version (default 1).

        Returns:
            The feature vector from the primary call, or the fallback result
            converted with `.to_dict()`.

        Raises:
            CustomException: If the view is missing or both lookups fail.
        """
        try:
            fv = self.feature_store.get_feature_view(name=feature_view_name, version=version)
            if fv is None:
                logger.error(f"[Online Fetch] Feature View '{feature_view_name}' v{version} not found.")
                # Raise a plain exception here; the handler at the bottom wraps it
                # exactly once, inside an active `except` context.
                raise ValueError(f"Feature View '{feature_view_name}' v{version} is None")

            try:
                result = fv.get_feature_vector(key_dict)
                logger.info(f"[Online Fetch] Fetched online features using get_feature_vector for {key_dict}: {result}")
                return result
            except Exception as vector_error:
                logger.warning(f"get_feature_vector failed: {vector_error}, trying get_serving_vector")

                # Only the fallback needs positional key values, so they are built
                # here. A key_dict without these names can still use the primary path.
                # NOTE(review): the feature group in this file is keyed on
                # ["date_str", "subba"]; confirm "sub_region_code" is the right key.
                expected_primary_keys = ["date_str", "sub_region_code"]
                key_values = [key_dict[key] for key in expected_primary_keys]

                result = fv.get_serving_vector(key_values).to_dict()
                logger.info(f"[Online Fetch] Fetched online features using get_serving_vector for {key_dict}: {result}")
                return result

        except Exception as e:
            logger.error(f"[Online Fetch] Failed to fetch online features for {feature_view_name} with key {key_dict}")
            raise CustomException(e, sys)

In [8]:
import os
import json
import pandas as pd
import requests
from datetime import datetime, timedelta
import openmeteo_requests
import requests_cache
from retry_requests import retry
from WattPredictor.utils.logging import logger
from WattPredictor.utils.helpers import create_directories, save_json, load_json
from WattPredictor.utils.exception import CustomException
from dotenv import load_dotenv

# HTTP session that caches responses under '.cache'.
# NOTE(review): expire_after=3600 is presumably seconds (one hour) - confirm against requests_cache.
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
# Wrap the cached session with retries (5 attempts, backoff factor 0.2); used by the Open-Meteo client.
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
# Presumably loads variables from a local .env file so later os.environ lookups succeed - confirm.
load_dotenv()


class DataIngestion:
    """Downloads hourly electricity demand and weather data, merges them and stores the result.

    Raw payloads are kept on disk (one JSON per day for electricity, one CSV per date
    range for weather) and reused on later runs. The merged frame is written to the
    feature store and to `config.data_file`.
    """

    # Seconds to wait on the electricity API so a stalled connection cannot hang the run.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, config: "DataIngestionConfig", feature_store_config: "FeatureStoreConfig"):
        """Keep the config, connect to the feature store and build the weather client."""
        self.config = config
        self.feature_store = FeatureStore(feature_store_config)
        self.openmeteo = openmeteo_requests.Client(session=retry_session)

    def _elec_get_api(self, year, month, day):
        """Return `(url, params)` for one day of hourly electricity demand.

        The requested window runs from the given day to the following day.
        """
        return self.config.elec_api, {
            "frequency": "hourly",
            "data[0]": "value",
            "sort[0][column]": "period",
            "sort[0][direction]": "desc",
            "facets[parent][0]": "NYIS",
            "offset": 0,
            "length": 5000,
            "start": f"{year}-{month:02d}-{day:02d}",
            "end": (datetime(year, month, day) + timedelta(days=1)).strftime("%Y-%m-%d"),
            "api_key": self.config.elec_api_key
        }

    def _wx_get_api(self, start_date, end_date):
        """Return `(url, params)` for hourly weather between two dates at a fixed location."""
        return self.config.wx_api, {
            "latitude": 40.7128,
            "longitude": -74.0060,
            "start_date": start_date.strftime("%Y-%m-%d"),
            "end_date": end_date.strftime("%Y-%m-%d"),
            "hourly": ["temperature_2m", "weather_code", "relative_humidity_2m", "wind_speed_10m"],
            "timeformat": "unixtime",
            "timezone": "America/New_York"
        }

    def _fetch_electricity_data(self, year, month, day):
        """Return one day of electricity records as a DataFrame.

        A previously saved JSON payload is reused when it has the expected
        `response.data` structure. Otherwise the API is queried and the payload is
        saved. Returns an empty DataFrame when the fresh payload lacks that structure.

        Raises:
            requests.RequestException: On HTTP errors or when the request times out.
        """
        file_path = self.config.elec_raw_data / f"hourly_demand_{year}-{month:02d}-{day:02d}.json"
        if file_path.exists():
            data = load_json(file_path)
            if 'response' in data and 'data' in data['response']:
                return pd.DataFrame(data['response']['data'])

        url, params = self._elec_get_api(year, month, day)
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        data = response.json()
        create_directories([self.config.elec_raw_data])
        save_json(file_path, data)

        if 'response' in data and 'data' in data['response']:
            return pd.DataFrame(data['response']['data'])
        return pd.DataFrame()

    def _fetch_weather_data(self, start_date, end_date):
        """Return hourly weather for the date range as a DataFrame.

        A previously saved CSV for the same range is reused. In that case the `date`
        column holds whatever `read_csv` parsed, and `_prepare_and_merge` converts it.
        Otherwise the weather API is queried and the result is saved as CSV.
        """
        file_path = self.config.wx_raw_data / f"weather_data_{start_date.strftime('%Y-%m-%d')}_to_{end_date.strftime('%Y-%m-%d')}.csv"
        if file_path.exists():
            return pd.read_csv(file_path)

        url, params = self._wx_get_api(start_date, end_date)
        responses = self.openmeteo.weather_api(url, params=params)
        hourly = responses[0].Hourly()

        # Variables(i) follows the order of the "hourly" list sent in the request.
        df = pd.DataFrame({
            "date": pd.date_range(
                start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
                end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
                freq=pd.Timedelta(seconds=hourly.Interval()),
                inclusive="left"
            ),
            "temperature_2m": hourly.Variables(0).ValuesAsNumpy(),
            "weather_code": hourly.Variables(1).ValuesAsNumpy(),
            "relative_humidity_2m": hourly.Variables(2).ValuesAsNumpy(),
            "wind_speed_10m": hourly.Variables(3).ValuesAsNumpy()
        })

        create_directories([self.config.wx_raw_data])
        df.to_csv(file_path, index=False)
        return df

    def _prepare_and_merge(self, elec_data_list, weather_df):
        """Concatenate electricity frames and inner-join them with weather on a UTC `date`.

        Column names are trimmed, lower-cased, and have "-" and " " replaced by "_".
        A `date_str` column (`%Y-%m-%dT%H:%M:%S`) is added for use as a primary key.
        The caller's `weather_df` is not modified.
        """
        elec_df = pd.concat(elec_data_list, ignore_index=True)
        elec_df["date"] = pd.to_datetime(elec_df["period"], utc=True)
        # assign() returns a new frame, leaving the caller's weather_df untouched.
        weather_df = weather_df.assign(date=pd.to_datetime(weather_df["date"], utc=True))

        combined_df = pd.merge(elec_df, weather_df, on="date", how="inner")

        # Strip first: once spaces become underscores there is nothing left to strip,
        # and padded names would end up as "_name_".
        combined_df.columns = (
            combined_df.columns.str.strip()
            .str.lower()
            .str.replace("-", "_", regex=False)
            .str.replace(" ", "_", regex=False)
        )

        # Add string version of date for primary key
        combined_df["date_str"] = combined_df["date"].dt.strftime("%Y-%m-%dT%H:%M:%S")

        return combined_df

    def download(self):
        """Run the full ingestion: fetch, merge, push to the feature store and save to disk.

        Returns:
            The merged DataFrame, or an empty DataFrame when no electricity data, no
            weather data, or no overlapping rows were found.

        Raises:
            CustomException: If any step fails.
        """
        try:
            start = pd.to_datetime(self.config.start_date, utc=True)
            end = pd.to_datetime(self.config.end_date, utc=True)

            current = start
            elec_data = []

            # One request (or saved file) per calendar day, end date inclusive.
            # NOTE(review): each request spans day..day+1; confirm that consecutive
            # days cannot return the same boundary hour twice.
            while current <= end:
                year, month, day = current.year, current.month, current.day
                df = self._fetch_electricity_data(year, month, day)
                if not df.empty:
                    elec_data.append(df)
                current += timedelta(days=1)

            if not elec_data:
                logger.warning("No electricity data fetched.")
                return pd.DataFrame()

            wx_df = self._fetch_weather_data(start, end)
            if wx_df.empty:
                logger.warning("No weather data fetched.")
                return pd.DataFrame()

            combined_df = self._prepare_and_merge(elec_data, wx_df)

            if combined_df.empty:
                logger.warning("Merged DataFrame is empty after join.")
                return pd.DataFrame()

            # Save to Feature Store
            self.feature_store.create_feature_group(
                name="elec_wx_demand",
                df=combined_df,
                primary_key=["date_str", "subba"],
                event_time="date",
                description="Merged electricity demand and weather data for WattPredictor"
            )

            # Save to disk
            create_directories([self.config.data_file.parent])
            combined_df.to_csv(self.config.data_file, index=False)
            logger.info(f"Saved combined dataset to {self.config.data_file}")

            return combined_df

        except Exception as e:
            logger.error(f"Error in DataIngestion.download: {str(e)}")
            raise CustomException(e, sys)

In [9]:
# Run the ingestion stage end to end: load configs, connect to the feature store,
# then download, merge and store the data.
try:
    config_manager = ConfigurationManager()
    ingestion_config = config_manager.get_data_ingestion_config()
    feature_store_config = config_manager.get_feature_store_config()
    ingestion = DataIngestion(config=ingestion_config, feature_store_config=feature_store_config)
    ingestion.download()

except CustomException:
    # Already wrapped by the pipeline code; wrapping again would only nest the message.
    raise
except Exception as e:
    raise CustomException(e, sys)

[2025-07-16 12:27:49,192: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-07-16 12:27:49,207: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-07-16 12:27:49,207: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-07-16 12:27:49,207: INFO: helpers: created directory at: artifacts]
[2025-07-16 12:27:49,207: INFO: helpers: created directory at: data]
[2025-07-16 12:27:49,224: INFO: external: Initializing external client]
[2025-07-16 12:27:49,226: INFO: external: Base URL: https://c.app.hopsworks.ai:443]
[2025-07-16 12:27:52,413: INFO: python: Python Engine initialized.]

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1240214
[2025-07-16 12:27:55,209: INFO: 411821756: Connected to Hopsworks Feature Store: WattPredictor]
[2025-07-16 12:27:55,225: INFO: helpers: json file loaded succesfully from: data\raw\elec_data\hourly_demand_2025-01-01.json]
[2025-07-16 12:27:55,247: INFO: helpers:

Uploading Dataframe: 100.00% |██████████| Rows 41184/41184 | Elapsed Time: 00:19 | Remaining Time: 00:00


Launching job: elec_wx_demand_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1240214/jobs/named/elec_wx_demand_2_offline_fg_materialization/executions
[2025-07-16 12:29:06,197: INFO: 411821756: Successfully created Feature Group 'elec_wx_demand' v2]
[2025-07-16 12:29:06,197: INFO: helpers: created directory at: data\processed]
[2025-07-16 12:29:06,627: INFO: 1240457731: Saved combined dataset to data\processed\elec_wx_demand.csv]
