In [1]:
import os
import sys
# Move to the parent of the directory the kernel started in.
# A bare relative chdir is not idempotent: every re-run of this cell would
# climb one more level. Remembering the start directory once per kernel keeps
# the first run identical to the original and makes re-runs a no-op.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [2]:
from WattPredictor.utils.helpers import *
from WattPredictor.constants import *
from WattPredictor.utils.exception import *
from WattPredictor import logger

In [3]:
from pathlib import Path
from dataclasses import dataclass
from datetime import datetime, timedelta

In [4]:
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion step.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``
    and read by ``DataIngestion``.
    """
    # Directory created when the config is built.
    root_dir: Path
    # Directory holding one raw electricity JSON file per day
    # (``hourly_demand_<YYYY-MM-DD>.json``).
    elec_raw_data: Path
    # Directory holding the raw weather CSV
    # (``weather_data_<start>_to_<end>.csv``).
    wx_raw_data: Path
    # Path of the final CSV written by ``DataIngestion.download``.
    data_file: Path
    # First day to ingest; parsed with ``pd.to_datetime(..., utc=True)``.
    start_date: str
    # Last day to ingest (the daily loop includes it); parsed the same way.
    end_date: str

In [5]:
class ConfigurationManager:
    """Load the project's YAML configuration and build typed config objects.

    Attributes:
        config: Parsed contents of ``config_filepath``.
        params: Parsed contents of ``params_filepath``.
        schema: Parsed contents of ``schema_filepath``, or ``None`` when that
            file does not exist.
    """

    def __init__(self,
                 config_filepath=CONFIG_PATH,
                 params_filepath=PARAMS_PATH,
                 schema_filepath=SCHEMA_PATH):
        """Read the YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the optional schema YAML; may be a
                ``str`` or a ``Path``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Always define the attribute: previously a missing schema file left
        # ``self.schema`` unset, so any later access raised AttributeError.
        self.schema = None
        # Wrap in Path only for the existence check so plain strings work too;
        # read_yaml still receives the caller's original value.
        if schema_filepath is not None and Path(schema_filepath).exists():
            self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the loaded YAML.

        Paths come from the ``data_ingestion`` section of the config file and
        the date range from the ``dates`` section of the params file. The
        ingestion root directory is created as a side effect.

        Returns:
            A frozen ``DataIngestionConfig`` instance.
        """
        config = self.config.data_ingestion
        params = self.params.dates

        create_directories([config.root_dir])

        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            elec_raw_data=Path(config.elec_raw_data),
            wx_raw_data=Path(config.wx_raw_data),
            data_file=Path(config.data_file),
            start_date=params.start_date,
            end_date=params.end_date
        )

In [6]:
import pandas as pd
import requests
import json
from datetime import datetime, timedelta
import openmeteo_requests
import requests_cache
from retry_requests import retry
from WattPredictor import logger
from WattPredictor.utils.helpers import create_directories, save_json, load_json
from pathlib import Path
import os


# HTTP session whose responses are cached under the name '.cache' with
# expire_after=3600 (presumably seconds — confirm against requests-cache docs).
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
# Same session wrapped with retry behaviour (retries=5, backoff_factor=0.2).
# This is the session handed to openmeteo_requests.Client in
# DataIngestion.__init__; the electricity request in DataIngestion goes
# through plain `requests.get` and therefore does not use it.
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)

class DataIngestion:
    """Fetch hourly electricity and weather data, cache it on disk, merge it.

    Raw electricity responses are stored as one JSON file per day under
    ``config.elec_raw_data``; the weather frame is stored as a single CSV
    under ``config.wx_raw_data``. ``download`` joins both on a UTC ``date``
    column and writes the result to ``config.data_file``.

    Required environment variables: ``elec_api``, ``wx_api``, ``elec_api_key``.
    """

    # Seconds to wait for the electricity API. Without a timeout
    # ``requests.get`` can block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Hourly weather variables. The weather response is read back by position,
    # so this one tuple drives both the request and the column naming.
    WX_HOURLY_VARS = (
        "temperature_2m",
        "weather_code",
        "relative_humidity_2m",
        "wind_speed_10m",
    )

    def __init__(self, config: "DataIngestionConfig"):
        """Store the config, build the weather client and read API settings.

        Raises:
            KeyError: If any required environment variable is missing
                (logged before being re-raised).
        """
        self.config = config
        self.openmeteo = openmeteo_requests.Client(session=retry_session)
        try:
            self.elec_api = os.environ['elec_api']
            self.wx_api = os.environ['wx_api']
            self.elec_api_key = os.environ['elec_api_key']
        except KeyError as e:
            logger.error(f"Missing environment variable: {e}")
            raise

    def _redact(self, message):
        """Return ``message`` as text with the electricity API key masked.

        The key is sent as a query parameter, and HTTP errors raised by
        ``requests`` include the request URL in their message, so logging the
        raw exception would write the secret to the log.
        """
        from urllib.parse import quote_plus

        text = str(message)
        key = getattr(self, "elec_api_key", None)
        if key:
            key = str(key)
            # Mask both the raw key and its URL-encoded form.
            for secret in {key, quote_plus(key)}:
                text = text.replace(secret, "***")
        return text

    def _elec_file_path(self, year, month, day):
        """Path of the cached raw electricity JSON for one day."""
        return self.config.elec_raw_data / f"hourly_demand_{year}-{month:02d}-{day:02d}.json"

    def _wx_file_path(self, start_date, end_date):
        """Path of the cached weather CSV for a date range."""
        return self.config.wx_raw_data / (
            f"weather_data_{start_date.strftime('%Y-%m-%d')}"
            f"_to_{end_date.strftime('%Y-%m-%d')}.csv"
        )

    def _elec_get_api_url(self, year, month, day):
        """Return ``(url, params)`` for one day of hourly electricity data.

        The query window runs from the given day to the following day.
        """
        return self.elec_api, {
            "frequency": "hourly",
            "data[0]": "value",
            "sort[0][column]": "period",
            "sort[0][direction]": "desc",
            "facets[parent][0]": "NYIS",
            "offset": 0,
            "length": 5000,
            "start": f"{year}-{month:02d}-{day:02d}",
            "end": (datetime(year, month, day) + timedelta(days=1)).strftime("%Y-%m-%d"),
            "api_key": self.elec_api_key
        }

    def _wx_get_api_url(self, start_date, end_date):
        """Return ``(url, params)`` for hourly weather over a date range.

        ``start_date`` and ``end_date`` must support ``strftime``.
        """
        return self.wx_api, {
            "latitude": 40.7128,
            "longitude": -74.0060,
            "start_date": start_date.strftime("%Y-%m-%d"),
            "end_date": end_date.strftime("%Y-%m-%d"),
            "hourly": list(self.WX_HOURLY_VARS),
            "timeformat": "unixtime",
            "timezone": "America/New_York"
        }

    def _fetch_electricity(self, year, month, day):
        """Request one day of electricity data, save the raw JSON, return a frame."""
        url, params = self._elec_get_api_url(year, month, day)
        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()

        create_directories([self.config.elec_raw_data])
        save_json(self._elec_file_path(year, month, day), data)

        if 'response' in data and 'data' in data['response']:
            return pd.DataFrame(data['response']['data'])
        return pd.DataFrame()

    def _fetch_weather(self, start_date, end_date):
        """Request hourly weather for a range, save it as CSV, return a frame."""
        url, params = self._wx_get_api_url(start_date, end_date)
        responses = self.openmeteo.weather_api(url, params=params)
        hourly = responses[0].Hourly()

        hourly_data = {
            "date": pd.date_range(
                start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
                end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
                freq=pd.Timedelta(seconds=hourly.Interval()), inclusive="left"
            ),
        }
        # Variables come back in the order they were requested.
        for position, name in enumerate(self.WX_HOURLY_VARS):
            hourly_data[name] = hourly.Variables(position).ValuesAsNumpy()

        df = pd.DataFrame(data=hourly_data)
        create_directories([self.config.wx_raw_data])
        df.to_csv(self._wx_file_path(start_date, end_date), index=False)
        return df

    def _fetch_data(self, data_type, *args):
        """Generic fetch method for both electricity and weather data.

        Args:
            data_type: ``"electricity"`` (args: year, month, day) or
                ``"weather"`` (args: start_date, end_date).

        Returns:
            The fetched frame. Any failure is logged (with the API key
            masked) and an empty frame is returned instead of raising; an
            unknown ``data_type`` also yields an empty frame.
        """
        try:
            if data_type == "electricity":
                year, month, day = args
                return self._fetch_electricity(year, month, day)

            if data_type == "weather":
                start_date, end_date = args
                return self._fetch_weather(start_date, end_date)

            return pd.DataFrame()

        except requests.RequestException as e:
            logger.error(f"API request failed for {data_type} data: {self._redact(e)}")
            return pd.DataFrame()
        except pd.errors.EmptyDataError as e:
            logger.error(f"Empty data error for {data_type} data: {self._redact(e)}")
            return pd.DataFrame()
        except Exception as e:
            logger.error(f"Unexpected error fetching {data_type} data: {self._redact(e)}")
            return pd.DataFrame()

    def _load_existing_data(self, data_type, *args):
        """Load previously saved raw data if the matching file exists.

        Takes the same arguments as ``_fetch_data``. Returns an empty frame
        when no usable file is found or when loading fails (failures are
        logged).
        """
        try:
            if data_type == "electricity":
                year, month, day = args
                file_path = self._elec_file_path(year, month, day)
                if file_path.exists():
                    data = load_json(file_path)
                    if 'response' in data and 'data' in data['response']:
                        return pd.DataFrame(data['response']['data'])

            elif data_type == "weather":
                start_date, end_date = args
                file_path = self._wx_file_path(start_date, end_date)
                if file_path.exists():
                    return pd.read_csv(file_path)

            return pd.DataFrame()

        except pd.errors.EmptyDataError as e:
            logger.error(f"Empty data error loading {data_type} data: {e}")
            return pd.DataFrame()
        except Exception as e:
            logger.error(f"Unexpected error loading {data_type} data: {e}")
            return pd.DataFrame()

    def download(self) -> pd.DataFrame:
        """Download electricity and weather data, merge and save.

        Electricity is collected one day at a time over the configured range
        (both ends included), weather in a single request; cached files are
        preferred over new requests. When weather data is available the two
        are inner-joined on a UTC ``date`` column, otherwise the electricity
        frame is saved alone.

        Returns:
            The saved frame, or an empty frame when no electricity data
            could be obtained (nothing is written in that case).

        Raises:
            Exception: Any error during merging or saving is logged and
                re-raised.
        """
        try:
            start = pd.to_datetime(self.config.start_date, utc=True)
            end = pd.to_datetime(self.config.end_date, utc=True)

            # Get electricity data
            elec_data = []
            current_date = start
            while current_date <= end:
                year, month, day = current_date.year, current_date.month, current_date.day

                df = self._load_existing_data("electricity", year, month, day)
                if df.empty:
                    df = self._fetch_data("electricity", year, month, day)

                if not df.empty:
                    elec_data.append(df)
                current_date += timedelta(days=1)

            # Get weather data
            wx_df = self._load_existing_data("weather", start, end)
            if wx_df.empty:
                wx_df = self._fetch_data("weather", start, end)

            if not elec_data:
                logger.warning("No electricity data available for the requested range; nothing saved")
                return pd.DataFrame()

            elec_df = pd.concat(elec_data, ignore_index=True)

            # Each daily query ends on the following day, so adjacent days can
            # return the same boundary rows if the API treats `end` as
            # inclusive (NOTE(review): confirm). Drop exact duplicates only.
            try:
                elec_df = elec_df.drop_duplicates().reset_index(drop=True)
            except TypeError:
                # Rows containing unhashable values cannot be compared.
                logger.warning("Could not de-duplicate electricity rows; keeping all rows")

            if not wx_df.empty:
                if 'period' in elec_df.columns:
                    elec_df['date'] = pd.to_datetime(elec_df['period'], utc=True)
                if 'date' in wx_df.columns:
                    # Cached CSVs store dates as text; normalise to UTC.
                    wx_df['date'] = pd.to_datetime(wx_df['date'], utc=True)

                combined_df = pd.merge(elec_df, wx_df, on="date", how="inner")
                logger.info(f"Merged data shape: {combined_df.shape}, electricity: {elec_df.shape}, weather: {wx_df.shape}")
                if combined_df.empty:
                    logger.warning("Merged dataset is empty, check data alignment")
            else:
                combined_df = elec_df

            create_directories([self.config.data_file.parent])
            combined_df.to_csv(self.config.data_file, index=False)
            logger.info(f"Dataset saved to {self.config.data_file}, shape: {combined_df.shape}")
            return combined_df

        except Exception as e:
            logger.error(f"Error during download: {e}")
            raise

In [7]:
# Run the ingestion step end to end: load the configuration, build the
# ingestion component, then download/merge the datasets into `df`.
try:
    config = ConfigurationManager()
    ingestion_config = config.get_data_ingestion_config()
    ingestion = DataIngestion(config=ingestion_config)
    df = ingestion.download()

except Exception as e:
    # Chain explicitly so the traceback marks the original error as the cause.
    raise CustomException(str(e), sys) from e

[2025-07-08 15:18:18,553: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-07-08 15:18:18,560: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-07-08 15:18:18,565: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-07-08 15:18:18,567: INFO: helpers: created directory at: artifacts]
[2025-07-08 15:18:18,568: INFO: helpers: created directory at: data]
[2025-07-08 15:18:21,131: INFO: helpers: created directory at: data\raw\elec_data]
[2025-07-08 15:18:21,139: INFO: helpers: json file saved at: data\raw\elec_data\hourly_demand_2025-01-01.json]
[2025-07-08 15:18:22,983: INFO: helpers: created directory at: data\raw\elec_data]
[2025-07-08 15:18:22,990: INFO: helpers: json file saved at: data\raw\elec_data\hourly_demand_2025-01-02.json]
[2025-07-08 15:18:24,723: INFO: helpers: created directory at: data\raw\elec_data]
[2025-07-08 15:18:24,733: INFO: helpers: json file saved at: data\raw\elec_data\hourly_demand_2