In [4]:
import pandas as pd
import os
from pathlib import Path

# Locate the project root relative to the working directory. Per the original
# note this cell is run from src/preprocessing, so the root is the directory
# two levels above the current one.
current_dir = Path.cwd()
project_root = current_dir.parent.parent

# All inputs live in <project_root>/data/processed
processed_dir = project_root.joinpath("data", "processed")
WEATHER_PATH = processed_dir / "weather_clean.csv"
GEN_PATH = processed_dir / "Combined_Hourly_Gen_Without_Price.csv"
WIND_SOLAR_PATH = processed_dir / "wind_solar_data.csv"
SOLAR_XLSX_PATH = processed_dir / "solar_data_2022_to_2024.xlsx"

def combine_time_columns(frame, priority):
    """Collapse the timestamp columns of *frame* into one leading ``time`` column.

    Only the columns named in *priority* that actually exist in *frame* are
    used, so a missing source column no longer raises ``KeyError``. Each one is
    parsed with ``pd.to_datetime(..., errors='coerce')`` and the parsed columns
    are coalesced row by row: the first non-null value in *priority* order wins.

    Args:
        frame: DataFrame holding zero or more of the columns in *priority*.
        priority: Candidate timestamp column names, most preferred first.

    Returns:
        A new DataFrame with the used timestamp columns removed and the
        combined ``time`` column placed first. *frame* itself is not modified.

    Raises:
        ValueError: If none of the columns in *priority* exist in *frame*.
    """
    present = [name for name in priority if name in frame.columns]
    if not present:
        raise ValueError(
            f"None of the expected datetime columns {list(priority)} are present"
        )

    combined = pd.to_datetime(frame[present[0]], errors='coerce')
    for name in present[1:]:
        combined = combined.combine_first(
            pd.to_datetime(frame[name], errors='coerce'))

    result = frame.drop(columns=present)
    # Plain assignment (not insert) so a pre-existing 'time' column is
    # overwritten instead of raising; the reindex below moves it to the front.
    result['time'] = combined
    ordered = ['time'] + [col for col in result.columns if col != 'time']
    return result[ordered]


try:
    # Read the files
    print("Reading weather data...")
    weather = pd.read_csv(WEATHER_PATH)

    print("Reading generation data...")
    gen = pd.read_csv(GEN_PATH)

    print("Reading wind and solar data...")
    wind_solar = pd.read_csv(WIND_SOLAR_PATH)

    print("Reading solar xlsx data...")
    solar_xlsx = pd.read_excel(SOLAR_XLSX_PATH)

    # Start with generation data
    total = gen.copy()

    # Merge all datasets onto the generation rows.
    # NOTE(review): the frames are joined on their index, i.e. by row position
    # for files read without an index column, not by timestamp - confirm the
    # inputs are row-aligned.
    # NOTE(review): a right-hand column whose name collides with an existing
    # one is renamed with the suffix (e.g. 'DATE' -> 'DATE_weather') and is
    # therefore not picked up by the datetime handling below.
    print("Merging datasets...")
    for extra, suffix in ((weather, '_weather'),
                          (wind_solar, '_windsolar'),
                          (solar_xlsx, '_solar')):
        total = total.merge(extra, left_index=True, right_index=True,
                            how='left', suffixes=('', suffix))

    # Parse, coalesce and drop the source datetime columns. The list is in
    # coalescing priority order; columns absent from the merge are skipped.
    print("Processing datetime columns...")
    datetime_cols = ['Hour', 'Time (Hour-Ending)', 'DATE', 'Date']
    total = combine_time_columns(total, datetime_cols)

    # Save the merged data
    output_path = project_root / "data/processed/final_merged_data.csv"
    print(f"Saving merged data to: {output_path}")
    total.to_csv(output_path, index=False)
    print("Data compilation completed successfully!")

except FileNotFoundError as e:
    print(f"Error: Could not find file - {e}")
    print("Please ensure all required files exist in the data/processed directory:")
    print(f"- {WEATHER_PATH}")
    print(f"- {GEN_PATH}")
    print(f"- {WIND_SOLAR_PATH}")
    print(f"- {SOLAR_XLSX_PATH}")
except Exception as e:
    # Notebook-level boundary: report the failure instead of raising.
    print(f"An error occurred: {e}")

Reading weather data...
Reading generation data...
Error: Could not find file - [Errno 2] No such file or directory: '/Users/sushrut.g12/Desktop/TERepo/totalenergies_price_forecasting/data/processed/Combined_Hourly_Gen_Without_Price.csv'
Please ensure all required files exist in the data/processed directory:
- /Users/sushrut.g12/Desktop/TERepo/totalenergies_price_forecasting/data/processed/weather_clean.csv
- /Users/sushrut.g12/Desktop/TERepo/totalenergies_price_forecasting/data/processed/Combined_Hourly_Gen_Without_Price.csv
- /Users/sushrut.g12/Desktop/TERepo/totalenergies_price_forecasting/data/processed/wind_solar_data.csv
- /Users/sushrut.g12/Desktop/TERepo/totalenergies_price_forecasting/data/processed/solar_data_2022_to_2024.xlsx
