In [9]:
import pandas as pd
import os
from pathlib import Path

In [10]:
# Get the current directory and go up one level to reach project root
current_dir = Path.cwd()
project_root = current_dir.parent.parent  # Go up two levels from src/preprocessing

# Define file paths relative to the project root
WEATHER_PATH = project_root / "data/processed/weather_clean.csv"
GEN_PATH = project_root / "data/processed/Combined_Hourly_Gen_Without_Price.csv"
WIND_SOLAR_PATH = project_root / "data/processed/wind_solar_data.csv"
SOLAR_PATH = project_root / "data/processed/solar_data_2022_to_2024.csv"
FUEL_MIX_PATH = project_root / "data/processed/Fuel_Mix_All_Years.csv"
LOAD_PATH = project_root / "data/processed/processed_load_data.csv"

In [11]:
print("Reading weather data...")
weather = pd.read_csv(WEATHER_PATH)

print("Reading generation data...")
gen = pd.read_csv(GEN_PATH)

print("Reading wind and solar data...")
wind_solar = pd.read_csv(WIND_SOLAR_PATH)

print("Reading solar data...")
solar = pd.read_csv(SOLAR_PATH)

print("Reading fuel mix data...")
fuel_mix = pd.read_csv(FUEL_MIX_PATH)

print("Reading load data...")
load = pd.read_csv(LOAD_PATH)


Reading weather data...
Reading generation data...
Reading wind and solar data...
Reading solar data...
Reading fuel mix data...
Reading load data...


In [12]:
# Start with generation data
total = gen.copy()
print("Merging datasets...")
total = total.merge(weather, left_index=True, right_index=True, how='left', suffixes=('', '_weather'))
total = total.merge(wind_solar, left_index=True, right_index=True, how='left', suffixes=('', '_windsolar'))
total = total.merge(solar, left_index=True, right_index=True, how='left', suffixes=('', '_solar'))
total = total.merge(fuel_mix, left_index=True, right_index=True, how='left', suffixes=('', '_fuelmix'))
total = total.merge(load, left_index=True, right_index=True, how='left', suffixes=('', '_load'))


total


Merging datasets...


Unnamed: 0,Hour,Fuel,MW,NAME,DATE,wind_speed,temperature_c,sky_coverage,precipitation_mm,Time (Hour-Ending),...,load_ramp,load_gen_ratio,load_minus_gen,tie_flow_pct,hour,day_of_week,is_peak_hour,3hr_load_ma,6hr_load_ma,3hr_gen_ma
0,2022-01-01 01:00:00,Biomass,22.297569,"CORPUS CHRISTI INTERNATIONAL AIRPORT, TX US",2022-01-01 00:00:00,1.0,24.4,99.0,,2022-01-01 01:00:00,...,-427.01290,0.999612,-17.8,0.000388,1.0,1.0,0.0,46206.865623,47140.703833,46224.232290
1,2022-01-01 01:00:00,Coal,10269.999131,"HOUSTON INTERCONTINENTAL AIRPORT, TX US",2022-01-01 00:00:00,1.0,25.0,99.0,,2022-01-01 02:00:00,...,-650.30986,0.999613,-17.5,0.000387,1.0,1.0,0.0,45752.797170,46429.363925,45770.130503
2,2022-01-01 01:00:00,Gas,1116.244747,"MIDLAND INTERNATIONAL AIRPORT, TX US",2022-01-01 00:00:00,1.0,11.7,99.0,0.0,2022-01-01 03:00:00,...,-162.91370,0.999620,-17.1,0.000380,1.0,1.0,0.0,45339.385017,45963.987092,45356.851683
3,2022-01-01 01:00:00,Nuclear,5078.955677,"AUSTIN BERGSTROM INTERNATIONAL AIRPORT, TX US",2022-01-01 00:00:00,1.0,25.6,99.0,,2022-01-01 04:00:00,...,-1504.77453,0.999607,-17.1,0.000393,2.0,1.0,0.0,44566.718987,45386.792305,44583.952320
4,2022-01-01 02:00:00,Biomass,22.300089,"SAN ANTONIO INTERNATIONAL AIRPORT, TX US",2022-01-01 00:00:00,1.0,25.6,99.0,,2022-01-01 05:00:00,...,558.81113,0.999599,-17.7,0.000402,2.0,1.0,0.0,44197.093287,44974.945228,44214.393287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105187,2024-12-31 22:00:00,Nuclear,5085.549549,"SAN ANTONIO INTERNATIONAL AIRPORT, TX US",2022-12-12 02:51:00,5.0,15.6,7.0,0.0,,...,,,,,,,,,,
105188,2024-12-31 23:00:00,Biomass,12.204715,"CORPUS CHRISTI INTERNATIONAL AIRPORT, TX US",2022-12-12 02:51:00,5.0,21.7,2.0,0.0,,...,,,,,,,,,,
105189,2024-12-31 23:00:00,Coal,7419.532962,"ABILENE REGIONAL AIRPORT, TX US",2022-12-12 02:52:00,5.0,12.8,8.0,0.0,,...,,,,,,,,,,
105190,2024-12-31 23:00:00,Gas,1080.756969,"MIDLAND INTERNATIONAL AIRPORT, TX US",2022-12-12 02:53:00,5.0,10.6,0.0,0.0,,...,,,,,,,,,,


In [13]:
# Print available columns for debugging
print("\nAvailable columns in the merged dataset:")
print(total.columns.tolist())

# Convert datetime columns
print("\nProcessing datetime columns...")
datetime_cols = ['Hour', 'DATE', 'Time (Hour-Ending)', 'Date']
existing_datetime_cols = [col for col in datetime_cols if col in total.columns]

for col in existing_datetime_cols:
    total[col] = pd.to_datetime(total[col], errors='coerce')

# Combine datetime columns - only use columns that exist
print("\nCombining datetime columns...")
time_series = None
for col in existing_datetime_cols:
    if time_series is None:
        time_series = total[col]
    else:
        time_series = time_series.combine_first(total[col])

total['time'] = time_series

# Drop original datetime columns
total.drop(columns=existing_datetime_cols, inplace=True)


total


Available columns in the merged dataset:
['Hour', 'Fuel', 'MW', 'NAME', 'DATE', 'wind_speed', 'temperature_c', 'sky_coverage', 'precipitation_mm', 'Time (Hour-Ending)', 'ERCOT.LOAD_wind', 'ERCOT.WIND.GEN', 'Wind 1-hr MW change', 'Wind 1-hr % change', 'ERCOT.LOAD_solar', 'ERCOT.PVGR.GEN', 'Solar 1-hr MW change', 'Solar 1-hr % change', 'Unnamed: 0', 'datetime_col', 'ERCOT.LOAD', 'ERCOT.PVGR.GEN_solar', 'Total Solar Installed, MW', 'Solar Output, % of Load', 'Solar Output, % of Installed', 'Solar 1-hr MW change_solar', 'Solar 1-hr % change_solar', 'Daytime Hour', 'Ramping Daytime Hour', 'Timestamp', 'Biomass', 'Coal', 'Gas', 'Hydro', 'Nuclear', 'Datetime_hour', 'AGG LOAD SUMMARY', 'SUM TELEM GEN MW', 'SUM TELEM DCTIE MW', 'load_ramp', 'load_gen_ratio', 'load_minus_gen', 'tie_flow_pct', 'hour', 'day_of_week', 'is_peak_hour', '3hr_load_ma', '6hr_load_ma', '3hr_gen_ma']

Processing datetime columns...

Combining datetime columns...


Unnamed: 0,Fuel,MW,NAME,wind_speed,temperature_c,sky_coverage,precipitation_mm,ERCOT.LOAD_wind,ERCOT.WIND.GEN,Wind 1-hr MW change,...,load_gen_ratio,load_minus_gen,tie_flow_pct,hour,day_of_week,is_peak_hour,3hr_load_ma,6hr_load_ma,3hr_gen_ma,time
0,Biomass,22.297569,"CORPUS CHRISTI INTERNATIONAL AIRPORT, TX US",1.0,24.4,99.0,,38124.261975,12067.479497,,...,0.999612,-17.8,0.000388,1.0,1.0,0.0,46206.865623,47140.703833,46224.232290,2022-01-01 01:00:00
1,Coal,10269.999131,"HOUSTON INTERCONTINENTAL AIRPORT, TX US",1.0,25.0,99.0,,37122.946803,12884.367833,816.888337,...,0.999613,-17.5,0.000387,1.0,1.0,0.0,45752.797170,46429.363925,45770.130503,2022-01-01 01:00:00
2,Gas,1116.244747,"MIDLAND INTERNATIONAL AIRPORT, TX US",1.0,11.7,99.0,0.0,35936.747949,14366.542968,1482.175134,...,0.999620,-17.1,0.000380,1.0,1.0,0.0,45339.385017,45963.987092,45356.851683,2022-01-01 01:00:00
3,Nuclear,5078.955677,"AUSTIN BERGSTROM INTERNATIONAL AIRPORT, TX US",1.0,25.6,99.0,,35132.555947,16463.459644,2096.916676,...,0.999607,-17.1,0.000393,2.0,1.0,0.0,44566.718987,45386.792305,44583.952320,2022-01-01 01:00:00
4,Biomass,22.300089,"SAN ANTONIO INTERNATIONAL AIRPORT, TX US",1.0,25.6,99.0,,34602.741810,18337.533839,1874.074195,...,0.999599,-17.7,0.000402,2.0,1.0,0.0,44197.093287,44974.945228,44214.393287,2022-01-01 02:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105187,Nuclear,5085.549549,"SAN ANTONIO INTERNATIONAL AIRPORT, TX US",5.0,15.6,7.0,0.0,,,,...,,,,,,,,,,2024-12-31 22:00:00
105188,Biomass,12.204715,"CORPUS CHRISTI INTERNATIONAL AIRPORT, TX US",5.0,21.7,2.0,0.0,,,,...,,,,,,,,,,2024-12-31 23:00:00
105189,Coal,7419.532962,"ABILENE REGIONAL AIRPORT, TX US",5.0,12.8,8.0,0.0,,,,...,,,,,,,,,,2024-12-31 23:00:00
105190,Gas,1080.756969,"MIDLAND INTERNATIONAL AIRPORT, TX US",5.0,10.6,0.0,0.0,,,,...,,,,,,,,,,2024-12-31 23:00:00


In [15]:
# Reorder columns
cols = ['time'] + [col for col in total.columns if col != 'time']
total = total[cols]

# Save the merged data
output_path = project_root / "data/processed/final_merged_data.csv"
print(f"\nSaving merged data to: {output_path}")
total.to_csv(output_path, index=False)
print("Data compilation completed successfully!")




Saving merged data to: /Users/sushrut.g12/Desktop/TERepo/totalenergies_price_forecasting/data/processed/final_merged_data.csv
Data compilation completed successfully!


In [16]:
total

Unnamed: 0,time,Fuel,MW,NAME,wind_speed,temperature_c,sky_coverage,precipitation_mm,ERCOT.LOAD_wind,ERCOT.WIND.GEN,...,load_ramp,load_gen_ratio,load_minus_gen,tie_flow_pct,hour,day_of_week,is_peak_hour,3hr_load_ma,6hr_load_ma,3hr_gen_ma
0,2022-01-01 01:00:00,Biomass,22.297569,"CORPUS CHRISTI INTERNATIONAL AIRPORT, TX US",1.0,24.4,99.0,,38124.261975,12067.479497,...,-427.01290,0.999612,-17.8,0.000388,1.0,1.0,0.0,46206.865623,47140.703833,46224.232290
1,2022-01-01 01:00:00,Coal,10269.999131,"HOUSTON INTERCONTINENTAL AIRPORT, TX US",1.0,25.0,99.0,,37122.946803,12884.367833,...,-650.30986,0.999613,-17.5,0.000387,1.0,1.0,0.0,45752.797170,46429.363925,45770.130503
2,2022-01-01 01:00:00,Gas,1116.244747,"MIDLAND INTERNATIONAL AIRPORT, TX US",1.0,11.7,99.0,0.0,35936.747949,14366.542968,...,-162.91370,0.999620,-17.1,0.000380,1.0,1.0,0.0,45339.385017,45963.987092,45356.851683
3,2022-01-01 01:00:00,Nuclear,5078.955677,"AUSTIN BERGSTROM INTERNATIONAL AIRPORT, TX US",1.0,25.6,99.0,,35132.555947,16463.459644,...,-1504.77453,0.999607,-17.1,0.000393,2.0,1.0,0.0,44566.718987,45386.792305,44583.952320
4,2022-01-01 02:00:00,Biomass,22.300089,"SAN ANTONIO INTERNATIONAL AIRPORT, TX US",1.0,25.6,99.0,,34602.741810,18337.533839,...,558.81113,0.999599,-17.7,0.000402,2.0,1.0,0.0,44197.093287,44974.945228,44214.393287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105187,2024-12-31 22:00:00,Nuclear,5085.549549,"SAN ANTONIO INTERNATIONAL AIRPORT, TX US",5.0,15.6,7.0,0.0,,,...,,,,,,,,,,
105188,2024-12-31 23:00:00,Biomass,12.204715,"CORPUS CHRISTI INTERNATIONAL AIRPORT, TX US",5.0,21.7,2.0,0.0,,,...,,,,,,,,,,
105189,2024-12-31 23:00:00,Coal,7419.532962,"ABILENE REGIONAL AIRPORT, TX US",5.0,12.8,8.0,0.0,,,...,,,,,,,,,,
105190,2024-12-31 23:00:00,Gas,1080.756969,"MIDLAND INTERNATIONAL AIRPORT, TX US",5.0,10.6,0.0,0.0,,,...,,,,,,,,,,


In [18]:
# Adjust the path if running from a different directory
df = pd.read_csv(output_path)

# Explicitly convert the time column to datetime
df['time'] = pd.to_datetime(df['time'])

# Now proceed as before
print("Earliest timestamp:", df['time'].min())
print("Latest timestamp:", df['time'].max())

df_sorted = df.sort_values('time')
intervals = df_sorted['time'].diff().dropna()
print("Most common interval:", intervals.mode()[0])

expected_hours = pd.date_range(df['time'].min(), df['time'].max(), freq='H')
missing_hours = set(expected_hours) - set(df['time'])
print(f"Number of missing hourly timestamps: {len(missing_hours)}")

Earliest timestamp: 2022-01-01 01:00:00
Latest timestamp: 2024-12-31 23:00:00
Most common interval: 0 days 00:00:00
Number of missing hourly timestamps: 5


  df = pd.read_csv(output_path)
  expected_hours = pd.date_range(df['time'].min(), df['time'].max(), freq='H')
