In [3]:
import pandas as pd

# Input and output CSV locations for this cell.
FUEL = 'hourly_load_forecast.csv'
WEATHER = 'test_data.csv'
OUTPUTPATH = 'outputtestdata.csv'

# Load the weather data and the fuel data.
weather_data = pd.read_csv(WEATHER)
fuel_data = pd.read_csv(FUEL)

# Parse the weather timestamps directly as UTC so they are comparable with the
# fuel timestamps. utc=True localizes naive values to UTC and converts
# offset-aware values to UTC; it also handles a column that mixes UTC offsets
# (e.g. local times spanning a DST change), which does not parse to a datetime
# dtype without it.
weather_data['datetime'] = pd.to_datetime(weather_data['datetime'], utc=True)

# merge_asof refuses null merge keys, so discard weather rows without a timestamp.
missing_weather_times = weather_data['datetime'].isna()
if missing_weather_times.any():
    print(f"Dropping {missing_weather_times.sum()} weather rows with missing 'datetime'")
    weather_data = weather_data[~missing_weather_times]

# Show the raw 'BeginDate' values for inspection.
print("BeginDate column before conversion:")
print(fuel_data['BeginDate'].head())

# Parse 'BeginDate' as UTC; values that cannot be parsed become NaT.
fuel_data['BeginDate'] = pd.to_datetime(fuel_data['BeginDate'], errors='coerce', utc=True)

# Show the parsed values to confirm the conversion worked.
print("BeginDate column after conversion:")
print(fuel_data['BeginDate'].head())

# Report rows whose 'BeginDate' could not be parsed, then drop them.
invalid_dates = fuel_data[fuel_data['BeginDate'].isna()]
if not invalid_dates.empty:
    print("Invalid 'BeginDate' entries found:")
    print(invalid_dates)
    # Take an explicit copy so the column added below is written to an
    # independent frame rather than to the result of a filtering operation.
    fuel_data = fuel_data.dropna(subset=['BeginDate']).copy()

# Round 'BeginDate' to the nearest hour; this is the key used for the merge.
fuel_data['rounded_hour'] = fuel_data['BeginDate'].dt.round('h')

# merge_asof requires both frames to be sorted by their merge key.
fuel_data = fuel_data.sort_values('rounded_hour')
weather_data = weather_data.sort_values('datetime')

# direction='backward' pairs each fuel row with the latest weather row whose
# 'datetime' is at or before the fuel row's rounded hour; fuel rows earlier than
# every weather row get missing weather values.
# NOTE(review): combineData in this notebook uses direction='nearest' — confirm
# that 'backward' is the intended matching rule here.
combined_data = pd.merge_asof(fuel_data, weather_data, left_on='rounded_hour', right_on='datetime', direction='backward')

# Drop the helper key (the matched weather 'datetime' column is kept) and
# restore chronological order by the original timestamp.
combined_data = combined_data.drop(columns=['rounded_hour'])
combined_data = combined_data.sort_values('BeginDate')

# Save the combined data to a new CSV file.
combined_data.to_csv(OUTPUTPATH, index=False)

print(f"Data combined and saved to {OUTPUTPATH}")

BeginDate column before conversion:
0    2024-12-05T00:00:00.000-05:00
1    2024-12-05T01:00:00.000-05:00
2    2024-12-05T02:00:00.000-05:00
3    2024-12-05T03:00:00.000-05:00
4    2024-12-05T04:00:00.000-05:00
Name: BeginDate, dtype: object
BeginDate column after conversion:
0   2024-12-05 05:00:00+00:00
1   2024-12-05 06:00:00+00:00
2   2024-12-05 07:00:00+00:00
3   2024-12-05 08:00:00+00:00
4   2024-12-05 09:00:00+00:00
Name: BeginDate, dtype: datetime64[ns, UTC]
Data combined and saved to outputtestdata.csv


In [5]:
import pandas as pd

def _merge_fuel_with_weather(fuel_data, weather_data):
    """Attach the nearest-in-time weather row to each fuel row.

    Args:
        fuel_data: DataFrame with a 'BeginDate' column of timestamps.
        weather_data: DataFrame with a 'datetime' column of timestamps.

    Returns:
        A new DataFrame holding the fuel columns followed by the weather
        columns (without the weather 'datetime' column), sorted by
        'BeginDate'. The input frames are not modified.

    Raises:
        Whatever ``pd.to_datetime`` raises for an unparseable weather
        timestamp; weather values are not coerced.
    """
    # utc=True localizes naive timestamps to UTC and converts offset-aware ones
    # to UTC. It also handles a column that mixes UTC offsets (e.g. local times
    # spanning a DST change), which does not parse to a datetime dtype without it.
    weather_data = weather_data.assign(
        datetime=pd.to_datetime(weather_data['datetime'], utc=True)
    )

    # merge_asof refuses null merge keys, so discard weather rows without a timestamp.
    missing_weather_times = weather_data['datetime'].isna()
    if missing_weather_times.any():
        print(f"Dropping {missing_weather_times.sum()} weather rows with missing 'datetime'")
        weather_data = weather_data[~missing_weather_times]

    # Parse 'BeginDate' as UTC; values that cannot be parsed become NaT.
    fuel_data = fuel_data.assign(
        BeginDate=pd.to_datetime(fuel_data['BeginDate'], errors='coerce', utc=True)
    )

    # Report rows whose 'BeginDate' could not be parsed, then drop them.
    invalid_dates = fuel_data[fuel_data['BeginDate'].isna()]
    if not invalid_dates.empty:
        print("Invalid 'BeginDate' entries found:")
        print(invalid_dates)
        fuel_data = fuel_data.dropna(subset=['BeginDate'])

    # Round 'BeginDate' to the nearest hour to build the merge key. assign()
    # returns a new frame, so nothing is written onto a filtered result.
    fuel_data = fuel_data.assign(rounded_hour=fuel_data['BeginDate'].dt.round('h'))

    # merge_asof requires both frames to be sorted by their merge key.
    fuel_data = fuel_data.sort_values('rounded_hour')
    weather_data = weather_data.sort_values('datetime')

    # For each fuel row pick the weather row whose 'datetime' is closest to the
    # rounded hour, in either direction.
    combined_data = pd.merge_asof(
        fuel_data,
        weather_data,
        left_on='rounded_hour',
        right_on='datetime',
        direction='nearest',
    )

    # Keep only 'BeginDate' as the time column and order rows chronologically.
    combined_data = combined_data.drop(columns=['rounded_hour', 'datetime'])
    return combined_data.sort_values('BeginDate')


def combineData(fuel_file, weather_file, output_file):
    """Merge a fuel CSV with a weather CSV by time and write the result as CSV.

    Each fuel row's 'BeginDate' is converted to UTC, rounded to the nearest
    hour and matched with the weather row whose 'datetime' (also converted to
    UTC) is nearest to that hour. Fuel rows with an unparseable 'BeginDate'
    and weather rows with a missing 'datetime' are reported on stdout and
    dropped.

    Args:
        fuel_file: Path of the fuel CSV; must contain a 'BeginDate' column.
        weather_file: Path of the weather CSV; must contain a 'datetime' column.
        output_file: Path the combined CSV is written to (without the index).

    Returns:
        None. The result is written to ``output_file``.
    """
    # Load the weather data CSV
    weather_data = pd.read_csv(weather_file)

    # Load the fuel mix data CSV
    fuel_data = pd.read_csv(fuel_file)

    combined_data = _merge_fuel_with_weather(fuel_data, weather_data)

    # Save the combined data to a new CSV file
    combined_data.to_csv(output_file, index=False)
    print(f"Data combined and saved to {output_file}")

# Example invocation: set the input/output CSV paths, then run the merge.
if __name__ == "__main__":
    FUEL = 'genfuelmix_aggregatedYear.csv'
    WEATHER = 'Year_weather.csv'
    OUTPUTPATH = 'fuelWeatherCombined.csv'

    # Arguments are passed by keyword so each path's role is explicit at the call site.
    combineData(
        fuel_file=FUEL,
        weather_file=WEATHER,
        output_file=OUTPUTPATH,
    )


Data combined and saved to fuelWeatherCombined.csv
