In [2]:
from dotenv import load_dotenv
import pandas as pd
import os
from pymongo import MongoClient

# Load variables via python-dotenv, then read the MongoDB connection string
# from the process environment (None when the variable is not set).
load_dotenv()
cluster_uri = os.getenv("MONGODB_URI")

# Open the client and select the project database used by every later cell.
client = MongoClient(cluster_uri)
db = client.get_database("MSCI446_DB")

### This module is for loading the data and transforming them into dataframes

In [51]:
# This cell pulls every document from each MongoDB collection and converts
# the result into a dataframe.


def load_collection_frame(database, collection_name):
    """Fetch all documents of one collection and build a dataframe from them.

    Parameters
    ----------
    database
        Object indexable by collection name (here the `db` handle).
    collection_name : str
        Name of the collection to read.

    Returns
    -------
    tuple
        ``(collection, documents, frame)``: the collection handle, the list of
        raw documents returned by ``find()``, and a dataframe built from those
        documents with the '_id' column removed.
    """
    collection = database[collection_name]
    documents = list(collection.find())
    # errors="ignore": an empty collection yields a frame with no columns, and
    # an unconditional drop of '_id' would raise KeyError in that case.
    frame = pd.DataFrame(documents).drop(columns="_id", errors="ignore")
    return collection, documents, frame


collection_forecast, documents_forecast, df_forecast = load_collection_frame(db, "Load_Forecast")
collection_solar, documents_solar, df_solar = load_collection_frame(db, "Solar_Forecast")
collection_wind, documents_wind, df_wind = load_collection_frame(db, "Wind_Forecast")
collection_gas_price, documents_gas_price, df_gas_price = load_collection_frame(db, "Gas_Prices")
collection_gen_outages, documents_gen_outages, df_gen_outages = load_collection_frame(db, "Gen_Outages")
collection_historical_da_price, documents_historical_da_price, df_historical_da_price = load_collection_frame(
    db, "Historical_DA_Prices"
)

# Preview the first rows of every dataframe
for preview_frame in (
    df_forecast,
    df_solar,
    df_wind,
    df_gas_price,
    df_gen_outages,
    df_historical_da_price,
):
    print(preview_frame.head())

### This module is for getting unique areas from each dataframe

In [None]:
# Needs df_solar, df_wind, df_forecast and df_historical_da_price to exist
# already (with the '_id' column dropped).

# Distinct area / zone labels found in each dataframe
unique_areas_solar = df_solar.loc[:, 'area'].unique()
unique_areas_wind = df_wind.loc[:, 'area'].unique()
unique_areas_load_forecast = df_forecast.loc[:, 'forecast_area'].unique()
unique_areas_historical_da_price = df_historical_da_price.loc[:, 'zone'].unique()

# Report them, one line per table
for label, areas in (
    ("Unique areas in Solar Forecast table:", unique_areas_solar),
    ("Unique areas in Wind Forecast table:", unique_areas_wind),
    ("Unique areas in Load Forecast:", unique_areas_load_forecast),
    ("Unique areas in Historical DA Price:", unique_areas_historical_da_price),
):
    print(label, areas)


### This module is for preparing the gas_price dataset

In [50]:
# Parse the 'Date' column as datetimes. Re-assigning an already-parsed column
# is harmless, so this cell can be re-run.
# NOTE(review): the output recorded for this cell is "KeyError: 'Date'", i.e.
# df_gas_price had no 'Date' column when it last ran -- presumably stale kernel
# state from an earlier in-place set_index; re-run the loading cell and confirm
# the column name.
df_gas_price['Date'] = pd.to_datetime(df_gas_price['Date'])

# Keep only the rows dated 2021 or later
gas_price_from_2021 = df_gas_price[df_gas_price['Date'].dt.year >= 2021]

print(gas_price_from_2021.head())

# Use the date as the index. set_index is called WITHOUT inplace=True: the
# in-place form returns None, which is what the previous version stored.
# The index is sorted because forward-filling during the upsample needs a
# monotonic index.
df_gas_price_index = gas_price_from_2021.set_index('Date').sort_index()

# Upsample the filtered, date-indexed frame (not the raw df_gas_price, which
# has no datetime index) to hourly rows and carry each price forward.
# A Timedelta is passed instead of the 'H' alias, which newer pandas deprecates.
df_gas_price_hourly = df_gas_price_index.resample(pd.Timedelta(hours=1)).ffill()

print(df_gas_price_hourly)

KeyError: 'Date'

### This module is for data cleansing and preparation before merging into one large dataframe

In [31]:
# Give the date-time column the same name in every dataframe so they share a
# common merge key. rename() returns a new frame; nothing is mutated in place,
# and a second run is a no-op because the old column names no longer exist.
df_forecast = df_forecast.rename(columns={'forecast_hour_beginning_ept': 'datetime_beginning_ept'})
df_gen_outages = df_gen_outages.rename(columns={'forecast_date': 'datetime_beginning_ept'})

# The gen_outages dataframe has several rows per date; collapse them into one
# row per date by averaging their values.
# numeric_only=True: only numeric columns can be averaged; without it pandas
# >= 2.0 raises TypeError as soon as the frame contains a non-numeric column.
# NOTE(review): any non-numeric column is therefore dropped here -- confirm
# none of them is needed after the merge.
# groupby() already leaves 'datetime_beginning_ept' as the index, which is
# what the merge needs, so no reset_index()/set_index() round trip is required.
df_gen_outages = df_gen_outages.groupby('datetime_beginning_ept').mean(numeric_only=True)

### This module is for pivoting dataframes and changing column headings for merge

In [None]:
# Duplicate (timestamp, area) rows are averaged rather than dropped; this leaves values unchanged only if the duplicates are exact copies
def _mean_pivot(frame, column_key, value_key):
    """Pivot `frame` to one row per 'datetime_beginning_ept' and one column per
    distinct `column_key` value, averaging `value_key` over duplicate rows."""
    return frame.pivot_table(
        index='datetime_beginning_ept',
        columns=column_key,
        values=value_key,
        aggfunc='mean',
    )


def _with_suffix(pivot, suffix):
    """Return `pivot` with `suffix` appended to every column label except
    'datetime_beginning_ept'."""
    renames = {}
    for label in pivot.columns:
        if label != 'datetime_beginning_ept':
            renames[label] = f"{label}{suffix}"
    return pivot.rename(columns=renames)


# Duplicate rows are averaged rather than dropped.
wind_pivot = _mean_pivot(df_wind, 'area', 'wind_generation_mw')
solar_pivot = _mean_pivot(df_solar, 'area', 'solar_generation_mw')

# The load-forecast and day-ahead price pivots get a suffix on their column
# labels so they stay distinguishable from the other tables' columns.
load_forecast_pivot = _with_suffix(
    _mean_pivot(df_forecast, 'forecast_area', 'forecast_load_mw'),
    "_forecast",
)
historical_da_pivot = _with_suffix(
    _mean_pivot(df_historical_da_price, 'zone', 'total_lmp_da'),
    "_historical_da",
)

### This module merges the dataframes into one

In [None]:
# Join the pivoted tables on their shared timestamp key, one table at a time.
# No `how` is given, so every step uses merge's default join type.
merge_key = 'datetime_beginning_ept'

# Solar and wind columns get a suffix where their labels collide.
generation_df = solar_pivot.merge(wind_pivot, on=merge_key, suffixes=('_solar', '_wind'))
generation_and_load_df = generation_df.merge(load_forecast_pivot, on=merge_key)
merged_df = generation_and_load_df.merge(historical_da_pivot, on=merge_key)

# Preview the result before the generation outages are joined in
print(merged_df.head())

merged_df = merged_df.merge(df_gen_outages, on=merge_key)


                        MIDATL_solar  OTHER_solar  RFC_solar  RTO_solar  \
datetime_beginning_ept                                                    
1/1/2023 10:00:00 AM         513.637       57.640    702.862   2316.989   
1/1/2023 10:00:00 PM          -2.141       -0.037     -2.464     -6.803   
1/1/2023 11:00:00 AM         587.103       72.002    801.416   2397.184   
1/1/2023 11:00:00 PM          -1.144       -0.037     -2.467     -6.680   
1/1/2023 12:00:00 AM          -2.147       -0.037     -2.478     -6.621   

                        SOUTH_solar  WEST_solar  MIDATL_wind  OTHER_wind  \
datetime_beginning_ept                                                     
1/1/2023 10:00:00 AM       1614.127     189.225      215.613       3.922   
1/1/2023 10:00:00 PM         -4.339      -0.323      456.294       4.335   
1/1/2023 11:00:00 AM       1595.768     214.313      190.061       3.754   
1/1/2023 11:00:00 PM         -4.213      -1.323      417.703       4.329   
1/1/2023 12:00:00 