In [3]:
from dotenv import load_dotenv
import pandas as pd
import os
from pymongo import MongoClient

# Read key/value pairs from a local .env file (if one exists) into the environment.
load_dotenv()

# The connection string is taken from the environment so it never lives in the notebook.
cluster_uri = os.environ.get("MONGODB_URI")
if not cluster_uri:
    # Without this guard MongoClient would receive None and fall back to its
    # default local host, so a missing variable would only surface later as a
    # confusing timeout or as empty collections.
    raise RuntimeError(
        "MONGODB_URI is not set; define it in the environment or in a .env file."
    )

client = MongoClient(cluster_uri)
db = client["MSCI446_DB"]

### This module loads the data and converts it into dataframes

In [4]:
# Build one DataFrame per MongoDB collection and preview each of them.

# Handles to the six collections of the database
collection_forecast = db["Load_Forecast"]
collection_solar = db["Solar_Forecast"]
collection_wind = db["Wind_Forecast"]
collection_gas_price = db["Gas_Prices"]
collection_gen_outages = db["Gen_Outages"]
collection_historical_da_price = db["Historical_DA_Prices"]

# Exhaust every cursor into a plain list (find() without a filter returns all documents)
documents_forecast = [*collection_forecast.find()]
documents_solar = [*collection_solar.find()]
documents_wind = [*collection_wind.find()]
documents_gas_price = [*collection_gas_price.find()]
documents_gen_outages = [*collection_gen_outages.find()]
documents_historical_da_price = [*collection_historical_da_price.find()]

# One DataFrame per collection, with the '_id' column removed
df_forecast = pd.DataFrame(documents_forecast).drop(columns='_id')
df_solar = pd.DataFrame(documents_solar).drop(columns='_id')
df_wind = pd.DataFrame(documents_wind).drop(columns='_id')
df_gas_price = pd.DataFrame(documents_gas_price).drop(columns='_id')
df_gen_outages = pd.DataFrame(documents_gen_outages).drop(columns='_id')
df_historical_da_price = pd.DataFrame(documents_historical_da_price).drop(columns='_id')

# Show the first rows of each frame, in the same order as they were loaded
for _preview in (
    df_forecast,
    df_solar,
    df_wind,
    df_gas_price,
    df_gen_outages,
    df_historical_da_price,
):
    print(_preview.head())

  forecast_hour_beginning_ept forecast_area  forecast_load_mw
0        1/1/2024 12:00:00 AM           AEP             14145
1         1/1/2024 1:00:00 AM           AEP             13908
2         1/1/2024 2:00:00 AM           AEP             13765
3         1/1/2024 3:00:00 AM           AEP             13788
4         1/1/2024 4:00:00 AM           AEP             13862
  datetime_beginning_ept    area  solar_generation_mw
0  2/28/2024 11:00:00 PM  MIDATL                  0.0
1  2/28/2024 11:00:00 PM   OTHER                  0.0
2  2/28/2024 11:00:00 PM     RFC                  0.0
3  2/28/2024 11:00:00 PM     RTO                  0.0
4  2/28/2024 11:00:00 PM   SOUTH                  0.0
   datetime_beginning_ept    area  wind_generation_mw
0  12/31/2020 11:00:00 PM  MIDATL             112.120
1  12/31/2020 11:00:00 PM   SOUTH             156.846
2  12/31/2020 11:00:00 PM    WEST            2130.528
3  12/31/2020 11:00:00 PM     RTO            2399.494
4  12/31/2020 11:00:00 PM     RFC 

### This module is for getting unique areas from each dataframe

In [5]:
# Distinct area / zone labels found in each dataset
unique_areas_solar = df_solar['area'].unique()
unique_areas_wind = df_wind['area'].unique()
unique_areas_load_forecast = df_forecast['forecast_area'].unique()
unique_areas_historical_da_price = df_historical_da_price['zone'].unique()

# Report them, one labelled line per dataset
for _label, _areas in (
    ("Unique areas in Solar Forecast table:", unique_areas_solar),
    ("Unique areas in Wind Forecast table:", unique_areas_wind),
    ("Unique areas in Load Forecast:", unique_areas_load_forecast),
    ("Unique areas in Historical DA Price:", unique_areas_historical_da_price),
):
    print(_label, _areas)

Unique areas in Solar Forecast table: ['MIDATL' 'OTHER' 'RFC' 'RTO' 'SOUTH' 'WEST']
Unique areas in Wind Forecast table: ['MIDATL' 'SOUTH' 'WEST' 'RTO' 'RFC' 'OTHER']
Unique areas in Load Forecast: ['AEP' 'APS' 'ATSI' 'COMED' 'DAY' 'DEOK' 'DOM' 'DUQ' 'EKPC' 'MIDATL' 'RTO']
Unique areas in Historical DA Price: ['DPL']


### This module renames the date-time column headings to a common datetime heading

In [6]:
# Give the date-time column of these three frames the same heading that the
# other frames already use, so every dataset can later be keyed on one name.
for _frame, _original_heading in (
    (df_forecast, 'forecast_hour_beginning_ept'),
    (df_gen_outages, 'forecast_date'),
    (df_gas_price, 'Date'),
):
    # Renamed in place, so the existing frame objects keep their identity
    _frame.rename(columns={_original_heading: 'datetime_beginning_ept'}, inplace=True)

### This module is for pivoting dataframes and changing column headings for merge

In [10]:
# Reshape each long table into a wide one: one row per timestamp string and one
# column per area / zone.
# aggfunc='mean' collapses duplicate (timestamp, area) rows into a single value
# instead of dropping them; per the original author the duplicates carry the
# same value, so averaging should leave it unchanged.
wind_pivot = df_wind.pivot_table(index='datetime_beginning_ept', columns='area', values='wind_generation_mw', aggfunc='mean')
solar_pivot = df_solar.pivot_table(index='datetime_beginning_ept', columns='area', values='solar_generation_mw', aggfunc='mean')
load_forecast_pivot = df_forecast.pivot_table(index='datetime_beginning_ept', columns='forecast_area', values='forecast_load_mw', aggfunc='mean')
historical_da_pivot = df_historical_da_price.pivot_table(index='datetime_beginning_ept', columns='zone', values='total_lmp_da', aggfunc='mean')

# Tag the load-forecast and day-ahead-price columns so they stay distinguishable
# after merging. 'datetime_beginning_ept' is the pivot index, never a column, so
# every column gets the suffix and no exclusion filter is needed.
load_forecast_pivot = load_forecast_pivot.add_suffix('_forecast')
historical_da_pivot = historical_da_pivot.add_suffix('_historical_da')

### This module is for preparing the gen_outages dataset

In [None]:
# Parse the date column so it can be grouped, filtered and resampled
df_gen_outages['datetime_beginning_ept'] = pd.to_datetime(df_gen_outages['datetime_beginning_ept'])

# Several rows can share the same date; collapse them to one row per date by averaging.
# numeric_only=True keeps the mean well-defined if the frame also carries
# non-numeric columns (the full schema is not visible in this notebook);
# pandas >= 2.0 raises a TypeError on such columns otherwise.
df_gen_outages = df_gen_outages.groupby('datetime_beginning_ept').mean(numeric_only=True).reset_index()

# Keep only the rows dated 2022 or later
df_gen_outages = df_gen_outages[df_gen_outages['datetime_beginning_ept'].dt.year >= 2022]

# Index by date, resample to hourly increments and forward fill the outage values
df_gen_outages_index = df_gen_outages.set_index('datetime_beginning_ept').resample('H').ffill()

# Turn the index back into a regular 'datetime_beginning_ept' column
df_gen_outages_index.reset_index(inplace=True)

# Convert the timestamps to strings; this string column is the merge key.
# NOTE(review): the format pairs 24-hour %H with %p, which yields values such as
# '19:00:00 PM'. It is kept unchanged because the other prepared frames use the
# same pattern for the column they are merged on.
df_gen_outages_index['datetime_beginning_ept'] = df_gen_outages_index['datetime_beginning_ept'].dt.strftime('%d/%m/%Y %H:%M:%S %p')

print(df_gen_outages_index.tail())

### This module is for preparing the gas_price dataset

In [None]:
# Parse the date column into real timestamps (written back onto the frame)
_gas_dates = pd.to_datetime(df_gas_price['datetime_beginning_ept'])
df_gas_price['datetime_beginning_ept'] = _gas_dates

# Discard everything dated before 2022
df_gas_price = df_gas_price[_gas_dates.dt.year >= 2022]

# Index by date, resample to hourly increments, forward fill the gas prices,
# then restore 'datetime_beginning_ept' as an ordinary column
df_gas_price_index = (
    df_gas_price
    .set_index('datetime_beginning_ept')
    .resample('H')
    .ffill()
    .reset_index()
)

# Store the timestamps as strings (the column later serves as the merge key)
df_gas_price_index['datetime_beginning_ept'] = (
    df_gas_price_index['datetime_beginning_ept'].dt.strftime('%d/%m/%Y %H:%M:%S %p')
)

# Print the resampled DataFrame to verify the results
print(df_gas_price_index)

### This module is for preparing load_forecast dataset

In [None]:
# Parse the pivot's string index into timestamps
load_forecast_pivot.index = pd.to_datetime(load_forecast_pivot.index)

# Average any rows of the load-forecast pivot that share a timestamp
load_forecast_pivot = load_forecast_pivot.groupby(level=0).mean()

# Keep only the rows dated 2022 or later
_is_2022_or_later = load_forecast_pivot.index.year >= 2022
load_forecast_pivot = load_forecast_pivot[_is_2022_or_later]

# Resample to hourly increments, forward fill the forecast values and turn the
# index back into a 'datetime_beginning_ept' column
df_forecast_resampled = load_forecast_pivot.resample('H').ffill().reset_index()

# Store the timestamps as strings (the column later serves as the merge key)
df_forecast_resampled['datetime_beginning_ept'] = (
    df_forecast_resampled['datetime_beginning_ept'].dt.strftime('%d/%m/%Y %H:%M:%S %p')
)

print(df_forecast_resampled)

### This module is for preparing the solar table

In [None]:
# Parse the pivot's string index into timestamps
solar_pivot.index = pd.to_datetime(solar_pivot.index)

# Average any rows of the solar pivot that share a timestamp. One pass is
# enough: the result has one row per timestamp, so repeating the same
# groupby-mean would return the same frame.
solar_pivot = solar_pivot.groupby(solar_pivot.index).mean()

# Keep only the rows dated 2022 or later
solar_pivot = solar_pivot[solar_pivot.index.year >= 2022]

# Resample to hourly increments and forward fill the solar generation values
df_solar_resampled = solar_pivot.resample('H').ffill()

# Turn the index back into a regular 'datetime_beginning_ept' column
df_solar_resampled.reset_index(inplace=True)

# Convert the timestamps to strings (the column later serves as the merge key)
df_solar_resampled['datetime_beginning_ept'] = df_solar_resampled['datetime_beginning_ept'].dt.strftime('%d/%m/%Y %H:%M:%S %p')

print(df_solar_resampled)

### This module is for preparing the wind table

In [None]:
# Parse the pivot's string index into timestamps
wind_pivot.index = pd.to_datetime(wind_pivot.index)

# Average any rows of the wind pivot that share a timestamp. One pass is
# enough: the result has one row per timestamp, so repeating the same
# groupby-mean would return the same frame.
wind_pivot = wind_pivot.groupby(wind_pivot.index).mean()

# Keep only the rows dated 2022 or later
wind_pivot = wind_pivot[wind_pivot.index.year >= 2022]

# Resample to hourly increments and forward fill the wind generation values
df_wind_resampled = wind_pivot.resample('H').ffill()

# Turn the index back into a regular 'datetime_beginning_ept' column
df_wind_resampled.reset_index(inplace=True)

# Convert the timestamps to strings (the column later serves as the merge key)
df_wind_resampled['datetime_beginning_ept'] = df_wind_resampled['datetime_beginning_ept'].dt.strftime('%d/%m/%Y %H:%M:%S %p')

print(df_wind_resampled)

### This module is for preparing the historical_da table

In [None]:
# Parse the pivot's string index into timestamps
historical_da_pivot.index = pd.to_datetime(historical_da_pivot.index)

# Average any rows of the day-ahead price pivot that share a timestamp. One pass
# is enough: the result has one row per timestamp, so repeating the same
# groupby-mean would return the same frame.
historical_da_pivot = historical_da_pivot.groupby(historical_da_pivot.index).mean()

# Keep only the rows dated 2022 or later
historical_da_pivot = historical_da_pivot[historical_da_pivot.index.year >= 2022]

# Resample to hourly increments and forward fill the day-ahead prices
df_historical_da_resampled = historical_da_pivot.resample('H').ffill()

# Turn the index back into a regular 'datetime_beginning_ept' column
df_historical_da_resampled.reset_index(inplace=True)

# Convert the timestamps to strings (the column later serves as the merge key)
df_historical_da_resampled['datetime_beginning_ept'] = df_historical_da_resampled['datetime_beginning_ept'].dt.strftime('%d/%m/%Y %H:%M:%S %p')

print(df_historical_da_resampled)

### This module merges the dataframes into one

In [49]:
# Start from solar joined with wind; both use the same area names as column
# headings, so the suffixes tell the two sources apart.
merged_df = df_solar_resampled.merge(
    df_wind_resampled,
    on='datetime_beginning_ept',
    suffixes=('_solar', '_wind'),
)

# Join the remaining prepared frames one after another on the shared
# date-time string column (default inner join, as for the first merge).
for _next_frame in (
    df_forecast_resampled,
    df_historical_da_resampled,
    df_gen_outages_index,
    df_gas_price_index,
):
    merged_df = merged_df.merge(_next_frame, on='datetime_beginning_ept')

print(merged_df)


      datetime_beginning_ept  MIDATL_solar  OTHER_solar  RFC_solar  RTO_solar  \
0     02/01/2023 00:00:00 AM        -1.117       -0.037     -1.448     -5.550   
1     02/01/2023 01:00:00 AM        -1.109       -0.038     -1.432     -5.646   
2     02/01/2023 02:00:00 AM        -1.147       -0.037     -1.467     -5.745   
3     02/01/2023 03:00:00 AM        -1.160       -0.037     -2.490     -6.762   
4     02/01/2023 04:00:00 AM        -1.116       -0.036     -1.444     -5.620   
...                      ...           ...          ...        ...        ...   
8731  31/12/2023 19:00:00 PM        -1.877        0.000     -2.824     -7.975   
8732  31/12/2023 20:00:00 PM        -1.853        0.000     -5.067    -10.247   
8733  31/12/2023 21:00:00 PM        -2.898        0.000     -4.362     -9.434   
8734  31/12/2023 22:00:00 PM        -2.912        0.000     -5.665    -10.761   
8735  31/12/2023 23:00:00 PM        -1.874        0.000     -3.646     -8.692   

      SOUTH_solar  WEST_sol