In [3]:
import pandas as pd
import os
import numpy as np
from datetime import datetime
import re

In [None]:
# Step 1: create a function to extract the ISO name from the filename
# Step 2: create a function to read and preprocess data from a single ISO CSV file
# Step 3: create a function to consolidate data from multiple ISO CSV files
# Step 4: create a function to convert the consolidated data from long to wide format
# Step 5: create a function to save both long and wide format datasets

def extract_iso_name(filename):
    """Extract the ISO name from the filename"""
    match = re.match(r'([A-Z]+)_?.*\.csv', filename)
    if match:
        return match.group(1)
    else:
        # Handle special cases like Ohio
        if filename.startswith('OHIO'):
            return 'OHIO'
        elif filename.startswith('ILLINOIS'):
            return 'MISO'
        return filename.split('.')[0]

def read_and_preprocess_iso_data(filepath):
    """Read and preprocess data from a single ISO CSV file"""
    # Read the CSV file
    df = pd.read_csv(filepath)
    
    # Clean up column names (remove whitespace, lowercase)
    df.columns = [col.strip().lower() for col in df.columns]
    
    # Ensure critical columns are present
    required_cols = ['interval_start_utc', 'location', 'lmp']
    if not all(col in df.columns for col in required_cols):
        missing = [col for col in required_cols if col not in df.columns]
        raise ValueError(f"Missing required columns: {missing} in file {filepath}")
    
    # Convert timestamps to datetime
    df['interval_start_utc'] = pd.to_datetime(df['interval_start_utc'])
    
    # Extract the ISO name from the filepath
    iso_name = extract_iso_name(os.path.basename(filepath))
    df['iso'] = iso_name
    
    # Select and rename columns for consistency
    cols_to_keep = ['interval_start_utc', 'iso', 'location', 'lmp']
    
    # Add optional columns if they exist
    optional_cols = ['energy', 'congestion', 'loss']
    for col in optional_cols:
        if col in df.columns:
            cols_to_keep.append(col)
    
    # Drop rows with missing values in critical columns
    df = df.dropna(subset=['interval_start_utc', 'location', 'lmp'])
    
    return df[cols_to_keep]

def consolidate_iso_data(file_list):
    """Consolidate data from multiple ISO CSV files"""
    dfs = []
    
    for file in file_list:
        try:
            print(f"Processing {file}...")
            df = read_and_preprocess_iso_data(file)
            dfs.append(df)
            print(f"Added {len(df)} rows from {file}")
        except Exception as e:
            print(f"Error processing {file}: {e}")
    
    # Combine all dataframes
    if not dfs:
        raise ValueError("No data was successfully processed")
    
    combined_df = pd.concat(dfs, ignore_index=True)

    # Fix ISO names
    combined_df['iso'] = combined_df['iso'].replace({'O': 'OHIO'})


    # Sort by timestamp and ISO
    combined_df = combined_df.sort_values(['interval_start_utc', 'iso', 'location'])
    
    return combined_df

def create_wide_format(df):
    """Convert the consolidated data from long to wide format"""
    # Create a column name by combining ISO and location
    df['iso_location'] = df['iso'] + '_' + df['location']
    
    # Pivot the dataframe to get timestamps as rows and ISO_location as columns
    pivot_df = df.pivot(index='interval_start_utc', columns='iso_location', values='lmp')
    
    # Reset index to make interval_start_utc a column
    pivot_df = pivot_df.reset_index()
    
    return pivot_df

def save_outputs(df_long, df_wide, output_dir='.'):
    """Save both long and wide format datasets"""
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Generate timestamp for filenames
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Save long format (all data)
    long_path = os.path.join(output_dir, f'consolidated_iso_lmp_long_{timestamp}.csv')
    df_long.to_csv(long_path, index=False)
    
    # Save wide format (pivoted)
    wide_path = os.path.join(output_dir, f'consolidated_iso_lmp_wide_{timestamp}.csv')
    df_wide.to_csv(wide_path, index=False)
    
    # Save ISO summary statistics
    iso_stats = df_long.groupby('iso')['lmp'].agg([
        ('min', 'min'),
        ('max', 'max'), 
        ('mean', 'mean'),
        ('median', 'median'),
        ('std', 'std'),
        ('count', 'count')
    ]).reset_index()
    
    stats_path = os.path.join(output_dir, f'iso_lmp_statistics_{timestamp}.csv')
    iso_stats.to_csv(stats_path, index=False)
    
    return {
        'long_format': long_path,
        'wide_format': wide_path,
        'statistics': stats_path
    }

def main():
    """Main function to process ISO data files"""
    # List of ISO files to process
    iso_files = [
        'CAISO_LMP.csv',
        'ERCOT_LMP.csv',
        'ILLINOIS_MISO_LMP.csv',
        'ISONE_LMP.csv',
        'NYISO_LMP.csv',
        'Ohio_LMP.csv',
        'PJM_LMP.csv',
        'SPP_LMP.csv'
    ]
    
    # Verify all files exist
    missing_files = [f for f in iso_files if not os.path.exists(f)]
    if missing_files:
        print(f"Warning: The following files are missing: {missing_files}")
        iso_files = [f for f in iso_files if os.path.exists(f)]
    
    # Process the data
    print(f"Processing {len(iso_files)} ISO data files...")
    df_long = consolidate_iso_data(iso_files)
    print(f"Combined dataset has {len(df_long)} rows and {df_long['iso'].nunique()} ISOs")
    
    # Generate time interval statistics
    time_stats = df_long.groupby('iso')['interval_start_utc'].agg([
        ('min_time', 'min'),
        ('max_time', 'max'),
        ('count', 'count')
    ])
    print("\nTime range by ISO:")
    print(time_stats)
    
    # Create wide format
    print("\nCreating wide format dataset...")
    df_wide = create_wide_format(df_long)
    print(f"Wide format dataset has {len(df_wide)} rows and {len(df_wide.columns)} columns")
    
    # Save outputs
    output_paths = save_outputs(df_long, df_wide)
    print("\nFiles saved:")
    for format_type, path in output_paths.items():
        print(f"- {format_type}: {path}")

if __name__ == "__main__":
    main()

Processing 8 ISO data files...
Processing CAISO_LMP.csv...
Added 573 rows from CAISO_LMP.csv
Processing ERCOT_LMP.csv...
Added 576 rows from ERCOT_LMP.csv
Processing ILLINOIS_MISO_LMP.csv...
Added 573 rows from ILLINOIS_MISO_LMP.csv
Processing ISONE_LMP.csv...
Added 575 rows from ISONE_LMP.csv
Processing NYISO_LMP.csv...
Added 576 rows from NYISO_LMP.csv
Processing Ohio_LMP.csv...
Added 575 rows from Ohio_LMP.csv
Processing PJM_LMP.csv...
Added 575 rows from PJM_LMP.csv
Processing SPP_LMP.csv...
Added 562 rows from SPP_LMP.csv
Combined dataset has 4585 rows and 8 ISOs

Time range by ISO:
                          min_time                  max_time  count
iso                                                                
CAISO    2025-03-24 07:00:00+00:00 2025-03-26 06:55:00+00:00    573
ERCOT    2025-03-24 05:00:00+00:00 2025-03-26 04:55:00+00:00    576
ILLINOIS 2025-03-24 05:00:00+00:00 2025-03-26 04:55:00+00:00    573
ISONE    2025-03-24 04:00:00+00:00 2025-03-26 03:55:00+00:00    5