In [1]:
import pandas as pd
import os
import numpy as np
from datetime import datetime
import re
import sys

In [3]:
# Step 1: create a function to extract the ISO name from the filename
# Step 2: create a function to read and preprocess data from a single ISO CSV file
# Step 3: create a function to consolidate data from multiple ISO CSV files
# Step 4: create a function to convert the consolidated data from long to wide format
# Step 5: create a function to save both long and wide format datasets

def extract_iso_name(filename):
    """Extract the ISO name from the filename"""
    match = re.match(r'([A-Z]+)_?.*\.csv', filename)
    if match:
        return match.group(1)
    else:
        # Handle special cases like Ohio
        if filename.startswith('OHIO'):
            return 'OHIO'
        elif filename.startswith('ILLINOIS'):
            return 'MISO'
        return filename.split('.')[0]

def read_and_preprocess_iso_data(filepath):
    """Read and preprocess data from a single ISO CSV file"""
    # Read the CSV file
    df = pd.read_csv(filepath)
    
    # Clean up column names (remove whitespace, lowercase)
    df.columns = [col.strip().lower() for col in df.columns]
    
    # Ensure critical columns are present
    required_cols = ['interval_start_utc', 'location', 'lmp']
    if not all(col in df.columns for col in required_cols):
        missing = [col for col in required_cols if col not in df.columns]
        raise ValueError(f"Missing required columns: {missing} in file {filepath}")
    
    # Convert timestamps to datetime
    df['interval_start_utc'] = pd.to_datetime(df['interval_start_utc'])
    
    # Extract the ISO name from the filepath
    iso_name = extract_iso_name(os.path.basename(filepath))
    df['iso'] = iso_name
    
    # Select and rename columns for consistency
    cols_to_keep = ['interval_start_utc', 'iso', 'location', 'lmp']
    
    # Add optional columns if they exist
    optional_cols = ['energy', 'congestion', 'loss']
    for col in optional_cols:
        if col in df.columns:
            cols_to_keep.append(col)
    
    # Drop rows with missing values in critical columns
    df = df.dropna(subset=['interval_start_utc', 'location', 'lmp'])
    
    return df[cols_to_keep]

def consolidate_iso_data(file_list):
    """Consolidate data from multiple ISO CSV files"""
    dfs = []
    
    for file in file_list:
        try:
            print(f"Processing {file}...")
            df = read_and_preprocess_iso_data(file)
            dfs.append(df)
            print(f"Added {len(df)} rows from {file}")
        except Exception as e:
            print(f"Error processing {file}: {e}")
    
    # Combine all dataframes
    if not dfs:
        raise ValueError("No data was successfully processed")
    
    combined_df = pd.concat(dfs, ignore_index=True)

    # Fix ISO names
    combined_df['iso'] = combined_df['iso'].replace({'O': 'OHIO'})


    # Sort by timestamp and ISO
    combined_df = combined_df.sort_values(['interval_start_utc', 'iso', 'location'])
    
    return combined_df

def create_wide_format(df):
    """Convert the consolidated data from long to wide format"""
    # Create a column name by combining ISO and location
    df['iso_location'] = df['iso'] + '_' + df['location']
    
    # Pivot the dataframe to get timestamps as rows and ISO_location as columns
    pivot_df = df.pivot(index='interval_start_utc', columns='iso_location', values='lmp')
    
    # Reset index to make interval_start_utc a column
    pivot_df = pivot_df.reset_index()
    
    return pivot_df

def save_outputs(df_long, df_wide, output_dir='.'):
    """Save both long and wide format datasets"""
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Generate timestamp for filenames
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Save long format (all data)
    long_path = os.path.join(output_dir, f'consolidated_iso_lmp_long_{timestamp}.csv')
    df_long.to_csv(long_path, index=False)
    
    # Save wide format (pivoted)
    wide_path = os.path.join(output_dir, f'consolidated_iso_lmp_wide_{timestamp}.csv')
    df_wide.to_csv(wide_path, index=False)
    
    # Save ISO summary statistics
    iso_stats = df_long.groupby('iso')['lmp'].agg([
        ('min', 'min'),
        ('max', 'max'), 
        ('mean', 'mean'),
        ('median', 'median'),
        ('std', 'std'),
        ('count', 'count')
    ]).reset_index()
    
    stats_path = os.path.join(output_dir, f'iso_lmp_statistics_{timestamp}.csv')
    iso_stats.to_csv(stats_path, index=False)
    
    return {
        'long_format': long_path,
        'wide_format': wide_path,
        'statistics': stats_path
    }

def main():
    """Main function to process ISO data files"""
    # List of ISO files to process
    iso_files = [
        'CAISO_LMP.csv',
        'ERCOT_LMP.csv',
        'ILLINOIS_MISO_LMP.csv',
        'ISONE_LMP.csv',
        'NYISO_LMP.csv',
        'Ohio_LMP.csv',
        'PJM_LMP.csv',
        'SPP_LMP.csv'
        'LOUISIANA_LMP.csv',
        'MISO_LMP.csv',
        'NJ_LMP.csv',
    ]
    
    # Verify all files exist
    missing_files = [f for f in iso_files if not os.path.exists(f)]
    if missing_files:
        print(f"Warning: The following files are missing: {missing_files}")
        iso_files = [f for f in iso_files if os.path.exists(f)]
    
    # Process the data
    print(f"Processing {len(iso_files)} ISO data files...")
    df_long = consolidate_iso_data(iso_files)
    print(f"Combined dataset has {len(df_long)} rows and {df_long['iso'].nunique()} ISOs")
    
    # Generate time interval statistics
    time_stats = df_long.groupby('iso')['interval_start_utc'].agg([
        ('min_time', 'min'),
        ('max_time', 'max'),
        ('count', 'count')
    ])
    print("\nTime range by ISO:")
    print(time_stats)
    
    # Create wide format
    print("\nCreating wide format dataset...")
    df_wide = create_wide_format(df_long)
    print(f"Wide format dataset has {len(df_wide)} rows and {len(df_wide.columns)} columns")
    
    # Save outputs
    output_paths = save_outputs(df_long, df_wide)
    print("\nFiles saved:")
    for format_type, path in output_paths.items():
        print(f"- {format_type}: {path}")

if __name__ == "__main__":
    main()

Processing 8 ISO data files...
Processing CAISO_LMP.csv...
Added 576 rows from CAISO_LMP.csv
Processing ERCOT_LMP.csv...
Added 576 rows from ERCOT_LMP.csv
Processing ILLINOIS_MISO_LMP.csv...
Added 573 rows from ILLINOIS_MISO_LMP.csv
Processing ISONE_LMP.csv...
Added 575 rows from ISONE_LMP.csv
Processing NYISO_LMP.csv...
Added 576 rows from NYISO_LMP.csv
Processing Ohio_LMP.csv...
Added 575 rows from Ohio_LMP.csv
Processing PJM_LMP.csv...
Added 575 rows from PJM_LMP.csv
Processing NJ_LMP.csv...
Added 575 rows from NJ_LMP.csv
Combined dataset has 4601 rows and 8 ISOs

Time range by ISO:
                          min_time                  max_time  count
iso                                                                
CAISO    2025-03-24 07:00:00+00:00 2025-03-26 06:55:00+00:00    576
ERCOT    2025-03-24 05:00:00+00:00 2025-03-26 04:55:00+00:00    576
ILLINOIS 2025-03-24 05:00:00+00:00 2025-03-26 04:55:00+00:00    573
ISONE    2025-03-24 04:00:00+00:00 2025-03-26 03:55:00+00:00    575

In [5]:
def extract_iso_name(filename):
    """Extract the ISO name from the filename"""
    match = re.match(r'([A-Z]+)_?.*\.csv', filename)
    if match:
        return match.group(1)
    else:
        # Handle special cases like Ohio
        if filename.startswith('Ohio'):
            return 'OHIO'
        elif filename.startswith('ILLINOIS'):
            return 'MISO'
        elif filename.startswith('LOUISIANA'):
            return 'LOUISIANA'
        elif filename.startswith('NJ'):
            return 'NJ'
        return filename.split('.')[0]

def read_and_preprocess_iso_data(filepath):
    """Read and preprocess data from a single ISO CSV file"""
    try:
        # Read the CSV file
        df = pd.read_csv(filepath)
        
        # Clean up column names (remove whitespace, lowercase)
        df.columns = [col.strip().lower() for col in df.columns]
        
        # Ensure critical columns are present
        required_cols = ['interval_start_utc', 'location', 'lmp']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols} in file {filepath}")
        
        # Convert timestamps to datetime
        df['interval_start_utc'] = pd.to_datetime(df['interval_start_utc'])
        
        # Extract the ISO name from the filepath
        iso_name = extract_iso_name(os.path.basename(filepath))
        df['iso'] = iso_name
        
        # Replace null location values with a default value based on the ISO
        if df['location'].isna().any():
            print(f"Warning: {filepath} contains {df['location'].isna().sum()} rows with missing location values")
            default_location = f"{iso_name}_DEFAULT"
            df['location'] = df['location'].fillna(default_location)
        
        # Clean location strings (remove special characters, trim whitespace)
        df['location'] = df['location'].astype(str).str.strip()
        
        # Select and rename columns for consistency
        cols_to_keep = ['interval_start_utc', 'iso', 'location', 'lmp']
        
        # Add optional columns if they exist
        optional_cols = ['energy', 'congestion', 'loss']
        for col in optional_cols:
            if col in df.columns:
                cols_to_keep.append(col)
        
        # Drop rows with missing values in critical columns (except location which we fixed)
        df = df.dropna(subset=['interval_start_utc', 'lmp'])
        
        return df[cols_to_keep]
    
    except Exception as e:
        print(f"Error processing {filepath}: {str(e)}")
        raise

def consolidate_iso_data(file_list):
    """Consolidate data from multiple ISO CSV files"""
    dfs = []
    
    for file in file_list:
        try:
            if not os.path.exists(file):
                print(f"Warning: File not found: {file}")
                continue
                
            print(f"Processing {file}...")
            df = read_and_preprocess_iso_data(file)
            dfs.append(df)
            print(f"Added {len(df)} rows from {file}")
        except Exception as e:
            print(f"Error processing {file}: {str(e)}")
    
    # Combine all dataframes
    if not dfs:
        raise ValueError("No data was successfully processed")
    
    combined_df = pd.concat(dfs, ignore_index=True)
    
    # Sort by timestamp and ISO
    combined_df = combined_df.sort_values(['interval_start_utc', 'iso', 'location'])
    
    # Identify all unique time points and ISO-location combinations
    all_timestamps = combined_df['interval_start_utc'].unique()
    iso_locations = combined_df.groupby(['iso', 'location']).size().reset_index()[['iso', 'location']]
    
    # Check for missing time points in each ISO-location combination
    print("\nChecking for missing time points in each ISO-location combination...")
    missing_count = 0
    
    for _, row in iso_locations.iterrows():
        iso = row['iso']
        location = row['location']
        
        # Get timestamps for this ISO-location
        location_data = combined_df[(combined_df['iso'] == iso) & (combined_df['location'] == location)]
        location_times = set(location_data['interval_start_utc'])
        
        # Find missing timestamps
        missing_times = set(all_timestamps) - location_times
        
        if missing_times:
            missing_count += len(missing_times)
            print(f"  {iso}-{location}: Missing {len(missing_times)} time points out of {len(all_timestamps)}")
    
    print(f"Total missing time points across all ISO-location combinations: {missing_count}")
    
    return combined_df

def create_wide_format(df, fill_method='interpolate'):
    """
    Convert the consolidated data from long to wide format
    
    Parameters:
    -----------
    df : DataFrame
        Consolidated data in long format
    fill_method : str
        Method to fill missing values: 'interpolate', 'forward', 'backward', or 'none'
        
    Returns:
    --------
    DataFrame
        Data in wide format with timestamps as rows and ISO_location as columns
    """
    # Create a column name by combining ISO and location
    df['iso_location'] = df['iso'] + '_' + df['location']
    
    # Pivot the dataframe to get timestamps as rows and ISO_location as columns
    pivot_df = df.pivot(index='interval_start_utc', columns='iso_location', values='lmp')
    
    # Fill missing values based on the specified method
    if fill_method == 'interpolate':
        # Use linear interpolation to fill gaps
        pivot_df = pivot_df.interpolate(method='linear')
        
        # For any remaining NaNs at the beginning/end, use forward and backward fill
        pivot_df = pivot_df.fillna(method='ffill').fillna(method='bfill')
        
    elif fill_method == 'forward':
        # Use forward fill (last observation carried forward)
        pivot_df = pivot_df.fillna(method='ffill')
        
    elif fill_method == 'backward':
        # Use backward fill (next observation carried backward)
        pivot_df = pivot_df.fillna(method='bfill')
    
    # Reset index to make interval_start_utc a column
    pivot_df = pivot_df.reset_index()
    
    # Count how many missing values remain after filling
    missing_values = pivot_df.isna().sum().sum()
    if missing_values > 0:
        print(f"Warning: {missing_values} missing values remain in the wide format dataset after applying {fill_method} fill")
    
    return pivot_df

def save_outputs(df_long, df_wide, output_dir='.'):
    """Save both long and wide format datasets"""
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Generate timestamp for filenames
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Save long format (all data)
    long_path = os.path.join(output_dir, f'consolidated_iso_lmp_long_{timestamp}.csv')
    df_long.to_csv(long_path, index=False)
    
    # Save wide format (pivoted)
    wide_path = os.path.join(output_dir, f'consolidated_iso_lmp_wide_{timestamp}.csv')
    df_wide.to_csv(wide_path, index=False)
    
    # Save ISO summary statistics
    iso_stats = df_long.groupby('iso')['lmp'].agg([
        ('min', 'min'),
        ('max', 'max'), 
        ('mean', 'mean'),
        ('median', 'median'),
        ('std', 'std'),
        ('count', 'count')
    ]).reset_index()
    
    stats_path = os.path.join(output_dir, f'iso_lmp_statistics_{timestamp}.csv')
    iso_stats.to_csv(stats_path, index=False)
    
    # Save ISO-Location completeness report
    completeness_df = df_long.groupby(['iso', 'location']).agg(
        time_points=('interval_start_utc', 'count'),
        min_time=('interval_start_utc', 'min'),
        max_time=('interval_start_utc', 'max'),
        min_lmp=('lmp', 'min'),
        max_lmp=('lmp', 'max'),
        avg_lmp=('lmp', 'mean')
    ).reset_index()
    
    # Calculate expected number of time points (from min to max time with 5-min intervals)
    for idx, row in completeness_df.iterrows():
        min_time = row['min_time']
        max_time = row['max_time']
        
        # Calculate time difference in minutes and divide by 5 to get expected number of intervals
        time_diff_minutes = (max_time - min_time).total_seconds() / 60
        expected_points = time_diff_minutes / 5 + 1  # +1 because we include both endpoints
        
        completeness_df.loc[idx, 'expected_points'] = expected_points
        completeness_df.loc[idx, 'completeness_pct'] = min(100, (row['time_points'] / expected_points) * 100)
    
    completeness_path = os.path.join(output_dir, f'iso_location_completeness_{timestamp}.csv')
    completeness_df.to_csv(completeness_path, index=False)
    
    return {
        'long_format': long_path,
        'wide_format': wide_path,
        'statistics': stats_path,
        'completeness': completeness_path
    }

def main(fill_method='interpolate', output_dir='.'):
    """
    Main function to process ISO data files
    
    Parameters:
    -----------
    fill_method : str
        Method to fill missing values in wide format: 'interpolate', 'forward', 'backward', or 'none'
    output_dir : str
        Directory to save output files
    
    Returns:
    --------
    tuple
        (df_long, df_wide) DataFrames containing the consolidated data
    """
    # List of ISO files to process
    iso_files = [
        'CAISO_LMP.csv',
        'ERCOT_LMP.csv',
        'ILLINOIS_MISO_LMP.csv',
        'ISONE_LMP.csv',
        'NYISO_LMP.csv',
        'Ohio_LMP.csv',
        'PJM_LMP.csv',
        'SPP_LMP.csv',
        'LOUISIANA_LMP.csv',
        'NJ_LMP.csv'
    ]
    
    # Verify files exist and report missing files
    missing_files = [f for f in iso_files if not os.path.exists(f)]
    if missing_files:
        print(f"Warning: The following files are missing: {missing_files}")
        iso_files = [f for f in iso_files if os.path.exists(f)]
        
    if not iso_files:
        raise ValueError("No ISO data files found to process")
    
    # Process the data
    print(f"Processing {len(iso_files)} ISO data files...")
    df_long = consolidate_iso_data(iso_files)
    print(f"Combined dataset has {len(df_long)} rows and {df_long['iso'].nunique()} ISOs")
    
    # Generate time interval statistics
    time_stats = df_long.groupby('iso')['interval_start_utc'].agg([
        ('min_time', 'min'),
        ('max_time', 'max'),
        ('count', 'count')
    ])
    print("\nTime range by ISO:")
    print(time_stats)
    
    # Check for duplicate records
    duplicate_check = df_long.duplicated(subset=['interval_start_utc', 'iso', 'location']).sum()
    if duplicate_check > 0:
        print(f"\nWarning: Found {duplicate_check} duplicate records. Removing duplicates...")
        df_long = df_long.drop_duplicates(subset=['interval_start_utc', 'iso', 'location'])
    
    # Create wide format
    print(f"\nCreating wide format dataset with '{fill_method}' fill method...")
    df_wide = create_wide_format(df_long, fill_method=fill_method)
    print(f"Wide format dataset has {len(df_wide)} rows and {len(df_wide.columns)} columns")
    
    # Check data completeness in wide format
    missing_pct = (df_wide.isnull().sum().sum() / (df_wide.shape[0] * df_wide.shape[1])) * 100
    print(f"Data completeness in wide format: {100 - missing_pct:.2f}%")
    
    # Save outputs
    output_paths = save_outputs(df_long, df_wide, output_dir)
    print("\nFiles saved:")
    for format_type, path in output_paths.items():
        print(f"- {format_type}: {path}")
    
    return df_long, df_wide

def run_from_notebook(iso_files=None, fill_method='interpolate', output_dir='.'):
    """
    Function to run data consolidation from a Jupyter notebook
    
    Parameters:
    -----------
    iso_files : list
        List of ISO data files to process. If None, use default list
    fill_method : str
        Method to fill missing values in wide format: 'interpolate', 'forward', 'backward', or 'none'
    output_dir : str
        Directory to save output files
    
    Returns:
    --------
    tuple
        (df_long, df_wide) DataFrames containing the consolidated data
    """
    if iso_files is None:
        # Default ISO files list
        iso_files = [
            'CAISO_LMP.csv',
            'ERCOT_LMP.csv',
            'ILLINOIS_MISO_LMP.csv',
            'ISONE_LMP.csv',
            'NYISO_LMP.csv',
            'Ohio_LMP.csv',
            'PJM_LMP.csv',
            'SPP_LMP.csv',
            'LOUISIANA_LMP.csv',
            'NJ_LMP.csv'
        ]
    
    # Filter to only existing files
    existing_files = [f for f in iso_files if os.path.exists(f)]
    if not existing_files:
        raise ValueError("No specified ISO data files found")
    
    # Process the data
    print(f"Processing {len(existing_files)} ISO data files...")
    df_long = consolidate_iso_data(existing_files)
    
    # Create wide format
    df_wide = create_wide_format(df_long, fill_method=fill_method)
    
    # Save outputs
    output_paths = save_outputs(df_long, df_wide, output_dir)
    
    return df_long, df_wide

if __name__ == "__main__":
    import argparse
    import sys
    
      # Set up command line argument parsing
    parser = argparse.ArgumentParser(description='Process ISO LMP data files and create consolidated datasets')
    parser.add_argument('--fill', choices=['interpolate', 'forward', 'backward', 'none'], 
                        default='interpolate', help='Method to fill missing values in the wide format')
    parser.add_argument('--output', type=str, default='.', 
                        help='Directory to save output files')
    parser.add_argument('--files', nargs='+', type=str, 
                        help='Specific ISO files to process (optional, uses default list if not provided)')
    
    try:
        # When running in Jupyter, ignore sys.argv and use empty args
        if 'ipykernel_launcher' in sys.argv[0]:
            args = parser.parse_args([])
        else:
            # If run as a standalone script with command line arguments
            args = parser.parse_args()
            
        # Process specific files if provided
        if hasattr(args, 'files') and args.files:
            df_long, df_wide = run_from_notebook(args.files, args.fill, args.output)
        else:
            # Use the main function with default file list
            df_long, df_wide = main(fill_method=args.fill, output_dir=args.output)
    except Exception as e:
        print(f"Error: {str(e)}")
        # For environments where argparse might not be available
        print("Running with default parameters")
        df_long, df_wide = main()

Processing 10 ISO data files...
Processing CAISO_LMP.csv...
Added 576 rows from CAISO_LMP.csv
Processing ERCOT_LMP.csv...
Added 576 rows from ERCOT_LMP.csv
Processing ILLINOIS_MISO_LMP.csv...
Added 573 rows from ILLINOIS_MISO_LMP.csv
Processing ISONE_LMP.csv...
Added 575 rows from ISONE_LMP.csv
Processing NYISO_LMP.csv...
Added 576 rows from NYISO_LMP.csv
Processing Ohio_LMP.csv...
Added 575 rows from Ohio_LMP.csv
Processing PJM_LMP.csv...
Added 575 rows from PJM_LMP.csv
Processing SPP_LMP.csv...
Added 576 rows from SPP_LMP.csv
Processing LOUISIANA_LMP.csv...
Added 573 rows from LOUISIANA_LMP.csv
Processing NJ_LMP.csv...
Added 576 rows from NJ_LMP.csv

Checking for missing time points in each ISO-location combination...
  CAISO-TH_NP15_GEN-APND: Missing 36 time points out of 612
  ERCOT-HB_HOUSTON: Missing 36 time points out of 612
  ILLINOIS-ILLINOIS.HUB: Missing 39 time points out of 612
  ISONE-.H.INTERNAL_HUB: Missing 37 time points out of 612
  LOUISIANA-LOUISIANA.HUB: Missing 39 

# Data center location: 
    # \datacenter_info = {
    #     0: {"name": "us-central1", "location": "Iowa", "renewable": 0.95, "co2": 430},
    #     1: {"name": "us-east1", "location": "South Carolina", "renewable": 0.29, "co2": 560},
    #     2: {"name": "us-east4", "location": "Northern Virginia", "renewable": 0.52, "co2": 322},
    #     3: {"name": "us-east5", "location": "Columbus", "renewable": 0.52, "co2": 322},
    #     4: {"name": "us-south1", "location": "Dallas", "renewable": 0.79, "co2": 321},
    #     5: {"name": "us-west1", "location": "Oregon", "renewable": 0.84, "co2": 94},
    #     6: {"name": "us-west2", "location": "Los Angeles", "renewable": 0.55, "co2": 198},
    #     7: {"name": "us-west3", "location": "Salt Lake City", "renewable": 0.29, "co2": 588},
    #     8: {"name": "us-west4", "location": "Las Vegas", "renewable": 0.26, "co2": 373}
    # }

In [2]:
import pandas as pd
import os
import json
from difflib import get_close_matches
import re

class DataCenterISOMapper:
    """
    A class to map data centers to their corresponding ISO regions
    based on geographical information
    """
    
    def __init__(self):
        # State to ISO region mapping (primary)
        self.state_to_iso = {
            # CAISO (California Independent System Operator)
            "California": "CAISO",
            "CA": "CAISO",
            
            # ERCOT (Electric Reliability Council of Texas)
            "Texas": "ERCOT",
            "TX": "ERCOT",
            
            # MISO (Midcontinent Independent System Operator)
            "Iowa": "MISO",  
            "IA": "MISO",
            "Illinois": "MISO",
            "IL": "MISO",
            "Indiana": "MISO",
            "IN": "MISO",
            "Michigan": "MISO",
            "MI": "MISO",
            "Minnesota": "MISO",
            "MN": "MISO",
            "Louisiana": "MISO",
            "LA": "MISO",
            "Arkansas": "MISO",
            "AR": "MISO",
            "Mississippi": "MISO",
            "MS": "MISO",
            "Missouri": "MISO",
            "MO": "MISO",
            "North Dakota": "MISO",
            "ND": "MISO",
            "South Dakota": "MISO",
            "SD": "MISO",
            "Wisconsin": "MISO",
            "WI": "MISO",
            
            # ISO-NE (ISO New England)
            "Connecticut": "ISONE",
            "CT": "ISONE",
            "Maine": "ISONE",
            "ME": "ISONE",
            "Massachusetts": "ISONE",
            "MA": "ISONE",
            "New Hampshire": "ISONE",
            "NH": "ISONE",
            "Rhode Island": "ISONE",
            "RI": "ISONE",
            "Vermont": "ISONE",
            "VT": "ISONE",
            
            # NYISO (New York Independent System Operator)
            "New York": "NYISO",
            "NY": "NYISO",
            
            # PJM (Pennsylvania-New Jersey-Maryland Interconnection)
            "Delaware": "PJM",
            "DE": "PJM",
            "Kentucky": "PJM",
            "KY": "PJM",
            "Maryland": "PJM",
            "MD": "PJM",
            "New Jersey": "PJM",
            "NJ": "PJM",
            "North Carolina": "PJM",
            "NC": "PJM",
            "Ohio": "PJM",
            "OH": "PJM",
            "Pennsylvania": "PJM",
            "PA": "PJM",
            "Virginia": "PJM",
            "VA": "PJM",
            "West Virginia": "PJM",
            "WV": "PJM",
            "Washington DC": "PJM",
            "District of Columbia": "PJM",
            "DC": "PJM",
            
            # SPP (Southwest Power Pool)
            "Kansas": "SPP",
            "KS": "SPP",
            "Oklahoma": "SPP",
            "OK": "SPP",
            "Nebraska": "SPP",
            "NE": "SPP",
            "New Mexico": "SPP",
            "NM": "SPP",
            "Colorado": "SPP",
            "CO": "SPP",
            "Wyoming": "SPP",
            "WY": "SPP",
            
            # Southeast (not a formal ISO but grouped for reference)
            "Alabama": "SOUTHEAST",
            "AL": "SOUTHEAST",
            "Florida": "SOUTHEAST",
            "FL": "SOUTHEAST",
            "Georgia": "SOUTHEAST",
            "GA": "SOUTHEAST",
            "South Carolina": "SOUTHEAST",
            "SC": "SOUTHEAST",
            "Tennessee": "SOUTHEAST",
            "TN": "SOUTHEAST",
            
            # West (not covered by major ISOs)
            "Arizona": "WEST",
            "AZ": "WEST",
            "Idaho": "WEST",
            "ID": "WEST",
            "Montana": "WEST",
            "MT": "WEST",
            "Nevada": "WEST",
            "NV": "WEST",
            "Oregon": "WEST",
            "OR": "WEST",
            "Utah": "WEST",
            "UT": "WEST",
            "Washington": "WEST",
            "WA": "WEST",
        }
        
        # City to ISO region mapping (secondary, used when state mapping is ambiguous)
        self.city_to_iso = {
            # CAISO
            "Los Angeles": "CAISO",
            "San Francisco": "CAISO",
            "San Diego": "CAISO",
            "Sacramento": "CAISO",
            "San Jose": "CAISO",
            
            # ERCOT
            "Dallas": "ERCOT",
            "Houston": "ERCOT",
            "Austin": "ERCOT",
            "San Antonio": "ERCOT",
            "Fort Worth": "ERCOT",
            
            # MISO
            "Chicago": "MISO",
            "Detroit": "MISO",
            "Minneapolis": "MISO",
            "New Orleans": "MISO",
            "Des Moines": "MISO",
            
            # ISO-NE
            "Boston": "ISONE",
            "Providence": "ISONE",
            "Hartford": "ISONE",
            "Portland ME": "ISONE",
            
            # NYISO
            "New York City": "NYISO",
            "Buffalo": "NYISO",
            "Albany": "NYISO",
            
            # PJM
            "Philadelphia": "PJM",
            "Pittsburgh": "PJM",
            "Columbus": "PJM",
            "Cincinnati": "PJM",
            "Baltimore": "PJM",
            "Washington": "PJM",
            "Richmond": "PJM",
            "Northern Virginia": "PJM",
            
            # SPP
            "Kansas City": "SPP",
            "Oklahoma City": "SPP",
            "Omaha": "SPP",
            "Tulsa": "SPP",
            
            # West
            "Phoenix": "WEST",
            "Las Vegas": "WEST",
            "Salt Lake City": "WEST",
            "Portland OR": "WEST",
            "Seattle": "WEST",
            "Boise": "WEST",
            "Oregon": "WEST",
        }
        
        # Specific datacenter location override (for unique cases)
        self.datacenter_specific_mapping = {
            "us-west1": "CAISO",  # Oregon datacenter is actually in CAISO territory
            "us-west2": "CAISO",  # Los Angeles
            "us-west3": "WEST",   # Salt Lake City
            "us-west4": "WEST",   # Las Vegas
            "us-central1": "MISO", # Iowa
            "us-east1": "SOUTHEAST", # South Carolina
            "us-east4": "PJM",    # Northern Virginia
            "us-east5": "PJM",    # Columbus
            "us-south1": "ERCOT"  # Dallas
        }
        
        # ISO to price file mapping
        self.iso_to_price_file = {
            "CAISO": "CAISO_LMP.csv",
            "ERCOT": "ERCOT_LMP.csv",
            "MISO": "ILLINOIS_MISO_LMP.csv",  # Using Illinois MISO as default
            "ISONE": "ISONE_LMP.csv",
            "NYISO": "NYISO_LMP.csv",
            "PJM": "PJM_LMP.csv",
            "SPP": "SPP_LMP.csv",
            "LOUISIANA": "LOUISIANA_LMP.csv",
            "NJ": "NJ_LMP.csv",
            "OHIO": "Ohio_LMP.csv",
            "SOUTHEAST": "PJM_LMP.csv",  # Fallback to PJM for Southeast
            "WEST": "CAISO_LMP.csv"      # Fallback to CAISO for Western states
        }
        
        # State abbreviation to full name mapping (for reference)
        self.state_abbr_to_name = {
            "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas",
            "CA": "California", "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware",
            "FL": "Florida", "GA": "Georgia", "HI": "Hawaii", "ID": "Idaho",
            "IL": "Illinois", "IN": "Indiana", "IA": "Iowa", "KS": "Kansas",
            "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland",
            "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi",
            "MO": "Missouri", "MT": "Montana", "NE": "Nebraska", "NV": "Nevada",
            "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico", "NY": "New York",
            "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio", "OK": "Oklahoma",
            "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina",
            "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas", "UT": "Utah",
            "VT": "Vermont", "VA": "Virginia", "WA": "Washington", "WV": "West Virginia",
            "WI": "Wisconsin", "WY": "Wyoming", "DC": "District of Columbia"
        }
        
        # Initialize lookup map for more flexible searching
        self._initialize_lookup_maps()
    
    def _initialize_lookup_maps(self):
        """Initialize lookup maps for more flexible searching"""
        # Create case-insensitive maps
        self.state_lookup = {k.lower(): v for k, v in self.state_to_iso.items()}
        self.city_lookup = {k.lower(): v for k, v in self.city_to_iso.items()}
        self.datacenter_lookup = {k.lower(): v for k, v in self.datacenter_specific_mapping.items()}
    
    def get_iso_for_datacenter(self, datacenter_info):
        """
        Determine the ISO region for a datacenter based on its information
        
        Parameters:
        -----------
        datacenter_info : dict
            Dictionary containing datacenter information with at least 'name' and 'location' keys
            
        Returns:
        --------
        dict
            Dictionary with ISO information including 'iso_name', 'price_file', and 'match_type'
        """
        iso_name = None
        match_type = None
        
        # Try to get a direct match from datacenter name first (highest priority)
        datacenter_name = datacenter_info.get('name', '').lower()
        if datacenter_name in self.datacenter_lookup:
            iso_name = self.datacenter_lookup[datacenter_name]
            match_type = "datacenter_name"
        
        # If no match yet, try to extract location information
        if not iso_name:
            location = datacenter_info.get('location', '')
            
            # Check for direct state match
            state_match = self._find_state_match(location)
            if state_match:
                iso_name = self.state_to_iso.get(state_match)
                match_type = "state"
            
            # If no state match, try city match
            if not iso_name:
                city_match = self._find_city_match(location)
                if city_match:
                    iso_name = self.city_to_iso.get(city_match)
                    match_type = "city"
        
        # Default to PJM if no match found (most extensive coverage)
        if not iso_name:
            iso_name = "PJM"
            match_type = "default"
        
        # Get the appropriate price file
        price_file = self.iso_to_price_file.get(iso_name)
        
        return {
            "datacenter_name": datacenter_info.get('name', ''),
            "location": datacenter_info.get('location', ''),
            "iso_name": iso_name,
            "price_file": price_file,
            "match_type": match_type
        }
    
    def _find_state_match(self, location):
        """Find state match in a location string"""
        if not location:
            return None
            
        location_lower = location.lower()
        
        # Direct lookup
        if location_lower in self.state_lookup:
            return location
        
        # Check for state abbreviation or full name in string
        for abbr, name in self.state_abbr_to_name.items():
            if abbr.lower() in location_lower or name.lower() in location_lower:
                return abbr
        
        # Try fuzzy matching
        possible_states = list(self.state_to_iso.keys())
        matches = get_close_matches(location_lower, [s.lower() for s in possible_states], n=1, cutoff=0.6)
        
        if matches:
            # Find the original case version
            for state in possible_states:
                if state.lower() == matches[0]:
                    return state
        
        return None
    
    def _find_city_match(self, location):
        """Find city match in a location string"""
        if not location:
            return None
            
        location_lower = location.lower()
        
        # Direct lookup
        if location_lower in self.city_lookup:
            return location
        
        # Check for city name in string
        for city in self.city_to_iso.keys():
            if city.lower() in location_lower:
                return city
        
        # Try fuzzy matching
        possible_cities = list(self.city_to_iso.keys())
        matches = get_close_matches(location_lower, [c.lower() for c in possible_cities], n=1, cutoff=0.6)
        
        if matches:
            # Find the original case version
            for city in possible_cities:
                if city.lower() == matches[0]:
                    return city
        
        return None
    
    def map_datacenters(self, datacenter_dict):
        """
        Map all datacenters in a dictionary to their ISO regions
        
        Parameters:
        -----------
        datacenter_dict : dict
            Dictionary of datacenters with their information
            
        Returns:
        --------
        dict
            Dictionary mapping datacenter IDs to ISO information
        """
        result = {}
        
        for dc_id, dc_info in datacenter_dict.items():
            result[dc_id] = self.get_iso_for_datacenter(dc_info)
        
        return result
    
    def get_best_price_column(self, iso_name, price_df):
        """
        Determine the best price column to use from a given ISO's price dataframe
        
        Parameters:
        -----------
        iso_name : str
            Name of the ISO region
        price_df : DataFrame
            DataFrame containing price data for the ISO
            
        Returns:
        --------
        str
            Name of the most appropriate price column to use
        """
        # Default to 'lmp' column if available
        if 'lmp' in price_df.columns:
            return 'lmp'
        
        # Look for other price-related columns
        price_columns = [col for col in price_df.columns if any(
            term in col.lower() for term in ['price', 'cost', 'lmp', 'rate']
        )]
        
        if price_columns:
            return price_columns[0]
        
        # Last resort: return the first numeric column that's not a timestamp
        for col in price_df.columns:
            if price_df[col].dtype in ['int64', 'float64']:
                return col
        
        # If no suitable column found
        raise ValueError(f"No suitable price column found in price data for ISO: {iso_name}")
    
    def update_datacenter_dict_with_iso(self, datacenter_dict):
        """
        Update a datacenter dictionary with ISO region information
        
        Parameters:
        -----------
        datacenter_dict : dict
            Dictionary of datacenters with their information
            
        Returns:
        --------
        dict
            Updated dictionary with ISO information added to each datacenter
        """
        updated_dict = {}
        
        for dc_id, dc_info in datacenter_dict.items():
            # Create a copy of the original info
            updated_info = dc_info.copy()
            
            # Get ISO information
            iso_info = self.get_iso_for_datacenter(dc_info)
            
            # Add ISO information to datacenter info
            updated_info['iso_region'] = iso_info['iso_name']
            updated_info['iso_price_file'] = iso_info['price_file']
            
            # Add to updated dictionary
            updated_dict[dc_id] = updated_info
        
        return updated_dict

def map_google_cloud_datacenters():
    """Map the Google Cloud datacenters to their ISO regions"""
    # Google Cloud datacenter information
    datacenter_info = {
        0: {"name": "us-central1", "location": "Iowa", "renewable": 0.95, "co2": 430},
        1: {"name": "us-east1", "location": "South Carolina", "renewable": 0.29, "co2": 560},
        2: {"name": "us-east4", "location": "Northern Virginia", "renewable": 0.52, "co2": 322},
        3: {"name": "us-east5", "location": "Columbus", "renewable": 0.52, "co2": 322},
        4: {"name": "us-south1", "location": "Dallas", "renewable": 0.79, "co2": 321},
        5: {"name": "us-west1", "location": "Oregon", "renewable": 0.84, "co2": 94},
        6: {"name": "us-west2", "location": "Los Angeles", "renewable": 0.55, "co2": 198},
        7: {"name": "us-west3", "location": "Salt Lake City", "renewable": 0.29, "co2": 588},
        8: {"name": "us-west4", "location": "Las Vegas", "renewable": 0.26, "co2": 373}
    }
    
    # Create mapper and map datacenters
    mapper = DataCenterISOMapper()
    dc_iso_mapping = mapper.map_datacenters(datacenter_info)
    
    # Print results
    print("Google Cloud Datacenter to ISO Region Mapping:")
    print("-" * 80)
    print(f"{'ID':<4} {'Name':<15} {'Location':<20} {'ISO Region':<15} {'Price File':<20} {'Match Type':<15}")
    print("-" * 80)
    
    for dc_id, iso_info in dc_iso_mapping.items():
        print(f"{dc_id:<4} {iso_info['datacenter_name']:<15} {iso_info['location']:<20} "
              f"{iso_info['iso_name']:<15} {iso_info['price_file']:<20} {iso_info['match_type']:<15}")
    
    # Return both the original datacenter info and the mapping
    return datacenter_info, dc_iso_mapping

def get_electricity_price(datacenter_id, timestamp, datacenter_info, dc_iso_mapping, price_data_dir='.'):
    """
    Get the electricity price for a datacenter at a specific timestamp
    
    Parameters:
    -----------
    datacenter_id : int or str
        ID of the datacenter
    timestamp : datetime or str
        Timestamp to get price for
    datacenter_info : dict
        Dictionary of datacenter information
    dc_iso_mapping : dict
        Dictionary mapping datacenter IDs to ISO information
    price_data_dir : str
        Directory containing price data files
        
    Returns:
    --------
    float
        Electricity price for the datacenter at the specified timestamp
    """
    # Convert datacenter_id to string for lookup
    dc_id = str(datacenter_id)
    
    # Get ISO mapping for this datacenter
    iso_mapping = dc_iso_mapping.get(int(dc_id) if dc_id.isdigit() else dc_id)
    
    if not iso_mapping:
        raise ValueError(f"No ISO mapping found for datacenter ID: {datacenter_id}")
    
    # Get price file path
    price_file = iso_mapping.get('price_file')
    if not price_file:
        raise ValueError(f"No price file specified for datacenter ID: {datacenter_id}")
    
    price_file_path = os.path.join(price_data_dir, price_file)
    
    # Check if file exists
    if not os.path.exists(price_file_path):
        raise FileNotFoundError(f"Price file not found: {price_file_path}")
    
    # Load price data
    price_df = pd.read_csv(price_file_path)
    
    # Convert timestamp column to datetime
    timestamp_col = 'interval_start_utc'
    if timestamp_col in price_df.columns:
        price_df[timestamp_col] = pd.to_datetime(price_df[timestamp_col])
    
    # Convert input timestamp to datetime if it's a string
    if isinstance(timestamp, str):
        timestamp = pd.to_datetime(timestamp)
    
    # Get the best price column to use
    mapper = DataCenterISOMapper()
    price_col = mapper.get_best_price_column(iso_mapping['iso_name'], price_df)
    
    # Find the closest timestamp in the data
    closest_row = price_df.iloc[price_df[timestamp_col].sub(timestamp).abs().idxmin()]
    
    # Return the price
    return closest_row[price_col]

def save_mapping_to_json(datacenter_info, dc_iso_mapping, output_file='datacenter_iso_mapping.json'):
    """Save datacenter to ISO mapping to a JSON file"""
    # Create a combined dictionary with all information
    combined_data = {
        'datacenters': {},
        'iso_regions': {}
    }
    
    # Add datacenter information with ISO mapping
    for dc_id, dc_info in datacenter_info.items():
        dc_id_str = str(dc_id)
        iso_mapping = dc_iso_mapping.get(int(dc_id) if dc_id_str.isdigit() else dc_id)
        
        combined_data['datacenters'][dc_id_str] = {
            **dc_info,
            'iso_region': iso_mapping['iso_name'],
            'iso_price_file': iso_mapping['price_file'],
            'iso_match_type': iso_mapping['match_type']
        }
    
    # Add unique ISO regions information
    unique_isos = set(mapping['iso_name'] for mapping in dc_iso_mapping.values())
    for iso_name in unique_isos:
        combined_data['iso_regions'][iso_name] = {
            'price_file': next(mapping['price_file'] for mapping in dc_iso_mapping.values() 
                           if mapping['iso_name'] == iso_name),
            'datacenters': [dc_id for dc_id, mapping in dc_iso_mapping.items() 
                          if mapping['iso_name'] == iso_name]
        }
    
    # Save to JSON file
    with open(output_file, 'w') as f:
        json.dump(combined_data, f, indent=4)
    
    print(f"Mapping saved to {output_file}")
    
    return combined_data

if __name__ == "__main__":
    # Example usage
    datacenter_info, dc_iso_mapping = map_google_cloud_datacenters()
    
    # Save mapping to JSON
    combined_data = save_mapping_to_json(datacenter_info, dc_iso_mapping)
    
    # Example price lookup
    try:
        # Get price for us-central1 for a specific timestamp
        timestamp = "2025-03-24 10:00:00"
        dc_id = 0  # us-central1
        
        price = get_electricity_price(dc_id, timestamp, datacenter_info, dc_iso_mapping)
        print(f"\nElectricity price for {datacenter_info[dc_id]['name']} at {timestamp}: ${price:.2f}/MWh")
    except Exception as e:
        print(f"Error getting price: {str(e)}")

Google Cloud Datacenter to ISO Region Mapping:
--------------------------------------------------------------------------------
ID   Name            Location             ISO Region      Price File           Match Type     
--------------------------------------------------------------------------------
0    us-central1     Iowa                 MISO            ILLINOIS_MISO_LMP.csv datacenter_name
1    us-east1        South Carolina       SOUTHEAST       PJM_LMP.csv          datacenter_name
2    us-east4        Northern Virginia    PJM             PJM_LMP.csv          datacenter_name
3    us-east5        Columbus             PJM             PJM_LMP.csv          datacenter_name
4    us-south1       Dallas               ERCOT           ERCOT_LMP.csv        datacenter_name
5    us-west1        Oregon               CAISO           CAISO_LMP.csv        datacenter_name
6    us-west2        Los Angeles          CAISO           CAISO_LMP.csv        datacenter_name
7    us-west3        Salt Lake

In [4]:
import pandas as pd
import os
import json
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

class DataCenterPowerManager:
    """
    Class for managing data center power pricing and analysis
    """
    
    def __init__(self, datacenter_info=None, iso_mapping_file=None, price_data_dir='.'):
        """
        Initialize the DataCenterPowerManager
        
        Parameters:
        -----------
        datacenter_info : dict
            Dictionary containing datacenter information
        iso_mapping_file : str
            Path to the JSON file containing ISO mapping information
        price_data_dir : str
            Directory containing price data files
        """
        self.price_data_dir = price_data_dir
        
        # Initialize datacenter info and mappings
        if datacenter_info:
            self.datacenter_info = datacenter_info
            # Create mapper and generate ISO mappings
            self.mapper = DataCenterISOMapper()
            self.dc_iso_mapping = self.mapper.map_datacenters(self.datacenter_info)
        elif iso_mapping_file and os.path.exists(iso_mapping_file):
            # Load from mapping file
            with open(iso_mapping_file, 'r') as f:
                mapping_data = json.load(f)
            
            self.datacenter_info = {int(k) if k.isdigit() else k: v 
                                   for k, v in mapping_data['datacenters'].items()}
            
            # Extract ISO mappings
            self.dc_iso_mapping = {}
            for dc_id, dc_info in self.datacenter_info.items():
                dc_id_normalized = int(dc_id) if isinstance(dc_id, str) and dc_id.isdigit() else dc_id
                self.dc_iso_mapping[dc_id_normalized] = {
                    'datacenter_name': dc_info['name'],
                    'location': dc_info['location'],
                    'iso_name': dc_info['iso_region'],
                    'price_file': dc_info['iso_price_file'],
                    'match_type': dc_info.get('iso_match_type', 'loaded')
                }
            
            self.mapper = DataCenterISOMapper()
        else:
            raise ValueError("Either datacenter_info or iso_mapping_file must be provided")
        
        # Cache for price data
        self.price_data_cache = {}
    
    def load_price_data(self, iso_name, price_file):
        """
        Load price data for an ISO region
        
        Parameters:
        -----------
        iso_name : str
            Name of the ISO region
        price_file : str
            Name of the price file
            
        Returns:
        --------
        DataFrame
            DataFrame containing price data
        """
        # Check if data is already in cache
        cache_key = f"{iso_name}_{price_file}"
        if cache_key in self.price_data_cache:
            return self.price_data_cache[cache_key]
        
        # Load data
        price_file_path = os.path.join(self.price_data_dir, price_file)
        if not os.path.exists(price_file_path):
            raise FileNotFoundError(f"Price file not found: {price_file_path}")
        
        df = pd.read_csv(price_file_path)
        
        # Convert timestamp column to datetime
        timestamp_col = 'interval_start_utc'
        if timestamp_col in df.columns:
            df[timestamp_col] = pd.to_datetime(df[timestamp_col])
        
        # Store in cache
        self.price_data_cache[cache_key] = df
        
        return df
    
    def get_price(self, datacenter_id, timestamp):
        """
        Get the electricity price for a datacenter at a specific timestamp
        
        Parameters:
        -----------
        datacenter_id : int or str
            ID of the datacenter
        timestamp : datetime or str
            Timestamp to get price for
            
        Returns:
        --------
        float
            Electricity price for the datacenter at the specified timestamp
        """
        # Normalize datacenter_id
        dc_id = int(datacenter_id) if str(datacenter_id).isdigit() else datacenter_id
        
        # Get ISO mapping for this datacenter
        iso_mapping = self.dc_iso_mapping.get(dc_id)
        if not iso_mapping:
            raise ValueError(f"No ISO mapping found for datacenter ID: {datacenter_id}")
        
        # Get price file path
        price_file = iso_mapping.get('price_file')
        if not price_file:
            raise ValueError(f"No price file specified for datacenter ID: {datacenter_id}")
        
        # Load price data
        try:
            price_df = self.load_price_data(iso_mapping['iso_name'], price_file)
        except Exception as e:
            raise ValueError(f"Failed to load price data for datacenter {datacenter_id}: {str(e)}")
        
        # Convert input timestamp to datetime if it's a string
        if isinstance(timestamp, str):
            timestamp = pd.to_datetime(timestamp)
        
        # Get the best price column to use
        price_col = self.mapper.get_best_price_column(iso_mapping['iso_name'], price_df)
        
        # Find the timestamp column
        timestamp_col = 'interval_start_utc'
        if timestamp_col not in price_df.columns:
            # Try to find an alternative timestamp column
            timestamp_candidates = [col for col in price_df.columns 
                                   if any(term in col.lower() 
                                         for term in ['time', 'date', 'timestamp'])]
            if timestamp_candidates:
                timestamp_col = timestamp_candidates[0]
            else:
                raise ValueError(f"No timestamp column found in price data for ISO: {iso_mapping['iso_name']}")
        
        # Find the closest timestamp in the data
        closest_idx = price_df[timestamp_col].sub(timestamp).abs().idxmin()
        closest_row = price_df.iloc[closest_idx]
        
        # Return the price
        return closest_row[price_col]
    
    def get_price_series(self, datacenter_id, start_time, end_time):
        """
        Get a time series of electricity prices for a datacenter
        
        Parameters:
        -----------
        datacenter_id : int or str
            ID of the datacenter
        start_time : datetime or str
            Start timestamp
        end_time : datetime or str
            End timestamp
            
        Returns:
        --------
        DataFrame
            DataFrame containing timestamp and price data
        """
        # Normalize datacenter_id
        dc_id = int(datacenter_id) if str(datacenter_id).isdigit() else datacenter_id
        
        # Get ISO mapping for this datacenter
        iso_mapping = self.dc_iso_mapping.get(dc_id)
        if not iso_mapping:
            raise ValueError(f"No ISO mapping found for datacenter ID: {datacenter_id}")
        
        # Get price file
        price_file = iso_mapping.get('price_file')
        if not price_file:
            raise ValueError(f"No price file specified for datacenter ID: {datacenter_id}")
        
        # Load price data
        try:
            price_df = self.load_price_data(iso_mapping['iso_name'], price_file)
        except Exception as e:
            raise ValueError(f"Failed to load price data for datacenter {datacenter_id}: {str(e)}")
        
        # Convert input timestamps to datetime if they're strings
        if isinstance(start_time, str):
            start_time = pd.to_datetime(start_time)
        if isinstance(end_time, str):
            end_time = pd.to_datetime(end_time)
        
        # Get the best price column to use
        price_col = self.mapper.get_best_price_column(iso_mapping['iso_name'], price_df)
        
        # Find the timestamp column
        timestamp_col = 'interval_start_utc'
        if timestamp_col not in price_df.columns:
            # Try to find an alternative timestamp column
            timestamp_candidates = [col for col in price_df.columns 
                                   if any(term in col.lower() 
                                         for term in ['time', 'date', 'timestamp'])]
            if timestamp_candidates:
                timestamp_col = timestamp_candidates[0]
            else:
                raise ValueError(f"No timestamp column found in price data for ISO: {iso_mapping['iso_name']}")
        
        # Filter to the requested time range
        filtered_df = price_df[(price_df[timestamp_col] >= start_time) & 
                              (price_df[timestamp_col] <= end_time)].copy()
        
        # Select only the relevant columns
        result_df = filtered_df[[timestamp_col, price_col]]
        
        # Add datacenter information
        result_df['datacenter_id'] = dc_id
        result_df['datacenter_name'] = self.datacenter_info[dc_id]['name']
        result_df['location'] = self.datacenter_info[dc_id]['location']
        result_df['iso_region'] = iso_mapping['iso_name']
        
        return result_df
    
    def compare_datacenter_prices(self, datacenter_ids, timestamp):
        """
        Compare electricity prices across multiple datacenters at a specific timestamp
        
        Parameters:
        -----------
        datacenter_ids : list
            List of datacenter IDs to compare
        timestamp : datetime or str
            Timestamp to compare prices at
            
        Returns:
        --------
        DataFrame
            DataFrame containing price comparison
        """
        # Initialize results
        results = []
        
        # Get price for each datacenter
        for dc_id in datacenter_ids:
            try:
                price = self.get_price(dc_id, timestamp)
                
                # Add to results
                results.append({
                    'datacenter_id': dc_id,
                    'datacenter_name': self.datacenter_info[dc_id]['name'],
                    'location': self.datacenter_info[dc_id]['location'],
                    'iso_region': self.dc_iso_mapping[dc_id]['iso_name'],
                    'price': price,
                    'renewable': self.datacenter_info[dc_id].get('renewable', None),
                    'co2': self.datacenter_info[dc_id].get('co2', None)
                })
            except Exception as e:
                print(f"Warning: Failed to get price for datacenter {dc_id}: {str(e)}")
        
        # Convert to DataFrame
        if results:
            result_df = pd.DataFrame(results)
            
            # Sort by price (ascending)
            result_df = result_df.sort_values('price')
            
            return result_df
        else:
            return pd.DataFrame(columns=[
                'datacenter_id', 'datacenter_name', 'location', 'iso_region', 'price', 'renewable', 'co2'
            ])
    
    def find_cheapest_datacenter(self, timestamp):
        """
        Find the datacenter with the lowest electricity price at a specific timestamp
        
        Parameters:
        -----------
        timestamp : datetime or str
            Timestamp to check prices at
            
        Returns:
        --------
        dict
            Information about the cheapest datacenter
        """
        # Get price comparison for all datacenters
        comparison = self.compare_datacenter_prices(list(self.datacenter_info.keys()), timestamp)
        
        # Return the cheapest (first row after sorting)
        if len(comparison) > 0:
            cheapest = comparison.iloc[0].to_dict()
            return cheapest
        else:
            return None
    
    def find_optimal_datacenter(self, timestamp, price_weight=0.5, renewable_weight=0.3, co2_weight=0.2):
        """
        Find the optimal datacenter based on a weighted combination of price, renewable energy, and CO2 emissions
        
        Parameters:
        -----------
        timestamp : datetime or str
            Timestamp to check prices at
        price_weight : float
            Weight to assign to price factor (lower is better)
        renewable_weight : float
            Weight to assign to renewable energy factor (higher is better)
        co2_weight : float
            Weight to assign to CO2 emissions factor (lower is better)
            
        Returns:
        --------
        dict
            Information about the optimal datacenter
        """
        # Get price comparison for all datacenters
        comparison = self.compare_datacenter_prices(list(self.datacenter_info.keys()), timestamp)
        
        if len(comparison) == 0:
            return None
        
        # Normalize factors to 0-1 scale
        if comparison['price'].max() != comparison['price'].min():
            comparison['price_norm'] = 1 - ((comparison['price'] - comparison['price'].min()) / 
                                         (comparison['price'].max() - comparison['price'].min()))
        else:
            comparison['price_norm'] = 1
        
        if comparison['renewable'].max() != comparison['renewable'].min():
            comparison['renewable_norm'] = (comparison['renewable'] - comparison['renewable'].min()) / \
                                        (comparison['renewable'].max() - comparison['renewable'].min())
        else:
            comparison['renewable_norm'] = 1
        
        if comparison['co2'].max() != comparison['co2'].min():
            comparison['co2_norm'] = 1 - ((comparison['co2'] - comparison['co2'].min()) / 
                                       (comparison['co2'].max() - comparison['co2'].min()))
        else:
            comparison['co2_norm'] = 1
        
        # Calculate weighted score
        comparison['score'] = (comparison['price_norm'] * price_weight + 
                             comparison['renewable_norm'] * renewable_weight + 
                             comparison['co2_norm'] * co2_weight)
        
        # Sort by score (descending)
        comparison = comparison.sort_values('score', ascending=False)
        
        # Return the optimal datacenter
        optimal = comparison.iloc[0].to_dict()
        return optimal
    
    def plot_price_comparison(self, start_time, end_time, datacenter_ids=None, figsize=(12, 8)):
        """
        Plot electricity price comparison across datacenters
        
        Parameters:
        -----------
        start_time : datetime or str
            Start timestamp
        end_time : datetime or str
            End timestamp
        datacenter_ids : list
            List of datacenter IDs to compare (if None, use all)
        figsize : tuple
            Figure size
            
        Returns:
        --------
        matplotlib.figure.Figure
            The created figure
        """
        # Default to all datacenters if none specified
        if datacenter_ids is None:
            datacenter_ids = list(self.datacenter_info.keys())
        
        # Get price series for each datacenter
        all_data = []
        
        for dc_id in datacenter_ids:
            try:
                data = self.get_price_series(dc_id, start_time, end_time)
                all_data.append(data)
            except Exception as e:
                print(f"Warning: Failed to get price data for datacenter {dc_id}: {str(e)}")
        
        if not all_data:
            raise ValueError("No price data available for the specified datacenters and time range")
        
        # Create the plot
        fig, ax = plt.subplots(figsize=figsize)
        
        for data in all_data:
            # Get datacenter name and price column
            dc_name = data['datacenter_name'].iloc[0]
            timestamp_col = 'interval_start_utc'
            price_col = next(col for col in data.columns if col not in [
                'datacenter_id', 'datacenter_name', 'location', 'iso_region', 'interval_start_utc'
            ])
            
            # Plot price series
            ax.plot(data[timestamp_col], data[price_col], label=f"{dc_name} ({data['iso_region'].iloc[0]})")
        
        # Add labels and legend
        ax.set_xlabel('Time (UTC)', fontsize=12)
        ax.set_ylabel('Price ($/MWh)', fontsize=12)
        ax.set_title('Electricity Price Comparison by Data Center', fontsize=14)
        ax.grid(True, alpha=0.3)
        ax.legend()
        
        # Format x-axis to show dates nicely
        fig.autofmt_xdate()
        
        # Format y-axis to show currency
        ax.yaxis.set_major_formatter('${x:.2f}')
        
        plt.tight_layout()
        return fig
    
    def plot_iso_distribution_pie(self, figsize=(10, 8)):
        """
        Plot a pie chart showing the distribution of datacenters across ISO regions
        
        Parameters:
        -----------
        figsize : tuple
            Figure size
            
        Returns:
        --------
        matplotlib.figure.Figure
            The created figure
        """
        # Count datacenters by ISO region
        iso_counts = {}
        
        for dc_id, mapping in self.dc_iso_mapping.items():
            iso_name = mapping['iso_name']
            iso_counts[iso_name] = iso_counts.get(iso_name, 0) + 1
        
        # Create the plot
        fig, ax = plt.subplots(figsize=figsize)
        
        # Create pie chart
        wedges, texts, autotexts = ax.pie(
            iso_counts.values(), 
            labels=iso_counts.keys(),
            autopct='%1.1f%%',
            shadow=False,
            startangle=90
        )
        
        # Style the text
        for text in texts:
            text.set_fontsize(12)
        for autotext in autotexts:
            autotext.set_fontsize(10)
            autotext.set_color('white')
        
        ax.set_title('Distribution of Data Centers by ISO Region', fontsize=14)
        
        plt.tight_layout()
        return fig
    
    def save_mapping_to_json(self, output_file='datacenter_iso_mapping.json'):
        """Save datacenter to ISO mapping to a JSON file"""
        # Create a combined dictionary with all information
        combined_data = {
            'datacenters': {},
            'iso_regions': {}
        }
        
        # Add datacenter information with ISO mapping
        for dc_id, dc_info in self.datacenter_info.items():
            dc_id_str = str(dc_id)
            iso_mapping = self.dc_iso_mapping.get(int(dc_id) if dc_id_str.isdigit() else dc_id)
            
            combined_data['datacenters'][dc_id_str] = {
                **dc_info,
                'iso_region': iso_mapping['iso_name'],
                'iso_price_file': iso_mapping['price_file'],
                'iso_match_type': iso_mapping['match_type']
            }
        
        # Add unique ISO regions information
        unique_isos = set(mapping['iso_name'] for mapping in self.dc_iso_mapping.values())
        for iso_name in unique_isos:
            combined_data['iso_regions'][iso_name] = {
                'price_file': next(mapping['price_file'] for mapping in self.dc_iso_mapping.values() 
                               if mapping['iso_name'] == iso_name),
                'datacenters': [dc_id for dc_id, mapping in self.dc_iso_mapping.items() 
                              if mapping['iso_name'] == iso_name]
            }
        
        # Save to JSON file
        with open(output_file, 'w') as f:
            json.dump(combined_data, f, indent=4)
        
        print(f"Mapping saved to {output_file}")
        
        return combined_data
    
    def add_new_datacenter(self, dc_id, name, location, renewable=None, co2=None):
        """
        Add a new datacenter to the manager
        
        Parameters:
        -----------
        dc_id : int or str
            ID for the new datacenter
        name : str
            Name of the datacenter (e.g., "us-east1")
        location : str
            Location of the datacenter (e.g., "Virginia")
        renewable : float
            Renewable energy percentage (0-1)
        co2 : float
            CO2 emissions (g/kWh)
            
        Returns:
        --------
        dict
            Information about the newly added datacenter including ISO mapping
        """
        # Create datacenter info
        dc_info = {
            "name": name,
            "location": location
        }
        
        if renewable is not None:
            dc_info["renewable"] = renewable
        
        if co2 is not None:
            dc_info["co2"] = co2
        
        # Add to datacenter info dictionary
        self.datacenter_info[dc_id] = dc_info
        
        # Create ISO mapping
        iso_mapping = self.mapper.get_iso_for_datacenter(dc_info)
        self.dc_iso_mapping[dc_id] = iso_mapping
        
        return {
            "datacenter_id": dc_id,
            "info": dc_info,
            "iso_mapping": iso_mapping
        }
    
    def get_datacenter_summary(self):
        """
        Get a summary of all datacenters and their ISO mappings
        
        Returns:
        --------
        DataFrame
            Summary of all datacenters
        """
        # Initialize results
        results = []
        
        # Process each datacenter
        for dc_id, dc_info in self.datacenter_info.items():
            iso_mapping = self.dc_iso_mapping.get(dc_id)
            
            results.append({
                'datacenter_id': dc_id,
                'datacenter_name': dc_info['name'],
                'location': dc_info['location'],
                'iso_region': iso_mapping['iso_name'],
                'price_file': iso_mapping['price_file'],
                'match_type': iso_mapping['match_type'],
                'renewable': dc_info.get('renewable', None),
                'co2': dc_info.get('co2', None)
            })
        
        # Convert to DataFrame
        result_df = pd.DataFrame(results)
        
        # Sort by datacenter ID
        result_df = result_df.sort_values('datacenter_id')
        
        return result_df

In [5]:
import pandas as pd
import matplotlib.pyplot as plt

def main():
    """Example usage of the DataCenterPowerManager class"""
    print("Data Center Power Management Example\n")
    
    # Get Google Cloud datacenter information and ISO mapping
    print("Mapping Google Cloud datacenters to ISO regions...\n")
    datacenter_info, dc_iso_mapping = map_google_cloud_datacenters()
    
    # Create power manager
    power_manager = DataCenterPowerManager(datacenter_info=datacenter_info)
    
    # Display datacenter summary
    print("\nData Center Summary:")
    summary = power_manager.get_datacenter_summary()
    print(summary)
    
    # Example 1: Compare prices at a specific time
    timestamp = "2025-03-24 10:00:00"
    print(f"\nElectricity Price Comparison at {timestamp}:")
    comparison = power_manager.compare_datacenter_prices(list(datacenter_info.keys()), timestamp)
    print(comparison[['datacenter_name', 'location', 'iso_region', 'price', 'renewable', 'co2']])
    
    # Example 2: Find the cheapest datacenter
    cheapest = power_manager.find_cheapest_datacenter(timestamp)
    print(f"\nCheapest datacenter at {timestamp}:")
    print(f"  {cheapest['datacenter_name']} ({cheapest['location']}): ${cheapest['price']:.2f}/MWh")
    
    # Example 3: Find the optimal datacenter considering price, renewable energy, and CO2
    optimal = power_manager.find_optimal_datacenter(timestamp, 
                                                   price_weight=0.4, 
                                                   renewable_weight=0.4, 
                                                   co2_weight=0.2)
    print(f"\nOptimal datacenter at {timestamp} (balancing price, renewable energy, and CO2):")
    print(f"  {optimal['datacenter_name']} ({optimal['location']})")
    print(f"  Price: ${optimal['price']:.2f}/MWh")
    print(f"  Renewable: {optimal['renewable']*100:.1f}%")
    print(f"  CO2: {optimal['co2']} g/kWh")
    print(f"  Score: {optimal['score']:.3f}")
    
    # Example 4: Add a new datacenter
    print("\nAdding a new datacenter...")
    new_dc = power_manager.add_new_datacenter(
        dc_id=9,
        name="us-east-new",
        location="Atlanta, Georgia",
        renewable=0.45,
        co2=410
    )
    print(f"  Added {new_dc['info']['name']} in {new_dc['info']['location']}")
    print(f"  Mapped to ISO region: {new_dc['iso_mapping']['iso_name']}")
    
    # Example 5: Plot price comparison over time
    print("\nGenerating price comparison plot...")
    start_time = "2025-03-24 08:00:00"
    end_time = "2025-03-24 20:00:00"
    
    # Select a subset of datacenters to compare
    compare_dcs = [0, 4, 6]  # us-central1, us-south1, us-west2
    
    try:
        fig = power_manager.plot_price_comparison(start_time, end_time, compare_dcs)
        fig.savefig('datacenter_price_comparison.png')
        print("  Plot saved as 'datacenter_price_comparison.png'")
    except Exception as e:
        print(f"  Error generating plot: {str(e)}")
    
    # Example 6: Plot ISO distribution pie chart
    print("\nGenerating ISO distribution pie chart...")
    try:
        fig = power_manager.plot_iso_distribution_pie()
        fig.savefig('datacenter_iso_distribution.png')
        print("  Plot saved as 'datacenter_iso_distribution.png'")
    except Exception as e:
        print(f"  Error generating plot: {str(e)}")
    
    # Example 7: Get price time series for a specific datacenter
    print("\nGetting price time series for us-west2 (Los Angeles)...")
    try:
        dc_id = 6  # us-west2
        price_series = power_manager.get_price_series(dc_id, start_time, end_time)
        print(f"  Retrieved {len(price_series)} price points")
        print("  Sample of price data:")
        print(price_series.head())
    except Exception as e:
        print(f"  Error getting price series: {str(e)}")
    
    # Example 8: Save mapping to JSON for future use
    print("\nSaving datacenter ISO mapping to JSON...")
    try:
        output_file = 'gcp_datacenter_iso_mapping.json'
        power_manager.save_mapping_to_json(output_file)
        print(f"  Mapping saved to {output_file}")
    except Exception as e:
        print(f"  Error saving mapping: {str(e)}")
    
    # Example 9: Demonstration of handling missing data
    print("\nDemonstrating missing data handling:")
    try:
        # Intentionally use a timestamp that might be outside the data range
        future_time = "2025-03-27 10:00:00"
        print(f"  Attempting to get price for {future_time} (might be outside data range)")
        
        # Try to get price with error handling
        try:
            price = power_manager.get_price(0, future_time)
            print(f"  Successfully got price: ${price:.2f}/MWh")
        except Exception as e:
            print(f"  Handled error: {str(e)}")
            print("  Using nearest available timestamp instead...")
            
            # Get the timestamp range for this datacenter
            iso_name = power_manager.dc_iso_mapping[0]['iso_name']
            price_file = power_manager.dc_iso_mapping[0]['price_file']
            df = power_manager.load_price_data(iso_name, price_file)
            
            if 'interval_start_utc' in df.columns:
                last_time = df['interval_start_utc'].max()
                print(f"  Last available timestamp: {last_time}")
                
                # Get price for the last available timestamp
                price = power_manager.get_price(0, last_time)
                print(f"  Price at last available timestamp: ${price:.2f}/MWh")
    except Exception as e:
        print(f"  Error in demonstration: {str(e)}")
    
    print("\nExample complete.")

if __name__ == "__main__":
    main()        


Data Center Power Management Example

Mapping Google Cloud datacenters to ISO regions...

Google Cloud Datacenter to ISO Region Mapping:
--------------------------------------------------------------------------------
ID   Name            Location             ISO Region      Price File           Match Type     
--------------------------------------------------------------------------------
0    us-central1     Iowa                 MISO            ILLINOIS_MISO_LMP.csv datacenter_name
1    us-east1        South Carolina       SOUTHEAST       PJM_LMP.csv          datacenter_name
2    us-east4        Northern Virginia    PJM             PJM_LMP.csv          datacenter_name
3    us-east5        Columbus             PJM             PJM_LMP.csv          datacenter_name
4    us-south1       Dallas               ERCOT           ERCOT_LMP.csv        datacenter_name
5    us-west1        Oregon               CAISO           CAISO_LMP.csv        datacenter_name
6    us-west2        Los Angeles   

TypeError: 'NoneType' object is not subscriptable