In [1]:
## Carbon intensity dataset consolidation
# This script consolidates carbon intensity data from various sources into a single CSV file.
import pandas as pd
import os
from datetime import datetime


In [5]:

def consolidate_carbon_intensity_data(input_files, output_file):
    """
    Consolidate carbon intensity data from multiple CSV files.
    
    This function:
    1. Extracts data only for March 24-25
    2. Keeps only life cycle carbon intensity
    3. Preserves zone names for region identification
    4. Sorts by datetime and zone name
    
    Parameters:
    ----------
    input_files : list
        List of input CSV file paths
    output_file : str
        Path to save the consolidated data
    
    Returns:
    -------
    pandas.DataFrame
        The consolidated dataframe
    """
    print("Starting data consolidation process...")
    
    # Process all files and concatenate results
    all_data = []
    for file_name in input_files:
        if os.path.exists(file_name):
            df = process_file(file_name)
            if not df.empty:
                all_data.append(df)
            else:
                print(f"No March 24-25 data found in {file_name}")
        else:
            print(f"Warning: File {file_name} not found. Skipping...")
    
    # Combine all dataframes
    if all_data:
        consolidated_df = pd.concat(all_data, ignore_index=True)
        
        # Sort by datetime and zone name
        consolidated_df = consolidated_df.sort_values(['Datetime (UTC)', 'Zone name'])
        
        # Save to output file
        consolidated_df.to_csv(output_file, index=False)
        print(f"Consolidated data saved to {output_file}")
        
        # Print summary statistics
        print("\nSummary statistics:")
        print(f"Total number of rows: {len(consolidated_df)}")
        print(f"Unique zones: {consolidated_df['Zone name'].nunique()}")
        zones = consolidated_df['Zone name'].unique()
        print(f"Zones included: {', '.join(zones)}")
        print(f"Date range: {consolidated_df['Datetime (UTC)'].min()} to {consolidated_df['Datetime (UTC)'].max()}")
        
        return consolidated_df
    else:
        print("No data was processed. Check if input files exist and contain March 24-25 data.")
        return pd.DataFrame()

def process_file(file_path):
    """
    Process a single CSV file to extract March 24-25 data with life cycle carbon intensity.
    
    Parameters:
    ----------
    file_path : str
        Path to the CSV file
    
    Returns:
    -------
    pandas.DataFrame
        Processed dataframe with filtered data
    """
    print(f"Processing {file_path}...")
    
    try:
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Convert date string to datetime
        df['Datetime (UTC)'] = pd.to_datetime(df['Datetime (UTC)'])
        
        # Filter for March 24-25 only
        march_24_25 = df[(df['Datetime (UTC)'].dt.month == 3) & 
                         (df['Datetime (UTC)'].dt.day >= 24) & 
                         (df['Datetime (UTC)'].dt.day <= 25)]
        
        # Select only the columns we need
        columns_to_keep = ['Datetime (UTC)', 'Zone name', 'Carbon intensity gCO₂eq/kWh (Life cycle)']
        filtered_df = march_24_25[columns_to_keep]
        
        return filtered_df
    
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return pd.DataFrame()

if __name__ == "__main__":
    # List of input CSV files
    input_files = [
        "US-MIDW-LGEE_2024_hourly.csv",
        "US-NE-ISNE_2024_hourly.csv",
        "US-NW-NEVP_2024_hourly.csv",
        "US-NY-NYIS_2024_hourly.csv",
        "US-SE-SEPA_2024_hourly.csv",
        "US-SW-AZPS_2024_hourly.csv",
        "US-NW-PGE_2024_hourly.csv",
        "US-CAL-CISO_2024_hourly.csv",
        "US-CAL-LDWP_2024_hourly.csv"
    ]
    
    # Output file name
    output_file = "consolidated_carbon_intensity_march24_25.csv"
    
    # Process and consolidate the data
    consolidated_df = consolidate_carbon_intensity_data(input_files, output_file)
    
    # Additional validation
    if not consolidated_df.empty:
        # Check for missing values
        missing_values = consolidated_df.isnull().sum()
        if missing_values.sum() > 0:
            print("\nWarning: Missing values detected in the consolidated data:")
            print(missing_values)
        else:
            print("\nNo missing values detected in the consolidated data.")

Starting data consolidation process...
Processing US-MIDW-LGEE_2024_hourly.csv...
Processing US-NE-ISNE_2024_hourly.csv...
Processing US-NW-NEVP_2024_hourly.csv...
Processing US-NY-NYIS_2024_hourly.csv...
Processing US-SE-SEPA_2024_hourly.csv...
Processing US-SW-AZPS_2024_hourly.csv...
Processing US-NW-PGE_2024_hourly.csv...
Processing US-CAL-CISO_2024_hourly.csv...
Processing US-CAL-LDWP_2024_hourly.csv...
Consolidated data saved to consolidated_carbon_intensity_march24_25.csv

Summary statistics:
Total number of rows: 432
Unique zones: 9
Zones included: Arizona Public Service Company, CAISO, ISO New England, Los Angeles Department of Water and Power, Louisville Gas and Electric Company and Kentucky Utilities, Nevada Power Company, New York ISO, Portland General Electric Company, Southeastern Power Administration
Date range: 2024-03-24 00:00:00 to 2024-03-25 23:00:00

Datetime (UTC)                              0
Zone name                                   0
Carbon intensity gCO₂eq/kW

In [1]:
import pandas as pd
import os
from datetime import datetime

def consolidate_carbon_intensity_data(input_files, output_file):
    """
    Consolidate carbon intensity data from multiple CSV files.
    
    This function:
    1. Extracts data only for March 24-25
    2. Keeps only life cycle carbon intensity
    3. Creates a wide format with each zone having its own column
    4. Sorts by datetime
    
    Parameters:
    ----------
    input_files : list
        List of input CSV file paths
    output_file : str
        Path to save the consolidated data
    
    Returns:
    -------
    pandas.DataFrame
        The consolidated dataframe
    """
    print("Starting data consolidation process...")
    
    # Process all files and concatenate results
    all_data = []
    for file_name in input_files:
        if os.path.exists(file_name):
            df = process_file(file_name)
            if not df.empty:
                all_data.append(df)
            else:
                print(f"No March 24-25 data found in {file_name}")
        else:
            print(f"Warning: File {file_name} not found. Skipping...")
    
    # Combine all dataframes
    if all_data:
        # Concatenate all data vertically
        long_df = pd.concat(all_data, ignore_index=True)
        
        # Pivot the data to wide format: each zone becomes a column
        wide_df = long_df.pivot(
            index='Datetime (UTC)', 
            columns='Zone name', 
            values='Carbon intensity gCO₂eq/kWh (Life cycle)'
        ).reset_index()
        
        # Rename columns to add clarity
        for col in wide_df.columns:
            if col != 'Datetime (UTC)':
                wide_df.rename(columns={col: f"{col} (gCO₂eq/kWh)"}, inplace=True)
        
        # Save to output file
        wide_df.to_csv(output_file, index=False)
        print(f"Consolidated data saved to {output_file}")
        
        # Print summary statistics
        print("\nSummary statistics:")
        print(f"Total number of rows (timestamps): {len(wide_df)}")
        
        zone_columns = [col for col in wide_df.columns if col != 'Datetime (UTC)']
        print(f"Number of zones (columns): {len(zone_columns)}")
        print(f"Zones included: {', '.join([col.split(' ')[0] for col in zone_columns])}")
        print(f"Date range: {wide_df['Datetime (UTC)'].min()} to {wide_df['Datetime (UTC)'].max()}")
        
        return wide_df
    else:
        print("No data was processed. Check if input files exist and contain March 24-25 data.")
        return pd.DataFrame()

def process_file(file_path):
    """
    Process a single CSV file to extract March 24-25 data with life cycle carbon intensity.
    
    Parameters:
    ----------
    file_path : str
        Path to the CSV file
    
    Returns:
    -------
    pandas.DataFrame
        Processed dataframe with filtered data
    """
    print(f"Processing {file_path}...")
    
    try:
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Convert date string to datetime
        df['Datetime (UTC)'] = pd.to_datetime(df['Datetime (UTC)'])
        
        # Filter for March 24-25 only
        march_24 = df[(df['Datetime (UTC)'].dt.month == 3) & 
                         (df['Datetime (UTC)'].dt.day == 24)]
        

        # Select only the columns we need
        columns_to_keep = ['Datetime (UTC)', 'Zone name', 'Carbon intensity gCO₂eq/kWh (Life cycle)']
        filtered_df = march_24[columns_to_keep]
        
        return filtered_df
    
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return pd.DataFrame()

if __name__ == "__main__":
    # List of input CSV files
    input_files = [
        "US-NE-ISNE_2024_hourly.csv",
        "US-NY-NYIS_2024_hourly.csv",
        "US-SW-AZPS_2024_hourly.csv",
        "US-CAL-CISO_2024_hourly.csv",
        "US-CAL-LDWP_2024_hourly.csv",
        "US-TEX-ERCO_2024_hourly.csv",
        "US-SE-SEPA_2024_hourly.csv",
        "US-NW-PGE_2024_hourly.csv",
        "US-NW-NEVP_2024_hourly.csv",
        "US-NW-PACE_2024_hourly.csv",
        "US-MIDW-MISO_2024_hourly.csv",
        "US-MIDW-LGEE_2024_hourly.csv"
    ]
    
    # Output file name
    output_file = "consolidated_carbon_intensity_march24.csv"
    
    # Process and consolidate the data
    wide_df = consolidate_carbon_intensity_data(input_files, output_file)
    
    # Additional validation
    if not wide_df.empty:
        # Check for missing values
        missing_values = wide_df.isnull().sum()
        if missing_values.sum() > 0:
            print("\nWarning: Missing values detected in the consolidated data:")
            print(missing_values)
            
        # Optional: Fill missing values (e.g., with linear interpolation)
        filled_df = wide_df.copy()
        for col in wide_df.columns:
            if col != 'Datetime (UTC)':
                # Identify missing values
                missing_mask = wide_df[col].isna()
                
                if missing_mask.any():
                    # Create a series to store filled values
                    filled_values = wide_df[col].copy()
                    
                    # For each missing value, compute average of neighboring values
                    for idx in missing_mask[missing_mask].index:
                        # Find position in the dataframe
                        pos = filled_df.index.get_loc(idx)
                        neighbors = []
                        
                        # Try to get value before (if not first row)
                        if pos > 0:
                            prev_val = wide_df[col].iloc[pos-1]
                            if not pd.isna(prev_val):
                                neighbors.append(prev_val)
                        
                        # Try to get value after (if not last row)
                        if pos < len(wide_df) - 1:
                            next_val = wide_df[col].iloc[pos+1]
                            if not pd.isna(next_val):
                                neighbors.append(next_val)
                        
                        # Calculate average if neighbors exist
                        if neighbors:
                            filled_values.iloc[pos] = sum(neighbors) / len(neighbors)
                    
                    # Assign filled values back to the dataframe
                    filled_df[col] = filled_values
                    
                    # For any remaining NA values (if both neighbors were NA), use forward/backward fill
                    if filled_df[col].isna().any():
                        filled_df[col] = filled_df[col].fillna(method='ffill').fillna(method='bfill')

            # Save filled version to a separate file
            filled_output = "consolidated_carbon_intensity_march24_filled.csv"
            filled_df.to_csv(filled_output, index=False)
            print(f"\nFilled data (with neighbor averaging) saved to {filled_output}")
        else:
            print("\nNo missing values detected in the consolidated data.")

Starting data consolidation process...
Processing US-NE-ISNE_2024_hourly.csv...
Processing US-NY-NYIS_2024_hourly.csv...
Processing US-SW-AZPS_2024_hourly.csv...
Processing US-CAL-CISO_2024_hourly.csv...
Processing US-CAL-LDWP_2024_hourly.csv...
Processing US-TEX-ERCO_2024_hourly.csv...
Processing US-SE-SEPA_2024_hourly.csv...
Processing US-NW-PGE_2024_hourly.csv...
Processing US-NW-NEVP_2024_hourly.csv...
Processing US-NW-PACE_2024_hourly.csv...
Processing US-MIDW-MISO_2024_hourly.csv...
Processing US-MIDW-LGEE_2024_hourly.csv...
Consolidated data saved to consolidated_carbon_intensity_march24.csv

Summary statistics:
Total number of rows (timestamps): 24
Number of zones (columns): 12
Zones included: Arizona, CAISO, Electric, ISO, Los, Louisville, Midcontinent, Nevada, New, Pacificorp, Portland, Southeastern
Date range: 2024-03-24 00:00:00 to 2024-03-24 23:00:00

Filled data (with neighbor averaging) saved to consolidated_carbon_intensity_march24_filled.csv

Filled data (with neighbor