# CME Dataset Preprocessing

## Stratified Sampling Strategy

This notebook preprocesses the CME dataset (CSV) using stratified sampling to create:

1. **Training Set**: 2/3 of the total rows
2. **Test Set**: 1/3 of the total rows
3. **Cross-Validation Folds**: 4 folds, each containing 1/4 of the training set. For each fold:
   - **Subtraining Set**: The other 3 folds (75% of the training data)
   - **Validation Set**: The held-out fold (25% of the training data)


In [1]:
# Absolute path to the raw input CSV. This is machine-specific: change it before
# running the notebook elsewhere. The preprocessed CSV is later written into the
# same directory as this file.
SEP_CME_PATH = 'C:/Users/the_3/Documents/github/keras-functional-api/data/sep_cme/SEP10MeV.csv'

In [2]:
from typing import List, Tuple, Union

import numpy as np
import pandas as pd


In [3]:
def analyze_dataframe(df: pd.DataFrame) -> None:
    """
    Print a compact summary of a dataframe.

    The summary consists of the row and column counts followed by one line per
    column (columns are visited in sorted name order) showing the column's
    minimum, maximum and number of distinct values.

    Parameters:
    - df (pd.DataFrame): The dataframe to summarize

    Returns:
    - None: Everything is written to standard output
    """
    # --- Overall shape ---
    row_total, col_total = df.shape
    print(f"Number of rows: {row_total}")
    print(f"Number of columns: {col_total}")

    # --- One summary line per column ---
    print("\n--- Column Names and Statistics ---")
    for name in sorted(df.columns):
        series = df[name]
        try:
            # min/max raise TypeError when the values cannot be compared
            lowest, highest = series.min(), series.max()
        except TypeError:
            parts = ["Min/Max: NA"]
        else:
            parts = [f"Min: {lowest}", f"Max: {highest}"]

        # The distinct-value count is reported for every column
        parts.append(f"Unique: {series.nunique()}")

        # Column name is left-aligned in a 25-character field
        print(f"- {name:<25} | " + " | ".join(parts))


# Load and analyze the dataset
# NOTE: both handlers below only print a message. If loading fails, `df` is never
# assigned, so any later cell that reads `df` will fail with a NameError.
try:
    # Load the dataset from the specified path
    df = pd.read_csv(SEP_CME_PATH)
    
    # Analyze the loaded dataframe
    analyze_dataframe(df)
    
except FileNotFoundError:
    # Missing input file: report the path that was tried
    print(f"Error: File not found at '{SEP_CME_PATH}'. Please provide the correct file path.")
except Exception as e:
    # Any other failure (from reading or from the analysis above) is reported, not re-raised
    print(f"An error occurred: {e}")

Number of rows: 2297
Number of columns: 29

--- Column Names and Statistics ---
- 2nd_order_speed_20R       | Min: 0 | Max: 3728 | Unique: 878
- 2nd_order_speed_final     | Min: 0 | Max: 3090 | Unique: 883
- Accelaration              | Min: -240.1 | Max: 711.6 | Unique: 681
- CME_CDAW_LinearSpeed      | Min: 26 | Max: 3163 | Unique: 876
- CME_CDAW_MPA              | Min: 0 | Max: 360 | Unique: 355
- CME_CDAW_time             | Min: 1/1/2012 13:36 | Max: 9/9/2017 23:12 | Unique: 2293
- CME_DONKI_latitude        | Min: -88 | Max: 90 | Unique: 161
- CME_DONKI_longitude       | Min: -180.0 | Max: 180.0 | Unique: 355
- CME_DONKI_speed           | Min: 60 | Max: 2800 | Unique: 713
- CME_DONKI_time            | Min: 1/1/2012 13:36 | Max: 9/9/2017 23:12 | Unique: 2297
- CMEs_in_past_9hours       | Min: 1 | Max: 5 | Unique: 5
- CMEs_in_past_month        | Min: 1 | Max: 78 | Unique: 78
- CMEs_with_speed_over_1000_in_past_9hours | Min: 0 | Max: 2 | Unique: 3
- CPA                       | Min: 0 |

In [4]:
def min_max_norm(data: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
    """
    Apply min-max normalization to a pandas DataFrame or Series.

    Each column (or the Series itself) is rescaled as (x - min) / (max - min).
    If the min and max values of a column are the same, the formula would divide
    by zero, so that column is replaced with float zeros instead.

    Parameters:
    - data (pd.DataFrame or pd.Series): The pandas DataFrame or Series to be normalized.
      A DataFrame is normalized column by column.

    Returns:
    - pd.DataFrame or pd.Series: Min-max normalized pandas DataFrame or Series,
      with the same index (and, for a Series, the same name) as the input.

    Raises:
    - TypeError: If `data` is neither a DataFrame nor a Series.
    """

    # Function to normalize a single column
    def normalize_column(column: pd.Series) -> pd.Series:
        min_val = column.min()
        max_val = column.max()

        # Constant column: return float zeros so the dtype matches the division
        # branch below, and keep the original index and name.
        if min_val == max_val:
            return pd.Series(0.0, index=column.index, name=column.name)

        # Apply min-max normalization
        return (column - min_val) / (max_val - min_val)

    # DataFrame: normalize every column independently
    if isinstance(data, pd.DataFrame):
        return data.apply(normalize_column, axis=0)

    # Series: normalize it directly
    if isinstance(data, pd.Series):
        return normalize_column(data)

    raise TypeError("Input must be a pandas DataFrame or Series")

In [5]:
def preprocess_cme_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the preprocessed feature table from the raw CME dataframe.

    The result contains, in this order:
    - 'ln_peak_intensity' copied unchanged (it is the target),
    - min-max normalized 'solar_wind_speed' and 'connection_angle_degrees',
    - min-max normalized copies of the raw CME columns listed below, each
      named '<column>_norm',
    - three log-transformed columns that are then min-max normalized,
    - 'Halo' copied unchanged.

    Parameters:
    - df (pd.DataFrame): The dataframe to preprocess.

    Returns:
    - pd.DataFrame: The preprocessed dataframe.
    """
    def _log_or_zero(value):
        # A value of exactly zero maps to log(1) instead of log(0)
        return np.log(1) if value == 0 else np.log(value)

    # Raw columns that are only min-max scaled; each yields "<name>_norm"
    scaled_as_is = (
        'VlogV',
        'CME_DONKI_speed',
        'CME_DONKI_latitude',
        'CME_DONKI_longitude',
        'CME_CDAW_MPA',
        'CME_CDAW_LinearSpeed',
        'DONKI_half_width',
        'Accelaration',
        '2nd_order_speed_final',
        '2nd_order_speed_20R',
        'CPA',
        'daily_sunspots',
        'CMEs_in_past_month',
        'CMEs_in_past_9hours',
        'CMEs_with_speed_over_1000_in_past_9hours',
        'max_CME_speed_in_past_day',
    )

    features = {
        # Target column: carried through without normalization
        'ln_peak_intensity': df['ln_peak_intensity'],
        'solar_wind_speed_norm': min_max_norm(df['solar_wind_speed']),
        'connection_angle_degrees_norm': min_max_norm(df['connection_angle_degrees']),
    }

    # Log transforms, held in locals so they never appear in the output table
    log_richardson = np.log1p(-df['half_richardson_value'])
    log_shock = np.log1p(df['diffusive_shock'])
    log_type2_area = df['Type2_Viz_Area'].apply(_log_or_zero)

    # Plain min-max scaling of the raw columns
    for name in scaled_as_is:
        features[f"{name}_norm"] = min_max_norm(df[name])

    # Min-max scaling of the log-transformed columns
    features['log_richardson_value_norm'] = min_max_norm(log_richardson)
    features['log_diffusive_shock_norm'] = min_max_norm(log_shock)
    features['log_Type2_Viz_Area_norm'] = min_max_norm(log_type2_area)

    # 'Halo' is passed through untouched
    features['Halo'] = df['Halo']

    return pd.DataFrame(features)

In [6]:
# Preprocess the dataset loaded earlier (requires `df` to exist, i.e. the load
# cell must have succeeded)
preprocessed_df = preprocess_cme_features(df)

# Print the statistics of the preprocessed dataframe as a sanity check
analyze_dataframe(preprocessed_df)




Number of rows: 2297
Number of columns: 23

--- Column Names and Statistics ---
- 2nd_order_speed_20R_norm  | Min: 0.0 | Max: 1.0 | Unique: 878
- 2nd_order_speed_final_norm | Min: 0.0 | Max: 1.0 | Unique: 883
- Accelaration_norm         | Min: 0.0 | Max: 1.0 | Unique: 681
- CME_CDAW_LinearSpeed_norm | Min: 0.0 | Max: 1.0 | Unique: 876
- CME_CDAW_MPA_norm         | Min: 0.0 | Max: 1.0 | Unique: 355
- CME_DONKI_latitude_norm   | Min: 0.0 | Max: 1.0 | Unique: 161
- CME_DONKI_longitude_norm  | Min: 0.0 | Max: 1.0 | Unique: 355
- CME_DONKI_speed_norm      | Min: 0.0 | Max: 1.0 | Unique: 713
- CMEs_in_past_9hours_norm  | Min: 0.0 | Max: 1.0 | Unique: 5
- CMEs_in_past_month_norm   | Min: 0.0 | Max: 1.0 | Unique: 78
- CMEs_with_speed_over_1000_in_past_9hours_norm | Min: 0.0 | Max: 1.0 | Unique: 3
- CPA_norm                  | Min: 0.0 | Max: 1.0 | Unique: 350
- DONKI_half_width_norm     | Min: 0.0 | Max: 1.0 | Unique: 71
- Halo                      | Min: 0 | Max: 1 | Unique: 2
- VlogV_norm   

In [7]:
# Rearrange columns to place ln_peak_intensity (the target) last; all other
# columns keep their current relative order
cols = [col for col in preprocessed_df.columns if col != 'ln_peak_intensity']
cols.append('ln_peak_intensity')
preprocessed_df = preprocessed_df[cols]

# Define the output path in the same directory as the original file
import os
output_dir = os.path.dirname(SEP_CME_PATH)
output_path = os.path.join(output_dir, 'SEP10MeV_preprocessed.csv')

# Save to CSV without the dataframe index (an existing file at this path is overwritten)
preprocessed_df.to_csv(output_path, index=False)
print(f"Preprocessed data saved to: {output_path}")

# Echo the column order that was written. This prints the in-memory list; it
# does not re-read the saved file.
print("\nColumns in the saved file (last one is the target):")
print(', '.join(cols))

Preprocessed data saved to: C:/Users/the_3/Documents/github/keras-functional-api/data/sep_cme\SEP10MeV_preprocessed.csv

Columns in the saved file (last one is the target):
solar_wind_speed_norm, connection_angle_degrees_norm, VlogV_norm, CME_DONKI_speed_norm, CME_DONKI_latitude_norm, CME_DONKI_longitude_norm, CME_CDAW_MPA_norm, CME_CDAW_LinearSpeed_norm, DONKI_half_width_norm, Accelaration_norm, 2nd_order_speed_final_norm, 2nd_order_speed_20R_norm, CPA_norm, daily_sunspots_norm, CMEs_in_past_month_norm, CMEs_in_past_9hours_norm, CMEs_with_speed_over_1000_in_past_9hours_norm, max_CME_speed_in_past_day_norm, log_richardson_value_norm, log_diffusive_shock_norm, log_Type2_Viz_Area_norm, Halo, ln_peak_intensity


In [8]:
# Load the saved preprocessed CSV file to verify it was saved correctly
# (relies on `output_path` set by the save cell)
loaded_df = pd.read_csv(output_path)
print(f"\nLoaded preprocessed data from: {output_path}")
print(f"Shape of loaded data: {loaded_df.shape}")

# Analyze the loaded dataframe to verify preprocessing was successful
print("\nAnalysis of the loaded preprocessed data:")
analyze_dataframe(loaded_df)

# Verify that the target column is the last column; the result is only
# printed, no exception is raised on a mismatch
print("\nVerifying target column position:")
print(f"Last column in loaded data: {loaded_df.columns[-1]}")
if loaded_df.columns[-1] == 'ln_peak_intensity':
    print("✓ Target column 'ln_peak_intensity' is correctly positioned as the last column")
else:
    print("✗ Target column is not in the expected position")




Loaded preprocessed data from: C:/Users/the_3/Documents/github/keras-functional-api/data/sep_cme\SEP10MeV_preprocessed.csv
Shape of loaded data: (2297, 23)

Analysis of the loaded preprocessed data:
Number of rows: 2297
Number of columns: 23

--- Column Names and Statistics ---
- 2nd_order_speed_20R_norm  | Min: 0.0 | Max: 1.0 | Unique: 878
- 2nd_order_speed_final_norm | Min: 0.0 | Max: 1.0 | Unique: 883
- Accelaration_norm         | Min: 0.0 | Max: 1.0 | Unique: 681
- CME_CDAW_LinearSpeed_norm | Min: 0.0 | Max: 1.0 | Unique: 876
- CME_CDAW_MPA_norm         | Min: 0.0 | Max: 1.0 | Unique: 355
- CME_DONKI_latitude_norm   | Min: 0.0 | Max: 1.0 | Unique: 161
- CME_DONKI_longitude_norm  | Min: 0.0 | Max: 1.0 | Unique: 355
- CME_DONKI_speed_norm      | Min: 0.0 | Max: 1.0 | Unique: 713
- CMEs_in_past_9hours_norm  | Min: 0.0 | Max: 1.0 | Unique: 5
- CMEs_in_past_month_norm   | Min: 0.0 | Max: 1.0 | Unique: 78
- CMEs_with_speed_over_1000_in_past_9hours_norm | Min: 0.0 | Max: 1.0 | Unique: 3


In [None]:
def stratified_split(
        X: np.ndarray,
        y: np.ndarray,
        seed: Union[int, None] = None,
        shuffle: bool = True,
        debug: bool = False,
        group_size: int = 4
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Splits the dataset into subtraining and validation sets using stratified sampling.

    The samples are sorted by label and cut into consecutive groups of `group_size`.
    One sample of every group goes to the validation set and the rest go to the
    subtraining set, so with the default group size the validation set is about a
    quarter of the dataset.

    After the split, the sample with the largest label is appended to the validation
    set if no validation label equals it. That sample is NOT removed from the
    subtraining set, so in that case it appears in both returned sets.

    Parameters:
    X (np.ndarray): Feature array whose first axis indexes samples, e.g. (n_samples, n_features).
    y (np.ndarray): Label vector of shape (n_samples, 1) or (n_samples,).
    seed (int): Random seed for reproducibility. Default is None. Only used when `shuffle`
        is True; it is passed to np.random.seed, which reseeds NumPy's global random state.
    shuffle (bool): Whether to shuffle inside each group before picking the validation
        sample. Default is True. When False, the smallest-label sample of each group is picked.
    debug (bool): Whether to plot the distributions of the original, subtrain, and validation sets. Default is False.
    group_size (int): Number of consecutive sorted samples per group. Default is 4.

    Returns:
    Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Split feature and label matrices:
        - X_subtrain: Features for the subtraining set.
        - y_subtrain: Labels for the subtraining set.
        - X_val: Features for the validation set.
        - y_val: Labels for the validation set.

    Raises:
    ValueError: If `group_size` is smaller than 1, if the dataset is empty, or if `y`
        does not hold exactly one label per sample.
    """
    X, y = np.asarray(X), np.asarray(y)
    num_samples = X.shape[0]

    if group_size < 1:
        raise ValueError(f"group_size must be at least 1, got {group_size}")
    if num_samples == 0:
        raise ValueError("Cannot split an empty dataset")
    if y.size != num_samples:
        raise ValueError(
            f"y must hold exactly one label per sample: X has {num_samples} samples, y has shape {y.shape}"
        )

    if shuffle:
        np.random.seed(seed)

    # Sort the data by the labels
    sorted_indices = np.argsort(y, axis=0).flatten()
    X_sorted, y_sorted = X[sorted_indices], y[sorted_indices]

    # Walk the sorted samples group by group, collecting row positions only;
    # the arrays are gathered once after the loop.
    val_rows, subtrain_rows = [], []
    for start in range(0, num_samples, group_size):
        group = list(range(start, min(start + group_size, num_samples)))
        if shuffle:
            np.random.shuffle(group)  # Shuffle within the group
        val_rows.append(group[0])
        subtrain_rows.extend(group[1:])

    # Explicit integer dtype keeps indexing valid when a list is empty
    val_rows = np.asarray(val_rows, dtype=np.intp)
    subtrain_rows = np.asarray(subtrain_rows, dtype=np.intp)

    X_subtrain, y_subtrain = X_sorted[subtrain_rows], y_sorted[subtrain_rows]
    X_val, y_val = X_sorted[val_rows], y_sorted[val_rows]

    # Ensure the largest y is in the validation set
    max_row = int(np.argmax(y_sorted))  # Row of the largest y value
    if not np.any(y_val == y_sorted[max_row]):
        # Slicing keeps the sample axis, so this works for 1-D and 2-D y and for
        # X of any rank.
        X_val = np.concatenate([X_val, X_sorted[max_row:max_row + 1]])
        y_val = np.concatenate([y_val, y_sorted[max_row:max_row + 1]])
        # NOTE: the sample is intentionally left in the subtraining set as well.

    if debug:
        # NOTE(review): plot_distributions is not defined in the visible part of
        # this file; it must be provided elsewhere before debug=True can be used.
        plot_distributions(y, y_subtrain, y_val)

    return X_subtrain, y_subtrain, X_val, y_val