In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the source CSV into a DataFrame; it is the input to the preprocessing below.
original_data = pd.read_csv(
    "../cme_and_electron/CME_daniel.csv"
)

| Feature/Target Variable        | Preprocessing Step                                                                 |
|-------------------------------|------------------------------------------------------------------------------------|
| peak_intensity                 | Take the natural log, then divide by the max of the natural log                    |
| CMEs_over_1000_past_9_hrs      | Divide by max (2)                                                                  |
| CMEs_past_9_hours              | Divide by max (6)                                                                  |
| V log V                        | Take the natural log, then divide by the max of the natural log                     |
| longitude                      | Normalize to range $[-1, 1]$ by dividing by 180                                    |
| MPA                            | Normalize to range $[0, 1]$ by dividing by 360                                     |
| latitude                       | Normalize to range $[-1, 1]$ by dividing by 90                                     |
| Acceleration (Accel)           | Divide by max                                                                      |
| Linear Speed (donki_speed)     | Divide by max                                                                      |
| Richardson's Equation          | Take the natural log, then divide by the absolute value of the min of the natural log |
| 2nd Order Speed Final          | Divide by max                                                                      |
| 2nd Order Speed at 20 Solar Radii | Divide by max                                                              |
| Max Speed Past Day             | Divide by max                                                                      |
| CMEs in the Past Month         | Divide by max                                                                      |
| Daily Sunspot Count            | Divide by max                                                                      |
| Half Width (donki_ha)          | Divide by max                                                                      |
| CPA (Central_PA)               | Divide by max                                                                      |
| Diffusive Shock (V^V^2_replacement) | Take the natural log, then divide by the absolute value of the min of the natural log |
| Halo                           | No transformation (categorical)                                                    |
| Type II Visualization Area     | If zero, leave as zero. Else, take the natural log, then divide by the max of the natural log |

In [3]:
# Define the updated preprocessing functions based on the new table
def updated_preprocess_data(df):
    """
    Build a new DataFrame of normalized features from the raw columns of `df`.

    Each output column is derived from a single input column using one of the
    scalings listed in the preprocessing table: division by a fixed constant,
    division by the column maximum, or a natural log divided by the maximum
    (or by the absolute value of the minimum) of the logged column.

    Parameters:
        df (pd.DataFrame): Input frame containing every column read below
            (e.g. 'peak_intensity', 'V log V', 'Type_2_Area'). It is only
            read from; no columns are added to or changed on it.

    Returns:
        pd.DataFrame: A new frame holding the normalized columns, plus 'HALO'
            copied through unchanged.
    """
    new_data = pd.DataFrame()

    # Natural log, then divide by the max of the logged values.
    log_peak_intensity = np.log(df['peak_intensity'])
    new_data['log_peak_intensity_norm'] = log_peak_intensity / log_peak_intensity.max()

    # Fixed divisors taken from the preprocessing table (stated maxima of 2 and 6).
    new_data['CMEs_over_1000_past_9_hrs_norm'] = df['CMEs_over_1000_past_9_hrs'] / 2
    new_data['CMEs_past_9_hours_norm'] = df['CMEs_past_9_hours'] / 6

    log_v_log_v = np.log(df['V log V'])
    new_data['log_V_log_V_norm'] = log_v_log_v / log_v_log_v.max()

    # Angular quantities are divided by fixed constants.
    new_data['longitude_norm'] = df['longitude'] / 180
    new_data['MPA_norm'] = df['MPA'] / 360
    new_data['latitude_norm'] = df['latitude'] / 90

    new_data['Accel_norm'] = df['Accel'] / df['Accel'].max()
    new_data['donki_speed_norm'] = df['donki_speed'] / df['donki_speed'].max()

    # Natural log of the diffusive-shock and Richardson columns, divided by the
    # absolute value of the minimum of the logged column. The logged values are
    # kept in local variables so the caller's DataFrame is not modified.
    log_diffusive_shock = np.log(df['V^V^2_replacement'])
    log_richardson_formula = np.log(df['richardson_formula_1.0_c'])
    new_data['log_diffusive_shock_norm'] = (
        log_diffusive_shock / np.abs(log_diffusive_shock.min()))
    new_data['log_richardson_formula_norm'] = (
        log_richardson_formula / np.abs(log_richardson_formula.min()))

    # Plain divide-by-max features, as (output column, input column) pairs.
    divide_by_max_columns = [
        ('2nd_order_speed_final_norm', '2nd_order_speed_final'),
        ('2nd_order_speed_20R_norm', '2nd_order_speed_20R'),
        ('Max_speed_past_day_norm', 'Max_speed_past_day'),
        ('CMEs_past_month_norm', 'CMEs_past_month'),
        ('sunspots_norm', 'sunspots'),
        ('donki_ha_norm', 'donki_ha'),
        ('Central_PA_norm', 'Central_PA'),
    ]
    for output_name, input_name in divide_by_max_columns:
        new_data[output_name] = df[input_name] / df[input_name].max()

    # Copied through without transformation.
    new_data['HALO'] = df['HALO']

    # Zeros stay zero; every other value is log(x) divided by the max of log(x)
    # over the positive entries. The same transform (log(x), not log(x + 1)) is
    # used for both numerator and divisor so the largest positive area maps to 1.
    type_2_area = df['Type_2_Area']
    log_type_2_area_max = np.log(type_2_area[type_2_area > 0]).max()
    new_data['log_Type_2_Area_norm'] = type_2_area.apply(
        lambda x: 0 if x == 0 else np.log(x) / log_type_2_area_max)

    return new_data

In [4]:
# Apply the updated preprocessing
updated_preprocessed_data = updated_preprocess_data(original_data)

# Per-column minimum and maximum of the preprocessed data, one row per column.
# String aggregator names are used instead of np.min/np.max: recent pandas
# emits a FutureWarning for NumPy callables passed to agg, and older versions
# label the results 'amin'/'amax' rather than 'min'/'max'.
updated_min_max_values = updated_preprocessed_data.agg(['min', 'max']).T
updated_min_max_values


Unnamed: 0,amin,amax
log_peak_intensity_norm,0.034652,1.0
CMEs_over_1000_past_9_hrs_norm,0.0,1.0
CMEs_past_9_hours_norm,0.0,1.0
log_V_log_V_norm,0.445649,1.0
longitude_norm,-1.0,1.0
MPA_norm,0.0,1.0
latitude_norm,-0.977778,1.0
Accel_norm,-0.337409,1.0
donki_speed_norm,0.024,1.0
log_diffusive_shock_norm,-1.0,-0.175982


In [5]:
# Define the function to save the DataFrame to a CSV file
def save_dataframe_to_csv(df, file_path):
    """
    Write `df` to `file_path` in CSV format, leaving out the row index.

    Parameters:
        df (pd.DataFrame): Frame whose contents are written.
        file_path (str): Destination passed straight to `DataFrame.to_csv`.
    """
    csv_options = {'index': False}
    df.to_csv(file_path, **csv_options)

# Destination CSV for the preprocessed data
file_path_to_save = '../cme_and_electron/cme_josias_10MeV.csv'

# Write the preprocessed frame to that path (the helper omits the row index)
save_dataframe_to_csv(df=updated_preprocessed_data, file_path=file_path_to_save)