In [11]:
import pandas as pd
import numpy as np

In [12]:
# Read the raw CME dataset from disk into a DataFrame
original_data = pd.read_csv(filepath_or_buffer="../cme_and_electron/cme_daniel_100MeV.csv")

| Feature/Target Variable        | Preprocessing Step                                                                 |
|-------------------------------|------------------------------------------------------------------------------------|
| peak_intensity                 | Take the natural log, then divide by the max of the natural log                    |
| CMEs_over_1000_past_9_hrs      | Divide by max (2)                                                                  |
| CMEs_past_9_hours              | Divide by max (6)                                                                  |
| V log V                        | Take the natural log, then divide by the max of the natural log                     |
| longitude                      | Normalize to range \([-1, 1]\) by dividing by 180                                  |
| MPA                            | Normalize to range \([0, 1]\) by dividing by 360                                   |
| latitude                       | Normalize to range \([-1, 1]\) by dividing by 90                                   |
| Acceleration (Accel)           | Divide by max                                                                      |
| Linear Speed (donki_speed)     | Divide by max                                                                      |
| Richardson's Equation          | Take the natural log, then divide by the absolute value of the min of the natural log |
| 2nd Order Speed Final          | Divide by max                                                                      |
| 2nd Order Speed at 20 Solar Radii | Divide by max                                                              |
| Max Speed Past Day             | Divide by max                                                                      |
| CMEs in the Past Month         | Divide by max                                                                      |
| Daily Sunspot Count            | Divide by max                                                                      |
| Half Width (donki_ha)          | Divide by max                                                                      |
| CPA (Central_PA)               | Divide by max                                                                      |
| Diffusive Shock (V^V^2_replacement) | Take the natural log, then divide by the absolute value of the min of the natural log |
| Halo                           | No transformation (categorical)                                                    |
| Type II Visualization Area     | If zero, leave as zero. Else, take the natural log, then divide by the max of the natural log |

In [13]:
# # Define the updated preprocessing functions based on the new table
# def updated_preprocess_data(df):
#     new_data = pd.DataFrame()
# 
#     # Apply transformations as specified in the updated table
#     log_peak_intensity_max = np.log(df['peak_intensity']).max()
#     new_data['log_peak_intensity_norm'] = np.log(df['peak_intensity']) / log_peak_intensity_max
# 
#     new_data['CMEs_over_1000_past_9_hrs_norm'] = df['CMEs_over_1000_past_9_hrs'] / 2
#     new_data['CMEs_past_9_hours_norm'] = df['CMEs_past_9_hours'] / 6
# 
#     log_v_log_v_max = np.log(df['V log V']).max()
#     new_data['log_V_log_V_norm'] = np.log(df['V log V']) / log_v_log_v_max
# 
#     new_data['longitude_norm'] = df['longitude'] / 180
#     new_data['MPA_norm'] = df['MPA'] / 360
#     new_data['latitude_norm'] = df['latitude'] / 90
#     new_data['Accel_norm'] = df['Accel'] / df['Accel'].max()
#     new_data['donki_speed_norm'] = df['donki_speed'] / df['donki_speed'].max()
# 
#     # Take the natural log of the 'V^V^2_replacement' (Diffusive Shock) and 'richardson_formula_1.0_c' (Richardson's Equation)
#     df['log_diffusive_shock'] = np.log(df['V^V^2_replacement'])
#     df['log_richardson_formula'] = np.log(df['richardson_formula_1.0_c'])
# 
#     # Find the absolute value of the minimum of these logged features
#     abs_min_log_diffusive_shock = np.abs(df['log_diffusive_shock'].min())
#     abs_min_log_richardson_formula = np.abs(df['log_richardson_formula'].min())
# 
#     # Divide by the absolute value of the min
#     new_data['log_diffusive_shock_norm'] = df['log_diffusive_shock'] / abs_min_log_diffusive_shock
#     new_data['log_richardson_formula_norm'] = df['log_richardson_formula'] / abs_min_log_richardson_formula
# 
#     new_data['2nd_order_speed_final_norm'] = df['2nd_order_speed_final'] / df['2nd_order_speed_final'].max()
#     new_data['2nd_order_speed_20R_norm'] = df['2nd_order_speed_20R'] / df['2nd_order_speed_20R'].max()
#     new_data['Max_speed_past_day_norm'] = df['Max_speed_past_day'] / df['Max_speed_past_day'].max()
#     new_data['CMEs_past_month_norm'] = df['CMEs_past_month'] / df['CMEs_past_month'].max()
#     new_data['sunspots_norm'] = df['sunspots'] / df['sunspots'].max()
#     new_data['donki_ha_norm'] = df['donki_ha'] / df['donki_ha'].max()
#     new_data['Central_PA_norm'] = df['Central_PA'] / df['Central_PA'].max()
# 
#     new_data['HALO'] = df['HALO']
# 
#     log_type_2_area_max = np.log(df[df['Type_2_Area'] > 0]['Type_2_Area']).max()
#     new_data['log_Type_2_Area_norm'] = df['Type_2_Area'].apply(
#         lambda x: 0 if x == 0 else np.log(x + 1) / log_type_2_area_max)
# 
#     return new_data

| Feature/Target Variable            | Preprocessing Step                                                                                                  |
|------------------------------------|---------------------------------------------------------------------------------------------------------------------|
| peak_intensity                     | Take the natural log                                                                                                |
| CMEs_over_1000_past_9_hrs          | Map to 0-1 using min-max normalization                                                                               |
| CMEs_past_9_hours                  | Map to 0-1 using min-max normalization                                                                               |
| V log V                            | Map to 0-1 using min-max normalization                                                                               |
| longitude                          | Map to 0-1 using min-max normalization                                                                               |
| MPA                                | Map to 0-1 using min-max normalization                                                                               |
| latitude                           | Map to 0-1 using min-max normalization                                                                               |
| Acceleration (Accel)               | Map to 0-1 using min-max normalization                                                                               |
| Linear Speed (donki_speed)         | Map to 0-1 using min-max normalization                                                                               |
| Richardson's Equation              | Take the natural log, map to 0-1 using min-max normalization based on natural log                                    |
| 2nd Order Speed Final              | Map to 0-1 using min-max normalization                                                                               |
| 2nd Order Speed at 20 Solar Radii  | Map to 0-1 using min-max normalization                                                                               |
| Max Speed Past Day                 | Map to 0-1 using min-max normalization                                                                               |
| CMEs in the Past Month             | Map to 0-1 using min-max normalization                                                                               |
| Daily Sunspot Count                | Map to 0-1 using min-max normalization                                                                               |
| Half Width (donki_ha)              | Map to 0-1 using min-max normalization                                                                               |
| CPA (Central_PA)                   | Map to 0-1 using min-max normalization                                                                               |
| Diffusive Shock (V^V^2_replacement) | Take the natural log, map to 0-1 using min-max normalization based on natural log                                    |
| Halo                               | No transformation (categorical)                                                                                      |
| Type II Visualization Area         | Replace zeros with 1 (so their natural log is 0), take the natural log, then map to 0-1 using min-max normalization based on the natural log values |


In [22]:
def updated_preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess the raw CME features according to the min-max scheme table.

    Output columns, in order:
      * ``log_peak_intensity``: natural log of ``peak_intensity`` (not rescaled).
      * ``<label>_norm``: min-max scaling of each raw feature column listed in
        ``raw_features`` below.
      * ``log_richardson_formula_1.0_c_norm``, ``log_V^V^2_replacement_norm`` and
        ``log_Type_2_Area_norm``: natural log followed by min-max scaling. Zeros
        in ``Type_2_Area`` are replaced by 1 before the log, so they become
        log(1) = 0 instead of -inf.
      * ``HALO``: copied through unchanged.

    A column whose values are all identical is scaled to 0.0 instead of NaN.

    :param df (pd.DataFrame): Original dataframe with raw features. It is not modified.
    :return pd.DataFrame: Dataframe with features processed according to the scheme table.
    :raises KeyError: If one of the required columns is missing from ``df``.
    """

    def min_max_norm(x: pd.Series) -> pd.Series:
        """Linearly rescale ``x`` so that its minimum maps to 0 and its maximum to 1."""
        lowest = x.min()
        span = x.max() - lowest
        if span == 0:
            # Constant column: dividing by the zero span would yield 0/0 = NaN
            # for every row, so map the whole column to 0.0 instead.
            return (x - lowest).astype(float)
        return (x - lowest) / span

    # Output label (suffixed with "_norm") -> raw column name in ``df``.
    raw_features = {
        'V Log V': 'V log V',
        'Linear Speed': 'donki_speed',
        '2nd Order Speed Final': '2nd_order_speed_final',
        '2nd Order Speed at 20 Solar Radii': '2nd_order_speed_20R',
        'CMEs over 1000 km/s Past 9 Hrs': 'CMEs_over_1000_past_9_hrs',
        'Max Speed Past Day': 'Max_speed_past_day',
        'CMEs in the Past Month': 'CMEs_past_month',
        'Longitude': 'longitude',
        'MPA': 'MPA',
        'Daily Sunspot Count': 'sunspots',
        'Half Width': 'donki_ha',
        'Latitude': 'latitude',
        'Acceleration': 'Accel',
        'CPA': 'Central_PA',
        'CMEs in the Past 9 Hours': 'CMEs_past_9_hours',
    }

    # Zeros are swapped for 1 before the log so that they map to log(1) = 0.
    type_2_area = df['Type_2_Area']
    log_type_2_area = np.log(type_2_area.where(type_2_area != 0, 1))

    # The target is only log-transformed; it is not rescaled.
    preprocessed_data = {'log_peak_intensity': np.log(df['peak_intensity'])}

    # Min-max normalization of the raw (non-log) features.
    for feature, proper_name in raw_features.items():
        preprocessed_data[f"{feature}_norm"] = min_max_norm(df[proper_name])

    # Min-max normalization of the log-transformed features.
    preprocessed_data['log_richardson_formula_1.0_c_norm'] = min_max_norm(
        np.log(df['richardson_formula_1.0_c']))
    preprocessed_data['log_V^V^2_replacement_norm'] = min_max_norm(
        np.log(df['V^V^2_replacement']))
    preprocessed_data['log_Type_2_Area_norm'] = min_max_norm(log_type_2_area)

    # No transformation for 'Halo'
    preprocessed_data['HALO'] = df['HALO']

    return pd.DataFrame(preprocessed_data)


In [23]:
# Run the preprocessing pipeline on the raw data and preview the first five rows
updated_preprocessed_data = updated_preprocess_data(df=original_data)
updated_preprocessed_data.head(n=5)

Unnamed: 0,log_peak_intensity,V Log V_norm,Linear Speed_norm,2nd Order Speed Final_norm,2nd Order Speed at 20 Solar Radii_norm,CMEs over 1000 km/s Past 9 Hrs_norm,Max Speed Past Day_norm,CMEs in the Past Month_norm,Longitude_norm,MPA_norm,Daily Sunspot Count_norm,Half Width_norm,Latitude_norm,Acceleration_norm,CPA_norm,CMEs in the Past 9 Hours_norm,log_richardson_formula_1.0_c_norm,log_V^V^2_replacement_norm,log_Type_2_Area_norm,HALO
0,0.302585,0.427577,0.528689,0.4589,0.360515,0.0,0.126,0.45,0.861111,0.947222,0.281407,0.344828,0.831461,0.275717,0.947222,0.0,0.986491,0.984903,0.0,0.0
1,0.302585,0.131308,0.216803,0.234628,0.1875,0.0,0.0,0.4125,0.216667,0.144444,0.447236,0.091954,0.713483,0.271304,0.130556,0.0,0.639191,0.407046,0.0,0.0
2,0.302585,0.224746,0.331967,0.155987,0.117489,0.0,0.2192,0.7625,0.241667,0.311111,0.688442,0.195402,0.393258,0.219187,0.247222,0.166667,0.855708,0.907368,0.0,0.0
3,0.302585,0.085371,0.158197,0.099676,0.083423,0.0,0.1132,0.3875,0.130556,0.227778,0.336683,0.264368,0.460674,0.251445,0.252778,0.0,0.879719,0.360678,0.0,0.0
4,0.302585,0.089887,0.114754,0.143689,0.108369,0.0,0.26,0.425,0.247222,0.061111,0.623116,0.287356,0.882022,0.256699,0.061111,0.0,0.95772,0.315152,0.0,0.0


In [24]:
# Summarise the min and max of every preprocessed column (one row per column).
# The aggregations are named by string rather than passed as np.min/np.max:
# pandas 2.x warns about NumPy callables here, and the resulting labels
# ('amin'/'amax' vs 'min'/'max') otherwise depend on the installed NumPy version.
updated_min_max_values = updated_preprocessed_data.agg(['min', 'max']).T
updated_min_max_values


Unnamed: 0,amin,amax
log_peak_intensity,0.302585,8.732079
V Log V_norm,0.0,1.0
Linear Speed_norm,0.0,1.0
2nd Order Speed Final_norm,0.0,1.0
2nd Order Speed at 20 Solar Radii_norm,0.0,1.0
CMEs over 1000 km/s Past 9 Hrs_norm,0.0,1.0
Max Speed Past Day_norm,0.0,1.0
CMEs in the Past Month_norm,0.0,1.0
Longitude_norm,0.0,1.0
MPA_norm,0.0,1.0


In [25]:
# Helper that persists a DataFrame as a CSV file
def save_dataframe_to_csv(df, file_path):
    """
    Write ``df`` to ``file_path`` in CSV format, omitting the index column.

    Parameters:
        df (pd.DataFrame): DataFrame to write out.
        file_path (str): Destination path of the CSV file.

    Returns:
        None
    """
    df.to_csv(path_or_buf=file_path, index=False)


# Destination CSV for the preprocessed features.
# NOTE(review): the input file above is named "...100MeV.csv" but this output is
# named "...10MeV.csv" — confirm the energy channel in the file name is intended.
file_path_to_save = '../cme_and_electron/cme_josias_10MeV.csv'

# Write the preprocessed DataFrame to that location
save_dataframe_to_csv(df=updated_preprocessed_data, file_path=file_path_to_save)

In [7]:
# max_peak_intensity = 6198.6
# log_peak_intensity_max = np.log(max_peak_intensity)
# log_peak_intensity_max

8.732078739083455

In [9]:
# def reverse_log_peak_intensity_norm(log_peak_intensity_norm, log_peak_intensity_max=8.732078739083455, in_log=False):
#     """
#     Reverse the normalization of log_peak_intensity_norm to obtain either peak_intensity or log_peak_intensity.
#     
#     Parameters:
#     - log_peak_intensity_norm (float or np.ndarray): The normalized log of peak intensity to be reversed.
#     - log_peak_intensity_max (float): The maximum value of the log of the original peak intensity.
#     - in_log (bool): If True, returns the log of the peak intensity. Otherwise, returns the peak intensity itself.
#     
#     Returns:
#     - float or np.ndarray: The reversed peak intensity or its log, depending on the value of in_log.
#     """
#     # Reverse normalization to get log_peak_intensity
#     log_peak_intensity = log_peak_intensity_norm * log_peak_intensity_max
# 
#     if in_log:
#         return log_peak_intensity
#     else:
#         # Exponentiate to get back to peak_intensity
#         return np.exp(log_peak_intensity)

In [10]:
# print(reverse_log_peak_intensity_norm(0.034652126))

1.3533528319798735
