# Preprocessing L1B
We will extract all relevant datasets from every HDF5 file and convert them into a readable Pandas dataframe. We will also do some preliminary data cleaning and scaling in preparation for PCA. Everything is written out as *.parquet* files, which dramatically reduces the memory usage of the outlier detector scripts.

In [1]:
import os
import random

import h5py
import numpy as np
import pandas as pd
import geopandas as gp
# import geoviews as gv
# from geoviews import opts, tile_sources as gvts
# import holoviews as hv
# gv.extension('bokeh', 'matplotlib')
from shapely.geometry import Point
import warnings
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 

import matplotlib
import matplotlib.pyplot as plt
# import PyQt6

In [2]:
# Directory holding the raw GEDI granules, resolved relative to the notebook's
# current working directory.
inDir = os.path.join(os.getcwd(), "Input_files")
print(inDir)

# Keep only GEDI Level 1B granules (file names start with 'GEDI01_B' and end in
# '.h5'). os.listdir() returns entries in arbitrary order, so the names are
# sorted to make the processing order -- and therefore the row order of the
# dataframe built later -- reproducible across runs and machines.
input_file_names = sorted(
    name for name in os.listdir(inDir)
    if name.startswith('GEDI01_B') and name.endswith('.h5')
)
input_file_names

/oscar/scratch/jzhu118/GEDI_Outlier_Detection_OSCAR/Input_files


['GEDI01_B_2022004042652_O17343_04_T10772_02_005_02_V002.h5',
 'GEDI01_B_2022207041426_O20491_04_T09293_02_005_03_V002.h5']

### Loading files with all information into a huge Pandas dataframe

#### Preprocessing: Get files and sort by beam

In [3]:
# Open every input granule once, print a short summary of its contents, and
# remember which beam groups it holds.
#
# NOTE: the h5py.File handles are intentionally left open. They are stored in
# `input_files`, used as the keys of `files_to_beams`, and read again by the
# main processing loop further down. Nothing in this cell closes them.
input_files = []          # open h5py.File handles, one per granule
files_to_beams = dict()   # open file handle -> list of its 'BEAM*' group names
for n in input_file_names:
    file_path = os.path.join(inDir, n)  # Full path of the current granule
    file = h5py.File(file_path, 'r')
    input_files.append(file)
    
    print('Loading file: ' + n)
    print('The file contains the following groups: ' + str(list(file.keys())))
    
    # List the attribute names attached to METADATA/DatasetIdentification.
    print("The file's metadata contains the following attributes: ")
    for g in file['METADATA']['DatasetIdentification'].attrs: print(g)
    
    # Beam groups are the top-level keys whose name starts with 'BEAM'.
    beamNames = [g for g in file.keys() if g.startswith('BEAM')]
    files_to_beams[file] = beamNames
    
    # Each beam group carries a 'description' attribute that is printed as-is.
    print("The file contains the following beams: ")
    for b in beamNames:
        print(f"{b} is a {file[b].attrs['description']}")

Loading file: GEDI01_B_2022004042652_O17343_04_T10772_02_005_02_V002.h5
The file contains the following groups: ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']
The file's metadata contains the following attributes: 
PGEVersion
VersionID
abstract
characterSet
creationDate
credit
fileName
language
originatorOrganizationName
purpose
shortName
spatialRepresentationType
status
topicCategory
uuid
The file contains the following beams: 
BEAM0000 is a Coverage beam
BEAM0001 is a Coverage beam
BEAM0010 is a Coverage beam
BEAM0011 is a Coverage beam
BEAM0101 is a Full power beam
BEAM0110 is a Full power beam
BEAM1000 is a Full power beam
BEAM1011 is a Full power beam
Loading file: GEDI01_B_2022207041426_O20491_04_T09293_02_005_03_V002.h5
The file contains the following groups: ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']
The file's metadata contains the following attribute

In [4]:
def list_1d_features(file_path, shot_number_key='shot_number'):
    """
    List the base names of all per-shot, one-dimensional datasets in an HDF5 file.

    The shot count is taken from the last-visited dataset whose path ends with
    ``shot_number_key``. A dataset qualifies when it is 1-D and its length
    equals that shot count.

    Parameters:
      file_path (str): Path to the HDF5 file.
      shot_number_key (str): Suffix used to locate the shot number dataset.

    Returns:
      List[str]: Sorted, de-duplicated base names (the part of the path after
      the last '/'). An empty list if no shot number dataset is found.
    """
    # Record (path, shape) for every dataset in a single traversal; the shapes
    # are all that is needed once the file has been closed.
    dataset_shapes = []

    def record(path, obj):
        if isinstance(obj, h5py.Dataset):
            dataset_shapes.append((path, obj.shape))

    with h5py.File(file_path, 'r') as h5:
        h5.visititems(record)

    shot_shapes = [shape for path, shape in dataset_shapes
                   if path.endswith(shot_number_key)]
    if not shot_shapes:
        print(f"Error: Could not find a dataset ending with '{shot_number_key}'.")
        return []

    # The last match in visit order defines the reference shot count.
    shot_count = shot_shapes[-1][0]
    print(f"Found shot_number dataset with {shot_count} shots.")

    base_names = {
        path.rsplit('/', 1)[-1]
        for path, shape in dataset_shapes
        if len(shape) == 1 and shape[0] == shot_count
    }
    return sorted(base_names)

# Example usage: print the per-shot 1-D feature names of every input granule.
for f in input_file_names:
    # Single assignment (the original assigned `file_path` twice on one line).
    file_path = os.path.join(inDir, f)
    # Name the granule so the listings of different files can be told apart.
    print(f"File: {f}")
    features = list_1d_features(file_path)
    print("Features of shape (# shots):")
    for feat in features:
        print(f"  {feat}")

Found shot_number dataset with 168553 shots.
Features of shape (# shots):
  all_samples_sum
  altitude_instrument
  altitude_instrument_error
  beam
  bounce_time_offset_bin0
  bounce_time_offset_bin0_error
  bounce_time_offset_lastbin
  bounce_time_offset_lastbin_error
  channel
  degrade
  delta_time
  digital_elevation_model
  digital_elevation_model_srtm
  dynamic_atmosphere_correction
  elevation_bin0
  elevation_bin0_error
  elevation_lastbin
  elevation_lastbin_error
  geoid
  latitude_bin0
  latitude_bin0_error
  latitude_instrument
  latitude_instrument_error
  latitude_lastbin
  latitude_lastbin_error
  local_beam_azimuth
  local_beam_azimuth_error
  local_beam_elevation
  local_beam_elevation_error
  longitude_bin0
  longitude_bin0_error
  longitude_instrument
  longitude_instrument_error
  longitude_lastbin
  longitude_lastbin_error
  master_frac
  master_int
  mean_sea_surface
  neutat_delay_derivative_bin0
  neutat_delay_derivative_lastbin
  neutat_delay_total_bin0
  neut

In [None]:
def list_features_by_shot_count(file_path, shot_number_key='shot_number'):
    """
    List the full paths of datasets whose shape is tied to the shot count.

    The shot count is taken from the last-visited dataset whose path ends with
    ``shot_number_key``. A dataset matches when its shape is one of:
      - (# shots,)
      - (# shots, n) or (n, # shots)

    Parameters:
      file_path (str): Path to the HDF5 file.
      shot_number_key (str): Key name (or suffix) used to identify the shot number dataset.

    Returns:
      List[str]: Matching dataset paths in visit order. An empty list if no
      shot number dataset is found.
    """
    # One traversal is enough: keep (path, shape) for every dataset.
    dataset_shapes = []

    def record(path, obj):
        if isinstance(obj, h5py.Dataset):
            dataset_shapes.append((path, obj.shape))

    with h5py.File(file_path, 'r') as h5:
        h5.visititems(record)

    shot_shapes = [shape for path, shape in dataset_shapes
                   if path.endswith(shot_number_key)]
    if not shot_shapes:
        print(f"Error: Could not find a dataset ending with '{shot_number_key}'.")
        return []

    shot_count = shot_shapes[-1][0]
    print(f"Found shot_number dataset with {shot_count} shots.")

    # For rank 1 this is "length == shot_count"; for rank 2 it is "either
    # dimension == shot_count". Any other rank is rejected.
    return [
        path
        for path, shape in dataset_shapes
        if len(shape) in (1, 2) and shot_count in shape
    ]

# Example usage: print every dataset path tied to the shot count, per granule.
for f in input_file_names:
    # Single assignment (the original assigned `file_path` twice on one line).
    file_path = os.path.join(inDir, f)
    # Name the granule so the listings of different files can be told apart.
    print(f"File: {f}")
    features = list_features_by_shot_count(file_path)
    print("Features matching the shot count criteria:")
    for feature in features:
        print(f"  {feature}")

#### Helper functions for the main loop

In [4]:
# -----------------------------------------------------------------------------
# HELPER FUNCTIONS
# -----------------------------------------------------------------------------

def collect_all_datasets(file_path):
    """
    Walk the whole HDF5 file at ``file_path`` and gather the path of every dataset.

    Groups are traversed but not reported. The file is opened read-only and
    closed again before returning.

    Returns:
      List[str]: Dataset paths in the order h5py visits them.
    """
    found = []
    with h5py.File(file_path, 'r') as h5:
        # The callback always returns None, so visititems walks the full tree.
        h5.visititems(
            lambda path, obj: found.append(path) if isinstance(obj, h5py.Dataset) else None
        )
    return found

def get_dataset_by_name(h5_file, beam_ds, name):
    """
    Read the first dataset in ``beam_ds`` whose path ends with '/{name}'.

    Parameters:
      h5_file: Open HDF5 file (anything indexable by dataset path).
      beam_ds: Iterable of candidate dataset path strings.
      name (str): Final path component of the wanted dataset.

    Returns:
      The dataset contents read fully into memory, or None (after printing a
      warning) when no path matches.
    """
    suffix = f'/{name}'
    match = next((path for path in beam_ds if path.endswith(suffix)), None)
    if match is None:
        print(f"Warning: No dataset ending with '/{name}' found.")
        return None
    # Indexing with an empty tuple reads the whole dataset into memory.
    return h5_file[match][()]

def load_attributes_to_dict(txt_file_path):
    """
    Build an identity mapping from the attribute names listed in a text file.

    The file holds one attribute name per line. Surrounding whitespace is
    stripped and blank lines are skipped. Each name maps to itself, i.e. the
    dataframe column label equals the dataset suffix.

    Returns:
      dict[str, str]: {name: name} in file order (duplicates collapse).
    """
    with open(txt_file_path, 'r') as handle:
        stripped = (raw.strip() for raw in handle)
        return {entry: entry for entry in stripped if entry}

In [9]:
# -----------------------------------------------------------------------------
# LOAD L1B FEATURES
# -----------------------------------------------------------------------------

# The L1B feature list lives in a plain-text file, one dataset name per line.
# It is turned into a {column label: dataset suffix} mapping for the main loop.
txt_path = 'L1B_features.txt'
ATTRIBUTES_TO_LOAD = load_attributes_to_dict(txt_path)
print(f"Loaded L1B attributes: {ATTRIBUTES_TO_LOAD}")

Loaded L1B attributes: {'all_samples_sum': 'all_samples_sum', 'beam': 'beam', 'channel': 'channel', 'master_frac': 'master_frac', 'master_int': 'master_int', 'noise_mean_corrected': 'noise_mean_corrected', 'noise_stddev_corrected': 'noise_stddev_corrected', 'nsemean_even': 'nsemean_even', 'nsemean_odd': 'nsemean_odd', 'rx_energy': 'rx_energy', 'rx_offset': 'rx_offset', 'rx_open': 'rx_open', 'rx_sample_count': 'rx_sample_count', 'rx_sample_start_index': 'rx_sample_start_index', 'selection_stretchers_x': 'selection_stretchers_x', 'selection_stretchers_y': 'selection_stretchers_y', 'shot_number': 'shot_number', 'stale_return_flag': 'stale_return_flag', 'th_left_used': 'th_left_used', 'tx_egamplitude': 'tx_egamplitude', 'tx_egamplitude_error': 'tx_egamplitude_error', 'tx_egbias': 'tx_egbias', 'tx_egbias_error': 'tx_egbias_error', 'tx_egflag': 'tx_egflag', 'tx_eggamma': 'tx_eggamma', 'tx_eggamma_error': 'tx_eggamma_error', 'tx_egsigma': 'tx_egsigma', 'tx_egsigma_error': 'tx_egsigma_error', 

#### Main loop, collects all target features and puts them inside a Pandas dataframe.
WARNING: Despite my best attempts at optimization, this is *extremely* memory intensive even for one 5GB HDF5 file. Only run on a supercomputer.

In [12]:
# -----------------------------------------------------------------------------
# PROCESS L1B FILES
# -----------------------------------------------------------------------------

# Column labels for the five rows of the 'surface_type' dataset, in row order.
SURFACE_TYPE_COLUMNS = (
    'surface_land',
    'surface_ocean',
    'surface_sea_ice',
    'surface_land_ice',
    'surface_inland_water',
)

# Categorical columns to one-hot encode, mapped to the prefix handed to
# pd.get_dummies. The empty prefix is kept for backward compatibility; it
# yields column names such as '_BEAM0000'.
ONE_HOT_PREFIXES = {'Beam Name': '', 'channel': 'channel'}


def _surface_type_columns(surface_data, row_count):
    """Split 'surface_type' into five per-shot columns.

    Accepts an array shaped (5, row_count) or (row_count, 5). If the dataset is
    missing (None) or has any other shape, a warning is printed and all five
    columns are filled with NaN, so the beam can still be assembled.
    """
    n_types = len(SURFACE_TYPE_COLUMNS)
    if surface_data is None:
        print("    Warning: 'surface_type' dataset missing. Filling with NaNs.")
    else:
        surface_data = np.asarray(surface_data)
        print(f'surface_type shape: {np.shape(surface_data)}')
        # Bring the array into (row_count, 5) orientation.
        if surface_data.shape == (n_types, row_count):
            surface_data = surface_data.T
        if surface_data.shape == (row_count, n_types):
            print(f'Transposed surface_type shape: {np.shape(surface_data)}')
            print('Successfully processed surface_type!')
            return {
                label: surface_data[:, idx]
                for idx, label in enumerate(SURFACE_TYPE_COLUMNS)
            }
        # The original only warned here and then indexed the array anyway.
        print("    Warning: Unexpected shape for 'surface_type'. "
              "Expected (5, #shots) or (#shots, 5). Filling with NaNs.")
    return {label: np.full(row_count, np.nan) for label in SURFACE_TYPE_COLUMNS}


def _one_hot_encode_shared(frames, prefixes):
    """One-hot encode `prefixes` columns using one category set for all frames.

    Each per-beam frame holds a single beam name (and typically one channel).
    Encoding each frame on its own therefore creates only that frame's dummy
    column, and concatenating the frames later fills every other dummy column
    with NaN. Encoding against the union of the values seen in all frames gives
    every frame the same dummy columns with proper True/False values.
    """
    for column, prefix in prefixes.items():
        levels = set()
        for frame in frames:
            if column in frame.columns:
                levels.update(frame[column].dropna().unique())
        categories = sorted(levels)
        for idx, frame in enumerate(frames):
            if column in frame.columns:
                # A categorical with explicit categories makes get_dummies emit
                # one column per category, including values absent in this frame.
                frame[column] = pd.Categorical(frame[column], categories=categories)
                frames[idx] = pd.get_dummies(frame, columns=[column], prefix=prefix)
    return frames


dataframes = []

# Process each L1B HDF5 file (assumed pre-loaded into input_files)
for f in input_files:
    print(f"\nProcessing L1B file: {f.filename}")

    # Collect all dataset paths in the current file
    all_ds_for_file = collect_all_datasets(f.filename)

    # Process each beam in the file
    for b in files_to_beams[f]:
        print(f"  Processing beam: {b}")

        # Keep only the paths inside this beam's group. A prefix match on
        # '<beam>/' is used rather than a substring test, so a beam name that
        # appears elsewhere in a path cannot be picked up by mistake.
        beam_prefix = f"{b}/"
        beam_ds = [ds for ds in all_ds_for_file
                   if ds.lstrip('/').startswith(beam_prefix)]

        # 1) Retrieve the shot_number dataset (1D array; one entry per shot)
        shotNums = get_dataset_by_name(f, beam_ds, 'shot_number')
        if shotNums is None:
            print(f"    Warning: No 'shot_number' dataset found for beam {b}. Skipping beam.")
            continue
        shotNums = np.asarray(shotNums)
        row_count = len(shotNums)
        print(f"    Number of shots: {row_count}")

        # 2) Build constant columns: File Name and Beam Name
        file_name_col = [os.path.basename(f.filename)] * row_count
        beam_name_col = [b] * row_count

        # 3) Extract all 1D features specified in the L1B features list
        attr_data = {}
        for col_label, ds_suffix in ATTRIBUTES_TO_LOAD.items():
            data_array = get_dataset_by_name(f, beam_ds, ds_suffix)
            if data_array is None:
                print(f"    Warning: Missing dataset for {ds_suffix}. Filling with None.")
                attr_data[col_label] = np.full(row_count, None)
                continue
            data_array = np.asarray(data_array)
            # Only 1-D arrays with one value per shot can become a column. The
            # original warned and stored the array anyway, which made the
            # DataFrame constructor below raise.
            if data_array.ndim != 1 or len(data_array) != row_count:
                print(f"    Warning: {col_label} in beam {b} has shape {data_array.shape}, "
                      f"expected ({row_count},). Filling with None.")
                attr_data[col_label] = np.full(row_count, None)
                continue
            attr_data[col_label] = data_array

        # 4) Extract the 'surface_type' feature (2D array with shape (5, # of shots))
        surface_data = get_dataset_by_name(f, beam_ds, 'surface_type')
        surface_cols = _surface_type_columns(surface_data, row_count)

        # 5) Construct the DataFrame for the current beam by combining all columns
        df = pd.DataFrame({
            'File Name': file_name_col,
            'Beam Name': beam_name_col,
            'Shot Number': shotNums,
            **attr_data,
            **surface_cols
        })

        dataframes.append(df)
        print(f"    Processed beam {b} with {row_count} shots.")

# 6) One-hot encode categorical features (Beam Name, and channel if present).
#    This runs after all beams are loaded so every frame shares the same dummy
#    columns. The dummy column names are unchanged; they now sit grouped at the
#    end of each frame.
dataframes = _one_hot_encode_shared(dataframes, ONE_HOT_PREFIXES)


Processing L1B file: /oscar/scratch/jzhu118/GEDI_Outlier_Detection_OSCAR/Input_files/GEDI01_B_2022004042652_O17343_04_T10772_02_005_02_V002.h5
  Processing beam: BEAM0000
    Number of shots: 168553
surface_type shape: (5, 168553)
Transposed surface_type shape: (168553, 5)
Successfully processed surface_type!
    Processed beam BEAM0000 with 168553 shots.
  Processing beam: BEAM0001
    Number of shots: 168565
surface_type shape: (5, 168565)
Transposed surface_type shape: (168565, 5)
Successfully processed surface_type!
    Processed beam BEAM0001 with 168565 shots.
  Processing beam: BEAM0010
    Number of shots: 168671
surface_type shape: (5, 168671)
Transposed surface_type shape: (168671, 5)
Successfully processed surface_type!
    Processed beam BEAM0010 with 168671 shots.
  Processing beam: BEAM0011
    Number of shots: 168649
surface_type shape: (5, 168649)
Transposed surface_type shape: (168649, 5)
Successfully processed surface_type!
    Processed beam BEAM0011 with 168649 sho

In [13]:
# -----------------------------------------------------------------------------
# FINALIZE: Concatenate all partial DataFrames into one complete DataFrame
# -----------------------------------------------------------------------------

if dataframes:
    complete_df = pd.concat(dataframes, ignore_index=True)
    print(f"\nFinal DataFrame shape: {complete_df.shape}")

    # Persist the raw, concatenated frame for the later processing steps.
    # This is done only when data was actually extracted, so an empty run can
    # neither overwrite an earlier good file nor report a successful write.
    complete_df.to_parquet("L1B_raw.parquet", engine="pyarrow", compression="snappy")
    print('Successfully wrote to a .parquet file!')
else:
    # Keep `complete_df` defined so later cells fail on missing columns rather
    # than on a missing name.
    complete_df = pd.DataFrame()
    print("\nNo data extracted from L1B files.")


Final DataFrame shape: (1820650, 97)
Successfully wrote to a .parquet file!


#### Feature engineering
1. Adding Z-Scores for the six most relevant RH metrics (RH25, RH50, RH75, RH85, RH95, RH100)
2. Creating RH50 / RH100 ratio
3. Adding RH95 - RH50
4. Adding "Missingness" - the number of NaNs in each row

In [None]:
# Count the shots whose RH_100 value is exactly zero.
num_zeros = complete_df['RH_100'].eq(0).sum()
print(f"Number of zeros in RH_100: {num_zeros}")

In [None]:
# Add a Z-score column, z = (x - mean) / std, for each of the six RH metrics
# listed below. The new column is named '<metric> Z Score'.
rh_nums = [25, 50, 75, 85, 95, 100]
for col_name in (f'RH_{level}' for level in rh_nums):
    rh_values = complete_df[col_name]
    complete_df[f'{col_name} Z Score'] = (rh_values - rh_values.mean()) / rh_values.std()

complete_df

In [None]:
# Add the RH_50 / RH_100 ratio feature.
#
# A shot with RH_100 == 0 has no meaningful ratio. Dividing by zero would put
# +/-inf into the column, and those values are not counted by isna()/dropna().
# Zero denominators are therefore masked as NaN first, so the affected rows
# end up as ordinary missing values.
rh_100_nonzero = complete_df['RH_100'].replace(0, np.nan)
complete_df['RH_50_v_100'] = complete_df['RH_50'] / rh_100_nonzero
complete_df

In [None]:
# Add the (RH95 - RH50) feature: the difference between the two RH columns.
rh_50 = complete_df['RH_50']
rh_95 = complete_df['RH_95']
complete_df['RH_95_minus_50'] = rh_95.sub(rh_50)
complete_df

In [None]:
# Add the 'Missingness' feature: the number of NaN entries in each row.
complete_df['Missingness'] = complete_df.isnull().sum(axis='columns')

# Show how many rows have 0, 1, 2, ... missing values.
print('Number of NaNs:')
print(complete_df['Missingness'].value_counts())
complete_df

In [8]:
# Sanity checks, then save the frame to a parquet file.
columns = complete_df.columns

# Report column labels that occur more than once.
is_duplicate = columns.duplicated()
duplicate_columns = columns[is_duplicate]
print("Duplicate columns:", duplicate_columns)
print(f'There are {is_duplicate.sum()} duplicated columns.')

# Per-column count of missing values.
print('NaN value count:\n')
print(complete_df.isnull().sum())

# The write below is active: running this cell always (re)creates TEST.parquet.
complete_df.to_parquet("TEST.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')

Duplicate columns: Index([], dtype='object')
There are 0 duplicated columns.
NaN value count:

File Name              0
Shot Number            0
RH_25                  0
RH_50                  0
RH_75                  0
                  ...   
channel_3        1885151
beam_BEAM1000    1893679
channel_4        1893679
beam_BEAM1011    1894845
channel_5        1894845
Length: 100, dtype: int64
Successfully wrote to a .parquet file!


#### This is the filtering step for PCA if there are any NaN rows in the dataframe
Note: there should not be many NaN values if you accounted for the different shapes of the data inside the HDF5 files.

In [3]:
'''Optional: load complete_df from .parquet file'''
# Replaces any in-memory `complete_df` with the contents of 'input_raw.parquet'.
# NOTE(review): no cell visible in this notebook writes 'input_raw.parquet'
# (the cells above write 'L1B_raw.parquet' and 'TEST.parquet'). Presumably the
# file comes from another preprocessing run -- confirm it is the intended input.
complete_df = pd.read_parquet('input_raw.parquet', engine='pyarrow')
complete_df

Unnamed: 0,File Name,Beam Name,Shot Number,RH_25,RH_50,RH_75,RH_85,RH_95,RH_100,channel,...,zcross_localenergy,RH_25 Z Score,RH_50 Z Score,RH_75 Z Score,RH_85 Z Score,RH_95 Z Score,RH_100 Z Score,RH_50_v_100,RH_95_minus_50,Missingness
0,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050304,-0.78,0.00,0.78,1.19,1.79,2.36,0,...,163.479019,-0.267243,-0.224966,-0.212232,-0.213468,-0.214492,-0.240572,0.000000,1.79,0
1,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050305,-0.97,-0.07,0.78,1.23,1.87,2.54,0,...,178.583328,-0.352941,-0.243600,-0.212232,-0.206578,-0.202634,-0.216597,-0.027559,1.94,0
2,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050306,-0.86,-0.07,0.71,1.12,1.72,2.32,0,...,203.563889,-0.303326,-0.243600,-0.225654,-0.225526,-0.224867,-0.245900,-0.030172,1.79,0
3,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050307,-0.86,-0.07,0.74,1.16,1.79,2.47,0,...,221.641510,-0.303326,-0.243600,-0.219902,-0.218636,-0.214492,-0.225921,-0.028340,1.86,0
4,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050308,-0.89,-0.03,0.78,1.23,1.94,2.73,0,...,268.898804,-0.316857,-0.232952,-0.212232,-0.206578,-0.192258,-0.191289,-0.010989,1.97,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11607182,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,224961100200293934,-1.23,-0.18,0.74,1.23,2.02,3.70,5,...,2616.521729,-0.470212,-0.272881,-0.219902,-0.206578,-0.180400,-0.062087,-0.048649,2.20,0
11607183,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,224961100200293935,-1.23,-0.22,0.71,1.16,1.94,3.70,5,...,2879.136963,-0.470212,-0.283529,-0.225654,-0.218636,-0.192258,-0.062087,-0.059459,2.16,0
11607184,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,224961100200293936,-1.27,-0.22,0.74,1.23,2.02,3.85,5,...,2619.544189,-0.488253,-0.283529,-0.219902,-0.206578,-0.180400,-0.042108,-0.057143,2.24,0
11607185,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,224961100200293937,-1.16,-0.14,0.78,1.27,2.05,3.81,5,...,2381.172363,-0.438639,-0.262233,-0.212232,-0.199688,-0.175953,-0.047435,-0.036745,2.19,0


In [4]:
# from sklearn.impute import SimpleImputer

print(f'Original dataframe shape: {complete_df.shape}')

# --- Filtering ---------------------------------------------------------------
# Drop the shot identifier. A non-mutating drop with errors='ignore' keeps this
# cell re-runnable: the original in-place drop raised KeyError on a second run
# because the column was already gone. `complete_df` still ends up without
# 'Shot Number', exactly as before.
complete_df = complete_df.drop(columns=['Shot Number'], errors='ignore')
print("Shot numbers dropped")

# dropna(axis=1) removes every COLUMN that contains at least one NaN
# (rows are left untouched).
discounted_df = complete_df.dropna(axis=1)

# Alternative to dropping columns: impute the missing values instead.
# imputer = SimpleImputer(strategy='most_frequent')  # You can change to 'median', 'most_frequent', etc.
# discounted_df = pd.DataFrame(imputer.fit_transform(complete_df), columns=complete_df.columns)

print("Dataframe shape after dropping NaN columns:", discounted_df.shape)

# Step 1: Separate features and target (if applicable)
# Keep numeric columns only; non-numeric columns are excluded.
filtered_df = discounted_df.select_dtypes(include=[np.number])
# columns_to_keep = [col for col in numeric_df.columns if 'RH' not in col]
# filtered_df = filtered_df[columns_to_keep]

print(f'Non-numeric columns removed from dataframe\nCleaned dataframe size: {filtered_df.shape}')
filtered_df

Original dataframe shape: (11607187, 97)
Shot numbers dropped
Dataframe shape after dropping NaN columns: (11607187, 87)
Non-numeric columns removed from dataframe
Cleaned dataframe size: (11607187, 85)


Unnamed: 0,RH_25,RH_50,RH_75,RH_85,RH_95,RH_100,channel,degrade_flag,delta_time,digital_elevation_model,...,zcross_amp,zcross_localenergy,RH_25 Z Score,RH_50 Z Score,RH_75 Z Score,RH_85 Z Score,RH_95 Z Score,RH_100 Z Score,RH_95_minus_50,Missingness
0,-0.78,0.00,0.78,1.19,1.79,2.36,0,0,4.146742e+07,-999999.000000,...,296.142761,163.479019,-0.267243,-0.224966,-0.212232,-0.213468,-0.214492,-0.240572,1.79,0
1,-0.97,-0.07,0.78,1.23,1.87,2.54,0,0,4.146742e+07,-999999.000000,...,302.973358,178.583328,-0.352941,-0.243600,-0.212232,-0.206578,-0.202634,-0.216597,1.94,0
2,-0.86,-0.07,0.71,1.12,1.72,2.32,0,0,4.146742e+07,-999999.000000,...,302.514313,203.563889,-0.303326,-0.243600,-0.225654,-0.225526,-0.224867,-0.245900,1.79,0
3,-0.86,-0.07,0.74,1.16,1.79,2.47,0,0,4.146742e+07,-999999.000000,...,309.622925,221.641510,-0.303326,-0.243600,-0.219902,-0.218636,-0.214492,-0.225921,1.86,0
4,-0.89,-0.03,0.78,1.23,1.94,2.73,0,0,4.146742e+07,-999999.000000,...,320.783600,268.898804,-0.316857,-0.232952,-0.212232,-0.206578,-0.192258,-0.191289,1.97,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11607182,-1.23,-0.18,0.74,1.23,2.02,3.70,5,0,1.552156e+08,254.929123,...,1057.588623,2616.521729,-0.470212,-0.272881,-0.219902,-0.206578,-0.180400,-0.062087,2.20,0
11607183,-1.23,-0.22,0.71,1.16,1.94,3.70,5,0,1.552156e+08,255.024338,...,1098.721924,2879.136963,-0.470212,-0.283529,-0.225654,-0.218636,-0.192258,-0.062087,2.16,0
11607184,-1.27,-0.22,0.74,1.23,2.02,3.85,5,0,1.552156e+08,255.024338,...,1065.043213,2619.544189,-0.488253,-0.283529,-0.219902,-0.206578,-0.180400,-0.042108,2.24,0
11607185,-1.16,-0.14,0.78,1.27,2.05,3.81,5,0,1.552156e+08,253.656616,...,965.210388,2381.172363,-0.438639,-0.262233,-0.212232,-0.199688,-0.175953,-0.047435,2.19,0


In [6]:
# Step 2, OPTION 1: Standardize the data using StandardScaler (for demo only).
#
# The scaler lines used to be commented out while the code below still read
# `scaled_data`, so the cell only worked with a leftover kernel variable and
# raised NameError after "Restart & Run All". They are restored here so the
# cell is self-contained.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_data = scaler.fit_transform(filtered_df)
print(f'Data scaled\nScaled data size (ndarray): {scaled_data.shape}')

# Wrap the scaled ndarray back into a dataframe with the original labels.
scaled_df = pd.DataFrame(scaled_data, columns=filtered_df.columns, index=filtered_df.index)
print("Made new scaled dataframe")

# Save the standard-scaled frame. The bare `scaled_df` / `scaled_df.describe()`
# statements that preceded this write were removed: they were not the last
# expression of the cell, so their results were computed and then discarded.
scaled_df.to_parquet("input_standard_scaled.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')

Made new scaled dataframe
Successfully wrote to a .parquet file!


In [13]:
# OPTION 2: Scale the data using RobustScaler (the author's preferred option
# for outlier detection). Overwrites any `scaled_df` produced by Option 1.
from sklearn.preprocessing import RobustScaler

# `filtered_df` is the cleaned, numeric-only dataframe built in the filtering cell.
robust_scaler = RobustScaler()
print("Instantiated RobustScalar")

# Fit the scaler on the dataframe and transform it in one step.
scaled_array = robust_scaler.fit_transform(filtered_df)
print("Fitted RobustScalar")

# Wrap the scaled array back into a DataFrame with the original column labels
# and index.
scaled_df = pd.DataFrame(scaled_array, columns=filtered_df.columns, index=filtered_df.index)
print("Made new scaled dataframe")
# NOTE(review): the two expressions below are not the last statement of the
# cell, so with default notebook settings their values are not displayed.
scaled_df
scaled_df.describe()

# Persist the scaled frame. These lines are active, so the file is written on
# every run of this cell.
scaled_df.to_parquet("input_scaled.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')

Instantiated RobustScalar
Fitted RobustScalar
Made new scaled dataframe
Successfully wrote to a .parquet file!
