# Preprocessing L4A
We will extract all relevant datasets from all HDF5 files and convert them into a readable Pandas dataframe. We will also do some preliminary data cleaning and PCA analysis. Everything will be outputted as *.parquet* files. This dramatically reduces the memory usage in the outlier detector scripts.

In [None]:
import os
import random

import h5py
import numpy as np
import pandas as pd
import geopandas as gp
# import geoviews as gv
# from geoviews import opts, tile_sources as gvts
# import holoviews as hv
# gv.extension('bokeh', 'matplotlib')
from shapely.geometry import Point
import warnings
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 

import matplotlib
import matplotlib.pyplot as plt
# import PyQt6

In [None]:
inDir = os.getcwd() + "/Input_files"
print(inDir)
input_file_names = [g for g in os.listdir(inDir) if g.startswith('GEDI04_A') and g.endswith('.h5')]  # List all GEDI level 2 files in inDir
input_file_names

In [9]:
def print_dataset_shapes(file_path):
    """
    Opens the given HDF5 file and prints the name and shape of each dataset.
    """
    with h5py.File(file_path, 'r') as f:
        def visitor_func(name, node):
            if isinstance(node, h5py.Dataset):
                # You can also print node.dtype if you want to see data type
                print(f"Dataset: {name:50} Shape: {node.shape}")
        f.visititems(visitor_func)

# Example usage:
for f in input_file_names:
    file_path = file_path = os.path.join(inDir, f)
    print_dataset_shapes(file_path)

Dataset: ANCILLARY/model_data                               Shape: (35,)
Dataset: ANCILLARY/pft_lut                                  Shape: (7,)
Dataset: ANCILLARY/region_lut                               Shape: (7,)
Dataset: BEAM0000/agbd                                      Shape: (167298,)
Dataset: BEAM0000/agbd_pi_lower                             Shape: (167298,)
Dataset: BEAM0000/agbd_pi_upper                             Shape: (167298,)
Dataset: BEAM0000/agbd_prediction/agbd_a1                   Shape: (167298,)
Dataset: BEAM0000/agbd_prediction/agbd_a10                  Shape: (167298,)
Dataset: BEAM0000/agbd_prediction/agbd_a2                   Shape: (167298,)
Dataset: BEAM0000/agbd_prediction/agbd_a3                   Shape: (167298,)
Dataset: BEAM0000/agbd_prediction/agbd_a4                   Shape: (167298,)
Dataset: BEAM0000/agbd_prediction/agbd_a5                   Shape: (167298,)
Dataset: BEAM0000/agbd_prediction/agbd_a6                   Shape: (167298,)
Dataset: BEAM

In [8]:
def list_1d_features(file_path, shot_number_key='shot_number'):
    """
    Returns a sorted list of base feature names from the HDF5 file where each dataset:
      - is 1-dimensional and
      - has length equal to the number of shots determined from the shot_number dataset.
    
    Parameters:
      file_path (str): Path to the HDF5 file.
      shot_number_key (str): Suffix used to locate the shot number dataset.
      
    Returns:
      List[str]: Sorted list of feature base names.
    """
    feature_names = set()
    
    with h5py.File(file_path, 'r') as f:
        # First, locate the shot_number dataset to determine the shot count.
        shot_num_ds = None
        
        def find_shot_ds(name, node):
            nonlocal shot_num_ds
            if isinstance(node, h5py.Dataset) and name.endswith(shot_number_key):
                shot_num_ds = node
        
        f.visititems(find_shot_ds)
        
        if shot_num_ds is None:
            print(f"Error: Could not find a dataset ending with '{shot_number_key}'.")
            return []
        
        shot_count = shot_num_ds.shape[0]
        print(f"Found shot_number dataset with {shot_count} shots.")
        
        # Now, visit every dataset and check if it is 1D with length equal to shot_count.
        def visitor_func(name, node):
            if isinstance(node, h5py.Dataset):
                shape = node.shape
                if len(shape) == 1 and shape[0] == shot_count:
                    # Extract the base name (the part after the last '/')
                    base_name = name.split('/')[-1]
                    feature_names.add(base_name)
        
        f.visititems(visitor_func)
    
    return sorted(feature_names)

# Example usage:
for f in input_file_names:
    file_path = file_path = os.path.join(inDir, f)
    features = list_1d_features(file_path)
    print("Features of shape (# shots):")
    for feat in features:
        print(f"  {feat}")


Found shot_number dataset with 167113 shots.
Features of shape (# shots):
  agbd
  agbd_a1
  agbd_a10
  agbd_a2
  agbd_a3
  agbd_a4
  agbd_a5
  agbd_a6
  agbd_pi_lower
  agbd_pi_lower_a1
  agbd_pi_lower_a10
  agbd_pi_lower_a2
  agbd_pi_lower_a3
  agbd_pi_lower_a4
  agbd_pi_lower_a5
  agbd_pi_lower_a6
  agbd_pi_upper
  agbd_pi_upper_a1
  agbd_pi_upper_a10
  agbd_pi_upper_a2
  agbd_pi_upper_a3
  agbd_pi_upper_a4
  agbd_pi_upper_a5
  agbd_pi_upper_a6
  agbd_se
  agbd_se_a1
  agbd_se_a10
  agbd_se_a2
  agbd_se_a3
  agbd_se_a4
  agbd_se_a5
  agbd_se_a6
  agbd_t
  agbd_t_a1
  agbd_t_a10
  agbd_t_a2
  agbd_t_a3
  agbd_t_a4
  agbd_t_a5
  agbd_t_a6
  agbd_t_pi_lower_a1
  agbd_t_pi_lower_a10
  agbd_t_pi_lower_a2
  agbd_t_pi_lower_a3
  agbd_t_pi_lower_a4
  agbd_t_pi_lower_a5
  agbd_t_pi_lower_a6
  agbd_t_pi_upper_a1
  agbd_t_pi_upper_a10
  agbd_t_pi_upper_a2
  agbd_t_pi_upper_a3
  agbd_t_pi_upper_a4
  agbd_t_pi_upper_a5
  agbd_t_pi_upper_a6
  agbd_t_se
  agbd_t_se_a1
  agbd_t_se_a10
  agbd_t_se_a

In [11]:
def list_features_by_shot_count(file_path, shot_number_key='shot_number'):
    """
    Returns a list of dataset names from the given HDF5 file that have shapes matching:
      - (# shots,) or
      - (# shots, n) or (n, # shots)
    
    Parameters:
      file_path (str): Path to the HDF5 file.
      shot_number_key (str): Key name (or suffix) used to identify the shot number dataset.
    
    Returns:
      List[str]: List of dataset names that meet the criteria.
    """
    matching_features = []
    
    with h5py.File(file_path, 'r') as f:
        # First, locate the shot_number dataset to determine the number of shots.
        shot_num_ds = None
        def find_shot_ds(name, node):
            nonlocal shot_num_ds
            if isinstance(node, h5py.Dataset) and name.endswith(shot_number_key):
                shot_num_ds = node
        f.visititems(find_shot_ds)
        
        if shot_num_ds is None:
            print(f"Error: Could not find a dataset ending with '{shot_number_key}'.")
            return matching_features
        
        shot_count = shot_num_ds.shape[0]
        print(f"Found shot_number dataset with {shot_count} shots.")

        # Now, traverse all datasets and check their shapes.
        def visitor_func(name, node):
            if isinstance(node, h5py.Dataset):
                shape = node.shape
                # # Check for 1D dataset with length equal to shot_count.
                # if len(shape) == 1 and shape[0] == shot_count:
                #     matching_features.append(name)
                # # Check for 2D dataset where one of the dimensions equals shot_count.
                # elif len(shape) == 2 and (shape[0] == shot_count or shape[1] == shot_count):
                #     matching_features.append(name)

                if len(shape) == 2 and (shape[0] == shot_count or shape[1] == shot_count):
                    matching_features.append(name)
                # # Check for 2D dataset where one of the dimensions equals shot_count.
                # elif 
                #     matching_features.append(name)
        
        f.visititems(visitor_func)
    
    return matching_features

# Example usage:
for f in input_file_names:
    file_path = file_path = os.path.join(inDir, f)
    features = list_features_by_shot_count(file_path)
    print("Features matching the shot count criteria:")
    for feature in features:
        print(f"  {feature}")

Found shot_number dataset with 167113 shots.
Features matching the shot count criteria:
  BEAM1011/agbd_prediction/xvar_a1
  BEAM1011/agbd_prediction/xvar_a10
  BEAM1011/agbd_prediction/xvar_a2
  BEAM1011/agbd_prediction/xvar_a3
  BEAM1011/agbd_prediction/xvar_a4
  BEAM1011/agbd_prediction/xvar_a5
  BEAM1011/agbd_prediction/xvar_a6
  BEAM1011/xvar
Found shot_number dataset with 124578 shots.
Features matching the shot count criteria:
  BEAM1011/agbd_prediction/xvar_a1
  BEAM1011/agbd_prediction/xvar_a10
  BEAM1011/agbd_prediction/xvar_a2
  BEAM1011/agbd_prediction/xvar_a3
  BEAM1011/agbd_prediction/xvar_a4
  BEAM1011/agbd_prediction/xvar_a5
  BEAM1011/agbd_prediction/xvar_a6
  BEAM1011/xvar


### Loading files with all information into a huge Pandas dataframe

#### Preprocessing: Get files and sort by beam

In [12]:
input_files = []
files_to_beams = dict()
for n in input_file_names:
    file_path = os.path.join(inDir, n)  # Select an example file
    file = h5py.File(file_path, 'r')
    input_files.append(file)
    
    print('Loading file: ' + n)
    print('The file contains the following groups: ' + str(list(file.keys())))
    
    print("The file's metadata contains the following attributes: ")
    for g in file['METADATA']['DatasetIdentification'].attrs: print(g)
    
    beamNames = [g for g in file.keys() if g.startswith('BEAM')]
    files_to_beams[file] = beamNames
    
    print("The file contains the following beams: ")
    for b in beamNames:
        print(f"{b} is a {file[b].attrs['description']}")

Loading file: GEDI04_A_2021009022644_O11762_03_T01637_02_002_02_V002.h5
The file contains the following groups: ['ANCILLARY', 'BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']
The file's metadata contains the following attributes: 
PGEVersion
VersionID
abstract
characterSet
creationDate
credit
fileName
gedi_l4a_githash
language
originatorOrganizationName
purpose
shortName
spatialRepresentationType
status
topicCategory
uuid
The file contains the following beams: 
BEAM0000 is a Coverage beam
BEAM0001 is a Coverage beam
BEAM0010 is a Coverage beam
BEAM0011 is a Coverage beam
BEAM0101 is a Full power beam
BEAM0110 is a Full power beam
BEAM1000 is a Full power beam
BEAM1011 is a Full power beam
Loading file: GEDI04_A_2022106075705_O18927_04_T10647_02_003_01_V002.h5
The file contains the following groups: ['ANCILLARY', 'BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']
The file'

#### Helper functions for the main loop

In [14]:
# -----------------------------------------------------------------------------
# HELPER FUNCTIONS
# -----------------------------------------------------------------------------

def collect_all_datasets(file_path):
    """
    Recursively collects all dataset paths within the HDF5 file.
    Returns a list of dataset path strings.
    """
    dataset_paths = []
    
    def visitor_func(name, node):
        if isinstance(node, h5py.Dataset):
            dataset_paths.append(name)
    
    with h5py.File(file_path, 'r') as f:
        f.visititems(visitor_func)
    
    return dataset_paths

def get_dataset_by_name(h5_file, beam_ds, name):
    """
    Given an open HDF5 file (h5_file) and a list of dataset paths (beam_ds),
    returns the data for the first dataset whose path ends with '/{name}'.
    If not found, returns None.
    """
    candidates = [ds for ds in beam_ds if ds.endswith(f'/{name}')]
    if not candidates:
        print(f"Warning: No dataset ending with '/{name}' found.")
        return None
    dataset_path = candidates[0]
    return h5_file[dataset_path][()]  # Read dataset into memory

def load_attributes_to_dict(txt_file_path):
    """
    Reads attribute names from a text file (one per line) and returns a dictionary.
    This dictionary maps the human-friendly column name to the dataset suffix.
    """
    with open(txt_file_path, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]
    
    attr_dict = {}
    for raw_attr in lines:
        attr_dict[raw_attr] = raw_attr  # Direct mapping for simplicity
    return attr_dict

In [16]:
# -------------------------
# PRE-EXTRACTION SETUP
# -------------------------

# Load L4A feature list from text file (all features here are 1D)
l4a_features_txt = 'L4A_features.txt'
ATTRIBUTES_TO_LOAD = load_attributes_to_dict(l4a_features_txt)
print("Loaded L4A 1D attributes:", ATTRIBUTES_TO_LOAD)

# Define the list of xvar metrics to extract (each is 2D with shape (# shots, 4))
xvar_features = ['xvar', 'xvar_a1', 'xvar_a2', 'xvar_a3', 
                 'xvar_a4', 'xvar_a5', 'xvar_a6', 'xvar_a10']

Loaded L4A 1D attributes: {'agbd': 'agbd', 'agbd_a1': 'agbd_a1', 'agbd_a10': 'agbd_a10', 'agbd_a2': 'agbd_a2', 'agbd_a3': 'agbd_a3', 'agbd_a4': 'agbd_a4', 'agbd_a5': 'agbd_a5', 'agbd_a6': 'agbd_a6', 'agbd_pi_lower': 'agbd_pi_lower', 'agbd_pi_lower_a1': 'agbd_pi_lower_a1', 'agbd_pi_lower_a10': 'agbd_pi_lower_a10', 'agbd_pi_lower_a2': 'agbd_pi_lower_a2', 'agbd_pi_lower_a3': 'agbd_pi_lower_a3', 'agbd_pi_lower_a4': 'agbd_pi_lower_a4', 'agbd_pi_lower_a5': 'agbd_pi_lower_a5', 'agbd_pi_lower_a6': 'agbd_pi_lower_a6', 'agbd_pi_upper': 'agbd_pi_upper', 'agbd_pi_upper_a1': 'agbd_pi_upper_a1', 'agbd_pi_upper_a10': 'agbd_pi_upper_a10', 'agbd_pi_upper_a2': 'agbd_pi_upper_a2', 'agbd_pi_upper_a3': 'agbd_pi_upper_a3', 'agbd_pi_upper_a4': 'agbd_pi_upper_a4', 'agbd_pi_upper_a5': 'agbd_pi_upper_a5', 'agbd_pi_upper_a6': 'agbd_pi_upper_a6', 'agbd_se': 'agbd_se', 'agbd_se_a1': 'agbd_se_a1', 'agbd_se_a10': 'agbd_se_a10', 'agbd_se_a2': 'agbd_se_a2', 'agbd_se_a3': 'agbd_se_a3', 'agbd_se_a4': 'agbd_se_a4', 'agbd

#### Main loop, collects all target features and puts them inside a Pandas dataframe.
WARNING: Despite my best attempts at optimization, this is *extremely* memory intensive even for one 5GB HDF5 file. Only run on a supercomputer.

In [17]:
# -------------------------
# MAIN EXTRACTION LOOP FOR L4A FILES
# -------------------------

# List to hold DataFrames for each beam
dataframes = []

for f in input_files:
    print(f"\nProcessing L4A file: {f.filename}")
    
    # Collect all dataset paths in the current file
    all_ds_for_file = collect_all_datasets(f.filename)
    
    # Process each beam in the file
    for b in files_to_beams[f]:
        print(f"  Processing beam: {b}")
        
        # Filter dataset paths to only those belonging to the current beam
        beam_ds = [ds for ds in all_ds_for_file if b in ds]
        
        # 1) Retrieve the shot_number dataset (1D array; one entry per shot)
        shotNums = get_dataset_by_name(f, beam_ds, 'shot_number')
        if shotNums is None:
            print(f"    Warning: No 'shot_number' dataset found for beam {b}. Skipping beam.")
            continue
        shotNums = np.array(shotNums)
        row_count = len(shotNums)
        print(f"    Number of shots: {row_count}")
        
        # 2) Build constant columns: File Name and Beam Name
        file_name_col = [os.path.basename(f.filename)] * row_count
        beam_name_col = [b] * row_count
        
        # 3) Extract all 1D features specified in the L4A feature list
        attr_data = {}
        for col_label, ds_suffix in ATTRIBUTES_TO_LOAD.items():
            data_array = get_dataset_by_name(f, beam_ds, ds_suffix)
            if data_array is None:
                print(f"    Warning: Missing dataset for {ds_suffix}. Filling with None.")
                attr_data[col_label] = np.full(row_count, None)
            else:
                data_array = np.array(data_array)
                if len(data_array) != row_count:
                    print(f"    Warning: Row count mismatch for {col_label} in beam {b}.")
                attr_data[col_label] = data_array
        
        # 4) Extract and process each xvar feature (each is 2D: (# shots, 4))
        xvar_data = {}
        for feature in xvar_features:
            data_array = get_dataset_by_name(f, beam_ds, feature)
            if data_array is None:
                print(f"    Warning: Missing xvar feature: {feature}. Filling with NaNs.")
                # Create 4 columns filled with NaN
                for i in range(1, 5):
                    xvar_data[f"{feature}_{i}"] = np.full(row_count, np.nan)
            else:
                data_array = np.array(data_array)
                # Check if the data shape needs transposition: expected shape is (row_count, 4)
                if data_array.shape[0] == 4 and data_array.shape[1] == row_count:
                    data_array = data_array.T
                elif data_array.shape[0] != row_count or data_array.shape[1] != 4:
                    print(f"    Warning: Unexpected shape for {feature}. Expected (# shots, 4). Got {data_array.shape}.")
                # Now split the 2D array into 4 separate 1D arrays
                for i in range(4):
                    col_name = f"{feature}_{i+1}"
                    xvar_data[col_name] = data_array[:, i]
        
        # 5) Construct the DataFrame for the current beam by combining all columns
        df = pd.DataFrame({
            'File Name': file_name_col,
            'Beam Name': beam_name_col,
            'Shot Number': shotNums,
            **attr_data,
            **xvar_data
        })
        
        # 6) One-hot encode categorical features, e.g., Beam Name and channel (if present)
        if 'Beam Name' in df.columns:
            df = pd.get_dummies(df, columns=['Beam Name'], prefix='Beam')
        if 'channel' in df.columns:
            df = pd.get_dummies(df, columns=['channel'], prefix='channel')
        
        dataframes.append(df)
        print(f"    Processed beam {b} with {row_count} shots.")

# Optionally, concatenate all DataFrames into one complete DataFrame:
if dataframes:
    complete_df = pd.concat(dataframes, ignore_index=True)
    print(f"\nFinal DataFrame shape: {complete_df.shape}")
else:
    complete_df = pd.DataFrame()
    print("\nNo data extracted from L4A files.")


Processing L4A file: /oscar/scratch/jzhu118/GEDI_Outlier_Detection_OSCAR/Input_files/GEDI04_A_2021009022644_O11762_03_T01637_02_002_02_V002.h5
  Processing beam: BEAM0000
    Number of shots: 167298
    Processed beam BEAM0000 with 167298 shots.
  Processing beam: BEAM0001
    Number of shots: 167288
    Processed beam BEAM0001 with 167288 shots.
  Processing beam: BEAM0010
    Number of shots: 167310
    Processed beam BEAM0010 with 167310 shots.
  Processing beam: BEAM0011
    Number of shots: 167328
    Processed beam BEAM0011 with 167328 shots.
  Processing beam: BEAM0101
    Number of shots: 167265
    Processed beam BEAM0101 with 167265 shots.
  Processing beam: BEAM0110
    Number of shots: 167030
    Processed beam BEAM0110 with 167030 shots.
  Processing beam: BEAM1000
    Number of shots: 166838
    Processed beam BEAM1000 with 166838 shots.
  Processing beam: BEAM1011
    Number of shots: 167113
    Processed beam BEAM1011 with 167113 shots.

Processing L4A file: /oscar/scr

In [20]:
# Optionally, save the complete DataFrame for further processing:
# Get a list of column names
columns = complete_df.columns

# Find duplicates
duplicate_columns = columns[columns.duplicated()]
print("Duplicate columns:", duplicate_columns)
print('There are ' + str(columns.duplicated().sum()) + ' duplicated columns.')
print('NaN value count:\n')
print(complete_df.isnull().sum())
complete_df.describe()

complete_df.to_parquet("L4A_raw.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')

Duplicate columns: Index([], dtype='object')
There are 0 duplicated columns.
NaN value count:

File Name              0
Shot Number            0
agbd                   0
agbd_a1                0
agbd_a10               0
                  ...   
channel_3        2045013
Beam_BEAM1000    2044800
channel_4        2044800
Beam_BEAM1011    2045670
channel_5        2045670
Length: 218, dtype: int64
Successfully wrote to a .parquet file!


#### Feature engineering
1. Adding Z-Scores for five most relevant RH metrics
2. Creating RH50 / RH100 ratio
3. Adding RH95 - RH50
4. Adding "Missingness" - the number of NaNs in each row

In [None]:
num_zeros = (complete_df['RH_100'] == 0).sum()
print(f"Number of zeros in RH_100: {num_zeros}")

In [None]:
'''Add Z-Scores of the five RH metrics'''
rh_nums = [25, 50, 75, 85, 95, 100]
for i in rh_nums:
    col_name = f'RH_{i}'
    complete_df[f'{col_name} Z Score'] = (complete_df[col_name] - complete_df[col_name].mean()) / complete_df[col_name].std()

complete_df

In [None]:
'''Adding the RH_50 / RH_100 feature'''
complete_df['RH_50_v_100'] = complete_df['RH_50'] / complete_df['RH_100']
complete_df

In [None]:
'''Adding the (RH95 - RH50) feature'''
rh_50 = complete_df['RH_50']
rh_95 = complete_df['RH_95']
complete_df['RH_95_minus_50'] = (rh_95 - rh_50)
complete_df

In [None]:
'''Adding the 'Missingness' feature'''
# Count the number of NaNs in each row
complete_df['Missingness'] = complete_df.isna().sum(axis=1)


# Optionally, inspect how many rows have missing data
print('Number of NaNs:')
print(complete_df['Missingness'].value_counts())
complete_df

In [None]:
'''Optional: Save to a parquet file'''
# Get a list of column names
columns = complete_df.columns

# Find duplicates
duplicate_columns = columns[columns.duplicated()]
print("Duplicate columns:", duplicate_columns)
print('There are ' + str(columns.duplicated().sum()) + ' duplicated columns.')
print('NaN value count:\n')
print(complete_df.isnull().sum())
# complete_df.describe()

# Uncomment the two lines below to write to parquet
complete_df.to_parquet("TEST.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')

#### This is the filtering step for PCA if there are any NaN rows in the dataframe
Note: there should not be mnany NaN values if you accounted for the different shapes of the data inside the HDF5 files.

In [None]:
'''Optional: load complete_df from .parquet file'''
complete_df = pd.read_parquet('input_raw.parquet', engine='pyarrow')
complete_df

In [None]:
# from sklearn.impute import SimpleImputer

print(f'Original dataframe shape: {complete_df.shape}')

'''Filtering'''
complete_df.drop('Shot Number', axis=1, inplace=True)
print("Shot numbers dropped")
discounted_df = complete_df.dropna(axis=1)


# imputer = SimpleImputer(strategy='most_frequent')  # You can change to 'median', 'most_frequent', etc.
# discounted_df = pd.DataFrame(imputer.fit_transform(complete_df), columns=complete_df.columns)

print("Dataframe shape after dropping NaN columns:", discounted_df.shape)

# Step 1: Separate features and target (if applicable)
# Exclude non-numeric columns if present
filtered_df = discounted_df.select_dtypes(include=[np.number])
# columns_to_keep = [col for col in numeric_df.columns if 'RH' not in col]
# filtered_df = filtered_df[columns_to_keep]

print(f'Non-numeric columns removed from dataframe\nCleaned dataframe size: {filtered_df.shape}')
filtered_df

In [None]:
# Step 2, OPTION 1: Standardize the data using StandardScalar (for demo only)
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaled_data = scaler.fit_transform(filtered_df)
# print(f'Data scaled\nScaled data size (ndarray): {scaled_data.shape}')

scaled_df = pd.DataFrame(scaled_data, columns=filtered_df.columns, index=filtered_df.index)
print("Made new scaled dataframe")
scaled_df
scaled_df.describe()

# Uncomment the two lines below to write scaled_df to parquet
scaled_df.to_parquet("input_standard_scaled.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')

In [None]:
# OPTION 2: Standardize the data using RobustScalar (this is better for outlier detection)
from sklearn.preprocessing import RobustScaler

# Assume `filtered_df` is your cleaned and filtered dataframe (all numeric columns)
robust_scaler = RobustScaler()
print("Instantiated RobustScalar")

# Fit the scaler on the dataframe and transform it
scaled_array = robust_scaler.fit_transform(filtered_df)
print("Fitted RobustScalar")

# (Optional) Create a new DataFrame with scaled values
scaled_df = pd.DataFrame(scaled_array, columns=filtered_df.columns, index=filtered_df.index)
print("Made new scaled dataframe")
scaled_df
scaled_df.describe()

# Uncomment the two lines below to write scaled_df to parquet
scaled_df.to_parquet("input_scaled.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')