# Preprocessing
We will extract all relevant datasets from all HDF5 files and convert them into a readable Pandas dataframe. We will also do some preliminary data cleaning and PCA analysis. Everything will be outputted as *.parquet* files. This dramatically reduces the memory usage in the outlier detector scripts.

## Note to self
Fix Beam, Channel, etc. showing NaN when one-hot encoded. These & RH are not loading correctly!

In [1]:
import os
import random

import h5py
import numpy as np
import pandas as pd
import geopandas as gp
# import geoviews as gv
# from geoviews import opts, tile_sources as gvts
# import holoviews as hv
# gv.extension('bokeh', 'matplotlib')
from shapely.geometry import Point
import warnings
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 

import matplotlib
import matplotlib.pyplot as plt
# import PyQt6

In [2]:
inDir = os.getcwd() + "/Input_files"
print(inDir)
input_file_names = [g for g in os.listdir(inDir) if g.startswith('GEDI02_A') and g.endswith('.h5')]  # List all GEDI level 2 files in inDir
# input_file_names = ['GEDI02_A_2019115222046_O02082_02_T00620_02_003_01_V002.h5', 'GEDI02_A_2020179044014_O08725_01_T01436_02_003_01_V002.h5']
input_file_names

/oscar/scratch/jzhu118/GEDI_Outlier_Detection_OSCAR/Input_files


['GEDI02_A_2021050140102_O12405_02_T10912_02_003_02_V002.h5',
 'GEDI02_A_2021086153349_O12964_03_T08275_02_003_02_V002.h5']

### Loading files with all information into a huge Pandas dataframe

#### Preprocessing: Get files and sort by beam

In [3]:
input_files = []
files_to_beams = dict()
for n in input_file_names:
    file_path = os.path.join(inDir, n)  # Select an example file
    file = h5py.File(file_path, 'r')
    input_files.append(file)
    
    print('Loading file: ' + n)
    print('The file contains the following groups: ' + str(list(file.keys())))
    
    print("The file's metadata contains the following attributes: ")
    for g in file['METADATA']['DatasetIdentification'].attrs: print(g)
    
    beamNames = [g for g in file.keys() if g.startswith('BEAM')]
    files_to_beams[file] = beamNames
    
    print("The file contains the following beams: ")
    for b in beamNames:
        print(f"{b} is a {file[b].attrs['description']}")

Loading file: GEDI02_A_2021050140102_O12405_02_T10912_02_003_02_V002.h5
The file contains the following groups: ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']
The file's metadata contains the following attributes: 
PGEVersion
VersionID
abstract
characterSet
creationDate
credit
fileName
language
originatorOrganizationName
purpose
shortName
spatialRepresentationType
status
topicCategory
uuid
The file contains the following beams: 
BEAM0000 is a Coverage beam
BEAM0001 is a Coverage beam
BEAM0010 is a Coverage beam
BEAM0011 is a Coverage beam
BEAM0101 is a Full power beam
BEAM0110 is a Full power beam
BEAM1000 is a Full power beam
BEAM1011 is a Full power beam
Loading file: GEDI02_A_2021086153349_O12964_03_T08275_02_003_02_V002.h5
The file contains the following groups: ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']
The file's metadata contains the following attribute

In [4]:
def list_1d_features(file_path, shot_number_key='shot_number'):
    """
    Returns a sorted list of base feature names from the HDF5 file where each dataset:
      - is 1-dimensional and
      - has length equal to the number of shots determined from the shot_number dataset.
    
    Parameters:
      file_path (str): Path to the HDF5 file.
      shot_number_key (str): Suffix used to locate the shot number dataset.
      
    Returns:
      List[str]: Sorted list of feature base names.
    """
    feature_names = set()
    
    with h5py.File(file_path, 'r') as f:
        # First, locate the shot_number dataset to determine the shot count.
        shot_num_ds = None
        
        def find_shot_ds(name, node):
            nonlocal shot_num_ds
            if isinstance(node, h5py.Dataset) and name.endswith(shot_number_key):
                shot_num_ds = node
        
        f.visititems(find_shot_ds)
        
        if shot_num_ds is None:
            print(f"Error: Could not find a dataset ending with '{shot_number_key}'.")
            return []
        
        shot_count = shot_num_ds.shape[0]
        print(f"Found shot_number dataset with {shot_count} shots.")
        
        # Now, visit every dataset and check if it is 1D with length equal to shot_count.
        def visitor_func(name, node):
            if isinstance(node, h5py.Dataset):
                shape = node.shape
                if len(shape) == 1 and shape[0] == shot_count:
                    # Extract the base name (the part after the last '/')
                    base_name = name.split('/')[-1]
                    feature_names.add(base_name)
        
        f.visititems(visitor_func)
    
    return sorted(feature_names)

# Example usage:
for f in input_file_names:
    file_path = file_path = os.path.join(inDir, f)
    features = list_1d_features(file_path)
    print("Features of shape (# shots):")
    for feat in features:
        print(f"  {feat}")

Found shot_number dataset with 101512 shots.
Features of shape (# shots):
  back_threshold
  beam
  botloc
  botloc_amp
  channel
  degrade_flag
  delta_time
  digital_elevation_model
  digital_elevation_model_srtm
  elev_highestreturn
  elev_highestreturn_a1
  elev_highestreturn_a2
  elev_highestreturn_a3
  elev_highestreturn_a4
  elev_highestreturn_a5
  elev_highestreturn_a6
  elev_lowestmode
  elev_lowestmode_a1
  elev_lowestmode_a2
  elev_lowestmode_a3
  elev_lowestmode_a4
  elev_lowestmode_a5
  elev_lowestmode_a6
  elev_lowestreturn_a1
  elev_lowestreturn_a2
  elev_lowestreturn_a3
  elev_lowestreturn_a4
  elev_lowestreturn_a5
  elev_lowestreturn_a6
  elevation_1gfit
  elevation_bias_flag
  elevation_bin0_error
  energy_lowestmode_a1
  energy_lowestmode_a2
  energy_lowestmode_a3
  energy_lowestmode_a4
  energy_lowestmode_a5
  energy_lowestmode_a6
  energy_sm
  energy_total
  front_threshold
  landsat_treecover
  landsat_water_persistence
  lastmodeenergy
  lat_highestreturn
  lat_h

In [5]:
def list_features_by_shot_count(file_path, shot_number_key='shot_number'):
    """
    Returns a list of dataset names from the given HDF5 file that have shapes matching:
      - (# shots,) or
      - (# shots, n) or (n, # shots)
    
    Parameters:
      file_path (str): Path to the HDF5 file.
      shot_number_key (str): Key name (or suffix) used to identify the shot number dataset.
    
    Returns:
      List[str]: List of dataset names that meet the criteria.
    """
    matching_features = []
    
    with h5py.File(file_path, 'r') as f:
        # First, locate the shot_number dataset to determine the number of shots.
        shot_num_ds = None
        def find_shot_ds(name, node):
            nonlocal shot_num_ds
            if isinstance(node, h5py.Dataset) and name.endswith(shot_number_key):
                shot_num_ds = node
        f.visititems(find_shot_ds)
        
        if shot_num_ds is None:
            print(f"Error: Could not find a dataset ending with '{shot_number_key}'.")
            return matching_features
        
        shot_count = shot_num_ds.shape[0]
        print(f"Found shot_number dataset with {shot_count} shots.")

        # Now, traverse all datasets and check their shapes.
        def visitor_func(name, node):
            if isinstance(node, h5py.Dataset):
                shape = node.shape
                # Check for 1D dataset with length equal to shot_count.
                if len(shape) == 1 and shape[0] == shot_count:
                    matching_features.append(name)
                # Check for 2D dataset where one of the dimensions equals shot_count.
                elif len(shape) == 2 and (shape[0] == shot_count or shape[1] == shot_count):
                    matching_features.append(name)
        
        f.visititems(visitor_func)
    
    return matching_features

# Example usage:
for f in input_file_names:
    file_path = file_path = os.path.join(inDir, f)
    features = list_features_by_shot_count(file_path)
    print("Features matching the shot count criteria:")
    for feature in features:
        print(f"  {feature}")

Found shot_number dataset with 101512 shots.
Features matching the shot count criteria:
  BEAM1011/beam
  BEAM1011/channel
  BEAM1011/degrade_flag
  BEAM1011/delta_time
  BEAM1011/digital_elevation_model
  BEAM1011/digital_elevation_model_srtm
  BEAM1011/elev_highestreturn
  BEAM1011/elev_lowestmode
  BEAM1011/elevation_bias_flag
  BEAM1011/elevation_bin0_error
  BEAM1011/energy_total
  BEAM1011/geolocation/elev_highestreturn_a1
  BEAM1011/geolocation/elev_highestreturn_a2
  BEAM1011/geolocation/elev_highestreturn_a3
  BEAM1011/geolocation/elev_highestreturn_a4
  BEAM1011/geolocation/elev_highestreturn_a5
  BEAM1011/geolocation/elev_highestreturn_a6
  BEAM1011/geolocation/elev_lowestmode_a1
  BEAM1011/geolocation/elev_lowestmode_a2
  BEAM1011/geolocation/elev_lowestmode_a3
  BEAM1011/geolocation/elev_lowestmode_a4
  BEAM1011/geolocation/elev_lowestmode_a5
  BEAM1011/geolocation/elev_lowestmode_a6
  BEAM1011/geolocation/elev_lowestreturn_a1
  BEAM1011/geolocation/elev_lowestreturn_a2
  

#### Helper functions for the main loop

In [6]:
def collect_all_datasets(file_path):
    dataset_paths = []
    
    def visitor_func(name, node):
        if isinstance(node, h5py.Dataset):
            dataset_paths.append(name)
    
    with h5py.File(file_path, 'r') as f:
        f.visititems(visitor_func)
    
    return dataset_paths

def get_dataset_by_name(h5_file, beam_ds, name_suffix):
    candidates = [g for g in beam_ds if g.endswith(f"/{name_suffix}")]
    if not candidates:
        return None
    return h5_file[candidates[0]][()]

def get_dataset_by_name(h5_file, beam_ds, name):
    """
    Given an open HDF5 file object (h5_file), a list of dataset paths (beam_ds),
    and a desired dataset suffix (e.g., 'digital_elevation_model'),
    returns the data from the first matching path that ends with '/{name}'.

    Returns None if no match is found.
    """
    candidates = [g for g in beam_ds if g.endswith(f'/{name}')]
    if not candidates:
        print(f"Warning: No dataset ending with '/{name}' found in beam_ds.")
        return None
    dataset_path = candidates[0]
    return h5_file[dataset_path][()]  # Read the data

def extract_rh_indices(rh_data, indices=[25, 50, 75, 85, 95, 100]):
    """
    Given an array of shape (N, something) from the 'rh' dataset,
    return a dictionary of { 'RH_25': [...], 'RH_50': [...], ... }.
    """
    # Defensive check
    if rh_data is None:
        # Could return an empty dict if the dataset wasn't found
        return {}
    
    result = {}
    for i in indices:
        col_name = f"RH_{i}"
        # Build a list of rh_data[row][i] for each row
        result[col_name] = [row[i] for row in rh_data]
    return result

# def humanize_attribute(attr):
#     """
#     Convert a raw attribute name like 'degrade_flag'
#     into a readable name like 'Degrade Flag'.
#     Customize as you wish.
#     """
#     # Split on underscores, capitalize each part, then rejoin with a space
#     parts = attr.split('_')
#     capitalized_parts = [p.capitalize() for p in parts]
#     return ' '.join(capitalized_parts)

def load_attributes_to_dict(txt_file_path):
    """
    Reads attribute names from a .txt file (one per line),
    and returns a dictionary of the form:
        { "Degrade Flag": "degrade_flag",
          "Delta Time": "delta_time",
          ...
        }
    Skips blank lines or lines that are just whitespace.
    """
    with open(txt_file_path, 'r') as f:
        # Read and strip lines
        lines = [line.strip() for line in f if line.strip()]

    # Build the dictionary
    attr_dict = {}
    for raw_attr in lines:
        # Convert raw attribute to a nicer display key
        # key = humanize_attribute(raw_attr)
        
        key = raw_attr # No humanization makes it easier, lol
        # Store in the dictionary
        attr_dict[key] = raw_attr

    return attr_dict


In [7]:
# This dictionary maps "human-friendly column name" -> "dataset suffix in the file"
txt_path = "L2A_features.txt"  # Name of attributes file
ATTRIBUTES_TO_LOAD = load_attributes_to_dict(txt_path)

print("Loaded attributes:", ATTRIBUTES_TO_LOAD)

Loaded attributes: {'back_threshold': 'back_threshold', 'beam': 'beam', 'botloc': 'botloc', 'botloc_amp': 'botloc_amp', 'channel': 'channel', 'degrade_flag': 'degrade_flag', 'delta_time': 'delta_time', 'digital_elevation_model': 'digital_elevation_model', 'digital_elevation_model_srtm': 'digital_elevation_model_srtm', 'elev_highestreturn': 'elev_highestreturn', 'elev_highestreturn_a1': 'elev_highestreturn_a1', 'elev_highestreturn_a2': 'elev_highestreturn_a2', 'elev_highestreturn_a3': 'elev_highestreturn_a3', 'elev_highestreturn_a4': 'elev_highestreturn_a4', 'elev_highestreturn_a5': 'elev_highestreturn_a5', 'elev_highestreturn_a6': 'elev_highestreturn_a6', 'elev_lowestmode': 'elev_lowestmode', 'elev_lowestmode_a1': 'elev_lowestmode_a1', 'elev_lowestmode_a2': 'elev_lowestmode_a2', 'elev_lowestmode_a3': 'elev_lowestmode_a3', 'elev_lowestmode_a4': 'elev_lowestmode_a4', 'elev_lowestmode_a5': 'elev_lowestmode_a5', 'elev_lowestmode_a6': 'elev_lowestmode_a6', 'elev_lowestreturn_a1': 'elev_lowe

#### Main loop, collects all target features and puts them inside a Pandas dataframe.
WARNING: Despite my best attempts at optimization, this is *extremely* memory intensive even for one 5GB HDF5 file. Only run on a supercomputer.

In [23]:
dataframes = []

for f in input_files:
    print(f'Processing file {f.filename}')

    all_ds_for_file = collect_all_datasets(f.filename)

    # For each beam in the file
    for b in files_to_beams[f]:
        print(f'Processing beam {b}')
        
        # Gather all dataset paths for this beam
        beam_ds = [ds for ds in all_ds_for_file if b in ds]
        
        # 1) Retrieve the 'rh' data if it exists
        rh_data = get_dataset_by_name(f, beam_ds, 'rh')  # e.g. shape (N, 101)
        # Extract specific RH columns
        rh_dict = extract_rh_indices(rh_data, indices=[25, 50, 75, 85, 95, 100])
        print("Extracted RH data successfully.")

        # 2) Retrieve shot number (often used to define how many rows we have)
        shotNums = get_dataset_by_name(f, beam_ds, 'shot_number')
        if shotNums is None:
            print("Warning: No 'shot_number' found. Skipping this beam.")
            continue
        shotNums = np.array(shotNums).astype(str)
        # We'll build a partial DataFrame row-by-row
        row_count = len(shotNums)
        print(f'Row count: {row_count}')
        
        # 3) Build some "constant" columns: file name, beam name
        file_name = [f.filename.split('\\')[-1]] * row_count
        beam_name = [b] * row_count
        
        # 4) Retrieve other attributes (digital_elevation_model, degrade_flag, etc.)
        # We'll store them in a dictionary from column -> values
        attr_data = {}
        for col_label, ds_suffix in ATTRIBUTES_TO_LOAD.items():
            data_array = get_dataset_by_name(f, beam_ds, ds_suffix)
            if data_array is None:
                # Handle missing dataset
                attr_data[col_label] = [None] * row_count
                print(f'Missing dataset detected!: {ds_suffix}')
            else:
                # Verify length matches row_count (if needed)
                if len(data_array) != row_count:
                    print(f"Warning: length mismatch in {col_label} for beam {b}.")
                attr_data[col_label] = data_array
                # print(f'Dataset {ds_suffix} successfully loaded.')
        
        # 5) Construct the DataFrame
        df = pd.DataFrame({
            'File Name': file_name,
            'Beam Name': beam_name,
            'Shot Number': shotNums,
            # Insert the extracted rh columns
            **rh_dict,
            # Insert the other attributes
            **attr_data
        })

        # 6) One-hot encode the Beam and Channel identifiers.
        #    We assume that 'Beam Name' and 'Channel' (case-sensitive) are present.
        #    Using pd.get_dummies is efficient given the low number of unique categories.

        df['beam'] = df['beam'].astype(str)
        df['channel'] = df['channel'].astype(str)

        print("Beam Name values:")
        print(df['beam'].value_counts(dropna=False))

        print("Channel values:")
        print(df['channel'].value_counts(dropna=False))


        print(f'Unique beam identifiers: {df['beam'].unique()}')
        print(f'Unique channel identifiers: {df['channel'].unique()}')

        print(f'Null values for beam: {df['beam'].isnull().sum()}')
        print(f'Null values for channel: {df['channel'].isnull().sum()}')
        
        dataframes.append(df)
        print('Made a Pandas dataframe!')

# Finally, concatenate all partial DataFrames
print("Concat dataframe...")
complete_df = pd.concat(dataframes, ignore_index=True)

if 'beam' in complete_df.columns:
    complete_df = pd.get_dummies(complete_df, columns=['beam'], prefix='beam')
    print('Encoded beam name')
if 'channel' in complete_df.columns:
    complete_df = pd.get_dummies(complete_df, columns=['channel'], prefix='channel')
    print('Encoded channel')

print("Done! Final DataFrame shape:", complete_df.shape)
complete_df

Processing file /oscar/scratch/jzhu118/GEDI_Outlier_Detection_OSCAR/Input_files/GEDI02_A_2021050140102_O12405_02_T10912_02_003_02_V002.h5
Processing beam BEAM0000
Extracted RH data successfully.
Row count: 102767
Beam Name values:
beam
0    102767
Name: count, dtype: int64
Channel values:
channel
0    102767
Name: count, dtype: int64
Unique beam identifiers: ['0']
Unique channel identifiers: ['0']
Null values for beam: 0
Null values for channel: 0
Made a Pandas dataframe with one-hot encoded Beam Name & Channel!
Processing beam BEAM0001
Extracted RH data successfully.
Row count: 101721
Beam Name values:
beam
1    101721
Name: count, dtype: int64
Channel values:
channel
0    101721
Name: count, dtype: int64
Unique beam identifiers: ['1']
Unique channel identifiers: ['0']
Null values for beam: 0
Null values for channel: 0
Made a Pandas dataframe with one-hot encoded Beam Name & Channel!
Processing beam BEAM0010
Extracted RH data successfully.
Row count: 102760
Beam Name values:
beam
2   

Unnamed: 0,File Name,Beam Name,Shot Number,RH_25,RH_50,RH_75,RH_85,RH_95,RH_100,back_threshold,...,beam_3,beam_5,beam_6,beam_8,channel_0,channel_1,channel_2,channel_3,channel_4,channel_5
0,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,124050000200021325,0.0,0.0,0.0,0.0,0.0,0.0,263.190308,...,False,False,False,False,True,False,False,False,False,False
1,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,124050000200021326,0.0,0.0,0.0,0.0,0.0,0.0,262.640930,...,False,False,False,False,True,False,False,False,False,False
2,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,124050000200021327,0.0,0.0,0.0,0.0,0.0,0.0,263.722839,...,False,False,False,False,True,False,False,False,False,False
3,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,124050000200021328,0.0,0.0,0.0,0.0,0.0,0.0,262.772583,...,False,False,False,False,True,False,False,False,False,False
4,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,124050000200021329,0.0,0.0,0.0,0.0,0.0,0.0,262.490845,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2163371,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,129641100300406988,0.0,0.0,0.0,0.0,0.0,0.0,241.886826,...,False,False,False,False,False,False,False,False,False,True
2163372,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,129641100300406989,0.0,0.0,0.0,0.0,0.0,0.0,241.160187,...,False,False,False,False,False,False,False,False,False,True
2163373,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,129641100300406990,0.0,0.0,0.0,0.0,0.0,0.0,240.570206,...,False,False,False,False,False,False,False,False,False,True
2163374,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,129641100300406991,0.0,0.0,0.0,0.0,0.0,0.0,241.826447,...,False,False,False,False,False,False,False,False,False,True


#### Feature engineering
1. Adding Z-Scores for five most relevant RH metrics
2. Creating RH50 / RH100 ratio
3. Adding RH95 - RH50
4. Adding "Missingness" - the number of NaNs in each row

In [None]:
num_zeros = (complete_df['RH_100'] == 0).sum()
print(f"Number of zeros in RH_100: {num_zeros}")

In [None]:
'''Add Z-Scores of the five RH metrics'''
rh_nums = [25, 50, 75, 85, 95, 100]
for i in rh_nums:
    col_name = f'RH_{i}'
    complete_df[f'{col_name} Z Score'] = (complete_df[col_name] - complete_df[col_name].mean()) / complete_df[col_name].std()

complete_df

In [None]:
'''Adding the RH_50 / RH_100 feature'''
complete_df['RH_50_v_100'] = complete_df['RH_50'] / complete_df['RH_100']
complete_df

In [None]:
'''Adding the (RH95 - RH50) feature'''
rh_50 = complete_df['RH_50']
rh_95 = complete_df['RH_95']
complete_df['RH_95_minus_50'] = (rh_95 - rh_50)
complete_df

In [None]:
'''Adding the 'Missingness' feature'''
# Count the number of NaNs in each row
complete_df['Missingness'] = complete_df.isna().sum(axis=1)


# Optionally, inspect how many rows have missing data
print('Number of NaNs:')
print(complete_df['Missingness'].value_counts())
complete_df

In [24]:
'''Optional: Save to a parquet file'''
# Get a list of column names
columns = complete_df.columns

# Find duplicates
duplicate_columns = columns[columns.duplicated()]
print("Duplicate columns:", duplicate_columns)
print('There are ' + str(columns.duplicated().sum()) + ' duplicated columns.')
print('NaN value count:\n')
print(complete_df.isnull().sum())
# complete_df.describe()

# Uncomment the two lines below to write to parquet
complete_df.to_parquet("L2A_raw.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')

Duplicate columns: Index([], dtype='object')
There are 0 duplicated columns.
NaN value count:

File Name      0
Beam Name      0
Shot Number    0
RH_25          0
RH_50          0
              ..
channel_1      0
channel_2      0
channel_3      0
channel_4      0
channel_5      0
Length: 193, dtype: int64
Successfully wrote to a .parquet file!


#### This is the filtering step for PCA if there are any NaN rows in the dataframe
Note: there should not be mnany NaN values if you accounted for the different shapes of the data inside the HDF5 files.

In [3]:
'''Optional: load complete_df from .parquet file'''
complete_df = pd.read_parquet('input_raw.parquet', engine='pyarrow')
complete_df

Unnamed: 0,File Name,Beam Name,Shot Number,RH_25,RH_50,RH_75,RH_85,RH_95,RH_100,channel,...,zcross_localenergy,RH_25 Z Score,RH_50 Z Score,RH_75 Z Score,RH_85 Z Score,RH_95 Z Score,RH_100 Z Score,RH_50_v_100,RH_95_minus_50,Missingness
0,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050304,-0.78,0.00,0.78,1.19,1.79,2.36,0,...,163.479019,-0.267243,-0.224966,-0.212232,-0.213468,-0.214492,-0.240572,0.000000,1.79,0
1,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050305,-0.97,-0.07,0.78,1.23,1.87,2.54,0,...,178.583328,-0.352941,-0.243600,-0.212232,-0.206578,-0.202634,-0.216597,-0.027559,1.94,0
2,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050306,-0.86,-0.07,0.71,1.12,1.72,2.32,0,...,203.563889,-0.303326,-0.243600,-0.225654,-0.225526,-0.224867,-0.245900,-0.030172,1.79,0
3,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050307,-0.86,-0.07,0.74,1.16,1.79,2.47,0,...,221.641510,-0.303326,-0.243600,-0.219902,-0.218636,-0.214492,-0.225921,-0.028340,1.86,0
4,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050308,-0.89,-0.03,0.78,1.23,1.94,2.73,0,...,268.898804,-0.316857,-0.232952,-0.212232,-0.206578,-0.192258,-0.191289,-0.010989,1.97,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11607182,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,224961100200293934,-1.23,-0.18,0.74,1.23,2.02,3.70,5,...,2616.521729,-0.470212,-0.272881,-0.219902,-0.206578,-0.180400,-0.062087,-0.048649,2.20,0
11607183,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,224961100200293935,-1.23,-0.22,0.71,1.16,1.94,3.70,5,...,2879.136963,-0.470212,-0.283529,-0.225654,-0.218636,-0.192258,-0.062087,-0.059459,2.16,0
11607184,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,224961100200293936,-1.27,-0.22,0.74,1.23,2.02,3.85,5,...,2619.544189,-0.488253,-0.283529,-0.219902,-0.206578,-0.180400,-0.042108,-0.057143,2.24,0
11607185,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,224961100200293937,-1.16,-0.14,0.78,1.27,2.05,3.81,5,...,2381.172363,-0.438639,-0.262233,-0.212232,-0.199688,-0.175953,-0.047435,-0.036745,2.19,0


In [4]:
# from sklearn.impute import SimpleImputer

print(f'Original dataframe shape: {complete_df.shape}')

'''Filtering'''
complete_df.drop('Shot Number', axis=1, inplace=True)
print("Shot numbers dropped")
discounted_df = complete_df.dropna(axis=1)


# imputer = SimpleImputer(strategy='most_frequent')  # You can change to 'median', 'most_frequent', etc.
# discounted_df = pd.DataFrame(imputer.fit_transform(complete_df), columns=complete_df.columns)

print("Dataframe shape after dropping NaN columns:", discounted_df.shape)

# Step 1: Separate features and target (if applicable)
# Exclude non-numeric columns if present
filtered_df = discounted_df.select_dtypes(include=[np.number])
# columns_to_keep = [col for col in numeric_df.columns if 'RH' not in col]
# filtered_df = filtered_df[columns_to_keep]

print(f'Non-numeric columns removed from dataframe\nCleaned dataframe size: {filtered_df.shape}')
filtered_df

Original dataframe shape: (11607187, 97)
Shot numbers dropped
Dataframe shape after dropping NaN columns: (11607187, 87)
Non-numeric columns removed from dataframe
Cleaned dataframe size: (11607187, 85)


Unnamed: 0,RH_25,RH_50,RH_75,RH_85,RH_95,RH_100,channel,degrade_flag,delta_time,digital_elevation_model,...,zcross_amp,zcross_localenergy,RH_25 Z Score,RH_50 Z Score,RH_75 Z Score,RH_85 Z Score,RH_95 Z Score,RH_100 Z Score,RH_95_minus_50,Missingness
0,-0.78,0.00,0.78,1.19,1.79,2.36,0,0,4.146742e+07,-999999.000000,...,296.142761,163.479019,-0.267243,-0.224966,-0.212232,-0.213468,-0.214492,-0.240572,1.79,0
1,-0.97,-0.07,0.78,1.23,1.87,2.54,0,0,4.146742e+07,-999999.000000,...,302.973358,178.583328,-0.352941,-0.243600,-0.212232,-0.206578,-0.202634,-0.216597,1.94,0
2,-0.86,-0.07,0.71,1.12,1.72,2.32,0,0,4.146742e+07,-999999.000000,...,302.514313,203.563889,-0.303326,-0.243600,-0.225654,-0.225526,-0.224867,-0.245900,1.79,0
3,-0.86,-0.07,0.74,1.16,1.79,2.47,0,0,4.146742e+07,-999999.000000,...,309.622925,221.641510,-0.303326,-0.243600,-0.219902,-0.218636,-0.214492,-0.225921,1.86,0
4,-0.89,-0.03,0.78,1.23,1.94,2.73,0,0,4.146742e+07,-999999.000000,...,320.783600,268.898804,-0.316857,-0.232952,-0.212232,-0.206578,-0.192258,-0.191289,1.97,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11607182,-1.23,-0.18,0.74,1.23,2.02,3.70,5,0,1.552156e+08,254.929123,...,1057.588623,2616.521729,-0.470212,-0.272881,-0.219902,-0.206578,-0.180400,-0.062087,2.20,0
11607183,-1.23,-0.22,0.71,1.16,1.94,3.70,5,0,1.552156e+08,255.024338,...,1098.721924,2879.136963,-0.470212,-0.283529,-0.225654,-0.218636,-0.192258,-0.062087,2.16,0
11607184,-1.27,-0.22,0.74,1.23,2.02,3.85,5,0,1.552156e+08,255.024338,...,1065.043213,2619.544189,-0.488253,-0.283529,-0.219902,-0.206578,-0.180400,-0.042108,2.24,0
11607185,-1.16,-0.14,0.78,1.27,2.05,3.81,5,0,1.552156e+08,253.656616,...,965.210388,2381.172363,-0.438639,-0.262233,-0.212232,-0.199688,-0.175953,-0.047435,2.19,0


In [6]:
# Step 2, OPTION 1: Standardize the data using StandardScalar (for demo only)
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaled_data = scaler.fit_transform(filtered_df)
# print(f'Data scaled\nScaled data size (ndarray): {scaled_data.shape}')

scaled_df = pd.DataFrame(scaled_data, columns=filtered_df.columns, index=filtered_df.index)
print("Made new scaled dataframe")
scaled_df
scaled_df.describe()

# Uncomment the two lines below to write scaled_df to parquet
scaled_df.to_parquet("input_standard_scaled.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')

Made new scaled dataframe
Successfully wrote to a .parquet file!


In [13]:
# OPTION 2: Standardize the data using RobustScalar (this is better for outlier detection)
from sklearn.preprocessing import RobustScaler

# Assume `filtered_df` is your cleaned and filtered dataframe (all numeric columns)
robust_scaler = RobustScaler()
print("Instantiated RobustScalar")

# Fit the scaler on the dataframe and transform it
scaled_array = robust_scaler.fit_transform(filtered_df)
print("Fitted RobustScalar")

# (Optional) Create a new DataFrame with scaled values
scaled_df = pd.DataFrame(scaled_array, columns=filtered_df.columns, index=filtered_df.index)
print("Made new scaled dataframe")
scaled_df
scaled_df.describe()

# Uncomment the two lines below to write scaled_df to parquet
scaled_df.to_parquet("input_scaled.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')

Instantiated RobustScalar
Fitted RobustScalar
Made new scaled dataframe
Successfully wrote to a .parquet file!
