# Matching shot_number between GEDI L2A & ALS
We will match shot numbers in the Level 2A GEDI data with the ALS crossover data.

In [1]:
import os
import random
import warnings

import h5py
import numpy as np
import pandas as pd
import geopandas as gp
# import geoviews as gv
# from geoviews import opts, tile_sources as gvts
# import holoviews as hv
# gv.extension('bokeh', 'matplotlib')
import matplotlib
import matplotlib.pyplot as plt
# import PyQt6
from shapely.errors import ShapelyDeprecationWarning
from shapely.geometry import Point
from sklearn.preprocessing import RobustScaler, StandardScaler

warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning)

In [2]:
# Folder holding the GEDI granules, resolved against the current working directory.
inDir = os.path.join(os.getcwd(), "Input_files")
print(inDir)

# Keep only GEDI Level 2A HDF5 granules. os.listdir() returns entries in
# arbitrary order, so sort them to make the processing order (and therefore
# the row order of the final dataframe) reproducible between runs.
input_file_names = sorted(
    name for name in os.listdir(inDir)
    if name.startswith('GEDI02_A') and name.endswith('.h5')
)
input_file_names

/oscar/scratch/jzhu118/GEDI_Outlier_Detection_OSCAR/Input_files


['GEDI02_A_2021050140102_O12405_02_T10912_02_003_02_V002.h5',
 'GEDI02_A_2021086153349_O12964_03_T08275_02_003_02_V002.h5']

### Loading files with all information into a huge Pandas dataframe

#### Preprocessing: Get files and sort by beam

In [3]:
# Open every granule read-only and record which beam groups it contains.
# The handles are intentionally left open: the main loop reads from
# `input_files` and looks beams up in `files_to_beams`, which is keyed by
# the open file object itself.
input_files = []
files_to_beams = {}
for n in input_file_names:
    file_path = os.path.join(inDir, n)
    file = h5py.File(file_path, 'r')
    input_files.append(file)

    print(f'Loading file: {n}')
    print(f'The file contains the following groups: {list(file.keys())}')

    print("The file's metadata contains the following attributes: ")
    for attr_name in file['METADATA']['DatasetIdentification'].attrs:
        print(attr_name)

    # Beam groups are the top-level entries whose name starts with 'BEAM'
    beamNames = [key for key in file.keys() if key.startswith('BEAM')]
    files_to_beams[file] = beamNames

    print("The file contains the following beams: ")
    for b in beamNames:
        print(f"{b} is a {file[b].attrs['description']}")

Loading file: GEDI02_A_2021050140102_O12405_02_T10912_02_003_02_V002.h5
The file contains the following groups: ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']
The file's metadata contains the following attributes: 
PGEVersion
VersionID
abstract
characterSet
creationDate
credit
fileName
language
originatorOrganizationName
purpose
shortName
spatialRepresentationType
status
topicCategory
uuid
The file contains the following beams: 
BEAM0000 is a Coverage beam
BEAM0001 is a Coverage beam
BEAM0010 is a Coverage beam
BEAM0011 is a Coverage beam
BEAM0101 is a Full power beam
BEAM0110 is a Full power beam
BEAM1000 is a Full power beam
BEAM1011 is a Full power beam
Loading file: GEDI02_A_2021086153349_O12964_03_T08275_02_003_02_V002.h5
The file contains the following groups: ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']
The file's metadata contains the following attribute

#### Helper functions for the main loop

In [4]:
# -----------------------------------------------------------------------------
# HELPER FUNCTIONS
# -----------------------------------------------------------------------------

def collect_all_datasets(file_path):
    """
    Walk the whole HDF5 file at `file_path` and gather the path of every dataset.

    The file is opened read-only for the duration of the walk and closed
    again before returning. Groups are traversed but not reported.
    Returns a list of dataset path strings.
    """
    found = []

    def _record_if_dataset(path, obj):
        # Only datasets are recorded; anything else (e.g. a group) is skipped.
        if not isinstance(obj, h5py.Dataset):
            return
        found.append(path)

    with h5py.File(file_path, 'r') as h5:
        h5.visititems(_record_if_dataset)

    return found

def get_dataset_by_name(h5_file, beam_ds, name):
    """
    Look up a dataset by its trailing path component and read it into memory.

    h5_file : open HDF5 file (or any mapping indexable by dataset path).
    beam_ds : list of candidate dataset paths to search, in priority order.
    name    : dataset name; a path matches when it ends with '/{name}'.

    Returns the contents of the first matching dataset (read via `[()]`).
    When nothing matches, a warning is printed and None is returned.
    """
    suffix = f'/{name}'
    for path in beam_ds:
        if path.endswith(suffix):
            # First hit wins; `[()]` pulls the full dataset into memory.
            return h5_file[path][()]

    print(f"Warning: No dataset ending with '/{name}' found.")
    return None

def extract_rh_indices(rh_data, indices=(25, 50, 75, 85, 95, 100)):
    """
    Pull selected columns out of the 2-D 'rh' dataset.

    rh_data : array-like with shape (N, X), or None.
    indices : column positions to extract. The default is an immutable tuple
              so the shared default can never be modified by accident.

    Returns a dictionary mapping keys such as 'RH_25' to the 1-D array of
    that column's values. A column index that cannot be read is reported
    with a warning and filled with NaN (length N). If rh_data is None,
    an empty dictionary is returned.
    """
    if rh_data is None:
        return {}

    # asarray does not duplicate an input that is already an ndarray; the rh
    # matrix can be very large, so a needless full copy is worth avoiding.
    # The returned columns are therefore views into rh_data in that case.
    rh_array = np.asarray(rh_data)
    result = {}
    for i in indices:
        col_name = f"RH_{i}"
        try:
            result[col_name] = rh_array[:, i]
        except IndexError:
            print(f"Warning: Index {i} is out of bounds for the rh dataset.")
            result[col_name] = np.full(rh_array.shape[0], np.nan)
    return result

def load_attributes_to_dict(txt_file_path):
    """
    Build an identity mapping from the attribute names listed in a text file.

    The file is expected to hold one attribute name per line. Surrounding
    whitespace is stripped and blank lines are ignored. Every name maps to
    itself, e.g.
      { "Channel": "Channel", "digital_elevation_model": "digital_elevation_model", ... }
    """
    with open(txt_file_path, 'r') as handle:
        stripped_names = (raw.strip() for raw in handle)
        # Keys keep the order in which the names appear in the file.
        return {attr: attr for attr in stripped_names if attr}

In [5]:
# -----------------------------------------------------------------------------
# LOAD ALS CROSSOVER DATA
# -----------------------------------------------------------------------------

als_csv_path = os.path.join('Input_files', 'GEDI_ALSCROSSOVERS.csv')

# Read the shot_number column as strings, then cast once more so that every
# entry (including any missing value) is a plain str before matching.
als_df = (
    pd.read_csv(als_csv_path, dtype={'shot_number': str})
    .assign(shot_number=lambda frame: frame['shot_number'].astype(str))
)

# Set of distinct ALS shot numbers used for membership tests later on
als_shot_set = set(als_df['shot_number'].unique())
print(f"Loaded ALS crossover data with {len(als_shot_set)} unique shot numbers.")

Loaded ALS crossover data with 76778 unique shot numbers.


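### THIS CELL IS FOR TESTING ONLY!
It overwrites `als_shot_set` with a random 10% sample of the GEDI shot numbers stored in `L2A_raw.parquet`. Skip it when matching against the real ALS crossover data.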

In [6]:
# TESTING ONLY: replace the ALS shot set with a random sample of GEDI shots.
# While this switch is on, the `als_shot_set` built from the ALS crossover CSV
# is overwritten, so every later cell matches against the random sample.
# Set USE_TEST_SAMPLE = False to keep the real ALS shot numbers.
USE_TEST_SAMPLE = True

if USE_TEST_SAMPLE:
    # Read the parquet file containing all GEDI waveform shots
    gedi_df = pd.read_parquet('L2A_raw.parquet')

    # Shot numbers are read from the 'Shot Number' column.
    # Randomly sample 10% of them, with a fixed random state for reproducibility.
    sample_fraction = 0.1
    random_state = 42

    als_shot_series = gedi_df['Shot Number'].sample(frac=sample_fraction, random_state=random_state)

    # Convert shot numbers to strings for consistent matching later on
    als_shot_set = set(als_shot_series.astype(str))

    print(f"Selected {len(als_shot_set)} shot numbers for als_shot_set.")
else:
    print(f"Test sampling disabled; keeping the existing als_shot_set ({len(als_shot_set)} shot numbers).")

Selected 216338 shot numbers for als_shot_set.


In [7]:
# -----------------------------------------------------------------------------
# LOAD ATTRIBUTE DICTIONARY (including Beam and Channel, among others)
# -----------------------------------------------------------------------------

txt_path = 'L2A_features.txt'

# Column label -> dataset name, one entry per non-blank line of the text file
ATTRIBUTES_TO_LOAD = load_attributes_to_dict(txt_path)
print(f"Loaded attributes: {ATTRIBUTES_TO_LOAD}")

Loaded attributes: {'back_threshold': 'back_threshold', 'beam': 'beam', 'botloc': 'botloc', 'botloc_amp': 'botloc_amp', 'channel': 'channel', 'degrade_flag': 'degrade_flag', 'delta_time': 'delta_time', 'digital_elevation_model': 'digital_elevation_model', 'digital_elevation_model_srtm': 'digital_elevation_model_srtm', 'elev_highestreturn': 'elev_highestreturn', 'elev_highestreturn_a1': 'elev_highestreturn_a1', 'elev_highestreturn_a2': 'elev_highestreturn_a2', 'elev_highestreturn_a3': 'elev_highestreturn_a3', 'elev_highestreturn_a4': 'elev_highestreturn_a4', 'elev_highestreturn_a5': 'elev_highestreturn_a5', 'elev_highestreturn_a6': 'elev_highestreturn_a6', 'elev_lowestmode': 'elev_lowestmode', 'elev_lowestmode_a1': 'elev_lowestmode_a1', 'elev_lowestmode_a2': 'elev_lowestmode_a2', 'elev_lowestmode_a3': 'elev_lowestmode_a3', 'elev_lowestmode_a4': 'elev_lowestmode_a4', 'elev_lowestmode_a5': 'elev_lowestmode_a5', 'elev_lowestmode_a6': 'elev_lowestmode_a6', 'elev_lowestreturn_a1': 'elev_lowe

#### Main loop, collects all target features and puts them inside a Pandas dataframe.
WARNING: Despite my best attempts at optimization, this is *extremely* memory intensive even for one 5GB HDF5 file. Only run on a supercomputer.

In [8]:
# -----------------------------------------------------------------------------
# PROCESS GEDI LEVEL 2A FILES & MATCH SHOT NUMBERS
# -----------------------------------------------------------------------------

dataframes = []

# np.isin needs an array-like; build it once here instead of converting the
# set to a list again for every beam of every file.
als_shot_array = np.array(list(als_shot_set), dtype=str)

for f in input_files:
    print(f"\nProcessing file: {f.filename}")

    # Collect all dataset paths in the current file
    all_ds_for_file = collect_all_datasets(f.filename)

    # Process each beam in the current file
    for b in files_to_beams[f]:
        print(f"  Processing beam: {b}")

        # Keep only the datasets that live under this beam's group. Matching on
        # the '<beam>/' prefix is stricter than a substring test, which would
        # also accept any path that merely contains the beam name somewhere.
        beam_prefix = f"{b}/"
        beam_ds = [ds for ds in all_ds_for_file if ds.startswith(beam_prefix)]

        # 1) Retrieve shot numbers for the beam and convert to string for matching
        shotNums = get_dataset_by_name(f, beam_ds, 'shot_number')
        if shotNums is None:
            print(f"    Warning: No 'shot_number' dataset found for beam {b}. Skipping beam.")
            continue
        shotNums = np.array(shotNums).astype(str)
        n_shots = shotNums.shape[0]

        # 2) Identify indices where GEDI shot numbers match the ALS crossover shot numbers
        matching_indices = np.flatnonzero(np.isin(shotNums, als_shot_array))
        if matching_indices.size == 0:
            print(f"    No matching shot numbers found for beam {b}.")
            continue
        row_count = matching_indices.size
        print(f"    Found {row_count} matching shot(s).")

        # Build constant columns: File Name and Beam Name
        file_name_col = [os.path.basename(f.filename)] * row_count
        beam_name_col = [b] * row_count

        # 3) Extract all other attributes as specified in the txt file
        attr_data = {}
        for col_label, ds_suffix in ATTRIBUTES_TO_LOAD.items():
            data_array = get_dataset_by_name(f, beam_ds, ds_suffix)
            if data_array is None:
                print(f"    Warning: Missing dataset for {ds_suffix}. Filling with None.")
                attr_data[col_label] = np.full(row_count, None, dtype=object)
                continue

            data_array = np.asarray(data_array)
            # A dataset whose first dimension is not one-entry-per-shot cannot be
            # indexed with the shot positions: doing so would either raise an
            # IndexError or silently pick unrelated rows. Treat it as missing.
            if data_array.ndim == 0 or data_array.shape[0] != n_shots:
                print(f"    Warning: Row count mismatch for {col_label} in beam {b}. Filling with None.")
                attr_data[col_label] = np.full(row_count, None, dtype=object)
                continue

            attr_data[col_label] = data_array[matching_indices]

        # 4) Extract RH indices and filter by matching indices
        rh_data = get_dataset_by_name(f, beam_ds, 'rh')
        rh_dict = {}
        for key, arr in extract_rh_indices(rh_data).items():
            arr = np.asarray(arr)
            if arr.shape[0] != n_shots:
                # Same reasoning as for the attributes: never index a misaligned array.
                print(f"    Warning: Row count mismatch for {key} in beam {b}. Filling with NaN.")
                rh_dict[key] = np.full(row_count, np.nan)
                continue
            rh_dict[key] = arr[matching_indices]

        # 5) Filter the shot numbers to only the matching ones
        shotNums_filtered = shotNums[matching_indices]

        # 6) Construct the DataFrame for the current beam
        df = pd.DataFrame({
            'File Name': file_name_col,
            'Beam Name': beam_name_col,
            'Shot Number': shotNums_filtered,
            **rh_dict,
            **attr_data
        })

        dataframes.append(df)
        print(f"    Processed beam {b} with {row_count} matching shots.")

print('ALL DONE!')


Processing file: /oscar/scratch/jzhu118/GEDI_Outlier_Detection_OSCAR/Input_files/GEDI02_A_2021050140102_O12405_02_T10912_02_003_02_V002.h5
  Processing beam: BEAM0000
    Found 10220 matching shot(s).
    Processed beam BEAM0000 with 10220 matching shots.
  Processing beam: BEAM0001
    Found 10103 matching shot(s).
    Processed beam BEAM0001 with 10103 matching shots.
  Processing beam: BEAM0010
    Found 10388 matching shot(s).
    Processed beam BEAM0010 with 10388 matching shots.
  Processing beam: BEAM0011
    Found 10142 matching shot(s).
    Processed beam BEAM0011 with 10142 matching shots.
  Processing beam: BEAM0101
    Found 10134 matching shot(s).
    Processed beam BEAM0101 with 10134 matching shots.
  Processing beam: BEAM0110
    Found 11134 matching shot(s).
    Processed beam BEAM0110 with 11134 matching shots.
  Processing beam: BEAM1000
    Found 10320 matching shot(s).
    Processed beam BEAM1000 with 10320 matching shots.
  Processing beam: BEAM1011
    Found 100

In [9]:
# -----------------------------------------------------------------------------
# FINALIZE: Concatenate all partial DataFrames into one final DataFrame
# -----------------------------------------------------------------------------

if dataframes:
    complete_df = pd.concat(dataframes, ignore_index=True)

    # One-hot encode the categorical identifiers. The dtype is pinned to an
    # integer type because pandas >= 2.0 produces bool dummies by default, and
    # bool columns are excluded by the select_dtypes(include=[np.number])
    # filter applied before scaling, which would silently drop these features.
    if 'beam' in complete_df.columns:
        complete_df = pd.get_dummies(complete_df, columns=['beam'], prefix='beam', dtype=np.uint8)
        print('Encoded beam name')
    if 'channel' in complete_df.columns:
        complete_df = pd.get_dummies(complete_df, columns=['channel'], prefix='channel', dtype=np.uint8)
        print('Encoded channel')

    print(f"\nFinal DataFrame shape: {complete_df.shape}")
else:
    # Nothing matched in any beam: fall back to an empty frame so the name exists.
    complete_df = pd.DataFrame()
    print("\nNo matching data found in any beams.")

# Optionally, save the complete DataFrame for further processing:
# complete_df.to_parquet("ALS_and_L2A.parquet", engine="pyarrow", compression="snappy")
# print('Successfully wrote to a .parquet file!')

Encoded beam name
Encoded channel

Final DataFrame shape: (216338, 193)


#### Feature engineering
1. Adding Z-scores for the six extracted RH metrics (RH25, RH50, RH75, RH85, RH95, RH100)
2. Creating RH50 / RH100 ratio
3. Adding RH95 - RH50
4. Adding "Missingness" - the number of NaNs in each row

In [None]:
# Count the rows whose RH_100 value is exactly zero
num_zeros = complete_df['RH_100'].eq(0).sum()
print(f"Number of zeros in RH_100: {num_zeros}")

In [None]:
# Add a z-score column ('RH_<n> Z Score') for each of the six RH metrics below.
# Each column is standardised with its own mean and standard deviation.
rh_nums = [25, 50, 75, 85, 95, 100]
for i in rh_nums:
    col_name = f'RH_{i}'
    rh_values = complete_df[col_name]
    centered = rh_values - rh_values.mean()
    complete_df[f'{col_name} Z Score'] = centered / rh_values.std()

complete_df

In [None]:
# Add the RH_50 / RH_100 ratio feature.
# A zero RH_100 with a non-zero RH_50 would make the quotient +/-inf. Infinity
# is not NaN, so it survives the later dropna() filtering, and scikit-learn's
# scalers refuse infinite input. Mask zero denominators so the ratio is NaN
# (i.e. missing) for those rows instead.
# NOTE: a column that contains any NaN is removed by the dropna(axis=1) step
# in the filtering cell.
rh_100_nonzero = complete_df['RH_100'].where(complete_df['RH_100'] != 0)
complete_df['RH_50_v_100'] = complete_df['RH_50'] / rh_100_nonzero
complete_df

In [None]:
# Add the (RH95 - RH50) difference as a new feature column
rh_95 = complete_df['RH_95']
rh_50 = complete_df['RH_50']
complete_df['RH_95_minus_50'] = rh_95.sub(rh_50)
complete_df

In [None]:
# Add the 'Missingness' feature: the number of NaN cells in each row
nan_mask = complete_df.isna()
complete_df['Missingness'] = nan_mask.sum(axis=1)

# Show how many rows have each NaN count
print('Number of NaNs:')
print(complete_df['Missingness'].value_counts())
complete_df

In [None]:
# Pre-save sanity checks (the actual save is optional, see the end of the cell):
# report duplicated column names and the NaN count of every column.
columns = complete_df.columns

# Boolean mask marking every repeated occurrence of a column name
is_duplicate = columns.duplicated()
duplicate_columns = columns[is_duplicate]
print("Duplicate columns:", duplicate_columns)
print(f'There are {is_duplicate.sum()} duplicated columns.')
print('NaN value count:\n')
print(complete_df.isnull().sum())
# complete_df.describe()

# Uncomment the two lines below to write to parquet
# complete_df.to_parquet("input_raw.parquet", engine="pyarrow", compression="snappy")
# print('Successfully wrote to a .parquet file!')

#### This is the filtering step for PCA if there are any NaN rows in the dataframe
Note: there should not be many NaN values if you accounted for the different shapes of the data inside the HDF5 files.

In [None]:
# Optional: reload complete_df from the parquet file.
# The cell that writes 'input_raw.parquet' has its save lines commented out,
# so the file may not exist. In that case keep the in-memory complete_df
# instead of stopping the notebook with a FileNotFoundError.
raw_parquet_path = 'input_raw.parquet'
if os.path.exists(raw_parquet_path):
    complete_df = pd.read_parquet(raw_parquet_path, engine='pyarrow')
    print(f"Loaded complete_df from {raw_parquet_path}")
else:
    print(f"{raw_parquet_path} not found; keeping the in-memory complete_df")
complete_df

In [None]:
# from sklearn.impute import SimpleImputer

print(f'Original dataframe shape: {complete_df.shape}')

# Filtering
# Drop the identifier column into a new frame rather than in place, so that
# complete_df is left intact and this cell can be re-run without a KeyError.
features_df = complete_df.drop(columns=['Shot Number'])
print("Shot numbers dropped")

# Remove every column that contains at least one NaN
discounted_df = features_df.dropna(axis=1)


# imputer = SimpleImputer(strategy='most_frequent')  # You can change to 'median', 'most_frequent', etc.
# discounted_df = pd.DataFrame(imputer.fit_transform(features_df), columns=features_df.columns)

print("Dataframe shape after dropping NaN columns:", discounted_df.shape)

# Step 1: Separate features and target (if applicable)
# Exclude non-numeric columns if present
filtered_df = discounted_df.select_dtypes(include=[np.number])
# columns_to_keep = [col for col in filtered_df.columns if 'RH' not in col]
# filtered_df = filtered_df[columns_to_keep]

print(f'Non-numeric columns removed from dataframe\nCleaned dataframe size: {filtered_df.shape}')
filtered_df

In [None]:
# Step 2, OPTION 1: Standardize the data using StandardScaler (for demo only)
# The scaler must actually run here: the dataframe construction below needs
# `scaled_data`, which is not defined anywhere else in the notebook.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(filtered_df)
print(f'Data scaled\nScaled data size (ndarray): {scaled_data.shape}')

# Wrap the scaled ndarray back into a dataframe with the original labels
scaled_df = pd.DataFrame(scaled_data, columns=filtered_df.columns, index=filtered_df.index)
print("Made new scaled dataframe")

# Write scaled_df to parquet (comment out the two lines below to skip the write)
scaled_df.to_parquet("input_standard_scaled.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')

# Kept as the last expression so the summary statistics are displayed
scaled_df.describe()

In [None]:
# OPTION 2: Standardize the data using RobustScaler (this is better for outlier detection)
# RobustScaler is imported in the notebook's import cell.

# `filtered_df` is the cleaned dataframe that holds only numeric columns
robust_scaler = RobustScaler()
print("Instantiated RobustScalar")

# Fit the scaler on the dataframe and transform it
scaled_array = robust_scaler.fit_transform(filtered_df)
print("Fitted RobustScalar")

# Wrap the scaled ndarray back into a dataframe with the original labels
scaled_df = pd.DataFrame(scaled_array, columns=filtered_df.columns, index=filtered_df.index)
print("Made new scaled dataframe")

# Write scaled_df to parquet (comment out the two lines below to skip the write)
scaled_df.to_parquet("input_scaled.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')

# Kept as the last expression so the summary statistics are displayed
scaled_df.describe()