# Matching shot_number between GEDI L1B & ALS
We will match shot numbers in the Level 1B GEDI data with the ALS crossover data.

In [4]:
import os
import random

import h5py
import numpy as np
import pandas as pd
import geopandas as gp
# import geoviews as gv
# from geoviews import opts, tile_sources as gvts
# import holoviews as hv
# gv.extension('bokeh', 'matplotlib')
from shapely.geometry import Point
import warnings
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 

import matplotlib
import matplotlib.pyplot as plt
# import PyQt6

In [5]:
# Directory holding the GEDI HDF5 granules, resolved against the current working directory.
inDir = os.path.join(os.getcwd(), "Input_files")
print(inDir)
# Keep only GEDI Level 1B granules (names matching GEDI01_B*.h5).
# os.listdir returns entries in arbitrary order, so sort the names to make the
# processing order (and therefore the row order downstream) reproducible.
input_file_names = sorted(
    g for g in os.listdir(inDir) if g.startswith('GEDI01_B') and g.endswith('.h5')
)
input_file_names

/oscar/scratch/jzhu118/GEDI_Outlier_Detection_OSCAR/Input_files


['GEDI01_B_2022004042652_O17343_04_T10772_02_005_02_V002.h5',
 'GEDI01_B_2022207041426_O20491_04_T09293_02_005_03_V002.h5']

### Loading files with all information into a huge Pandas dataframe

#### Preprocessing: Get files and sort by beam

In [6]:
# Open every selected granule read-only and record which BEAM groups it holds.
# The handles are not closed here: they are stored in `input_files` and also
# used as the keys of `files_to_beams`.
input_files = []
files_to_beams = dict()
for n in input_file_names:
    file_path = os.path.join(inDir, n)
    file = h5py.File(file_path, 'r')
    input_files.append(file)

    print(f'Loading file: {n}')
    print(f'The file contains the following groups: {list(file.keys())}')

    # Names of the attributes attached to METADATA/DatasetIdentification
    print("The file's metadata contains the following attributes: ")
    for g in file['METADATA']['DatasetIdentification'].attrs:
        print(g)

    # Top-level entries whose name starts with 'BEAM'
    beamNames = [g for g in file.keys() if g.startswith('BEAM')]
    files_to_beams[file] = beamNames

    print("The file contains the following beams: ")
    for b in beamNames:
        description = file[b].attrs['description']
        print(f"{b} is a {description}")

Loading file: GEDI01_B_2022004042652_O17343_04_T10772_02_005_02_V002.h5
The file contains the following groups: ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']
The file's metadata contains the following attributes: 
PGEVersion
VersionID
abstract
characterSet
creationDate
credit
fileName
language
originatorOrganizationName
purpose
shortName
spatialRepresentationType
status
topicCategory
uuid
The file contains the following beams: 
BEAM0000 is a Coverage beam
BEAM0001 is a Coverage beam
BEAM0010 is a Coverage beam
BEAM0011 is a Coverage beam
BEAM0101 is a Full power beam
BEAM0110 is a Full power beam
BEAM1000 is a Full power beam
BEAM1011 is a Full power beam
Loading file: GEDI01_B_2022207041426_O20491_04_T09293_02_005_03_V002.h5
The file contains the following groups: ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']
The file's metadata contains the following attribute

#### Helper functions for the main loop

In [7]:
# -----------------------------------------------------------------------------
# HELPER FUNCTIONS
# -----------------------------------------------------------------------------

def collect_all_datasets(file_path):
    """
    Walk an HDF5 file and list the path of every dataset it contains.

    The file at file_path is opened read-only for the duration of the walk
    and closed again before returning. Groups are traversed but not listed.

    Returns a list of dataset path strings, in visiting order.
    """
    found = []

    def _keep_datasets(path, obj):
        # Anything that is not a dataset (e.g. a group) is ignored.
        if not isinstance(obj, h5py.Dataset):
            return
        found.append(path)

    with h5py.File(file_path, 'r') as handle:
        handle.visititems(_keep_datasets)

    return found

def get_dataset_by_name(h5_file, beam_ds, name):
    """
    Look up a dataset by its trailing path component and read it into memory.

    h5_file : open HDF5 file, indexed by dataset path.
    beam_ds : list of dataset path strings to search.
    name    : dataset name; a path matches when it ends with '/{name}'.

    Returns the contents of the first matching path in beam_ds. When no path
    matches, a warning is printed and None is returned.
    """
    suffix = f'/{name}'
    for path in beam_ds:
        if path.endswith(suffix):
            # Indexing with [()] pulls the dataset contents into memory.
            return h5_file[path][()]

    print(f"Warning: No dataset ending with '/{name}' found.")
    return None

def load_attributes_to_dict(txt_file_path):
    """
    Build the attribute lookup from a plain-text list of names.

    The file at txt_file_path is read line by line; surrounding whitespace is
    stripped and blank lines are skipped. Each remaining name is mapped to
    itself, so the returned dict has identical keys and values
    (column label -> dataset suffix).
    """
    with open(txt_file_path, 'r') as handle:
        stripped = (raw.strip() for raw in handle)
        return {name: name for name in stripped if name}

In [8]:
# -----------------------------------------------------------------------------
# LOAD ALS CROSSOVER DATA
# -----------------------------------------------------------------------------

als_csv_path = os.path.join('Input_files', 'GEDI_ALSCROSSOVERS.csv')
# Read shot_number as text straight from the CSV. If pandas inferred the type
# and the column contained even one missing value, it would be parsed as
# float64, and converting large integer IDs through float loses digits, so the
# later string comparison against GEDI shot numbers would silently fail.
als_df = pd.read_csv(als_csv_path, dtype={'shot_number': str})

# Shot numbers are matched as strings; strip stray whitespace so that values
# such as ' 123' compare equal to '123'.
als_df['shot_number'] = als_df['shot_number'].astype(str).str.strip()
als_shot_set = set(als_df['shot_number'])
print(f"Loaded ALS crossover data with {len(als_shot_set)} unique shot numbers.")

Loaded ALS crossover data with 76778 unique shot numbers.


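### THIS CELL IS FOR TESTING ONLY! It overwrites `als_shot_set` with a random 10% sample of GEDI shot numbers — skip it when matching against the real ALS crossover data.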

In [9]:
# TESTING ONLY: this cell rebinds `als_shot_set` to a random sample of GEDI
# shot numbers, replacing any set built earlier in the session.
gedi_df = pd.read_parquet('L1B_raw.parquet')

# Sampling parameters; the fixed random_state keeps the sample reproducible.
sample_fraction = 0.1
random_state = 42

# Shot numbers are taken from the 'Shot Number' column of the parquet table.
als_shot_series = gedi_df['Shot Number'].sample(
    frac=sample_fraction,
    random_state=random_state,
)

# Shot numbers are compared as strings later on, so stringify before building the set.
als_shot_set = set(als_shot_series.astype(str))

print(f"Selected {len(als_shot_set)} shot numbers for als_shot_set.")

Selected 182065 shot numbers for als_shot_set.


In [10]:
# -----------------------------------------------------------------------------
# LOAD ATTRIBUTE DICTIONARY (including Beam and Channel, among others)
# -----------------------------------------------------------------------------

# Text file listing the dataset names to extract, one per line.
txt_path = 'L1B_features.txt'
ATTRIBUTES_TO_LOAD = load_attributes_to_dict(txt_path)
print(f"Loaded attributes: {ATTRIBUTES_TO_LOAD}")

Loaded attributes: {'all_samples_sum': 'all_samples_sum', 'beam': 'beam', 'channel': 'channel', 'master_frac': 'master_frac', 'master_int': 'master_int', 'noise_mean_corrected': 'noise_mean_corrected', 'noise_stddev_corrected': 'noise_stddev_corrected', 'nsemean_even': 'nsemean_even', 'nsemean_odd': 'nsemean_odd', 'rx_energy': 'rx_energy', 'rx_offset': 'rx_offset', 'rx_open': 'rx_open', 'rx_sample_count': 'rx_sample_count', 'rx_sample_start_index': 'rx_sample_start_index', 'selection_stretchers_x': 'selection_stretchers_x', 'selection_stretchers_y': 'selection_stretchers_y', 'shot_number': 'shot_number', 'stale_return_flag': 'stale_return_flag', 'th_left_used': 'th_left_used', 'tx_egamplitude': 'tx_egamplitude', 'tx_egamplitude_error': 'tx_egamplitude_error', 'tx_egbias': 'tx_egbias', 'tx_egbias_error': 'tx_egbias_error', 'tx_egflag': 'tx_egflag', 'tx_eggamma': 'tx_eggamma', 'tx_eggamma_error': 'tx_eggamma_error', 'tx_egsigma': 'tx_egsigma', 'tx_egsigma_error': 'tx_egsigma_error', 'tx_

#### Main loop, collects all target features and puts them inside a Pandas dataframe.
WARNING: Despite my best attempts at optimization, this is *extremely* memory intensive even for one 5GB HDF5 file. Only run on a supercomputer.

In [15]:
# -----------------------------------------------------------------------------
# PROCESS GEDI LEVEL 1B FILES & MATCH SHOT NUMBERS
# -----------------------------------------------------------------------------

# Per-beam DataFrames of matched shots; beams without a match contribute nothing.
dataframes = []

# Output column names for the five columns of the 'surface_type' dataset, in order.
surface_col_names = [
    'surface_land', 'surface_ocean', 'surface_sea_ice',
    'surface_land_ice', 'surface_inland_water',
]

# Loop-invariant: build the array of target shot numbers once instead of
# converting the set to a list again for every beam.
als_shot_array = np.array(list(als_shot_set), dtype=str)

for f in input_files:
    print(f"\nProcessing file: {f.filename}")

    # Collect all dataset paths in the current file
    all_ds_for_file = collect_all_datasets(f.filename)

    # Process each beam in the current file
    for b in files_to_beams[f]:
        print(f"  Processing beam: {b}")

        # Filter dataset paths to only those belonging to the current beam
        beam_ds = [ds for ds in all_ds_for_file if b in ds]

        # 1) Retrieve shot numbers for the beam and convert to string for matching
        shotNums = get_dataset_by_name(f, beam_ds, 'shot_number')
        if shotNums is None:
            print(f"    Warning: No 'shot_number' dataset found for beam {b}. Skipping beam.")
            continue
        shotNums = np.array(shotNums).astype(str)
        total_shots = len(shotNums)

        # 2) Identify indices where GEDI shot numbers match the target shot numbers
        matching_indices = np.where(np.isin(shotNums, als_shot_array))[0]
        if matching_indices.size == 0:
            print(f"    No matching shot numbers found for beam {b}.")
            continue
        row_count = matching_indices.size
        print(f"    Found {row_count} matching shot(s).")

        # Build constant columns: File Name and Beam Name
        file_name_col = [os.path.basename(f.filename)] * row_count
        beam_name_col = [b] * row_count

        # 3) Extract all other attributes as specified in the txt file
        attr_data = {}
        for col_label, ds_suffix in ATTRIBUTES_TO_LOAD.items():
            data_array = get_dataset_by_name(f, beam_ds, ds_suffix)
            if data_array is None:
                print(f"    Warning: Missing dataset for {ds_suffix}. Filling with None.")
                attr_data[col_label] = np.full(row_count, None, dtype=object)
                continue

            data_array = np.asarray(data_array)
            if data_array.ndim == 0 or data_array.shape[0] != total_shots:
                # The dataset is not one-row-per-shot, so shot indices do not
                # address it; indexing anyway would return misaligned values
                # or raise IndexError. Treat it like a missing dataset.
                print(f"    Warning: Row count mismatch for {col_label} in beam {b}. Filling with None.")
                attr_data[col_label] = np.full(row_count, None, dtype=object)
                continue

            attr_data[col_label] = data_array[matching_indices]

        # 4) Extract the 'surface_type' feature (2D array with shape (5, # of shots))
        surface_data = get_dataset_by_name(f, beam_ds, 'surface_type')
        surface_matrix = None  # becomes a (total_shots, 5) array when usable
        if surface_data is not None:
            surface_data = np.asarray(surface_data)
            if surface_data.shape == (5, total_shots):
                surface_matrix = surface_data.T
            elif surface_data.shape == (total_shots, 5):
                surface_matrix = surface_data
            else:
                print("    Warning: Unexpected shape for 'surface_type'. Expected (5, #shots) or (#shots, 5).")

        if surface_matrix is None:
            # Missing or unusable surface_type: fill all five columns with NaN
            surface_cols = {col: np.full(row_count, np.nan) for col in surface_col_names}
        else:
            matched_surface = surface_matrix[matching_indices]
            surface_cols = {
                col: matched_surface[:, pos] for pos, col in enumerate(surface_col_names)
            }

        # 5) Filter the shot numbers to only the matching ones
        shotNums_filtered = shotNums[matching_indices]

        # 6) Construct the DataFrame for the current beam
        df = pd.DataFrame({
            'File Name': file_name_col,
            'Beam Name': beam_name_col,
            'Shot Number': shotNums_filtered,
            **attr_data,
            **surface_cols
        })

        dataframes.append(df)
        print(f"    Processed beam {b} with {row_count} matching shots.")

# 7) One-hot encode the Beam and Channel identifiers.
#    This runs after the loop, against the full set of levels seen across all
#    beams. Encoding each beam on its own creates only that beam's indicator
#    column, so after concatenation the indicators are NaN (not False/0) for
#    rows coming from every other beam.
for column, prefix in (('Beam Name', ''), ('channel', 'Channel')):
    present = [frame[column] for frame in dataframes if column in frame.columns]
    if not present:
        continue
    levels = sorted(pd.concat(present, ignore_index=True).dropna().unique())
    for i, df in enumerate(dataframes):
        if column in df.columns:
            # A categorical with explicit categories makes get_dummies emit
            # one indicator column per level, including unobserved ones.
            df[column] = pd.Categorical(df[column], categories=levels)
            dataframes[i] = pd.get_dummies(df, columns=[column], prefix=prefix)

print('ALL DONE!')


Processing file: /oscar/scratch/jzhu118/GEDI_Outlier_Detection_OSCAR/Input_files/GEDI01_B_2022004042652_O17343_04_T10772_02_005_02_V002.h5
  Processing beam: BEAM0000
    Found 16820 matching shot(s).
    Processed beam BEAM0000 with 16820 matching shots.
  Processing beam: BEAM0001
    Found 16647 matching shot(s).
    Processed beam BEAM0001 with 16647 matching shots.
  Processing beam: BEAM0010
    Found 16738 matching shot(s).
    Processed beam BEAM0010 with 16738 matching shots.
  Processing beam: BEAM0011
    Found 17068 matching shot(s).
    Processed beam BEAM0011 with 17068 matching shots.
  Processing beam: BEAM0101
    Found 16810 matching shot(s).
    Processed beam BEAM0101 with 16810 matching shots.
  Processing beam: BEAM0110
    Found 16803 matching shot(s).
    Processed beam BEAM0110 with 16803 matching shots.
  Processing beam: BEAM1000
    Found 16898 matching shot(s).
    Processed beam BEAM1000 with 16898 matching shots.
  Processing beam: BEAM1011
    Found 169

In [16]:
# -----------------------------------------------------------------------------
# FINALIZE: Concatenate all partial DataFrames into one final DataFrame
# -----------------------------------------------------------------------------

# Stack the per-beam frames into one table with a fresh 0..n-1 index;
# fall back to an empty frame when no beam produced any rows.
if not dataframes:
    complete_df = pd.DataFrame()
    print("\nNo matching data found in any beams.")
else:
    complete_df = pd.concat(dataframes, ignore_index=True)
    print(f"\nFinal DataFrame shape: {complete_df.shape}")

# Optionally, save the complete DataFrame for further processing:
# complete_df.to_parquet("ALS_and_L1B.parquet", engine="pyarrow", compression="snappy")
# print('Successfully wrote to a .parquet file!')


Final DataFrame shape: (182065, 97)


#### Feature engineering
1. Adding Z-Scores for the six RH metrics used below (RH25, RH50, RH75, RH85, RH95, RH100)
2. Creating RH50 / RH100 ratio
3. Adding RH95 - RH50
4. Adding "Missingness" - the number of NaNs in each row

In [None]:
# Count the rows whose RH_100 value is exactly zero.
# NOTE(review): assumes complete_df has an 'RH_100' column — confirm it is among the loaded attributes.
num_zeros = complete_df['RH_100'].eq(0).sum()
print(f"Number of zeros in RH_100: {num_zeros}")

In [None]:
# Append a z-score column named '<column> Z Score' for each of the six RH
# columns listed in rh_nums: (value - column mean) / column std.
rh_nums = [25, 50, 75, 85, 95, 100]
for i in rh_nums:
    col_name = f'RH_{i}'
    rh_values = complete_df[col_name]
    complete_df[f'{col_name} Z Score'] = (rh_values - rh_values.mean()) / rh_values.std()

complete_df

In [None]:
# Ratio feature: RH_50 divided by RH_100, stored as 'RH_50_v_100'.
# NOTE(review): rows where RH_100 is 0 give inf (or NaN for 0/0) here — confirm downstream steps accept that.
complete_df['RH_50_v_100'] = complete_df['RH_50'].div(complete_df['RH_100'])
complete_df

In [None]:
# Spread feature: RH_95 minus RH_50, stored as 'RH_95_minus_50'.
rh_95, rh_50 = complete_df['RH_95'], complete_df['RH_50']
complete_df['RH_95_minus_50'] = rh_95.sub(rh_50)
complete_df

In [None]:
# 'Missingness' feature: the number of NaN cells in each row.
nan_counts_per_row = complete_df.isna().sum(axis=1)
complete_df['Missingness'] = nan_counts_per_row

# Show how many rows fall into each NaN-count bucket
print('Number of NaNs:')
print(complete_df['Missingness'].value_counts())
complete_df

In [None]:
# Pre-save sanity checks: report duplicated column labels and per-column NaN
# counts. The actual parquet write is left commented out at the bottom.
columns = complete_df.columns

# Mask that is True for every label repeating an earlier one
is_repeat = columns.duplicated()
duplicate_columns = columns[is_repeat]
print("Duplicate columns:", duplicate_columns)
print(f'There are {is_repeat.sum()} duplicated columns.')
print('NaN value count:\n')
print(complete_df.isnull().sum())
# complete_df.describe()

# Uncomment the two lines below to write to parquet
# complete_df.to_parquet("input_raw.parquet", engine="pyarrow", compression="snappy")
# print('Successfully wrote to a .parquet file!')

#### This is the filtering step for PCA if there are any NaN rows in the dataframe
Note: there should not be many NaN values if you accounted for the different shapes of the data inside the HDF5 files.

In [None]:
# Optional: replace complete_df with the contents of input_raw.parquet.
parquet_path = 'input_raw.parquet'
complete_df = pd.read_parquet(parquet_path, engine='pyarrow')
complete_df

In [None]:
# from sklearn.impute import SimpleImputer

print(f'Original dataframe shape: {complete_df.shape}')

# Filtering
# Drop the identifier column by rebinding (not in place) and tolerate its
# absence, so that re-running this cell does not raise KeyError.
complete_df = complete_df.drop(columns=['Shot Number'], errors='ignore')
print("Shot numbers dropped")
# Remove every column that contains at least one NaN
discounted_df = complete_df.dropna(axis=1)


# imputer = SimpleImputer(strategy='most_frequent')  # You can change to 'median', 'most_frequent', etc.
# discounted_df = pd.DataFrame(imputer.fit_transform(complete_df), columns=complete_df.columns)

print("Dataframe shape after dropping NaN columns:", discounted_df.shape)

# Step 1: Keep numeric columns only.
# NOTE(review): boolean columns are not matched by np.number, so bool-typed
# one-hot indicators would be removed here — confirm that is intended.
filtered_df = discounted_df.select_dtypes(include=[np.number])
# columns_to_keep = [col for col in filtered_df.columns if 'RH' not in col]
# filtered_df = filtered_df[columns_to_keep]

print(f'Non-numeric columns removed from dataframe\nCleaned dataframe size: {filtered_df.shape}')
filtered_df

In [None]:
# Step 2, OPTION 1: Standardize the data using StandardScaler (for demo only)
# The scaler lines must be active: the DataFrame below is built from
# `scaled_data`, which is undefined on a fresh kernel when they are commented out.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_data = scaler.fit_transform(filtered_df)
print(f'Data scaled\nScaled data size (ndarray): {scaled_data.shape}')

# Wrap the scaled ndarray back into a DataFrame with the original labels
scaled_df = pd.DataFrame(scaled_data, columns=filtered_df.columns, index=filtered_df.index)
print("Made new scaled dataframe")
scaled_df
scaled_df.describe()

# Write scaled_df to parquet (comment out the two lines below to skip the write)
scaled_df.to_parquet("input_standard_scaled.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')

In [None]:
# OPTION 2: scale the data with RobustScaler (this is better for outlier detection)
from sklearn.preprocessing import RobustScaler

# `filtered_df` is the cleaned, all-numeric dataframe produced by the filtering step
robust_scaler = RobustScaler()
print("Instantiated RobustScalar")

# Fit on filtered_df and transform it in a single call
scaled_array = robust_scaler.fit_transform(filtered_df)
print("Fitted RobustScalar")

# Re-attach the original row index and column labels to the scaled values
scaled_df = pd.DataFrame(
    data=scaled_array,
    index=filtered_df.index,
    columns=filtered_df.columns,
)
print("Made new scaled dataframe")
scaled_df
scaled_df.describe()

# Write scaled_df to parquet (comment out the two lines below to skip the write)
scaled_df.to_parquet("input_scaled.parquet", engine="pyarrow", compression="snappy")
print('Successfully wrote to a .parquet file!')