# Outlier Detector Version Two

- Features to add: Time of day, laser strength, long, lat, solar elev., azimuth ...
- Assemble a massive file with all the data; each row is a shot
- Use more files from GDrive, get 1000 files
- Do a PCA on the massive file (exploratory data analysis)
- Probably 1 or 2 RH metrics are enough for RH
- Instrumentation data may work too
- For clustering to work, we need to know where each cluster is -- we're really modeling the covariance matrices
- "Given this set of data, can this point be drawn from this cov matrix?"
- Look at cov within consecutive shots
- Learn from different passes at different times (how?)
- Idea: being aware of context

## Questions for Jim:
- What does a good waveform need to satisfy?
- Does it make sense?

In [7]:
import os
import random

import h5py
import numpy as np
import pandas as pd
import geopandas as gp
import geoviews as gv
from geoviews import opts, tile_sources as gvts
import holoviews as hv
gv.extension('bokeh', 'matplotlib')
from shapely.geometry import Point
import warnings
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 

import matplotlib
import matplotlib.pyplot as plt
import PyQt6

In [8]:
inDir = os.getcwd() + "\\GEDI_sample_files"
print(inDir)
# input_file_names = [g for g in os.listdir(inDir) if g.startswith('GEDI02_A') and g.endswith('.h5')]  # List all GEDI level 2 files in inDir
input_file_names = ['GEDI02_A_2021086153349_O12964_03_T08275_02_003_02_V002.h5']
input_file_names

C:\Users\jingb\OneDrive\Documents\GEDI Outlier Detection\GEDI_sample_files


['GEDI02_A_2021086153349_O12964_03_T08275_02_003_02_V002.h5']

### Loading files with RH information into a huge Pandas dataframe

In [9]:
input_files = []
files_to_beams = dict()
for n in input_file_names:
    file_path = os.path.join(inDir, n)  # Select an example file
    file = h5py.File(file_path, 'r')
    input_files.append(file)
    
    print('Loading file: ' + n)
    print('The file contains the following groups: ' + str(list(file.keys())))
    
    print("The file's metadata contains the following attributes: ")
    for g in file['METADATA']['DatasetIdentification'].attrs: print(g)
    
    beamNames = [g for g in file.keys() if g.startswith('BEAM')]
    files_to_beams[file] = beamNames
    
    print("The file contains the following beams: ")
    for b in beamNames: 
        print(f"{b} is a {file[b].attrs['description']}")

Loading file: GEDI02_A_2021086153349_O12964_03_T08275_02_003_02_V002.h5
The file contains the following groups: ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']
The file's metadata contains the following attributes: 
PGEVersion
VersionID
abstract
characterSet
creationDate
credit
fileName
language
originatorOrganizationName
purpose
shortName
spatialRepresentationType
status
topicCategory
uuid
The file contains the following beams: 
BEAM0000 is a Coverage beam
BEAM0001 is a Coverage beam
BEAM0010 is a Coverage beam
BEAM0011 is a Coverage beam
BEAM0101 is a Full power beam
BEAM0110 is a Full power beam
BEAM1000 is a Full power beam
BEAM1011 is a Full power beam


In [10]:
all_objs = []
all_datasets = []
file_to_rh = dict()
file_to_quality = dict()

for f in input_files:
    f.visit(all_objs.append) # Retrieve list of datasets
    local_datasets = [o for o in all_objs if isinstance(f[o], h5py.Dataset)]
    local_rh_datasets = [d for d in local_datasets if d.endswith('/rh')]
    
    local_quality_ds = [q for q in local_datasets if q.endswith('/quality_flag') and "rx" not in q]
    print(f'All quality_flag datasets: {local_quality_ds}')
    
    print(f'All RH datasets in file {f}: ' + str(local_rh_datasets))
    all_datasets.extend(local_datasets)  # Search for relevant SDS inside data file
    
    file_to_rh[f] = local_rh_datasets
    file_to_quality[f] = local_quality_ds

print('10 randomly selected objects: ' + str(random.sample(all_objs, 10)))
print('10 randomly selected datasets: ' + str(random.sample(all_datasets, 10)))

All quality_flag datasets: ['BEAM0000/quality_flag', 'BEAM0001/quality_flag', 'BEAM0010/quality_flag', 'BEAM0011/quality_flag', 'BEAM0101/quality_flag', 'BEAM0110/quality_flag', 'BEAM1000/quality_flag', 'BEAM1011/quality_flag']
All RH datasets in file <HDF5 file "GEDI02_A_2021086153349_O12964_03_T08275_02_003_02_V002.h5" (mode r)>: ['BEAM0000/rh', 'BEAM0001/rh', 'BEAM0010/rh', 'BEAM0011/rh', 'BEAM0101/rh', 'BEAM0110/rh', 'BEAM1000/rh', 'BEAM1011/rh']
10 randomly selected objects: ['BEAM0001/rx_processing_a3/ancillary/rx_front_threshold', 'BEAM0010/rx_assess/ancillary', 'BEAM0101/rx_processing_a4/rx_modelocalenergy', 'BEAM0101/rx_processing_a1/min_detection_energy', 'BEAM1000/geolocation/lon_highestreturn_a5', 'BEAM0001/rx_processing_a4/rx_modeenergytobotloc', 'BEAM0000/rx_processing_a5/rx_iwaveamps', 'BEAM0101/beam', 'BEAM0011/rx_processing_a5/rx_modelocalslope', 'BEAM1000/degrade_flag']
10 randomly selected datasets: ['BEAM0000/rx_processing_a2/ancillary/ampval_limit3', 'BEAM1011/rx_p

In [11]:
for f in input_files:
    print(f"File: {f.filename}")
    for b in files_to_beams[f]:
        print(f"Beam: {b}")
        print([ds for ds in all_datasets if b in ds][:])

File: C:\Users\jingb\OneDrive\Documents\GEDI Outlier Detection\GEDI_sample_files\GEDI02_A_2021086153349_O12964_03_T08275_02_003_02_V002.h5
Beam: BEAM0000
['BEAM0000/ancillary/l2a_alg_count', 'BEAM0000/beam', 'BEAM0000/channel', 'BEAM0000/degrade_flag', 'BEAM0000/delta_time', 'BEAM0000/digital_elevation_model', 'BEAM0000/digital_elevation_model_srtm', 'BEAM0000/elev_highestreturn', 'BEAM0000/elev_lowestmode', 'BEAM0000/elevation_bias_flag', 'BEAM0000/elevation_bin0_error', 'BEAM0000/energy_total', 'BEAM0000/geolocation/elev_highestreturn_a1', 'BEAM0000/geolocation/elev_highestreturn_a2', 'BEAM0000/geolocation/elev_highestreturn_a3', 'BEAM0000/geolocation/elev_highestreturn_a4', 'BEAM0000/geolocation/elev_highestreturn_a5', 'BEAM0000/geolocation/elev_highestreturn_a6', 'BEAM0000/geolocation/elev_lowestmode_a1', 'BEAM0000/geolocation/elev_lowestmode_a2', 'BEAM0000/geolocation/elev_lowestmode_a3', 'BEAM0000/geolocation/elev_lowestmode_a4', 'BEAM0000/geolocation/elev_lowestmode_a5', 'BEAM00

# To dos:
- Try reproducing issue on OSCAR terminal
- Request machine on OSCAR that gives interactive access, NOT schedule a job
- Start interactive session, use correct Py ver, load libs, run code
- File mapping
- Do 1 concat and see how much memory it uses

- Beam & channel are relevant
- Exclude RH's in PCA
- Want meta-data, not data per se
- SHAP after ML output
- Google Earth Engine

In [20]:
# """We will now create a massive dataframe with all the data from each beam from each file."""
# dataframes = []
# for ds in all_datasets:
#   print(f'Loading dataset {ds}')
#   # Load the dataset into a Pandas dataframe
#   df = pd.DataFrame(data=f[ds][()])
#   dataframes.append(df)
#   print(f"Appended {df}. Dataset shape: {df.shape}")
# 
# complete_df = pd.concat(dataframes)
# # Display or save the final dataframe
# print("Final dataframe shape:", complete_df.shape)
# complete_df

output_file = "complete_df_new.csv"  # Output file for incremental saving
save_mode = "w"  # Write mode for the first chunk, then switch to append
header = True  # Include header only for the first chunk

for ds in all_datasets:
    print(f'Loading dataset {ds}')
    # Load the dataset into a Pandas dataframe
    df = pd.DataFrame(data=f[ds][()])
    print(f"Dataset shape: {df.shape}")
    
    # Save the dataframe to a CSV file incrementally
    df.to_csv(output_file, mode=save_mode, header=header, index=False)
    save_mode = "a"  # Switch to append mode for subsequent chunks
    header = False  # Exclude header for subsequent chunks
    
    if not header:
        # Append columns to the existing file
        # Read only the number of rows in the current chunk to match
        existing_df = pd.read_csv(output_file, nrows=df.shape[0])

        # Combine the existing data with the new chunk column-wise
        combined_df = pd.concat([existing_df, df], axis=1)

        # Overwrite the file with the updated content
        combined_df.to_csv(output_file, mode='w', header=True, index=False)

print(f"Data has been saved incrementally to {output_file}")

Loading dataset BEAM0000/ancillary/l2a_alg_count
Dataset shape: (1, 1)
Loading dataset BEAM0000/beam
Dataset shape: (167280, 1)
Loading dataset BEAM0000/channel
Dataset shape: (167280, 1)
Loading dataset BEAM0000/degrade_flag
Dataset shape: (167280, 1)
Loading dataset BEAM0000/delta_time
Dataset shape: (167280, 1)
Loading dataset BEAM0000/digital_elevation_model
Dataset shape: (167280, 1)
Loading dataset BEAM0000/digital_elevation_model_srtm
Dataset shape: (167280, 1)
Loading dataset BEAM0000/elev_highestreturn
Dataset shape: (167280, 1)
Loading dataset BEAM0000/elev_lowestmode
Dataset shape: (167280, 1)
Loading dataset BEAM0000/elevation_bias_flag
Dataset shape: (167280, 1)
Loading dataset BEAM0000/elevation_bin0_error
Dataset shape: (167280, 1)
Loading dataset BEAM0000/energy_total
Dataset shape: (167280, 1)
Loading dataset BEAM0000/geolocation/elev_highestreturn_a1
Dataset shape: (167280, 1)
Loading dataset BEAM0000/geolocation/elev_highestreturn_a2
Dataset shape: (167280, 1)
Loadin

KeyboardInterrupt: 

### Incremental PCA

In [16]:
df = pd.read_csv("complete_df.csv", nrows=5)
print(df)

   0
0  6
1  0
2  0
3  0
4  0


In [15]:
from sklearn.decomposition import IncrementalPCA


# Define file path and parameters
input_file = "complete_df.csv"
chunk_size = 100000  # Number of rows per chunk
n_components = 3  # Number of principal components

# Initialize IncrementalPCA
ipca = IncrementalPCA(n_components=n_components)

# Process the CSV in chunks
chunks = pd.read_csv(input_file, chunksize=chunk_size)
print("Processing the data in chunks...")
for chunk in chunks:
    # Convert to NumPy array
    chunk_data = chunk.to_numpy()
    print("Chunk shape:", chunk.shape)
    # Incrementally fit the PCA model on the chunk
    ipca.partial_fit(chunk_data)

# Transform the data in chunks and save the results
output_file = "reduced_data.csv"
chunks = pd.read_csv(input_file, chunksize=chunk_size)
for chunk in chunks:
    chunk_data = chunk.to_numpy()
    reduced_data = ipca.transform(chunk_data)
    
    # Save reduced data incrementally
    pd.DataFrame(reduced_data).to_csv(output_file, mode='a', header=False, index=False)

print("PCA completed and reduced data saved to", output_file)

Processing the data in chunks...
Chunk shape: (100000, 1)


ValueError: n_components=3 must be less or equal to the batch number of samples 1.

### Adding z-scores of each RH metric to the dataframe

In [7]:
rh_nums = [25, 50, 75, 85, 95]
for i in rh_nums:
    col_name = f'*RH {i}'
    complete_df[f'{col_name} Z Score'] = (complete_df[col_name] - complete_df[col_name].mean()) / complete_df[col_name].std()

complete_df

Unnamed: 0,File Name,Beam Name,Shot Number,Latitude,Longitude,Tandem-X DEM,SRTM DEM,*Elevation (m),*Canopy Elevation (m),*RH 25,...,*RH 95,Quality Flag,Degrade Flag,Sensitivity,Selected Algorithm,*RH 25 Z Score,*RH 50 Z Score,*RH 75 Z Score,*RH 85 Z Score,*RH 95 Z Score
928,GEDI02_A_2021050140102_O12405_02_T10912_02_003...,BEAM0000,1.240500e+17,26.949514,-140.210474,-999999.0,-999999.0,-25.178839,-22.930853,-0.63,...,1.76,1.0,0.0,0.525841,1.0,0.079091,-0.223630,-0.352021,-0.404941,-0.482947
930,GEDI02_A_2021050140102_O12405_02_T10912_02_003...,BEAM0000,1.240500e+17,26.949901,-140.210092,-999999.0,-999999.0,-25.532187,-23.583931,-0.86,...,1.42,1.0,0.0,0.626149,1.0,-0.035554,-0.283883,-0.416731,-0.465439,-0.537166
931,GEDI02_A_2021050140102_O12405_02_T10912_02_003...,BEAM0000,1.240500e+17,26.950094,-140.209901,-999999.0,-999999.0,-25.935125,-23.649672,-0.82,...,1.72,1.0,0.0,0.648748,1.0,-0.015616,-0.257104,-0.371194,-0.427124,-0.489325
932,GEDI02_A_2021050140102_O12405_02_T10912_02_003...,BEAM0000,1.240500e+17,26.950287,-140.209711,-999999.0,-999999.0,-25.398439,-23.300318,-0.71,...,1.61,1.0,0.0,0.545459,1.0,0.039214,-0.247061,-0.380781,-0.435190,-0.506867
933,GEDI02_A_2021050140102_O12405_02_T10912_02_003...,BEAM0000,1.240500e+17,26.950481,-140.209520,-999999.0,-999999.0,-25.742947,-23.457495,-0.63,...,1.76,1.0,0.0,0.636309,1.0,0.079091,-0.223630,-0.352021,-0.413008,-0.482947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154849,GEDI02_A_2021086153349_O12964_03_T08275_02_003...,BEAM1011,1.296411e+17,5.315743,125.769220,-999999.0,-999999.0,65.394615,68.387398,-0.97,...,1.90,1.0,0.0,0.867145,1.0,-0.090384,-0.270493,-0.371194,-0.413008,-0.460621
154850,GEDI02_A_2021086153349_O12964_03_T08275_02_003...,BEAM1011,1.296411e+17,5.315323,125.769519,-999999.0,-999999.0,65.368126,68.248680,-1.04,...,1.83,1.0,0.0,0.840408,1.0,-0.125276,-0.293925,-0.387971,-0.429141,-0.471784
154851,GEDI02_A_2021086153349_O12964_03_T08275_02_003...,BEAM1011,1.296411e+17,5.314903,125.769818,-999999.0,-999999.0,65.539238,68.681664,-0.93,...,2.09,1.0,0.0,0.787433,1.0,-0.070446,-0.247061,-0.344831,-0.382759,-0.430322
154852,GEDI02_A_2021086153349_O12964_03_T08275_02_003...,BEAM1011,1.296411e+17,5.314483,125.770117,-999999.0,-999999.0,65.537354,68.193443,-0.82,...,1.94,1.0,0.0,0.631403,1.0,-0.015616,-0.247061,-0.352021,-0.398892,-0.454243
