# Elliptic Envelope (Robust Covariance) Outlier Detection

In [1]:
import os
import random

import h5py
import numpy as np
import pandas as pd
import geopandas as gp

import matplotlib
import matplotlib.pyplot as plt

### Covariance modeling

In [2]:
'''Importing the original input table from parquet.'''
raw_df = pd.read_parquet(
    'input_raw.parquet',
    engine='pyarrow',
)
# Bare last expression so the notebook renders the frame as a rich table.
raw_df

Unnamed: 0,File Name,Beam Name,Shot Number,RH_25,RH_50,RH_75,RH_85,RH_95,RH_100,channel,...,zcross_localenergy,RH_25 Z Score,RH_50 Z Score,RH_75 Z Score,RH_85 Z Score,RH_95 Z Score,RH_100 Z Score,RH_50_v_100,RH_95_minus_50,Missingness
0,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050304,-0.78,0.00,0.78,1.19,1.79,2.36,0,...,163.479019,-0.267243,-0.224966,-0.212232,-0.213468,-0.214492,-0.240572,0.000000,1.79,0
1,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050305,-0.97,-0.07,0.78,1.23,1.87,2.54,0,...,178.583328,-0.352941,-0.243600,-0.212232,-0.206578,-0.202634,-0.216597,-0.027559,1.94,0
2,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050306,-0.86,-0.07,0.71,1.12,1.72,2.32,0,...,203.563889,-0.303326,-0.243600,-0.225654,-0.225526,-0.224867,-0.245900,-0.030172,1.79,0
3,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050307,-0.86,-0.07,0.74,1.16,1.79,2.47,0,...,221.641510,-0.303326,-0.243600,-0.219902,-0.218636,-0.214492,-0.225921,-0.028340,1.86,0
4,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM0000,20820000200050308,-0.89,-0.03,0.78,1.23,1.94,2.73,0,...,268.898804,-0.316857,-0.232952,-0.212232,-0.206578,-0.192258,-0.191289,-0.010989,1.97,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11607182,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,224961100200293934,-1.23,-0.18,0.74,1.23,2.02,3.70,5,...,2616.521729,-0.470212,-0.272881,-0.219902,-0.206578,-0.180400,-0.062087,-0.048649,2.20,0
11607183,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,224961100200293935,-1.23,-0.22,0.71,1.16,1.94,3.70,5,...,2879.136963,-0.470212,-0.283529,-0.225654,-0.218636,-0.192258,-0.062087,-0.059459,2.16,0
11607184,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,224961100200293936,-1.27,-0.22,0.74,1.23,2.02,3.85,5,...,2619.544189,-0.488253,-0.283529,-0.219902,-0.206578,-0.180400,-0.042108,-0.057143,2.24,0
11607185,/oscar/scratch/jzhu118/GEDI_Outlier_Detection_...,BEAM1011,224961100200293937,-1.16,-0.14,0.78,1.27,2.05,3.81,5,...,2381.172363,-0.438639,-0.262233,-0.212232,-0.199688,-0.175953,-0.047435,-0.036745,2.19,0


In [3]:
'''Importing the scaled input table from parquet.'''
scaled_df = pd.read_parquet(
    'input_standard_scaled.parquet',
    engine='pyarrow',
)
# Bare last expression so the notebook renders the frame as a rich table.
scaled_df

Unnamed: 0,RH_25,RH_50,RH_75,RH_85,RH_95,RH_100,channel,degrade_flag,delta_time,digital_elevation_model,...,zcross_amp,zcross_localenergy,RH_25 Z Score,RH_50 Z Score,RH_75 Z Score,RH_85 Z Score,RH_95 Z Score,RH_100 Z Score,RH_95_minus_50,Missingness
0,-0.261052,-0.219977,-0.205986,-0.204688,-0.209075,-0.235969,-1.160313,-0.225547,-1.733653,-1.193618,...,0.185156,-0.211920,-0.261052,-0.219977,-0.205986,-0.204688,-0.209075,-0.235969,-0.157817,-0.842991
1,-0.344765,-0.238197,-0.205986,-0.198081,-0.197517,-0.212452,-1.160313,-0.225547,-1.733653,-1.193618,...,0.208783,-0.188663,-0.344765,-0.238197,-0.205986,-0.198081,-0.197517,-0.212452,-0.118490,-0.842991
2,-0.296300,-0.238197,-0.219014,-0.216249,-0.219189,-0.241195,-1.160313,-0.225547,-1.733653,-1.193618,...,0.207195,-0.150200,-0.296300,-0.238197,-0.219014,-0.216249,-0.219189,-0.241195,-0.157817,-0.842991
3,-0.296300,-0.238197,-0.213431,-0.209643,-0.209075,-0.221597,-1.160313,-0.225547,-1.733653,-1.193618,...,0.231784,-0.122365,-0.296300,-0.238197,-0.213431,-0.209643,-0.209075,-0.221597,-0.139465,-0.842991
4,-0.309517,-0.227785,-0.205986,-0.198081,-0.187403,-0.187628,-1.160313,-0.225547,-1.733653,-1.193618,...,0.270389,-0.049602,-0.309517,-0.227785,-0.205986,-0.198081,-0.187403,-0.187628,-0.110624,-0.842991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11607182,-0.459320,-0.266829,-0.213431,-0.198081,-0.175844,-0.060899,1.733180,-0.225547,1.428969,0.836476,...,2.818989,3.565103,-0.459320,-0.266829,-0.213431,-0.198081,-0.175844,-0.060899,-0.050322,-0.842991
11607183,-0.459320,-0.277240,-0.219014,-0.209643,-0.187403,-0.060899,1.733180,-0.225547,1.428969,0.836477,...,2.961269,3.969460,-0.459320,-0.277240,-0.219014,-0.209643,-0.187403,-0.060899,-0.060809,-0.842991
11607184,-0.476943,-0.277240,-0.213431,-0.198081,-0.175844,-0.041302,1.733180,-0.225547,1.428969,0.836477,...,2.844775,3.569757,-0.476943,-0.277240,-0.213431,-0.198081,-0.175844,-0.041302,-0.039835,-0.842991
11607185,-0.428478,-0.256417,-0.205986,-0.191474,-0.171510,-0.046528,1.733180,-0.225547,1.428969,0.836474,...,2.499454,3.202729,-0.428478,-0.256417,-0.205986,-0.191474,-0.171510,-0.046528,-0.052944,-0.842991


In [6]:
'''Importing the PCA input table from parquet.'''
pca_df = pd.read_parquet(
    'input_pca_robust.parquet',
    engine='pyarrow',
)
# Bare last expression so the notebook renders the frame as a rich table.
pca_df

Unnamed: 0,PC1,PC2,PC3
0,-2.804235e+08,-3.041450e+07,-2.293928e+07
1,-2.804235e+08,-3.041450e+07,-2.293928e+07
2,-2.804235e+08,-3.041450e+07,-2.293928e+07
3,-2.804235e+08,-3.041450e+07,-2.293928e+07
4,-2.804235e+08,-3.041450e+07,-2.293928e+07
...,...,...,...
11607182,-2.804235e+08,-3.041450e+07,-2.293928e+07
11607183,-2.804235e+08,-3.041450e+07,-2.293928e+07
11607184,-2.804235e+08,-3.041450e+07,-2.293928e+07
11607185,-2.804235e+08,-3.041450e+07,-2.293928e+07


In [7]:
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import train_test_split

# Only needed when ground-truth outlier labels are available for evaluation:
# from sklearn.metrics import classification_report

# Feature table fed to the model. Swap in `scaled_df` to model the scaled
# features instead of the principal components.
df = pca_df

# Step 1: Hold out 20% of the rows as a test set (seeded so the split is repeatable).
X_train, X_test = train_test_split(df, random_state=42, test_size=0.2)
print("Split data")

# Step 2: Configure the robust covariance model.
# EllipticEnvelope relies on the Minimum Covariance Determinant (MCD) estimator.
# Knobs worth tuning:
#   contamination    - expected share of outliers (0.01 here, i.e. 1% of rows)
#   support_fraction - share of points used for the robust estimate
#                      (None keeps the library default)
#   assume_centered  - whether the data is treated as already centered
#                      (not set here, so the library default applies)
model = EllipticEnvelope(
    random_state=42,
    support_fraction=None,
    contamination=0.01,
)

# The estimator is fitted on the training rows only; X_test stays unseen.
model.fit(X_train)
print("EllipticEnvelope fitted successfully")

Split data


KeyboardInterrupt: 

In [None]:
# Step 3: Predict on the test set.
# The model's predict() method returns:
#   +1 for inliers and -1 for outliers.
y_pred_test = model.predict(X_test)
print("Generated predictions on testing data")

# Report both class counts. The outlier count was previously computed but
# never printed, unlike the full-data prediction cell further down.
n_inliers = (y_pred_test == 1).sum()
n_outliers = (y_pred_test == -1).sum()
print("Test set inliers:", n_inliers)
print("Test set outliers:", n_outliers)

# Optionally, if you have ground truth labels for outlier status (e.g., y_test_true),
# you could evaluate the performance using classification metrics:
# print(classification_report(y_test_true, y_pred_test))

# Step 4: Attach the test-set labels to raw_df.
# This assumes df (and hence X_test) shares the same index labels as raw_df,
# so the labels can be written back by index.
# The column is re-initialised to NaN on every run, which keeps the cell
# idempotent; rows outside the test split stay NaN.
raw_df['Outlier'] = np.nan
raw_df.loc[X_test.index, 'Outlier'] = y_pred_test
print("Outliers added to raw_df")


In [None]:
# ----------------------------
# Step 5: Graph the key features with the outlier labels.
# ----------------------------
fig, ax = plt.subplots(figsize=(10, 8))

# The 'Outlier' column holds NaN for rows outside the test split,
# +1 for test inliers and -1 for test outliers.
outlier_flag = raw_df['Outlier']

# One entry per scatter layer, drawn in this order:
# (row mask, scatter styling, progress message printed once the layer is drawn).
layers = [
    (outlier_flag.isna(),
     dict(c='grey', alpha=0.5, label='Not Tested'),
     "Done graphing untested"),
    (outlier_flag == 1,
     dict(c='blue', label='Test Inliers', alpha=0.7),
     "Done graphing inliers"),
    (outlier_flag == -1,
     dict(c='red', label='Test Outliers', alpha=0.7),
     "Done graphing outliers"),
]

for mask, style, message in layers:
    subset = raw_df.loc[mask, ['RH_50', 'RH_95']]
    ax.scatter(subset['RH_50'], subset['RH_95'], s=1, **style)
    print(message)

ax.set_xlabel('RH_50')
ax.set_ylabel('RH_95')
ax.set_title('Outlier Detection Mapped from PCA Space')
ax.legend()
plt.show()

In [None]:
'''Full predictions: label every row of the selected feature table.'''
y_pred = model.predict(df)
print("Generated predictions on ALL data")

# Tally the labels: +1 is counted as an inlier, -1 as an outlier.
n_inliers = np.sum(y_pred == 1)
n_outliers = np.sum(y_pred == -1)
print("Inliers:", n_inliers)
print("Outliers:", n_outliers)

In [None]:
'''Graphing: RH_50 vs. RH_95 for all rows, coloured by the full-data predictions.'''
# Choose which dataset to graph
X = raw_df

# Boolean outlier mask derived from the full-data predictions (-1 marks an
# outlier). This cell previously used `outliers` without it being defined
# anywhere in the notebook, so it raised NameError on a fresh kernel.
# Carrying df's index on the mask lets .loc align it with X by label; this
# assumes X and df share the same index labels.
outliers = pd.Series(y_pred == -1, index=df.index)

plt.figure(figsize=(8, 6))

plt.scatter(X.loc[~outliers, 'RH_50'],
            X.loc[~outliers, 'RH_95'],
            s=10, label='Inliers', alpha=0.5, c='blue')

plt.scatter(X.loc[outliers, 'RH_50'],
            X.loc[outliers, 'RH_95'],
            s=10, label='Outliers', alpha=0.7, c='red')

plt.xlabel('RH 50')
plt.ylabel('RH 95')
# The fitted model is an EllipticEnvelope, not an empirical covariance
# estimator, so the title names the model that produced these labels.
plt.title('Elliptic Envelope (Trained on PCA-3)\nRH 50 vs. RH 95')
plt.legend()
plt.show()