# 📊 Data Exploration & Cleaning for SWARM-A Satellite

**Author**: Naziha Aslam  
**Date**: July 2025  
**Objective**: Process raw GNSS measurements for SWARM-A from **15–31 May 2024**

## 🚀 Pipeline Overview:
1. **Load & Pivot** raw data from parquet format
2. **Convert Units** (km→m, dm/s→m/s)
3. **3-σ Outlier Filter** on orbital radius (drops epochs whose radius lies more than 3σ from the mean)
4. **Cache Cleaned Data** for downstream processing
5. **Save Column Configuration** for reproducibility

---

In [None]:
# 🔧 Environment Setup
import json, hashlib, random, os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import logging
import sys
import seaborn as sns

# Presentation mode: hide purely cosmetic warnings (missing glyphs, font
# fallbacks) so they do not clutter the notebook output.
warnings.filterwarnings("ignore", message=".*Glyph.*missing.*", category=UserWarning)
warnings.filterwarnings("ignore", message=".*font.*")

# Plot look-and-feel: whitegrid style, husl palette, default figure size/font.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': [12, 6], 'font.size': 11})

# Seed both the NumPy and stdlib generators for reproducible runs.
np.random.seed(42)
random.seed(42)

# Confirm the setup and record library versions in the output.
print(
    "🎯 Environment configured successfully!",
    f"📦 NumPy version: {np.__version__}",
    f"📦 Pandas version: {pd.__version__}",
    sep="\n",
)

In [None]:
# 🛰️ Notebook 01 (data exploration): data paths, figure directory and
# satellite_aukf wiring.
import os
import sys
from pathlib import Path
import matplotlib.pyplot as plt

# Data directory (created on demand) plus the raw and cleaned parquet paths.
DATA_DIR = Path("../data")
DATA_DIR.mkdir(parents=True, exist_ok=True)
RAW_GPS_PATH = DATA_DIR / "GPS_measurements.parquet"
CLEAN_GPS_PATH = DATA_DIR / "GPS_clean.parquet"

print(
    f"📂 RAW DATA   : {RAW_GPS_PATH}",
    f"📂 CLEAN DATA : {CLEAN_GPS_PATH}",
    sep="\n",
)

# Imported before the config stand-in below is registered, exactly as in the
# original cell ordering.
import satellite_aukf.utils as utils


# Stand-in object registered under the name `satellite_aukf.config`; it only
# carries the figure directory used by this notebook.
class Config:
    FIGURES_DIR = Path("../figures/01_Data_Exploration")


# One directory object shared by the notebook, the stand-in config and utils.
FIGURES_DIR = Config.FIGURES_DIR
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): presumably utils.save_figure reads FIGURES_DIR from one of
# these two places — confirm against satellite_aukf.utils.
sys.modules['satellite_aukf.config'] = Config
utils.FIGURES_DIR = FIGURES_DIR

print(
    "✅ Utils save_figure configured",
    f"📁 Figures will be saved to: {Config.FIGURES_DIR}",
    "✅ 01 Notebook setup complete",
    sep="\n",
)

# Column-configuration output path; overridable via SAT_AUKF_COLUMN_JSON.
COLUMN_CONFIG_PATH = Path(os.getenv("SAT_AUKF_COLUMN_JSON", "meas_cols.json"))

In [None]:
# 📥 STEP 1: Load & Pivot Raw Data
# Reads the raw long-format parquet file into `df_long` and prints a short
# overview. A missing file is reported with a hint and the error is re-raised.
print("🔄 Loading raw GNSS measurements...")

try:
    df_long = pd.read_parquet(RAW_GPS_PATH)
except FileNotFoundError:
    print(f"❌ Could not find raw data file: {RAW_GPS_PATH}")
    print("💡 Please ensure the data file exists in the expected location")
    raise
else:
    raw_count = len(df_long)
    print(f"✅ Loaded {raw_count:,} raw measurements")

    # Show the raw structure: first rows, shape and covered time span.
    print("\n📋 Raw Data Sample:")
    print(df_long.head(3))
    print(f"\n📊 Raw Data Shape: {df_long.shape}")
    raw_times = df_long['datetime']
    print(f"🗓️ Time Range: {raw_times.min()} → {raw_times.max()}")

In [None]:
# 🔄 STEP 2: Reshape from Long to Wide Format
# One row per timestamp, one column per (quantity, ECEF axis) pair.
print("🔄 Reshaping data from long to wide format...")

# Pivot "position"/"velocity" values by the "ECEF" label, then move the
# datetime index back into an ordinary column.
pivoted = pd.pivot_table(
    df_long,
    values=["position", "velocity"],
    index="datetime",
    columns="ECEF",
)
df_wide = pivoted.reset_index()

# Collapse the two-level column labels: ("position", "x") -> "position_x";
# labels with an empty second level (the datetime column) keep the first level.
flat_names = []
for label in df_wide.columns:
    if isinstance(label, tuple) and label[1] != '':
        flat_names.append("_".join(label).strip())
    else:
        flat_names.append(str(label[0]))
df_wide.columns = flat_names

# Working copy for the following steps (the column names are already final).
df = df_wide.copy()

n_rows, n_cols = len(df), len(df.columns)
print(f"✅ Reshaped to {n_rows:,} rows × {n_cols} columns")
print("\n📋 Wide Format Columns:")
for col in df.columns:
    print(f"  • {col}")

# Preview of the reshaped frame.
print("\n📋 Reshaped Data Sample:")
print(df.head(3))

In [None]:
# ⚖️ STEP 3: Unit Conversion & Data Cleaning
# Converts positions (km → m) and velocities (dm/s → m/s) and computes the
# per-row orbital radius `r` (metres) used by the outlier filter.
print("⚖️ Converting units and preparing data...")

# Start from a fresh copy of the pivoted frame so this cell is idempotent:
# the conversions below are in-place, and re-running the cell on an already
# converted `df` would scale positions by 1000 and velocities by 1/10 again.
df = df_wide.copy()

# Identify position and velocity columns (sorted, so x/y/z order is stable).
pos_cols = sorted(col for col in df.columns if col.startswith("position_"))
vel_cols = sorted(col for col in df.columns if col.startswith("velocity_"))

print(f"📍 Position columns: {pos_cols}")
print(f"🚀 Velocity columns: {vel_cols}")

# Ensure datetime is properly formatted
df["datetime"] = pd.to_datetime(df["datetime"])

# Unit conversions (skipped when the corresponding columns are absent).
print("\n🔄 Converting units...")
if pos_cols:
    df[pos_cols] *= 1000.0    # km → m
    print("  ✅ Position: km → m")

if vel_cols:
    df[vel_cols] /= 10.0      # dm/s → m/s
    print("  ✅ Velocity: dm/s → m/s")

# Orbital radius = Euclidean norm of the first three position components.
if len(pos_cols) >= 3:
    r = np.linalg.norm(df[pos_cols[:3]].to_numpy(dtype=float), axis=1)

    print("\n📏 Orbital Statistics:")
    print(f"  • Mean radius: {r.mean()/1000:.1f} km")
    print(f"  • Std deviation: {r.std()/1000:.1f} km")
    print(f"  • Range: {r.min()/1000:.1f} - {r.max()/1000:.1f} km")
else:
    print("⚠️ Insufficient position columns for radius calculation")
    # Same type (ndarray) as the normal path, so later cells can treat `r`
    # uniformly; all-NaN signals "no radius available".
    r = np.full(len(df), np.nan)

In [None]:
# 🎯 STEP 4: Outlier Detection & Removal
# Flags rows whose orbital radius deviates more than 3σ from the mean radius
# and builds `df_clean` without them. Also defines `n_outliers` and
# `outlier_pct` for the later cells.
print("🎯 Performing 3-σ outlier detection...")

radius = np.asarray(r, dtype=float)
has_radius = ~np.isnan(radius)

if has_radius.any():
    # NaN-aware statistics: with plain mean()/std() a single NaN radius makes
    # both NaN, every comparison False, and the filter silently removes nothing.
    r_mean = np.nanmean(radius)
    r_std = np.nanstd(radius)
    threshold = 3 * r_std

    # Only rows with a valid radius can be outliers; rows without one are
    # kept unchanged, as before.
    is_outlier = np.zeros(len(radius), dtype=bool)
    is_outlier[has_radius] = np.abs(radius[has_radius] - r_mean) > threshold
    n_outliers = int(is_outlier.sum())
    outlier_pct = (n_outliers / len(df)) * 100

    print("📊 Outlier Detection Results:")
    print(f"  • Threshold: ±{threshold/1000:.1f} km from mean")
    print(f"  • Outliers found: {n_outliers:,} ({outlier_pct:.2f}%)")

    n_missing = int((~has_radius).sum())
    if n_missing:
        print(f"  ⚠️ Rows without a valid radius (kept, not tested): {n_missing:,}")

    # Remove outliers
    df_clean = df.loc[~is_outlier].reset_index(drop=True)

    if n_outliers > 0:
        print(f"  • Cleaned data: {len(df_clean):,} measurements")
        print(f"  • Data reduction: {(len(df) - len(df_clean))/len(df)*100:.2f}%")
    else:
        print("  ✅ No outliers detected - data is clean!")
else:
    # No usable radius at all (or an empty frame): pass the data through.
    print("⚠️ Cannot perform outlier detection - using original data")
    df_clean = df.copy()
    n_outliers = 0
    outlier_pct = 0.0

print(f"\n📊 Final Dataset: {len(df_clean):,} rows × {len(df_clean.columns)} columns")

In [None]:
# 📈 STEP 5: Data Visualization - SIMPLIFIED VERSION
# Draws three figures of the cleaned orbital radius (decimated time series,
# time-series + histogram panel, full scatter) and saves each one through
# utils.save_figure. Defines `r_clean` (m) and `r_km` (km).
print("📈 Creating orbital radius visualization...")

if len(pos_cols) < 3:
    print("⚠️ Cannot create orbital plot - insufficient position data")
else:
    # Radius of the cleaned rows, in metres and in kilometres.
    r_clean = np.linalg.norm(df_clean[pos_cols[:3]], axis=1)
    r_km = r_clean / 1000.0

    # Values shared by several plots below.
    radius_mean = r_km.mean()
    radius_median = np.median(r_km)
    sample_index = range(len(r_km))
    every_600th = slice(None, None, 600)
    blue, magenta, orange, red = '#2E86AB', '#A23B72', '#F18F01', '#C73E1D'

    # --- Figure 1: time series using every 600th sample -------------------
    print("📊 Creating orbital radius time series...")
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.figure(figsize=(12, 6))
    plt.plot(
        df_clean["datetime"][every_600th],
        r_km[every_600th],
        color=blue,
        alpha=0.8,
        lw=1.5,
        marker='.',
        markersize=3,
        label='Orbital Radius (10-min sampling)',
    )
    plt.title("🛰️ SWARM-A Orbital Radius (15–31 May 2024)", fontsize=14, fontweight='bold')
    plt.xlabel("Time", fontweight='bold')
    plt.ylabel("Radius [km]", fontweight='bold')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()

    utils.save_figure('SWARM_A_Orbital_Radius_TimeSeries.png')
    plt.show()

    # --- Figure 2: time series (left) and distribution (right) ------------
    print("📊 Creating detailed orbital analysis...")
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    fig.suptitle('🛰️ SWARM-A Orbital Radius Analysis', fontsize=16, fontweight='bold')

    # Left panel: radius against measurement index, with the mean marked.
    ax1.plot(sample_index, r_km, color=blue, alpha=0.7, linewidth=1.5,
             label='Orbital Radius')
    ax1.fill_between(sample_index, r_km, alpha=0.3, color=blue)
    ax1.axhline(y=radius_mean, color=magenta, linestyle='--', alpha=0.8,
                label=f'Mean: {radius_mean:.1f} km')
    ax1.set_title('📈 Time Series', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Measurement Index', fontweight='bold')
    ax1.set_ylabel('Orbital Radius (km)', fontweight='bold')
    ax1.grid(True, alpha=0.3)
    ax1.legend()

    # Right panel: density histogram with mean and median marked.
    ax2.hist(r_km, bins=50, density=True, alpha=0.7, color=orange, edgecolor='black')
    ax2.axvline(radius_mean, color=magenta, linestyle='--', linewidth=2,
                label=f'Mean: {radius_mean:.1f} km')
    ax2.axvline(radius_median, color=red, linestyle=':', linewidth=2,
                label=f'Median: {radius_median:.1f} km')
    ax2.set_title('📊 Distribution', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Orbital Radius (km)', fontweight='bold')
    ax2.set_ylabel('Probability Density', fontweight='bold')
    ax2.grid(True, alpha=0.3)
    ax2.legend()

    # Leave room for the suptitle after the tight layout.
    plt.tight_layout()
    plt.subplots_adjust(top=0.93)

    utils.save_figure('SWARM_A_Orbital_Radius_Analysis.png')
    plt.show()

    # --- Figure 3: scatter of every measurement, coloured by radius -------
    print("📊 Creating scatter plot...")
    plt.figure(figsize=(12, 6))
    scatter = plt.scatter(df_clean.index, r_km, c=r_km, cmap='viridis', s=2, alpha=0.4)
    plt.title("🛰️ SWARM-A Radius Scatter — All Measurements", fontsize=14, fontweight='bold')
    plt.xlabel("Measurement Index", fontweight='bold')
    plt.ylabel("Orbital Radius (km)", fontweight='bold')
    plt.grid(True, linestyle=":", linewidth=0.5, alpha=0.3)
    plt.colorbar(scatter, label='Orbital Radius (km)')
    plt.tight_layout()

    utils.save_figure('SWARM_A_Orbital_Radius_Scatter.png')
    plt.show()

    # --- Summary statistics ----------------------------------------------
    # 6371 is subtracted from the mean radius (km) to print an approximate
    # altitude.
    print("\n✅ Orbital Radius Visualization Complete!")
    print("📊 Statistics:")
    print(f"  • Mean radius: {radius_mean:.1f} ± {r_km.std():.1f} km")
    print(f"  • Altitude: ~{radius_mean - 6371:.0f} km (LEO)")
    print(f"  • Range: {r_km.min():.1f} - {r_km.max():.1f} km")
    print("📁 Saved 3 figures successfully!")

In [None]:
# 💾 STEP 6: Save Cleaned Data & Configuration
# Writes the cleaned frame to parquet, stores the column configuration as
# JSON and prints a SHA-1 digest of the frame's row hashes.
print("💾 Saving cleaned data and configuration...")

# Ensure output directory exists
CLEAN_GPS_PATH.parent.mkdir(parents=True, exist_ok=True)

# Save cleaned data
df_clean.to_parquet(CLEAN_GPS_PATH, index=False)
print(f"✅ Cleaned data saved → {CLEAN_GPS_PATH}")

# Quality label derived from the share of removed outliers:
# < 1 % → high, < 5 % → medium, otherwise low.
if outlier_pct < 1.0:
    data_quality = "high"
elif outlier_pct < 5.0:
    data_quality = "medium"
else:
    data_quality = "low"

# Save column configuration for downstream processing
column_config = {
    "position_columns": pos_cols,
    "velocity_columns": vel_cols,
    "all_measurement_columns": pos_cols + vel_cols,
    "total_measurements": len(df_clean),
    "outliers_removed": int(n_outliers),
    "data_quality": data_quality,
}

# The config path can be redirected via SAT_AUKF_COLUMN_JSON, so its directory
# may not exist yet; create it just like the parquet output directory.
COLUMN_CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
with COLUMN_CONFIG_PATH.open("w", encoding="utf-8") as f:
    json.dump(column_config, f, indent=2)
print(f"✅ Column configuration saved → {COLUMN_CONFIG_PATH}")

# Integrity fingerprint: SHA-1 over the per-row pandas hashes (index included).
data_hash = hashlib.sha1(
    pd.util.hash_pandas_object(df_clean, index=True).values
).hexdigest()

print("\n🔐 Data Integrity:")
print(f"  • Hash: {data_hash[:16]}...")
print(f"  • Quality: {column_config['data_quality'].upper()}")
print("  • Ready for AUKF processing: ✅")

In [None]:
# 📋 STEP 7: Final Summary Report
# Prints a recap of the dataset, the processing steps, the orbital statistics
# and the files written by this notebook.
print("\n" + "="*80)
print("🎯 DATA EXPLORATION & CLEANING SUMMARY REPORT")
print("="*80)

# Time coverage of the cleaned data, computed once and reused below.
t_start = df_clean['datetime'].min()
t_end = df_clean['datetime'].max()
span = t_end - t_start
span_hours = span.total_seconds() / 3600

# Avoid a ZeroDivisionError when all rows share a single timestamp.
if span_hours > 0:
    sampling_text = f"~{len(df_clean) / span_hours:.1f} measurements/hour"
else:
    sampling_text = "n/a (zero-length time span)"

print("\n📊 DATASET OVERVIEW:")
print("  • Mission: SWARM-A Satellite Tracking")
print(f"  • Data Period: {t_start.strftime('%Y-%m-%d %H:%M')} → {t_end.strftime('%Y-%m-%d %H:%M')}")
print(f"  • Duration: {span.days} days")
print(f"  • Total Measurements: {len(df_clean):,}")
print(f"  • Sampling Rate: {sampling_text}")

# NOTE(review): len(df) is the row count of the pivoted (wide) frame, not
# len(df_long); the "raw measurements" label refers to those wide rows.
print("\n🔧 DATA PROCESSING:")
print(f"  • Raw measurements loaded: {len(df):,}")
print(f"  • Outliers detected & removed: {n_outliers:,} ({outlier_pct:.2f}%)")
print(f"  • Final clean dataset: {len(df_clean):,}")
print(f"  • Data retention rate: {(len(df_clean)/len(df)*100):.1f}%")

print("\n📏 MEASUREMENT TYPES:")
print(f"  • Position measurements: {len(pos_cols)} components (ECEF)")
print(f"  • Velocity measurements: {len(vel_cols)} components (ECEF)")
print(f"  • Total state dimensions: {len(pos_cols) + len(vel_cols)}")

# r_km only exists when the visualization step had three position columns.
if len(pos_cols) >= 3:
    radius_std = r_km.std()
    if radius_std < 10:
        stability = 'Excellent'
    elif radius_std < 50:
        stability = 'Good'
    else:
        stability = 'Moderate'

    print("\n🛰️ ORBITAL CHARACTERISTICS:")
    print(f"  • Mean orbital radius: {r_km.mean():.1f} ± {radius_std:.1f} km")
    print(f"  • Estimated altitude: ~{r_km.mean() - 6371:.0f} km")
    print("  • Orbit type: Low Earth Orbit (LEO)")
    print(f"  • Orbital stability: {stability}")

# The figure name must match the one passed to utils.save_figure in STEP 5;
# the previous 'Orbit_Radius_Analysis.png' was never written.
print("\n📁 OUTPUT FILES:")
print(f"  • Clean data: {CLEAN_GPS_PATH}")
print(f"  • Configuration: {COLUMN_CONFIG_PATH}")
print(f"  • Visualization: {FIGURES_DIR / 'SWARM_A_Orbital_Radius_Analysis.png'}")

print("\n🎯 NEXT STEPS:")
print("  1. ✅ Data exploration and cleaning completed")
print("  2. 🔄 Ready for coordinate transformation (ECEF → ECI)")
print("  3. 🚀 Ready for Adaptive UKF implementation")
print("  4. 📈 Ready for performance analysis and visualization")

print("\n" + "="*80)
print("🎉 DATA EXPLORATION COMPLETE - READY FOR AUKF PROCESSING!")
print("="*80)