# Digital Twin KPI Sanity Check

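**Purpose**: Validate the digital twin's KPI predictions against ground-truth flight data. This notebook currently checks H₂ fuel consumption against the accuracy requirement of MAPE < 5%.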

**Author**: Analytics Team  
**Last Updated**: 2025-01-XX  
**Version**: 1.0

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots created below.
sns.set_style('whitegrid')

## Load Data

In [None]:
# Load predicted KPIs from the digital twin and actual KPIs from flight data.
predicted = pd.read_csv('../../../10-METRICS/kpi_predictions.csv')
actual = pd.read_csv('../../../10-METRICS/kpi_actuals.csv')

# A prediction is paired with its actual on (timestamp, flight_id).
key_cols = ['timestamp', 'flight_id']

# Duplicate keys make the merge multiply rows, which would over-weight those
# flights in every downstream metric. Warn instead of failing so the notebook
# still runs on such data.
for source_name, source_frame in (('predictions', predicted), ('actuals', actual)):
    n_duplicates = int(source_frame.duplicated(subset=key_cols).sum())
    if n_duplicates:
        print(f"WARNING: {n_duplicates} duplicate (timestamp, flight_id) rows in {source_name}")

# Merge on timestamp and flight ID; overlapping KPI columns get _pred / _actual suffixes.
df = pd.merge(predicted, actual, on=key_cols, how='inner', suffixes=('_pred', '_actual'))

# The inner join drops rows without a counterpart. Report how many, so a
# "PASS" computed on a small matched subset is not mistaken for full coverage.
pred_keys = pd.MultiIndex.from_frame(predicted[key_cols])
actual_keys = pd.MultiIndex.from_frame(actual[key_cols])
n_pred_unmatched = int((~pred_keys.isin(actual_keys)).sum())
n_actual_unmatched = int((~actual_keys.isin(pred_keys)).sum())
if n_pred_unmatched or n_actual_unmatched:
    print(
        f"WARNING: dropped {n_pred_unmatched} prediction rows and "
        f"{n_actual_unmatched} actual rows with no matching (timestamp, flight_id)"
    )

# Fail here with a clear message rather than later inside the metric functions.
if df.empty:
    raise ValueError(
        "No rows matched between predictions and actuals on (timestamp, flight_id); "
        "check that both files use the same key formats"
    )

print(f"Loaded {len(df)} data points")
df.head()

## Fuel Consumption Analysis

In [None]:
# Calculate accuracy metrics for fuel consumption (predicted vs. actual).
fuel_pred = df['h2_fuel_kg_pred']
fuel_actual = df['h2_fuel_kg_actual']

rmse = np.sqrt(mean_squared_error(fuel_actual, fuel_pred))
mae = mean_absolute_error(fuel_actual, fuel_pred)
r2 = r2_score(fuel_actual, fuel_pred)

# MAPE divides by the actual value, so it is undefined where actual == 0.
# A single zero would otherwise turn the whole metric into inf. Exclude those
# rows from MAPE only (RMSE/MAE/R² still use every row) and report the count.
nonzero_actual = fuel_actual != 0
n_zero_actual = int((~nonzero_actual).sum())
pct_error = (fuel_actual[nonzero_actual] - fuel_pred[nonzero_actual]) / fuel_actual[nonzero_actual]
mape = np.mean(np.abs(pct_error)) * 100

print("Fuel Consumption Prediction Metrics:")
print(f"  RMSE: {rmse:.2f} kg")
print(f"  MAE: {mae:.2f} kg")
print(f"  R²: {r2:.4f}")
print(f"  MAPE: {mape:.2f}%")
if n_zero_actual:
    print(f"  (MAPE excludes {n_zero_actual} rows with zero actual fuel)")

# Requirement: >95% accuracy (MAPE <5%)
if np.isnan(mape):
    # No row had a usable (non-zero) actual, so the requirement cannot be checked.
    print("\n✗ FAILED: Fuel prediction MAPE could not be computed (no non-zero actual values)")
elif mape < 5.0:
    print("\n✓ PASSED: Fuel prediction meets accuracy requirement")
else:
    print(f"\n✗ FAILED: Fuel prediction MAPE {mape:.2f}% exceeds 5% threshold")

In [None]:
# Predicted-vs-actual scatter with a y = x reference line.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(fuel_actual, fuel_pred, alpha=0.5, s=10)

# The diagonal spans the observed range of actual values; points on it are exact predictions.
diagonal_ends = [fuel_actual.min(), fuel_actual.max()]
ax.plot(diagonal_ends, diagonal_ends, 'r--', lw=2, label='Perfect prediction')

ax.set(
    xlabel='Actual H₂ Fuel Consumption (kg)',
    ylabel='Predicted H₂ Fuel Consumption (kg)',
    title='Digital Twin Fuel Prediction Accuracy',
)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Residual Analysis

In [None]:
# Residual = predicted minus actual, so positive values are over-predictions.
residuals = fuel_pred - fuel_actual

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, scatter_ax = axes

# Left panel: distribution of residuals, with a marker at zero error.
hist_ax.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(0, color='r', linestyle='--', lw=2)
hist_ax.set(
    xlabel='Prediction Error (kg)',
    ylabel='Frequency',
    title='Residual Distribution',
)
hist_ax.grid(True, alpha=0.3)

# Right panel: residual against predicted value, with a zero-error baseline.
scatter_ax.scatter(fuel_pred, residuals, alpha=0.5, s=10)
scatter_ax.axhline(0, color='r', linestyle='--', lw=2)
scatter_ax.set(
    xlabel='Predicted H₂ Fuel (kg)',
    ylabel='Residual (kg)',
    title='Residual vs Predicted Value',
)
scatter_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Time Series Analysis

In [None]:
# Analyze prediction drift over time: daily MAPE against the 5% requirement.
df['date'] = pd.to_datetime(df['timestamp']).dt.date

# Per-row absolute percentage error, computed once for the whole frame.
# Rows with zero actual fuel have no defined percentage error; they are set
# to NaN so the daily mean skips them instead of becoming inf.
actual_fuel = df['h2_fuel_kg_actual']
abs_pct_error = ((actual_fuel - df['h2_fuel_kg_pred']) / actual_fuel).abs() * 100
abs_pct_error = abs_pct_error.where(actual_fuel != 0)

# Named aggregation replaces groupby.apply with a Python lambda: it is
# vectorised and avoids the pandas warning about applying over the grouping
# column. 'count' uses size so it still counts every row in the day.
daily_metrics = abs_pct_error.groupby(df['date']).agg(mape='mean', count='size')

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(daily_metrics.index, daily_metrics['mape'], marker='o', linewidth=2)
ax.axhline(5.0, color='r', linestyle='--', lw=2, label='Requirement threshold (5%)')
ax.set_xlabel('Date')
ax.set_ylabel('MAPE (%)')
ax.set_title('Daily Prediction Error Trend')
ax.legend()
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Summary Report

In [None]:
# Decide the overall verdict from the fuel MAPE requirement (<5%).
if mape < 5.0:
    verdict = 'PASS'
else:
    verdict = 'FAIL'

# One-row summary; key order determines the column order of the report.
summary = dict([
    ('Total Samples', len(df)),
    ('RMSE (kg)', rmse),
    ('MAE (kg)', mae),
    ('R²', r2),
    ('MAPE (%)', mape),
    ('Requirement', '<5% MAPE'),
    ('Status', verdict),
])
summary_df = pd.DataFrame([summary])

# Print the heading and the table as plain text.
print("\n=== KPI Sanity Check Summary ===", summary_df.to_string(index=False), sep="\n")

# Persist the summary next to the notebook (current working directory).
summary_df.to_csv('kpi_sanity_summary.csv', index=False)
print("\nSummary saved to kpi_sanity_summary.csv")