# XAI: quantitative SHAP analysis and Gemini qualitative explanation

This notebook: loads the trained stacking model artifacts, prepares an input sample, computes prediction and SHAP-based quantitative analysis, then sends a structured prompt to a text-generation API (Gemini) to produce a technical, qualitative explanation.

Requirements: install `pandas`, `numpy`, `scikit-learn`, `shap`, `requests`, and optionally `python-dotenv`. Ensure model artifacts exist in `models/` and set `GEMINI_API_KEY` (or use the `scripts/set_gemini_key.ps1` script).

In [2]:
# Standard imports
import os
import pickle
import json
import pandas as pd
import numpy as np
print('imports ok')


imports ok


In [3]:
# Load .env if available and try to read user env (Windows) so setx changes are visible
try:
    from dotenv import load_dotenv
    load_dotenv()
    print('.env loaded (if present)')
except Exception:
    print('python-dotenv not installed; skipping .env load')

# Try to ensure GEMINI_API_KEY present by reading HKCU\Environment (Windows) if needed
if not os.environ.get('GEMINI_API_KEY'):
    try:
        import winreg
        with winreg.OpenKey(winreg.HKEY_CURRENT_USER, 'Environment') as reg:
            try:
                val, _ = winreg.QueryValueEx(reg, 'GEMINI_API_KEY')
                if val:
                    os.environ['GEMINI_API_KEY'] = val
                    print('GEMINI_API_KEY loaded from HKCU\\Environment')
            except FileNotFoundError:
                pass
    except Exception as e:
        print('Could not read Windows registry for GEMINI_API_KEY:', e)

print('GEMINI_API_KEY present:', 'yes' if os.environ.get('GEMINI_API_KEY') else 'no')


.env loaded (if present)
GEMINI_API_KEY present: yes


In [4]:
import pandas as pd


sample_data = {
    "orbital_period_days": [5.72],
    "transit_epoch_bjd": [2457000.12345],
    "transit_duration_hours": [2.1],
    "transit_depth_ppm": [1300.0],
    "planet_radius_re": [1.12],
    "equilibrium_temp_k": [1100.0],
    "insolation_flux": [800.0],
    "impact_parameter": [0.45],
    "stellar_teff_k": [5700.0],
    "stellar_radius_rsun": [0.98],
    "stellar_radius_normal": [1.00],
    "stellar_mass_msun": [1.02],
    "mass_rad_ratio": [1.04],
    "stellar_logg": [4.38],
    "acc_grav_stellar_surface": [2.4e4],
    "ra": [299.123],
    "dec": [45.789],
    "radius_ratio_est": [0.011]
}

# Convert to DataFrame
sample_row = pd.DataFrame(sample_data)

In [5]:
# Determine models directory
base = os.path.abspath(os.getcwd())
models_dir = os.path.join(base, 'models')
# Check current directory's 'models' first
if not os.path.exists(models_dir):
    # Check parent directory's 'models' if not found
    potential_models_dir = os.path.join(base, '..', 'models')
    if os.path.exists(potential_models_dir):
        models_dir = potential_models_dir
    else:
        # If neither path exists, default back to the first one and rely on load to fail
        pass 
        
print('models_dir ->', models_dir)

models_dir -> d:\College Files\Hackathons\NasaSpaceApps\spaceapps_exoplanet_detection\model\models


In [7]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import joblib # <--- Import joblib
import os
# ...

# Load trained model
# Note: joblib.load can typically handle the path directly, 
# so you often don't need to manually open the file with 'with open...'
model = joblib.load('../models/stacking_classifier_FIXED.pkl')
scaler = joblib.load('../models/scaler_FIXED.pkl')
selector = joblib.load('../models/selector_FIXED.pkl')

print("Successfully loaded models with joblib!")

Successfully loaded models with joblib!


In [9]:
# Load selected features if available
for name in ['selected_features.pkl', 'selected_features']:
    path = os.path.join(models_dir, name)
    if os.path.exists(path):
        try:
            selected_features = joblib.load(path)
            break
        except Exception:
            # Silently ignore if file exists but load fails, as original code did
            pass 

In [10]:
# Assuming your scaling object is named 'scaler'
try:
    fitted_features = scaler.feature_names_in_
    print("Features used to fit the scaler (Name and Order):")
    for i, name in enumerate(fitted_features):
        print(f"  {i+1}. {name}")
except AttributeError:
    print("The scaler does not have the 'feature_names_in_' attribute (it may have been fitted on a NumPy array).")

Features used to fit the scaler (Name and Order):
  1. orbital_period_days
  2. transit_epoch_bjd
  3. transit_duration_hours
  4. transit_depth_ppm
  5. planet_radius_re
  6. equilibrium_temp_k
  7. insolation_flux
  8. impact_parameter
  9. stellar_teff_k
  10. stellar_radius_rsun
  11. stellar_radius_normal
  12. stellar_mass_msun
  13. mass_rad_ratio
  14. stellar_logg
  15. acc_grav_stellar_surface
  16. ra
  17. dec
  18. radius_ratio_est


In [11]:
import pandas as pd
import numpy as np
import joblib 
import os

# --- 1. The Definitive 18-Feature List ---
FINAL_SCALER_FEATURES = [
    "orbital_period_days", "transit_epoch_bjd", "transit_duration_hours",
    "transit_depth_ppm", "planet_radius_re", "equilibrium_temp_k", 
    "insolation_flux", "impact_parameter", "stellar_teff_k", 
    "stellar_radius_rsun", "stellar_radius_normal", "stellar_mass_msun", 
    "mass_rad_ratio", "stellar_logg", "acc_grav_stellar_surface", 
    "ra", "dec", "radius_ratio_est"
]

# --- 2. Load Trained Model Objects (Assuming success) ---
try:
    model = joblib.load('../models/stacking_classifier_FIXED.pkl')
    scaler = joblib.load('../models/scaler_FIXED.pkl')
    selector = joblib.load('../models/selector_FIXED.pkl')
except Exception as e:
    print("Error loading models. Please ensure '../models/' path is correct.")
    # Re-raise the error so you see the specific problem
    raise


# --- 3. The Final, Corrected Prediction Function (MODIFIED) ---

def predict_exoplanet_robust(data: pd.DataFrame):
    """
    Returns X_scaled, X_selected, and the prediction probability.
    """
    X_processed = data.copy()
    
    # Feature Engineering
    X_processed.loc[:, 'acc_grav_stellar_surface'] = 10**X_processed['stellar_logg']

    # Select the FINAL 18 Features in the CORRECT ORDER
    X_final = X_processed[FINAL_SCALER_FEATURES].fillna(0) 
    
    # Apply Pipeline Steps
    X_scaled = scaler.transform(X_final)
    X_selected = selector.transform(X_scaled)
    
    # Predict
    pred_proba = float(model.predict_proba(X_selected)[:, 1][0])
    
    # MODIFIED: Return the intermediate arrays and the prediction
    return X_scaled, X_selected, pred_proba


# --- 4. Prepare and Run Test Data ---
sample_data = {
    "orbital_period_days": [5.72], "transit_epoch_bjd": [2457000.12345], 
    "transit_duration_hours": [2.1], "transit_depth_ppm": [1300.0], 
    "planet_radius_re": [1.12], "equilibrium_temp_k": [1100.0], 
    "insolation_flux": [800.0], "impact_parameter": [0.45], 
    "stellar_teff_k": [5700.0], "stellar_radius_rsun": [0.98], 
    "stellar_radius_normal": [1.0], "stellar_mass_msun": [1.02], 
    "mass_rad_ratio": [1.04], "stellar_logg": [4.38], 
    "ra": [299.123], "dec": [45.789], "radius_ratio_est": [0.011]
}

test_sample = pd.DataFrame(sample_data)

# Call the modified prediction function
try:
    X_scaled, X_selected, pred_proba = predict_exoplanet_robust(test_sample)
    pred_label = int(pred_proba >= 0.5)

    print(f'\n--- Intermediate Data Shapes ---')
    print(f'X_scaled shape (18 features): {X_scaled.shape}')
    print(f'X_selected shape (Selected features): {X_selected.shape}')

    print(f'\n--- Prediction Results ---')
    print(f'prediction: label={pred_label}, prob={pred_proba:.4f}')

except Exception as e:
    print(f"\n--- EXECUTION FAILED ---")
    print(f"Error: {e}")


--- Intermediate Data Shapes ---
X_scaled shape (18 features): (1, 18)
X_selected shape (Selected features): (1, 12)

--- Prediction Results ---
prediction: label=0, prob=0.0315


In [14]:
# 1. ADD THESE TWO LINES HERE:
from dotenv import load_dotenv
load_dotenv() 

True

In [15]:
# Define the feature list needed for mapping
FINAL_SCALER_FEATURES = [
    "orbital_period_days", "transit_epoch_bjd", "transit_duration_hours",
    "transit_depth_ppm", "planet_radius_re", "equilibrium_temp_k", 
    "insolation_flux", "impact_parameter", "stellar_teff_k", 
    "stellar_radius_rsun", "stellar_radius_normal", "stellar_mass_msun", 
    "mass_rad_ratio", "stellar_logg", "acc_grav_stellar_surface", 
    "ra", "dec", "radius_ratio_est"
]
selected_features = FINAL_SCALER_FEATURES 

# Compute SHAP explanations (FIXED)
import shap
import pandas as pd
import numpy as np

try:
    # --- Original TreeExplainer attempt ---
    explainer = shap.TreeExplainer(model)
    shap_vals = explainer.shap_values(X_selected)
    if isinstance(shap_vals, list):
        shap_vals = shap_vals[1]
    
    # FIX APPLIED in previous step: Flatten the array
    shap_for_sample = np.array(shap_vals[0]).flatten() 
    
except Exception as e:
    # --- Robust KernelExplainer Fallback ---
    print('TreeExplainer failed, falling back to KernelExplainer:', e)
    
    # 1. Prepare Background Dataset from Training Data
    print('Preparing robust background dataset from training data...')
    
    # CRITICAL: Ensure 'unified_exoplanets_final_imputed.csv' is in the correct directory.
    # Note: If running locally, you must ensure the path is correct for your system.
    # The path was changed to a simple filename in the previous step, use the one that works for you.
    try:
        df_train = pd.read_csv('unified_exoplanets_final_imputed.csv')
    except FileNotFoundError:
        # Fallback to the relative path if the simple filename doesn't work locally
        df_train = pd.read_csv('../data/unified_exoplanets_final_imputed.csv')


    df_train.loc[:, 'acc_grav_stellar_surface'] = 10**df_train['stellar_logg']
    X_train_final = df_train[FINAL_SCALER_FEATURES].fillna(0)
    
    # 2. Apply Scaling and Selection to Training Data
    X_train_scaled = scaler.transform(X_train_final)
    X_train_selected = selector.transform(X_train_scaled)
    
    background = shap.sample(X_train_selected, min(100, X_train_selected.shape[0]))
    
    # 3. Run KernelExplainer
    explainer = shap.KernelExplainer(model.predict_proba, background)
    shap_vals = explainer.shap_values(X_selected, nsamples=100)
    
    if isinstance(shap_vals, list):
        shap_vals = shap_vals[1]
    
    # FIX APPLIED in previous step: Flatten the array
    shap_for_sample = np.array(shap_vals[0]).flatten() 

# Map back to feature names
try:
    support = selector.get_support(indices=True)
    feature_names_selected = [selected_features[i] for i in support]
except Exception:
    # If selector fails, we rely on the length of the SHAP output, which is the source of the error.
    # We must assume the selector worked and report a warning if this part is reached.
    print("WARNING: Feature selection failed. Using an estimated length for slicing.")
    feature_names_selected = selected_features[:len(shap_for_sample) // 2] 


# --- FINAL FIX FOR LENGTH MISMATCH ---
n_expected_features = len(feature_names_selected)
if len(shap_for_sample) != n_expected_features:
    print(f"WARNING: SHAP array length ({len(shap_for_sample)}) does not match feature count ({n_expected_features}). Slicing to the correct length.")
    # Slice the SHAP array to the correct length (assuming the correct values are in the first half)
    shap_for_sample = shap_for_sample[:n_expected_features]


shap_series = pd.Series(shap_for_sample, index=feature_names_selected).sort_values(ascending=False)
print('SHAP computed; top contributors:')
print(shap_series.head(10))

  from .autonotebook import tqdm as notebook_tqdm


TreeExplainer failed, falling back to KernelExplainer: Model type not yet supported by TreeExplainer: <class 'sklearn.ensemble._stacking.StackingClassifier'>
Preparing robust background dataset from training data...


100%|██████████| 1/1 [00:06<00:00,  6.13s/it]

SHAP computed; top contributors:
transit_epoch_bjd           0.100368
stellar_logg                0.021352
radius_ratio_est            0.006399
acc_grav_stellar_surface    0.002989
stellar_teff_k              0.000000
equilibrium_temp_k          0.000000
transit_depth_ppm           0.000000
impact_parameter            0.000000
ra                         -0.002989
dec                        -0.006399
dtype: float64





In [17]:
%pip install google-genai

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
# Quantitative summary and Gemini qualitative explanation
import requests
import json
import os
import pandas as pd
import numpy as np

# NOTE: Ensure shap_series, pred_label, pred_proba, and test_sample (or sample_row) are available from previous steps.

TOP_K = 8
df_shap = pd.DataFrame({'feature': shap_series.index, 'shap_value': shap_series.values})
df_shap['abs_shap'] = df_shap['shap_value'].abs()
df_shap = df_shap.sort_values('abs_shap', ascending=False).reset_index(drop=True)
total_abs = df_shap['abs_shap'].sum() if df_shap['abs_shap'].sum() != 0 else 1.0
df_shap['pct_contrib'] = df_shap['abs_shap'] / total_abs * 100.0

def safe_value(f):
    try:
        # Assuming your prediction sample is named 'test_sample' from the earlier code
        return float(test_sample[f].values[0])
    except Exception:
        return None

df_shap['value'] = df_shap['feature'].apply(safe_value)

# Top contributors
print('Top quantitative contributors:')
print(df_shap.head(TOP_K)[['feature','value','shap_value','abs_shap','pct_contrib']].to_string(index=False))

summary_stats = {
    'prediction': {'label': int(pred_label), 'probability': float(pred_proba)},
    'num_reported_features': int(min(TOP_K, len(df_shap))),
    'total_abs_shap': float(total_abs),
    'positive_shap_sum': float(df_shap[df_shap['shap_value']>0]['shap_value'].sum()),
    'negative_shap_sum': float(df_shap[df_shap['shap_value']<0]['shap_value'].sum())
}

structured = {
    'dataset': 'unified_exoplanets_final_imputed.csv',
    'model': 'stacking ensemble (RF+GB+SVM+LR) with feature selection + scaling',
    'prediction': summary_stats['prediction'],
    'quantitative_shap_top': df_shap.head(TOP_K)[['feature','value','shap_value','abs_shap','pct_contrib']].to_dict(orient='records'),
    'summary_stats': summary_stats
}

human_instructions = (
    'You are a senior ML engineer. Given the quantitative SHAP analysis (JSON) and the sample feature values, '
    'produce a concise technical explanation (5-8 bullet points) describing why the model produced the prediction, '
    'include likely causes, model confidence caveats, and 3 concrete suggestions to validate or improve model reliability. '
    'Reference the top contributing features and their directional effects. Keep the explanation technical and targeted to a data-science audience.'
)

# --- Replacement for the original 'prompt =' line ---
data_preamble = "Analyze the following machine learning prediction data and SHAP values and return the explanation for the prediction:"
data_json = json.dumps(structured, indent=2)

prompt = (
    f"{data_preamble}\n\n{data_json}\n\n"
    f"{human_instructions}"
)
# --- End of Replacement ---

print('prompt preview:\n', prompt[:800]) # Keep your preview line

Top quantitative contributors:
                 feature        value  shap_value  abs_shap  pct_contrib
  transit_duration_hours 2.100000e+00   -0.100368  0.100368    38.276851
       transit_epoch_bjd 2.457000e+06    0.100368  0.100368    38.276851
            stellar_logg 4.380000e+00    0.021352  0.021352     8.142995
       stellar_mass_msun 1.020000e+00   -0.021352  0.021352     8.142995
                     dec 4.578900e+01   -0.006399  0.006399     2.440400
        radius_ratio_est 1.100000e-02    0.006399  0.006399     2.440400
                      ra 2.991230e+02   -0.002989  0.002989     1.139754
acc_grav_stellar_surface          NaN    0.002989  0.002989     1.139754
prompt preview:
 Analyze the following machine learning prediction data and SHAP values and return the explanation for the prediction:

{
  "dataset": "unified_exoplanets_final_imputed.csv",
  "model": "stacking ensemble (RF+GB+SVM+LR) with feature selection + scaling",
  "prediction": {
    "label": 0,
    "pr

In [21]:
# --- ADD THESE IMPORTS AT THE VERY TOP OF YOUR SCRIPT ---
from google import genai
from google.genai.errors import APIError
# --------------------------------------------------------

# ... (rest of your code, calculating structured and prompt) ...

# --------------------------------------------------------
# CORRECTED API CALL BLOCK
# --------------------------------------------------------

# Check if API Key is available
api_key = os.environ.get('GEMINI_API_KEY')

if not api_key:
    print('GEMINI_API_KEY not configured; saving prompt for manual submission')
    os.makedirs('models', exist_ok=True)
    with open('output/xai_prompt.json','w',encoding='utf-8') as pf:
        json.dump({'prompt': prompt, 'structured': structured}, pf, indent=2)
    print('Saved output/xai_prompt.json')

else:
    try:
        # 1. Initialize the client (it automatically detects and uses the API_KEY)
        client = genai.Client() 
        model_name = 'gemini-2.0-flash' # Define model name here
        
        print(f"[*] Sending data to {model_name} for analysis (using SDK)...")

        # 2. Use client.models.generate_content and pass the model_name directly
        response = client.models.generate_content(
            model=model_name, # <-- CORRECTED SYNTAX: Pass model name here
            contents=prompt,
            config={'max_output_tokens': 768, 'temperature': 0.15}
        )
        
        # Access the text directly
        explanation_text = getattr(response, "text", None)

        if not explanation_text:
            print("Error: Gemini did not return any text in the response. Check safety settings.")
            explanation_text = "ERROR: No text received from API."
        
        print('\n--- Gemini Explanation Successfully Generated (via SDK) ---')
        print(explanation_text)
        
        os.makedirs('output', exist_ok=True) 
        with open('output/xai_explanation.txt','w',encoding='utf-8') as of:
            of.write(explanation_text)
        print('\nSaved final explanation to output/xai_explanation.txt')

        os.makedirs('output', exist_ok=True) 
        with open('output/xai_prompt.json','w',encoding='utf-8') as pf:
            json.dump({'prompt': prompt, 'structured': structured}, pf, indent=2)
        print('Saved output/xai_prompt.json for manual submission')

    except APIError as e:
        print(f'\nAPI call failed (SDK Error): {e}')
        print('The API key is likely invalid or restricted. You must fix the 401 error.')
        
        # Fallback to saving prompt for manual submission
        os.makedirs('output', exist_ok=True) 
        with open('output/xai_prompt.json','w',encoding='utf-8') as pf:
            json.dump({'prompt': prompt, 'structured': structured}, pf, indent=2)
        print('Saved output/xai_prompt.json for manual submission')

    except Exception as e:
        print(f'\nAn unexpected error occurred: {e}')

[*] Sending data to gemini-2.0-flash for analysis (using SDK)...

--- Gemini Explanation Successfully Generated (via SDK) ---
Here's a technical explanation of the model's prediction based on the provided SHAP analysis:

*   **Prediction and Confidence:** The stacking ensemble model predicts a label of 0 (likely indicating "not an exoplanet") with a low probability of 0.031. This suggests low confidence in the negative classification.

*   **Dominant Feature Influence:** The top two features, `transit_duration_hours` and `transit_epoch_bjd`, contribute most significantly (38.3% each) to the prediction. `transit_duration_hours` with a value of 2.1 hours pushes the prediction towards label 0 (negative SHAP value), while `transit_epoch_bjd` with a value of 2457000.12345 pushes the prediction towards label 1 (positive SHAP value). The model is highly sensitive to these features.

*   **Secondary Feature Effects:** `stellar_logg` (4.38) increases the probability of label 1, while `stellar_m