In [None]:
# shap_drift_analysis.py

import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
import joblib
import os

# -------------------------------
# STEP 1: Load and Prepare Data
# -------------------------------
# Each CSV holds one time window. Both must contain a 'target' column;
# drop() raises KeyError if it is absent.
df_t1 = pd.read_csv('data/data_time1.csv')
df_t2 = pd.read_csv('data/data_time2.csv')

X_t1 = df_t1.drop(columns=['target'])
y_t1 = df_t1['target']

X_t2 = df_t2.drop(columns=['target'])
y_t2 = df_t2['target']

# The comparison table built in STEP 4 labels BOTH windows' attributions
# with X_t1.columns, i.e. positionally. The second window therefore has to
# expose exactly the same features in exactly the same order; fail loudly on
# a schema mismatch rather than emit mislabeled drift values.
_missing_in_t2 = X_t1.columns.difference(X_t2.columns)
_unexpected_in_t2 = X_t2.columns.difference(X_t1.columns)
if len(_missing_in_t2) or len(_unexpected_in_t2):
    raise ValueError(
        "Feature mismatch between time windows: "
        f"missing in time2={list(_missing_in_t2)}, "
        f"unexpected in time2={list(_unexpected_in_t2)}"
    )
# Same feature set at this point, so this only fixes the column order.
X_t2 = X_t2[X_t1.columns]

# -------------------------------
# STEP 2: Train Model on First Time Window
# -------------------------------
# Only the first window is used for fitting; the resulting model is the one
# explained on both windows in STEP 3.
xgb_params = {
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
model = XGBClassifier(**xgb_params)
model.fit(X_t1, y_t1)

# Persist the fitted model, creating the output directory if needed.
os.makedirs('models', exist_ok=True)
joblib.dump(model, 'models/model_time1.pkl')

# -------------------------------
# STEP 3: Compute SHAP Values
# -------------------------------
# A single explainer wraps the model trained on the first window and is
# applied to the features of each window in turn (time 1, then time 2).
explainer = shap.TreeExplainer(model)
shap_values_t1, shap_values_t2 = (
    explainer.shap_values(features) for features in (X_t1, X_t2)
)

# -------------------------------
# STEP 4: Aggregate SHAP Values
# -------------------------------
def _mean_abs_shap(values, n_features):
    """Return the mean absolute SHAP value per feature as a 1-D array.

    Args:
        values: Output of ``explainer.shap_values``. Handled layouts:
            * 2-D array ``(n_samples, n_features)`` -- averaged over
              samples, exactly as ``np.abs(values).mean(axis=0)``.
            * list of such 2-D arrays (one per model output) -- stacked on
              a trailing axis, then treated like the 3-D case.
            * 3-D array, assumed to be ``(n_samples, n_features, n_outputs)``
              -- averaged over samples and outputs.
              NOTE(review): the list/3-D layouts follow the SHAP
              documentation for multi-output models; confirm against the
              installed SHAP version.
        n_features: Expected number of features, used to validate the result.

    Returns:
        ``numpy.ndarray`` of shape ``(n_features,)``.

    Raises:
        ValueError: If the input has an unsupported number of dimensions or
            the reduced result does not have one entry per feature.
    """
    if isinstance(values, list):
        stacked = np.stack([np.asarray(v) for v in values], axis=-1)
    else:
        stacked = np.asarray(values)

    magnitudes = np.abs(stacked)
    if magnitudes.ndim == 2:
        per_feature = magnitudes.mean(axis=0)
    elif magnitudes.ndim == 3:
        per_feature = magnitudes.mean(axis=(0, 2))
    else:
        raise ValueError(
            f"Unsupported SHAP values shape: {magnitudes.shape}"
        )

    # Guard against silently pairing attributions with the wrong features.
    if per_feature.shape != (n_features,):
        raise ValueError(
            f"Expected {n_features} per-feature values, "
            f"got shape {per_feature.shape}"
        )
    return per_feature


mean_shap_t1 = _mean_abs_shap(shap_values_t1, len(X_t1.columns))
mean_shap_t2 = _mean_abs_shap(shap_values_t2, len(X_t1.columns))

# One row per feature; 'drift' is the signed change in mean |SHAP| from the
# first window to the second (positive = feature gained importance).
df_compare = pd.DataFrame({
    'feature': X_t1.columns,
    'SHAP_t1': mean_shap_t1,
    'SHAP_t2': mean_shap_t2,
    'drift': mean_shap_t2 - mean_shap_t1
})

# Save to CSV
os.makedirs('results', exist_ok=True)
df_compare.to_csv('results/shap_drift_values.csv', index=False)

# -------------------------------
# STEP 5: Visualize Feature Drift
# -------------------------------
# Order features by the magnitude of their drift (largest first) and keep
# the first ten rows for plotting.
drift_rank = df_compare['drift'].abs().sort_values(ascending=False)
df_plot = df_compare.reindex(drift_rank.index).head(10)

# Grouped bars: one pair (time 1, time 2) of mean |SHAP| per feature.
ax = df_plot.plot.bar(x='feature', y=['SHAP_t1', 'SHAP_t2'])
ax.set_title("Top 10 Feature Drift (Mean |SHAP| Values)")
ax.set_ylabel("Mean |SHAP Value|")
plt.xticks(rotation=45)
plt.tight_layout()

# Write the figure into the 'results' directory (created in STEP 4), then
# display it.
plt.savefig('results/shap_drift_plot.png')
plt.show()