### SHAP Analysis Setup

In [1]:

import shap
import joblib
import matplotlib.pyplot as plt
from lime import lime_tabular

### Load and preprocess data

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

# Read the raw transaction records.
fraud_data = pd.read_csv('../data/fraud_data.csv')

# Turn both timestamp columns into integers: parse, cast to int64, then
# floor-divide by 10**9 (epoch seconds when the datetimes are nanosecond-based).
for time_col in ('signup_time', 'purchase_time'):
    parsed = pd.to_datetime(fraud_data[time_col])
    fraud_data[time_col] = parsed.astype(np.int64) // 10**9

# Replace the raw ip_address values with their pandas hash.
# (Done in place, so the column keeps its original position.)
fraud_data['ip_address'] = pd.util.hash_array(fraud_data['ip_address'].values)

# One-hot encode the categorical columns; the dummy columns are appended last.
categorical_features = ['source', 'browser', 'sex']
fraud_data = pd.get_dummies(fraud_data, columns=categorical_features)

# Target vector, then the feature matrix without the label and identifier columns.
y = fraud_data['class']
X = fraud_data.drop(columns=['class', 'user_id', 'device_id'])

# Stratified 80/20 split with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Persist the four splits as a single tuple.
joblib.dump((X_train, X_test, y_train, y_test), 'preprocessed_data.pkl')
print("Successfully saved preprocessed data to: preprocessed_data.pkl")

# Shape report as a quick sanity check.
print("\nData shapes:")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

Successfully saved preprocessed data to: preprocessed_data.pkl

Data shapes:
X_train: (120889, 15)
X_test: (30223, 15)
y_train: (120889,)
y_test: (30223,)


### SHAP Global Explanations

In [14]:
import shap
import matplotlib.pyplot as plt

# Use the loaded model instead of 'model'
# NOTE(review): `loaded_model` is not assigned in any cell visible in this
# notebook, so this cell fails with NameError on a fresh kernel (Restart & Run
# All). It presumably came from a cell that was deleted or run out of order —
# TODO: add an explicit joblib.load(...) of the intended model artifact.
# NOTE(review): X_test here is whatever the most recently executed preprocessing
# cell produced; its columns must match the features `loaded_model` was trained
# on — confirm.
explainer = shap.TreeExplainer(loaded_model)
shap_values = explainer.shap_values(X_test)

# Open a figure first; with show=False the summary plot is not displayed here,
# so the title/layout calls below can still be applied before saving.
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
plt.title("Global Feature Importance")
plt.tight_layout()
# Write the chart to disk, then close the figure so it is not rendered inline.
plt.savefig('shap_global.png', bbox_inches='tight')
plt.close()

LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


### Time-based features

In [15]:
import pandas as pd
import numpy as np
from datetime import datetime

# Start again from the raw CSV so the timestamp columns are still unparsed.
fraud_data = pd.read_csv('../data/fraud_data.csv')

# Parse both timestamp columns into pandas datetimes.
for time_col in ('signup_time', 'purchase_time'):
    fraud_data[time_col] = pd.to_datetime(fraud_data[time_col])

purchase_time = fraud_data['purchase_time']
signup_to_purchase = purchase_time - fraud_data['signup_time']

# 1. Hours elapsed between signup and purchase
fraud_data['time_since_signup'] = signup_to_purchase.dt.total_seconds() / 3600

# 2. Hour of the purchase (daily patterns)
fraud_data['purchase_hour'] = purchase_time.dt.hour

# 3. Day of the week of the purchase (weekly patterns)
fraud_data['purchase_day'] = purchase_time.dt.dayofweek

# 4. Weekend flag: dayofweek values 5 and 6
on_weekend = fraud_data['purchase_day'].isin([5, 6])
fraud_data['is_weekend'] = on_weekend.astype(int)

# 5. Night flag: hour 23, or any hour from 0 through 5
purchase_hour = fraud_data['purchase_hour']
late_evening = purchase_hour.isin(range(23, 24))
early_morning = purchase_hour.isin(range(0, 6))
fraud_data['is_night'] = (late_evening | early_morning).astype(int)

# 6. Bucket the signup-to-purchase delay (bin edges are in hours)
speed_bin_edges = [-np.inf, 1, 24, 168, np.inf]  # 1 hour, 24 hours, 1 week, more
speed_bin_labels = ['very_fast', 'fast', 'normal', 'slow']
fraud_data['purchase_speed'] = pd.cut(
    fraud_data['time_since_signup'],
    bins=speed_bin_edges,
    labels=speed_bin_labels
)

# 7. One indicator column per speed bucket, appended to the frame
purchase_speed_dummies = pd.get_dummies(fraud_data['purchase_speed'], prefix='purchase_speed')
fraud_data = pd.concat([fraud_data, purchase_speed_dummies], axis=1)

# Remaining preprocessing: one-hot encode the categorical columns ...
categorical_features = ['source', 'browser', 'sex']
fraud_data = pd.get_dummies(fraud_data, columns=categorical_features)

# ... and replace the raw ip_address values with their pandas hash.
fraud_data['ip_address'] = pd.util.hash_array(fraud_data['ip_address'].values)

# Feature matrix keeps the engineered columns but leaves out the label, the
# identifiers, the raw datetimes and the un-encoded purchase_speed category.
features_to_drop = ['class', 'user_id', 'device_id', 'signup_time', 'purchase_time', 'purchase_speed']
y = fraud_data['class']
X = fraud_data.drop(columns=features_to_drop)

# Stratified 80/20 split with a fixed seed.
from sklearn.model_selection import train_test_split
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Persist the enhanced splits as a single tuple.
joblib.dump((X_train, X_test, y_train, y_test), 'preprocessed_data_with_time_features.pkl')
print("Successfully saved enhanced preprocessed data")

# Human-readable summary of the engineered features.
print("\nNew Time-Based Features Added:")
for summary_line in (
    "1. time_since_signup: Hours between signup and purchase",
    "2. purchase_hour: Hour of the day (0-23)",
    "3. purchase_day: Day of week (0-6, 0=Monday)",
    "4. is_weekend: Binary indicator for weekend",
    "5. is_night: Binary indicator for night time purchases",
    "6. purchase_speed: Categorized speed of purchase after signup",
):
    print(summary_line)

print("\nData shapes:")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")

Successfully saved enhanced preprocessed data

New Time-Based Features Added:
1. time_since_signup: Hours between signup and purchase
2. purchase_hour: Hour of the day (0-23)
3. purchase_day: Day of week (0-6, 0=Monday)
4. is_weekend: Binary indicator for weekend
5. is_night: Binary indicator for night time purchases
6. purchase_speed: Categorized speed of purchase after signup

Data shapes:
X_train: (120889, 22)
X_test: (30223, 22)


### Retrain the LightGBM model with time-based features

In [16]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Reload the splits that include the time-based features.
X_train, X_test, y_train, y_test = joblib.load('preprocessed_data_with_time_features.pkl')

# Hyperparameters gathered in one place, then the classifier is built and fitted.
lgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'num_leaves': 32,
    'random_state': 42,
}
lgb_model = LGBMClassifier(**lgb_params)
lgb_model.fit(X_train, y_train)

# Persist the fitted model for the later cells that load it from disk.
joblib.dump(lgb_model, 'best_model_LightGBM_with_time_features.joblib')

# Hold-out evaluation: hard labels for the report, class-1 probabilities for AUC.
y_pred = lgb_model.predict(X_test)
y_pred_proba = lgb_model.predict_proba(X_test)[:, 1]

print("\nModel Performance with Time Features:")
print(classification_report(y_test, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

[LightGBM] [Info] Number of positive: 11321, number of negative: 109568
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 742
[LightGBM] [Info] Number of data points in the train set: 120889, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.093648 -> initscore=-2.269886
[LightGBM] [Info] Start training from score -2.269886

Model Performance with Time Features:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27393
           1       1.00      0.53      0.69      2830

    accuracy                           0.96     30223
   macro avg       0.98      0.76      0.83     30223
weighted avg       0.96      0.96      0.95     30223

AUC-ROC Score: 0.7694


### SHAP Local Explanation

In [26]:
import shap
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier

import numpy as np  # imported here so the cell does not rely on an earlier cell's import

# 1. Feature subset used for this explanation (the engineered time columns
#    present in X_train are deliberately left out).
current_features = [
    'purchase_value', 'age', 'ip_address',
    'source_Ads', 'source_Direct', 'source_SEO',
    'browser_Chrome', 'browser_FireFox', 'browser_IE',
    'browser_Opera', 'browser_Safari',
    'sex_F', 'sex_M'
]

# 2. Train a model restricted to that subset
new_model = LGBMClassifier(n_estimators=1000, random_state=42)
new_model.fit(X_train[current_features], y_train)

# 3. Take the first test row labelled as fraud; the double brackets keep it a
#    one-row DataFrame, which is what the explainer and predict_proba are given.
fraud_mask = y_test == 1
fraud_case = X_test[fraud_mask].iloc[[0]][current_features]

# 4. SHAP values for that single row
explainer = shap.TreeExplainer(new_model)
shap_values = explainer.shap_values(fraud_case)

# The container returned for a binary classifier is not the same in every SHAP
# release: the run recorded in this notebook got a list with one array per
# class, while other releases hand back a single ndarray. Indexing [1][0]
# unconditionally would raise IndexError (or pick the wrong row) on the ndarray
# form, so normalise to one 1-D vector of fraud-class contributions.
if isinstance(shap_values, list):
    # list form: element 1 is the fraud class, row 0 is our single case
    fraud_shap_values = np.asarray(shap_values[1])[0]
else:
    shap_array = np.asarray(shap_values)
    if shap_array.ndim == 3:
        # (rows, features, outputs): last output is the fraud class
        fraud_shap_values = shap_array[0, :, -1]
    else:
        # (rows, features): a single output, already for the positive class
        fraud_shap_values = shap_array[0]

# expected_value may likewise be one scalar or one entry per class.
expected_values = np.atleast_1d(explainer.expected_value)
fraud_base_value = float(expected_values[1] if expected_values.size > 1 else expected_values[0])

# 5. Waterfall plot for the fraud class
plt.figure(figsize=(12, 6))
shap.plots.waterfall(
    shap.Explanation(
        values=fraud_shap_values,
        base_values=fraud_base_value,
        # plain array rather than a label-indexed Series, so any positional
        # lookup done by the plotting code does not depend on the Series index
        data=fraud_case.iloc[0].to_numpy(),
        feature_names=current_features
    ),
    show=False
)
plt.title("Individual Fraud Case Explanation")
plt.savefig('shap_local.png', bbox_inches='tight')
plt.close()

# 6. Print the model's fraud probability for the explained row
print(f"Fraud probability: {new_model.predict_proba(fraud_case)[:, 1][0]:.4f}")


[LightGBM] [Info] Number of positive: 11321, number of negative: 109568
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004369 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 444
[LightGBM] [Info] Number of data points in the train set: 120889, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.093648 -> initscore=-2.269886
[LightGBM] [Info] Start training from score -2.269886


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


Fraud probability: 0.0392


### LIME Setup

In [31]:
import lime
from lime import lime_tabular

# Column groups handed to LIME: the three numeric columns first, followed by
# the one-hot indicator columns.
numerical_features = ['purchase_value', 'age', 'ip_address']
categorical_features = [
    'source_Ads', 'source_Direct', 'source_SEO',
    'browser_Chrome', 'browser_FireFox', 'browser_IE', 'browser_Opera', 'browser_Safari',
    'sex_F', 'sex_M'
]

# Full column order used for both the training matrix and the feature names.
lime_feature_names = numerical_features + categorical_features

# Positions of the indicator columns inside that order (they follow the
# numeric block, so they start at len(numerical_features)).
first_categorical = len(numerical_features)
categorical_positions = [
    first_categorical + offset for offset in range(len(categorical_features))
]

# Build the tabular explainer on the training rows restricted to those columns.
lime_explainer = lime_tabular.LimeTabularExplainer(
    training_data=X_train[lime_feature_names].values,
    feature_names=lime_feature_names,
    class_names=['Legit', 'Fraud'],
    categorical_features=categorical_positions,
    mode='classification'
)

print("LIME explainer setup completed successfully")
print("\nFeatures used:")
print("Numerical:", numerical_features)
print("Categorical:", categorical_features)

LIME explainer setup completed successfully

Features used:
Numerical: ['purchase_value', 'age', 'ip_address']
Categorical: ['source_Ads', 'source_Direct', 'source_SEO', 'browser_Chrome', 'browser_FireFox', 'browser_IE', 'browser_Opera', 'browser_Safari', 'sex_F', 'sex_M']


### LIME Explanation

In [None]:
import pandas as pd
import numpy as np
import joblib

# Load the LightGBM model that the retraining cell saved to disk
model = joblib.load('best_model_LightGBM_with_time_features.joblib')

# Print the feature names the model was trained on.
# LightGBM's scikit-learn wrapper exposes them as `feature_name_`; the
# previously used `feature_names_` does not exist on LGBMClassifier and
# raised AttributeError.
print("Model's expected features:")
print(model.feature_name_)

### Save Explainers

In [17]:
import joblib
import shap

# Artifacts written by the earlier time-feature and retraining cells.
enhanced_split = joblib.load('preprocessed_data_with_time_features.pkl')
X_train, X_test, y_train, y_test = enhanced_split

# Wrap the saved LightGBM model in a SHAP tree explainer.
model = joblib.load('best_model_LightGBM_with_time_features.joblib')
shap_explainer = shap.TreeExplainer(model)

# Only the SHAP explainer is persisted here.
joblib.dump(shap_explainer, 'shap_explainer.pkl')

print("Task 3 Complete: SHAP explainer saved successfully")

Task 3 Complete: SHAP explainer saved successfully
