In [None]:
# Install required packages
!pip install xgboost shap --quiet

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from xgboost import XGBClassifier
import shap

# Load dataset
# Colab-only step: files.upload() opens an upload prompt. The read below then
# expects the uploaded file to be available as 'Fraud.csv' in the working
# directory, so the file must be uploaded under exactly that name.
from google.colab import files
uploaded = files.upload()  # Upload Fraud.csv file
df = pd.read_csv('Fraud.csv')

# Data inspection
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the summary.
df.info()
print(df.describe())
# Per-column count of missing values.
print(df.isnull().sum())

# Replace the transaction-type column with the integer codes produced by a
# freshly fitted LabelEncoder.
type_encoder = LabelEncoder()
df['type'] = type_encoder.fit_transform(df['type'])

# Remove the origin/destination identifier columns.
df = df.drop(columns=['nameOrig', 'nameDest'])

# Feature engineering: how far each side's balance change deviates from the
# transaction amount.
amount = df['amount']
df['errorOrig'] = df['oldbalanceOrg'] - df['newbalanceOrig'] - amount
df['errorDest'] = df['newbalanceDest'] - df['oldbalanceDest'] - amount

# Downsample the non-fraud class: keep every fraud row plus a reproducible
# random sample of at most 20000 non-fraud rows. The resulting class ratio
# therefore depends on how many fraud rows the data contains.
fraud_df = df[df['isFraud'] == 1]
nonfraud_pool = df[df['isFraud'] == 0]
# sample() without replacement raises ValueError when n exceeds the number of
# available rows, so cap n; on larger data this selects the same rows as a
# fixed n=20000 with the same random_state.
nonfraud_df = nonfraud_pool.sample(n=min(20000, len(nonfraud_pool)), random_state=42)
df_balanced = pd.concat([fraud_df, nonfraud_df])

# Separate the predictors from the label. 'isFlaggedFraud' is excluded from
# the predictors along with the label column itself.
target_column = 'isFraud'
X = df_balanced.drop(columns=[target_column, 'isFlaggedFraud'])
y = df_balanced[target_column]

# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Train model
# `use_label_encoder` is intentionally not passed: it was deprecated and later
# removed from XGBoost's scikit-learn wrapper, and this notebook installs the
# latest release, where passing it only produces an unused-parameter warning.
# scale_pos_weight=10 up-weights the positive (isFraud == 1) class in training.
model = XGBClassifier(eval_metric='logloss', scale_pos_weight=10)
model.fit(X_train, y_train)

# Predict and evaluate
# Hard class labels feed the classification report (and the confusion matrix
# computed from y_pred further down).
y_pred = model.predict(X_test)
# ROC AUC is a ranking metric and must be computed from continuous scores;
# feeding it 0/1 predictions collapses the curve to a single operating point
# and misreports the AUC. Column 1 is the predicted probability of class 1.
y_score = model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_score))

# Confusion matrix of actual vs. predicted labels, drawn as an annotated
# heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')

# Label the current axes (the same target the pyplot label helpers act on).
cm_ax = plt.gca()
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()

# ROC curve built from the predicted probability of class 1 on the test set.
fraud_scores = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, fraud_scores)

roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, label='XGBoost')
# Dashed diagonal from (0, 0) to (1, 1) as the chance-level reference line.
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
plt.show()

# Explainability: build a SHAP explainer for the trained model, compute SHAP
# values for the held-out rows, and draw the summary plot against the same
# feature matrix.
explainer = shap.Explainer(model=model)
shap_values = explainer(X_test)
shap.summary_plot(shap_values, features=X_test)
