In [None]:
import pandas as pd
import numpy as np
import joblib
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, precision_recall_curve

# --- Data loading ---
df = pd.read_csv("your_data.csv")

# --- Quick look: first rows, summary statistics, table size, NaN total ---
display(df.head())
display(df.describe())
print("Data Shape:", df.shape)
print("Missing Values:", df.isnull().sum().sum())

# --- The label lives in 'Class'; every other column is a feature ---
y = df['Class']
X = df.drop(columns=['Class'])  # Adjust column name accordingly

# --- 80/20 hold-out split, stratified on y so both parts keep the class ratio ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Standardize features; the scaler is fitted on the training part only ---
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Fit the three base classifiers on the scaled training data ---
best_xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
).fit(X_train, y_train)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

log_model = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)

# Define Evaluation Function
def evaluate_model(model, X_test, y_test, threshold=0.5):
    """Print precision, recall, F1 and AUC-PR of ``model`` on a held-out set.

    The positive-class probability (column 1 of ``predict_proba``) is turned
    into a hard 0/1 label by comparing it with ``threshold``. Precision,
    recall and F1 are computed from those labels; AUC-PR (average precision)
    is computed from the raw probabilities and does not depend on
    ``threshold``.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Feature matrix to score.
        y_test: True labels aligned with ``X_test``.
        threshold: Probability at or above which a sample is labelled 1.

    Returns:
        None. The four metrics are printed with four decimals.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    hard_labels = (positive_scores >= threshold).astype(int)

    # Compute every metric first, then print, so nothing is printed if any
    # metric computation fails.
    report = (
        ('Precision', precision_score(y_test, hard_labels)),
        ('Recall', recall_score(y_test, hard_labels)),
        ('F1-Score', f1_score(y_test, hard_labels)),
        ('AUC-PR', average_precision_score(y_test, positive_scores)),
    )
    for label, value in report:
        print(f'{label}: {value:.4f}')

# Decision threshold shared by every evaluation below and by the saved
# artifact, so the reported metrics and the persisted value cannot drift apart.
# NOTE(review): the same cut-off is applied to all three models even though
# their probability outputs are not necessarily comparable - confirm intended.
DECISION_THRESHOLD = 0.93

# Evaluate Models
for title, fitted_model in (
    ("XGBoost", best_xgb),
    ("Random Forest", rf_model),
    ("Logistic Regression", log_model),
):
    print(f"\n📌 {title} Performance:")
    evaluate_model(fitted_model, X_test, y_test, threshold=DECISION_THRESHOLD)

# Save Models & Preprocessing Steps
joblib.dump(best_xgb, "final_xgb_model.pkl")
joblib.dump(rf_model, "final_rf_model.pkl")
joblib.dump(log_model, "final_log_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(DECISION_THRESHOLD, "final_threshold.pkl")

# SHAP Analysis
# X_test is the output of scaler.transform (a NumPy array by default), so it
# no longer carries column names. Pass the original feature names explicitly;
# otherwise the summary plot labels the rows "Feature 0", "Feature 1", ...
explainer = shap.Explainer(best_xgb)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, feature_names=list(X.columns))


In [None]:
# Step-by-Step Guide: Credit Card Fraud Detection Project

## 1️⃣ Load Necessary Libraries
First, we import the essential Python libraries for data handling, preprocessing, model building, evaluation, and visualization.

```python
import pandas as pd
import numpy as np
import joblib
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, precision_recall_curve
```

## 2️⃣ Load and Explore Data
We load the dataset and perform basic exploratory data analysis.

```python
# Read the raw CSV into a DataFrame.
df = pd.read_csv("your_data.csv")

# Preview the first rows and the per-column summary statistics.
display(df.head())
display(df.describe())

# Report the table size and the total number of missing cells.
n_missing = df.isna().sum().sum()
print("Data Shape:", df.shape)
print("Missing Values:", n_missing)
```

## 3️⃣ Split Features & Target
We separate the features from the target variable and split the data into training and testing sets.

```python
# The label lives in 'Class'; all remaining columns are model inputs.
y = df['Class']
X = df.drop(columns=['Class'])  # Adjust column name accordingly

# Hold out 20% for testing; stratify on y so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
```

## 4️⃣ Feature Scaling
Since our features might have different scales, we standardize them.

```python
# Fit the scaler on the training split only, then apply it to both splits so
# the test data never influences the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
```

## 5️⃣ Train Individual Models
We train XGBoost, Random Forest, and Logistic Regression models.

```python
# Gradient-boosted trees, evaluated internally with log-loss.
best_xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
).fit(X_train, y_train)

# Random forest with 100 trees.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Logistic regression using the liblinear solver.
log_model = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)
```

## 6️⃣ Hyperparameter Tuning
We optimize the hyperparameters of the XGBoost and Random Forest models using GridSearchCV.

```python
# Candidate values explored for XGBoost (3 x 3 x 3 = 27 combinations).
param_grid_xgb = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

# Exhaustive 3-fold search scored by F1, with n_jobs=-1 for parallel fits.
grid_xgb = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid_xgb,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_xgb.fit(X_train, y_train)

# Keep the best estimator found by the search.
best_xgb = grid_xgb.best_estimator_
```

```python
# Candidate values explored for the random forest (3 x 3 = 9 combinations);
# max_depth=None is one of the candidates.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
)

# Exhaustive 3-fold search scored by F1, with n_jobs=-1 for parallel fits.
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# Keep the best estimator found by the search.
rf_model = grid_rf.best_estimator_
```

## 7️⃣ Precision-Recall Curve & Optimal Threshold Selection
We compute precision and recall at every candidate decision threshold and automatically select the threshold that maximizes the F1-score.

```python
# Positive-class probabilities and the precision/recall trade-off.
# NOTE(review): the threshold is tuned on X_test and the models are later
# scored on the same X_test, so the reported metrics are optimistic. Consider
# tuning on a separate validation split instead.
y_probs = best_xgb.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)

# F1 at every curve point. Where precision + recall == 0 the ratio is 0/0,
# which would yield NaN; np.argmax returns the position of a NaN, so a NaN
# would silently "win" and produce a meaningless threshold. Define F1 as 0
# at those points instead.
pr_sum = precisions + recalls
f1_scores = np.divide(
    2 * precisions * recalls,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# precision_recall_curve returns one more precision/recall value than
# thresholds (the final point has no threshold), so exclude it when picking
# the index to keep f1_scores aligned with thresholds.
best_threshold = thresholds[np.argmax(f1_scores[:-1])]

plt.figure(figsize=(8, 6))
plt.plot(thresholds, precisions[:-1], 'b--', label='Precision')
plt.plot(thresholds, recalls[:-1], 'r-', label='Recall')
plt.axvline(x=best_threshold, color='g', linestyle='--', label=f'Optimal Threshold: {best_threshold:.2f}')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.legend()
plt.title('Precision-Recall vs. Decision Threshold')
plt.show()
print(f'Optimal Threshold: {best_threshold:.2f}')
```

## 8️⃣ Model Evaluation with Custom Threshold

```python
def evaluate_model(model, X_test, y_test, threshold):
    """Report classification metrics for ``model`` at a custom cut-off.

    Samples whose predicted positive-class probability is at least
    ``threshold`` are labelled 1, all others 0. Precision, recall and F1 are
    derived from those labels; AUC-PR (average precision) uses the raw
    probabilities.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Feature matrix to score.
        y_test: True labels aligned with ``X_test``.
        threshold: Probability cut-off for the positive class.

    Returns:
        None. The metrics are printed, one per line, with four decimals.
    """
    scores = model.predict_proba(X_test)[:, 1]
    predicted = (scores >= threshold).astype(int)

    prec = precision_score(y_test, predicted)
    rec = recall_score(y_test, predicted)
    f1_val = f1_score(y_test, predicted)
    avg_prec = average_precision_score(y_test, scores)

    print(
        f'Precision: {prec:.4f}',
        f'Recall: {rec:.4f}',
        f'F1-Score: {f1_val:.4f}',
        f'AUC-PR: {avg_prec:.4f}',
        sep='\n',
    )
```

```python
# Score each fitted model at the same decision threshold.
# NOTE(review): best_threshold was derived from the XGBoost probabilities;
# confirm it is meant to be reused for the other two models.
for fitted in (best_xgb, rf_model, log_model):
    evaluate_model(fitted, X_test, y_test, best_threshold)
```

## 9️⃣ Building an Ensemble Model
We combine the three models into a VotingClassifier.

```python
# Soft-voting ensemble over the three classifiers, fitted on the training data
# and scored at the same decision threshold as the individual models.
base_models = [('xgb', best_xgb), ('rf', rf_model), ('log', log_model)]
ensemble = VotingClassifier(estimators=base_models, voting='soft').fit(X_train, y_train)
evaluate_model(ensemble, X_test, y_test, best_threshold)
```

## 🔟 SHAP Analysis for Feature Importance
We analyze the impact of each feature on the model's decision-making process.

```python
# X_test is the output of scaler.transform (a NumPy array by default), so it
# no longer carries column names. Pass the original feature names explicitly;
# otherwise the summary plot labels the rows "Feature 0", "Feature 1", ...
explainer = shap.Explainer(best_xgb)
shap_values = explainer(X_test)
shap.summary_plot(shap_values, X_test, feature_names=list(X.columns))
```

## 🔹 Save Final Models & Threshold
```python
# Persist every fitted object plus the chosen threshold, one file per object.
artifacts = {
    "final_xgb_model.pkl": best_xgb,
    "final_rf_model.pkl": rf_model,
    "final_log_model.pkl": log_model,
    "final_ensemble_model.pkl": ensemble,
    "scaler.pkl": scaler,
    "final_threshold.pkl": best_threshold,
}
for path, obj in artifacts.items():
    joblib.dump(obj, path)
```

### ✅ Project Completed!
This end-to-end guide covers everything from data loading to saving the final models, the scaler, and the decision threshold so they can be reused at deployment time.

