In [2]:
pip install lime joblib mlflow


Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mlflow
  Downloading mlflow-3.1.4-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.4 (from mlflow)
  Downloading mlflow_skinny-3.1.4-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.4->mlflow)
  Downloading databricks_sdk-0.59.0-py3-none-any.whl.metadata (39 kB)
Collect

In [6]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# Load Processed Data

In [7]:
import pandas as pd
import numpy as np
import os
import joblib
import logging
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from scipy import sparse

# Setup logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# Create directories
os.makedirs('models/fraud_data', exist_ok=True)
os.makedirs('models/credit_card_data', exist_ok=True)
logger.info("Model directories created.")

# Load data
fraud_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/processed/cleaned_fraud_data_by_country.csv')
credit_card_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/processed/cleaned_credit_card_data.csv')


# Datetime features for fraud data
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

fraud_data['signup_hour'] = fraud_data['signup_time'].dt.hour
fraud_data['signup_day'] = fraud_data['signup_time'].dt.day
fraud_data['signup_month'] = fraud_data['signup_time'].dt.month
fraud_data['signup_year'] = fraud_data['signup_time'].dt.year

fraud_data['purchase_hour'] = fraud_data['purchase_time'].dt.hour
fraud_data['purchase_day'] = fraud_data['purchase_time'].dt.day
fraud_data['purchase_month'] = fraud_data['purchase_time'].dt.month
fraud_data['purchase_year'] = fraud_data['purchase_time'].dt.year

fraud_data = fraud_data.drop(columns=['signup_time', 'purchase_time'])

# Frequency encoding for categorical columns
cat_cols = fraud_data.select_dtypes(include='object').columns
for col in cat_cols:
    freq_map = fraud_data[col].value_counts().to_dict()
    fraud_data[col] = fraud_data[col].map(freq_map)

# Downcast numeric types
for col in fraud_data.select_dtypes(include='int64'):
    fraud_data[col] = fraud_data[col].astype('int32')
for col in fraud_data.select_dtypes(include='float64'):
    fraud_data[col] = fraud_data[col].astype('float32')

# Separate features and targets
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']

X_credit = credit_card_data.drop(columns=['Class'])
y_credit = credit_card_data['Class']

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_fraud = imputer.fit_transform(X_fraud)
X_credit = imputer.fit_transform(X_credit)

y_fraud = y_fraud.fillna(y_fraud.mode()[0])
y_credit = y_credit.fillna(y_credit.mode()[0])

# Convert to sparse for fraud (optional)
X_fraud_sparse = sparse.csr_matrix(X_fraud)

# Train/test split
from sklearn.model_selection import train_test_split

X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud_sparse, y_fraud, test_size=0.2, random_state=42)
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit, y_credit, test_size=0.2, random_state=42)

logger.info("Data loading and preprocessing complete.")


# I Have Selected 3 Model for Comparing Performance
1) Decison Tree
2) logistic Regression
3) Random Forest !!

### Model Training and Evaluation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import mlflow
import mlflow.sklearn

# Setup MLflow
mlflow.set_tracking_uri("file:///mlruns")
mlflow.set_experiment("Fraud and Credit Card Detection")

# Define only the required models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
}

# Define the train and eval function
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, dataset_name, model_name):
    with mlflow.start_run(run_name=f'{model_name} on {dataset_name}'):
        logger.info(f"Training {model_name} on {dataset_name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        mlflow.log_params({
            'model': model_name,
            'dataset': dataset_name,
            'test_size': 0.2,
            'random_state': 42
        })
        mlflow.log_metric('accuracy', accuracy)
        mlflow.sklearn.log_model(model, f'{model_name}_mlflow')

        # Save locally
        save_path = f"models/{dataset_name.lower().replace(' ', '_')}/{model_name.lower().replace(' ', '_')}.joblib"
        joblib.dump(model, save_path)
        logger.info(f"Saved {model_name} to {save_path}")

        # Output results
        print(f"\n📊 Model: {model_name} | Dataset: {dataset_name}")
        print(f"✅ Accuracy: {accuracy:.4f}")
        print(report)
        print("=" * 60)

# Run on Fraud Data
for name, model in models.items():
    train_and_evaluate_model(model, X_train_fraud, y_train_fraud, X_test_fraud, y_test_fraud, 'Fraud Data', name)

# Run on Credit Card Data
for name, model in models.items():
    train_and_evaluate_model(model, X_train_credit, y_train_credit, X_test_credit, y_test_credit, 'Credit Card Data', name)


2025-07-21 21:12:35,928 - DEBUG - Popen(['git', 'version'], cwd=c:\Users\HP\10 Acadamy PRojects\New folder (8 & 9)\Ehanced_Fraud_Detection_Ecommerce_Banking\notebooks, stdin=None, shell=False, universal_newlines=False)
2025-07-21 21:12:36,262 - DEBUG - Popen(['git', 'version'], cwd=c:\Users\HP\10 Acadamy PRojects\New folder (8 & 9)\Ehanced_Fraud_Detection_Ecommerce_Banking\notebooks, stdin=None, shell=False, universal_newlines=False)
2025-07-21 21:12:36,480 - DEBUG - Popen(['git', 'check-ignore', 'c:\\Users\\HP\\10 Acadamy PRojects\\New folder (8 & 9)\\Ehanced_Fraud_Detection_Ecommerce_Banking\\.venv\\Lib\\site-packages'], cwd=c:\Users\HP\10 Acadamy PRojects\New folder (8 & 9)\Ehanced_Fraud_Detection_Ecommerce_Banking, stdin=None, shell=False, universal_newlines=False)
2025-07-21 21:12:37,107 - INFO - Training Logistic Regression on Fraud Data
2025-07-21 21:13:15,899 - INFO - Saved Logistic Regression to models/fraud_data/logistic_regression.joblib
2025-07-21 21:13:16,102 - INFO - Trai


📊 Model: Logistic Regression | Dataset: Fraud Data
✅ Accuracy: 0.6478
              precision    recall  f1-score   support

           0       0.95      0.64      0.77     27373
           1       0.17      0.70      0.27      2850

    accuracy                           0.65     30223
   macro avg       0.56      0.67      0.52     30223
weighted avg       0.88      0.65      0.72     30223



2025-07-21 21:14:42,803 - INFO - Saved Decision Tree to models/fraud_data/decision_tree.joblib



📊 Model: Decision Tree | Dataset: Fraud Data
✅ Accuracy: 0.9211
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     27373
           1       0.58      0.59      0.59      2850

    accuracy                           0.92     30223
   macro avg       0.77      0.77      0.77     30223
weighted avg       0.92      0.92      0.92     30223



2025-07-21 21:14:43,088 - INFO - Training Random Forest on Fraud Data
2025-07-21 21:29:09,453 - INFO - Saved Random Forest to models/fraud_data/random_forest.joblib



📊 Model: Random Forest | Dataset: Fraud Data
✅ Accuracy: 0.9565
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223



2025-07-21 21:29:09,743 - INFO - Training Logistic Regression on Credit Card Data
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2025-07-21 21:31:02,311 - INFO - Saved Logistic Regression to models/credit_card_data/logistic_regression.joblib



📊 Model: Logistic Regression | Dataset: Credit Card Data
✅ Accuracy: 0.9767
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     56656
           1       0.06      0.89      0.11        90

    accuracy                           0.98     56746
   macro avg       0.53      0.93      0.55     56746
weighted avg       1.00      0.98      0.99     56746



2025-07-21 21:31:02,886 - INFO - Training Decision Tree on Credit Card Data
2025-07-21 21:32:13,159 - INFO - Saved Decision Tree to models/credit_card_data/decision_tree.joblib



📊 Model: Decision Tree | Dataset: Credit Card Data
✅ Accuracy: 0.9989
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.66      0.63      0.64        90

    accuracy                           1.00     56746
   macro avg       0.83      0.82      0.82     56746
weighted avg       1.00      1.00      1.00     56746



2025-07-21 21:32:13,451 - INFO - Training Random Forest on Credit Card Data
2025-07-21 21:39:02,219 - INFO - Saved Random Forest to models/credit_card_data/random_forest.joblib



📊 Model: Random Forest | Dataset: Credit Card Data
✅ Accuracy: 0.9995
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.97      0.71      0.82        90

    accuracy                           1.00     56746
   macro avg       0.98      0.86      0.91     56746
weighted avg       1.00      1.00      1.00     56746



## Model Explainability

In [8]:
import shap
import lime
import joblib
import matplotlib.pyplot as plt
import lime.lime_tabular
import pandas as pd

In [9]:
print(fraud_data.columns)


Index(['user_id', 'purchase_value', 'device_id', 'age', 'ip_address', 'class',
       'transaction_count', 'time_since_signup', 'transaction_velocity',
       'transaction_time_diff', 'hour_of_day', 'day_of_week', 'source_Direct',
       'source_SEO', 'browser_FireFox', 'browser_IE', 'browser_Opera',
       'browser_Safari', 'sex_M', 'signup_hour', 'signup_day', 'signup_month',
       'signup_year', 'purchase_hour', 'purchase_day', 'purchase_month',
       'purchase_year'],
      dtype='object')


In [10]:
# Reconstruct datetime from parts in fraud_data
fraud_data['signup_datetime'] = pd.to_datetime(dict(
    year=fraud_data['signup_year'],
    month=fraud_data['signup_month'],
    day=fraud_data['signup_day'],
    hour=fraud_data['signup_hour']
))

fraud_data['purchase_datetime'] = pd.to_datetime(dict(
    year=fraud_data['purchase_year'],
    month=fraud_data['purchase_month'],
    day=fraud_data['purchase_day'],
    hour=fraud_data['purchase_hour']
))

# Optional: convert these to UNIX timestamps
fraud_data['signup_timestamp'] = fraud_data['signup_datetime'].astype('int64') / 10**9
fraud_data['purchase_timestamp'] = fraud_data['purchase_datetime'].astype('int64') / 10**9


In [11]:
 # Prepare features and target
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']

# Keep only numeric columns (exclude things like user_id, device_id, etc.)
X_fraud = X_fraud.select_dtypes(include='number')

# Repeat for credit card dataset
X_credit = credit_card_data.drop(columns=['Class'])
y_credit = credit_card_data['Class']
X_credit = X_credit.select_dtypes(include='number')

# Split into training and test sets
from sklearn.model_selection import train_test_split

X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42
)

X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(
    X_credit, y_credit, test_size=0.2, random_state=42
)


In [12]:
import joblib

fraud_model = joblib.load('/content/drive/MyDrive/Colab Notebooks/models/fraud_data/random_forest.joblib')
credit_model = joblib.load('/content/drive/MyDrive/Colab Notebooks/models/credit_card_data/random_forest.joblib')


# SHAP for Fraud Detection Model

In [1]:
import shap
import matplotlib.pyplot as plt
import pandas as pd

# Enable JS visualizations (optional for notebooks)
shap.initjs()

# Sample 100 rows for efficiency
X_sample = X_test_fraud.sample(n=100, random_state=42)

# Use TreeExplainer on your best-performing model (RandomForest, XGBoost, etc.)
explainer = shap.TreeExplainer(fraud_model)

# Compute SHAP values (for binary classification → use index [1])
shap_values = explainer.shap_values(X_sample)

# Global Feature Importance (Summary Plot)
plt.figure()
shap.summary_plot(shap_values[1], X_sample)
plt.show()

# Local Explanation for a Single Prediction (first row in sample)
plt.figure()
shap.force_plot(
    explainer.expected_value[1],
    shap_values[1][0],
    X_sample.iloc[0],
    matplotlib=True
)
plt.show()

# Dependence Plot (replace with a real feature name)
# Find the most important feature from the summary plot and use it here
top_feature = X_sample.columns[0]  # Or replace manually with 'signup_timestamp' or similar
plt.figure()
shap.dependence_plot(top_feature, shap_values[1], X_sample)
plt.show()


# SHAP for Credit Card Model

In [None]:
explainer_credit = shap.TreeExplainer(credit_model)
shap_values_credit = explainer_credit.shap_values(X_test_credit)

shap.summary_plot(shap_values_credit[1], X_test_credit)
plt.show()

shap.force_plot(
    explainer_credit.expected_value[1],
    shap_values_credit[1][0],
    X_test_credit.iloc[0],
    matplotlib=True
)
plt.show()

shap.dependence_plot('Amount', shap_values_credit[1], X_test_credit)  # or another numeric feature
plt.show()


# LIME for Fraud Model

In [None]:
import lime
import lime.lime_tabular

explainer_lime_fraud = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train_fraud.values,
    feature_names=X_train_fraud.columns.tolist(),
    class_names=['Not Fraud', 'Fraud'],
    mode='classification',
    discretize_continuous=True
)

explanation_fraud = explainer_lime_fraud.explain_instance(
    X_test_fraud.iloc[0].values,
    fraud_model.predict_proba,
    num_features=10
)

explanation_fraud.show_in_notebook(show_all=False)
explanation_fraud.as_pyplot_figure()
plt.show()


# LIME for Credit Card Model

In [None]:
explainer_lime_credit = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train_credit.values,
    feature_names=X_train_credit.columns.tolist(),
    class_names=['Class 0', 'Class 1'],
    mode='classification',
    discretize_continuous=True
)

explanation_credit = explainer_lime_credit.explain_instance(
    X_test_credit.iloc[0].values,
    credit_model.predict_proba,
    num_features=10
)

explanation_credit.show_in_notebook(show_all=False)
explanation_credit.as_pyplot_figure()
plt.show()
