In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, SimpleRNN, LSTM
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import warnings
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score

warnings.filterwarnings('ignore')

# Set plot style for better visuals
sns.set(style="whitegrid")

In [2]:
# Load the datasets
fraud_data = pd.read_csv('../data/fraud_cleaned_data.csv')
credit_data = pd.read_csv('../data/creditcard.csv')

In [3]:
fraud_data.shape, credit_data.shape

((151112, 20), (284807, 31))

In [4]:
fraud_data.columns, credit_data.columns

(Index(['Unnamed: 0', 'purchase_value', 'age', 'ip_address', 'class',
        'frequency', 'velocity', 'hour_of_day', 'day_of_week', 'time_diff',
        'signup_hour', 'signup_day_of_week', 'purchase_day_of_week',
        'source_Direct', 'source_SEO', 'browser_FireFox', 'browser_IE',
        'browser_Opera', 'browser_Safari', 'sex_M'],
       dtype='object'),
 Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
        'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
        'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
        'Class'],
       dtype='object'))

In [5]:
# For Credit Card Data
X_creditcard = credit_data.drop(columns=['Class'])  # independant Features
y_creditcard = credit_data['Class']                   # Target variable

# For Fraud Data
X_fraud = fraud_data.drop(columns=['class'])  # independant Features
y_fraud = fraud_data['class']      # Target variable

In [6]:
# Train-test split for Credit Card Data
X_train_creditcard, X_test_creditcard, y_train_creditcard, y_test_creditcard = train_test_split(
    X_creditcard, y_creditcard, test_size=0.2, random_state=42, stratify=y_creditcard
)

# Train-test split for Fraud Data
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)

In [7]:
print(np.unique(y_train_fraud, return_counts=True))
print(np.unique(y_train_creditcard, return_counts=True))

(array([0, 1], dtype=int64), array([109568,  11321], dtype=int64))
(array([0, 1], dtype=int64), array([227451,    394], dtype=int64))


In [8]:
# Apply SMOTE-Tomek Link for Credit Card Data
from imblearn.combine import SMOTETomek
smote_tomek_creditcard = SMOTETomek(random_state=42)
X_train_creditcard_resampled, y_train_creditcard_resampled = smote_tomek_creditcard.fit_resample(X_train_creditcard, y_train_creditcard)

# Apply SMOTE-Tomik for Fraud Data
smote_tomek_fraud = SMOTETomek(random_state=42)
X_train_fraud_resampled, y_train_fraud_resampled = smote_tomek_fraud.fit_resample(X_train_fraud, y_train_fraud)


In [9]:
print(np.unique(y_train_fraud_resampled, return_counts=True))
print(np.unique(y_train_creditcard_resampled, return_counts=True))

(array([0, 1], dtype=int64), array([100250, 100250], dtype=int64))
(array([0, 1], dtype=int64), array([226818, 226818], dtype=int64))


In [10]:
# Enable autologging
mlflow.sklearn.autolog()

## MLflow for Machine Learning Models

In [11]:
models = [
    ("Logistic Regression", LogisticRegression(C=1, solver='liblinear')),
    ("Random Forest", RandomForestClassifier(n_estimators=100, max_depth=5)),
    ("XGBClassifier", XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5))
]


In [None]:
def train_and_log_model(model, model_name, X_train, y_train, X_test, y_test, experiment_name):
    mlflow.set_experiment(experiment_name)
    mlflow.set_tracking_uri("http://localhost:5000")
    
    with mlflow.start_run(run_name=model_name):
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob) if y_prob is not None else float('nan')
        
        # Log parameters, metrics, and model
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("auc_roc", auc)

        mlflow.sklearn.log_model(model, "model")
        print(f"Logged {model_name} model with experiment '{experiment_name}'")


In [13]:
# Experiment for Credit Card Data
for name, model in models:
    train_and_log_model(model, name, X_train_creditcard_resampled, y_train_creditcard_resampled, X_test_creditcard, y_test_creditcard, "Fraud Detection Models using Credit Data")

# Experiment for Fraud Data
for name, model in models:
    train_and_log_model(model, name, X_train_fraud_resampled, y_train_fraud_resampled, X_test_fraud, y_test_fraud, "Fraud Detection Models using Fraud")


2024/10/25 16:18:02 INFO mlflow.tracking.fluent: Experiment with name 'Fraud Detection Models using Credit Data' does not exist. Creating a new experiment.
2024/10/25 16:19:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run loud-seal-782 at: http://localhost:5000/#/experiments/354232857904242509/runs/cfcee49824f64488891b5fdcbaa524cb.
2024/10/25 16:19:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/354232857904242509.


Logged Logistic Regression model with experiment 'Fraud Detection Models using Credit Data'


2024/10/25 16:22:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run sedate-sheep-946 at: http://localhost:5000/#/experiments/354232857904242509/runs/bee02cb4aab94239a324ba9a6d638c25.
2024/10/25 16:22:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/354232857904242509.


Logged Random Forest model with experiment 'Fraud Detection Models using Credit Data'


2024/10/25 16:22:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run awesome-ant-919 at: http://localhost:5000/#/experiments/354232857904242509/runs/2f1e8fafc0934a94be7d7b2b57b02825.
2024/10/25 16:22:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/354232857904242509.


Logged XGBClassifier model with experiment 'Fraud Detection Models using Credit Data'


2024/10/25 16:23:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run capable-crane-968 at: http://localhost:5000/#/experiments/354232857904242509/runs/ea437323d0cd4a3da0aaf4819581f059.
2024/10/25 16:23:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/354232857904242509.
2024/10/25 16:23:50 INFO mlflow.tracking.fluent: Experiment with name 'Fraud Detection Models using Fraud' does not exist. Creating a new experiment.


Logged Decision Tree model with experiment 'Fraud Detection Models using Credit Data'


2024/10/25 16:24:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run languid-wasp-985 at: http://localhost:5000/#/experiments/807921376342360566/runs/dca228df72f4444eb8e6f61680c4d04b.
2024/10/25 16:24:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/807921376342360566.


Logged Logistic Regression model with experiment 'Fraud Detection Models using Fraud'


2024/10/25 16:24:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run handsome-tern-753 at: http://localhost:5000/#/experiments/807921376342360566/runs/e741112adf0e433895ff9d88d7591a66.
2024/10/25 16:24:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/807921376342360566.


Logged Random Forest model with experiment 'Fraud Detection Models using Fraud'


2024/10/25 16:24:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run wise-tern-306 at: http://localhost:5000/#/experiments/807921376342360566/runs/97c4c1c5fe89441081320f45e0462f2e.
2024/10/25 16:24:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/807921376342360566.


Logged XGBClassifier model with experiment 'Fraud Detection Models using Fraud'


2024/10/25 16:25:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run nimble-carp-922 at: http://localhost:5000/#/experiments/807921376342360566/runs/ba36fe53581e45f8a12fb1e8a2d1d5bf.
2024/10/25 16:25:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/807921376342360566.


Logged Decision Tree model with experiment 'Fraud Detection Models using Fraud'


## MLflow for Deep Learning Models

In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, SimpleRNN
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler


In [16]:
scaler_credit = StandardScaler()
X_train_credit_resampled = scaler_credit.fit_transform(X_train_creditcard_resampled)
X_test_creditcard = scaler_credit.transform(X_test_creditcard)

scaler_fraud = StandardScaler()
X_train_fraud_resampled = scaler_fraud.fit_transform(X_train_fraud_resampled)
X_test_fraud = scaler_fraud.transform(X_test_fraud)


In [17]:
X_train_credit_resampled = X_train_credit_resampled.reshape((X_train_creditcard_resampled.shape[0], X_train_credit_resampled.shape[1], 1))
X_test_creditcard = X_test_creditcard.reshape((X_test_creditcard.shape[0], X_test_creditcard.shape[1], 1))

X_train_fraud_resampled = X_train_fraud_resampled.reshape((X_train_fraud_resampled.shape[0], X_train_fraud_resampled.shape[1], 1))
X_test_fraud = X_test_fraud.reshape((X_test_fraud.shape[0], X_test_fraud.shape[1], 1))


In [18]:
def create_cnn_model(input_shape):
    model = Sequential([
        Conv1D(32, kernel_size=3, activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_lstm_model(input_shape):
    model = Sequential([
        LSTM(32, input_shape=input_shape),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_rnn_model(input_shape):
    model = Sequential([
        SimpleRNN(32, input_shape=input_shape),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [19]:
def train_and_log_deep_model(model, model_name, X_train, y_train, X_test, y_test, experiment_name):
    mlflow.set_experiment(experiment_name)
    mlflow.set_tracking_uri("http://localhost:5000")

    with mlflow.start_run(run_name=model_name):
        early_stopping = EarlyStopping(monitor='val_loss', patience=3)

        # Train the model
        history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, 
                            callbacks=[early_stopping], verbose=0)

        # Evaluate the model
        loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
        y_pred = (model.predict(X_test) > 0.5).astype("int32")
        y_prob = model.predict(X_test)

        # Calculate metrics
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob)

        # Log metrics and model
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("auc_roc", auc)

        # Log the model
        mlflow.keras.log_model(model, "model")
        print(f"Logged {model_name} model with experiment '{experiment_name}'")


In [None]:
# Define input shape for models
input_shape_credit = (X_train_creditcard_resampled.shape[1], 1)
input_shape_fraud = (X_train_fraud_resampled.shape[1], 1)

# List of deep learning models
deep_models = [
    ("CNN", create_cnn_model(input_shape_credit)),
    ("LSTM", create_lstm_model(input_shape_credit)),
    ("RNN", create_rnn_model(input_shape_credit))
]

# Run experiments for Credit Card Data
for name, model in deep_models:
    train_and_log_deep_model(model, name, X_train_creditcard_resampled, y_train_creditcard_resampled, X_test_creditcard, y_test_creditcard, "Fraud Detection Models using Credit Data")

# Update models for Fraud Data
deep_models = [
    ("CNN", create_cnn_model(input_shape_fraud)),
    ("LSTM", create_lstm_model(input_shape_fraud)),
    ("RNN", create_rnn_model(input_shape_fraud))
]

# Run experiments for Fraud Data
for name, model in deep_models:
    train_and_log_deep_model(model, name, X_train_fraud_resampled, y_train_fraud_resampled, X_test_fraud, y_test_fraud, "Fraud Detection Models using Fraud")


