# Data Preparation

### Feature and Target Separation

In [2]:
import pandas as pd

# Load datasets
creditcard_df = pd.read_csv('../data/creditcard.csv')
fraud_data_df = pd.read_csv('../data/Fraud_Data.csv')

# Separate features and target for creditcard dataset
X_creditcard = creditcard_df.drop(columns=['Class'])
y_creditcard = creditcard_df['Class']

# Separate features and target for fraud-data dataset
X_fraud_data = fraud_data_df.drop(columns=['class'])
y_fraud_data = fraud_data_df['class']

### Train-Test Split

In [3]:
from sklearn.model_selection import train_test_split

# Train-test split for creditcard dataset
X_train_creditcard, X_test_creditcard, y_train_creditcard, y_test_creditcard = train_test_split(
    X_creditcard, y_creditcard, test_size=0.2, random_state=42, stratify=y_creditcard
)

# Train-test split for fraud-data dataset
X_train_fraud_data, X_test_fraud_data, y_train_fraud_data, y_test_fraud_data = train_test_split(
    X_fraud_data, y_fraud_data, test_size=0.2, random_state=42, stratify=y_fraud_data
)

# Model Selection

### We’ll use the following models for comparison:

Logistic Regression

Decision Tree

Random Forest

Gradient Boosting

Multi-Layer Perceptron (MLP)

Convolutional Neural Network (CNN)

Recurrent Neural Network (RNN)

Long Short-Term Memory (LSTM)

#  Model Training and Evaluation

### Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Train Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_creditcard, y_train_creditcard)

# Evaluate
y_pred_log_reg = log_reg.predict(X_test_creditcard)
print("Logistic Regression Classification Report:")
print(classification_report(y_test_creditcard, y_pred_log_reg))
print("ROC AUC Score:", roc_auc_score(y_test_creditcard, y_pred_log_reg))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.80      0.69      0.74        98

    accuracy                           1.00     56962
   macro avg       0.90      0.85      0.87     56962
weighted avg       1.00      1.00      1.00     56962

ROC AUC Score: 0.8467892960504404


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree


In [5]:
from sklearn.tree import DecisionTreeClassifier

# Train Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_creditcard, y_train_creditcard)

# Evaluate
y_pred_dt = dt.predict(X_test_creditcard)
print("Decision Tree Classification Report:")
print(classification_report(y_test_creditcard, y_pred_dt))

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.75      0.74      0.75        98

    accuracy                           1.00     56962
   macro avg       0.88      0.87      0.87     56962
weighted avg       1.00      1.00      1.00     56962



### Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_creditcard, y_train_creditcard)

# Evaluate
y_pred_rf = rf.predict(X_test_creditcard)
print("Random Forest Classification Report:")
print(classification_report(y_test_creditcard, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.94      0.82      0.87        98

    accuracy                           1.00     56962
   macro avg       0.97      0.91      0.94     56962
weighted avg       1.00      1.00      1.00     56962



### Gradient Boosting

In [7]:
from sklearn.ensemble import GradientBoostingClassifier

# Train Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train_creditcard, y_train_creditcard)

# Evaluate
y_pred_gb = gb.predict(X_test_creditcard)
print("Gradient Boosting Classification Report:")
print(classification_report(y_test_creditcard, y_pred_gb))

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.53      0.18      0.27        98

    accuracy                           1.00     56962
   macro avg       0.76      0.59      0.64     56962
weighted avg       1.00      1.00      1.00     56962



###  Multi-Layer Perceptron (MLP)

In [8]:
from sklearn.neural_network import MLPClassifier

# Train MLP
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
mlp.fit(X_train_creditcard, y_train_creditcard)

# Evaluate
y_pred_mlp = mlp.predict(X_test_creditcard)
print("MLP Classification Report:")
print(classification_report(y_test_creditcard, y_pred_mlp))

MLP Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.66      0.60      0.63        98

    accuracy                           1.00     56962
   macro avg       0.83      0.80      0.81     56962
weighted avg       1.00      1.00      1.00     56962



### Convolutional Neural Network (CNN)

In [None]:
!pip install tensorflow

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

# Reshape data for CNN
X_train_creditcard_cnn = X_train_creditcard.values.reshape(X_train_creditcard.shape[0], X_train_creditcard.shape[1], 1)
X_test_creditcard_cnn = X_test_creditcard.values.reshape(X_test_creditcard.shape[0], X_test_creditcard.shape[1], 1)

# Build CNN model
model_cnn = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_creditcard_cnn.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(50, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_cnn.fit(X_train_creditcard_cnn, y_train_creditcard, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate
y_pred_cnn = model_cnn.predict(X_test_creditcard_cnn)
y_pred_cnn = (y_pred_cnn > 0.5).astype(int)
print("CNN Classification Report:")
print(classification_report(y_test_creditcard, y_pred_cnn))

### Recurrent Neural Network (RNN) and LSTM

In [None]:
from tensorflow.keras.layers import LSTM, SimpleRNN

# Reshape data for RNN/LSTM
X_train_creditcard_rnn = X_train_creditcard.values.reshape(X_train_creditcard.shape[0], X_train_creditcard.shape[1], 1)
X_test_creditcard_rnn = X_test_creditcard.values.reshape(X_test_creditcard.shape[0], X_test_creditcard.shape[1], 1)

# Build LSTM model
model_lstm = Sequential([
    LSTM(50, input_shape=(X_train_creditcard_rnn.shape[1], 1)),
    Dense(1, activation='sigmoid')
])

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_lstm.fit(X_train_creditcard_rnn, y_train_creditcard, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate
y_pred_lstm = model_lstm.predict(X_test_creditcard_rnn)
y_pred_lstm = (y_pred_lstm > 0.5).astype(int)
print("LSTM Classification Report:")
print(classification_report(y_test_creditcard, y_pred_lstm))

# MLOps Steps

### Versioning and Experiment Tracking with MLflow

### Log experiments:

In [4]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression

# Define and train the model
log_reg = LogisticRegression()

# Example: Logging Logistic Regression
with mlflow.start_run():
    mlflow.log_param("model", "Logistic Regression")
    
    log_reg.fit(X_train_creditcard, y_train_creditcard)  # Ensure X_train_creditcard and y_train_creditcard are defined
    y_pred = log_reg.predict(X_test_creditcard)
    accuracy = log_reg.score(X_test_creditcard, y_test_creditcard)
    
    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(log_reg, "logistic_regression_model")

NameError: name 'X_train_creditcard' is not defined

In [None]:
# Train Logistic Regression on Fraud_Data
log_reg_fraud = LogisticRegression(max_iter=1000)
log_reg_fraud.fit(X_train_fraud_data, y_train_fraud_data)

# Evaluate
y_pred_log_reg_fraud = log_reg_fraud.predict(X_test_fraud_data)
print("Logistic Regression (Fraud_Data) Classification Report:")
print(classification_report(y_test_fraud_data, y_pred_log_reg_fraud))
print("ROC AUC Score:", roc_auc_score(y_test_fraud_data, y_pred_log_reg_fraud))

### Compare Model Performance

### Evaluate All Models for creditcard Dataset

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define the evaluate_model function
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_pred)
    }

# Evaluate all models for creditcard dataset
performance_summary_creditcard = {
    "Logistic Regression": evaluate_model(log_reg, X_test_creditcard, y_test_creditcard),
    "Decision Tree": evaluate_model(dt, X_test_creditcard, y_test_creditcard),
    "Random Forest": evaluate_model(rf, X_test_creditcard, y_test_creditcard),
    "Gradient Boosting": evaluate_model(gb, X_test_creditcard, y_test_creditcard),
    "MLP": evaluate_model(mlp, X_test_creditcard, y_test_creditcard),
    "CNN": evaluate_model(model_cnn, X_test_creditcard_cnn, y_test_creditcard),
    "LSTM": evaluate_model(model_lstm, X_test_creditcard_rnn, y_test_creditcard)
}

# Convert to DataFrame
performance_df_creditcard = pd.DataFrame(performance_summary_creditcard).T
print("Performance Summary (creditcard):")
print(performance_df_creditcard)

### Evaluate All Models for Fraud_Data Dataset

In [None]:
# Evaluate all models for Fraud_Data dataset
performance_summary_fraud_data = {
    "Logistic Regression": evaluate_model(log_reg_fraud, X_test_fraud_data, y_test_fraud_data),
    "Decision Tree": evaluate_model(dt_fraud, X_test_fraud_data, y_test_fraud_data),
    "Random Forest": evaluate_model(rf_fraud, X_test_fraud_data, y_test_fraud_data),
    "Gradient Boosting": evaluate_model(gb_fraud, X_test_fraud_data, y_test_fraud_data),
    "MLP": evaluate_model(mlp_fraud, X_test_fraud_data, y_test_fraud_data),
    "CNN": evaluate_model(model_cnn_fraud, X_test_fraud_data_cnn, y_test_fraud_data),
    "LSTM": evaluate_model(model_lstm_fraud, X_test_fraud_data_rnn, y_test_fraud_data)
}

# Convert to DataFrame
performance_df_fraud_data = pd.DataFrame(performance_summary_fraud_data).T
print("Performance Summary (Fraud_Data):")
print(performance_df_fraud_data)

###  Combine Results for Both Datasets

In [None]:
# Add a column to indicate the dataset
performance_df_creditcard['Dataset'] = 'creditcard'
performance_df_fraud_data['Dataset'] = 'Fraud_Data'

# Combine the results
combined_performance_df = pd.concat([performance_df_creditcard, performance_df_fraud_data])
print("Combined Performance Summary:")
print(combined_performance_df)

### Visualize the Results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set up the plot
plt.figure(figsize=(12, 6))
sns.barplot(x=combined_performance_df.index, y=combined_performance_df['ROC-AUC'], hue=combined_performance_df['Dataset'])
plt.title("ROC-AUC Scores for All Models (Creditcard vs Fraud_Data)")
plt.xlabel("Model")
plt.ylabel("ROC-AUC")
plt.xticks(rotation=45)
plt.legend(title='Dataset')
plt.show()

# Use MLflow to Track Experiments

In [None]:
import mlflow
import mlflow.sklearn

# Log Logistic Regression experiment for creditcard dataset
with mlflow.start_run():
    mlflow.log_param("dataset", "creditcard")
    mlflow.log_param("model", "Logistic Regression")
    log_reg.fit(X_train_creditcard, y_train_creditcard)
    metrics = evaluate_model(log_reg, X_test_creditcard, y_test_creditcard)
    for metric_name, metric_value in metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(log_reg, "logistic_regression_model")

###  Define a Function to Log Experiments

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def log_experiment(model, X_train, y_train, X_test, y_test, dataset_name, model_name):
    """
    Logs an experiment using MLflow.
    """
    with mlflow.start_run():
        # Log parameters
        mlflow.log_param("dataset", dataset_name)
        mlflow.log_param("model", model_name)
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Evaluate the model
        y_pred = model.predict(X_test)
        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1-Score": f1_score(y_test, y_pred),
            "ROC-AUC": roc_auc_score(y_test, y_pred)
        }
        
        # Log metrics
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)
        
        # Log the model
        mlflow.sklearn.log_model(model, model_name)
        
        print(f"Logged {model_name} for {dataset_name} dataset.")

### Define Models and Datasets

In [None]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "MLP": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
}

# Define datasets
datasets = {
    "creditcard": (X_train_creditcard, y_train_creditcard, X_test_creditcard, y_test_creditcard),
    "Fraud_Data": (X_train_fraud_data, y_train_fraud_data, X_test_fraud_data, y_test_fraud_data)
}

### Log Experiments for All Models and Datasets

In [None]:
# Log experiments for all models and datasets
for dataset_name, (X_train, y_train, X_test, y_test) in datasets.items():
    for model_name, model in models.items():
        log_experiment(model, X_train, y_train, X_test, y_test, dataset_name, model_name)

### Log Experiments for Deep Learning Models (CNN, RNN, LSTM)

In [None]:
import mlflow.tensorflow

def log_dl_experiment(model, X_train, y_train, X_test, y_test, dataset_name, model_name):
    """
    Logs a deep learning experiment using MLflow.
    """
    with mlflow.start_run():
        # Log parameters
        mlflow.log_param("dataset", dataset_name)
        mlflow.log_param("model", model_name)
        
        # Train the model
        model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)
        
        # Evaluate the model
        y_pred = (model.predict(X_test) > 0.5).astype(int)
        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1-Score": f1_score(y_test, y_pred),
            "ROC-AUC": roc_auc_score(y_test, y_pred)
        }
        
        # Log metrics
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)
        
        # Log the model
        mlflow.tensorflow.log_model(model, model_name)
        
        print(f"Logged {model_name} for {dataset_name} dataset.")

# Log CNN, RNN, and LSTM experiments
dl_models = {
    "CNN": model_cnn,
    "RNN": model_rnn,
    "LSTM": model_lstm
}

for dataset_name, (X_train, y_train, X_test, y_test) in datasets.items():
    for model_name, model in dl_models.items():
        log_dl_experiment(model, X_train, y_train, X_test, y_test, dataset_name, model_name)

### View Logged Experiments

bash: mlflow ui

# Create a Performance Summary Table

### Define a Function to Evaluate Models

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, X_test, y_test):
    """
    Evaluates a model and returns a dictionary of metrics.
    """
    y_pred = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_pred)
    }

### Evaluate All Models for Both Datasets

In [None]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "MLP": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
}

# Define datasets
datasets = {
    "creditcard": (X_train_creditcard, y_train_creditcard, X_test_creditcard, y_test_creditcard),
    "Fraud_Data": (X_train_fraud_data, y_train_fraud_data, X_test_fraud_data, y_test_fraud_data)
}

# Initialize a dictionary to store performance results
performance_summary = {}

# Evaluate all models for both datasets
for dataset_name, (X_train, y_train, X_test, y_test) in datasets.items():
    for model_name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)
        
        # Evaluate the model
        metrics = evaluate_model(model, X_test, y_test)
        
        # Store results in the performance_summary dictionary
        performance_summary[f"{dataset_name} - {model_name}"] = metrics

### Convert the Results into a DataFrame

In [None]:
import pandas as pd

# Convert the performance summary dictionary to a DataFrame
performance_df = pd.DataFrame(performance_summary).T

# Add a column for the dataset and model names
performance_df.reset_index(inplace=True)
performance_df.rename(columns={"index": "Dataset - Model"}, inplace=True)

# Display the performance summary table
print("Performance Summary Table:")
print(performance_df)

### Add Deep Learning Models (CNN, RNN, LSTM)

In [None]:
# Define deep learning models
dl_models = {
    "CNN": model_cnn,
    "RNN": model_rnn,
    "LSTM": model_lstm
}

# Evaluate deep learning models for both datasets
for dataset_name, (X_train, y_train, X_test, y_test) in datasets.items():
    for model_name, model in dl_models.items():
        # Evaluate the model
        y_pred = (model.predict(X_test) > 0.5).astype(int)
        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1-Score": f1_score(y_test, y_pred),
            "ROC-AUC": roc_auc_score(y_test, y_pred)
        }
        
        # Store results in the performance_summary dictionary
        performance_summary[f"{dataset_name} - {model_name}"] = metrics

# Convert the updated performance summary dictionary to a DataFrame
performance_df = pd.DataFrame(performance_summary).T
performance_df.reset_index(inplace=True)
performance_df.rename(columns={"index": "Dataset - Model"}, inplace=True)

# Display the updated performance summary table
print("Updated Performance Summary Table (Including Deep Learning Models):")
print(performance_df)

### Save the Performance Summary Table

In [None]:
# Save the performance summary table to a CSV file
performance_df.to_csv("performance_summary.csv", index=False)
print("Performance summary saved to 'performance_summary.csv'.")

### Visualize the Results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set up the plot
plt.figure(figsize=(12, 6))
sns.barplot(x=performance_df["Dataset - Model"], y=performance_df["ROC-AUC"])
plt.title("ROC-AUC Scores for All Models (Creditcard vs Fraud_Data)")
plt.xlabel("Model")
plt.ylabel("ROC-AUC")
plt.xticks(rotation=45)
plt.show()