In [15]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, SimpleRNN, LSTM
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides library warnings such as
# scikit-learn solver-convergence notices — confirm that is intended.
warnings.simplefilter('ignore')

# Apply seaborn's white-grid theme to all plots drawn after this point
sns.set(style="whitegrid")

In [16]:
# Read the two CSV inputs from the shared data directory:
#   - the cleaned fraud dataset
#   - the credit card dataset
fraud_data = pd.read_csv(filepath_or_buffer='../data/fraud_cleaned_data.csv')
credit_data = pd.read_csv(filepath_or_buffer='../data/creditcard.csv')

In [17]:
# Display the dimensions of each loaded table as a pair of (rows, columns) tuples
fraud_data.shape, credit_data.shape

((151112, 19), (284807, 31))

In [18]:
# Display the column labels of both tables (used below to pick the target columns)
fraud_data.columns, credit_data.columns

(Index(['purchase_value', 'age', 'ip_address', 'class', 'frequency', 'velocity',
        'hour_of_day', 'day_of_week', 'time_diff', 'signup_hour',
        'signup_day_of_week', 'purchase_day_of_week', 'source_Direct',
        'source_SEO', 'browser_FireFox', 'browser_IE', 'browser_Opera',
        'browser_Safari', 'sex_M'],
       dtype='object'),
 Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
        'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
        'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
        'Class'],
       dtype='object'))

In [19]:
# Credit card dataset: the label is stored in 'Class';
# every remaining column is treated as an independent feature.
y_creditcard = credit_data['Class']
X_creditcard = credit_data.drop('Class', axis=1)

# Fraud dataset: same separation, but the label column is lowercase 'class'.
y_fraud = fraud_data['class']
X_fraud = fraud_data.drop('class', axis=1)

In [20]:
# Hold out 20% of each dataset for evaluation. Stratifying on the label keeps
# the class ratio the same in both partitions, and the fixed seed makes the
# split repeatable.

# Credit card data
(
    X_train_creditcard,
    X_test_creditcard,
    y_train_creditcard,
    y_test_creditcard,
) = train_test_split(
    X_creditcard,
    y_creditcard,
    stratify=y_creditcard,
    test_size=0.2,
    random_state=42,
)

# Fraud data
(
    X_train_fraud,
    X_test_fraud,
    y_train_fraud,
    y_test_fraud,
) = train_test_split(
    X_fraud,
    y_fraud,
    stratify=y_fraud,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Class distribution of the training labels before any resampling
# (fraud dataset first, then credit card dataset)
for train_labels in (y_train_fraud, y_train_creditcard):
    print(np.unique(train_labels, return_counts=True))

(array([0, 1], dtype=int64), array([109568,  11321], dtype=int64))
(array([0, 1], dtype=int64), array([227451,    394], dtype=int64))


In [22]:
# Oversample with SMOTE. Only the training partitions are resampled;
# the test partitions are left as they came out of the split.
# NOTE(review): the recorded output of this cell shows MLflow autologging runs
# being created, although autolog is enabled in a later cell — the notebook
# may have been executed out of order; confirm with Restart & Run All.
from imblearn.over_sampling import SMOTE

# Credit card data
smote_creditcard = SMOTE(random_state=42)
(
    X_train_creditcard_resampled,
    y_train_creditcard_resampled,
) = smote_creditcard.fit_resample(X_train_creditcard, y_train_creditcard)

# Fraud data
smote_fraud = SMOTE(random_state=42)
(
    X_train_fraud_resampled,
    y_train_fraud_resampled,
) = smote_fraud.fit_resample(X_train_fraud, y_train_fraud)


2024/10/23 22:15:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '691f0349eb2843af941b0240a3f5b956', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/10/23 22:15:33 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '029c6dbf25d84676be5afe08561afdaa', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [23]:
# Class distribution of the training labels after SMOTE
# (fraud dataset first, then credit card dataset)
for resampled_labels in (y_train_fraud_resampled, y_train_creditcard_resampled):
    print(np.unique(resampled_labels, return_counts=True))

(array([0, 1], dtype=int64), array([109568, 109568], dtype=int64))
(array([0, 1], dtype=int64), array([227451, 227451], dtype=int64))


In [24]:
# Enable MLflow autologging for scikit-learn so that estimator fits performed
# after this call are recorded as MLflow runs without explicit logging calls.
mlflow.sklearn.autolog()

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix, classification_report

# Train a logistic regression on the SMOTE-resampled fraud training data,
# evaluate it on the untouched test partition, and record the results in a
# single MLflow run.
with mlflow.start_run():
    # Model training
    model = LogisticRegression(max_iter=100)
    model.fit(X_train_fraud_resampled, y_train_fraud_resampled)

    # Predict on the test set
    y_pred_fraud = model.predict(X_test_fraud)

    # Evaluate the model.
    # NOTE(review): 'weighted' averages each class's score by its support, so
    # with this class imbalance the figures mostly reflect the majority class;
    # consider also reporting the positive-class scores.
    accuracy_fraud = accuracy_score(y_test_fraud, y_pred_fraud)
    precision_fraud = precision_score(y_test_fraud, y_pred_fraud, average='weighted')
    recall_fraud = recall_score(y_test_fraud, y_pred_fraud, average='weighted')
    f1_fraud = f1_score(y_test_fraud, y_pred_fraud, average='weighted')

    # Collect the scores once so printing and logging stay in sync
    fraud_scores = {
        "Accuracy": accuracy_fraud,
        "Precision": precision_fraud,
        "Recall": recall_fraud,
        "F1 Score": f1_fraud,
    }

    # Display metrics for the fraud data
    print("\nFraud Data Metrics:")
    for score_name, score_value in fraud_scores.items():
        print(f"{score_name}: {score_value:.4f}")

    # Log metrics manually (in addition to whatever autologging records)
    for score_name, score_value in fraud_scores.items():
        mlflow.log_metric(score_name, score_value)

    # Log the model to MLflow
    mlflow.sklearn.log_model(model, "logistic_regression_model")

    # Log parameters. Values are read back from the estimator rather than
    # repeated as literals, so they cannot drift from the constructor call
    # or from the library's actual default solver.
    fitted_params = model.get_params()
    mlflow.log_param("max_iter", fitted_params["max_iter"])
    mlflow.log_param("solver", fitted_params["solver"])

    # Optionally save artifacts (like plots)
    # Example: save confusion matrix, ROC curve, etc.

    # Print the run ID for reference
    run_id = mlflow.active_run().info.run_id
    print(f"Run ID: {run_id}")


Fraud Data Metrics:
Accuracy: 0.6794
Precision: 0.8814
Recall: 0.6794
F1 Score: 0.7458




Run ID: 1d35b9af9f4e4d73bfa790ac9b25f6cc


In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix, classification_report

# Train a logistic regression on the SMOTE-resampled credit card training
# data, evaluate it on the untouched test partition, and record the results
# in a single MLflow run.
with mlflow.start_run():
    # Model training
    model = LogisticRegression(max_iter=100)
    model.fit(X_train_creditcard_resampled, y_train_creditcard_resampled)

    # Predict on the test set
    y_pred_creditcard = model.predict(X_test_creditcard)

    # Evaluate the model.
    # NOTE(review): 'weighted' averages each class's score by its support, so
    # with this class imbalance the figures mostly reflect the majority class;
    # consider also reporting the positive-class scores.
    accuracy_creditcard = accuracy_score(y_test_creditcard, y_pred_creditcard)
    precision_creditcard = precision_score(y_test_creditcard, y_pred_creditcard, average='weighted')
    recall_creditcard = recall_score(y_test_creditcard, y_pred_creditcard, average='weighted')
    f1_creditcard = f1_score(y_test_creditcard, y_pred_creditcard, average='weighted')

    # Collect the scores once so printing and logging stay in sync
    creditcard_scores = {
        "Accuracy": accuracy_creditcard,
        "Precision": precision_creditcard,
        "Recall": recall_creditcard,
        "F1 Score": f1_creditcard,
    }

    # Display metrics for the credit card data.
    # The heading previously read "Fraud Data Metrics:", a copy-paste leftover
    # that mislabelled these results.
    print("\nCredit Card Data Metrics:")
    for score_name, score_value in creditcard_scores.items():
        print(f"{score_name}: {score_value:.4f}")

    # Log metrics manually (in addition to whatever autologging records)
    for score_name, score_value in creditcard_scores.items():
        mlflow.log_metric(score_name, score_value)

    # Log the model to MLflow
    mlflow.sklearn.log_model(model, "logistic_regression_model")

    # Log parameters. Values are read back from the estimator rather than
    # repeated as literals, so they cannot drift from the constructor call
    # or from the library's actual default solver.
    fitted_params = model.get_params()
    mlflow.log_param("max_iter", fitted_params["max_iter"])
    mlflow.log_param("solver", fitted_params["solver"])

    # Optionally save artifacts (like plots)
    # Example: save confusion matrix, ROC curve, etc.

    # Print the run ID for reference
    run_id = mlflow.active_run().info.run_id
    print(f"Run ID: {run_id}")


Fraud Data Metrics:
Accuracy: 0.9775
Precision: 0.9982
Recall: 0.9775
F1 Score: 0.9871




Run ID: 6f93090bd2674e26aa60e57fea39081c
