In [2]:
# Import Statements
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
import tensorflow as tf
import mlflow
import mlflow.sklearn
import mlflow.tensorflow
import os
import logging


In [3]:
# Setup Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [4]:
# Define Dataset Paths
credit_card_file_path = r'C:\Users\hayyu.ragea\AppData\Local\Programs\Python\Python312\ecommerce_fraud_detection_system\Data\processed\creditcard_cleaned.csv'
fraud_data_file_path = r'C:\Users\hayyu.ragea\AppData\Local\Programs\Python\Python312\ecommerce_fraud_detection_system\Data\processed\merged_data.csv'


In [5]:
# Load Datasets with Error Handling
def load_datasets():
    try:
        fraud_data = pd.read_csv(fraud_data_file_path)
        creditcard_data = pd.read_csv(credit_card_file_path)
        logging.info("Datasets loaded successfully.")
        return fraud_data, creditcard_data
    except FileNotFoundError as e:
        logging.error(f"Error loading dataset: {e}")
        raise

fraud_data, creditcard_data = load_datasets()


2024-10-26 22:13:40,685 - INFO - Datasets loaded successfully.


In [6]:
# Data Preparation - Feature and Target Separation
def prepare_data(fraud_data, creditcard_data):
    X_fraud = fraud_data.drop('class', axis=1)
    y_fraud = fraud_data['class']
    X_credit = creditcard_data.drop('Class', axis=1)
    y_credit = creditcard_data['Class']
    return X_fraud, y_fraud, X_credit, y_credit

X_fraud, y_fraud, X_credit, y_credit = prepare_data(fraud_data, creditcard_data)


In [7]:
# Preprocessing Function
def preprocess_data(X):
    for col in X.select_dtypes(include=['object']):
        try:
            X[col] = pd.to_datetime(X[col], format='%Y-%m-%d', errors='coerce').astype(np.int64) // 10**9
        except ValueError:
            X.drop(col, axis=1, inplace=True)
    X.dropna(axis=1, how='all', inplace=True)
    X = X.loc[:, X.isnull().mean() < 0.5]
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X)
    logging.info("Data preprocessing completed.")
    return pd.DataFrame(X_imputed, columns=X.columns)

X_fraud = preprocess_data(X_fraud)
X_credit = preprocess_data(X_credit)


2024-10-26 22:13:42,055 - INFO - Data preprocessing completed.
2024-10-26 22:13:42,411 - INFO - Data preprocessing completed.


In [8]:
# Train-Test Split
def split_data(X, y):
    return train_test_split(X, y, test_size=0.2, random_state=42)

X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = split_data(X_fraud, y_fraud)
X_train_credit, X_test_credit, y_train_credit, y_test_credit = split_data(X_credit, y_credit)


In [9]:
# Standard Scaling
def scale_data(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

X_train_fraud_scaled, X_test_fraud_scaled = scale_data(X_train_fraud, X_test_fraud)
X_train_credit_scaled, X_test_credit_scaled = scale_data(X_train_credit, X_test_credit)


In [10]:
# Model Selection - Initialize Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "MLP": MLPClassifier(max_iter=500),
}
mlflow.set_experiment("Fraud Detection Experiment")


2024/10/26 22:13:42 INFO mlflow.tracking.fluent: Experiment with name 'Fraud Detection Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/hayyu.ragea/AppData/Local/Programs/Python/Python312/ecommerce_fraud_detection_system/notebooks/mlruns/515474713450816704', creation_time=1729970022958, experiment_id='515474713450816704', last_update_time=1729970022958, lifecycle_stage='active', name='Fraud Detection Experiment', tags={}>

In [11]:
#Model Training and Evaluation Function
def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test, X_test_original):
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_test)
        
        report = classification_report(y_test, y_pred)
        auc = roc_auc_score(y_test, y_proba)
        
        mlflow.log_param("model", model_name)
        mlflow.log_metric("roc_auc", auc)
        mlflow.log_text(report, "classification_report.txt")
        
        if not X_test_original.empty:
            try:
                input_example = X_test_original.iloc[:1]
                mlflow.sklearn.log_model(model, model_name, input_example=input_example)
            except Exception as e:
                logging.error(f"Error logging model: {e}")
        
        logging.info(f"Model: {model_name}\n{report}\nAUC: {auc}")

# Training models on Fraud Data
for model_name, model in models.items():
    train_and_evaluate_model(model_name, model, X_train_fraud_scaled, y_train_fraud, X_test_fraud_scaled, y_test_fraud, X_test_fraud)

# Training models on Credit Card Data
for model_name, model in models.items():
    train_and_evaluate_model(model_name, model, X_train_credit_scaled, y_train_credit, X_test_credit_scaled, y_test_credit, X_test_credit)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2024-10-26 22:13:55,378 - INFO - Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27373
           1       0.00      0.00      0.00      2850

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223

AUC: 0.7610801397971236
2024-10-26 22:14:02,428 - INFO - Model: Decision Tree
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     27373
           1       0.50      0.56      0.53      2850

    accuracy                           0.91     30223
   macro avg       0.73      0.75      0.74     30223
weighted avg       0.91      0.91      0.91 

In [12]:
#Build and Train Deep Learning Models
def build_and_train_nn_model(model_type, X_train, y_train, X_test, y_test, input_shape):
    input_layer = tf.keras.layers.Input(shape=input_shape)
    if model_type == "CNN":
        x = tf.keras.layers.Conv1D(64, 3, activation="relu")(input_layer)
        x = tf.keras.layers.MaxPooling1D(pool_size=2)(x)
        x = tf.keras.layers.Flatten()(x)
    elif model_type == "RNN":
        x = tf.keras.layers.SimpleRNN(64)(input_layer)
    elif model_type == "LSTM":
        x = tf.keras.layers.LSTM(64)(input_layer)
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    
    model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    with mlflow.start_run(run_name=f"{model_type} Model"):
        history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)
        
        if len(X_test) > 0:
            try:
                input_example = X_test[:1]
                signature = mlflow.models.infer_signature(X_train, model.predict(X_train[:1]))
                mlflow.keras.log_model(model, model_type, input_example=input_example, signature=signature)
                mlflow.log_artifacts("classification_report.txt")
            except Exception as e:
                logging.error(f"Error logging model: {e}")

        for epoch, acc in enumerate(history.history['accuracy']):
            mlflow.log_metric("train_accuracy", acc, step=epoch)
        for epoch, val_acc in enumerate(history.history['val_accuracy']):
            mlflow.log_metric("val_accuracy", val_acc, step=epoch)

    return model

# Reshape data for deep learning models
X_train_fraud_reshaped = X_train_fraud_scaled.reshape(-1, X_train_fraud_scaled.shape[1], 1)
X_test_fraud_reshaped = X_test_fraud_scaled.reshape(-1, X_test_fraud_scaled.shape[1], 1)

X_train_credit_reshaped = X_train_credit_scaled.reshape(-1, X_train_credit_scaled.shape[1], 1)
X_test_credit_reshaped = X_test_credit_scaled.reshape(-1, X_test_credit_scaled.shape[1], 1)

# Build and Train CNN, RNN, LSTM Models
cnn_model = build_and_train_nn_model("CNN", X_train_fraud_reshaped, y_train_fraud, X_test_fraud_reshaped, y_test_fraud, (X_train_fraud_reshaped.shape[1], 1))
rnn_model = build_and_train_nn_model("RNN", X_train_fraud_reshaped, y_train_fraud, X_test_fraud_reshaped, y_test_fraud, (X_train_fraud_reshaped.shape[1], 1))
lstm_model = build_and_train_nn_model("LSTM", X_train_fraud_reshaped, y_train_fraud, X_test_fraud_reshaped, y_test_fraud, (X_train_fraud_reshaped.shape[1], 1))


Epoch 1/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4ms/step - accuracy: 0.9038 - loss: 0.3306 - val_accuracy: 0.9057 - val_loss: 0.3125
Epoch 2/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step - accuracy: 0.9056 - loss: 0.3133 - val_accuracy: 0.9057 - val_loss: 0.3131
Epoch 3/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.9058 - loss: 0.3122 - val_accuracy: 0.9057 - val_loss: 0.3156
Epoch 4/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.9080 - loss: 0.3075 - val_accuracy: 0.9057 - val_loss: 0.3126
Epoch 5/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9078 - loss: 0.3077 - val_accuracy: 0.9057 - val_loss: 0.3123
Epoch 6/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.9058 - loss: 0.3126 - val_accuracy: 0.9057 - val_loss: 0.3126
Epoch 7/10

2024-10-26 22:32:55,967 - ERROR - Error logging model: [Errno 2] No such file or directory: 'C:\\Users\\HAYYU~1.RAG\\AppData\\Local\\Temp\\tmpp2jjxcr2\\model\\input_example.json'


Epoch 1/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 5ms/step - accuracy: 0.9000 - loss: 0.2811 - val_accuracy: 0.9057 - val_loss: 0.2494
Epoch 2/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - accuracy: 0.9203 - loss: 0.2384 - val_accuracy: 0.9470 - val_loss: 0.2320
Epoch 3/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9338 - loss: 0.2296 - val_accuracy: 0.9477 - val_loss: 0.2253
Epoch 4/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.9363 - loss: 0.2263 - val_accuracy: 0.9464 - val_loss: 0.2224
Epoch 5/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.9389 - loss: 0.2239 - val_accuracy: 0.9479 - val_loss: 0.2209
Epoch 6/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.9410 - loss: 0.2197 - val_accuracy: 0.9435 - val_loss: 0.2162
Epoch 7/10

2024-10-26 22:35:24,759 - ERROR - Error logging model: [Errno 2] No such file or directory: 'C:\\Users\\HAYYU~1.RAG\\AppData\\Local\\Temp\\tmpbp5d_4bx\\model\\input_example.json'


Epoch 1/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 8ms/step - accuracy: 0.9023 - loss: 0.2865 - val_accuracy: 0.9057 - val_loss: 0.2386
Epoch 2/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 7ms/step - accuracy: 0.9268 - loss: 0.2369 - val_accuracy: 0.9307 - val_loss: 0.2320
Epoch 3/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 6ms/step - accuracy: 0.9391 - loss: 0.2281 - val_accuracy: 0.9402 - val_loss: 0.2219
Epoch 4/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 7ms/step - accuracy: 0.9420 - loss: 0.2209 - val_accuracy: 0.9435 - val_loss: 0.2182
Epoch 5/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 7ms/step - accuracy: 0.9421 - loss: 0.2176 - val_accuracy: 0.9355 - val_loss: 0.2188
Epoch 6/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 7ms/step - accuracy: 0.9440 - loss: 0.2149 - val_accuracy: 0.9494 - val_loss: 0.2140
Epoch 7/10

2024-10-26 22:39:54,776 - ERROR - Error logging model: [Errno 2] No such file or directory: 'C:\\Users\\HAYYU~1.RAG\\AppData\\Local\\Temp\\tmp39g7noae\\model\\input_example.json'
