In [1]:
import pandas as pd
import mlflow
import mlflow.sklearn
import mlflow.tensorflow
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from typing import Tuple, Dict, Any


In [2]:
# Load your data
def load_data(file_paths: Tuple[str, str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    fraud_data = pd.read_csv(file_paths[0])
    credit_data = pd.read_csv(file_paths[1])
    return fraud_data, credit_data


In [3]:
# Preprocess data by converting non-numeric types and handling missing values
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    # Convert date columns to numeric (timestamp)
    for col in data.select_dtypes(include=['object']):
        try:
            data[col] = pd.to_datetime(data[col]).astype(int) // 10**9  # Convert to UNIX timestamp
        except (ValueError, TypeError):
            # If it can't be converted, you may decide to drop it or encode it
            data.drop(columns=[col], inplace=True)

    # Handle missing values (if any)
    data.fillna(0, inplace=True)  # Or use other imputation methods

    return data


In [4]:
# Prepare data for modeling
def prepare_data(data: pd.DataFrame, target_column: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    data = preprocess_data(data)  # Preprocess the data here
    X = data.drop(columns=[target_column])
    y = data[target_column]
    return train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Evaluate a model
def evaluate_model(model: Any, X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series, model_name: str) -> None:
    with mlflow.start_run():
        # Train the model
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # For models that may output probabilities, binarize the output
        if predictions.ndim == 2 and predictions.shape[1] == 1:  # If it's a 2D array with one column
            predictions = (predictions > 0.5).astype(int)  # Convert probabilities to binary (0 or 1)
        elif predictions.ndim == 1 and not np.issubdtype(predictions.dtype, np.integer):
            predictions = (predictions > 0.5).astype(int)  # Same conversion for 1D continuous outputs
        
        # Evaluate the model
        accuracy = accuracy_score(y_test, predictions)
        report = classification_report(y_test, predictions)

        # Log parameters, metrics, and model
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("num_features", X_train.shape[1])  # Number of features
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_text(report, "classification_report.txt")
        
        # Log models depending on type
        if isinstance(model, keras.Model):
            mlflow.tensorflow.log_model(model, model_name)
        else:
            mlflow.sklearn.log_model(model, model_name)

        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy}")
        print(report)


In [6]:
# Build CNN model
def build_cnn_model(input_shape: Tuple[int, int, int]) -> keras.Model:
    model = keras.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build RNN model
def build_rnn_model(input_shape: Tuple[int, int]) -> keras.Model:
    model = keras.Sequential([
        layers.SimpleRNN(50, input_shape=input_shape, return_sequences=True),
        layers.SimpleRNN(50),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build LSTM model
def build_lstm_model(input_shape: Tuple[int, int]) -> keras.Model:
    model = keras.Sequential([
        layers.LSTM(50, input_shape=input_shape, return_sequences=True),
        layers.LSTM(50),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [7]:
# Automatically adjust reshaping
def auto_reshape(X: pd.DataFrame, model_type: str) -> Tuple:
    num_samples, num_features = X.shape
    
    if model_type == 'cnn':
        side_length = int(num_features ** 0.5)
        if side_length ** 2 != num_features:
            raise ValueError(f"Cannot reshape array of size {X.size} into a square for CNN.")
        return X.values.reshape(num_samples, side_length, side_length, 1)
    
    elif model_type in ['rnn', 'lstm']:
        return X.values.reshape(num_samples, num_features, 1)
    
    else:
        raise ValueError(f"Unsupported model type for reshaping: {model_type}")


In [8]:
# Evaluate models
def evaluate_models(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> None:
    models: Dict[str, Any] = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "MLP Classifier": MLPClassifier(max_iter=1000),
    }

    # Evaluate classical machine learning models
    for model_name, model in models.items():
        evaluate_model(model, X_train, X_test, y_train, y_test, model_name)

    # Reshape data for CNN, RNN, LSTM
    try:
        X_train_cnn = auto_reshape(X_train, 'cnn')
        X_test_cnn = auto_reshape(X_test, 'cnn')
        evaluate_model(build_cnn_model(X_train_cnn.shape[1:]), X_train_cnn, X_test_cnn, y_train, y_test, "CNN Model")
    except ValueError as e:
        print(f"Skipping CNN evaluation: {e}")

    # Reshape for RNN and LSTM
    try:
        X_train_rnn = auto_reshape(X_train, 'rnn')
        X_test_rnn = auto_reshape(X_test, 'rnn')
        
        evaluate_model(build_rnn_model(X_train_rnn.shape[1:]), X_train_rnn, X_test_rnn, y_train, y_test, "RNN Model")
        evaluate_model(build_lstm_model(X_train_rnn.shape[1:]), X_train_rnn, X_test_rnn, y_train, y_test, "LSTM Model")
    except ValueError as e:
        print(f"Skipping RNN/LSTM evaluation: {e}")


In [9]:
# Set experiment name and create it if it doesn't exist
experiment_name = "fraud_detection_experiment"

# Set the tracking URI for MLflow
mlflow.set_tracking_uri("file:./mlruns")  # Ensure this path is writable

# Create the experiment if it does not exist
try:
    mlflow.create_experiment(experiment_name)
except mlflow.exceptions.MlflowException as e:
    if "already exists" in str(e):
        print(f"Experiment '{experiment_name}' already exists. Using the existing experiment.")

# Set the current experiment
mlflow.set_experiment(experiment_name)

# Load data
file_paths = (
    r'C:\Users\hayyu.ragea\AppData\Local\Programs\Python\Python312\ecommerce_fraud_detection_system\Data\processed\fraud_cleaned.csv',
    r'C:\Users\hayyu.ragea\AppData\Local\Programs\Python\Python312\ecommerce_fraud_detection_system\Data\processed\creditcard_cleaned.csv'
)
fraud_data, credit_data = load_data(file_paths)

# Prepare and evaluate models for credit card data
X_train_cc, X_test_cc, y_train_cc, y_test_cc = prepare_data(credit_data, 'Class')
print("Evaluating models for Credit Card Data:")
evaluate_models(X_train_cc, X_test_cc, y_train_cc, y_test_cc)

# Prepare and evaluate models for fraud data
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = prepare_data(fraud_data, 'class')  # Ensure 'class' is the correct target column
print("Evaluating models for Fraud Data:")
evaluate_models(X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud)


Experiment 'fraud_detection_experiment' already exists. Using the existing experiment.
Evaluating models for Credit Card Data:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Logistic Regression
Accuracy: 0.9991188806259472
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.84      0.54      0.66        90

    accuracy                           1.00     56746
   macro avg       0.92      0.77      0.83     56746
weighted avg       1.00      1.00      1.00     56746





Model: Decision Tree
Accuracy: 0.9990131463010609
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.68      0.72      0.70        90

    accuracy                           1.00     56746
   macro avg       0.84      0.86      0.85     56746
weighted avg       1.00      1.00      1.00     56746





Model: Random Forest
Accuracy: 0.9995594403129736
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.99      0.73      0.84        90

    accuracy                           1.00     56746
   macro avg       0.99      0.87      0.92     56746
weighted avg       1.00      1.00      1.00     56746





Model: Gradient Boosting
Accuracy: 0.9992951045007578
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.89      0.63      0.74        90

    accuracy                           1.00     56746
   macro avg       0.95      0.82      0.87     56746
weighted avg       1.00      1.00      1.00     56746





Model: MLP Classifier
Accuracy: 0.9982377612518945
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.47      0.78      0.58        90

    accuracy                           1.00     56746
   macro avg       0.73      0.89      0.79     56746
weighted avg       1.00      1.00      1.00     56746

Skipping CNN evaluation: Cannot reshape array of size 6809400 into a square for CNN.


  super().__init__(**kwargs)


[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 12ms/step - accuracy: 0.9985 - loss: 0.0088
[1m1774/1774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step


  super().__init__(**kwargs)


Model: RNN Model
Accuracy: 0.9985725866140345
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.91      0.11      0.20        90

    accuracy                           1.00     56746
   macro avg       0.95      0.56      0.60     56746
weighted avg       1.00      1.00      1.00     56746

[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 25ms/step - accuracy: 0.9989 - loss: 0.0138
[1m1774/1774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step




Model: LSTM Model
Accuracy: 0.9992422373383146
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.76      0.77      0.76        90

    accuracy                           1.00     56746
   macro avg       0.88      0.88      0.88     56746
weighted avg       1.00      1.00      1.00     56746



  data[col] = pd.to_datetime(data[col]).astype(int) // 10**9  # Convert to UNIX timestamp
  data[col] = pd.to_datetime(data[col]).astype(int) // 10**9  # Convert to UNIX timestamp
  data[col] = pd.to_datetime(data[col]).astype(int) // 10**9  # Convert to UNIX timestamp
  data[col] = pd.to_datetime(data[col]).astype(int) // 10**9  # Convert to UNIX timestamp


Evaluating models for Fraud Data:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: Logistic Regression
Accuracy: 0.9057009562253913
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27373
           1       0.00      0.00      0.00      2850

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223





Model: Decision Tree
Accuracy: 0.906329616517222
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     27373
           1       0.50      0.56      0.53      2850

    accuracy                           0.91     30223
   macro avg       0.73      0.75      0.74     30223
weighted avg       0.91      0.91      0.91     30223





Model: Random Forest
Accuracy: 0.9564239155609966
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223





Model: Gradient Boosting
Accuracy: 0.956390828177216
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223



  super().__init__(**kwargs)


Model: MLP Classifier
Accuracy: 0.617939979485822
              precision    recall  f1-score   support

           0       0.95      0.61      0.74     27373
           1       0.16      0.71      0.26      2850

    accuracy                           0.62     30223
   macro avg       0.56      0.66      0.50     30223
weighted avg       0.88      0.62      0.70     30223

Skipping CNN evaluation: Cannot reshape array of size 604445 into a square for CNN.
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 5ms/step - accuracy: 0.9567 - loss: 0.1770
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step


  super().__init__(**kwargs)


Model: RNN Model
Accuracy: 0.9564239155609966
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223

[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 6ms/step - accuracy: 0.9496 - loss: 0.1942
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step




Model: LSTM Model
Accuracy: 0.9564239155609966
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223

