In [1]:
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import models, layers, regularizers
import numpy as np
import tensorflow as tf
import pickle 

In [2]:
X_train = pd.read_csv('../Data/X_train.csv')
y_train = pd.read_csv('../Data/y_train.csv')
X_test = pd.read_csv('../Data/X_test.csv')
y_test = pd.read_csv('../Data/y_test.csv')
X_oot = pd.read_csv('../Data/X_oot.csv')
y_oot = pd.read_csv('../Data/y_oot.csv')
y_train = y_train['Fraud']
y_test = y_test['Fraud']
y_oot = y_oot['Fraud']
X_train = X_train.drop(['Unnamed: 0'], axis=1)
X_test = X_test.drop(['Unnamed: 0'], axis=1)
X_oot = X_oot.drop(['Unnamed: 0'], axis=1)

In [3]:
def create_CNN():
    cnn_model = models.Sequential([
        layers.Reshape((82, 1), input_shape=(82,)),  # Reshape for CNN input
        layers.Conv1D(64, 3, activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(2),
        layers.Dropout(0.3),
        
        layers.Conv1D(128, 3, activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(2),
        layers.Dropout(0.3),
    
        layers.Flatten(),
        layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    cnn_model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return cnn_model

In [4]:
def create_RNN():
    rnn_model = models.Sequential([
        layers.Reshape((82, 1), input_shape=(82,)),  # Reshape for RNN input
        layers.SimpleRNN(128, activation='relu', return_sequences=True),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.SimpleRNN(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    rnn_model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return rnn_model

In [5]:
def create_LSTM():
    lstm_model = models.Sequential([
        layers.Reshape((82, 1), input_shape=(82,)),  # Reshape for LSTM input
        layers.LSTM(128, activation='relu', return_sequences=True),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.LSTM(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    lstm_model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return lstm_model


In [6]:
models = [
    ('CNN_Model', create_CNN()),
    ('RNN_Model', create_RNN()),
    ('LSTM_Model', create_LSTM())
]


# List to store results
results = []
results_oot = []

In [7]:
# Loop through models
for model_name, model in models:
    # Define early stopping
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,  # Number of epochs with no improvement after which training will be stopped
        restore_best_weights=True
    )
    
    # Train the model with early stopping
    history = model.fit(X_train, y_train,
                        epochs=10,
                        batch_size=64,
                        validation_data=(X_oot, y_oot),
                        callbacks=[early_stopping])
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred = np.where(y_pred < 0.2, 0, 1)
    y_pred = y_pred.reshape(y_test.shape)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    precision = report['macro avg']['precision']
    recall = report['macro avg']['recall']
    f1_score_val = report['macro avg']['f1-score']
    
    # Calculate confusion matrix values
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Add predictions and true labels to the DataFrame
    data = pd.DataFrame(X_test, columns=['Amount'])  # Include other columns if needed
    data['predicted'] = y_pred
    data['actual'] = y_test
    
    # Filter out true positive cases
    true_positives = data[(data['predicted'] == 1) & (data['actual'] == 1)]
    
    # Get the sum of amounts for true positives
    sum_amounts_true_positives = true_positives['Amount'].sum()
    
    # Calculate FDR for regular data
    top_percentage = 0.03  # Top 3% of cases
    top_records = int(len(data) * top_percentage)
    data = data.sort_values(by='predicted', ascending=False)
    top_records_data = data.head(top_records)
    fraud_cases = sum(top_records_data['actual'])
    FDR = fraud_cases / sum(data['actual'])
    
    # Create a dictionary with results
    model_results = {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1_score_val,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn,
        'Amount_Saved': sum_amounts_true_positives,
        'FDR': FDR
    }
    
    # Append results to list
    results_oot.append(model_results)
    
    print(model_name + ' done')

# Convert results list to DataFrame
results_df = pd.DataFrame(results)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CNN_Model done
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
RNN_Model done
Epoch 1/10
Epoch 2/10
Epoch 3/10
LSTM_Model done


In [13]:
results_df = pd.DataFrame(results_oot)

In [14]:
best_model_indice = results_df['F1 Score'].idxmax()

In [18]:
with open('DL_Test_Model.pkl', 'wb') as outfile:
    pickle.dump(models[best_model_indice][1], outfile)

In [19]:
# Loop through models
for model_name, model in models:
    # Define early stopping
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,  # Number of epochs with no improvement after which training will be stopped
        restore_best_weights=True
    )
    
    # Train the model with early stopping
    history = model.fit(X_train, y_train,
                        epochs=10,
                        batch_size=64,
                        validation_data=(X_test, y_test),
                        callbacks=[early_stopping])
    
    # Make predictions
    y_pred = model.predict(X_oot)
    y_pred = np.where(y_pred < 0.2, 0, 1)
    y_pred = y_pred.reshape(y_oot.shape)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_oot, y_pred)
    report = classification_report(y_oot, y_pred, output_dict=True)
    precision = report['macro avg']['precision']
    recall = report['macro avg']['recall']
    f1_score_val = report['macro avg']['f1-score']
    
    # Calculate confusion matrix values
    cm = confusion_matrix(y_oot, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Add predictions and true labels to the DataFrame
    data = pd.DataFrame(X_oot, columns=['Amount'])  # Include other columns if needed
    data['predicted'] = y_pred
    data['actual'] = y_oot
    
    # Filter out true positive cases
    true_positives = data[(data['predicted'] == 1) & (data['actual'] == 1)]
    
    # Get the sum of amounts for true positives
    sum_amounts_true_positives = true_positives['Amount'].sum()
    
    # Calculate FDR for regular data
    top_percentage = 0.03  # Top 3% of cases
    top_records = int(len(data) * top_percentage)
    data = data.sort_values(by='predicted', ascending=False)
    top_records_data = data.head(top_records)
    fraud_cases = sum(top_records_data['actual'])
    FDR = fraud_cases / sum(data['actual'])
    
    # Create a dictionary with results
    model_results = {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1_score_val,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn,
        'Amount_Saved': sum_amounts_true_positives,
        'FDR': FDR
    }
    
    # Append results to list
    results.append(model_results)
    
    print(model_name + ' done')

# Convert results list to DataFrame
results_df_oot = pd.DataFrame(results)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
CNN_Model done
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
RNN_Model done
Epoch 1/10
Epoch 2/10
Epoch 3/10
LSTM_Model done


In [20]:
results_df_oot

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,TP,FP,TN,FN,Amount_Saved,FDR
0,CNN_Model,0.982505,0.639838,0.570433,0.592956,26,63,12048,152,103082.17,0.174157
1,RNN_Model,0.996338,0.945632,0.923414,0.93423,151,18,12093,27,120446.33,0.848315
2,LSTM_Model,0.014484,0.007242,0.5,0.014278,178,12111,0,0,168579.0,0.061798


In [21]:
best_model_indice = results_df_oot['F1 Score'].idxmax()

In [22]:
with open('DL_OOT_Model.pkl', 'wb') as outfile:
    pickle.dump(models[best_model_indice][1], outfile)