In [1]:
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import tensorflow as tf

In [2]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')
X_oot = pd.read_csv('X_oot.csv')
y_oot = pd.read_csv('y_oot.csv')
y_train = y_train['Fraud']
y_test = y_test['Fraud']
y_oot = y_oot['Fraud']
X_train = X_train.drop(['Unnamed: 0'], axis=1)
X_test = X_test.drop(['Unnamed: 0'], axis=1)
X_oot = X_oot.drop(['Unnamed: 0'], axis=1)

In [None]:
def create_CNN():
    model = models.Sequential([
        layers.Reshape((82, 1), input_shape=(82,)),  # Reshape for CNN input
        layers.Conv1D(64, 3, activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(2),
        layers.Dropout(0.3),
        
        layers.Conv1D(128, 3, activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(2),
        layers.Dropout(0.3),
    
        layers.Flatten(),
        layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

In [None]:
def create_RNN():
    model = models.Sequential([
        layers.Reshape((82, 1), input_shape=(82,)),  # Reshape for RNN input
        layers.SimpleRNN(128, activation='relu', return_sequences=True),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.SimpleRNN(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

In [None]:
def create_LSTM():
    model = models.Sequential([
        layers.Reshape((82, 1), input_shape=(82,)),  # Reshape for LSTM input
        layers.LSTM(128, activation='relu', return_sequences=True),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.LSTM(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


In [None]:
models = [
    ('CNN_Model', create_CNN()),
    ('RNN_Model', create_RNN()),
    ('LSTM_Model', create_LSTM())
]

models_oot = [
    ('CNN_Model', create_model_1()),
    ('RNN_Model', create_model_2()),
    ('Dense_Model', create_model_3())
]
# List to store results
results = []
results_oot = []

In [5]:
# Loop through models
for model_name, model in models:
    # Define early stopping
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,  # Number of epochs with no improvement after which training will be stopped
        restore_best_weights=True
    )
    
    # Train the model with early stopping
    history = model.fit(X_train, y_train,
                        epochs=10,
                        batch_size=64,
                        validation_data=(X_oot, y_oot),
                        callbacks=[early_stopping])
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred = np.where(y_pred < 0.2, 0, 1)
    y_pred = y_pred.reshape(y_test.shape)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    precision = report['macro avg']['precision']
    recall = report['macro avg']['recall']
    f1_score_val = report['macro avg']['f1-score']
    
    # Calculate confusion matrix values
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Add predictions and true labels to the DataFrame
    data = pd.DataFrame(X_test, columns=['Amount'])  # Include other columns if needed
    data['predicted'] = y_pred
    data['actual'] = y_test
    
    # Filter out true positive cases
    true_positives = data[(data['predicted'] == 1) & (data['actual'] == 1)]
    
    # Get the sum of amounts for true positives
    sum_amounts_true_positives = true_positives['Amount'].sum()
    
    # Calculate FDR for regular data
    top_percentage = 0.03  # Top 3% of cases
    top_records = int(len(data) * top_percentage)
    data = data.sort_values(by='predicted', ascending=False)
    top_records_data = data.head(top_records)
    fraud_cases = sum(top_records_data['actual'])
    FDR = fraud_cases / sum(data['actual'])
    
    # Create a dictionary with results
    model_results = {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1_score_val,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn,
        'Amount_Saved': sum_amounts_true_positives,
        'FDR': FDR
    }
    
    # Append results to list
    results_oot.append(model_results)
    
    print(model_name + ' done')

# Convert results list to DataFrame
results_df = pd.DataFrame(results)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CNN_Model done
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
RNN_Model done
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Dense_Model done


In [6]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,TP,FP,TN,FN,Amount_Saved,FDR
0,CNN_Model,0.981405,0.631491,0.715165,0.662667,78,212,16283,98,345662.03,0.448864
1,RNN_Model,0.989323,0.661446,0.505561,0.508305,2,4,16491,174,36854.26,0.039773
2,Dense_Model,0.989323,0.736925,0.578636,0.61697,28,30,16465,148,149166.78,0.181818


In [None]:
best_model_indice = results_df['F1 Score'].idxmax()

In [None]:
plt.figure(figsize=(20, 20))
sns.barplot(x=X_train.columns, y=models[best_model_indice][1].feature_importances_)
plt.xticks(rotation=45)
plt.show()

In [11]:
# Loop through models
for model_name, model in models_oot:
    # Define early stopping
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,  # Number of epochs with no improvement after which training will be stopped
        restore_best_weights=True
    )
    
    # Train the model with early stopping
    history = model.fit(X_train, y_train,
                        epochs=10,
                        batch_size=64,
                        validation_data=(X_test, y_test),
                        callbacks=[early_stopping])
    
    # Make predictions
    y_pred = model.predict(X_oot)
    y_pred = np.where(y_pred < 0.2, 0, 1)
    y_pred = y_pred.reshape(y_oot.shape)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_oot, y_pred)
    report = classification_report(y_oot, y_pred, output_dict=True)
    precision = report['macro avg']['precision']
    recall = report['macro avg']['recall']
    f1_score_val = report['macro avg']['f1-score']
    
    # Calculate confusion matrix values
    cm = confusion_matrix(y_oot, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Add predictions and true labels to the DataFrame
    data = pd.DataFrame(X_oot, columns=['Amount'])  # Include other columns if needed
    data['predicted'] = y_pred
    data['actual'] = y_oot
    
    # Filter out true positive cases
    true_positives = data[(data['predicted'] == 1) & (data['actual'] == 1)]
    
    # Get the sum of amounts for true positives
    sum_amounts_true_positives = true_positives['Amount'].sum()
    
    # Calculate FDR for regular data
    top_percentage = 0.03  # Top 3% of cases
    top_records = int(len(data) * top_percentage)
    data = data.sort_values(by='predicted', ascending=False)
    top_records_data = data.head(top_records)
    fraud_cases = sum(top_records_data['actual'])
    FDR = fraud_cases / sum(data['actual'])
    
    # Create a dictionary with results
    model_results = {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1_score_val,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn,
        'Amount_Saved': sum_amounts_true_positives,
        'FDR': FDR
    }
    
    # Append results to list
    results.append(model_results)
    
    print(model_name + ' done')

# Convert results list to DataFrame
results_df = pd.DataFrame(results)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CNN_Model done
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
RNN_Model done
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Dense_Model done


In [12]:
results_df_oot

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,TP,FP,TN,FN,Amount_Saved,FDR
0,CNN_Model,0.981405,0.631491,0.715165,0.662667,78,212,16283,98,345662.03,0.448864
1,RNN_Model,0.989323,0.661446,0.505561,0.508305,2,4,16491,174,36854.26,0.039773
2,Dense_Model,0.989323,0.736925,0.578636,0.61697,28,30,16465,148,149166.78,0.181818
3,CNN_Model,0.983156,0.686656,0.631653,0.654152,48,77,12034,130,112696.18,0.297753
4,RNN_Model,0.996826,0.941246,0.948571,0.944877,160,21,12090,18,154188.3,0.898876
5,Dense_Model,0.983074,0.667563,0.592863,0.618909,34,64,12047,144,105392.25,0.213483


In [None]:
best_model_indice = results_df_oot['F1 Score'].idxmax()

In [None]:
plt.figure(figsize=(20, 20))
sns.barplot(x=X_train.columns, y=models_oot[best_model_indice][1].feature_importances_)
plt.xticks(rotation=45)
plt.show()