In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as snsi
import numpy as np
from pylab import rcParams
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
from tensorflow import keras
from keras.models import Model, load_model
from keras.layers import Input, Dense, Bidirectional
from tensorflow.keras import layers
from tensorflow.python.client import device_lib
from keras.layers import LSTM 
from keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras import activations
from keras import regularizers
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import recall_score, classification_report, auc, roc_curve
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import keras_tuner    
import pickle
from bayes_opt import BayesianOptimization
from sklearn.impute import SimpleImputer
from numpy.random import seed
tf.random.set_seed(42)
pd.set_option("display.max_columns", None)
import os
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
#import pycuda.autoinit
import gc
SEED = 42 #used to help randomly select the data points
DATA_SPLIT_PCT = 0.2
rcParams['figure.figsize'] = 8, 6
LABELS = ["Normal","Break"]
tf.random.set_seed(42)
pd.options.display.max_rows = 400


Num GPUs Available:  1


In [2]:
gpus = tf.config.list_physical_devices('GPU')

In [3]:
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_virtual_device_configuration(gpu,[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=10000000)])

In [4]:
def Data_Split(current_data, number_of_statements):
    unique_customer_ids = current_data.customer_ID.unique()
    grouped = current_data.groupby('customer_ID').size().reset_index(name="count")
    grouped = grouped.query("count == @number_of_statements")
    customer_ids_train, customer_ids_test = train_test_split(unique_customer_ids, test_size=0.3, random_state=42)
    x_train = current_data[(current_data['customer_ID'].isin(customer_ids_train)) & (current_data['customer_ID'].isin(grouped['customer_ID'])) ]
    x_test = current_data[(current_data['customer_ID'].isin(customer_ids_test)) & (current_data['customer_ID'].isin(grouped['customer_ID'])) ]
    y_train = x_train['target']
    y_test = x_test['target']
    
    unique_customer_ids_x_train = x_train.customer_ID.nunique()   
    unique_customer_ids_x_test = x_test.customer_ID.nunique()

    x_train = x_train.drop(columns=['customer_ID', 'S_2', 'RANK','target', 'count'])
    x_test = x_test.drop(columns=['customer_ID', 'S_2', 'RANK','target', 'count'])
    del current_data
    gc.collect()
    return x_train, y_train, x_test, y_test, unique_customer_ids_x_train, unique_customer_ids_x_test

In [5]:
def setScaler(x_data, x_values_to_scale):
    scaler = StandardScaler()
    scaler.fit(x_data[x_values_to_scale])
    return scaler


def setOneHotEncoder(x_data, categorical_cols):
    transformer = make_column_transformer((OneHotEncoder( handle_unknown='ignore'), 
                                       categorical_cols), remainder='passthrough')
    one_hot_transformer = transformer.fit(x_data)

    with open("C:\\Users\\Grant\\Desktop\\Data_Science\\AMEX\\ENCODERS\\" + str(number_of_statements) + '_encoder', "wb") as f: 
        pickle.dump(one_hot_transformer, f)
    
    return one_hot_transformer

In [6]:
def TransformData(x_data, y_data, numeric_scaler, numeric_cols, categorical_cols, column_means, one_hot_transformer, unique_customer_ids):

    for col in column_means.columns:
        x_data[col] = x_data[col].fillna(value = column_means[col][0])
    
    x_data[numeric_cols] = numeric_scaler.transform(x_data[numeric_cols])
    x_data[numeric_cols] = x_data[numeric_cols].fillna(0)
    x_data[categorical_cols] = x_data[categorical_cols].fillna('missing')
    x_data[categorical_cols] = x_data[categorical_cols].astype('string')
    x_data[categorical_cols] = x_data[categorical_cols].astype('category')
    

    #print(x_data.isnull().sum())
    
    x_data_only_cats = one_hot_transformer.transform(x_data)
    data_hot_encoded = pd.DataFrame(x_data_only_cats, index=x_data.index)
    #Extract only the columns that didnt need to be encoded
    x_data = x_data.drop(columns=categorical_cols)
    #Concatenate the two dataframes : 
    x_data = pd.concat([data_hot_encoded, x_data], axis=1)
    

    
    #Reshape
    number_of_features = x_data.shape[1]
    x_data = [x_data[c].values.reshape(unique_customer_ids, number_of_statements) for c in x_data.columns] #Change Here
    #full_reshaped_list = tuple(full_reshaped_list)
    x_data = np.hstack(
        (x_data)
    ).reshape(unique_customer_ids, number_of_features, number_of_statements).transpose(0, 2, 1) #Change Here
    
    y_data = y_data[::number_of_statements] #Change Here
    y_data = y_data.to_numpy()
    return x_data, y_data

In [7]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


def amex_metric_mod(y_true: tf.Tensor, y_pred: tf.Tensor) -> float:

    y_true = tf.cast(y_true, dtype=tf.float64)
    y_pred = tf.cast(y_pred, dtype=tf.float64)
    
    labels     = np.transpose(np.array([y_true, y_pred]))
    labels     = labels[labels[:, 1].argsort()[::-1]]
    weights    = np.where(labels[:,0]==0, 20, 1)
    cut_vals   = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four   = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])

    gini = [0,0]
    for i in [1,0]:
        labels         = np.transpose(np.array([y_true, y_pred]))
        labels         = labels[labels[:, i].argsort()[::-1]]
        weight         = np.where(labels[:,0]==0, 20, 1)
        weight_random  = np.cumsum(weight / np.sum(weight))
        total_pos      = np.sum(labels[:, 0] *  weight)
        cum_pos_found  = np.cumsum(labels[:, 0] * weight)
        lorentz        = cum_pos_found / total_pos
        gini[i]        = np.sum((lorentz - weight_random) * weight)

    return 0.5 * (gini[1]/gini[0] + top_four)



def amex_metric_tensorflow(y_true: tf.Tensor, y_pred: tf.Tensor) -> float:

    # convert dtypes to float64
    y_true = tf.cast(y_true, dtype=tf.float64)
    y_pred = tf.cast(y_pred, dtype=tf.float64)

    # count of positives and negatives
    n_pos = tf.math.reduce_sum(y_true)
    n_neg = tf.cast(tf.shape(y_true)[0], dtype=tf.float64) - n_pos

    # sorting by descring prediction values
    indices = tf.argsort(y_pred, axis=0, direction='DESCENDING')
    preds, target = tf.gather(y_pred, indices), tf.gather(y_true, indices)

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = tf.cumsum(weight / tf.reduce_sum(weight))
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = tf.reduce_sum(target[four_pct_filter]) / n_pos

    # weighted gini coefficient
    lorentz = tf.cumsum(target / n_pos)
    gini = tf.reduce_sum((lorentz - cum_norm_weight) * weight)

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

In [8]:
def Build_Model(x_train, y_train, x_test,y_test,number_of_statements):
    number_of_features = x_train.shape[2]
    num_units = x_train.shape[0]
    if num_units > 4000:
        num_units = 4000
    
    num_neurons = number_of_features
    model = keras.Sequential()
    model.add(LSTM(units=num_units, activation='relu',return_sequences = True,input_shape=(number_of_statements, number_of_features)))
    model.add(LSTM(units=num_units, return_sequences = True,activation='relu',input_shape=(number_of_statements, number_of_features)))
    model.add(LSTM(units=num_units, activation='relu',input_shape=(number_of_statements, number_of_features)))
    #model.add(tf.keras.layers.Dropout(recurrent_dropout))
    model.add(layers.Dense(num_neurons, activation = 'relu'))
    model.add(layers.Dense(num_neurons/4, activation = 'relu'))
    model.add(layers.Dense(1, activation = 'sigmoid'))
        
    epochs = 30
    batch_size = 32

    callbacks = [
        keras.callbacks.ModelCheckpoint(
            "D:\\AMEX\\best_model" + str(number_of_statements) + "_Statements.h5", save_best_only=True, monitor="val_amex_metric_tensorflow"
        ),
        keras.callbacks.ReduceLROnPlateau(
            monitor="val_loss", factor=0.5, patience=20, min_lr=0.0001
        ),
        keras.callbacks.EarlyStopping(monitor='val_accuracy',  
                   patience=5)
    ]
    optimizer = keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(
          optimizer=optimizer,
            loss='binary_crossentropy',
                  metrics=['accuracy',f1_m,precision_m, recall_m, amex_metric_tensorflow],
        )


    default_model = model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data = (x_test,y_test),
        callbacks=callbacks,
        verbose=1,
    )
    
    del default_model
    del model
    gc.collect()


# Run PreProcessing

In [9]:
#Column Definitions
column_means = pd.read_csv("C:\\Users\\Grant\\Desktop\\Data_Science\\AMEX\\AMEX_COLUMN_MEANS.csv")
category_column = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68'] 


In [10]:
for number_of_statements in range(1,13):
    print("Processing: File With", number_of_statements, "Statements")
    current_data = pd.read_csv("C:\\Users\\Grant\\Desktop\\Data_Science\\AMEX\\Balanced_Datasets\\balanced_train_"+str(number_of_statements) + "_statement.csv")
    x_train, y_train, x_test, y_test, unique_customer_ids_x_train, unique_customer_ids_x_test = Data_Split(current_data, number_of_statements)

    #Numeric columns
    numeric_columns = x_train.select_dtypes(include=['number','float64']).columns
    numeric_cols = x_train.columns
    x_values_to_scale= [x for x in numeric_cols if x not in category_column]

    numeric_scaler = setScaler(x_train, x_values_to_scale)
    one_hot_encoder = setOneHotEncoder(x_train, category_column)

    x_train, y_train = TransformData(x_train, y_train, numeric_scaler, x_values_to_scale, category_column, column_means, one_hot_encoder, unique_customer_ids_x_train)
    x_test, y_test = TransformData(x_test, y_test, numeric_scaler, x_values_to_scale, category_column, column_means, one_hot_encoder, unique_customer_ids_x_test)
    
    Build_Model(x_train,y_train,x_test ,y_test,number_of_statements)

Processing: File With 1 Statements


  updated_mean = (last_sum + new_sum) / updated_sample_count
  result = op(x, *args, **kwargs)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Processing: File With 2 Statements


  updated_mean = (last_sum + new_sum) / updated_sample_count
  result = op(x, *args, **kwargs)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Processing: File With 3 Statements
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Processing: File With 4 Statements


  updated_mean = (last_sum + new_sum) / updated_sample_count
  result = op(x, *args, **kwargs)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Processing: File With 5 Statements
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Processing: File With 6 Statements
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Processing: File With 7 Statements


  updated_mean = (last_sum + new_sum) / updated_sample_count
  result = op(x, *args, **kwargs)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Processing: File With 8 Statements
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Processing: File With 9 Statements
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Processing: File With 10 Statements
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Processing: File With 11 Statements
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Processing: File With 12 Statements
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30


In [85]:
x_train.shape

(3599, 1, 405)

In [268]:
unique, counts = np.unique(y_train, return_counts=True)

print (np.asarray((unique, counts)).T)

[[   0 2413]
 [   1 1186]]


In [99]:
model = load_model("best_model_" + str(number_of_statements) + "_Statments.h5", custom_objects={"f1_m": f1_m, "precision_m": precision_m, "recall_m": recall_m, "amex_metric_tensorflow":amex_metric_tensorflow })
# make predictions




In [62]:
x_test.shape

(1500, 13, 179)

In [269]:
x_train.shape

(3599, 1, 229)

In [270]:
y_train.shape

(3599,)

In [80]:
for index in range(100,200):
    sample = np.reshape(x_test[index], (1,x_test.shape[1], x_test.shape[2]))

    yhat = model.predict(sample, verbose=0)
    print(yhat, y_test[index])

[[3.2894786e-05]] 0
[[2.7454513e-05]] 0
[[2.744323e-05]] 0
[[0.9999998]] 1
[[7.422409e-05]] 0
[[2.5798183e-05]] 0
[[0.9999996]] 1
[[7.302886e-05]] 0
[[4.0247924e-05]] 0
[[2.4125631e-05]] 0
[[9.2915274e-05]] 0
[[8.172402e-05]] 0
[[3.1855732e-05]] 0
[[0.9999999]] 1
[[2.9318388e-05]] 0
[[2.8906037e-05]] 0
[[2.6766727e-05]] 0
[[1.]] 1
[[2.9638126e-05]] 0
[[3.607076e-05]] 0
[[2.9245617e-05]] 0
[[2.6168345e-05]] 0
[[1.]] 1
[[3.9660707e-05]] 0
[[2.9159033e-05]] 0
[[3.1829284e-05]] 0
[[3.746931e-05]] 0
[[2.726956e-05]] 0
[[0.00036664]] 0
[[4.69789e-05]] 0
[[2.8863971e-05]] 0
[[3.640556e-05]] 0
[[0.9999998]] 1
[[3.5203106e-05]] 0
[[2.7980534e-05]] 0
[[3.1185817e-05]] 0
[[2.8169074e-05]] 0
[[0.99999964]] 1
[[0.9999996]] 1
[[0.99999994]] 1
[[0.9999999]] 1
[[0.999998]] 1
[[0.9999997]] 1
[[0.99999994]] 1
[[0.99999994]] 1
[[0.9999999]] 1
[[0.9999998]] 1
[[1.]] 1
[[0.9999999]] 1
[[0.99999994]] 1
[[0.99999994]] 1
[[0.9999999]] 1
[[0.9999999]] 1
[[1.]] 1
[[1.]] 1
[[1.]] 1
[[0.99999994]] 1
[[0.9999999]]

In [69]:
sample.shape

(1, 13, 179)

In [14]:
#x_values_to_scale = x_train.columns

#x_values_to_scale= [x for x in x_values_to_scale if x not in category_column]
#scaler = StandardScaler()
#scaler.fit(x_train[x_values_to_scale])
#x_train[x_values_to_scale] = scaler.transform(x_train[x_values_to_scale])
#x_train = x_train.to_numpy()
