# Setup

In [None]:
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Lambda
from keras.layers import Embedding
from keras.layers import Convolution1D,MaxPooling1D, Flatten
from keras.datasets import imdb
from keras import backend as K
from sklearn.model_selection import train_test_split
import pandas as pd
from keras.utils.np_utils import to_categorical

from sklearn.preprocessing import Normalizer
from keras.models import Sequential
from keras.layers import Convolution1D, Dense, Dropout, Flatten, MaxPooling1D
from keras.utils import np_utils
import numpy as np
import h5py
from keras import callbacks
from keras.layers import LSTM, GRU, SimpleRNN
from keras.callbacks import CSVLogger
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger


from sklearn.model_selection import train_test_split

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
import math
import os
import pickle as pkl
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline

# Functions

In [None]:
def calculate_precision_recall(model, X, y):
    """
    Sweep decision thresholds and record precision and recall at each one.

    The thresholds run from 0.50 to 0.99 in steps of 0.01.

    :param model: object exposing ``predict_proba``
    :param X: pandas dataframe passed to ``model.predict_proba``
    :param y: array of labels
    :return: dataframe with columns precision_fraud, recall_fraud, threshold
    """
    class_probabilities = model.predict_proba(X)
    cutoffs = [step / 1000 for step in range(500, 1000, 10)]

    recalls = []
    precisions = []
    for cutoff in cutoffs:
        # argmax over the boolean mask gives the index of the first column
        # whose probability reaches the cutoff, and 0 when no column does.
        predicted = (class_probabilities >= cutoff).argmax(axis=1)
        recalls.append(round(recall_score(y, predicted), 2))
        precisions.append(round(precision_score(y, predicted), 2))

    return pd.DataFrame({
        'precision_fraud': precisions,
        'recall_fraud': recalls,
        'threshold': cutoffs,
    })

def report_optimal_threshold(model, X, y):
    """
    Find the threshold at which precision and recall are closest together.

    :param model: object exposing ``predict_proba``
    :param X: pandas dataframe
    :param y: array of labels
    :return: float threshold with the smallest |precision - recall|
    """
    sweep = calculate_precision_recall(model, X, y)
    gap = np.abs(sweep['precision_fraud'] - sweep['recall_fraud'])
    # When several thresholds tie, the first matching row is used.
    closest = sweep.loc[gap == gap.min(), 'threshold']
    return closest.values[0]

def calculate_precision_threshold(model, X, y):
    """
    Report, for a fixed list of precision targets, the first threshold that
    reaches each target together with its confusion-matrix counts.

    :param model: object exposing ``predict_proba``
    :param X: pandas dataframe
    :param y: array of labels
    :return: dataframe with one row per precision target that is reached,
             holding precision, recall, threshold, the target (``prec_th``)
             and the four confusion-matrix counts
    """
    sweep = calculate_precision_recall(model, X, y)
    probabilities = model.predict_proba(X)

    first_hits = []
    for target in (0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.98):
        tagged = sweep.assign(prec_th=target)
        # Keep only the first threshold whose precision reaches the target;
        # the slice is empty when no threshold gets there.
        first_hits.append(tagged[tagged['precision_fraud'] >= target].head(1))
    df_met = pd.concat(first_hits)

    counts = pd.DataFrame(
        [list(report_confusion_matrix(y, probabilities, threshold=cutoff))
         for cutoff in df_met['threshold']],
        columns=["NonFraud_ActualNonFraud", "Fraud_ActualNonFraud",
                 "NonFraud_ActualFraud", "Fraud_ActualFraud"])

    # Copy by position (.values) because df_met still carries the sweep index.
    for name in ('Fraud_ActualFraud', 'Fraud_ActualNonFraud',
                 'NonFraud_ActualFraud', 'NonFraud_ActualNonFraud'):
        df_met[name] = counts[name].values

    return df_met.reset_index(drop=True)


def report_confusion_matrix(y, y_pred_proba, threshold=0.5):
    """
    Build the flattened binary confusion matrix at a given fraud threshold.

    A sample is predicted as fraud (label 1) when the probability in column 1
    of ``y_pred_proba`` is at least ``threshold``. The earlier implementation
    took ``argmax`` over a boolean mask, which predicted class 0 whenever
    both columns passed the threshold (any threshold below 0.5).

    :param y: true labels (0 / 1)
    :param y_pred_proba: predicted probabilities, one column per class
    :param threshold: threshold of fraud
    :return: 1d numpy array of the confusion matrix, always 4 entries
    """
    fraud_proba = np.asarray(y_pred_proba)[:, 1].ravel()
    y_pred = (fraud_proba >= threshold).astype(int)
    # Fixing the labels keeps the result 2x2 even when only one class occurs.
    return confusion_matrix(y, y_pred, labels=[0, 1]).ravel()

def report_metrics(model, X, y, optimal_threshold=None, data_name=None, curr_threshold=0.96):
    """
    Calculate confusion matrix, roc_auc, precision, recall and f1 at two
    thresholds: a fixed reference threshold and the "optimal" one.

    :param model: object exposing ``predict_proba``
    :param X: dataframe ready to predict
    :param y: true labels
    :param optimal_threshold: threshold to report as optimal; when None it is
        derived from (X, y) with ``report_optimal_threshold``
    :param data_name: string name of data (train, test)
    :param curr_threshold: fixed reference threshold (previously hard-coded
        to 0.96)
    :return: dataframe with two rows (reference threshold, optimal threshold);
        the boolean column ``optimal_threshold`` flags the optimal row
    """
    y_pred_proba = model.predict_proba(X)
    roc_score = roc_auc_score(y, y_pred_proba[:, 1], average="weighted")

    # Use an explicit None check so that a caller-supplied 0.0 is honoured.
    if optimal_threshold is None:
        optimal_threshold = report_optimal_threshold(model, X, y)

    rows = []
    for cutoff, is_optimal in ((curr_threshold, False), (optimal_threshold, True)):
        counts = list(report_confusion_matrix(y, y_pred_proba, threshold=cutoff))
        rows.append(counts + [cutoff, is_optimal])

    df_res = pd.DataFrame(
        rows,
        columns=["NonFraud_ActualNonFraud", "Fraud_ActualNonFraud",
                 "NonFraud_ActualFraud", "Fraud_ActualFraud",
                 "threshold", "optimal_threshold"])
    df_res['roc_auc'] = roc_score
    df_res['data'] = data_name
    df_res['user_count'] = len(y)

    true_pos = df_res['Fraud_ActualFraud']
    # Both ratios become NaN when their denominator is zero
    # (e.g. nothing is predicted as fraud at this threshold).
    df_res['precision'] = true_pos / (true_pos + df_res['Fraud_ActualNonFraud'])
    df_res['recall'] = true_pos / (true_pos + df_res['NonFraud_ActualFraud'])
    df_res['f1_score'] = (2 * df_res['precision'] * df_res['recall']) / (
        df_res['precision'] + df_res['recall'])

    df_res = df_res.round(3)
    return df_res[['data', 'user_count', 'threshold', 'optimal_threshold',
                   'roc_auc', 'precision', 'recall', 'f1_score',
                   'Fraud_ActualFraud', 'Fraud_ActualNonFraud', 'NonFraud_ActualFraud',
                   'NonFraud_ActualNonFraud']]


def report_metric(model, x_train, y_train, x_test, y_test):
    """
    Report metrics on the train and test sets using one shared threshold.

    The optimal threshold is derived on the training data and then reused
    for the test data.

    :param model: object exposing ``predict_proba``
    :return: dataframe with the train rows followed by the test rows
    """
    train_report = report_metrics(model, x_train, y_train, data_name='train')

    # The row flagged as optimal carries the threshold chosen on train.
    optimal_rows = train_report.loc[train_report['optimal_threshold'].eq(True), 'threshold']
    train_threshold = optimal_rows.values[0]

    test_report = report_metrics(model, x_test, y_test,
                                 optimal_threshold=train_threshold, data_name='test')
    return pd.concat([train_report, test_report])

In [None]:
def filter_correlated_features(df_merged_filtered, corr_threshold=0.90):
    """
    Filter correlated features to reduce multicollinearity.

    Steps:
      1. drop columns with a single unique value;
      2. set aside two-valued "dummy" columns (they skip the correlation
         filter) unless their name contains "CLICK", "BACK" or one of the
         ``not_dummy`` fragments;
      3. among the remaining columns, drop every column whose absolute
         correlation with an earlier column exceeds ``corr_threshold``.

    Column order is preserved, so which column of a correlated pair is
    dropped is deterministic (the later one).

    :param df_merged_filtered: dataframe of features with string column names
    :param corr_threshold: absolute correlation above which a column is dropped
    :return: dataframe excluding the correlated features; filtered columns
             come first, followed by the dummy columns
    """
    informative = [col for col in df_merged_filtered.columns
                   if df_merged_filtered[col].nunique() > 1]
    df_merged_filtered = df_merged_filtered[informative]

    # Name fragments of two-valued columns that must still go through the
    # correlation filter.
    not_dummy = ['impression_col', 'session_click', 'session_dev', 'session_ip', 'session_post_back',
                 'device_plat_unique_vals', 'profile_change_fieldid', 'question_diff_source',
                 'survey_status_', 'ss1_provider_', 'ss2_provider_', 'ss3_provider_',
                 'act_conv_rate_all_', 'success_rate_provider_', 'status_change_reasonid_',
                 'session_length_minutes_', '_browser_unique_vals', 'avg_acloi_provider_', 'sum_acloi_provider_']

    dummy_variables_to_add = [
        col for col in df_merged_filtered.columns
        if df_merged_filtered[col].nunique() == 2
        and "CLICK" not in col
        and "BACK" not in col
        and not any(fragment in col for fragment in not_dummy)
    ]

    # Keep the original column order instead of going through a set
    # difference, whose iteration order is not stable between runs.
    dummy_lookup = set(dummy_variables_to_add)
    variables_to_filter = [col for col in df_merged_filtered.columns if col not in dummy_lookup]

    df_temp = df_merged_filtered[variables_to_filter]
    # Create correlation matrix
    corr_matrix = df_temp.corr().abs()

    # Select the strict upper triangle so each pair is inspected once.
    # The builtin bool is used because the np.bool alias no longer exists.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Find feature columns with correlation greater than corr_threshold
    to_drop = [column for column in upper.columns if (upper[column] > corr_threshold).any()]
    print('dropping #features:', len(to_drop))
    df_temp = df_temp.drop(to_drop, axis=1)
    df_merged_filtered = pd.concat([df_temp, df_merged_filtered[dummy_variables_to_add]], axis=1)
    print('filtering correlated features done.')
    return df_merged_filtered


def adjust_features(_features, X):
    """
    Adjust the features to the trained set (sequence and existence).

    Columns of ``X`` that are not in ``_features`` are dropped, columns in
    ``_features`` that are missing from ``X`` are added, and the result
    follows the order of ``_features``. Added columns and any NaN values are
    filled with 0.

    ``reindex`` is used instead of concatenating an empty frame, so added
    columns are numeric rather than object dtype.

    :param _features: iterable of column names the model was trained on
    :param X: pandas dataframe
    :return: adjusted dataframe with exactly the columns in ``_features``
    """
    feature_order = list(_features)
    return X.reindex(columns=feature_order).fillna(0)

def standardize_data(df, scaler=None):
    """
    Standardize data with a Z-score scaler.

    When no scaler is given, a new StandardScaler is fitted on ``df``;
    otherwise the given scaler is applied as is. The returned dataframe keeps
    the column names of ``df`` but gets a fresh default index.

    :param df: dataframe to scale
    :param scaler: already fitted scaler, or None to fit a new one on ``df``
    :return: tuple of (scaled dataframe, scaler that was used)
    """
    if scaler is None:
        print("scaler is none")
        scaler = StandardScaler().fit(df)

    scaled_values = scaler.transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns), scaler


# Data

In [None]:
# Load the preprocessed feature table. Later cells read its "user_id" and
# "fraud" columns and treat every other column as a feature.
df_sj = pd.read_csv('../data/processed/processed_data.csv')

----

In [None]:
# Feature matrix: every column except the identifier and the label.
X = df_sj.loc[:, ~df_sj.columns.isin(["user_id", "fraud"])]
y = df_sj['fraud'].values
# Replace infinities of both signs; only +inf was handled before, so a
# -inf value would still have reached the scaler.
X = X.replace([math.inf, -math.inf], 0)
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=41)

In [None]:

# Feature selection and scaler fitting use the training split only.
x_train_filt = filter_correlated_features(x_train_raw)
x_train, scaler = standardize_data(x_train_filt)

# Align the test split to the training columns and reuse the fitted scaler.
x_test_filt = adjust_features(x_train.columns, x_test_raw)
x_test, _ = standardize_data(x_test_filt, scaler)

In [None]:
def fit_logistic_regression(x_train, y_train, penalty_v="l1", C_value=0.01,
                            GridSearch=False, max_iter=10000):
    """
    Fit a class-balanced logistic regression with the saga solver.

    :param x_train: training features
    :param y_train: training labels
    :param penalty_v: regularisation penalty ("l1" or "l2")
    :param C_value: inverse regularisation strength
    :param GridSearch: when True, search C and penalty by cross-validated
        ROC AUC and return the estimator refitted with the best combination
    :param max_iter: maximum solver iterations. This argument used to be
        ignored in favour of a hard-coded 10000; it is now honoured and its
        default is 10000 so existing calls behave as before.
    :return: fitted LogisticRegression
    """
    lr_model = LogisticRegression(
        random_state=42, class_weight="balanced", n_jobs=-1, solver="saga",
        max_iter=max_iter, C=C_value, verbose=0, penalty=penalty_v)

    if not GridSearch:
        lr_model.fit(x_train, y_train)
        return lr_model

    parameters = {"C": [0.001, 0.01, 0.1, 1], 'penalty': ['l1', 'l2']}
    model_logit = GridSearchCV(lr_model, parameters, scoring="roc_auc", cv=None, verbose=5,
                               pre_dispatch=20, return_train_score=True, n_jobs=-1)
    model_logit.fit(x_train, y_train)
    print("GridSearch best parameters: {}".format(model_logit.best_params_))

    # GridSearchCV refits the best configuration on the full training data
    # by default, so building and fitting a second model is unnecessary.
    return model_logit.best_estimator_

In [None]:
def fit_decision_tree(x_train, y_train, criterion='entropy', max_depth=5, min_samples_leaf=20, GridSearch=False):
    """
    Fit a class-balanced decision tree.

    :param x_train: training features
    :param y_train: training labels
    :param criterion: split quality measure. It was previously not passed to
        the classifier, so it had no effect unless GridSearch was on.
    :param max_depth: maximum tree depth
    :param min_samples_leaf: minimum number of samples per leaf
    :param GridSearch: when True, search max_depth, min_samples_leaf and
        criterion by 3-fold cross-validated ROC AUC and return the estimator
        refitted with the best combination
    :return: fitted DecisionTreeClassifier
    """
    dt_model = DecisionTreeClassifier(random_state=42, max_depth=max_depth, criterion=criterion,
                                      min_samples_leaf=min_samples_leaf, class_weight="balanced")

    if not GridSearch:
        dt_model.fit(x_train, y_train)
        return dt_model

    parameters = dict(max_depth=list(range(3, 7, 1)), min_samples_leaf=list(range(5, 41, 5)),
                      criterion=['gini', 'entropy'])
    model_tree = GridSearchCV(dt_model, parameters, scoring='roc_auc', cv=3, verbose=5)
    model_tree.fit(x_train, y_train)
    print(model_tree.best_params_)

    # GridSearchCV refits the best configuration on the full training data
    # by default, so a second manual fit is unnecessary.
    return model_tree.best_estimator_

In [None]:
def fit_random_forest(x_train, y_train, max_depth=10, min_samples_leaf=20, n_estimators=10,
                      GridSearch=False):
    """
    Fit a class-balanced random forest.

    :param x_train: training features
    :param y_train: training labels
    :param max_depth: maximum depth of each tree
    :param min_samples_leaf: minimum number of samples per leaf
    :param n_estimators: number of trees
    :param GridSearch: when True, search max_depth, min_samples_leaf and
        n_estimators by cross-validated ROC AUC and return the estimator
        refitted with the best combination
    :return: fitted RandomForestClassifier
    """
    rf_model = RandomForestClassifier(random_state=42, max_depth=max_depth, n_jobs=-1,
                                      n_estimators=n_estimators,
                                      min_samples_leaf=min_samples_leaf, class_weight="balanced")

    if not GridSearch:
        rf_model.fit(x_train, y_train)
        return rf_model

    parameters = dict(max_depth=list(range(2, 7, 1)), min_samples_leaf=list(range(1, 40, 3)),
                      n_estimators=list(range(10, 100, 20)))
    model_rf = GridSearchCV(rf_model, parameters, scoring='roc_auc', verbose=5)
    model_rf.fit(x_train, y_train)
    print(model_rf.best_params_)

    # The refitted best estimator keeps every base setting (including
    # n_jobs=-1, which the former manual rebuild dropped).
    return model_rf.best_estimator_

In [None]:
def fit_naive_bayes(x_train, y_train):
    """
    Fit a Gaussian naive Bayes classifier with default settings.

    :param x_train: training features
    :param y_train: training labels
    :return: fitted GaussianNB
    """
    # fit() returns the estimator itself, so it can be returned directly.
    return GaussianNB().fit(x_train, y_train)

----

In [None]:
# Fit an L1-penalised logistic regression with fixed hyper-parameters (no
# grid search) and show its train/test report.
logistic_regression = fit_logistic_regression(x_train, y_train,  penalty_v = "l1", C_value = 0.01)
report_metric(logistic_regression, x_train, y_train, x_test, y_test)

In [None]:
# With GridSearch=True the grid covers max_depth, min_samples_leaf and
# criterion, so the final tree uses the best grid values for those settings
# rather than the ones passed here.
decision_tree = fit_decision_tree(x_train, y_train,  criterion='entropy', max_depth=5, min_samples_leaf= 20, 
                                  GridSearch=True)

report_metric(decision_tree, x_train, y_train, x_test, y_test)

In [None]:
# Fit the random forest with explicit hyper-parameters (grid search off) and
# show its train/test report.
random_forest = fit_random_forest(x_train, y_train, GridSearch=False, max_depth= 6, min_samples_leaf= 1, n_estimators=90)
report_metric(random_forest, x_train, y_train, x_test, y_test)

In [None]:
# Gaussian naive Bayes baseline on the same standardized features.
naive_bayes = fit_naive_bayes(x_train, y_train)
report_metric(naive_bayes, x_train, y_train, x_test, y_test)

# CNN

In [None]:
# 1D CNN that treats each sample's feature vector as a length-n sequence
# with a single channel.
# The layer and fit keywords use the Keras 2 names (padding, pool_size,
# epochs), matching the autoencoder cells, instead of the Keras 1 names
# (border_mode, pool_length, nb_epoch).
cnn = Sequential()
cnn.add(Convolution1D(64, 3, padding="same", activation="relu", input_shape=(x_train.shape[1], 1)))
cnn.add(MaxPooling1D(pool_size=2))
cnn.add(Flatten())
cnn.add(Dense(128, activation="relu"))
cnn.add(Dropout(0.5))
# Single sigmoid unit: the output is the predicted probability of class 1.
cnn.add(Dense(1, activation="sigmoid"))
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
cnn.summary()

# define optimizer and objective, compile cnn

cnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
# reshape input to be [samples, features, 1 channel]
trainX = np.reshape(x_train.values, (x_train.shape[0], x_train.shape[1], 1))
testX = np.reshape(x_test.values, (x_test.shape[0], x_test.shape[1], 1))

# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate.
cnn.fit(trainX, y_train, epochs=20, validation_data=(testX, y_test))


In [None]:
class cnn_wrapper:
    """
    Adapter that gives a single-output Keras model a two-column
    ``predict_proba``, as the report_* helpers expect.
    """

    def __init__(self, model):
        self._model = model

    def predict_proba(self, X):
        """
        Return an (n_samples, 2) array of [P(class 0), P(class 1)].

        The wrapped model is expected to output one probability per sample.
        ``predict_proba`` is used when the model has it and ``predict``
        otherwise.

        The former zip/map construction produced shape (n_samples, 2, 1)
        when the model returned a column vector; stacking columns directly
        yields a plain two-column matrix.
        """
        predict = getattr(self._model, "predict_proba", None) or self._model.predict
        positive = np.asarray(predict(X), dtype=float).reshape(-1, 1)
        return np.hstack([1 - positive, positive])


In [None]:
# Wrap the CNN so it exposes predict_proba, then report on the reshaped
# 3-D inputs the network was trained on.
cnn_w = cnn_wrapper(cnn)
report_metric(cnn_w, trainX, y_train, testX, y_test)

# Autoencoder

In [None]:
# autoencoder
# Feature matrix: every column except the identifier and the label.
X = df_sj.loc[:, ~df_sj.columns.isin(["user_id", "fraud"])]
y = df_sj['fraud'].values
# Replace infinities of both signs; only +inf was handled before.
X = X.replace([math.inf, -math.inf], 0)

x_train_raw, x_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=41)
x_train_filt = filter_correlated_features(x_train_raw)

# Keep the full training split (both classes) for evaluation later on.
y_train_orig = y_train.copy()
x_train_orig = x_train_filt.copy()

# The autoencoder is trained on rows labelled 0 only.
x_train_interim = x_train_filt[y_train==0]
y_train = y_train[y_train==0]

# Fit the scaler on the label-0 rows, then apply it to the full training
# split and to the test split.
x_train, scaler = standardize_data(x_train_interim)
x_train_orig, _= standardize_data(adjust_features(x_train.columns, x_train_orig), scaler)
x_test_filt = adjust_features(x_train.columns, x_test_raw)
x_test, _ = standardize_data(x_test_filt, scaler)

In [None]:
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers

In [None]:
# Autoencoder layout: input_dim -> 14 -> 7 -> 7 -> input_dim.
input_dim = x_train.shape[1]
encoding_dim = 14
input_layer = Input(shape=(input_dim, ))

# L1 activity regularisation on the first encoding layer; 10e-5 equals 1e-4.
encoder = Dense(encoding_dim, activation="tanh", 
                activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoder = Dense(int(encoding_dim / 2), activation="relu")(encoder)

decoder = Dense(int(encoding_dim / 2), activation='tanh')(encoder)
# NOTE(review): the output layer is ReLU, so reconstructions are never
# negative, while the inputs come from standardize_data and can be negative.
# Confirm this is intended; a linear output may fit standardized data better.
decoder = Dense(input_dim, activation='relu')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)

In [None]:
# Training configuration for the autoencoder.
nb_epoch = 100
batch_size = 32

# NOTE(review): 'accuracy' is tracked alongside an MSE reconstruction loss;
# it is probably not a meaningful metric here - confirm whether it is needed.
autoencoder.compile(optimizer='adam', 
                    loss='mean_squared_error', 
                    metrics=['accuracy'])

# Write the model to model.h5, keeping only the best checkpoint.
checkpointer = ModelCheckpoint(filepath="model.h5",
                               verbose=0,
                               save_best_only=True)
tensorboard = TensorBoard(log_dir='./logs',
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)

# Input and target are the same: the network learns to reconstruct x_train,
# which holds only rows labelled 0. The validation data is x_test, which
# holds rows of both classes.
# `.history` keeps only the per-epoch metrics dict.
history = autoencoder.fit(x_train, x_train,
                    epochs=nb_epoch,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_data=(x_test, x_test),
                    verbose=1,
                    callbacks=[checkpointer, tensorboard]).history

In [None]:
# roc_curve and auc are imported here because this cell runs before the
# cell that otherwise brings them into scope.
from sklearn.metrics import auc, roc_curve

# Score every row of the full training split (both classes) by its mean
# squared reconstruction error.
predictions = autoencoder.predict(x_train_orig)
mse = np.mean(np.power(x_train_orig - predictions, 2), axis=1)
error_df = pd.DataFrame({'reconstruction_error': mse,
                        'true_class': y_train_orig})

# ROC curve using the reconstruction error as the score.
fpr, tpr, thresholds = roc_curve(error_df.true_class, error_df.reconstruction_error)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, label='AUC = %0.4f'% roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line of a random classifier.
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.001, 1])
plt.ylim([0, 1.001])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support)

# Precision-recall curve using the reconstruction error as the score.
precision, recall, th = precision_recall_curve(error_df.true_class, error_df.reconstruction_error)
plt.plot(recall, precision, 'b', label='Precision-Recall curve')
plt.title('Recall vs Precision')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

In [None]:
# precision_recall_curve returns one more precision/recall value than
# thresholds: element i belongs to th[i] and the final element has no
# threshold. Dropping the LAST element (not the first) aligns the arrays.
plt.plot(th, precision[:-1], 'b', label='Threshold-Precision curve')
plt.plot(th, recall[:-1], 'r', label='Threshold-Recall curve')
plt.title('Precision and recall for different threshold values')
plt.xlabel('Threshold')
plt.ylabel('Precision / Recall')
plt.legend()
plt.show()

In [None]:
class model_wrapper:
    """
    Adapter that gives a reconstruction model (e.g. an autoencoder) a
    two-column ``predict_proba``, as the report_* helpers expect.
    """

    def __init__(self, model):
        self._model = model

    def predict_proba(self, X):
        """
        Return an (n_samples, 2) array of [1 - error, error], where error is
        the mean absolute reconstruction error of each row.

        The error is a score, not a probability: it is not bounded by 1, so
        the first column can be negative.

        :param X: dataframe or 2-D array of model inputs
        """
        predictions = self._model.predict(X)
        # np.asarray lets plain arrays work as well as dataframes.
        errors = np.mean(np.absolute(np.asarray(X) - np.asarray(predictions)), axis=1)
        return np.column_stack([1 - errors, errors])

In [None]:
# Wrap the autoencoder and show the second predict_proba column (the
# reconstruction-error score) for the rows it was trained on.
autoencoder_wrapper = model_wrapper(autoencoder)
autoencoder_wrapper.predict_proba(x_train)[:, 1]

In [None]:
# Evaluate on the full training split (x_train_orig / y_train_orig hold both
# classes, unlike x_train) and on the test split.
report_metric(autoencoder_wrapper, x_train_orig, y_train_orig, x_test, y_test)