In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
# Location of the transactions CSV, relative to the notebook's working directory.
DATA_PATH = "../input/fraudstat/data.csv"

data = pd.read_csv(DATA_PATH)
data.head()  # preview the first rows

In [None]:

# Columns fed to the model.
features = [
    "TX_AMOUNT",
    "n_tx_1", "n_tx_7", "n_tx_30",
    "avg_tx_1", "avg_tx_7", "avg_tx_30",
    "n_tx_terminal_1", "n_tx_terminal_7", "n_tx_terminal_30",
    "tx_terminal_risk_1", "tx_terminal_risk_7", "tx_terminal_risk_30",
    "tx_weekend", "tx_night",
]


def _select_rows(frame, start, end, fraud_flag):
    """Return the `features` columns of the rows with
    start <= TX_DATETIME < end and TX_FRAUD == fraud_flag."""
    in_window = (frame["TX_DATETIME"] >= start) & (frame["TX_DATETIME"] < end)
    return frame.loc[in_window & (frame["TX_FRAUD"] == fraud_flag), features]


# Training set: non-fraud rows from 2018-06-10 up to (excluding) 2018-06-17.
train = _select_rows(data, "2018-06-10", "2018-06-17", 0)

# Evaluation window 2018-06-24 up to (excluding) 2018-07-01, split by label.
test = _select_rows(data, "2018-06-24", "2018-07-01", 0)
fraud = _select_rows(data, "2018-06-24", "2018-07-01", 1)

train.shape, test.shape, fraud.shape

In [None]:
# Keep the row labels of both evaluation subsets; they are used further down
# to pull the matching rows back out of `data`.
test_index, fraud_index = test.index, fraud.index
test_index.shape, fraud_index.shape

In [None]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# Scaler shared by the cells below: fitted on `train`, then applied to `test` and `fraud`.
ss = StandardScaler()

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both evaluation subsets.
train = ss.fit_transform(train)
test, fraud = ss.transform(test), ss.transform(fraud)

In [None]:
# Smallest and largest entry of the scaled training matrix (reduced over all
# axes, so each is a single scalar).
min_val = tf.reduce_min(train)
max_val = tf.reduce_max(train)


def _rescale(values):
    """Shift and scale `values` with the training min/max, then cast to float32."""
    return tf.cast((values - min_val) / (max_val - min_val), tf.float32)


# All three sets are rescaled with the statistics of the training set.
train_data = _rescale(train)
test_data = _rescale(test)
fraud_data = _rescale(fraud)

In [None]:
train_data.shape, test_data.shape, fraud_data.shape # shapes of: non-fraud training rows, non-fraud test rows, fraud test rows

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras import layers, losses


class AnomalyDetector(Model):
    """Fully connected autoencoder used to score rows by reconstruction error.

    The encoder passes the input through Dense layers of 64, 32 and 8 units
    (ReLU). The decoder expands the 8-dimensional code through 16 and 32 units
    (ReLU) and a final sigmoid layer of ``n_features`` units, so every
    reconstructed value lies in (0, 1).

    Args:
        n_features: Number of units in the output layer. It has to equal the
            width of the input vectors so the reconstruction can be compared
            with the input. Defaults to 15, the length of the ``features``
            list used in this notebook.
    """

    def __init__(self, n_features=15):
        super().__init__()
        self.encoder = tf.keras.Sequential([
          layers.Dense(64, activation="relu"),
          layers.Dense(32, activation="relu"),
          layers.Dense(8, activation="relu")]) # the latent representation has only 8 dimensions

        self.decoder = tf.keras.Sequential([
          layers.Dense(16, activation="relu"),
          layers.Dense(32, activation="relu"),
          # Output width follows the input width instead of a hard-coded 15.
          layers.Dense(n_features, activation="sigmoid")])

    def call(self, x):
        """Encode ``x`` and return its reconstruction from the decoder."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def latent_space(self, x):
        """Return the encoder output (the 8-dimensional code) for ``x``."""
        latent = self.encoder(x)
        return latent
        
        

# Build the model and configure training: Adam optimizer, mean-squared-error loss.
autoencoder = AnomalyDetector()
autoencoder.compile(loss='mse', optimizer='adam')

In [None]:
# Number of trailing rows of `train_data` held out for validation.
N_VALIDATION = 1000

# Train the autoencoder to reproduce its own input (input and target are the
# same tensor). The hold-out slice runs to the end of the tensor
# (`[-N_VALIDATION:]`); slicing with `[-N_VALIDATION:-1]` would silently drop
# the final row from validation.
history = autoencoder.fit(
    train_data[:-N_VALIDATION], train_data[:-N_VALIDATION],
    epochs=150,
    batch_size=1024,
    validation_data=(train_data[-N_VALIDATION:], train_data[-N_VALIDATION:]),
    shuffle=True,
)

In [None]:
# Per-row reconstruction error (mean squared error over the feature axis) for
# every row of the training tensor.
reconstructions = autoencoder.predict(train_data)
train_loss = tf.keras.losses.mse(train_data, reconstructions)

# Give matplotlib a flat NumPy vector so all losses are binned as ONE dataset.
# A (1, N)-shaped input is only treated as a single dataset when matplotlib
# iterates the tensor row by row; once it is converted to an ndarray, a 2-D
# input is read column-wise, i.e. as N datasets of one value each.
plt.hist(np.asarray(train_loss), bins=50)
plt.xlabel("Train loss")
plt.ylabel("No of examples")
plt.show()

In [None]:
# Decision threshold: the 0.99 quantile of the training reconstruction losses.
ANOMALY_QUANTILE = 0.99
threshold = np.quantile(train_loss, ANOMALY_QUANTILE)
print("Threshold: ", threshold)

In [None]:
from sklearn.metrics import confusion_matrix, average_precision_score
# Stack the evaluation rows: non-fraud rows first, fraud rows after them.
n_normal = test_data.shape[0]
n_fraud = fraud_data.shape[0]
merge_data = np.vstack([test_data, fraud_data])

# Ground-truth labels in the same order: 0 for non-fraud rows, 1 for fraud rows.
y_true_test = np.concatenate([np.zeros(n_normal), np.ones(n_fraud)])

# Per-row reconstruction error on the stacked evaluation set.
reconstructions = autoencoder.predict(merge_data)
test_loss = tf.keras.losses.mse(reconstructions, merge_data)

# A row is predicted as fraud when its loss reaches the threshold.
y_hat_test = test_loss >= threshold
confusion_matrix(y_true_test, y_hat_test)  # sklearn: rows are true labels, columns are predicted labels

In [None]:
# Average precision depends only on the ranking of the scores, so the raw
# reconstruction error is used directly. Squashing the (small, float32) losses
# through a sigmoid compresses them into a narrow band just above 0.5, where
# distinct losses can round to the same value and become artificial ties.
average_precision_score(y_true_test, np.asarray(test_loss))

In [None]:
# Row labels of the evaluation rows, in the same order as the stacked
# evaluation tensor (non-fraud rows first, then fraud rows).
merge_index = list(test_index) + list(fraud_index)

# `test_index` / `fraud_index` hold index *labels*, so look the rows up with
# `.loc` (label-based); `.iloc` is positional and only matches when `data`
# has a default 0..n-1 index. `.copy()` gives an independent frame, so adding
# a column does not write into a selection of `data`.
merge_df = data.loc[merge_index, :].copy()

# Store the scores as a plain NumPy array rather than a TensorFlow tensor.
merge_df["predictions"] = np.asarray(test_loss)

def p_at_k(df, k):
    """Mean daily precision among the ``k`` highest-scored rows of each day.

    For every calendar day (derived from ``TX_DATETIME``), the ``k`` rows with
    the largest ``predictions`` are kept and the number of frauds among them
    is divided by ``k``. The per-day values are then averaged. A day with
    fewer than ``k`` rows is still divided by ``k``.

    Args:
        df: DataFrame with the columns ``TX_DATETIME`` (parseable by
            ``pd.to_datetime``), ``predictions`` (score, higher = more
            suspicious) and ``TX_FRAUD`` (1 for fraud, 0 otherwise).
        k: Number of top-scored rows inspected per day; must be positive.

    Returns:
        The average of the per-day precision values.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # Calendar day of every row; groupby aligns it to the frame by index label.
    day = pd.to_datetime(df["TX_DATETIME"]).dt.date

    # Sort by score first so that head(k) keeps each day's k highest scores.
    top_k = (
        df.sort_values(["predictions"], ascending=False)
          .groupby(day)
          .head(k)
    )
    daily_precision = top_k.groupby(day)["TX_FRAUD"].sum() / k
    return daily_precision.mean()

# Score the evaluation frame with k set to 100.
p_at_k(merge_df, k=100)