# Dataset
Credit card transactions (Kaggle credit-card fraud dataset):

https://www.kaggle.com/pavansanagapati/anomaly-detection-credit-card-fraud-analysis



In [0]:
# Fetch the zipped dataset, extract creditcard.csv into the working directory,
# then delete the archive so that only the CSV is left behind.
# NOTE(review): the download uses plain HTTP from a third-party mirror and is
# repeated on every run — confirm the mirror is trusted and still available.
! wget http://debauche.xyz/Github/creditcard.csv.zip
! unzip creditcard.csv.zip
! rm creditcard.csv.zip

--2018-11-18 13:34:15--  http://debauche.xyz/Github/creditcard.csv.zip
Resolving debauche.xyz (debauche.xyz)... 46.17.175.28
Connecting to debauche.xyz (debauche.xyz)|46.17.175.28|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 69155632 (66M) [application/zip]
Saving to: ‘creditcard.csv.zip’


2018-11-18 13:34:24 (8.19 MB/s) - ‘creditcard.csv.zip’ saved [69155632/69155632]

Archive:  creditcard.csv.zip
  inflating: creditcard.csv          


In [0]:
import pandas as pd
import numpy as np

# Fixed seed so the shuffle, and therefore the train/test split, is the same
# on every run. A private RandomState is used so NumPy's global random state
# is left untouched.
SEED = 42
rng = np.random.RandomState(SEED)

# Load the raw transactions; the CSV is expected in the working directory.
frame = pd.read_csv('./creditcard.csv')

# Shuffle the rows so the split below does not follow the file's row order.
frame = frame.reindex(rng.permutation(frame.index))

print(frame.head())

# Convert to a plain NumPy array and drop the first column (Time).
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its
# replacement.
data = frame.to_numpy()[:, 1:]

rows, _ = data.shape

# First 80 % of the shuffled rows for training, remaining 20 % for testing.
split = int(0.8 * rows)

train_data = data[:split, :]
test_data = data[split:, :]

# The two parts must cover every row exactly once.
assert train_data.shape[0] + test_data.shape[0] == rows

print(train_data.shape)
print(test_data.shape)

            Time        V1        V2        V3        V4        V5        V6  \
242842  151679.0 -1.350227  0.092878  1.340837  0.407938 -0.707785 -0.295252   
110205   71723.0 -0.919461 -0.197297  2.409016 -0.504443 -0.005879  1.585147   
277573  167722.0  2.026725  0.412645 -2.516285  0.603314  0.596640 -1.343659   
253258  156165.0 -2.014510  0.882556  0.180390 -1.223229  1.505471 -0.955376   
129798   79216.0 -0.471761  0.327037  1.680430 -1.631820 -1.009025  0.177521   

              V7        V8        V9  ...         V21       V22       V23  \
242842 -0.224706  0.575816 -1.330639  ...   -0.242593 -0.515863 -0.090548   
110205 -0.078686  0.492575  0.895720  ...   -0.053474  0.323766  0.206236   
277573  0.156316 -0.151752  0.516404  ...    0.109467  0.405650 -0.079986   
253258  0.419505 -0.492242 -0.685570  ...    0.435715 -1.415058 -0.473890   
129798 -1.421718 -2.382304  1.296625  ...   -1.303011  0.374509 -0.202424   

             V24       V25       V26       V27       V28

# Autoencoders

Unsupervised algorithm

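An autoencoder is trained to reconstruct its own input. Transactions that it reconstructs poorly (high reconstruction error) are treated as anomalies.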

Input: V1 to V28 plus Amount (29 features).

In [0]:
import keras
from keras import Model
from keras.models import Sequential
from keras.layers import *

def create_model(input_dim=29, hidden_units=50, bottleneck_units=15):
    """Build an uncompiled dense autoencoder.

    The network is ``input_dim -> hidden_units -> bottleneck_units ->
    input_dim``. Only the first layer uses a ReLU activation; the other two
    layers are created without an ``activation`` argument.

    Args:
        input_dim: Number of features per sample. It is also used as the
            width of the output layer so the reconstruction always has the
            same shape as the input.
        hidden_units: Width of the first (ReLU) layer.
        bottleneck_units: Width of the middle layer.

    Returns:
        A Keras ``Sequential`` model that still has to be compiled.
    """
    model = Sequential()

    model.add(Dense(hidden_units, activation='relu', input_dim=input_dim))

    model.add(Dense(bottleneck_units))

    # Output width is tied to input_dim: the target of an autoencoder is its
    # own input.
    model.add(Dense(input_dim))

    return model


# Instantiate the autoencoder and compile it with the 'adam' optimizer and
# the 'mse' (mean squared error) loss, then print its layer summary.
model = create_model()
model.compile(optimizer='adam', loss='mse')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_79 (Dense)             (None, 50)                1500      
_________________________________________________________________
dense_80 (Dense)             (None, 15)                765       
_________________________________________________________________
dense_81 (Dense)             (None, 29)                464       
Total params: 2,729
Trainable params: 2,729
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Train the autoencoder to reproduce its own input: the same slice (every
# column except the last one) is passed as both features and target.
# validation_split=0 keeps all training rows for fitting.
model.fit(
    x=train_data[:, :-1],
    y=train_data[:, :-1],
    epochs=2,
    validation_split=0,
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc31ba0fa90>

In [0]:
# Reconstruct the held-out rows from their features (all columns except the
# last one).
test_features = test_data[:, :-1]
predicted_vector = model.predict(test_features)

print('Mean')
print('====')
print('Prediction,     ', predicted_vector.mean())
print('Initial data,   ', test_features.mean())

# Pad the reconstruction with a zero column so it lines up with test_data.
# The last column of the squared difference is then label**2, which equals
# the label itself for 0/1 labels, so it can be used to split rows by class.
zero_column = np.zeros((predicted_vector.shape[0], 1))
predicted_vector = np.concatenate((predicted_vector, zero_column), axis=1)

error = np.square(test_data - predicted_vector)

# Select rows by class, then drop the trailing label column so that only the
# per-feature squared errors remain.
is_anomaly = error[:, -1] == 1
is_regular = error[:, -1] == 0
anormal_error = error[is_anomaly][:, :-1]
regular_error = error[is_regular][:, :-1]

print('Mean error')
print('==========')
print('On anomaly,     ', anormal_error.mean())
print('On normal data, ', regular_error.mean())

Mean
====
Prediction,      3.1193116
Initial data,    3.095192032112978
Mean error
On anomaly,      7.884394841430703
On normal data,  0.48915868304962173


# False positive & False negative

We can adjust the threshold depending on whether we want fewer false positives (raise it) or fewer false negatives (lower it).

In [0]:
# A row is flagged as an anomaly when its mean squared reconstruction error
# is at least this value.
threshold = 0.7


def _count_by_threshold(row_errors, limit):
    """Return (flagged, unflagged): rows with error >= limit and rows with error < limit."""
    flagged = int((row_errors >= limit).sum())
    unflagged = int((row_errors < limit).sum())
    return flagged, unflagged


# Rows labelled as anomalies: "detected" means correctly flagged.
anormal_error_mean = anormal_error.mean(axis=1)
anormal_detected, anormal_undetected = _count_by_threshold(anormal_error_mean, threshold)

print('Anormal detected, ', anormal_detected)
print('Anormal undetected', anormal_undetected)
# Share of the labelled anomalies that were flagged.
print('Accuraccy, ', anormal_detected / (anormal_detected + anormal_undetected))

print()

# Rows labelled as regular: "detected" means wrongly flagged as an anomaly.
regular_error_mean = regular_error.mean(axis=1)
regular_detected, regular_undetected = _count_by_threshold(regular_error_mean, threshold)

print('Regular detected, ', regular_detected)
print('Regular undetected', regular_undetected)
# Share of the regular rows that were left unflagged.
print('Accuraccy, ', regular_undetected / (regular_detected + regular_undetected))

Anormal detected,  84
Anormal undetected 12
Accuraccy,  0.875

Regular detected,  5975
Regular undetected 50891
Accuraccy,  0.8949284282347976


# Forest

In [0]:
# Number of separately built autoencoders that make up the ensemble.
n_models = 5

def create_model(input_dim=29, hidden_units=50, bottleneck_units=15):
    """Build an uncompiled dense autoencoder for one ensemble member.

    The network is ``input_dim -> hidden_units -> bottleneck_units ->
    input_dim``. Only the first layer uses a ReLU activation; the other two
    layers are created without an ``activation`` argument.

    Args:
        input_dim: Number of features per sample. It is also used as the
            width of the output layer so the reconstruction always has the
            same shape as the input.
        hidden_units: Width of the first (ReLU) layer.
        bottleneck_units: Width of the middle layer.

    Returns:
        A Keras ``Sequential`` model that still has to be compiled.
    """
    model = Sequential()

    model.add(Dense(hidden_units, activation='relu', input_dim=input_dim))

    model.add(Dense(bottleneck_units))

    # Output width is tied to input_dim: the target of an autoencoder is its
    # own input.
    model.add(Dense(input_dim))

    return model

# Build and compile n_models separate autoencoders; every member uses the
# same architecture, the 'adam' optimizer and the 'mse' loss.
models = []

for _ in range(n_models):
    model = create_model()
    model.compile(optimizer='adam', loss='mse')
    models += [model]

In [0]:
# Fit every ensemble member on the same training rows; the input (all
# columns except the last one) is also the reconstruction target.
for model in models:
    model.fit(
        x=train_data[:, :-1],
        y=train_data[:, :-1],
        epochs=2,
        validation_split=0,
    )

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


In [0]:
# A model votes "anomaly" for a row when that row's mean squared
# reconstruction error is strictly above this value.
threshold = 0.75

# Column 0 holds the label (last column of test_data); one error column per
# model is appended after it.
predictions = test_data[:, -1].reshape(-1, 1)

print(predictions.shape)

error_columns = []
for model in models:
    predicted_vector = model.predict(test_data[:, :-1])

    # Per-row mean squared reconstruction error of this model.
    error = np.square(test_data[:, :-1] - predicted_vector).mean(axis=1)
    error_columns.append(error.reshape(-1, 1))

# Join the label column and all error columns in one step.
predictions = np.concatenate([predictions] + error_columns, axis=1)

print(predictions.shape)

# Split the per-model errors by true class, dropping the label column.
labels = predictions[:, 0]
anormal_errors = predictions[labels == 1][:, 1:]
regular_errors = predictions[labels == 0][:, 1:]

# From here on both arrays hold, per row, the fraction of models that voted
# "anomaly" (error above threshold) instead of the raw errors.
anormal_errors = (anormal_errors > threshold).astype(float).mean(axis=1)
regular_errors = (regular_errors > threshold).astype(float).mean(axis=1)

# Despite the printed heading, these are mean vote fractions per class.
print('Mean error')
print('==========')
print('On anomaly,     ', anormal_errors.mean())
print('On normal data, ', regular_errors.mean())

# A row is flagged when at least half of the models voted "anomaly".
anormal_detected = int((anormal_errors >= 0.5).sum())
anormal_undetected = int((anormal_errors < 0.5).sum())

print('On anomaly')
print('==========')
print('Anormal detected, ', anormal_detected)
print('Anormal undetected', anormal_undetected)
print('Accuraccy, ', anormal_detected / (anormal_detected + anormal_undetected))

regular_detected = int((regular_errors >= 0.5).sum())
regular_undetected = int((regular_errors < 0.5).sum())

print('On regular')
print('==========')
print('Regular detected, ', regular_detected)
print('Regular undetected', regular_undetected)
print('Accuraccy, ', regular_undetected / (regular_detected + regular_undetected))

(56962, 1)
(56962, 6)
Mean error
On anomaly,      0.8874999999999998
On normal data,  0.09872683149861078
On anomaly
Anormal detected,  88
Anormal undetected 8
Accuraccy,  0.9166666666666666
On regular
Regular detected,  4803
Regular undetected 52063
Accuraccy,  0.9155382829810431
