[View in Colaboratory](https://colab.research.google.com/github/MarcinWylot/CreditCardFraudDetectionKaggleDS/blob/master/CreditCardFraudDetectionKaggleDS.ipynb)

In [0]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
#from google.colab import files
#uploaded = files.upload()

!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# PyDrive reference:
# https://googledrive.github.io/PyDrive/docs/build/html/index.html

# 2. Create & upload a file text file.
#uploaded = drive.CreateFile({'title': 'Sample upload.txt'})
#uploaded.SetContentString('Sample upload file content')
#uploaded.Upload()
#print('Uploaded file with ID {}'.format(uploaded.get('id')))

# 3. Load a file by ID and print its contents.
downloaded = drive.CreateFile({'id': '1QQara53mzPKgRO4poTIWtQ2Ijm3GcqkV'})
#print('Downloaded content "{}"'.format(downloaded.GetContentString()))
downloaded.GetContentFile('./creditcard.csv')
!ls


In [106]:
data = pd.read_csv('./creditcard.csv')
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
#data.describe()
X_train_full, X_test = train_test_split(data, test_size=0.2, random_state=0)

#X_train = data[data.Class == 0]
X_train = X_train_full
y_train = X_train['Class']
X_train = X_train.drop(['Class'], axis=1)
#X_train = X_train.values

y_test = X_test['Class']
X_test = X_test.drop(['Class'], axis=1)
#X_test = X_test.values


fraud = X_train_full[X_train_full['Class'] == 1]
valid = X_train_full[X_train_full['Class'] == 0]

fraud_fraction = len(fraud) / float(len(valid))
print(fraud_fraction)

0.0017190289025473282


In [107]:
from sklearn.ensemble import IsolationForest
clf_IsolationForest = IsolationForest(max_samples = len(X_test), contamination = fraud_fraction, random_state=1)

clf_IsolationForest.fit(X_train)
y_IsolationForest = clf_IsolationForest.predict(X_test)

y_IsolationForest[y_IsolationForest == 1] = 0
y_IsolationForest[y_IsolationForest == -1] = 1

print(accuracy_score(y_test,y_IsolationForest))
print(classification_report(y_test,y_IsolationForest))

0.9976475545100242
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     56861
          1       0.33      0.33      0.33       101

avg / total       1.00      1.00      1.00     56962



In [108]:
#it's not the best method to test unseen data 
from sklearn.neighbors import LocalOutlierFactor
clf_LocalOutlierFactor = LocalOutlierFactor(n_neighbors=20, contamination = fraud_fraction)

clf_LocalOutlierFactor.fit(X_train)
y_LocalOutlierFactor = clf_LocalOutlierFactor._decision_function(X_test)

threshold = np.histogram(y_LocalOutlierFactor,bins=2)[1][1]

y_pred = np.zeros(X_test.shape[0], dtype=int)
y_pred[y_LocalOutlierFactor < threshold] = 1

      
print(accuracy_score(y_test,y_pred)) 
print(classification_report(y_test,y_pred))

0.9981039991573329
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     56861
          1       0.00      0.00      0.00       101

avg / total       1.00      1.00      1.00     56962



In [109]:
#https://medium.com/@curiousily/credit-card-fraud-detection-using-autoencoders-in-keras-tensorflow-for-hackers-part-vii-20e0c85301bd
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers

X_train = X_train_full[X_train_full.Class == 0] #we tain the model on normal transaction only 
y_train = X_train['Class']
X_train = X_train.drop(['Class'], axis=1)
X_train = X_train.values

input_dim = X_train.shape[1]
encoding_dim = 14

input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="tanh", 
                activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoder = Dense(int(encoding_dim / 2), activation="relu")(encoder)
decoder = Dense(int(encoding_dim / 2), activation='tanh')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)


nb_epoch = 20
batch_size = 32
autoencoder.compile(optimizer='adam', 
                    loss='mean_squared_error', 
                    metrics=['accuracy'])
checkpointer = ModelCheckpoint(filepath="model.h5",
                               verbose=0,
                               save_best_only=True)
tensorboard = TensorBoard(log_dir='./logs',
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)
history = autoencoder.fit(X_train, X_train,
                    epochs=nb_epoch,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_data=(X_test, X_test),
                    verbose=0,
                    callbacks=[checkpointer, tensorboard]).history


predictions = autoencoder.predict(X_test)

mse = np.mean(np.power(X_test - predictions, 2), axis=1)
error_df = pd.DataFrame({'reconstruction_error': mse,
                        'true_class': y_test})

print(np.histogram(error_df.reconstruction_error.values,bins=4))
#threshold = np.histogram(y_LocalOutlierFactor,bins=2)[1][1]
threshold = 3
y_pred = np.zeros(X_test.shape[0], dtype=int)
y_pred[y_LocalOutlierFactor > threshold] = 1

print(accuracy_score(y_test,y_pred)) 
print(classification_report(error_df.true_class,y_test))

Using TensorFlow backend.


(array([56961,     1]), array([8.06259820e-02, 8.12961644e+02, 1.62584266e+03]))
0.9783188792528352
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     56861
          1       1.00      1.00      1.00       101

avg / total       1.00      1.00      1.00     56962



             precision    recall  f1-score   support

          0       1.00      0.98      0.99    284315
          1       0.06      0.82      0.12       492

avg / total       1.00      0.98      0.99    284807

