In [2]:
import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from pylab import rcParams
rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]

In [3]:
data = pd.read_csv('Anomaly_detect.csv',sep=',')
data.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V4,V5,V6,V7,V8,V11,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,-0.282401,0.409239,-0.412125,0.689071,-1.672189,3.54938,3.181706,0.578688,0.660969,0.183458,...,0.111821,-0.332041,-0.754061,-0.018708,0.606893,-0.671096,0.107782,0.11646,0.027057,0
1,-0.25152,-0.403482,-1.557997,1.445214,-0.70368,-0.619161,-0.351902,-0.171144,1.115511,1.013396,...,-0.142424,-0.05355,-0.202123,0.085877,0.234648,-0.265061,0.303707,0.168222,0.101471,0
2,0.948788,-0.714987,-1.153532,1.096862,-0.317905,0.210462,-1.043646,0.96925,0.145975,0.029226,...,0.110291,-0.395159,-1.221043,0.212406,-0.071539,-0.133931,-0.045221,0.095197,0.079633,0
3,0.230839,0.377507,-0.063657,0.716875,-1.146666,1.619539,-1.418952,1.907872,-0.679603,-1.44579,...,0.137095,0.18474,0.808247,-0.255063,-0.606518,-0.222594,0.099614,0.21177,0.045021,0
4,-0.279466,0.649127,-1.034736,0.58111,1.178578,-0.194626,0.390951,-0.161889,0.245898,-1.489924,...,-0.105769,-0.133762,-0.178392,-0.344198,-0.041099,0.328147,-0.51518,-0.113233,0.118757,0


In [4]:

#Create independent and Dependent Features
columns = data.columns.tolist()
# Filter the columns to remove data we do not want 
columns = [c for c in columns if c not in ["Class"]]
# Store the variable we are predicting 
target = "Class"
# Define a random state 
state = np.random.RandomState(42)
X = data[columns]
Y = data[target]
#X_outliers = state.uniform(low=0, high=1, size=(X.shape[0], X.shape[1]))
# Print the shapes of X & Y
print(X.shape)
print(Y.shape)

(284807, 23)
(284807,)


In [5]:
normal= data['Class'].value_counts()[0]
fraud= data['Class'].value_counts()[1]

In [6]:
outlier_fraction = fraud/normal
outlier_fraction

0.0017304750013189597

Contamination is the assumption about the fraction of anomalies in the dataset. This is used when fitting to define the threshold on the scores of the samples. As we already know the data points which are fraud we can determine outlier fraction which is used for contamination. 

In [7]:
##Define the outlier detection methods
state = np.random.RandomState(42)
classifiers = {
    "Isolation Forest":IsolationForest(n_estimators=100, max_samples='auto', 
                                       contamination=outlier_fraction,random_state=state, verbose=0),
    "Local Outlier Factor":LocalOutlierFactor(n_neighbors=20, algorithm='auto', 
                                              leaf_size=30, metric='minkowski',
                                              p=2, metric_params=None, contamination=outlier_fraction)
   
}

In [8]:
#n_outliers = len(Fraud)
#looping through a dictionary
for i, (clf_name,clf) in enumerate(classifiers.items()):
    #Fit the data and tag outliers
    if clf_name == "Local Outlier Factor":
        y_pred = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    else:    
        clf.fit(X)
        scores_prediction = clf.decision_function(X)
        y_pred = clf.predict(X)
    #Reshape the prediction values to 0 for Valid transactions , 1 for Fraud transactions
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    n_errors = (y_pred != Y).sum()
    # Run Classification Metrics
    print("{}: {}".format(clf_name,n_errors))
    print("Accuracy Score :")
    print(accuracy_score(Y,y_pred))
    print("Classification Report :")
    print(classification_report(Y,y_pred))

Isolation Forest: 857
Accuracy Score :
0.9969909447450379
Classification Report :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    284315
           1       0.13      0.13      0.13       492

    accuracy                           1.00    284807
   macro avg       0.56      0.56      0.56    284807
weighted avg       1.00      1.00      1.00    284807

Local Outlier Factor: 983
Accuracy Score :
0.9965485398884156
Classification Report :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    284315
           1       0.00      0.00      0.00       492

    accuracy                           1.00    284807
   macro avg       0.50      0.50      0.50    284807
weighted avg       1.00      1.00      1.00    284807



As Isolation Forest has high accuracy compared to Local outlier I am adding anomaly and scores to the dataset using this model

In [9]:
for i, (clf_name,clf) in enumerate(classifiers.items()):
    if clf_name == "Isolation Forest":
        data['scores'] = clf.decision_function(X)
        data['anomaly']=clf.predict(X)
data.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V4,V5,V6,V7,V8,V11,...,V22,V23,V24,V25,V26,V27,V28,Class,scores,anomaly
0,-0.282401,0.409239,-0.412125,0.689071,-1.672189,3.54938,3.181706,0.578688,0.660969,0.183458,...,-0.754061,-0.018708,0.606893,-0.671096,0.107782,0.11646,0.027057,0,0.272523,1
1,-0.25152,-0.403482,-1.557997,1.445214,-0.70368,-0.619161,-0.351902,-0.171144,1.115511,1.013396,...,-0.202123,0.085877,0.234648,-0.265061,0.303707,0.168222,0.101471,0,0.305407,1
2,0.948788,-0.714987,-1.153532,1.096862,-0.317905,0.210462,-1.043646,0.96925,0.145975,0.029226,...,-1.221043,0.212406,-0.071539,-0.133931,-0.045221,0.095197,0.079633,0,0.282618,1
3,0.230839,0.377507,-0.063657,0.716875,-1.146666,1.619539,-1.418952,1.907872,-0.679603,-1.44579,...,0.808247,-0.255063,-0.606518,-0.222594,0.099614,0.21177,0.045021,0,0.260266,1
4,-0.279466,0.649127,-1.034736,0.58111,1.178578,-0.194626,0.390951,-0.161889,0.245898,-1.489924,...,-0.178392,-0.344198,-0.041099,0.328147,-0.51518,-0.113233,0.118757,0,0.302035,1


In [10]:
data['anomaly'].value_counts()[-1]

493

In [11]:
data[data['anomaly']== -1].head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V4,V5,V6,V7,V8,V11,...,V22,V23,V24,V25,V26,V27,V28,Class,scores,anomaly
560,0.721582,0.796861,-22.859099,-29.336007,7.178809,28.516513,-17.28214,-21.116854,1.455076,1.198064,...,-0.30519,-3.729122,-0.776047,-0.691296,0.257049,3.992898,-0.737841,0,-0.022649,-1
1366,18.569133,0.94093,-18.479164,-15.726967,9.739257,-7.635013,3.553671,4.550106,0.008685,-0.16796,...,-1.706726,-10.117548,1.250139,-2.141659,0.173716,2.670597,-3.095313,0,-0.039953,-1
1372,2.726752,0.2395,-15.417559,-13.275716,4.317739,5.798948,-0.926116,0.314395,-2.408577,2.707393,...,2.269292,8.719703,-1.242309,2.4983,3.155327,5.018986,-2.404684,0,-0.025446,-1
2760,27.639209,-0.562789,-5.902597,2.780776,-2.280813,-7.857015,6.380748,1.368344,-13.334497,1.906948,...,-3.333528,-6.164796,0.469673,-0.794146,-0.578735,3.222753,-0.689999,0,-0.010527,-1
2766,58.24076,0.594368,-6.035851,-13.22483,4.446741,-3.201954,1.307555,5.995386,-1.219453,0.138702,...,-1.947865,-3.729543,0.396558,-1.02225,-0.957093,-0.82128,0.656766,0,-0.012247,-1


All the records with -1 are the anomalies in the data.

In [12]:
data.to_csv('Add_anomaly.csv',index=False)