<div style="direction:rtl;text-align:center"><img src="https://dl.mohammadkh.ir/logo.png" alt="Mohammadkh.ir" style="width: 250px;"/></div>
<h1><div style="direction:rtl;text-align:center">Clustering</div></h1>

In [1]:
import random
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from pyod.models import abod

In [2]:
n = 284807 - 1 #number of records in file (excludes header)
s = 10000 #desired sample size
skip = sorted(random.sample(range(1,n+1),n-s)) #the 0-indexed header will not be included in the skip list

data = pd.read_csv('../__data/creditcard.csv', skiprows=skip)
X = data.drop('Class', axis=1)
print(str(X.shape) + '\n')
print(np.unique(data['Class'], return_counts=True))
print('\nInbalance ratio (class0/class1) : ' + str(492/284315)+'\n')

data.head(2)

(10001, 30)

(array([0, 1], dtype=int64), array([9985,   16], dtype=int64))

Inbalance ratio (class0/class1) : 0.0017304750013189597



Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,10,0.384978,0.616109,-0.8743,-0.094019,2.924584,3.317027,0.470455,0.538247,-0.558895,...,0.049924,0.238422,0.00913,0.99671,-0.767315,-0.492208,0.042472,-0.054337,9.99,0
1,23,1.322707,-0.174041,0.434555,0.576038,-0.836758,-0.831083,-0.264905,-0.220982,-1.071425,...,-0.284376,-0.323357,-0.03771,0.347151,0.559639,-0.280158,0.042335,0.028822,16.0,0


# LocalOutlierFactor

In [3]:
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.002, n_jobs=-1)
y_pred = lof.fit_predict(X)

# convert label
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

print(np.unique(y_pred, return_counts=True))

print(metrics.classification_report(y_true=data['Class'], y_pred=y_pred))

(array([0, 1]), array([9981,   20], dtype=int64))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9986
           1       0.00      0.00      0.00        15

    accuracy                           1.00     10001
   macro avg       0.50      0.50      0.50     10001
weighted avg       1.00      1.00      1.00     10001



# IsolationForest

In [4]:
IF = IsolationForest(contamination=0.005)
y_pred = IF.fit_predict(X)

# convert label
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

print(np.unique(y_pred, return_counts=True))

print(metrics.classification_report(y_true=data['Class'], y_pred=y_pred))



(array([0, 1]), array([9951,   50], dtype=int64))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9986
           1       0.12      0.40      0.18        15

    accuracy                           0.99     10001
   macro avg       0.56      0.70      0.59     10001
weighted avg       1.00      0.99      1.00     10001



# ABOD

In [5]:
abod_od = abod.ABOD(n_neighbors=20, contamination=0.002)
y_pred = abod_od.fit_predict(X)

print(np.unique(y_pred, return_counts=True))

print(metrics.classification_report(y_true=data['Class'], y_pred=y_pred))



(array([0, 1]), array([9981,   20], dtype=int64))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9986
           1       0.05      0.07      0.06        15

    accuracy                           1.00     10001
   macro avg       0.52      0.53      0.53     10001
weighted avg       1.00      1.00      1.00     10001



<div class="alert alert-block alert-info">
<div style="direction:rtl;text-align:left"><strong>Clustering</strong><br>MohammadReza <strong>Khajedaloi</strong><br><br>
</div>
<div style="direction:rtl;text-align:right">
<a href="http://mohammadkh.ir/">WebSite</a> - <a href="https://github.com/khajedaloi/">GitHub</a> - <a href="https://www.linkedin.com/in/mohammad-kh/">Linkedin</a>
</div>
</div>