# Anomaly Detection
- David Atkins

### Data Load / Prep

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

from sklearn import set_config
set_config(transform_output='pandas')
pd.set_option('display.max_columns', 100)

In [2]:
df = pd.read_csv('data/credit_card.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 29 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      10000 non-null  float64
 1   V2      10000 non-null  float64
 2   V3      10000 non-null  float64
 3   V4      10000 non-null  float64
 4   V5      10000 non-null  float64
 5   V6      10000 non-null  float64
 6   V7      10000 non-null  float64
 7   V8      10000 non-null  float64
 8   V9      10000 non-null  float64
 9   V10     10000 non-null  float64
 10  V11     10000 non-null  float64
 11  V12     10000 non-null  float64
 12  V13     10000 non-null  float64
 13  V14     10000 non-null  float64
 14  V15     10000 non-null  float64
 15  V16     10000 non-null  float64
 16  V17     10000 non-null  float64
 17  V18     10000 non-null  float64
 18  V19     10000 non-null  float64
 19  V20     10000 non-null  float64
 20  V21     10000 non-null  float64
 21  V22     10000 non-null  float64
 22 

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [3]:
df.isna().sum().sum()

0

In [4]:
df.duplicated().sum()

0

### KMeans

In [5]:
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)
kmeans = KMeans(n_clusters=3, n_init= 10, random_state=42)
kmeans.fit(scaled_df)

In [6]:
# Making a dataframe copy of X for saving clusters 
df = scaled_df.copy()
df['cluster'] = kmeans.predict(scaled_df)
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,cluster
0,-0.726092,-0.270865,1.38998,0.762227,-0.236899,0.264052,0.285346,0.131083,-0.384443,0.28612,-1.133169,0.446834,-1.535047,-0.821518,1.645946,-0.517249,-0.13325,0.057508,0.58559,0.370795,0.043066,0.689788,-0.163478,0.083851,0.099198,-0.532722,0.312929,-0.090009,0.459472,0
1,0.94981,-0.01302,-0.647336,0.11071,0.102482,-0.155239,-0.009963,0.120253,-0.915428,0.051569,0.722785,1.551943,-0.315973,-0.686699,0.798096,0.541269,-0.466837,-0.205886,-0.093585,-0.167706,-0.18551,-0.760673,0.268858,-0.603364,0.188884,0.033761,-0.039088,0.0435,-0.333835,0
2,-0.725138,-1.235035,0.733929,0.06281,-0.377373,1.293991,0.797185,0.249752,-1.995239,0.392444,-0.124638,0.895884,-0.128068,-0.704557,2.539689,-3.258583,0.798958,-0.127804,-2.707713,0.830447,0.336466,1.471338,1.918734,-1.193707,-0.959876,-0.442774,-0.153601,-0.234422,1.69611,0
3,-0.467623,-0.356408,0.750937,-0.80803,0.042565,0.868121,0.283501,0.353112,-1.885794,0.153501,-0.854378,0.96952,-0.300606,-0.802796,-0.492057,-1.184825,-1.055192,2.500622,-1.43623,-0.401179,-0.056083,0.258431,-0.326493,-2.015264,1.303733,-0.591732,0.137995,0.217896,0.318445,0
4,-0.593701,0.452248,0.54094,0.079101,-0.295578,-0.018016,0.613062,-0.163028,0.004829,0.888749,-1.365765,1.20588,0.389505,-1.472671,0.329236,-0.495778,-0.593159,-0.023071,1.079111,0.634808,0.052846,1.513434,-0.218569,0.209441,-0.677493,0.71064,0.524976,0.791442,0.029532,0


In [7]:
df['cluster'].value_counts()

1    5648
0    3681
2     671
Name: cluster, dtype: int64

In [8]:
distances = cdist(scaled_df, kmeans.cluster_centers_, 'euclidean')
distances.shape

(10000, 3)

In [9]:
scaled_df.shape

(10000, 29)

In [10]:
cluter_cols = [f"Distance (Cluster {c})" for  c in range(len(kmeans.cluster_centers_))]
distance_df = pd.DataFrame(distances, columns = cluter_cols)
distance_df.head(3)

Unnamed: 0,Distance (Cluster 0),Distance (Cluster 1),Distance (Cluster 2)
0,2.804682,4.393264,5.857215
1,2.42643,3.686087,5.075081
2,6.542622,7.337378,8.656002


In [11]:
# Get the minimum distance to any cluster for each point
min_distances = np.min(distances, axis=1)
# Display first 5 values
min_distances[:5]

array([2.80468161, 2.42642963, 6.5426218 , 4.74252086, 3.24973107])

In [12]:
threshold = np.percentile(min_distances, 99.6)
threshold

20.909887868001256

In [13]:
filter_anomalies = min_distances > threshold
filter_anomalies.sum()

40

In [14]:
df['anomaly-kmeans'] = filter_anomalies
df.head(3)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,cluster,anomaly-kmeans
0,-0.726092,-0.270865,1.38998,0.762227,-0.236899,0.264052,0.285346,0.131083,-0.384443,0.28612,-1.133169,0.446834,-1.535047,-0.821518,1.645946,-0.517249,-0.13325,0.057508,0.58559,0.370795,0.043066,0.689788,-0.163478,0.083851,0.099198,-0.532722,0.312929,-0.090009,0.459472,0,False
1,0.94981,-0.01302,-0.647336,0.11071,0.102482,-0.155239,-0.009963,0.120253,-0.915428,0.051569,0.722785,1.551943,-0.315973,-0.686699,0.798096,0.541269,-0.466837,-0.205886,-0.093585,-0.167706,-0.18551,-0.760673,0.268858,-0.603364,0.188884,0.033761,-0.039088,0.0435,-0.333835,0,False
2,-0.725138,-1.235035,0.733929,0.06281,-0.377373,1.293991,0.797185,0.249752,-1.995239,0.392444,-0.124638,0.895884,-0.128068,-0.704557,2.539689,-3.258583,0.798958,-0.127804,-2.707713,0.830447,0.336466,1.471338,1.918734,-1.193707,-0.959876,-0.442774,-0.153601,-0.234422,1.69611,0,False


In [15]:
idx_anomalies = scaled_df[filter_anomalies].index
idx_anomalies

Int64Index([ 159, 1376, 1619, 2156, 2212, 2439, 2594, 2654, 2756, 2911, 2914,
            2917, 2923, 3443, 4779, 5303, 5412, 5413, 5529, 5674, 5704, 5764,
            6489, 6643, 6672, 7322, 7338, 7470, 7596, 7597, 8124, 8163, 8437,
            8442, 8856, 8939, 8999, 9071, 9304, 9326],
           dtype='int64')

In [16]:
scaled_df.iloc[idx_anomalies]

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
159,-3.83496,-9.431468,-5.686284,2.104654,-1.153193,3.584223,4.52282,-0.129306,-1.147362,-1.366035,-0.971285,1.040386,-0.31701,0.443842,0.823397,0.531465,-0.862498,1.022771,-1.122225,12.960326,2.518824,-3.332963,-7.192555,2.023902,-1.141962,-1.367622,-1.773317,2.499616,20.320071
1376,-2.418627,-3.007869,-2.040269,1.407855,-8.255103,4.877629,10.065097,-1.668099,-0.900585,-0.710429,-0.933227,-0.448552,-1.440443,-1.453373,1.303724,1.877587,-1.230779,-0.183323,-0.915106,-3.75428,-1.015485,1.76492,5.0476,0.394163,0.49937,1.889565,3.994384,-6.126389,12.89069
1619,-7.15007,-7.528445,-11.441444,4.009433,-27.290908,16.374319,31.878466,-5.938228,-2.347745,-2.19564,2.51453,-0.35769,1.279856,-1.902074,3.852449,4.646881,-2.058505,-1.093783,1.015847,-19.791952,-5.125945,2.41208,-5.911458,1.395896,1.53332,1.248504,9.496001,15.504724,41.292747
2156,-3.674689,-9.158175,-3.656015,2.370064,-3.748184,0.415459,4.262012,-0.732546,-0.492485,-1.483717,-1.212926,0.933604,-0.708696,0.116644,1.129175,0.712448,-0.406408,0.674243,-1.411297,12.238043,2.443958,-2.902722,-6.553469,1.000686,-2.487275,-0.03134,-1.778584,2.425582,19.461379
2212,-3.215208,3.026652,-1.687193,-2.215652,-0.775564,1.476574,-4.085387,-11.810305,0.773002,0.285954,0.584726,2.037797,-2.106236,0.48264,0.478011,-0.855557,0.894265,0.26658,0.336464,-5.487786,16.278911,-6.764131,4.300451,-0.071619,-0.10071,-0.275131,1.512629,1.061451,-0.326762
2439,-7.824911,-12.184466,-1.114101,2.453688,9.132442,-5.838093,-6.342982,1.306016,-0.64652,-0.180378,0.226978,2.269757,-0.401795,0.707069,-0.007378,-0.430741,0.179731,-0.494607,-1.26991,7.457408,1.892358,-1.190891,0.083439,-0.193509,-0.96516,0.213631,0.481461,-8.850109,0.299547
2594,-1.524177,-2.246208,-3.43837,1.228373,-9.67575,5.977763,11.396173,-1.403798,-1.962029,-1.585241,0.016495,0.746917,0.873895,-0.755322,1.880003,1.633457,-1.608488,1.988047,2.116557,1.340186,-0.064103,0.668031,4.593911,-1.331986,0.138431,-0.625159,2.89828,-2.338725,15.363209
2654,-4.789278,2.105598,-1.241513,0.500971,0.085842,2.026108,-5.749739,-15.222733,0.203976,0.678419,-0.916013,1.914899,-1.816517,-0.010251,-0.301824,-0.501598,1.207208,-1.025966,-0.66256,-6.412602,10.683389,-4.582514,5.469079,0.69604,0.498896,-0.683721,3.195176,-2.40501,-0.186922
2756,-6.913533,-7.777855,0.673923,1.964941,6.806898,-3.19586,-8.2336,-4.009462,1.273875,-0.009641,-1.048503,2.254194,-0.511892,-1.097861,-0.253083,0.208405,0.393092,-2.235378,-0.330877,-8.166138,-4.784251,2.834458,-1.390803,1.062015,-1.474357,1.074814,8.311982,-4.891511,0.110575
2911,-2.686228,3.031804,-2.148883,-0.01232,-1.157837,1.912165,-10.291944,-18.772044,-2.282244,-4.544346,-0.981103,3.220752,-1.636125,2.643913,0.917435,-0.571699,1.943065,-0.10103,0.962689,9.779945,-12.357685,6.045469,-0.347143,1.526074,1.239905,-0.878097,-0.345087,2.003279,-0.34296


### Isolation Forest

In [17]:
iso_004 = IsolationForest(contamination=0.004, random_state = 42)
# fit the model using .values to avoid a warning
iso_004.fit(scaled_df.values)

In [18]:
predictions = iso_004.predict(scaled_df.values)
predictions[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [19]:
predictions[predictions ==1] = 0
predictions[predictions ==-1] = 1
predictions[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [20]:
df['anomaly-iso_004'] = predictions
df.head(3)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,cluster,anomaly-kmeans,anomaly-iso_004
0,-0.726092,-0.270865,1.38998,0.762227,-0.236899,0.264052,0.285346,0.131083,-0.384443,0.28612,-1.133169,0.446834,-1.535047,-0.821518,1.645946,-0.517249,-0.13325,0.057508,0.58559,0.370795,0.043066,0.689788,-0.163478,0.083851,0.099198,-0.532722,0.312929,-0.090009,0.459472,0,False,0
1,0.94981,-0.01302,-0.647336,0.11071,0.102482,-0.155239,-0.009963,0.120253,-0.915428,0.051569,0.722785,1.551943,-0.315973,-0.686699,0.798096,0.541269,-0.466837,-0.205886,-0.093585,-0.167706,-0.18551,-0.760673,0.268858,-0.603364,0.188884,0.033761,-0.039088,0.0435,-0.333835,0,False,0
2,-0.725138,-1.235035,0.733929,0.06281,-0.377373,1.293991,0.797185,0.249752,-1.995239,0.392444,-0.124638,0.895884,-0.128068,-0.704557,2.539689,-3.258583,0.798958,-0.127804,-2.707713,0.830447,0.336466,1.471338,1.918734,-1.193707,-0.959876,-0.442774,-0.153601,-0.234422,1.69611,0,False,0


### Compare

In [21]:
kmeans_anomalies = df['anomaly-kmeans'].sum()
print(f'Kmeans (threshold = 0.996) identified {kmeans_anomalies} anomalies.')
isoforest_anomalies = df['anomaly-iso_004'].sum()
print(f'Isoforest (contamination = 0.004) identified {isoforest_anomalies} anomalies.')

Kmeans (threshold = 0.996) identified 40 anomalies.
Isoforest (contamination = 0.004) identified 40 anomalies.


In [22]:
both = [a for a in isoforest_anomalies if a in kmeans_anomalies]

TypeError: 'numpy.int64' object is not iterable