In [1]:
import numpy as np 
import pandas as pd 
import os
import numpy as np
from sklearn import metrics

from sklearn.ensemble import IsolationForest
import eif as iso

In [2]:
df=pd.read_csv("/Users/pzha0021/Downloads/datasets/Mammography.csv")
df.head()

Unnamed: 0,attr1,attr2,attr3,attr4,attr5,attr6,class,Unnamed: 7
0,0.23002,5.072578,-0.276061,0.832444,-0.377866,0.480322,0,
1,0.155491,-0.16939,0.670652,-0.859553,-0.377866,-0.945723,0,
2,-0.784415,-0.443654,5.674705,-0.859553,-0.377866,-0.945723,0,
3,0.546088,0.131415,-0.456387,-0.859553,-0.377866,-0.945723,0,
4,-0.102987,-0.394994,-0.140816,0.979703,-0.377866,1.013566,0,


In [3]:
df.drop("Unnamed: 7",1,inplace=True)

In [4]:
df.shape

(11183, 7)

In [5]:
df.head()

Unnamed: 0,attr1,attr2,attr3,attr4,attr5,attr6,class
0,0.23002,5.072578,-0.276061,0.832444,-0.377866,0.480322,0
1,0.155491,-0.16939,0.670652,-0.859553,-0.377866,-0.945723,0
2,-0.784415,-0.443654,5.674705,-0.859553,-0.377866,-0.945723,0
3,0.546088,0.131415,-0.456387,-0.859553,-0.377866,-0.945723,0
4,-0.102987,-0.394994,-0.140816,0.979703,-0.377866,1.013566,0


In [6]:
df['class'].value_counts()

0    10923
1      260
Name: class, dtype: int64

In [7]:
260/11183

0.023249575248144506

There are 11183 data points, 6 features, and about 2.32% anomalies in the Mammography dataset, which are exactly the same as what is presented in the ExtendedIsoForest paper.

### Build an IsolationForest with 100 trees

In [63]:
# parameters

rng = np.random.RandomState(53)
num_trees = 100
sample = 256

In [64]:
clf = IsolationForest(n_estimators = num_trees, max_samples = sample, random_state = rng, contamination = 'auto')
clf.fit(df)

IsolationForest(max_samples=256,
                random_state=RandomState(MT19937) at 0x7F9949250040)

In [65]:
predict_probabilities = clf.decision_function(df)
predict_probabilities

array([-0.05091604,  0.08065439, -0.07795416, ..., -0.18141008,
       -0.07920944, -0.16526467])

In [70]:
predict_label = np.where(predict_probabilities >= 0.5, 1, 0)
predict_label

array([0, 0, 0, ..., 0, 0, 0])

In [71]:
pd.DataFrame(predict_label).value_counts()

0    11183
dtype: int64

The isolation Forest algorithm uses 0 as the score threshold.

In [68]:
y_pred_train = clf.predict(df)
y_pred_train
# classified -1 are anomalous

array([-1,  1, -1, ..., -1, -1, -1])

In [69]:
pd.DataFrame(y_pred_train).value_counts()

 1    9867
-1    1316
dtype: int64

In [60]:
# replace 1 with 0, and -1 with 1
y_pred_train = np.where(y_pred_train== 1, 0, y_pred_train)
y_pred_train = np.where(y_pred_train== -1, 1, y_pred_train)

In [61]:
pd.DataFrame(y_pred_train).value_counts()

0    9867
1    1316
dtype: int64

In [62]:
fpr, tpr, thresholds = metrics.roc_curve(df['class'], y_pred_train, pos_label=1)
metrics.auc(fpr, tpr)

0.9516616314199396

### Built a standard IsolationForest in the EIF package

In [15]:
F0 = iso.iForest(df.to_numpy(), ntrees=num_trees, sample_size=256, ExtensionLevel=0, seed = 53) 

# Extension level 0 is the same as the standard Isolation Forest. 

I can't find the contamination papameter in the iso.iForest()

In [16]:
S0 = F0.compute_paths(X_in=df.to_numpy())

In [17]:
S0

array([0.54466477, 0.37763599, 0.50763809, ..., 0.6823383 , 0.58570142,
       0.66702171])

In [50]:
S0_label = np.where(S0 > 0.5, 1, 0)

In [51]:
pd.DataFrame(S0_label).value_counts()

0    10255
1      928
dtype: int64

In [52]:
fpr, tpr, thresholds = metrics.roc_curve(df['class'], S0_label, pos_label=1)
metrics.auc(fpr, tpr)

0.9694223198754921

AUC is about 0.97.

### Build an Extended Isolation Forest in the EIF package.

Extended Level is N-1

In [22]:
F1 = iso.iForest(df.to_numpy(), ntrees = num_trees, sample_size = 256, ExtensionLevel = df.shape[1]-1,  seed = 53)
S1 = F1.compute_paths(X_in=df.to_numpy())

In [39]:
# use 0.5 as the threshold

S1_label = np.where(S1 > 0.5, 1, 0)

In [40]:
pd.DataFrame(S1_label).value_counts()

0    10162
1     1021
dtype: int64

In [41]:
fpr, tpr, thresholds = metrics.roc_curve(df['class'], S1_label, pos_label=1)
metrics.auc(fpr, tpr)

0.7741866139902394

In [None]:
# calculate precision-recall curve
precision, recall, thresholds = metrics.precision_recall_curve(df['class'], S1_label,pos_label=1)
metrics.precision_recall_curve