In [30]:
import numpy as np
from scipy.io import arff
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import precision_score, recall_score, f1_score

In [122]:
dataset = pd.read_csv('dateset/dataset.csv')
dataset['Outlier_Ratio'] = dataset["Outliers"] / dataset["Instances"]
dataset.sort_values('Instances', inplace = True)
dataset

Unnamed: 0,Name,Instances,Outliers,Attributes,Outlier_Ratio
4,Lymphography,148,6,19,0.040541
10,WPBC,198,47,33,0.237374
1,Glass,214,9,7,0.042056
2,Ionosphere,351,126,32,0.358974
9,WDBC,367,10,30,0.027248
8,WBC,454,10,9,0.022026
6,Shuttle,1013,13,9,0.012833
7,Waveform,3443,100,21,0.029044
5,PenDigits,9868,20,16,0.002027
0,ALOI,50000,1508,27,0.03016


## Glass

In [31]:
data_Glass = arff.loadarff('dateset/Glass/Glass_withoutdupl_norm.arff')
Glass = pd.DataFrame(data_Glass[0])
Glass.drop(columns="id", inplace=True)
Glass["outlier"].replace({b"'no'":1,b"'yes'":-1},inplace=True)
Glass.head()

Unnamed: 0,var_0000,var_0001,var_0002,var_0003,var_0004,var_0005,var_0006,outlier
0,0.467651,0.321584,0.76888,0.24663,0.838799,0.099737,0.29834,1
1,0.496412,0.220491,0.776032,0.316598,0.919973,0.089145,0.279479,1
2,0.519133,0.404464,0.768012,0.334978,0.801622,0.092369,0.271238,1
3,0.19965,0.547373,0.374284,0.362223,0.817017,0.0,0.177913,-1
4,0.847261,0.286361,0.0,0.217792,0.0,0.019135,1.0,1


In [46]:
iforest_Glass = IsolationForest(random_state=0,contamination=0.042)
iforest_Glass.fit(Glass)
pred_Glass_iforest = iforest_Glass.predict(Glass)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [47]:
LOF_Glass = LocalOutlierFactor(contamination=0.042)
pred_Glass_LOF = LOF_Glass.fit_predict(Glass)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [48]:
result_Glass = pd.DataFrame({'iforest':[precision_score(Glass['outlier'],pred_Glass_iforest),
                                       recall_score( Glass['outlier'],pred_Glass_iforest),
                                       f1_score(Glass['outlier'],pred_Glass_iforest)],
                            'LOF':[precision_score(Glass['outlier'],pred_Glass_LOF),
                                   recall_score( Glass['outlier'],pred_Glass_LOF),
                                   f1_score(Glass['outlier'],pred_Glass_LOF)]},index=['precision','recall','f1_score'])
result_Glass

Unnamed: 0,iforest,LOF
precision,0.970732,0.995122
recall,0.970732,0.995122
f1_score,0.970732,0.995122


## ALOI

In [50]:
data_ALOI = arff.loadarff("dateset/ALOI/ALOI_withoutdupl_norm.arff")
ALOI = pd.DataFrame(data_ALOI[0])
ALOI.head()

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att20,att21,att22,att23,att24,att25,att26,att27,outlier,id
0,0.784999,1.8e-05,0.0,9.3e-05,0.0,0.0,0.0,0.0,0.0,0.139811,...,0.001189,0.0,0.115728,0.023449,0.0002,0.00022,0.034952,0.046914,"b""'yes'""",1.0
1,0.958088,0.0,0.0,0.001671,0.0,0.0,0.0,0.0,0.0,0.019556,...,0.0,0.0,0.022035,0.007516,0.0,0.0,0.001033,0.0,"b""'yes'""",2.0
2,0.938768,0.0,0.0,0.005146,0.0,0.0,0.0,0.0,0.0,0.018451,...,0.0,0.0,0.035542,0.011982,0.0,0.001595,0.01952,0.0,"b""'yes'""",3.0
3,0.954775,0.0,0.0,0.001427,0.0,0.0,0.0,0.0,0.0,0.024944,...,0.0,0.0,0.019941,0.000805,0.0,0.0,3.5e-05,0.0,"b""'yes'""",4.0
4,0.933601,0.0,0.0,0.001682,0.0,0.0,0.0,0.0,0.0,0.037002,...,0.0,0.0,0.046759,0.002663,0.0,0.000339,0.001359,0.0,"b""'yes'""",5.0


In [51]:
ALOI.drop(columns="id", inplace=True)
ALOI["outlier"].replace({b"'no'":1,b"'yes'":-1},inplace=True)

In [52]:
iforest_ALOI = IsolationForest(random_state=0,contamination=0.03)
iforest_ALOI.fit(ALOI)
pred_ALOI_iforest = iforest_ALOI.predict(ALOI)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [54]:
LOF_ALOI = LocalOutlierFactor(contamination=0.03)
pred_ALOI_LOF = LOF_ALOI.fit_predict(ALOI)

In [55]:
result_ALOI = pd.DataFrame({'iforest':[precision_score(ALOI['outlier'],pred_ALOI_iforest),
                                       recall_score( ALOI['outlier'],pred_ALOI_iforest),
                                       f1_score(ALOI['outlier'],pred_ALOI_iforest)],
                            'LOF':[precision_score(ALOI['outlier'],pred_ALOI_LOF),
                                   recall_score( ALOI['outlier'],pred_ALOI_LOF),
                                   f1_score(ALOI['outlier'],pred_ALOI_LOF)]},index=['precision','recall','f1_score'])
result_ALOI

Unnamed: 0,iforest,LOF
precision,0.970405,0.970509
recall,0.970849,0.970953
f1_score,0.970627,0.970731


## Ionosphere

In [62]:
data_Ionosphere = arff.loadarff('dateset/Ionosphere/Ionosphere_withoutdupl_norm.arff')
Ionosphere = pd.DataFrame(data_Ionosphere[0])
Ionosphere.drop(columns="id", inplace=True)
Ionosphere["outlier"].replace({b"'no'":1,b"'yes'":-1},inplace=True)
Ionosphere.head()

Unnamed: 0,var_0000,var_0001,var_0002,var_0003,var_0004,var_0005,var_0006,var_0007,var_0008,var_0009,...,var_0023,var_0024,var_0025,var_0026,var_0027,var_0028,var_0029,var_0030,var_0031,outlier
0,0.997695,0.470555,0.926215,0.51153,0.91699,0.31146,1.0,0.5188,0.926215,0.411225,...,0.244145,0.70539,0.26916,0.60633,0.32955,0.711335,0.227565,0.593205,0.2735,1
1,1.0,0.405855,0.965175,0.31922,0.44566,0.032015,1.0,0.477255,0.75437,0.161285,...,0.367155,0.39766,0.407995,0.4048,0.442035,0.41687,0.46856,0.43131,0.487765,-1
2,1.0,0.483175,1.0,0.502425,1.0,0.43969,0.944825,0.50599,0.86541,0.52673,...,0.2989,0.79492,0.389275,0.7155,0.413175,0.80218,0.3791,0.780225,0.30881,1
3,1.0,0.274195,1.0,1.0,0.85608,0.0,0.5,0.5,0.5,0.5,...,0.953475,0.758065,1.0,1.0,0.399505,0.62841,1.0,0.33809,1.0,-1
4,1.0,0.487995,0.9707,0.532655,0.96053,0.383725,0.88576,0.418005,0.76399,0.398625,...,0.17421,0.56645,0.23397,0.512155,0.189015,0.471465,0.202135,0.47696,0.171515,1


In [72]:
iforest_Ionosphere = IsolationForest(random_state=0,contamination=0.35)
iforest_Ionosphere.fit(Ionosphere)
pred_Ionosphere_iforest = iforest_Ionosphere.predict(Ionosphere)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [73]:
LOF_Ionosphere = LocalOutlierFactor(contamination=0.35)
pred_Ionosphere_LOF = LOF_Ionosphere.fit_predict(Ionosphere)

In [74]:
result_Ionosphere = pd.DataFrame({'iforest':[precision_score(Ionosphere['outlier'],pred_Ionosphere_iforest),
                                       recall_score( Ionosphere['outlier'],pred_Ionosphere_iforest),
                                       f1_score(Ionosphere['outlier'],pred_Ionosphere_iforest)],
                            'LOF':[precision_score(Ionosphere['outlier'],pred_Ionosphere_LOF),
                                   recall_score( Ionosphere['outlier'],pred_Ionosphere_LOF),
                                   f1_score(Ionosphere['outlier'],pred_Ionosphere_LOF)]},index=['precision','recall','f1_score'])
result_Ionosphere

Unnamed: 0,iforest,LOF
precision,0.820175,0.72807
recall,0.831111,0.737778
f1_score,0.825607,0.732892


## KDDCup99

In [75]:
data_KDDCup99 = arff.loadarff('dateset/KDDCup99/KDDCup99_withoutdupl_norm_idf.arff')
KDDCup99 = pd.DataFrame(data_KDDCup99[0])
KDDCup99.drop(columns="id", inplace=True)
KDDCup99["outlier"].replace({b"'no'":1,b"'yes'":-1},inplace=True)
KDDCup99.head()

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att32,att33,att34,att35,att36,att37,att38,att39,att40,outlier
0,0.0,0.484582,0.335534,0.0,1.7e-05,2.8e-05,0.0,0.0,0.0,0.0,...,0.996078,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.484582,0.255578,0.0,5e-06,0.0,0.0,0.0,0.0,0.0,...,0.011765,0.3,0.3,0.3,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.484582,0.335534,0.0,1.7e-05,2.8e-05,0.0,0.0,0.0,0.0,...,0.992157,0.99,0.01,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,3.5e-05,3.6e-05,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,3.7e-05,5e-05,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.33,0.07,0.33,0.0,0.0,0.0,1


In [78]:
iforest_KDDCup99 = IsolationForest(random_state=0,contamination=0.004)
iforest_KDDCup99.fit(KDDCup99)
pred_KDDCup99_iforest = iforest_KDDCup99.predict(KDDCup99)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [79]:
LOF_KDDCup99 = LocalOutlierFactor(contamination=0.004)
pred_KDDCup99_LOF = LOF_KDDCup99.fit_predict(KDDCup99)

In [80]:
result_KDDCup99 = pd.DataFrame({'iforest':[precision_score(KDDCup99['outlier'],pred_KDDCup99_iforest),
                                       recall_score( KDDCup99['outlier'],pred_KDDCup99_iforest),
                                       f1_score(KDDCup99['outlier'],pred_KDDCup99_iforest)],
                            'LOF':[precision_score(KDDCup99['outlier'],pred_KDDCup99_LOF),
                                   recall_score( KDDCup99['outlier'],pred_KDDCup99_LOF),
                                   f1_score(KDDCup99['outlier'],pred_KDDCup99_LOF)]},index=['precision','recall','f1_score'])
result_KDDCup99

Unnamed: 0,iforest,LOF
precision,0.997913,0.995826
recall,0.998059,0.995972
f1_score,0.997986,0.995899


## Lymphography

In [81]:
data_Lymphography =arff .loadarff('dateset/Lymphography/Lymphography_withoutdupl_norm_idf.arff')
Lymphography = pd.DataFrame(data_Lymphography[0])
Lymphography.drop(columns="id", inplace=True)
Lymphography["outlier"].replace({b"'no'":1,b"'yes'":-1},inplace=True)
Lymphography.head()

Unnamed: 0,att2,att3,att4,att5,att6,att7,att8,att9,att10,att11,att12,att13,att14,att15,att16,att17,att18,att19,outlier
0,0.10709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.007181,0.434968,0.0,0.0,1.0,0.0,0.285714,1
1,0.201673,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.333333,0.0,0.007181,0.434968,0.0,1.0,1.0,0.0,0.142857,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.066385,0.0,0.434968,0.0,0.0,0.0,0.0,0.857143,1
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.333333,0.0,0.007181,0.229565,0.276929,1.0,1.0,1.0,0.0,1
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.333333,0.0,0.007181,1.0,0.597637,1.0,0.0,0.0,0.142857,1


In [82]:
iforest_Lymphography = IsolationForest(random_state=0,contamination=0.04)
iforest_Lymphography.fit(Lymphography)
pred_Lymphography_iforest = iforest_Lymphography.predict(Lymphography)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [83]:
LOF_Lymphography = LocalOutlierFactor(contamination=0.04)
pred_Lymphography_LOF = LOF_Lymphography.fit_predict(Lymphography)

In [85]:
result_Lymphography = pd.DataFrame({'iforest':[precision_score(Lymphography['outlier'],pred_Lymphography_iforest),
                                       recall_score( Lymphography['outlier'],pred_Lymphography_iforest),
                                       f1_score(Lymphography['outlier'],pred_Lymphography_iforest)],
                            'LOF':[precision_score(Lymphography['outlier'],pred_Lymphography_LOF),
                                   recall_score( Lymphography['outlier'],pred_Lymphography_LOF),
                                   f1_score(Lymphography['outlier'],pred_Lymphography_LOF)]},index=['precision','recall','f1_score'])
result_Lymphography

Unnamed: 0,iforest,LOF
precision,1.0,1.0
recall,1.0,1.0
f1_score,1.0,1.0


## PenDigits

In [86]:
data_PenDigits =arff .loadarff('dateset/PenDigits/PenDigits_withoutdupl_norm_v10.arff')
PenDigits = pd.DataFrame(data_PenDigits[0])
PenDigits.drop(columns="id", inplace=True)
PenDigits["outlier"].replace({b"'no'":1,b"'yes'":-1},inplace=True)
PenDigits.head()

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,att11,att12,att13,att14,att15,att16,outlier
0,45.0,100.0,30.0,79.0,0.0,47.0,8.0,41.0,68.0,38.0,100.0,52.0,96.0,35.0,91.0,0.0,-1
1,42.0,100.0,12.0,74.0,0.0,45.0,65.0,51.0,100.0,76.0,85.0,55.0,62.0,27.0,37.0,0.0,-1
2,26.0,100.0,0.0,83.0,23.0,51.0,83.0,42.0,100.0,55.0,79.0,84.0,56.0,42.0,37.0,0.0,-1
3,9.0,100.0,0.0,72.0,27.0,51.0,100.0,55.0,97.0,86.0,72.0,61.0,53.0,30.0,48.0,0.0,-1
4,47.0,100.0,38.0,94.0,12.0,68.0,0.0,42.0,48.0,39.0,100.0,52.0,80.0,26.0,83.0,0.0,-1


In [90]:
iforest_PenDigits = IsolationForest(random_state=0,contamination=0.002)
iforest_PenDigits.fit(PenDigits)
pred_PenDigits_iforest = iforest_PenDigits.predict(PenDigits)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [91]:
LOF_PenDigits = LocalOutlierFactor(contamination=0.002)
pred_PenDigits_LOF = LOF_PenDigits.fit_predict(PenDigits)

In [92]:
result_PenDigits = pd.DataFrame({'iforest':[precision_score(PenDigits['outlier'],pred_PenDigits_iforest),
                                       recall_score( PenDigits['outlier'],pred_PenDigits_iforest),
                                       f1_score(PenDigits['outlier'],pred_PenDigits_iforest)],
                            'LOF':[precision_score(PenDigits['outlier'],pred_PenDigits_LOF),
                                   recall_score( PenDigits['outlier'],pred_PenDigits_LOF),
                                   f1_score(PenDigits['outlier'],pred_PenDigits_LOF)]},index=['precision','recall','f1_score'])
result_PenDigits

Unnamed: 0,iforest,LOF
precision,0.997969,0.997969
recall,0.997969,0.997969
f1_score,0.997969,0.997969


## Shuttle

In [93]:
data_Shuttle =arff .loadarff('dateset/Shuttle/Shuttle_withoutdupl_norm_v10.arff')
Shuttle = pd.DataFrame(data_Shuttle[0])
Shuttle.drop(columns="id", inplace=True)
Shuttle["outlier"].replace({b"'no'":1,b"'yes'":-1},inplace=True)
Shuttle.head()

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,outlier
0,0.177215,0.296576,0.076923,0.24,0.614035,0.180233,0.433735,0.242718,0.180328,-1
1,0.063291,0.296576,0.076923,0.24,0.614035,0.180233,0.53012,0.242718,0.114754,-1
2,0.164557,0.296576,0.076923,0.24,0.175439,0.180233,0.445783,0.757282,0.606557,-1
3,0.063291,0.296418,0.076923,0.24,0.614035,0.180233,0.542169,0.242718,0.114754,-1
4,0.151899,0.297832,0.076923,0.24,0.175439,0.180233,0.457831,0.757282,0.590164,-1


In [94]:
iforest_Shuttle = IsolationForest(random_state=0,contamination=0.013)
iforest_Shuttle.fit(Shuttle)


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


IsolationForest(bootstrap=False, contamination=0.013, max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=0,
        verbose=0)

In [95]:
pred_Shuttle_iforest = iforest_Shuttle.predict(Shuttle)
LOF_Shuttle = LocalOutlierFactor(contamination=0.013)
pred_Shuttle_LOF = LOF_Shuttle.fit_predict(Shuttle)


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [96]:
result_Shuttle = pd.DataFrame({'iforest':[precision_score(Shuttle['outlier'],pred_Shuttle_iforest),
                                       recall_score( Shuttle['outlier'],pred_Shuttle_iforest),
                                       f1_score(Shuttle['outlier'],pred_Shuttle_iforest)],
                            'LOF':[precision_score(Shuttle['outlier'],pred_Shuttle_LOF),
                                   recall_score( Shuttle['outlier'],pred_Shuttle_LOF),
                                   f1_score(Shuttle['outlier'],pred_Shuttle_LOF)]},index=['precision','recall','f1_score'])
result_Shuttle

Unnamed: 0,iforest,LOF
precision,0.988989,0.998999
recall,0.988,0.998
f1_score,0.988494,0.998499


## Waveform

In [97]:
data_Waveform =arff .loadarff('dateset/Waveform/Waveform_withoutdupl_norm_v10.arff')
Waveform = pd.DataFrame(data_Waveform[0])
Waveform.drop(columns="id", inplace=True)
Waveform["outlier"].replace({b"'no'":1,b"'yes'":-1},inplace=True)
Waveform.head()

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att13,att14,att15,att16,att17,att18,att19,att20,att21,outlier
0,0.298984,0.336158,0.490058,0.508863,0.450902,0.259184,0.534768,0.408451,0.328901,0.395745,...,0.439169,0.566698,0.664286,0.662239,0.753651,0.790456,0.583144,0.543796,0.599732,-1
1,0.525399,0.635593,0.753216,0.495308,0.705411,0.820408,0.840232,0.872359,0.656028,0.548936,...,0.445104,0.338694,0.1875,0.245731,0.326193,0.397303,0.277904,0.278589,0.376171,-1
2,0.285922,0.478814,0.831579,0.479666,0.628257,0.873469,0.678808,0.611796,0.633865,0.618085,...,0.272997,0.243141,0.385714,0.478178,0.298929,0.313278,0.501139,0.564477,0.345382,-1
3,0.288824,0.477401,0.506433,0.327424,0.638277,0.410204,0.562914,0.693662,0.35195,0.440426,...,0.530168,0.512772,0.548214,0.574004,0.483934,0.536307,0.541002,0.399027,0.4083,-1
4,0.390421,0.435028,0.553216,0.427529,0.442886,0.515306,0.405629,0.376761,0.578901,0.301064,...,0.40455,0.721854,0.555357,0.347249,0.608569,0.516598,0.454442,0.521898,0.345382,-1


In [105]:
iforest_Waveform = IsolationForest(random_state=0,contamination=0.03)
iforest_Waveform.fit(Waveform)
pred_Waveform_iforest = iforest_Waveform.predict(Waveform)


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [106]:
LOF_Waveform = LocalOutlierFactor(contamination=0.03)
pred_Waveform_LOF = LOF_Waveform.fit_predict(Waveform)


In [107]:
result_Waveform = pd.DataFrame({'iforest':[precision_score(Waveform['outlier'],pred_Waveform_iforest),
                                       recall_score( Waveform['outlier'],pred_Waveform_iforest),
                                       f1_score(Waveform['outlier'],pred_Waveform_iforest)],
                            'LOF':[precision_score(Waveform['outlier'],pred_Waveform_LOF),
                                   recall_score( Waveform['outlier'],pred_Waveform_LOF),
                                   f1_score(Waveform['outlier'],pred_Waveform_LOF)]},index=['precision','recall','f1_score'])
result_Waveform

Unnamed: 0,iforest,LOF
precision,0.990416,0.970051
recall,0.989231,0.96889
f1_score,0.989823,0.96947


## WBC

In [101]:
data_WBC =arff .loadarff('dateset/WBC/WBC_withoutdupl_norm_v10.arff')
WBC = pd.DataFrame(data_WBC[0])
WBC.drop(columns="id", inplace=True)
WBC["outlier"].replace({b"'no'":1,b"'yes'":-1},inplace=True)
WBC.head()

Unnamed: 0,att2,att3,att4,att5,att6,att7,att8,att9,att10,outlier
0,1.0,0.666667,0.666667,0.333333,0.444444,1.0,0.444444,0.666667,0.142857,-1
1,0.444444,1.0,1.0,0.222222,0.777778,0.0,0.444444,1.0,0.285714,-1
2,0.777778,1.0,1.0,0.777778,0.444444,1.0,0.666667,0.777778,0.0,-1
3,0.444444,0.444444,0.444444,0.111111,0.444444,1.0,0.333333,0.222222,0.0,-1
4,1.0,1.0,1.0,1.0,0.444444,1.0,1.0,1.0,0.857143,-1


In [108]:
iforest_WBC = IsolationForest(random_state=0,contamination=0.022)
iforest_WBC.fit(WBC)
pred_WBC_iforest = iforest_WBC.predict(WBC)


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [109]:
LOF_WBC = LocalOutlierFactor(contamination=0.022)
pred_WBC_LOF = LOF_WBC.fit_predict(WBC)


In [110]:
result_WBC = pd.DataFrame({'iforest':[precision_score(WBC['outlier'],pred_WBC_iforest),
                                       recall_score( WBC['outlier'],pred_WBC_iforest),
                                       f1_score(WBC['outlier'],pred_WBC_iforest)],
                            'LOF':[precision_score(WBC['outlier'],pred_WBC_LOF),
                                   recall_score( WBC['outlier'],pred_WBC_LOF),
                                   f1_score(WBC['outlier'],pred_WBC_LOF)]},index=['precision','recall','f1_score'])
result_WBC

Unnamed: 0,iforest,LOF
precision,0.977064,0.954128
recall,1.0,0.976526
f1_score,0.988399,0.965197


## WDBC

In [111]:
data_WDBC =arff .loadarff('dateset/WDBC/WDBC_withoutdupl_norm_v10.arff')
WDBC = pd.DataFrame(data_WDBC[0])
WDBC.drop(columns="id", inplace=True)
WDBC["outlier"].replace({b"'no'":1,b"'yes'":-1},inplace=True)
WDBC.head()

Unnamed: 0,att3,att4,att5,att6,att7,att8,att9,att10,att11,att12,...,att24,att25,att26,att27,att28,att29,att30,att31,att32,outlier
0,0.570615,0.645643,0.596782,0.440958,0.50167,0.734989,0.409688,0.581848,0.514557,0.308656,...,0.829637,0.819372,0.588446,0.717994,0.924016,0.50599,0.753818,0.616483,0.466324,-1
1,0.541121,0.288797,0.558497,0.403594,0.573892,0.71152,0.411392,0.593285,0.651812,0.360592,...,0.515793,0.63543,0.493973,0.901105,0.665986,0.375719,0.754935,0.505762,0.549202,-1
2,0.690651,0.388797,0.708787,0.562711,0.554934,0.673871,0.464946,0.724734,0.636364,0.259681,...,0.43246,0.688458,0.54501,0.765897,0.579554,0.400799,0.947114,0.386523,0.379056,-1
3,0.910831,0.552697,0.90225,0.839012,0.344407,0.547233,0.356621,0.577327,0.614973,0.105695,...,0.657594,0.875715,0.849604,0.369543,0.517025,0.327316,0.585847,0.531814,0.304851,-1
4,0.642637,0.563485,0.663985,0.521827,0.404351,0.637199,0.440847,0.583311,0.662507,0.235308,...,0.638777,0.653658,0.53198,0.543383,0.695272,0.566054,0.837244,0.816132,0.43827,-1


In [112]:
iforest_WDBC = IsolationForest(random_state=0,contamination=0.027)
iforest_WDBC.fit(WDBC)
pred_WDBC_iforest = iforest_WDBC.predict(WDBC)


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [113]:
LOF_WDBC = LocalOutlierFactor(contamination=0.027)
pred_WDBC_LOF = LOF_WDBC.fit_predict(WDBC)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [114]:
result_WDBC = pd.DataFrame({'iforest':[precision_score(WDBC['outlier'],pred_WDBC_iforest),
                                       recall_score( WDBC['outlier'],pred_WDBC_iforest),
                                       f1_score(WDBC['outlier'],pred_WDBC_iforest)],
                            'LOF':[precision_score(WDBC['outlier'],pred_WDBC_LOF),
                                   recall_score( WDBC['outlier'],pred_WDBC_LOF),
                                   f1_score(WDBC['outlier'],pred_WDBC_LOF)]},index=['precision','recall','f1_score'])
result_WDBC

Unnamed: 0,iforest,LOF
precision,0.991597,1.0
recall,0.991597,1.0
f1_score,0.991597,1.0


## WPBC

In [115]:
data_WPBC =arff .loadarff('dateset/WPBC/WPBC_withoutdupl_norm.arff')
WPBC = pd.DataFrame(data_WPBC[0])
WPBC.drop(columns="id", inplace=True)
WPBC["outlier"].replace({b"'no'":1,b"'yes'":-1},inplace=True)
WPBC.head()

Unnamed: 0,var_0000,var_0001,var_0002,var_0003,var_0004,var_0005,var_0006,var_0007,var_0008,var_0009,...,var_0024,var_0025,var_0026,var_0027,var_0028,var_0029,var_0030,var_0031,var_0032,outlier
0,0.241935,0.434542,0.595848,0.413793,0.344948,0.285673,0.216883,0.210069,0.277738,0.321594,...,0.273322,0.267183,0.140351,0.253067,0.336803,0.2192,0.171127,0.479167,0.185185,1
1,0.483871,0.432698,0.0,0.461887,0.338594,0.622831,0.872621,0.685467,0.700923,0.641455,...,0.44505,0.570687,0.610208,0.600269,0.904711,0.598462,0.418864,0.270833,0.074074,1
2,0.927419,0.640443,0.244291,0.595281,0.535586,0.192026,0.274543,0.252023,0.33993,0.591801,...,0.424431,0.262208,0.291639,0.276976,0.666679,0.545831,0.233701,0.21875,0.0,1
3,0.983871,0.028888,0.346021,0.051543,0.012974,0.96845,0.896363,0.539745,0.469291,0.744226,...,0.017556,0.90902,0.809574,0.578454,0.874479,1.0,0.773711,0.166667,0.0,1
4,0.209677,0.574063,0.137024,0.573503,0.49534,0.363258,0.326927,0.432004,0.464315,0.289261,...,0.314266,0.394413,0.152669,0.328109,0.510926,0.1575,0.142595,0.322917,0.0,-1


In [116]:
iforest_WPBC = IsolationForest(random_state=0,contamination=0.24)
iforest_WPBC.fit(WPBC)
pred_WPBC_iforest = iforest_WPBC.predict(WPBC)


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [117]:
LOF_WPBC = LocalOutlierFactor(contamination=0.24)
pred_WPBC_LOF = LOF_WPBC.fit_predict(WPBC)


In [118]:
result_WPBC = pd.DataFrame({'iforest':[precision_score(WPBC['outlier'],pred_WPBC_iforest),
                                       recall_score( WPBC['outlier'],pred_WPBC_iforest),
                                       f1_score(WPBC['outlier'],pred_WPBC_iforest)],
                            'LOF':[precision_score(WPBC['outlier'],pred_WPBC_LOF),
                                   recall_score( WPBC['outlier'],pred_WPBC_LOF),
                                   f1_score(WPBC['outlier'],pred_WPBC_LOF)]},index=['precision','recall','f1_score'])
result_WPBC

Unnamed: 0,iforest,LOF
precision,0.746667,0.713333
recall,0.741722,0.708609
f1_score,0.744186,0.710963
