## Example of preprocessing datasets for evaluating outlier detection algorithms

In [97]:
import time
import scipy
import numpy as np
import pandas as pd
import openml as oml

In [91]:
# Fetch OpenML dataset 772 and split it into the feature matrix `X` and the
# target vector `y`, using the dataset's own default target attribute.
data = oml.datasets.get_dataset(772)
target_name = data.default_target_attribute
X, y = data.get_data(target=target_name)

In [92]:
# Report the (rows, columns) of the feature matrix, then show the matrix itself.
print(np.shape(X))
X

(2178, 3)


array([[ 33.  , -52.26,  28.3 ],
       [ 36.  ,  45.53, 150.93],
       [ 57.  ,  41.85, 142.78],
       ...,
       [527.  ,  27.87, 139.51],
       [ 51.  ,  36.43, 140.98],
       [ 41.  ,  24.02, 122.23]], dtype=float32)

In [93]:
# Show the number of target labels, then the label vector itself.
# A bare `y.shape[0]` on a non-final line is evaluated and discarded by the
# notebook (only the last expression is displayed), so it is printed explicitly.
print(y.shape[0])
y

array([1, 0, 0, ..., 0, 1, 0])

In [94]:
# Turn the 1-D label vector into a single column so it can be concatenated
# next to the feature matrix. `-1` lets NumPy infer the row count instead of
# hard-coding this dataset's 2178 rows; the cell is also safe to re-run,
# because reshaping an (n, 1) array to (-1, 1) leaves it unchanged.
y = y.reshape(-1, 1)
y.shape

(2178, 1)

In [95]:
# Append the label column to the right of the features: the last column of
# `Xy` is the target. The displayed output shows X as float32 and the result
# with extra float digits (e.g. -52.25999832), i.e. the values were promoted
# to a wider float type by the concatenation.
Xy = np.concatenate((X, y), axis=1)
# A bare `Xy.shape` on a non-final line is never displayed, so print it.
print(Xy.shape)
Xy

array([[ 33.        , -52.25999832,  28.29999924,   1.        ],
       [ 36.        ,  45.52999878, 150.92999268,   0.        ],
       [ 57.        ,  41.84999847, 142.77999878,   0.        ],
       ...,
       [527.        ,  27.87000084, 139.50999451,   0.        ],
       [ 51.        ,  36.43000031, 140.97999573,   1.        ],
       [ 41.        ,  24.02000046, 122.23000336,   0.        ]])

In [96]:
# Wrap the combined feature/label array in a DataFrame (default integer
# column names) and preview the first five rows.
df_ori = pd.DataFrame(data=Xy)
df_ori.head(n=5)

Unnamed: 0,0,1,2,3
0,33.0,-52.259998,28.299999,1.0
1,36.0,45.529999,150.929993,0.0
2,57.0,41.849998,142.779999,0.0
3,67.0,29.190001,141.149994,1.0
4,30.0,-21.66,169.809998,1.0


In [90]:
# Persist the combined features + label frame as a bare CSV: no header row
# and no index column, so the file contains only the numeric values.
# NOTE(review): this writes the 4-column frame built from OpenML dataset 772
# to 'yeast.csv', while a later cell reads 'yeast.csv' and shows 9 columns.
# On a fresh top-to-bottom run this file would overwrite that input; confirm
# the intended file name for this dataset.
df_ori.to_csv(
    'yeast.csv',
    sep=',',
    index=False,
    header=False,
)

In [75]:
# Load the header-less, comma-separated file into a float array, then wrap it
# in a DataFrame (default integer column names) and preview the first rows.
# NOTE(review): the displayed output has 9 columns (0-8), with column 8
# holding the class label used by the following cells; confirm that the file
# on disk is the 9-column one.
data_anomaly = np.genfromtxt(fname='yeast' + '.csv', delimiter=',')
df_anomaly = pd.DataFrame(data=data_anomaly)
df_anomaly.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,2.0
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,2.0
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,2.0
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,1.0
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,2.0


In [77]:
# Class distribution of the label column (column 8), most frequent first.
# Uses the Series method: the top-level `pd.value_counts` is deprecated in
# recent pandas releases.
df_anomaly[8].value_counts()

0.0    463
1.0    429
2.0    244
3.0    163
4.0     51
5.0     44
6.0     35
7.0     30
8.0     20
9.0      5
Name: 8, dtype: int64

In [78]:
# Binarize the class label in column 8: class 9 (the rarest class, 5 rows in
# the value counts) becomes the anomaly label 1, every other class becomes 0.
#
# The labels are computed from the raw `data_anomaly` array rather than from
# the column being overwritten, so re-running this cell gives the same result.
# Re-applying a {0..8: 0, 9: 1} map to its own 0/1 output would turn the
# anomalies' 1 back into 0 and silently erase every anomaly.
# Unlike a dict-based `.map`, a label outside the known classes becomes 0
# here rather than NaN.
ANOMALY_CLASS = 9
df_anomaly[8] = (data_anomaly[:, 8] == ANOMALY_CLASS).astype('int64')
df_anomaly[8].value_counts()

0    1479
1       5
Name: 8, dtype: int64

In [79]:
# Preview the relabelled frame; show only the first rows instead of dumping
# the whole DataFrame into the notebook output.
df_anomaly.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,0
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,0
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,0
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,0
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,0
5,0.51,0.40,0.56,0.17,0.5,0.5,0.49,0.22,0
6,0.50,0.54,0.48,0.65,0.5,0.0,0.53,0.22,0
7,0.48,0.45,0.59,0.20,0.5,0.0,0.58,0.34,0
8,0.55,0.50,0.66,0.36,0.5,0.0,0.49,0.22,0
9,0.40,0.39,0.60,0.15,0.5,0.0,0.58,0.30,0


In [81]:
# Write the frame with the binarized label column (column 8) as a bare CSV:
# comma-separated, no header row and no index column.
df_anomaly.to_csv(
    'yeast_anomaly.csv',
    sep=',',
    index=False,
    header=False,
)