# Introduction

In [59]:
from scipy.io import arff
import pandas as pd

data, meta = arff.loadarff("synth_multidim_100_000.arff")
data = pd.DataFrame(data)

In [60]:
labels = data['class'].astype(int)
labels[labels != 0] = 1
del data['class']

In [61]:
data['labels'] = labels

In [62]:
data['labels'] = 0
data.loc[data.index.isin([50, 121, 350, 572, 669]),'labels'] = 1

In [63]:
# some matplotlib styling
from matplotlib import rc
import matplotlib.pyplot as plt

rc('text', usetex=True)
plt.style.use('seaborn-notebook')
rc('font',**{'family':'serif','serif':['Cambria']})
plt.rcParams['image.cmap'] = 'Reds'
plt.rcParams['patch.linewidth'] = '0.5'
plt.rcParams['patch.edgecolor'] = 'black'
plt.rcParams['axes.titlesize'] = '18'
plt.rcParams['axes.labelsize'] = '18'
plt.rcParams['legend.fontsize'] = '18'
plt.rcParams['xtick.labelsize'] = '12'
plt.rcParams['ytick.labelsize'] = '12'

In [69]:
%matplotlib notebook
from mpl_toolkits.mplot3d import Axes3D

threedee = plt.figure(figsize=(10,7)).gca(projection='3d')
threedee.scatter(data['var_0030'], data['var_0031'], zs=data['var_0032'], 
                 c=data["labels"], edgecolor='black')
threedee.set_xlabel('x', labelpad=10)
threedee.set_ylabel('y', labelpad=10)
threedee.set_zlabel('z', labelpad=10)
plt.savefig("3d-plot-outlier.svg", format="svg")

<IPython.core.display.Javascript object>

In [78]:
import matplotlib.gridspec as gridspec

# Create 1x3 sub plots
gs = gridspec.GridSpec(1, 3, wspace=0.4)

plt.figure(figsize=(10,2.5))

ax = plt.subplot(gs[0, 0]) # row 0, col 0
plt.scatter(data['var_0030'], data['var_0031'], c=data["labels"], 
            edgecolor='black', s=15)
plt.xlabel('x')
plt.ylabel('y')

ax = plt.subplot(gs[0, 1]) # row 0, col 1
plt.scatter(data['var_0031'], data['var_0032'], c=data["labels"],
           edgecolor='black', s=15)
plt.xlabel('y')
plt.ylabel('z')

ax = plt.subplot(gs[0, 2]) # row 0, col 3
plt.scatter(data['var_0030'], data['var_0032'], c=data["labels"],
           edgecolor='black', s=15)
plt.xlabel('x')
plt.ylabel('z')

#plt.savefig("2d-plot-outlier.svg", format="svg", bbox_inches='tight')

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x7f7393a32f60>

In [79]:
from mpl_toolkits.mplot3d import Axes3D

threedee = plt.figure(figsize=(10,7)).gca(projection='3d')
threedee.scatter(data['var_0070'], data['var_0080'], zs=data['var_0015'], 
                 c=data["labels"], edgecolor='black')
threedee.set_xlabel('r', labelpad=10)
threedee.set_ylabel('s', labelpad=10)
threedee.set_zlabel('t', labelpad=10)

plt.savefig("3d-plot-non-outlier.svg", format="svg")

<IPython.core.display.Javascript object>

# Autoencoder 

In [16]:
from scipy.io import arff
import pandas as pd

data, meta = arff.loadarff("synth_multidim_100_000.arff")
data = pd.DataFrame(data)

In [17]:
labels = data['class'].astype(int)
labels[labels != 0] = 1
del data['class']

In [18]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(data)
data_n = pd.DataFrame(np_scaled)
data_n = data_n.astype('float32')

In [19]:
from keras.layers import Input, Dense
from keras.models import Model

# this is the size of our encoded representations
encoding_dim = 80  # 80 floats -> compression of factor 0.8, assuming the input is 100 floats

# this is our input placeholder
input = Input(shape=(100,))
# "encoded" is the encoded representation of the input
encoded = Dense(encoding_dim, activation='relu')(input)
# "decoded" is the lossy reconstruction of the input
decoded = Dense(100, activation='sigmoid')(encoded)

# this model maps an input to its reconstruction
autoencoder = Model(input=input, output=decoded)

Using TensorFlow backend.


In [20]:
# this model maps an input to its encoded representation
encoder = Model(input=input, output=encoded)

In [21]:
# create a placeholder for an encoded (32-dimensional) input
encoded_input = Input(shape=(encoding_dim,))
# retrieve the last layer of the autoencoder model
decoder_layer = autoencoder.layers[-1]
# create the decoder model
decoder = Model(input=encoded_input, output=decoder_layer(encoded_input))

In [22]:
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

In [23]:
# Just compute the distance before learning (show be very bad !)
import numpy as np

encoded = encoder.predict(data_n.values)
decoded = decoder.predict(encoded)
naivedist = np.zeros(len(data_n.values))
for i, x in enumerate(data_n.values):
    naivedist[i] = np.linalg.norm(x-decoded[i])

In [83]:
data['naivedist'] = naivedist

In [24]:
autoencoder.fit(data_n.values, data_n.values,
                nb_epoch=2500,
                batch_size=100,
                shuffle=True,
                verbose=0)

<keras.callbacks.History at 0x7f737c09df28>

# Evaluation

In [25]:
encoded = encoder.predict(data_n.values)
decoded = decoder.predict(encoded)

In [26]:
import numpy as np

dist = np.zeros(len(data_n.values))
for i, x in enumerate(data_n.values):
    dist[i] = np.linalg.norm(x-decoded[i]) # euclidean distance

In [27]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

fpr, tpr, thresholds = roc_curve(labels, dist)
roc_auc = auc(fpr, tpr)

In [28]:
plt.figure(figsize=(12,7))
plt.plot(fpr, tpr, color='red', label='AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlabel('False Positive rate')
plt.ylabel('True Positive rate')
plt.title('ROC Autoencoder 100-80-100 ReLU/Sigmoid synth\_multidim\_100\_000', fontsize=20)
plt.legend(loc="lower right")
plt.show()
plt.savefig("ae-outlier-training-roc.svg", format="svg")

<IPython.core.display.Javascript object>

In [80]:
data['labels'] = labels
data['dist'] = dist

In [81]:
plt.figure(figsize=(10,7))
plt.scatter(data.index, data['dist'], c=data['labels'], edgecolor='black')
plt.xlabel('Index')
plt.ylabel('Score')
plt.xlim((0,1000))
plt.show()
plt.savefig("ae-outlier-training.svg", format="svg")

<IPython.core.display.Javascript object>

In [82]:
import matplotlib.gridspec as gridspec

# Create 1x2 sub plots
gs = gridspec.GridSpec(1, 2)

plt.figure(figsize=(12,5))
ax = plt.subplot(gs[0, 0]) # row 0, col 0
plt.scatter(data.index, data['naivedist'], c=data["labels"], edgecolor='black')
plt.xlabel("Index")
plt.ylabel('Score')
plt.xlim((0,1000))
plt.title("Before learning")

ax = plt.subplot(gs[0, 1]) # row 0, col 1
plt.scatter(data.index, data['dist'], c=data["labels"], edgecolor='black')
plt.xlabel('Index')
plt.ylabel('Score')
plt.xlim((0,1000))
plt.title("After learning")
plt.savefig("ae-outlier-training-comp.svg", format="svg")

<IPython.core.display.Javascript object>

KeyError: 'naivedist'

In [32]:
def compute_error_per_dim(point):
    p = np.array(data_n.iloc[point,:]).reshape(1,100)
    encoded = encoder.predict(p)
    decoded = decoder.predict(encoded)
    return np.array(p - decoded)[0]

plt.figure(figsize=(12,7))
plt.plot(compute_error_per_dim(350))
plt.xlabel('Index')
plt.ylabel('Reconstruction error')
plt.title("Reconstruction error in each dimension of point 350")
plt.savefig("ae-outlier-reconstruction-350.svg", format="svg")

<IPython.core.display.Javascript object>

In [33]:
plt.figure(figsize=(12,7))
plt.plot(compute_error_per_dim(50))
plt.xlabel('Index')
plt.ylabel('Reconstruction error')
plt.title("Reconstruction error in each dimension of point 50")
plt.savefig("ae-outlier-reconstruction-50.svg", format="svg")

<IPython.core.display.Javascript object>

In [34]:
# Return a list of the dimensions with the highest reconstruction error
np.argsort(compute_error_per_dim(50))[::-1]

array([69, 32, 68, 31, 30, 52, 50,  2, 51, 40, 36, 26, 95,  4, 39, 45, 90,
       35, 29, 79, 16, 53, 13, 74, 43, 24, 87, 21,  0, 22, 11, 27, 33, 82,
       12, 81, 75, 42, 23,  7, 25, 84, 44,  3, 10, 46,  8, 98, 96, 94, 34,
       86, 72, 18, 54, 41, 17, 38, 49, 14, 15,  1, 80, 28, 77, 91, 97, 78,
       65, 19, 70, 76, 37, 48, 93, 73, 20, 56, 99, 88, 89, 92, 71, 83, 55,
        5, 47, 85,  6, 63, 67, 66, 57, 61, 58,  9, 59, 64, 60, 62])

In [35]:
# Look at the position of point 50 in subspace [50,51,52]
data['labels'] = 0
data.loc[data.index.isin([50]),'labels'] = 1

from mpl_toolkits.mplot3d import Axes3D
threedee = plt.figure().gca(projection='3d')
threedee.scatter(data['var_0050'], data['var_0051'], zs=data['var_0052'], 
                 c=data["labels"], cmap='Reds')
threedee.set_xlabel('50', labelpad=10)
threedee.set_ylabel('51', labelpad=10)
threedee.set_zlabel('52', labelpad=10)
plt.savefig("3d-plot-non-outlier-50.svg", format="svg")

<IPython.core.display.Javascript object>

In [36]:
# Look at the position of point 50 in subspace [21,22,23]
data['labels'] = 0
data.loc[data.index.isin([50]),'labels'] = 1

from mpl_toolkits.mplot3d import Axes3D
threedee = plt.figure().gca(projection='3d')
threedee.scatter(data['var_0021'], data['var_0022'], zs=data['var_0023'], 
                 c=data["labels"], cmap='Reds')
threedee.set_xlabel('21', labelpad=10)
threedee.set_ylabel('22', labelpad=10)
threedee.set_zlabel('23', labelpad=10)

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x7f7356a1dcf8>

In [37]:
# 50, 121, 350, 572 and 559 are outliers in subspace [30,31,32]
plt.figure(figsize=(12,7))
plt.plot(range(100), compute_error_per_dim(50), label="50")
plt.plot(range(100), compute_error_per_dim(121), label="121")
plt.plot(range(100), compute_error_per_dim(350), label="350")
plt.plot(range(100), compute_error_per_dim(572), label="572")
plt.plot(range(100), compute_error_per_dim(669), label="669")
plt.legend(loc=1)
plt.xlabel('Index')
plt.ylabel('Reconstruction error')
plt.title("Reconstruction error in each dimension of outliers in [30,31,32]")
plt.savefig("ae-outlier-reconstruction-all.svg", format="svg")

<IPython.core.display.Javascript object>