# Novelty detection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 0. Data preparation

### Data load

In [2]:
# Load the artificial dataset from a CSV file under data/.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a
# row index that was written out with the file — confirm whether it is needed.
data = pd.read_csv('data/artificial_data.csv')

In [3]:
data.head(5)

Unnamed: 0,X1,X2,class
0,-0.563773,0.569194,0.0
1,-0.621337,-0.127719,0.0
2,-0.211521,0.27203,0.0
3,0.302166,0.086107,0.0
4,0.011078,-0.091119,0.0


### Data split

In [4]:
# Split the rows by label: class 0 goes to `normal`, class 1 to `abnormal`.
normal = data.loc[data['class'] == 0]
abnormal = data.loc[data['class'] == 1]

# Feature matrix used by every detector and plot in sections 1-4. Those cells
# index it as X[:, 0] / X[:, 1], so it must be a 2-D NumPy array, not a DataFrame.
# NOTE(review): taking all rows (normal + abnormal) is an assumption — use
# normal[['X1', 'X2']] instead if the models should be fitted on clean data only.
X = data[['X1', 'X2']].to_numpy()

## 1. One-class SVM

In [None]:
from sklearn.svm import OneClassSVM

In [None]:
# One-class SVM with an RBF kernel and gamma='auto'; every other setting is
# left at the library default.
ocsvm = OneClassSVM(kernel='rbf', gamma='auto')

In [None]:
ocsvm.fit(X)

In [None]:
result = ocsvm.predict(X)
# print(result)

In [None]:
plt.scatter(X[:, 0], X[:, 1], marker='x', color='b')
plt.scatter(X[result==-1, 0], X[result==-1, 1], marker='o', color='r')

In [None]:
result = ocsvm.decision_function(X)
# print(result)

In [None]:
# Indices of the samples whose decision-function magnitude is at least 40.
# NOTE(review): scikit-learn documents decision_function as positive for
# inliers and negative for outliers, so np.abs() also flags strongly inlying
# points — confirm this is intended (result <= -40 would flag outliers only).
outliers = np.where(np.abs(result) >= 40)

In [None]:
plt.scatter(X[:, 0], X[:, 1], marker='x', color='b')
plt.scatter(X[outliers, 0], X[outliers, 1], marker='o', color='r')

## 2. Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Isolation Forest with all default settings.
# NOTE(review): no random_state is passed, so results presumably vary between
# runs — consider fixing a seed for reproducible plots.
IsForest = IsolationForest()

In [None]:
IsForest.fit(X)

In [None]:
result = IsForest.predict(X)
# print(result)

In [None]:
plt.scatter(X[:, 0], X[:, 1], marker='x', color='b')
plt.scatter(X[result==-1, 0], X[result==-1, 1], marker='o', color='r')

In [None]:
# Score the samples with the Isolation Forest fitted in this section. Using
# the one-class SVM here would simply repeat the scores from section 1.
result = IsForest.decision_function(X)
# Print the score range so that a suitable cutoff can be chosen.
# NOTE(review): these scores are on a different scale from the one-class SVM
# scores, so the fixed cutoff used in the next cell needs to be revisited.
print(max(result))
print(min(result))

In [None]:
# Indices of the samples whose absolute score is at least 40.
# NOTE(review): this is the same cutoff as in the one-class SVM section;
# confirm it suits the score range printed by the previous cell.
outliers = np.where(np.abs(result) >= 40)

In [None]:
plt.scatter(X[:, 0], X[:, 1], marker='x', color='b')
plt.scatter(X[outliers, 0], X[outliers, 1], marker='o', color='r')

## 3. EllipticEnvelope

In [None]:
from sklearn.covariance import EllipticEnvelope

In [None]:
# Elliptic envelope detector with all default settings.
elliptic = EllipticEnvelope()

In [None]:
elliptic.fit(X)

In [None]:
result = elliptic.predict(X)
# print(result)

In [None]:
plt.scatter(X[:, 0], X[:, 1], marker='x', color='b')
plt.scatter(X[result==-1, 0], X[result==-1, 1], marker='o', color='r')

## 4. Gaussian Mixture Model

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
# Gaussian mixture with a single component, i.e. one Gaussian fitted to the data.
gmm = GaussianMixture(n_components=1)

In [None]:
gmm.fit(X)

In [None]:
# Log-likelihood of each sample under the fitted mixture
scores = gmm.score_samples(X)

In [None]:
# Exponentiate the log scores to get each sample's likelihood under the model.
# NOTE: for continuous features this is a density value, so it is not
# guaranteed to lie in [0, 1].
probs = np.exp(scores)

In [None]:
probs

In [None]:
# Samples whose likelihood is at most 0.01 are treated as novelties
outliers = np.where(probs <= 0.01)

In [None]:
plt.scatter(X[:, 0], X[:, 1], marker='x', color='b')
plt.scatter(X[outliers, 0], X[outliers, 1], marker='o', color='r')

In [None]:
# Same rule with a looser cutoff of 0.02, which flags at least as many samples
outliers = np.where(probs <= 0.02)

In [None]:
plt.scatter(X[:, 0], X[:, 1], marker='x', color='b')
plt.scatter(X[outliers, 0], X[outliers, 1], marker='o', color='r')

### Class version

In [None]:
class GaussianMixtureNovelty(GaussianMixture):
    """Gaussian mixture that labels low-likelihood samples as novelties.

    Every constructor argument except ``threshold`` is forwarded unchanged to
    ``GaussianMixture``. ``threshold`` is the likelihood cutoff: a sample whose
    ``exp(score_samples(...))`` value is less than or equal to it is labelled
    as a novelty by ``predict``.
    """

    def __init__(self, threshold, n_components=1, covariance_type='full', tol=1e-3,
                 reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans',
                 weights_init=None, means_init=None, precisions_init=None,
                 random_state=None, warm_start=False,
                 verbose=0, verbose_interval=10):
        # Hand the mixture settings to the parent class by keyword.
        super().__init__(
            n_components=n_components,
            covariance_type=covariance_type,
            tol=tol,
            reg_covar=reg_covar,
            max_iter=max_iter,
            n_init=n_init,
            init_params=init_params,
            weights_init=weights_init,
            means_init=means_init,
            precisions_init=precisions_init,
            random_state=random_state,
            warm_start=warm_start,
            verbose=verbose,
            verbose_interval=verbose_interval,
        )

        # Likelihood cutoff used by predict().
        self.threshold = threshold

    def prob_samples(self, test_data):
        """Return ``exp(score_samples(test_data))`` for each row of ``test_data``."""
        return np.exp(self.score_samples(test_data))

    def predict(self, test_data):
        """Return a float array holding 1 for novelties and 0 for all other rows.

        A row is a novelty when its ``prob_samples`` value is less than or
        equal to ``self.threshold``. This replaces the ``predict`` inherited
        from ``GaussianMixture``.
        """
        is_novelty = self.prob_samples(test_data) <= self.threshold
        labels = np.zeros(len(test_data))
        labels[is_novelty] = 1
        return labels

In [None]:
gmm_novelty = GaussianMixtureNovelty(n_components=1, threshold=0.02)

In [None]:
gmm_novelty.fit(X)

In [None]:
gmm_novelty.prob_samples(X)

In [None]:
predicted_class = gmm_novelty.predict(X)
print(predicted_class)

In [None]:
plt.scatter(X[:, 0], X[:, 1], marker='x', color='b')
plt.scatter(X[predicted_class==1, 0], X[predicted_class==1, 1], marker='o', color='r')

## 5. PCA and kernel PCA

In [None]:
# make_blobs is not imported anywhere else in this notebook, so import it here.
from sklearn.datasets import make_blobs

# Make a 4-dimensional dataset: 999 samples drawn around 2 centers.
centers = [[0, 0, 0, 0], [1, 1, 1, 1]]
X, _ = make_blobs(n_samples=999, centers=centers, cluster_std=0.4, random_state=1234)

# Only the first two of the four dimensions are shown in the scatter plot.
plt.scatter(X[:, 0], X[:, 1], marker='x', color='b')
plt.xlabel('X1')
plt.ylabel('X2')
plt.title('Benchmark dataset with 4 dimensions')
plt.show()

In [None]:
from sklearn.decomposition import PCA, KernelPCA

In [None]:
pca = PCA(n_components=3)

In [None]:
pca.fit(X)

In [None]:
# Share of the total variance explained by each fitted component, followed by
# the sum over every fitted component.
# NOTE(review): the original note here read "first and second PC account for
# 81.6% of the total variance"; with n_components=3 above, the printed sum
# covers three components — confirm which figure is meant.
print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
X_transformed = pca.transform(X)

In [None]:
X_reconstructed = pca.inverse_transform(X_transformed)

In [None]:
print('Shape of original data: ', X.shape)
print('Shape of transformed data: ', X_transformed.shape)
print('Shape of reconstructed data: ', X_reconstructed.shape)

In [None]:
print("<1~5 rows of original data>")
print(X[:5,:])
print("=" * 60)
print("<1~5 rows of reconstructed data>")
print(X_reconstructed[:5,:])

In [None]:
# Per-sample reconstruction error: squared Euclidean distance between each
# original row and its PCA reconstruction.
squared_euclidean = np.square(X - X_reconstructed).sum(axis=-1)

In [None]:
# Indices of the samples whose squared reconstruction error is at least 0.7
outliers = np.where(squared_euclidean >= 0.7)

In [None]:
plt.scatter(X[:, 0], X[:, 1], marker='x', color='b')
plt.scatter(X[outliers, 0], X[outliers, 1], marker='o', color='r')

In [None]:
# RBF kernel PCA with 3 components and gamma=0.1. fit_inverse_transform=True is
# set because the cells below call kpca.inverse_transform to reconstruct samples.
kpca = KernelPCA(n_components=3, kernel='rbf', gamma=0.1, fit_inverse_transform=True)

In [None]:
kpca.fit(X)

In [None]:
X_transformed = kpca.transform(X)

In [None]:
X_reconstructed = kpca.inverse_transform(X_transformed)

In [None]:
print("<1~5 rows of original data>")
print(X[:5,:])
print("=" * 60)
print("<1~5 rows of reconstructed data>")
print(X_reconstructed[:5,:])

In [None]:
# Per-sample reconstruction error: squared Euclidean distance between each
# original row and its kernel PCA reconstruction.
squared_euclidean = np.square(X - X_reconstructed).sum(axis=-1)

In [None]:
# Indices of the samples whose squared reconstruction error is at least 0.7.
# NOTE(review): this is the same cutoff as in the linear PCA cells above;
# confirm it suits the kernel PCA reconstruction errors.
outliers = np.where(squared_euclidean >= 0.7)

In [None]:
plt.scatter(X[:, 0], X[:, 1], marker='x', color='b')
plt.scatter(X[outliers, 0], X[outliers, 1], marker='o', color='r')