<a href="https://colab.research.google.com/github/HyoseonKye/anomaly-detector-gearbearing/blob/master/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime at /content/data.
from google.colab import drive
drive.mount('/content/data')

# Libraries used throughout the notebook.
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
import seaborn as sns
# sns.set(color_codes=True)

import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings from the plotting libraries.
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Load the merged bearing dataset from a file uploaded through the browser.
# Alternative (read it from the mounted Drive instead):
#dataset = pd.read_csv('/content/data/My Drive/test/merged_dataset_BearingTest_2.txt',index_col=[0])
import io
from google.colab import files

uploaded = files.upload()  # mapping: uploaded file name -> file content (bytes)
if not uploaded:
    raise ValueError("No file was uploaded; cannot build the dataset.")

# Use the first uploaded file; its first column holds the timestamps that
# become the index.
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
dataset = pd.read_csv(io.BytesIO(uploaded_bytes), index_col=[0])
dataset.head()

In [None]:
# Summary of the loaded frame (columns, dtypes, non-null counts).
dataset.info()

In [None]:
# Parse the index labels as timestamps and order the rows chronologically.
dataset = dataset.set_index(
    pd.to_datetime(dataset.index, format='%Y-%m-%d %H:%M:%S')
).sort_index()

In [None]:
# Write the frame to a CSV-formatted text file in the current working
# directory, then preview the first rows.
dataset.to_csv('merged_dataset_BearingTest_2.txt')
dataset.head()

In [None]:
# Line plot of every column of the dataset against its index.
dataset.plot()
plt.show()

In [None]:
# Split data chronologically into a training period and a test period.
# Label-based slicing includes BOTH endpoints, so the test set must start
# strictly after the last training timestamp; otherwise the boundary row ends
# up in both sets (and later as a duplicated index entry).
train_start = pd.Timestamp('2004-02-12 11:02:39')
train_end = pd.Timestamp('2004-02-13 23:52:39')

dataset_train = dataset.loc[train_start:train_end]
dataset_test = dataset.loc[dataset.index > train_end]

dataset_train.shape, dataset_test.shape

In [None]:
# Line plot of the training period only.
dataset_train.plot(figsize = (12,6))
plt.show()

In [None]:
# MinMaxScaler re-scales each column to the range [0,1]. It is fitted on the
# training split only, then applied to that same split.
scaler = preprocessing.MinMaxScaler()
scaler.fit(dataset_train)

X_train = pd.DataFrame(
    scaler.transform(dataset_train),
    index=dataset_train.index,
    columns=dataset_train.columns,
)

In [None]:
# The training rows are deliberately kept in chronological order: the PCA and
# covariance estimates computed from them do not depend on row order, and the
# time-series plots rely on it. (A bare `X_train.sample(frac=1)` returns a
# shuffled copy and changes nothing unless the result is assigned.)

# Scale the test split with the scaler fitted on the training split.
X_test = pd.DataFrame(scaler.transform(dataset_test),
                      columns=dataset_test.columns,
                      index=dataset_test.index)

In [None]:
# Scaled training features over time.
X_train.plot(figsize = (12,6))
plt.show()

In [None]:

# Scaled test features over time.
X_test.plot(figsize = (12,6))
plt.show()

In [None]:
# PCA
from sklearn.decomposition import PCA

# Fit a two-component projection on the scaled training data and keep the
# original row index on the projected frame.
pca = PCA(n_components=2, svd_solver= 'full')
X_train_PCA = pd.DataFrame(pca.fit_transform(X_train), index=X_train.index)

# Project the test data with the components learned from the training data.
X_test_PCA = pd.DataFrame(pca.transform(X_test), index=X_test.index)

In [None]:
def cov_matrix(data, verbose=False):
    """Estimate the covariance matrix of ``data`` and its inverse.

    Parameters
    ----------
    data : array-like
        Observations in rows, variables in columns (``rowvar=False``).
    verbose : bool, optional
        Unused; kept so existing calls remain valid.

    Returns
    -------
    tuple
        ``(covariance_matrix, inv_covariance_matrix)``.

    Raises
    ------
    ValueError
        If the covariance matrix or its inverse is not positive definite.
        Raising here (instead of printing and returning ``None``) keeps the
        caller's tuple unpacking from failing with an unrelated ``TypeError``.
    """
    # np.cov yields a 0-d array for single-variable data; promote it to 1x1 so
    # the positive-definiteness check and the inversion work uniformly.
    covariance_matrix = np.atleast_2d(np.cov(data, rowvar=False))
    if not is_pos_def(covariance_matrix):
        raise ValueError("Error: Covariance Matrix is not positive definite!")

    inv_covariance_matrix = np.linalg.inv(covariance_matrix)
    if not is_pos_def(inv_covariance_matrix):
        raise ValueError("Error: Inverse of Covariance Matrix is not positive definite!")

    return covariance_matrix, inv_covariance_matrix

In [None]:
# Mahalanobis distance
def MahalanobisDist(inv_cov_matrix, mean_distr, data, verbose=False):
    """Mahalanobis distance of every row of ``data`` from ``mean_distr``.

    Parameters
    ----------
    inv_cov_matrix : array-like, shape (d, d)
        Inverse covariance matrix of the reference distribution.
    mean_distr : array-like, shape (d,)
        Mean of the reference distribution.
    data : array-like, shape (n, d)
        One observation per row (ndarray, nested list or DataFrame).
    verbose : bool, optional
        Unused; kept so existing calls remain valid.

    Returns
    -------
    list
        ``n`` distances, one per row of ``data``.
    """
    # Coerce to plain arrays first: on a DataFrame, `diff[i]` would pick a
    # column rather than row i.
    diff = np.asarray(data, dtype=float) - np.asarray(mean_distr, dtype=float)
    inv_covariance_matrix = np.asarray(inv_cov_matrix, dtype=float)

    # Row-wise quadratic form diff_i^T * inv_cov * diff_i, computed for all
    # rows at once.
    squared = np.sum(diff.dot(inv_covariance_matrix) * diff, axis=1)
    return list(np.sqrt(squared))

In [None]:
# Detecting outliers
def MD_detectOutliers(dist, extreme=False, verbose=False):
    """Return the positions of distances at or above a multiple of the mean.

    The cut-off is the mean of ``dist`` times 3 when ``extreme`` is true and
    times 2 otherwise. ``verbose`` is accepted but not used.

    Returns a NumPy array holding the positions (into ``dist``) of the
    flagged values.
    """
    if extreme:
        multiplier = 3.
    else:
        multiplier = 2.
    cutoff = np.mean(dist) * multiplier

    flagged = [pos for pos in range(len(dist)) if dist[pos] >= cutoff]
    return np.array(flagged)

In [None]:
# Calculate threshold value for classifying datapoint as anomaly:
def MD_threshold(dist, extreme=False, verbose=False):
    """Return the anomaly cut-off: the mean of ``dist`` scaled by a factor.

    The factor is 3 when ``extreme`` is true and 2 otherwise. ``verbose`` is
    accepted but not used.
    """
    if extreme:
        multiplier = 3.
    else:
        multiplier = 2.
    return np.mean(dist) * multiplier

In [None]:
# Check if matrix is positive definite:
def is_pos_def(A):
    """Return True when ``A`` is symmetric and admits a Cholesky factorisation.

    A matrix that is not (numerically) equal to its transpose is rejected
    straight away; otherwise the Cholesky decomposition is attempted and a
    ``LinAlgError`` from it is reported as False.
    """
    if not np.allclose(A, A.T):
        return False
    try:
        np.linalg.cholesky(A)
    except np.linalg.LinAlgError:
        return False
    return True

In [None]:
# Plain NumPy copies of the two principal components for train and test.
data_train, data_test = (
    np.array(frame.values) for frame in (X_train_PCA, X_test_PCA)
)

In [None]:
# Calculate the covariance matrix and its inverse, based on data in the training set:
cov_matrix, inv_cov_matrix  = cov_matrix(data_train)

In [None]:
cov_matrix

In [None]:
# Display the inverse covariance matrix.
inv_cov_matrix

In [None]:
# Column-wise mean of the training data (one value per principal component).
mean_distr = np.mean(data_train, axis=0)
mean_distr

In [None]:
# Mahalanobis distance of every test and training point, both measured
# against the mean and inverse covariance of the training data.
dist_test = MahalanobisDist(inv_cov_matrix, mean_distr, data_test)
dist_train = MahalanobisDist(inv_cov_matrix, mean_distr, data_train)

# Threshold for flagging a datapoint as an anomaly, derived from the training
# distances with the "extreme" factor.
threshold = MD_threshold(dist_train, extreme=True)

In [None]:
# Display the anomaly threshold.
threshold

In [None]:

# Histogram of the squared Mahalanobis distance of the training data.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot` once the seaborn version in use is confirmed.
plt.figure()
ax = sns.distplot(np.square(dist_train), bins=10, kde=False)
ax.set_xlim([0.0, 15])
ax.set_xlabel('Square of the Mahalanobis distance')
plt.show()

In [None]:

# Histogram with a density estimate of the Mahalanobis distance itself.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot` once the seaborn version in use is confirmed.
plt.figure()
ax = sns.distplot(dist_train, bins=10, kde=True, color='green')
ax.set_xlim([0.0, 5])
ax.set_xlabel('Mahalanobis dist')
plt.show()

In [None]:
# Training distances next to the constant threshold, indexed like X_train_PCA.
anomaly_train = pd.DataFrame({'Mob dist': dist_train}, index=X_train_PCA.index)
anomaly_train['Thresh'] = threshold

# If Mob dist above threshold: Flag as anomaly
anomaly_train['Anomaly'] = anomaly_train['Mob dist'].gt(anomaly_train['Thresh'])

In [None]:
# Summary of the training anomaly frame.
anomaly_train.info()

In [None]:
# Count of flagged vs. non-flagged points in the training period.
anomaly_train.Anomaly.value_counts()

In [None]:
# Preview the first rows of the training anomaly frame.
anomaly_train.head()

In [None]:
# Test distances next to the same threshold, indexed like X_test_PCA.
anomaly = pd.DataFrame({'Mob dist': dist_test}, index=X_test_PCA.index)
anomaly['Thresh'] = threshold

# If Mob dist above threshold: Flag as anomaly
anomaly['Anomaly'] = anomaly['Mob dist'].gt(anomaly['Thresh'])
anomaly.head()

In [None]:
# Summary of the test anomaly frame.
anomaly.info()

In [None]:
# Count of flagged vs. non-flagged points in the test period.
anomaly.Anomaly.value_counts()

In [None]:
# Stack the training rows on top of the test rows into one frame.
anomaly_alldata = pd.concat([anomaly_train, anomaly])
# Optional export of the combined frame:
#anomaly_alldata.to_csv('Anomaly_distance.csv')

In [None]:
# A cell renders only its last bare expression, so both ends of the frame are
# displayed explicitly.
from IPython.display import display

display(anomaly_alldata.head(10))
display(anomaly_alldata.tail(10))

In [None]:
# Distance and threshold over the whole period on a logarithmic y axis
# (green and red are the first two line colours).
anomaly_alldata.plot(logy=True, figsize = (10,6), ylim = [1e-1,1e3], color = ['green','red'])
plt.show()