In [None]:
# Synthetic dataset
from sklearn.datasets import make_classification
# Data processing
import pandas as pd
import numpy as np
from collections import Counter
# Visualization
import matplotlib.pyplot as plt
# Model and performance
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
# Import the necessary libraries.
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
#!pip install pyod            # normal install
#!pip install --upgrade pyod  # or update if needed
from pyod.models.pca import PCA as PCA_PYOD
from pyod.models.kpca import KPCA as KPCA_PYOD
from scipy.io import loadmat
import os
import seaborn as sns
from sklearn.datasets import load_breast_cancer


### Breast Cancer dataset

In [None]:
# Placeholder containers; `dataset` is rebound to a DataFrame in the next cell.
data, target, dimensionality, dataset = [], [], [], []

# Short tag for the data set, then the scikit-learn loader result itself.
name = 'breast_cancer'
df = load_breast_cancer()

In [None]:
# Feature table with anonymised column names (feature0, feature1, ...).
# The feature matrix lives in the loader result's `.data` attribute; passing the
# whole dict-like object together with `columns=` makes pandas look for keys
# named after the features, so none of the requested columns get populated.
var = pd.DataFrame(data=df.data, columns=df.feature_names)
var.columns = ['feature{}'.format(i) for i in range(0,var.shape[1])]

# Working table: raw feature matrix (integer column labels) plus the class label.
dataset = pd.DataFrame(df.data).assign(target=df.target)
print (dataset.shape,pd.DataFrame(df.target).nunique().tolist()[-1:])

In [None]:
# Preview the first rows of the assembled table (feature columns + `target`).
dataset.head()

In [None]:
# Drop a random 20% of the class-0 rows. The fixed seed keeps the subsample
# (and every number derived from it below) identical across Restart & Run All.
dataset_reduced = dataset.drop(dataset[dataset['target'] == 0].sample(frac=0.20, random_state=42).index)

In [None]:
# Class counts remaining after the subsampling step.
Counter(dataset_reduced['target'])

In [None]:
# Bar chart of how many rows each class has in the reduced table.
sns.set(rc = {'figure.figsize':(8,6)})
g1 = sns.countplot(x="target",data= dataset_reduced, palette="hls")
# Labels describe this data set; the y-axis label was previously never set
# because set_xlabel was called twice.
g1.set_title("Target variable (breast cancer) Distribution", fontsize=15)
g1.set_xlabel("Target")
g1.set_ylabel("Count")
plt.show()

In [None]:
# One box per column of the reduced table, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
box = sns.boxplot(data=dataset_reduced, ax=ax)
tick_labels = box.get_xticklabels()
box.set_xticklabels(tick_labels, rotation=45)  # tilt labels so they do not overlap
fig.subplots_adjust(bottom=0.2)
plt.tight_layout()

In [None]:
# Class 0 is treated as the outlier class and class 1 as the inlier class.
class_counts = Counter(dataset_reduced['target'])
outliers = class_counts[0]
inliers = class_counts[1]

# Contamination is the share of outliers among ALL samples; the previous
# outlier-to-inlier ratio overstated it.
contamination = outliers / (outliers + inliers)
contamination

In [None]:
# Feature columns only: every column except the last one (`target`).
dataset_reduced[dataset_reduced.columns[:-1]]

In [None]:
# Features are every column except the trailing `target`; `target` is the label.
X = dataset_reduced[dataset_reduced.columns[:-1]]
y = dataset_reduced['target']

n_features = X.shape[1]     # number of features (label column excluded)

# Stratified 70/30 split with a fixed seed so the class ratio and the rows are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

X_train_pd = pd.DataFrame(X_train)
X_train_pd.head()

## PCA

In [None]:
#implementation PCA
# PyOD PCA detector keeping 2 components, fitted on the training features only;
# `contamination` is the value computed from the class counts earlier.
pca = PCA_PYOD(n_components=2, contamination=contamination) 
pca.fit(X_train)


In [None]:
# PCA detector outputs.
# Training rows: values stored on the fitted detector.
y_train_scores_pca = pca.decision_scores_  # raw outlier scores for the training data
y_train_pred_pca = pca.labels_  # binary labels (0: inliers, 1: outliers)

# Held-out rows: label and score them with the fitted detector.
y_test_pred_pca = pca.predict(X_test)
y_test_scores_pca = pca.decision_function(X_test)


In [None]:
# Show the hyper-parameters the PCA detector was configured with.
pca.get_params()

In [None]:
# Display the detector's explained-variance attributes side by side
# (absolute values first, then the ratio form).
[pca.explained_variance_,
pca.explained_variance_ratio_]

In [None]:
# Score and label the test data (same calls as in the earlier PCA outputs cell).
y_test_scores_pca = pca.decision_function(X_test)  # outlier scores
y_test_pred_pca = pca.predict(X_test)  # outlier labels (0 or 1)

# Histogram of the *training* outlier scores.
plt.hist(y_train_scores_pca, bins='auto')  # arguments are passed to np.histogram
plt.xlabel('PCA outlier score')
plt.title("Histogram with 'auto' bins")
plt.show()

## Kernel PCA

In [None]:
# KPCA
# PyOD kernel-PCA detector (RBF kernel, gamma=0.15, 2 components) fitted on the
# training features with the same `contamination` value as the PCA detector.
kpca = KPCA_PYOD(n_components=2, kernel='rbf', gamma=0.15, contamination=contamination)
kpca.fit(X_train)

In [None]:
# KPCA detector outputs.
# Training rows: values stored on the fitted detector.
y_train_scores_kpca = kpca.decision_scores_
y_train_pred_kpca = kpca.labels_

# Held-out rows: label and score them with the fitted detector.
y_test_pred_kpca = kpca.predict(X_test)
y_test_scores_kpca = kpca.decision_function(X_test)

In [None]:
# Show the hyper-parameters the KPCA detector was configured with.
kpca.get_params()

In [None]:
# get the prediction on the test data
# (repeats the two test-set calls already made earlier in this section)
y_test_pred_kpca = kpca.predict(X_test)  # outlier labels (0 or 1)
y_test_scores_kpca = kpca.decision_function(X_test)  # outlier scores

In [None]:
# Histogram of the KPCA outlier scores on the training data.
plt.hist(y_train_scores_kpca, bins='auto')  # arguments are passed to np.histogram
plt.xlabel('KPCA outlier score')
plt.title("Histogram with 'auto' bins")
plt.show()

### Test other Kernels

In [None]:
# Define a list of kernel functions and parameters to try
# NOTE: the assignments are wrapped in a string literal, so `kernels` and `gammas`
# are NOT defined when this cell runs; the cell only displays the text.
"""kernels = ['linear', 'poly', 'rbf', 'sigmoid']
gammas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]"""

In [None]:
# Disabled experiment kept as a string literal: a kernel x gamma grid of
# sklearn KernelPCA projections of the training data, one scatter plot per pair.
# NOTE(review): if re-enabled as written, the loop rebinds `kpca`, and the summary
# cell further down still reads `kpca.threshold_` -- use a different name in the loop.
"""# Perform kernelPCA
# Initialize a subplot grid to plot the results
fig, axs = plt.subplots(len(kernels), len(gammas), figsize=(15, 15), constrained_layout=True)

# Loop over the kernels and gammas and perform KPCA
for i, kernel in enumerate(kernels):
    for j, gamma in enumerate(gammas):
        # Initialize the KPCA transformer and fit the data
        kpca = KernelPCA(n_components=2, kernel=kernel, gamma=gamma)
        X_kpca = kpca.fit_transform(X_train)

        # Plot the results
        axs[i, j].scatter(X_kpca[:, 0], X_kpca[:, 1], c=y_train, cmap='viridis', alpha=0.8)
        axs[i, j].set_title(f'{kernel} kernel, gamma={gamma}')

plt.show()"""

## Summary PCA and Kernel PCA

In [None]:
# Score cut-offs stored on each fitted detector; reused by the summary cells below.
threshold_pca, threshold_kpca = pca.threshold_, kpca.threshold_

print("The threshold of the PCA method for the defined comtanimation rate:", threshold_pca)
print("The threshold of the KPCA method for the defined comtanimation rate:", threshold_kpca)

In [None]:
def descriptive_stat_threshold(df,pred_score, threshold):
    """Compare 'Normal' and 'Outlier' rows after thresholding an anomaly score.

    Parameters
    ----------
    df : DataFrame or anything ``pd.DataFrame`` accepts
        One row per sample. The caller's object is never modified.
    pred_score : sequence
        Anomaly score for each row of ``df``.
    threshold : float
        Rows with a score strictly below it are 'Normal'; all others 'Outlier'.

    Returns
    -------
    pandas.DataFrame
        One row per group that occurs, with columns 'Group', 'Count',
        'Count %' and the per-group mean (rounded to 2 decimals) of every
        numeric column, including 'Anomaly_Score'.
    """
    # Explicit copy: pd.DataFrame(df) alone may share storage with the caller's
    # frame, which would leak the two helper columns into it.
    scored = pd.DataFrame(df).copy()
    scored['Anomaly_Score'] = pred_score
    scored['Group'] = np.where(scored['Anomaly_Score'] < threshold, 'Normal', 'Outlier')

    grouped = scored.groupby('Group')

    # Group sizes, absolute and as a percentage of all rows.
    cnt = grouped['Anomaly_Score'].count().reset_index().rename(columns={'Anomaly_Score': 'Count'})
    cnt['Count %'] = (cnt['Count'] / cnt['Count'].sum()) * 100

    # Per-group averages; numeric_only keeps non-numeric columns from raising.
    stat = grouped.mean(numeric_only=True).round(2).reset_index()

    # Put the counts and the averages side by side.
    return cnt.merge(stat, on='Group')

In [None]:
# Group summary of the training rows using the PCA scores and the PCA threshold.
descriptive_stat_threshold(X_train,y_train_scores_pca, threshold_pca)

In [None]:
# Group summary of the training rows using the KPCA scores and the KPCA threshold.
descriptive_stat_threshold(X_train,y_train_scores_kpca, threshold_kpca)

In [None]:
# Test-set cross-tabulation of true class vs. thresholded PCA score.
# NOTE(review): `Pred` is 0 below the threshold and 1 otherwise (outlier), while the
# contamination cell counts target 0 as the outlier class -- the two columns use
# opposite conventions, so read the table with that in mind.
Actual_pred_pca = pd.DataFrame({'Actual': y_test, 'Anomaly_Score': y_test_scores_pca})
is_below_pca = Actual_pred_pca['Anomaly_Score'] < threshold_pca
Actual_pred_pca['Pred'] = np.where(is_below_pca, 0, 1)
pd.crosstab(Actual_pred_pca['Actual'], Actual_pred_pca['Pred'])

In [None]:
# Test-set cross-tabulation of true class vs. thresholded KPCA score.
# NOTE(review): `Pred` is 0 below the threshold and 1 otherwise (outlier), while the
# contamination cell counts target 0 as the outlier class -- the two columns use
# opposite conventions, so read the table with that in mind.
Actual_pred_kpca = pd.DataFrame({'Actual': y_test, 'Anomaly_Score': y_test_scores_kpca})
is_below_kpca = Actual_pred_kpca['Anomaly_Score'] < threshold_kpca
Actual_pred_kpca['Pred'] = np.where(is_below_kpca, 0, 1)
pd.crosstab(Actual_pred_kpca['Actual'], Actual_pred_kpca['Pred'])

In [None]:
# fit PCA on training data
# A separate name is used for the scikit-learn projector so the fitted PyOD
# detector bound to `pca` stays available (re-running earlier cells that read
# `pca.threshold_` or call `pca.decision_function` keeps working).
pca_sk = PCA(n_components=2)
X_train_pca = pca_sk.fit_transform(X_train)

# create a scatter plot of the projected data, coloured by the true class
plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=y_train, cmap='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()


In [None]:
# fit KernelPCA on training data
# A separate name is used for the scikit-learn projector so the fitted PyOD
# detector bound to `kpca` stays available (re-running earlier cells that read
# `kpca.threshold_` or call `kpca.decision_function` keeps working).
kpca_sk = KernelPCA(n_components=2, kernel='rbf', gamma=0.01)
X_train_kpca = kpca_sk.fit_transform(X_train)

# create a scatter plot of the projected data, coloured by the true class
plt.scatter(X_train_kpca[:, 0], X_train_kpca[:, 1], c=y_train, cmap='viridis')
plt.xlabel('Kernel Principal Component 1')
plt.ylabel('Kernel Principal Component 2')
plt.show()

## SVM

In [None]:
# Rebuild features/labels from the reduced table and redo the stratified 70/30
# split with the same seed as the split used for the PCA section.
feature_cols = dataset_reduced.columns[:-1]
X = dataset_reduced[feature_cols]
y = dataset_reduced['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

X_train_pd = pd.DataFrame(X_train)
X_train_pd.head()

In [None]:
# Train the one class support vector machine (SVM) model
# nu=0.01 is set here directly; it is not derived from the `contamination`
# value that the PCA/KPCA detectors were given.
one_class_svm = OneClassSVM(nu=0.01, kernel = 'rbf', gamma = 'auto').fit(X_train)

In [None]:
# Predict the anomalies
# Raw model output for the test rows; it is remapped to the dataset's label
# values in the next cell before being compared with `y_test`.
prediction = one_class_svm.predict(X_test)

In [None]:
# Change the anomalies' values to make it consistent with the true values.
# OneClassSVM.predict returns -1 for outliers and +1 for inliers, whereas the
# dataset uses 0 for the outlier class and 1 for the inlier class, so -1 -> 0
# and +1 -> 1. Mapping straight from the model output (rather than from
# `prediction` itself) keeps this cell safe to re-run.
prediction = [0 if i == -1 else 1 for i in one_class_svm.predict(X_test)]

In [None]:
# Check the model performance
# Per-class precision/recall/F1 of the remapped predictions against `y_test`.
print(classification_report(y_test, prediction))

In [None]:
# Overall fraction of test rows whose remapped prediction equals the true label.
print("The accuracy score predicted is {}".format(accuracy_score(y_test, prediction)))

In [None]:
# Get the scores for the testing dataset
# The threshold cells below treat the lowest scores as the outliers.
score = one_class_svm.score_samples(X_test)
score

In [None]:
# Check the score for 2% of outliers
# i.e. the 2nd percentile of the test scores: 2% of the test rows score below it.
score_threshold = np.percentile(score, 2)
score_threshold

In [None]:
# Check the score for 2% of outliers
# Same computation as the previous cell, printed rounded to two decimals.
score_threshold = np.percentile(score, 2)
print(f'The customized score threshold for 2% of outliers is {score_threshold:.2f}')

In [None]:
# Check the model performance at 2% threshold
# Scores below the cut-off are flagged as outliers. The dataset encodes the
# outlier class as 0 and the inlier class as 1, so flagged rows get 0 -- using
# 1 for them would compare against inverted labels in the report.
customized_prediction = [0 if i < score_threshold else 1 for i in score]
# Check the prediction performance
print(classification_report(y_test, customized_prediction))