<a href="https://colab.research.google.com/github/LGCilento/Experimento-03-AM/blob/master/one_class_classification_algorithms_kc1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1><b>One-class-classification-algorithms</b></h1>

<h2>Data-Preparation</h2>

In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from scipy.io import arff
import pandas as pd
from math import floor
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt  
from sklearn.datasets import make_classification
from sklearn.metrics import ConfusionMatrixDisplay
import numpy as np


In [3]:
data = arff.loadarff("kc1.arff")
df = pd.DataFrame(data[0])
df.rename(columns = {'defects': 'problems'}, inplace = True)
df['problems'] = df['problems'].apply(lambda x: x.decode("utf-8"))
df['problems'] = df['problems'].map({"false": 0, "true": 1})
df['problems']

0       0
1       1
2       1
3       1
4       1
       ..
2104    0
2105    0
2106    0
2107    0
2108    0
Name: problems, Length: 2109, dtype: int64

In [4]:
def split_db_pos_neg(df):
  df = shuffle(df)
  negative_database = df.loc[df['problems'] <= 0]
  positive_database = df.loc[df['problems'] > 0]
  return negative_database,positive_database

train Base at 30%:


In [5]:
def train_test_30(negative_database,positive_database,train_array_size):
  train_30 = negative_database[0:train_array_sizes[0]]
  test_30 = pd.concat([negative_database[train_array_sizes[0]:],positive_database])
  train_y_30 = train_30['problems'].values
  train_x_30 = train_30.drop(columns=['problems']).values
  test_y_30 = test_30['problems'].values
  test_x_30 = test_30.drop(columns=['problems']).values
  return train_x_30,test_x_30,test_y_30

train Base at 40%:

In [6]:
def train_test_40(negative_database,positive_database,train_array_size):
  train_40 = negative_database[0:train_array_sizes[1]]
  test_40 = pd.concat([negative_database[train_array_sizes[1]:],positive_database])
  train_y_40 = train_40['problems'].values
  train_x_40 = train_40.drop(columns=['problems']).values
  test_y_40 = test_40['problems'].values
  test_x_40 = test_40.drop(columns=['problems']).values
  return train_x_40,test_x_40,test_y_40

train Base at 50%:

In [7]:
def train_test_50(negative_database,positive_database,train_array_size):
  train_50 = negative_database[0:train_array_sizes[2]]
  test_50 = pd.concat([negative_database[train_array_sizes[2]:],positive_database])
  train_y_50 = train_50['problems'].values
  train_x_50 = train_50.drop(columns=['problems']).values
  test_y_50 = test_50['problems'].values
  test_x_50 = test_50.drop(columns=['problems']).values
  return train_x_50,test_x_50,test_y_50

<h2><b>One-class-classification Algorithms</b></h2>

In [17]:
def execute_benchmark(model,negative_database,positive_database,train_array_sizes):   ## Função para benchmark dos modelos
  f1_score_list_30 = []
  f1_score_list_40 = []
  f1_score_list_50 = []
  for exec in range(0,100):
    train_x_30,test_x_30,test_y_30 = train_test_30(negative_database,positive_database,train_array_sizes)
    train_x_40,test_x_40,test_y_40 = train_test_40(negative_database,positive_database,train_array_sizes)
    train_x_50,test_x_50,test_y_50 = train_test_40(negative_database,positive_database,train_array_sizes)
    train_test_list = [(train_x_30,test_x_30,test_y_30,'30'),(train_x_40,test_x_40,test_y_40,'40'),(train_x_50,test_x_50,test_y_50,'50')]
    for i in train_test_list:
      train_x, test_x, test_y, label = i
      model.fit(train_x)
      yhat = model.predict(test_x)
      test_y[test_y == 1] = -1
      test_y[test_y == 0] = 1
      score = f1_score(test_y,yhat,pos_label=-1)
      #print('F1 Score for {}: {}'.format(label,score))
      if label == '30':
        f1_score_list_30.append(score)
      elif label == '40':
        f1_score_list_40.append(score)
      else:
        f1_score_list_50.append(score)
  print("F1-Score 30%: {} (+/- {})".format(np.mean(f1_score_list_30),np.std(f1_score_list_30)))
  print("F1-Score 40%: {} (+/- {})".format(np.mean(f1_score_list_40),np.std(f1_score_list_40)))
  print("F1-Score 50%: {} (+/- {})".format(np.mean(f1_score_list_50),np.std(f1_score_list_50)))

<h3><b>Minimum Covariance Determinant:</b></h3>

In [18]:
from sklearn.covariance import EllipticEnvelope
model_mcd = EllipticEnvelope(contamination=0.20)
negative_database,positive_database = split_db_pos_neg(df)
train_array_sizes = [floor(negative_database.shape[0]*0.3),floor(negative_database.shape[0]*0.4),floor(negative_database.shape[0]*0.5)]
execute_benchmark(model_mcd,negative_database,positive_database,train_array_sizes)

F1-Score 30%: 0.5289128837056051 (+/- 0.004068250254933467)
F1-Score 40%: 0.5560929370398565 (+/- 0.005996061545261575)
F1-Score 50%: 0.5556942054584639 (+/- 0.006131339026842997)


<h3><b>Local Outlier Factor:</b></h3>

In [10]:
from sklearn.neighbors import LocalOutlierFactor
from numpy import vstack
model_lof = LocalOutlierFactor(contamination=0.20)
negative_database,positive_database = split_db_pos_neg(df)
train_array_sizes = [floor(negative_database.shape[0]*0.3),floor(negative_database.shape[0]*0.4),floor(negative_database.shape[0]*0.5)]
#execute_benchmark(model_lof)

In [11]:
f1_score_list_30 = []  # função de benchmark adaptada para Local Outlier Factor
f1_score_list_40 = []
f1_score_list_50 = []
for exec in range(0,100):
  train_x_30,test_x_30,test_y_30 = train_test_30(negative_database,positive_database,train_array_sizes)
  train_x_40,test_x_40,test_y_40 = train_test_40(negative_database,positive_database,train_array_sizes)
  train_x_50,test_x_50,test_y_50 = train_test_40(negative_database,positive_database,train_array_sizes)
  train_test_list = [(train_x_30,test_x_30,test_y_30,'30'),(train_x_40,test_x_40,test_y_40,'40'),(train_x_50,test_x_50,test_y_50,'50')]
  for i in train_test_list:
    train_x, test_x, test_y,label = i
    composite = vstack((train_x, test_x))
    yhat = model_lof.fit_predict(composite)
    # get just the predictions on the test set
    yhat = yhat[len(train_x):]

    test_y[test_y == 1] = -1
    test_y[test_y == 0] = 1
    score = f1_score(test_y,yhat,pos_label=-1)
    if label == '30':
      f1_score_list_30.append(score)
    elif label == '40':
      f1_score_list_40.append(score)
    else:
      f1_score_list_50.append(score)
print("F1-Score 30%: {} (+/- {})".format(np.mean(f1_score_list_30),np.std(f1_score_list_30)))
print("F1-Score 40%: {} (+/- {})".format(np.mean(f1_score_list_40),np.std(f1_score_list_40)))
print("F1-Score 50%: {} (+/- {})".format(np.mean(f1_score_list_50),np.std(f1_score_list_50)))
    

F1-Score 30%: 0.25116279069767433 (+/- 1.1102230246251565e-16)
F1-Score 40%: 0.2647058823529412 (+/- 0.0)
F1-Score 50%: 0.2647058823529412 (+/- 0.0)


<h3><b>Isolation Forest:</b></h3>

In [19]:
from sklearn.ensemble import IsolationForest
model_IF = IsolationForest(contamination=0.20)
negative_database,positive_database = split_db_pos_neg(df)
train_array_sizes = [floor(negative_database.shape[0]*0.3),floor(negative_database.shape[0]*0.4),floor(negative_database.shape[0]*0.5)]
execute_benchmark(model_IF,negative_database,positive_database,train_array_sizes)

F1-Score 30%: 0.5209731311808957 (+/- 0.006398416435551614)
F1-Score 40%: 0.5410833892369045 (+/- 0.005849625454789325)
F1-Score 50%: 0.540647177828821 (+/- 0.006507774868150787)


<h3><b>One-Class Support Vector Machines:</b></h3>

In [13]:
from sklearn.svm import OneClassSVM
model_OCSVM = OneClassSVM(gamma='scale', nu=0.20)
negative_database,positive_database = split_db_pos_neg(df)
train_array_sizes = [floor(negative_database.shape[0]*0.3),floor(negative_database.shape[0]*0.4),floor(negative_database.shape[0]*0.5)]
execute_benchmark(model_OCSVM,negative_database,positive_database,train_array_sizes)

F1-Score 30%: 0.35260930888575454 (+/- 5.551115123125783e-17)
F1-Score 40%: 0.39941262848751835 (+/- 0.0)
F1-Score 50%: 0.39941262848751835 (+/- 0.0)
