In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score 
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn import tree

In [2]:
data = pd.read_csv("iris.csv")
data = data.to_numpy()

In [3]:
data

array([[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
       [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
       [4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
       [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'],
       [5.0, 3.6, 1.4, 0.2, 'Iris-setosa'],
       [5.4, 3.9, 1.7, 0.4, 'Iris-setosa'],
       [4.6, 3.4, 1.4, 0.3, 'Iris-setosa'],
       [5.0, 3.4, 1.5, 0.2, 'Iris-setosa'],
       [4.4, 2.9, 1.4, 0.2, 'Iris-setosa'],
       [4.9, 3.1, 1.5, 0.1, 'Iris-setosa'],
       [5.4, 3.7, 1.5, 0.2, 'Iris-setosa'],
       [4.8, 3.4, 1.6, 0.2, 'Iris-setosa'],
       [4.8, 3.0, 1.4, 0.1, 'Iris-setosa'],
       [4.3, 3.0, 1.1, 0.1, 'Iris-setosa'],
       [5.8, 4.0, 1.2, 0.2, 'Iris-setosa'],
       [5.7, 4.4, 1.5, 0.4, 'Iris-setosa'],
       [5.4, 3.9, 1.3, 0.4, 'Iris-setosa'],
       [5.1, 3.5, 1.4, 0.3, 'Iris-setosa'],
       [5.7, 3.8, 1.7, 0.3, 'Iris-setosa'],
       [5.1, 3.8, 1.5, 0.3, 'Iris-setosa'],
       [5.4, 3.4, 1.7, 0.2, 'Iris-setosa'],
       [5.1, 3.7, 1.5, 0.4, 'Iris-setosa'],
       [4.6, 3.6, 1.0, 0.2, 'Iri

In [4]:
def OneClassValidation(data, c):
    df = np.copy(data)
    #Three different classes.
    y= ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']    
    for f in range(3):
        if f == c:
            df[:,4][df[:, 4] == y[c]] = 0
        else:
            df[:,4][df[:, 4] == y[f]] = 1   
    return df[:, :4], df[:,-1]

In [5]:
# make a prediction with a KDE model
def predictionUsingKde(model, trainX, testX):
    c= np.vstack((trainX, testX))    
    yy = model.fit_predict(c)    
    return yy[len(trainX):]

def KDEScore(X_train, X_test, y_train, y_test):
    model = LocalOutlierFactor(contamination=0.5)    
    X_train = X_train[y_train==0]    
    yy = predictionUsingKde(model, X_train, X_test)    
    y_test[y_test == 1] = -1
    y_test[y_test == 0] = 1    
    score=f1_score(y_test.astype(np.int64), yy, pos_label=-1)    
    return score

In [6]:
def OneClassFeatures(X_train, X_test, y_train, y_test):
    dt = tree.DecisionTreeClassifier(max_depth=8, min_samples_leaf=5)
    X_train_float = np.array(X_train).astype("float")
    y_train_int = np.array(y_train).astype("int")
    dt.fit(X_train_float, y_train_int)
    res = dt.predict(X_test)
    res = res.astype(int)
    res = np.reshape(res, len(res))
    y_test = np.reshape(y_test, len(y_test))
    misclassified = np.bitwise_xor(res, y_test)
    misclassified = misclassified.astype(float)
    aux = sum(misclassified)/len(X_test)
    imp = dt.feature_importances_        
    indices = np.nonzero(imp)            
    X_train_float = X_train_float[:,indices]    
    X_train_float = np.copy(X_train_float[:,0])        
    X_test = X_test[:,indices]
    X_test = np.copy(X_test[:,0])    
    return aux, X_test, res

In [7]:
def get_scores(y_test, res):
    y = y_test.astype("int")
    auc = roc_auc_score(y, res)
    acc = accuracy_score(y, res) 
    print ("Acc: %.2f, ROC/AUC: %.2f" % (acc, auc))
    prec = precision_score(y, res, average='macro')
    recall = recall_score(y, res, average='macro')
    f1 = f1_score(y, res, average='macro')
    print ("Average   - Prec: %.2f, Recall: %.2f, F1: %.2f" % (prec, recall, f1))
    prec = precision_score(y, res, average='binary', pos_label=0)
    recall = recall_score(y, res, average='binary', pos_label=0)
    f1 = f1_score(y, res, average='binary', pos_label=0)
    print ("Pos_lab:0 - Prec: %.2f, Recall: %.2f, F1: %.2f" % (prec, recall, f1))
    prec = precision_score(y, res, average='binary', pos_label=1)
    recall = recall_score(y, res, average='binary', pos_label=1)
    f1 = f1_score(y, res, average='binary', pos_label=1)
    print ("Pos_lab:1 - Prec: %.2f, Recall: %.2f, F1: %.2f" % (prec, recall, f1))

In [8]:
#Using KDE

In [9]:
(X_0, y_0),(X_1, y_1),(X_2, y_2) = OneClassValidation(data, 0),OneClassValidation(data, 1),OneClassValidation(data, 2)

X_0_train, X_0_test, y_0_train, y_0_test = train_test_split(X_0, y_0, test_size=0.30, random_state=42)
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1, y_1, test_size=0.30, random_state=42)
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(X_2, y_2, test_size=0.30, random_state=42)

In [10]:
f1_0,f1_1,f1_2 = KDEScore(X_0_train, X_0_test, y_0_train, y_0_test),KDEScore(X_1_train, X_1_test, y_1_train, y_1_test),KDEScore(X_2_train, X_2_test, y_2_train, y_2_test)

(X_0, y_0),(X_1, y_1),(X_2, y_2)= OneClassValidation(data, 0),OneClassValidation(data, 1),OneClassValidation(data, 2)

X_0_train, X_0_test, y_0_train, y_0_test = train_test_split(X_0, y_0, test_size=0.30)
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1, y_1, test_size=0.30)
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(X_2, y_2, test_size=0.30)

In [11]:
res_KDE = (f1_0+f1_1+f1_2)/3
#Average F1 score.
print(res_KDE)

0.7629261701116655


In [12]:
#Using the Tree

In [13]:
aux_0, X_0_test_new, res_0 = OneClassFeatures(X_0_train, X_0_test, y_0_train, y_0_test)
aux_1, X_1_test_new, res_1 = OneClassFeatures(X_1_train, X_1_test, y_1_train, y_1_test)
aux_2, X_2_test_new, res_2 = OneClassFeatures(X_2_train, X_2_test, y_2_train, y_2_test)

print ("Misclassified samples:", aux_0)
print ("Misclassified samples:", aux_1)
print ("Misclassified samples:", aux_2)

Misclassified samples: 0.0
Misclassified samples: 0.13333333333333333
Misclassified samples: 0.06666666666666667


In [14]:
#For each class vs the rest all classes

print("For Iris-setosa vs Rest Classes")
get_scores(y_0_test, res_0)

print("\nFor Iris-versicolor vs Rest Classes")
get_scores(y_1_test, res_1)

print("\nFor Iris-virginica vs Rest Classes")
get_scores(y_2_test, res_2)


For Iris-setosa vs Rest Classes
Acc: 1.00, ROC/AUC: 1.00
Average   - Prec: 1.00, Recall: 1.00, F1: 1.00
Pos_lab:0 - Prec: 1.00, Recall: 1.00, F1: 1.00
Pos_lab:1 - Prec: 1.00, Recall: 1.00, F1: 1.00

For Iris-versicolor vs Rest Classes
Acc: 0.87, ROC/AUC: 0.84
Average   - Prec: 0.84, Recall: 0.84, F1: 0.84
Pos_lab:0 - Prec: 0.79, Recall: 0.79, F1: 0.79
Pos_lab:1 - Prec: 0.90, Recall: 0.90, F1: 0.90

For Iris-virginica vs Rest Classes
Acc: 0.93, ROC/AUC: 0.94
Average   - Prec: 0.93, Recall: 0.94, F1: 0.93
Pos_lab:0 - Prec: 0.86, Recall: 1.00, F1: 0.92
Pos_lab:1 - Prec: 1.00, Recall: 0.89, F1: 0.94


In [15]:
#Avg F1 score for oneclassclassification using tree=> Sum of all(F1 scores/3)