We previously made a program that used LOF to detect anomalies in the KDDcup99 dataset. Here, we will expand on this to determine the optimal value for n nearest neighbors. To do this we will calculate the precision and recall score. 

In [1]:
from sklearn.datasets import fetch_kddcup99
from sklearn.neighbors import LocalOutlierFactor
import pandas as pd

In [2]:
kddcup99_data = fetch_kddcup99(subset='http')
X = kddcup99_data['data']
y = kddcup99_data['target']
y[y == b'normal.'] = 1
y[y!= 1] = -1

In [3]:
def param_sweep(n):
    print ('\n\nNumber of Nearest Neighbors:', n, '\n')
    lof = LocalOutlierFactor(n_neighbors = n)
    anomaly_predictions = lof.fit_predict(X)
    print('Anomaly Predictions:\n', anomaly_predictions[0:10])
    print('\nNegative Outlier Factor:\n', lof.negative_outlier_factor_[0:10])
    print('\nThreshold: ', lof.offset_)
    
    import pandas as pd
    df_confusion = pd.crosstab(y, anomaly_predictions)
    TP = df_confusion.iloc[0,0]  # True Positives
    FP = df_confusion.iloc[0,1]  # False Positives
    FN = df_confusion.iloc[1,0]  # False Negatives
    TN = df_confusion.iloc[1,1]  # True Negatives

    precision_score = TP/(TP+FP)
    recall_score = TP/(TP+FN)
    f1_score = 2*((precision_score*recall_score)/(precision_score+recall_score))

    true_positive_rate = TP/(TP + FN)
    false_positive_rate = FP/(FP+TN)
    
    print('\nPrecision Score = ', precision_score)
    print('\nRecall Score = ', recall_score)
    print('\nF1 Score = ', f1_score)
    print('\nTrue Positive Rate = ', true_positive_rate)
    print('\nFalse Positive Rate = ', false_positive_rate)

In [7]:
i=1
while i<5:
    print('\n\n-----------------------------------------------------------------')
    param_sweep(i)
    i=i+1
    
i = 5
while i <= 160:
    print('\n\n-----------------------------------------------------------------')
    param_sweep(i)
    i=i*2
   



-----------------------------------------------------------------


Number of Nearest Neighbors: 1 

Anomaly Predictions:
 [ 1 -1  1  1  1  1  1 -1 -1  1]

Negative Outlier Factor:
 [-1.00000000e+00 -4.10593894e+07 -1.00000000e+00 -1.00000000e+00
 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00 -4.55765382e+00
 -3.81710851e+00 -1.00000000e+00]

Threshold:  -1.5

Precision Score =  0.010864644635581712

Recall Score =  0.002041510717931269

F1 Score =  0.0034371643394199777

True Positive Rate =  0.002041510717931269

False Positive Rate =  0.04652004513615363


-----------------------------------------------------------------


Number of Nearest Neighbors: 2 

Anomaly Predictions:
 [ 1 -1  1  1  1  1  1 -1  1 -1]

Negative Outlier Factor:
 [-1.05286312e+00 -4.10593894e+07 -1.10246281e+00 -1.00000000e+00
 -1.00000000e+00 -1.00000000e+00 -8.90097156e-01 -1.90740524e+00
 -9.52685625e-01 -2.46936072e+00]

Threshold:  -1.5

Precision Score =  0.015844273426889995

Recall Score =  0.003724