-------------------------------------------------------------------------------------------------------------------------------

### Setup

In [1]:
# Necessary libraries
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_selection import SelectKBest
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Import dataset
from sklearn.datasets import fetch_kddcup99
kdd99_data = fetch_kddcup99()
X_og = kdd99_data['data']
# Encode the labels for outlier detection: +1 for b'normal.' records and -1
# for everything else. np.where builds a fresh int64 array, so the raw labels
# in kdd99_data['target'] are left untouched and can still be inspected later.
y_og = np.where(kdd99_data['target'] == b'normal.', 1, -1).astype(np.int64)

In [3]:
# Remove the categorical columns (positions 1, 2 and 3 along the feature axis)
# and report the resulting feature-matrix and label shapes.
X_num_sample = np.delete(X_og, obj=[1, 2, 3], axis=1)
print(X_num_sample.shape, y_og.shape, sep='\n')

(494021, 38)
(494021,)


In [4]:
# Take a random sample of 59,000 rows from the loaded data.
# A seeded Generator makes the sample identical on every Restart & Run All,
# and replace=False guarantees no row is drawn more than once (the default
# for choice() is sampling WITH replacement, which silently duplicates rows).
rng = np.random.default_rng(42)
sample_indices = rng.choice(len(y_og), size=59000, replace=False)
X_sample = X_num_sample[sample_indices,:]
y = y_og[sample_indices]
print(X_sample.shape)
print(y.shape)

(59000, 38)
(59000,)


#### Covariance Trial

In [5]:
# Wrap the sampled feature matrix in a DataFrame whose columns are named
# 'Column1' ... 'ColumnN' (1-based, in the original column order).
# The names are generated from the matrix width instead of being typed out,
# and np.asarray() lets this cell be re-run after X_sample has already been
# converted to a DataFrame (it simply re-wraps the same values).
X_sample = pd.DataFrame(
    np.asarray(X_sample),
    columns=[f'Column{idx}' for idx in range(1, np.shape(X_sample)[1] + 1)],
)
print(X_sample)

      Column1 Column2 Column3 Column4 Column5 Column6 Column7 Column8 Column9  \
0           0       0       0       0       0       0       0       0       0   
1           0     520       0       0       0       0       0       0       0   
2           0    1032       0       0       0       0       0       0       0   
3           0    1032       0       0       0       0       0       0       0   
4           0     294    1307       0       0       0       0       0       1   
...       ...     ...     ...     ...     ...     ...     ...     ...     ...   
58995       0     252    1651       0       0       0       0       0       1   
58996       0    1032       0       0       0       0       0       0       0   
58997       0       0       0       0       0       0       0       0       0   
58998       0    1032       0       0       0       0       0       0       0   
58999       0     520       0       0       0       0       0       0       0   

      Column10  ... Column2

In [6]:
# Preview the first rows of the sampled feature frame
X_sample.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,...,Column29,Column30,Column31,Column32,Column33,Column34,Column35,Column36,Column37,Column38
0,0,0,0,0,0,0,0,0,0,0,...,255,6,0.02,0.09,0.0,0.0,0,0,1,1
1,0,520,0,0,0,0,0,0,0,0,...,255,255,1.0,0.0,1.0,0.0,0,0,0,0
2,0,1032,0,0,0,0,0,0,0,0,...,255,255,1.0,0.0,1.0,0.0,0,0,0,0
3,0,1032,0,0,0,0,0,0,0,0,...,255,255,1.0,0.0,1.0,0.0,0,0,0,0
4,0,294,1307,0,0,0,0,0,1,0,...,81,255,1.0,0.0,0.01,0.05,0,0,0,0


In [7]:
# Per-column summary. NOTE(review): the captured output reports
# count/unique/top/freq instead of mean/std, which suggests the columns are
# stored with object dtype rather than a numeric one - TODO confirm.
X_sample.describe()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,...,Column29,Column30,Column31,Column32,Column33,Column34,Column35,Column36,Column37,Column38
count,59000,59000,59000,59000,59000,59000,59000,59000,59000,59000,...,59000,59000,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0
unique,445,1294,3419,2,3,1,16,2,2,6,...,255,255,101.0,96.0,101.0,52.0,57.0,22.0,96.0,86.0
top,0,1032,0,0,0,0,0,0,0,0,...,255,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
freq,57565,27223,48895,58997,58838,59000,58619,58995,50390,58737,...,51709,40365,41599.0,41489.0,34604.0,52787.0,47782.0,47926.0,54800.0,54923.0


In [None]:
# Pairwise scatter-plot grid over every feature column.
# `cols` is taken from the frame itself so it always matches the data.
# seaborn renamed pairplot's `size` argument to `height`; the old name is
# deprecated, so the facet size is passed as `height`.
cols = list(X_sample.columns)
sns.pairplot(X_sample[cols], height=2.0)

In [None]:
# Covariance heatmap of the first 28 feature columns.
# The columns are standardized first, so the covariance values are on the
# same scale as correlation coefficients.
cols = [f'Column{k}' for k in range(1, 39)]
# Only this subset is scaled and plotted; the SAME subset must be used for the
# tick labels, otherwise 38 labels get attached to a 28x28 matrix.
heatmap_cols = cols[:28]
from sklearn.preprocessing import StandardScaler 
stdsc = StandardScaler() 
X_std = stdsc.fit_transform(X_sample[heatmap_cols].values)
cov_mat = np.cov(X_std.T)
plt.figure(figsize=(10,10))
sns.set(font_scale=1.5)
hm = sns.heatmap(cov_mat,
                 cbar=True,
                 annot=True,
                 square=True,
                 fmt='.2f',
                 annot_kws={'size': 12},
                 cmap='coolwarm',                 
                 yticklabels=heatmap_cols,
                 xticklabels=heatmap_cols)
plt.title('Covariance matrix showing correlation coefficients', size = 18)
plt.tight_layout()
plt.show()

-------------------------------------------------------------------------------------------------------------------------------

### LOF - No Feature Selection

In [None]:
# Accumulators filled by lof_calculations(), one entry per evaluated setting
recall_scores_lof, precision_scores_lof, f1_scores_lof = [], [], []


def lof_calculations(n):
    """Fit a Local Outlier Factor model on X_sample and record its scores.

    Parameters
    ----------
    n : int
        Value passed to LocalOutlierFactor as ``n_neighbors``.

    The predictions are scored against the module-level labels ``y`` with the
    outlier class (-1) as the positive label. Recall, precision and F1 are
    appended to ``recall_scores_lof``, ``precision_scores_lof`` and
    ``f1_scores_lof`` respectively, then printed. Nothing is returned.
    """
    detector = LocalOutlierFactor(n_neighbors=n)
    predicted = np.array(detector.fit_predict(X_sample))

    # Evaluate the three metrics in the order recall, precision, F1
    recall, precision, f1 = (
        metric(y, predicted, pos_label=-1)
        for metric in (recall_score, precision_score, f1_score)
    )

    recall_scores_lof.append(recall)
    precision_scores_lof.append(precision)
    f1_scores_lof.append(f1)

    print('Nearest Neighbors:', n)
    print('Recall =', recall, '\nPrecision =', precision, '\nf1 =', f1)

In [None]:
def lof_plots():
    """Draw one figure per metric (recall, precision, F1) for the LOF sweep.

    Reads the module-level ``ilist_lof`` (the n_neighbors values tried) as the
    x axis and ``recall_scores_lof`` / ``precision_scores_lof`` /
    ``f1_scores_lof`` as the y values. Each figure is shown immediately.
    Returns nothing.
    """
    # The x axis is the LOF neighbor count, and the titles name LOF: the
    # previous labels were copied from the Isolation Forest plots.
    for metric_name, scores in (
        ('Recall', recall_scores_lof),
        ('Precision', precision_scores_lof),
        ('F1', f1_scores_lof),
    ):
        plt.plot(ilist_lof, scores)
        plt.xlabel('Number of Nearest Neighbors')
        plt.ylabel(metric_name + ' Score')
        plt.title('LOF ' + metric_name + ' Plot')
        plt.show()

In [None]:
# Sweep n_neighbors over the powers of two below 100 (1, 2, 4, ..., 64).
# lof_calculations() only ever appends to the score lists, so they are emptied
# here first; otherwise re-running this cell leaves them longer than ilist_lof
# and the plots fail with mismatched x/y lengths.
recall_scores_lof.clear()
precision_scores_lof.clear()
f1_scores_lof.clear()
ilist_lof = []
i = 1
while i<100:
    lof_calculations(i)
    ilist_lof.append(i)
    i=i*2

In [None]:
    # Recall, precision and F1 of the LOF sweep on a single set of axes.
    plt.plot(ilist_lof, recall_scores_lof, label='Recall')
    plt.plot(ilist_lof, precision_scores_lof, label='Precision')
    plt.plot(ilist_lof, f1_scores_lof, label='F1')

    # The x values are the n_neighbors settings tried, not max_samples.
    plt.xlabel('Number of Nearest Neighbors')
    plt.ylabel('Metric Values')
    plt.title('LOF Plot - No Feature Selection')

    plt.legend()

    plt.show()

-------------------------------------------------------------------------------------------------------------------------------

### Isolation Forest - No Feature Selection

In [None]:
# Accumulators filled by isof_calculations(), one entry per evaluated setting
recall_scores_isof, precision_scores_isof, f1_scores_isof = [], [], []


def isof_calculations(s):
    """Fit an Isolation Forest on X_sample and record its scores.

    Parameters
    ----------
    s : int
        Value passed to IsolationForest as ``max_samples``.

    The forest is built with ``random_state=42``, fitted on the module-level
    ``X_sample`` and used to predict on that same data. Predictions are scored
    against ``y`` with the outlier class (-1) as the positive label; recall,
    precision and F1 are appended to the three ``*_scores_isof`` lists and
    printed. Nothing is returned.
    """
    forest = IsolationForest(max_samples=s, random_state=42)
    labels = np.array(forest.fit(X_sample).predict(X_sample))

    # Metrics are evaluated in the order recall, precision, F1
    scores = [
        metric(y, labels, pos_label=-1)
        for metric in (recall_score, precision_score, f1_score)
    ]
    histories = (recall_scores_isof, precision_scores_isof, f1_scores_isof)
    for history, value in zip(histories, scores):
        history.append(value)

    print('Max Samples:', s)
    print('Recall =', scores[0], '\nPrecision =', scores[1], '\nf1 =', scores[2])


In [None]:
def isof_plots():
    """Draw one figure per metric for the Isolation Forest sweep.

    Uses the module-level ``ilist_isof`` as the x axis and, in turn,
    ``recall_scores_isof``, ``precision_scores_isof`` and ``f1_scores_isof``
    as the y values. Each figure is shown as soon as it is drawn.
    Returns nothing.
    """
    # (y values, y-axis label, figure title) for each of the three figures
    panels = (
        (recall_scores_isof, 'Recall Score', 'Isolation Forest Recall Plot'),
        (precision_scores_isof, 'Precision Score', 'Isolation Forest Precision Plot'),
        (f1_scores_isof, 'F1 Score', 'Isolation Forest F1 Plot'),
    )
    for values, y_label, figure_title in panels:
        plt.plot(ilist_isof, values)
        plt.xlabel('Number of Max Samples')
        plt.ylabel(y_label)
        plt.title(figure_title)
        plt.show()

In [None]:
# Sweep max_samples over the powers of two below 1000 (1, 2, 4, ..., 512).
# isof_calculations() only ever appends to the score lists, so they are
# emptied here first; otherwise re-running this cell leaves them longer than
# ilist_isof and the plots fail with mismatched x/y lengths.
recall_scores_isof.clear()
precision_scores_isof.clear()
f1_scores_isof.clear()
ilist_isof = []
i = 1
while i<1000:
    isof_calculations(i)
    ilist_isof.append(i)
    i=i*2

In [None]:
# Show the separate recall, precision and F1 figures for the Isolation Forest sweep
isof_plots()

In [None]:
    # Recall, precision and F1 of the Isolation Forest sweep on one set of axes.
    # The x values must be ilist_isof (the max_samples settings tried): the LOF
    # sweep list has a different length and would not line up with these scores.
    plt.plot(ilist_isof, recall_scores_isof, label='Recall')
    plt.plot(ilist_isof, precision_scores_isof, label='Precision')
    plt.plot(ilist_isof, f1_scores_isof, label='F1')

    plt.xlabel('Number of Max Samples')
    plt.ylabel('Metric Values')
    plt.title('Isolation Forest Plot - No Feature Selection')

    plt.legend()

    plt.show()