In [None]:
import sklearn
import numpy as np
import pandas as pd
from numpy import where
import io
import matplotlib.pyplot as plt
import math

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

## Load Dataset

In [None]:
# Column labels for the KDDTrain+/KDDTest+ CSV files, in file order.  Kept as a
# NumPy array so later cells can pick groups of names by integer position.
names = np.array([
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "labels", "others",
])

# header=None: the first line of each file is read as data, and the labels
# above are applied as the column names.
TrainDataset, TestDataset = (
    pd.read_csv(csv_path, names=names, header=None)
    for csv_path in ('KDDTrain+.csv', 'KDDTest+.csv')
)

In [None]:
# Keep every column except the trailing 'others' one.  .copy() gives each frame
# its own data, so the column assignments made in later cells neither write
# into a slice of the raw frames nor trigger SettingWithCopyWarning.
TrainData = TrainDataset.iloc[:, :-1].copy()
TestData = TestDataset.iloc[:, :-1].copy()

# Data Analysis

In [None]:
# Raw attack name -> coarse class code.  Later cells use 0 for normal traffic,
# 1 for DoS, 2 for Probe, 3 for R2L and 4 for U2R.  The table is defined once
# and applied to both splits so the two encodings cannot drift apart.
ATTACK_CLASS_CODES = {
    'normal': 0,

    # code 1 (DoS)
    'back': 1,
    'land': 1,
    'neptune': 1,
    'pod': 1,
    'smurf': 1,
    'teardrop': 1,
    'mailbomb': 1,
    'apache2': 1,
    'processtable': 1,
    'udpstorm': 1,

    # code 2 (Probe)
    'ipsweep': 2,
    'nmap': 2,
    'portsweep': 2,
    'satan': 2,
    'mscan': 2,
    'saint': 2,

    # code 3 (R2L)
    'ftp_write': 3,
    'guess_passwd': 3,
    'imap': 3,
    'multihop': 3,
    'phf': 3,
    'spy': 3,
    'warezclient': 3,
    'warezmaster': 3,
    'sendmail': 3,
    'named': 3,
    'snmpgetattack': 3,
    'snmpguess': 3,
    'xlock': 3,
    'xsnoop': 3,
    'worm': 3,

    # code 4 (U2R)
    'buffer_overflow': 4,
    'loadmodule': 4,
    'perl': 4,
    'rootkit': 4,
    'httptunnel': 4,
    'ps': 4,
    'sqlattack': 4,
    'xterm': 4,
}

# Values that are not keys of the table are left unchanged by replace().
at_labels = TrainData['labels'].replace(ATTACK_CLASS_CODES)
at_labels1 = TestData['labels'].replace(ATTACK_CLASS_CODES)
TrainData['labels'] = at_labels
TestData['labels'] = at_labels1

In [None]:
TrainData.shape

In [None]:
TestData.shape

In [None]:
# Positions (into `names`) of the categorical and binary feature columns.
categorical = [1, 2, 3]
binary = [6, 11, 13, 14, 20, 21]
# Every other position among the first 41 columns is numeric (positions 41 and
# 42 are 'labels' and 'others').  Built in explicit ascending order instead of
# relying on the iteration order of a set.
numeric = [i for i in range(41) if i not in categorical and i not in binary]

categorical_cols = names[categorical].tolist()
binary_cols = names[binary].tolist()
numeric_cols = names[numeric].tolist()

In [None]:
categorical_cols

In [None]:
binary_cols

In [None]:
numeric_cols

In [None]:
TrainData[binary_cols].describe().transpose()

In [None]:
# 'su_attempted' is one of the columns listed in binary_cols; recode the value
# 2.0 to 0.0 in the training split.
# NOTE(review): presumably 2 is the only out-of-range value for this flag —
# confirm with the describe() output.
TrainData.loc[TrainData['su_attempted'] == 2.0, 'su_attempted'] = 0.0

In [None]:
# Same recoding for the test split: 'su_attempted' values of 2.0 become 0.0.
TestData.loc[TestData['su_attempted'] == 2.0, 'su_attempted'] = 0.0

In [None]:
TrainData[binary_cols].describe().transpose()

In [None]:
TrainData[numeric_cols].describe().transpose()

In [None]:
# Remove 'num_outbound_cmds' from both splits and from the numeric column list.
# NOTE(review): presumably dropped because it carries no information in this
# data — confirm against the describe() output.
# errors='ignore' and the membership test make the cell safe to re-run.
TrainData = TrainData.drop(columns='num_outbound_cmds', errors='ignore')
TestData = TestData.drop(columns='num_outbound_cmds', errors='ignore')
if 'num_outbound_cmds' in numeric_cols:
    numeric_cols.remove('num_outbound_cmds')

In [None]:
# Number of distinct values in each categorical column of the training split.
for categorical_name in ('protocol_type', 'service', 'flag'):
    print(categorical_name, TrainData[categorical_name].nunique())

In [None]:
# Number of distinct values in each categorical column of the test split.
for categorical_name in ('protocol_type', 'service', 'flag'):
    print(categorical_name, TestData[categorical_name].nunique())

## Encoding Categorical Features

In [None]:
onehotencoder = OneHotEncoder()

In [None]:
# fit_transform expects a 2-D input, so each 1-D column is reshaped to
# (n_rows, 1).  The same encoder object is re-fitted for every column and the
# sparse result is converted to a dense array.
X_protocol = onehotencoder.fit_transform(
    TrainData['protocol_type'].to_numpy().reshape(-1, 1)
).toarray()
X_service = onehotencoder.fit_transform(
    TrainData['service'].to_numpy().reshape(-1, 1)
).toarray()
X_flag = onehotencoder.fit_transform(
    TrainData['flag'].to_numpy().reshape(-1, 1)
).toarray()

In [None]:
# Append each group of one-hot columns (named "<feature>_<category index>") to
# TrainData and remove the raw categorical column that the group replaces.
protocol_df = pd.DataFrame(
    X_protocol,
    columns=["protocol_type_{}".format(idx) for idx in range(X_protocol.shape[1])],
)
TrainData = pd.concat([TrainData, protocol_df], axis=1).drop(columns=['protocol_type'])

service_df = pd.DataFrame(
    X_service,
    columns=["service_{}".format(idx) for idx in range(X_service.shape[1])],
)
TrainData = pd.concat([TrainData, service_df], axis=1).drop(columns=['service'])

flag_df = pd.DataFrame(
    X_flag,
    columns=["flag_{}".format(idx) for idx in range(X_flag.shape[1])],
)
TrainData = pd.concat([TrainData, flag_df], axis=1).drop(columns=['flag'])

In [None]:
# Encode the test split with the category order learned from the TRAINING
# data.  Fitting a fresh encoder on the test split numbers its categories
# independently, so a column such as "service_5" could stand for a different
# service in each split.  The raw training values are read from TrainDataset
# because the categorical columns were already dropped from TrainData above.
def _encode_test_like_train(column):
    """Return the dense one-hot matrix of TestData[column] using train categories.

    A category that occurs only in the test split is encoded as an all-zero
    row (handle_unknown='ignore').
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder.fit(TrainDataset[column].to_numpy().reshape(-1, 1))
    return encoder.transform(TestData[column].to_numpy().reshape(-1, 1)).toarray()


# All three columns are encoded before any of them is dropped from TestData.
X_protocol_t = _encode_test_like_train('protocol_type')
X_service_t = _encode_test_like_train('service')
X_flag_t = _encode_test_like_train('flag')

# index=TestData.index keeps the new columns row-aligned in pd.concat.
protocol_df_t = pd.DataFrame(X_protocol_t, index=TestData.index,
                             columns=["protocol_type_"+str(int(i)) for i in range(X_protocol_t.shape[1])])
TestData = pd.concat([TestData, protocol_df_t], axis=1)
TestData = TestData.drop(['protocol_type'], axis=1)

service_df_t = pd.DataFrame(X_service_t, index=TestData.index,
                            columns=["service_"+str(int(i)) for i in range(X_service_t.shape[1])])
TestData = pd.concat([TestData, service_df_t], axis=1)
TestData = TestData.drop(['service'], axis=1)

flag_df_t = pd.DataFrame(X_flag_t, index=TestData.index,
                         columns=["flag_"+str(int(i)) for i in range(X_flag_t.shape[1])])
TestData = pd.concat([TestData, flag_df_t], axis=1)
TestData = TestData.drop(['flag'], axis=1)

In [None]:
# Any column that TrainData has and TestData lacks is added to TestData as an
# all-zero column.
missing_cols = set(TrainData.columns).difference(TestData.columns)
for absent_column in missing_cols:
    TestData[absent_column] = 0

In [None]:
TestData = TestData[list(TrainData.columns)]
TestData.head()

In [None]:
yTrainData = TrainData['labels']
yTrainData.value_counts()

In [None]:
yTestData = TestData['labels']
yTestData

In [None]:
TrainData = TrainData.drop(['labels'], axis=1)
TestData = TestData.drop(['labels'], axis=1)

# Standard Scaling

In [None]:
scaler = StandardScaler()
scaler.fit(TrainData)

In [None]:
sc_data = scaler.transform(TrainData)
ScaledTrainData =  pd.DataFrame(sc_data, columns=list(TrainData.columns))

In [None]:
print(sc_data.std(axis=0))

In [None]:
sc_data_test = scaler.transform(TestData)
ScaledTestData =  pd.DataFrame(sc_data_test, columns=list(TestData.columns))
ScaledTestData.tail(10)

In [None]:
ScaledTrainData['labels'] = yTrainData

In [None]:
ScaledTestData['labels'] = yTestData

## Train

In [None]:
# One frame per attack family: rows are kept unless their label is one of the
# other three attack codes, which leaves label 0 plus that family's code.
DoS_df_train = ScaledTrainData.loc[~ScaledTrainData['labels'].isin([2, 3, 4])]
Probe_df_train = ScaledTrainData.loc[~ScaledTrainData['labels'].isin([1, 3, 4])]
R2L_df_train = ScaledTrainData.loc[~ScaledTrainData['labels'].isin([1, 2, 4])]
U2R_df_train = ScaledTrainData.loc[~ScaledTrainData['labels'].isin([1, 2, 3])]

# Test

In [None]:
# Same per-family split for the test data: rows are kept unless their label is
# one of the other three attack codes.
DoS_df_test = ScaledTestData.loc[~ScaledTestData['labels'].isin([2, 3, 4])]
Probe_df_test = ScaledTestData.loc[~ScaledTestData['labels'].isin([1, 3, 4])]
R2L_df_test = ScaledTestData.loc[~ScaledTestData['labels'].isin([1, 2, 4])]
U2R_df_test = ScaledTestData.loc[~ScaledTestData['labels'].isin([1, 2, 3])]

## Class Balancing

In [None]:
def plot_classes_portions(A, attackName, title):
    """Show a two-bar chart of normal vs. attack sample counts.

    A          -- iterable of labels; entries equal to 0 count as normal,
                  every other entry counts as an attack.
    attackName -- tick label used for the attack bar.
    title      -- title of the figure.
    """
    # tally[0] counts normal entries, tally[1] counts attack entries.
    tally = [0, 0]
    for label in A:
        tally[0 if label == 0 else 1] += 1

    counts = {'normal': tally[0], attackName: tally[1]}
    plt.bar(counts.keys(), counts.values(), color ='maroon', width = 0.4)
    plt.xlabel("Attack types")
    plt.ylabel("No. of objects")
    plt.title(title)
    plt.show()

In [None]:
plot_classes_portions(DoS_df_train['labels'].to_numpy(), 'DoS', 'Normal vs. DoS')

In [None]:
plot_classes_portions(Probe_df_train['labels'].to_numpy(), 'Probe', 'Normal vs. Probe')

In [None]:
plot_classes_portions(R2L_df_train['labels'].to_numpy(), 'R2L', 'Normal vs. R2L')

In [None]:
plot_classes_portions(U2R_df_train['labels'].to_numpy(), 'U2R', 'Normal vs. U2R')

In [None]:
def underSamplingTech(x, y, random_state=None):
    """Resample (x, y) with imblearn's RandomUnderSampler.

    x            -- feature table.
    y            -- labels aligned with the rows of x.
    random_state -- optional seed forwarded to the sampler so a run can be
                    reproduced; None keeps the previous unseeded behaviour.

    Returns the (x_resampled, y_resampled) pair produced by fit_resample.
    """
    sampler = RandomUnderSampler(random_state=random_state)
    x_t, y_t = sampler.fit_resample(x, y)
    return x_t, y_t

def overSamplingTech(x, y, random_state=None):
    """Resample (x, y) with imblearn's RandomOverSampler.

    x            -- feature table.
    y            -- labels aligned with the rows of x.
    random_state -- optional seed forwarded to the sampler so a run can be
                    reproduced; None keeps the previous unseeded behaviour.

    Returns the (x_resampled, y_resampled) pair produced by fit_resample.
    """
    sampler = RandomOverSampler(random_state=random_state)
    x_t, y_t = sampler.fit_resample(x, y)
    return x_t, y_t

def smote(x, y, random_state=None):
    """Resample (x, y) with imblearn's SMOTE.

    x            -- feature table.
    y            -- labels aligned with the rows of x.
    random_state -- optional seed forwarded to SMOTE so a run can be
                    reproduced; None keeps the previous unseeded behaviour.

    Returns the (x_resampled, y_resampled) pair produced by fit_resample.
    """
    # The local is deliberately not called `smote`, to avoid shadowing this
    # function's own name inside its body.
    sampler = SMOTE(random_state=random_state)
    x_t, y_t = sampler.fit_resample(x, y)
    return x_t, y_t

def extraction(df):
    """Return `df` without its 'labels' column; other columns keep their order."""
    keep_mask = ~df.columns.isin(['labels'])
    return df.loc[:, keep_mask]

x_train_DoS, y_train_DoS = overSamplingTech(extraction(DoS_df_train), DoS_df_train['labels'])
x_train_Probe, y_train_Probe = overSamplingTech(extraction(Probe_df_train), Probe_df_train['labels'])
x_train_R2L, y_train_R2L = overSamplingTech(extraction(R2L_df_train), R2L_df_train['labels'])
x_train_U2R, y_train_U2R = overSamplingTech(extraction(U2R_df_train), U2R_df_train['labels'])

In [None]:
x_test_DoS, y_test_DoS = extraction(DoS_df_test), DoS_df_test['labels']
x_test_Probe, y_test_Probe = extraction(Probe_df_test), Probe_df_test['labels']
x_test_R2L, y_test_R2L = extraction(R2L_df_test), R2L_df_test['labels']
x_test_U2R, y_test_U2R = extraction(U2R_df_test), U2R_df_test['labels']

In [None]:
plot_classes_portions(y_train_DoS, 'DoS', 'Normal vs. DoS')
plot_classes_portions(y_train_Probe, 'Probe', 'Normal vs. Probe')
plot_classes_portions(y_train_R2L, 'R2L', 'Normal vs. R2L')
plot_classes_portions(y_train_U2R, 'U2R', 'Normal vs. U2R')

## Feature Selection

## SelectFromModel

### LogisticRegression

In [None]:
def fs_lr(x, y, max_features=117):
    """Select features with an L2-penalised logistic regression.

    The features are min-max scaled, a LogisticRegression is fitted inside
    SelectFromModel (default threshold), and the names of the selected
    columns are printed.

    x            -- pandas DataFrame of features.
    y            -- labels aligned with the rows of x.
    max_features -- upper bound on the number of selected features; capped at
                    the number of columns in x so narrower tables still work.

    Returns the boolean support mask over the columns of x.
    """
    x_scaled = MinMaxScaler().fit_transform(x)
    lr_selector = SelectFromModel(
        LogisticRegression(penalty="l2", max_iter=10000),
        max_features=min(max_features, x.shape[1]),
    )
    lr_selector.fit(x_scaled, y)
    lr_support = lr_selector.get_support()
    print(x.columns[lr_support].tolist())
    return lr_support

### SVM

In [None]:
def fs_svm(x, y, max_features=117):
    """Select features with a linear-kernel support vector classifier.

    The features are min-max scaled, an SVC(kernel='linear') is fitted inside
    SelectFromModel (default threshold), and the names of the selected
    columns are printed.

    x            -- pandas DataFrame of features.
    y            -- labels aligned with the rows of x.
    max_features -- upper bound on the number of selected features; capped at
                    the number of columns in x so narrower tables still work.

    Returns the boolean support mask over the columns of x.
    """
    x_scaled = MinMaxScaler().fit_transform(x)
    svm_selector = SelectFromModel(
        SVC(kernel='linear'),
        max_features=min(max_features, x.shape[1]),
    )
    svm_selector.fit(x_scaled, y)
    svm_support = svm_selector.get_support()
    print(x.columns[svm_support].tolist())
    return svm_support

### RandomForest

In [None]:
def fs_rf(x, y, max_features=117, random_state=None):
    """Select features with a 1000-tree random forest.

    A RandomForestClassifier is fitted inside SelectFromModel (default
    threshold) and the names of the selected columns are printed.

    x            -- pandas DataFrame of features.
    y            -- labels aligned with the rows of x.
    max_features -- upper bound on the number of selected features; capped at
                    the number of columns in x so narrower tables still work.
    random_state -- optional seed for the forest; None keeps the previous
                    unseeded behaviour.

    Returns the boolean support mask over the columns of x.
    """
    rf_selector = SelectFromModel(
        RandomForestClassifier(n_estimators=1000, random_state=random_state),
        max_features=min(max_features, x.shape[1]),
    )
    rf_selector.fit(x, y)
    rf_support = rf_selector.get_support()
    print(x.columns[rf_support].tolist())
    return rf_support

### LGBM

In [None]:
def fs_lgbm(x, y, max_features=117, random_state=None):
    """Select features with a LightGBM classifier.

    An LGBMClassifier with the fixed hyper-parameters below is fitted inside
    SelectFromModel (default threshold) and the names of the selected columns
    are printed.

    x            -- pandas DataFrame of features.
    y            -- labels aligned with the rows of x.
    max_features -- upper bound on the number of selected features; capped at
                    the number of columns in x so narrower tables still work.
    random_state -- optional seed for the booster; None keeps the previous
                    unseeded behaviour.

    Returns the boolean support mask over the columns of x.
    """
    lgbc = LGBMClassifier(n_estimators=1000, learning_rate=0.05, num_leaves=64, colsample_bytree=0.2,
                          reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40,
                          random_state=random_state)
    lgb_selector = SelectFromModel(lgbc, max_features=min(max_features, x.shape[1]))
    lgb_selector.fit(x, y)
    lgb_support = lgb_selector.get_support()
    print(x.columns[lgb_support].tolist())
    return lgb_support

### Combine all the previous techniques and keep the features that were chosen by at least *count* feature selection techniques

In [None]:
# Get combined features
def get_fs_features(lr_supp, rf_supp, svm_supp, lgb_supp, count, feature_names=None):
    """Combine four selector masks into a vote table and pick the winners.

    lr_supp, rf_supp, svm_supp, lgb_supp
                  -- boolean support masks, one entry per feature, from the
                     logistic-regression, random-forest, SVM and LightGBM
                     selectors.
    count         -- minimum number of selectors that must have chosen a
                     feature for it to be returned.
    feature_names -- names matching the masks.  When omitted, the columns of
                     the global x_train_DoS are used, as before.

    Returns (vote_table, selected_names).  vote_table is sorted by 'Total' and
    then 'Feature', both descending, with a 1-based index; selected_names lists
    the features whose total is at least `count`, in that same order.

    Side effect: sets the pandas option 'display.max_rows' to None so the
    whole table is shown when displayed.
    """
    pd.set_option('display.max_rows', None)
    if feature_names is None:
        feature_names = list(x_train_DoS.columns)
    vote_columns = ['Logistics', 'Random Forest', 'svm', 'LightGBM']
    feature_selection_df = pd.DataFrame({'Feature': list(feature_names), 'Logistics': lr_supp,
                                         'Random Forest': rf_supp, 'svm': svm_supp,
                                         'LightGBM': lgb_supp})
    # Sum only the vote columns: summing the whole frame would also include the
    # string 'Feature' column, which raises on pandas versions that no longer
    # drop non-numeric columns silently.
    feature_selection_df['Total'] = feature_selection_df[vote_columns].sum(axis=1)
    feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'], ascending=False)
    feature_selection_df.index = range(1, len(feature_selection_df) + 1)
    chosen = feature_selection_df[feature_selection_df.Total >= count]
    extracted_features = list(chosen['Feature'])
    return feature_selection_df, extracted_features

# Prepare train and test data for each attack

### DoS

In [None]:
lr = fs_lr(x_train_DoS, y_train_DoS)
rf = fs_rf(x_train_DoS, y_train_DoS)
sv = fs_svm(x_train_DoS, y_train_DoS)
lgb = fs_lgbm(x_train_DoS, y_train_DoS)

In [None]:
feature_selection_df_DoS, extracted_features_DoS = get_fs_features(lr, rf, sv, lgb, 1)

In [None]:
feature_selection_df_DoS

In [None]:
x_train_DoS_fs = x_train_DoS[extracted_features_DoS]
x_train_DoS_fs.head()

In [None]:
x_test_DoS = x_test_DoS[list(x_train_DoS.columns)]

In [None]:
x_test_DoS_fs = x_test_DoS[extracted_features_DoS]
x_test_DoS_fs.head()

### Probe

In [None]:
lr = fs_lr(x_train_Probe, y_train_Probe)
rf = fs_rf(x_train_Probe, y_train_Probe)
sv = fs_svm(x_train_Probe, y_train_Probe)
lgb = fs_lgbm(x_train_Probe, y_train_Probe)

In [None]:
feature_selection_df_Probe, extracted_features_Probe = get_fs_features(lr, rf, sv, lgb, 1)

In [None]:
feature_selection_df_Probe

In [None]:
x_train_Probe_fs = x_train_Probe[extracted_features_Probe]
x_train_Probe_fs.head()

In [None]:
x_test_Probe = x_test_Probe[list(x_train_Probe.columns)]

In [None]:
x_test_Probe_fs = x_test_Probe[extracted_features_Probe]
x_test_Probe_fs.head()

### R2L

In [None]:
lr = fs_lr(x_train_R2L, y_train_R2L)
rf = fs_rf(x_train_R2L, y_train_R2L)
sv = fs_svm(x_train_R2L, y_train_R2L)
lgb = fs_lgbm(x_train_R2L, y_train_R2L)

In [None]:
feature_selection_df_R2L, extracted_features_R2L = get_fs_features(lr, rf, sv, lgb, 1)

In [None]:
feature_selection_df_R2L

In [None]:
extracted_features_R2L

In [None]:
x_train_R2L_fs = x_train_R2L[extracted_features_R2L]
x_train_R2L_fs.head()

In [None]:
x_test_R2L = x_test_R2L[list(x_train_R2L.columns)]

In [None]:
x_test_R2L_fs = x_test_R2L[extracted_features_R2L]
x_test_R2L_fs.head()

### U2R

In [None]:
lr = fs_lr(x_train_U2R, y_train_U2R)
rf = fs_rf(x_train_U2R, y_train_U2R)
sv = fs_svm(x_train_U2R, y_train_U2R)
lgb = fs_lgbm(x_train_U2R, y_train_U2R)

In [None]:
feature_selection_df_U2R, extracted_features_U2R = get_fs_features(lr, rf, sv, lgb, 1)

In [None]:
feature_selection_df_U2R

In [None]:
x_train_U2R_fs = x_train_U2R[extracted_features_U2R]
x_train_U2R_fs.head()

In [None]:
x_test_U2R = x_test_U2R[list(x_train_U2R.columns)]

In [None]:
x_test_U2R_fs = x_test_U2R[extracted_features_U2R]
x_test_U2R_fs.head()

## Multiclass

In [None]:
# The multiclass section takes its labels from yTrainData / yTestData, so the
# 'labels' column is removed from the scaled feature frames.
# errors='ignore' keeps the cell safe to re-run once the column is gone.
ScaledTrainData = ScaledTrainData.drop(columns=['labels'], errors='ignore')
ScaledTestData = ScaledTestData.drop(columns=['labels'], errors='ignore')

In [None]:
ScaledTestData = ScaledTestData[list(ScaledTrainData.columns)]

In [None]:
dict_r = yTrainData.value_counts()

In [None]:
# Bar heights for the "before" plot, taken from the actual class counts in
# dict_r rather than from hard-coded numbers.  Codes follow the label encoding
# used throughout: 0 normal, 1 DoS, 2 Probe, 3 R2L, 4 U2R.
class_display_names = {0: 'normal', 1: 'DoS', 2: 'Probe', 3: 'R2L', 4: 'U2R'}
d = {class_display_names.get(code, str(code)): int(n_rows)
     for code, n_rows in dict(dict_r).items()}

In [None]:
x = d.keys()
y = d.values()
plt.bar(x, y, color ='maroon', width = 0.4)
plt.xlabel("Attack types")
plt.ylabel("No. of objects")
plt.title('Multiclass Balance Before')
plt.show()

In [None]:
x_train_multiclass, y_train_multiclass = smote(ScaledTrainData, yTrainData)
x_train_multiclass.head()

In [None]:
y_train_multiclass.head()

In [None]:
dict_r = y_train_multiclass.value_counts()
d = dict(dict_r)
x = d.keys()
y = d.values()
plt.bar(x, y, color ='maroon', width = 0.4)
plt.xlabel("Attack types")
plt.ylabel("No. of objects")
plt.title('Multiclass Balance After')
plt.show()

In [None]:
x_test_multiclass, y_test_multiclass= ScaledTestData, yTestData
x_test_multiclass.head()

In [None]:
lr = fs_lr(x_train_multiclass, y_train_multiclass)
rf = fs_rf(x_train_multiclass, y_train_multiclass)
sv = fs_svm(x_train_multiclass, y_train_multiclass)
lgb = fs_lgbm(x_train_multiclass, y_train_multiclass)

In [None]:
feature_selection_df_multiclass, extracted_features_multiclass = get_fs_features(lr, rf, sv, lgb, 1)

In [None]:
feature_selection_df_multiclass

In [None]:
x_train_multiclass_fs = x_train_multiclass[extracted_features_multiclass]
x_train_multiclass_fs.head()

In [None]:
x_test_multiclass_fs = x_test_multiclass[extracted_features_multiclass]
x_test_multiclass_fs.head()

# Models

## Naive Bayes

In [None]:
class NaiveBayes():
    """Gaussian naive Bayes classifier operating on NumPy arrays.

    Each feature is modelled per class by a normal distribution with the mean
    and variance observed in the training data.  Classification is done in
    log space: multiplying many per-feature densities underflows to 0.0 for
    every class once there are enough features, and argmax over all-zero
    posteriors would then always return the first class.
    """

    # Smoothing constant added to the variance-dependent denominators.
    _EPS = 1e-2

    def fit_naive_bayes(self, X, y):
        """Store the data and estimate per-class feature means and variances.

        X -- 2-D array, one row per sample.
        y -- 1-D array of class labels aligned with the rows of X.

        Afterwards self.parameters[i][j] is {"mean", "var"} for class
        self.classes[i] and feature j.
        """
        X, y = np.asarray(X), np.asarray(y)
        self.X, self.y = X, y
        self.classes = np.unique(y)
        self.parameters = []
        for c in self.classes:
            X_where_c = X[y == c]
            self.parameters.append(
                [{"mean": col.mean(), "var": col.var()} for col in X_where_c.T]
            )

    def _log_posteriors(self, X):
        """Return an (n_samples, n_classes) array of unnormalised log posteriors."""
        X = np.asarray(X, dtype=float)
        eps = NaiveBayes._EPS
        scores = np.empty((X.shape[0], len(self.classes)))
        for i, c in enumerate(self.classes):
            mean = np.array([p["mean"] for p in self.parameters[i]], dtype=float)
            var = np.array([p["var"] for p in self.parameters[i]], dtype=float)
            # Logarithm of the density computed by likelihood_naive_bayes.
            log_coeff = -0.5 * np.log(2.0 * np.pi * var + eps)
            log_exponent = -((X - mean) ** 2) / (2.0 * var + eps)
            scores[:, i] = np.log(self.prior_naive_bayes(c)) + (log_coeff + log_exponent).sum(axis=1)
        return scores

    def classify_naive_bayes(self, sample):
        """Return the most probable class label for one feature vector."""
        row = np.asarray(sample, dtype=float).reshape(1, -1)
        return self.classes[np.argmax(self._log_posteriors(row)[0])]

    def predict_naive_bayes(self, X):
        """Return a list with the predicted class label of every row of X."""
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            return []
        best = self._log_posteriors(X).argmax(axis=1)
        return [self.classes[j] for j in best]

    def likelihood_naive_bayes(self, mean, var, x):
        """Return the smoothed Gaussian density of x for the given mean and variance."""
        eps = NaiveBayes._EPS
        coeff = 1.0 / math.sqrt(2.0 * math.pi * var + eps)
        exponent = math.exp(-(math.pow(x - mean, 2) / (2 * var + eps)))
        return coeff * exponent

    def prior_naive_bayes(self, c):
        """Return the fraction of training samples whose label equals c."""
        frequency = np.mean(self.y == c)
        return frequency

    def accuracy_naive_bayes(self, y_true, y_pred):
        """Return the fraction of positions where y_true and y_pred agree."""
        accuracy = np.sum(np.asarray(y_true) == np.asarray(y_pred), axis=0) / len(y_true)
        return accuracy

In [None]:
nv = NaiveBayes()
nv.fit_naive_bayes(x_train_multiclass_fs.to_numpy(), y_train_multiclass.to_numpy())
y_pred = nv.predict_naive_bayes(x_test_multiclass_fs.to_numpy())
accuracy = nv.accuracy_naive_bayes(y_test_multiclass.to_numpy(), y_pred)
print ("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test_multiclass.to_numpy(), y_pred))

In [None]:
nv = NaiveBayes()
nv.fit_naive_bayes(x_train_DoS_fs.to_numpy(), y_train_DoS.to_numpy())
y_pred = nv.predict_naive_bayes(x_test_DoS_fs.to_numpy())
accuracy = nv.accuracy_naive_bayes(y_test_DoS.to_numpy(), y_pred)
print ("Accuracy:", accuracy)

In [None]:
print(classification_report(y_test_DoS.to_numpy(), y_pred))

In [None]:
nv = NaiveBayes()
nv.fit_naive_bayes(x_train_Probe_fs.to_numpy(), y_train_Probe.to_numpy())
y_pred = nv.predict_naive_bayes(x_test_Probe_fs.to_numpy())
accuracy = nv.accuracy_naive_bayes(y_test_Probe.to_numpy(), y_pred)
print ("Accuracy:", accuracy)

In [None]:
print(classification_report(y_test_Probe.to_numpy(), y_pred))

In [None]:
nv = NaiveBayes()
nv.fit_naive_bayes(x_train_U2R_fs.to_numpy(), y_train_U2R.to_numpy())
y_pred = nv.predict_naive_bayes(x_test_U2R_fs.to_numpy())
accuracy = nv.accuracy_naive_bayes(y_test_U2R.to_numpy(), y_pred)
print ("Accuracy:", accuracy)

In [None]:
print(classification_report(y_test_U2R.to_numpy(), y_pred))

In [None]:
nv = NaiveBayes()
nv.fit_naive_bayes(x_train_R2L_fs.to_numpy(), y_train_R2L.to_numpy())
y_pred = nv.predict_naive_bayes(x_test_R2L_fs.to_numpy())
accuracy = nv.accuracy_naive_bayes(y_test_R2L.to_numpy(), y_pred)
print ("Accuracy:", accuracy)

In [None]:
print(classification_report(y_test_R2L.to_numpy(), y_pred))

## knn

In [None]:
class knn():
    """k-nearest-neighbours classifier using Euclidean distance and majority vote."""

    def __init__(self, k=5):
        """k is the number of nearest training samples that take part in the vote."""
        self.k = k

    def knn_euclidean_distance(self, x1, x2):
        """Return the Euclidean distance between x1 and x2 as a float.

        Only the first len(x1) components of x2 are used.
        """
        n = len(x1)
        diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)[:n]
        return math.sqrt(float(np.dot(diff, diff)))

    def knn_vote(self, neighbor_labels):
        """Return the most frequent label among the neighbours.

        Labels are cast to non-negative integers for np.bincount; on a tie the
        smallest label wins (argmax returns the first maximum).
        """
        counts = np.bincount(np.asarray(neighbor_labels).astype('int'))
        return counts.argmax()

    def knn_predict(self, X_test, X_train, y_train):
        """Predict a label for every row of X_test.

        X_test, X_train -- 2-D arrays with the same number of columns.
        y_train         -- 1-D array of labels aligned with the rows of X_train.

        Returns a float array with one predicted label per test row.
        """
        X_test = np.asarray(X_test, dtype=float)
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train)
        y_pred = np.empty(X_test.shape[0])
        for row, test_sample in enumerate(X_test):
            # Distances to all training rows in one vectorised step instead of
            # a Python loop over every (test row, train row, feature) triple.
            distances = np.linalg.norm(X_train - test_sample, axis=1)
            # Stable sort: equal distances are ordered by training-row position.
            nearest = np.argsort(distances, kind="stable")[:self.k]
            y_pred[row] = self.knn_vote(y_train[nearest])
        return y_pred

    def accuracy_knn(self, y_true, y_pred):
        """Return the fraction of positions where y_true and y_pred agree."""
        accuracy = np.sum(y_true == y_pred, axis=0) / len(y_true)
        return accuracy

In [None]:
clf = knn(k=3)
y_pred = clf.knn_predict(x_test_multiclass_fs.to_numpy(), x_train_multiclass_fs.to_numpy(), y_train_multiclass.to_numpy())
# accuracy_knn is a method of the classifier; calling it as a bare name
# raises NameError.
accuracy = clf.accuracy_knn(y_test_multiclass.to_numpy(), y_pred)
print ("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test_multiclass.to_numpy(), y_pred))

In [None]:
clf = knn(k=3)
y_pred = clf.knn_predict(x_test_DoS_fs.to_numpy(), x_train_DoS_fs.to_numpy(), y_train_DoS.to_numpy())
# accuracy_knn is a method of the classifier; calling it as a bare name
# raises NameError.
accuracy = clf.accuracy_knn(y_test_DoS.to_numpy(), y_pred)
print ("Accuracy:", accuracy)

In [None]:
print(classification_report(y_test_DoS.to_numpy(), y_pred))

In [None]:
clf = knn(k=3)
y_pred = clf.knn_predict(x_test_Probe_fs.to_numpy(), x_train_Probe_fs.to_numpy(), y_train_Probe.to_numpy())
# accuracy_knn is a method of the classifier; calling it as a bare name
# raises NameError.
accuracy = clf.accuracy_knn(y_test_Probe.to_numpy(), y_pred)
print ("Accuracy:", accuracy)

In [None]:
print(classification_report(y_test_Probe.to_numpy(), y_pred))

In [None]:
clf = knn(k=3)
y_pred = clf.knn_predict(x_test_U2R_fs.to_numpy(), x_train_U2R_fs.to_numpy(), y_train_U2R.to_numpy())
# accuracy_knn is a method of the classifier; calling it as a bare name
# raises NameError.
accuracy = clf.accuracy_knn(y_test_U2R.to_numpy(), y_pred)
print ("Accuracy:", accuracy)

In [None]:
print(classification_report(y_test_U2R.to_numpy(), y_pred))

In [None]:
clf = knn(k=3)
y_pred = clf.knn_predict(x_test_R2L_fs.to_numpy(), x_train_R2L_fs.to_numpy(), y_train_R2L.to_numpy())
# accuracy_knn is a method of the classifier; calling it as a bare name
# raises NameError.
accuracy = clf.accuracy_knn(y_test_R2L.to_numpy(), y_pred)
print ("Accuracy:", accuracy)

In [None]:
print(classification_report(y_test_R2L.to_numpy(), y_pred))