In [25]:
# selected machine learning models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc, precision_recall_curve, average_precision_score, f1_score, precision_score, recall_score, make_scorer

from sklearn.exceptions import ConvergenceWarning
ConvergenceWarning('ignore')

random_state = 719


In [26]:
# create parameter grids
decision_tree_param_grid = {
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'clf__min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'clf__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'clf__max_features': [None, 'sqrt', 'log2'],
    'clf__class_weight': [None, 'balanced']
}

random_forest_param_grid = {
    'clf__n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'clf__min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'clf__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'clf__max_features': [None, 'sqrt', 'log2'],
    'clf__class_weight': [None, 'balanced']
}

knn_param_grid = {
    'clf__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'clf__weights': ['uniform', 'distance'],
    'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'clf__leaf_size': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'clf__p': [1, 2]
}

logistic_regression_param_grid = {
    'clf__estimator__penalty': ['l2'],
    'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'clf__estimator__class_weight': [None, 'balanced']
}

svm_param_grid = {
    'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'clf__estimator__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'clf__estimator__degree': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'clf__estimator__gamma': ['auto', 'scale'],
    'clf__estimator__class_weight': [None, 'balanced']
}

mlp_param_grid = {
    'clf__hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,), (60,), (70,), (80,), (90,), (100,)],
    'clf__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'clf__solver': ['lbfgs', 'sgd', 'adam'],
    'clf__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'clf__learning_rate': ['constant', 'invscaling', 'adaptive'],
    'clf__learning_rate_init': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'clf__power_t': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'clf__max_iter': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'clf__shuffle': [True, False],
    'clf__random_state': [random_state],
    'clf__tol': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'clf__verbose': [False],
    'clf__warm_start': [True, False],
    'clf__momentum': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'clf__nesterovs_momentum': [True, False],
    'clf__early_stopping': [True, False],
    'clf__validation_fraction': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'clf__beta_1': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'clf__beta_2': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'clf__epsilon': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'clf__n_iter_no_change': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

ada_boost_param_grid = {
    'clf__estimator__n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'clf__estimator__learning_rate': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'clf__estimator__algorithm': ['SAMME', 'SAMME.R']
}


In [27]:
import json


def best_parameters(x_train, y_train, dataset_name):

    # create pipelines
    decision_tree = Pipeline(
        [('scaler', StandardScaler()), ('clf', DecisionTreeClassifier(random_state=random_state))])
    random_forest = Pipeline(
        [('scaler', StandardScaler()), ('clf', RandomForestClassifier(random_state=random_state))])
    knn = Pipeline(
        [('scaler', StandardScaler()), ('clf', KNeighborsClassifier())])
    logistic_regression = Pipeline(
        [('scaler', StandardScaler()), ('clf', OneVsRestClassifier(estimator=LogisticRegression(random_state=random_state)))])
    svm = Pipeline(
        [('scaler', StandardScaler()), ('clf', OneVsRestClassifier(SVC(random_state=random_state)))])
    mlp = Pipeline(
        [('scaler', StandardScaler()), ('clf', MLPClassifier(random_state=random_state))])
    ada_boost = Pipeline(
        [('scaler', StandardScaler()), ('clf', OneVsRestClassifier(AdaBoostClassifier(random_state=random_state)))])

    pipeline_grids = [
        ("decision_tree", decision_tree, decision_tree_param_grid),
        ("random_forest", random_forest, random_forest_param_grid),
        ("knn", knn, knn_param_grid),
        ("logistic_regression", logistic_regression, logistic_regression_param_grid),
        ("svm", svm, svm_param_grid),
        ("mlp", mlp, mlp_param_grid),
        ("ada_boost", ada_boost, ada_boost_param_grid)
    ]

    best_params = []

    for model_name, pipeline, grid in pipeline_grids:
        try:
            grid_search = RandomizedSearchCV(
                pipeline,
                grid,
                scoring='roc_auc',
                n_jobs=-1,
                cv=3,
                n_iter=400)

            grid_search.fit(x_train, y_train)

            best_param = {'model': model_name,
                          'best_parameters': grid_search.best_params_,
                          'best_score': grid_search.best_score_}

            with open("../artefacts/6/params_{}_{}.json".format(dataset_name, model_name), 'w') as f:
                json.dump(best_param, f)

            best_params.append(best_param)
            print(best_param)
        except Exception as e:
            print("error with model: ", model_name, " - ", e)

    return best_params


In [28]:
import pandas as pd

sample_size = 1000

df_kdd = pd.read_csv(
    '../artefacts/5/kdd_train.csv', index_col=0).sample(sample_size, random_state=random_state)

df_kdd.head()


Unnamed: 0_level_0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,Class_benign,Class_dos,Class_probe
index1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
75368.0,-0.110249,-0.007757,-0.004919,0.0,-0.089486,-0.007736,-0.095076,-0.027023,0.0,-0.011664,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0
43738.0,-0.110249,-0.007762,-0.004919,0.0,-0.089486,-0.007736,-0.095076,-0.027023,0.0,-0.011664,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1,0
111235.0,-0.110249,-0.007719,-0.003477,0.0,-0.089486,-0.007736,-0.095076,-0.027023,1.0,-0.011664,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0
50758.0,-0.110249,-0.007762,-0.004919,0.0,-0.089486,-0.007736,-0.095076,-0.027023,0.0,-0.011664,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1,0
10934.0,-0.110249,-0.007762,-0.004919,0.0,-0.089486,-0.007736,-0.095076,-0.027023,0.0,-0.011664,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1,0


In [29]:
kdd_x = df_kdd.iloc[:, :-3]
kdd_y = df_kdd.iloc[:, -3:]

params = best_parameters(kdd_x, kdd_y, 'kdd')


{'model': 'decision_tree', 'best_parameters': {'clf__min_samples_split': 10, 'clf__min_samples_leaf': 5, 'clf__max_features': None, 'clf__max_depth': 9, 'clf__criterion': 'gini', 'clf__class_weight': None}, 'best_score': 0.9875842415690523}
{'model': 'random_forest', 'best_parameters': {'clf__n_estimators': 40, 'clf__min_samples_split': 4, 'clf__min_samples_leaf': 2, 'clf__max_features': None, 'clf__max_depth': None, 'clf__criterion': 'gini', 'clf__class_weight': 'balanced'}, 'best_score': 0.9996966576328202}
{'model': 'knn', 'best_parameters': {'clf__weights': 'distance', 'clf__p': 2, 'clf__n_neighbors': 9, 'clf__leaf_size': 60, 'clf__algorithm': 'auto'}, 'best_score': 0.9890189773634009}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'model': 'logistic_regression', 'best_parameters': {'clf__estimator__penalty': 'l2', 'clf__estimator__class_weight': 'balanced', 'clf__estimator__C': 0.01}, 'best_score': 0.9943246418873324}
{'model': 'svm', 'best_parameters': {'clf__estimator__kernel': 'rbf', 'clf__estimator__gamma': 'scale', 'clf__estimator__degree': 3, 'clf__estimator__class_weight': None, 'clf__estimator__C': 10}, 'best_score': 0.995490632575616}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

{'model': 'mlp', 'best_parameters': {'clf__warm_start': True, 'clf__verbose': False, 'clf__validation_fraction': 0.8, 'clf__tol': 1, 'clf__solver': 'sgd', 'clf__shuffle': True, 'clf__random_state': 719, 'clf__power_t': 0.7, 'clf__nesterovs_momentum': True, 'clf__n_iter_no_change': 3, 'clf__momentum': 0.5, 'clf__max_iter': 500, 'clf__learning_rate_init': 10, 'clf__learning_rate': 'adaptive', 'clf__hidden_layer_sizes': (40,), 'clf__epsilon': 0.1, 'clf__early_stopping': False, 'clf__beta_2': 0.1, 'clf__beta_1': 0.7, 'clf__alpha': 1, 'clf__activation': 'logistic'}, 'best_score': 0.9974681555910637}


  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X

{'model': 'ada_boost', 'best_parameters': {'clf__estimator__n_estimators': 100, 'clf__estimator__learning_rate': 1, 'clf__estimator__algorithm': 'SAMME'}, 'best_score': 0.999183550658901}


In [30]:
df_unsw = pd.read_csv(
    '../artefacts/5/unsw_train.csv', index_col=0).sample(sample_size, random_state=random_state)

df_unsw.head()

Unnamed: 0_level_0,dur,xProt,xServ,xState,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15433.0,9e-06,2,7,1,2.0,0.0,114.0,0.0,111111.1072,254.0,...,16.0,16.0,16.0,0.0,0.0,0.0,17.0,16.0,0.0,1
11243.0,1.007661,4,2,2,14.0,18.0,1684.0,10168.0,30.764315,31.0,...,1.0,1.0,1.0,0.0,0.0,1.0,2.0,4.0,0.0,0
10539.0,1e-05,10,999,1,2.0,0.0,200.0,0.0,100000.0025,254.0,...,3.0,3.0,4.0,0.0,0.0,0.0,3.0,4.0,0.0,1
67830.0,2.020123,4,999,2,12.0,10.0,994.0,838.0,10.395407,254.0,...,3.0,3.0,3.0,0.0,0.0,0.0,3.0,3.0,0.0,1
7621.0,1.149881,4,2,2,14.0,18.0,1684.0,10168.0,26.959311,31.0,...,1.0,1.0,2.0,0.0,0.0,1.0,3.0,3.0,0.0,0


In [31]:
unsw_x = df_unsw.iloc[:, :-1]
unsw_y = df_unsw.iloc[:, -1:]

# change pandas series to numpy array
unsw_y = unsw_y.values.ravel()

params = best_parameters(unsw_x, unsw_y, 'unsw')


{'model': 'decision_tree', 'best_parameters': {'clf__min_samples_split': 8, 'clf__min_samples_leaf': 2, 'clf__max_features': 'sqrt', 'clf__max_depth': 6, 'clf__criterion': 'gini', 'clf__class_weight': 'balanced'}, 'best_score': 0.9676280025025631}
{'model': 'random_forest', 'best_parameters': {'clf__n_estimators': 60, 'clf__min_samples_split': 8, 'clf__min_samples_leaf': 1, 'clf__max_features': 'sqrt', 'clf__max_depth': 8, 'clf__criterion': 'entropy', 'clf__class_weight': 'balanced'}, 'best_score': 0.9828255112463088}
{'model': 'knn', 'best_parameters': {'clf__weights': 'distance', 'clf__p': 1, 'clf__n_neighbors': 10, 'clf__leaf_size': 100, 'clf__algorithm': 'ball_tree'}, 'best_score': 0.961990532527437}
{'model': 'logistic_regression', 'best_parameters': {'clf__estimator__penalty': 'l2', 'clf__estimator__class_weight': None, 'clf__estimator__C': 100}, 'best_score': 0.9740922597546037}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'model': 'svm', 'best_parameters': {'clf__estimator__kernel': 'linear', 'clf__estimator__gamma': 'auto', 'clf__estimator__degree': 7, 'clf__estimator__class_weight': 'balanced', 'clf__estimator__C': 10}, 'best_score': 0.970593002407328}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

{'model': 'mlp', 'best_parameters': {'clf__warm_start': True, 'clf__verbose': False, 'clf__validation_fraction': 0.7, 'clf__tol': 0.001, 'clf__solver': 'adam', 'clf__shuffle': True, 'clf__random_state': 719, 'clf__power_t': 0.4, 'clf__nesterovs_momentum': True, 'clf__n_iter_no_change': 4, 'clf__momentum': 0.5, 'clf__max_iter': 900, 'clf__learning_rate_init': 1, 'clf__learning_rate': 'adaptive', 'clf__hidden_layer_sizes': (50,), 'clf__epsilon': 0.9, 'clf__early_stopping': False, 'clf__beta_2': 0.5, 'clf__beta_1': 0.9, 'clf__alpha': 0.1, 'clf__activation': 'tanh'}, 'best_score': 0.9753596767292075}


  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X, y, sample_weight)
  sample_weight = np.exp(
  return super().fit(X

{'model': 'ada_boost', 'best_parameters': {'clf__estimator__n_estimators': 90, 'clf__estimator__learning_rate': 1, 'clf__estimator__algorithm': 'SAMME'}, 'best_score': 0.9826945779860577}
