In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import pandas as pd
import json


def load_json(json_path):
    """Read the JSON document stored at ``json_path`` and return it decoded.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's default locale encoding.

    Parameters
    ----------
    json_path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's content.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    ValueError
        If the content is not valid UTF-8 or not valid JSON
        (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)


In [41]:
import joblib
import json
import time


def train_and_save(pipelines, dataset_name, x, y, score_rows=10000):
    """Fit each pipeline with its tuned parameters, persist it and log the run.

    For every ``(model_name, params, pipeline)`` entry the pipeline is
    configured from ``params['best_parameters']``, fitted on ``x`` / ``y``,
    scored on the first ``score_rows`` rows of that same data and dumped with
    joblib to ``../artefacts/6/models_{dataset_name}_{model_name}.joblib``.
    The elapsed time and the score are stored under
    ``results[model_name]['train']`` and
    ``../artefacts/6/results_{dataset_name}.json`` is rewritten after every
    model, so the entries of already finished models are on disk even if a
    later model fails or the run is interrupted.

    Parameters
    ----------
    pipelines : iterable of (str, dict, estimator)
        Model name, a mapping holding a ``'best_parameters'`` dict that is
        passed to ``set_params``, and the estimator to fit.
    dataset_name : str
        Used to build the model and results file names.
    x, y : objects supporting ``.iloc`` slicing
        Training features and targets.
    score_rows : int or None, default 10000
        Number of leading rows of ``x`` / ``y`` used for the reported score;
        ``None`` scores on all rows.

    Raises
    ------
    TypeError
        If a ``params`` mapping has no ``'best_parameters'`` entry (the
        ``**`` unpacking of ``None`` fails).
    """
    results_path = f'../artefacts/6/results_{dataset_name}.json'

    # Best effort: start from previous results when they are readable.
    # Only I/O and decoding failures are tolerated, so a KeyboardInterrupt
    # or a programming error is no longer swallowed here.
    try:
        results = load_json(results_path)
    except (OSError, ValueError):
        results = {}
    if not isinstance(results, dict):
        # A results file that does not hold a JSON object cannot be updated.
        results = {}

    for model_name, params, pipeline in pipelines:
        # perf_counter is monotonic, so the duration cannot be distorted by
        # system clock adjustments during a long fit.
        start = time.perf_counter()
        print(f"Training {model_name}", end=' -> ')
        pipeline.set_params(**params.get('best_parameters'))
        pipeline.fit(x, y)
        print(f"Saving {model_name}", end=' -> ')
        # The score is computed on rows the model was fitted on.
        score = pipeline.score(x.iloc[:score_rows], y.iloc[:score_rows])
        print(f"Score: {score}")
        joblib.dump(
            pipeline, f'../artefacts/6/models_{dataset_name}_{model_name}.joblib')
        # NOTE(review): the recorded time spans fit, scoring and the joblib
        # dump, not the fit alone -- confirm this is the intended metric.
        elapsed = time.perf_counter() - start

        # Keep any other keys already stored for this model; replace the
        # entry only when it is missing or is not a dict.
        entry = results.get(model_name)
        if not isinstance(entry, dict):
            entry = results[model_name] = {}
        entry['train'] = {
            'time': elapsed,
            'score': score
        }

        # Rewritten after every model so progress is checkpointed.
        with open(results_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4)


In [42]:

# Training KDD Models

verbose = False

# One row per KDD model: (model name, tuned-parameter file, final estimator).
kdd_model_specs = [
    ("ada_boost",
     '../artefacts/6/params_kdd_ada_boost.json',
     OneVsRestClassifier(AdaBoostClassifier(), verbose=verbose, n_jobs=-1)),
    ("decision_tree",
     '../artefacts/6/params_kdd_decision_tree.json',
     DecisionTreeClassifier()),
    ("knn",
     '../artefacts/6/params_kdd_knn.json',
     KNeighborsClassifier(n_jobs=-1)),
    ("logistic_regression",
     '../artefacts/6/params_kdd_logistic_regression.json',
     OneVsRestClassifier(
         LogisticRegression(n_jobs=-1, verbose=verbose),
         verbose=verbose, n_jobs=-1)),
    ("mlp",
     '../artefacts/6/params_kdd_mlp.json',
     MLPClassifier(verbose=verbose)),
    ("random_forest",
     '../artefacts/6/params_kdd_random_forest.json',
     RandomForestClassifier(verbose=verbose, n_jobs=-1)),
    ("svm",
     '../artefacts/6/params_kdd_svm.json',
     OneVsRestClassifier(SVC(verbose=verbose), verbose=verbose, n_jobs=-1)),
]

# Every estimator is placed behind its own StandardScaler step, and the
# parameter file is loaded alongside it, in the order listed above.
kdd_pipelines = [
    (
        model_name,
        load_json(params_path),
        Pipeline([('scaler', StandardScaler()), ('clf', classifier)]),
    )
    for model_name, params_path, classifier in kdd_model_specs
]

df_kdd = pd.read_csv('../artefacts/5/kdd_train.csv', index_col=0)

# The last three columns are passed as targets, all preceding ones as features.
kdd_x = df_kdd.iloc[:, :-3]
kdd_y = df_kdd.iloc[:, -3:]

train_and_save(kdd_pipelines, 'kdd', kdd_x, kdd_y)


Training ada_boost -> Saving ada_boost -> Score: 0.9904
Training decision_tree -> Saving decision_tree -> Score: 0.9986
Training knn -> Saving knn -> Score: 0.9999
Training logistic_regression -> Saving logistic_regression -> Score: 0.96
Training mlp -> Saving mlp -> Score: 0.9878
Training random_forest -> Saving random_forest -> Score: 0.9962
Training svm -> Saving svm -> Score: 0.9971


# Training UNSW Models


In [43]:
verbose = False

# One row per UNSW model: (model name, tuned-parameter file, final estimator).
unsw_model_specs = [
    ("ada_boost",
     '../artefacts/6/params_unsw_ada_boost.json',
     OneVsRestClassifier(AdaBoostClassifier(), verbose=verbose, n_jobs=-1)),
    ("decision_tree",
     '../artefacts/6/params_unsw_decision_tree.json',
     DecisionTreeClassifier()),
    ("knn",
     '../artefacts/6/params_unsw_knn.json',
     KNeighborsClassifier(n_jobs=-1)),
    ("logistic_regression",
     '../artefacts/6/params_unsw_logistic_regression.json',
     OneVsRestClassifier(
         LogisticRegression(n_jobs=-1, verbose=verbose),
         verbose=verbose, n_jobs=-1)),
    ("mlp",
     '../artefacts/6/params_unsw_mlp.json',
     MLPClassifier(verbose=verbose)),
    ("random_forest",
     '../artefacts/6/params_unsw_random_forest.json',
     RandomForestClassifier(verbose=verbose, n_jobs=-1)),
    ("svm",
     '../artefacts/6/params_unsw_svm.json',
     OneVsRestClassifier(SVC(verbose=verbose), verbose=verbose, n_jobs=-1)),
]

# Every estimator is placed behind its own StandardScaler step, and the
# parameter file is loaded alongside it, in the order listed above.
unsw_pipelines = [
    (
        model_name,
        load_json(params_path),
        Pipeline([('scaler', StandardScaler()), ('clf', classifier)]),
    )
    for model_name, params_path, classifier in unsw_model_specs
]

df_unsw = pd.read_csv('../artefacts/5/unsw_train.csv', index_col=0)

# The last two columns are passed as targets, all preceding ones as features.
unsw_x = df_unsw.iloc[:, :-2]
unsw_y = df_unsw.iloc[:, -2:]

train_and_save(unsw_pipelines, 'unsw', unsw_x, unsw_y)

Training ada_boost -> Saving ada_boost -> Score: 0.9808
Training decision_tree -> Saving decision_tree -> Score: 0.9943
Training knn -> Saving knn -> Score: 1.0
Training logistic_regression -> 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Saving logistic_regression -> Score: 0.9829
Training mlp -> Saving mlp -> Score: 0.9862
Training random_forest -> Saving random_forest -> Score: 0.9963
Training svm -> 

KeyboardInterrupt: 