In [1]:
# importing necessarily libraries for the binary classification task

# libraries imported for data processing and analysis
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

# libraries imported for learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import pipeline

# libraries imported for performance metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [2]:
# load 'Electrical Grid Stability' data and names into pandas dataframe

# load data by using read_csv from .data file
df = pd.read_csv("datasets/HTRU2/HTRU_2.csv")

# clean data
# drop all samples with NaN entries
df = df.dropna()

df

Unnamed: 0,mean_int,stddev_int,excess_int,skew_int,mean_dmsnr,stddev_dmsnr,excess_dmsnr,skew_dmsnr,class
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306,0
...,...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022,0
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092,0
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173,0
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910,0


In [3]:
# pre-declared values/arrays/functions to be used once inside the trial loop
# C values for logistic regression regularization in range of 10(-8) to 10(4)
Cvals = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
# K values for k-nearest neighbors in range of 1 to 105 in steps of 4
Kvals = np.linspace(1, 105, num=26, dtype=int).tolist()
# max feature values for random forest similar to CNM06
max_features = [1, 2, 4, 6, 8, 12, 16, 20]
# max depth values for decision trees (shallower = better)
max_depths = np.linspace(1, 5, num=5, dtype=int).tolist()
# array of performance metrics
scoring = ['accuracy', 'f1_micro', 'roc_auc_ovr']

# build parameter grids to be passed into GridSearchCV
logreg_pgrid = {'classifier__penalty': ['l1','l2'], 'classifier__C': Cvals, 'classifier__max_iter': [5000]}
knn_pgrid = {'classifier__weights': ['distance'], 'classifier__n_neighbors': Kvals}
rforest_pgrid = {'classifier__n_estimators': [1024], 'classifier__max_features': max_features}
dtree_pgrid = {'classifier__max_depth': max_depths}

# arrays + dictionaries to store scores
score_dict = [{}, {}, {}, {}, {}]

# loop through this entire trial FIVE (5) times
for i in range(5):
    # slice the dataframe to not include the binary classifier (label)
    # last column is the label (income>50K)
    X, y = df.iloc[:,:-1], df.iloc[:,-1]

    # randomly pick 5000 samples with replacement for training set
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)

    # make pipeline for each algorithms to condense model call
    logreg = pipeline.Pipeline([('scale', StandardScaler()), ('classifier', LogisticRegression(n_jobs=-1))])
    knn = pipeline.Pipeline([('scale', StandardScaler()), ('classifier', KNeighborsClassifier(n_jobs=-1))])
    rforest = pipeline.Pipeline([('scale', StandardScaler()), ('classifier', RandomForestClassifier(n_jobs=-1))])
    dtree = pipeline.Pipeline([('scale', StandardScaler()), ('classifier', DecisionTreeClassifier())])

    # 5-fold cross validation using Stratified KFold
    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    # GridSearchCV classifier for each algorithm
    logreg_clf = GridSearchCV(estimator=logreg, param_grid=logreg_pgrid, scoring=scoring, 
                                n_jobs=-1, cv=k_fold, verbose=2, refit=False)
    knn_clf = GridSearchCV(estimator=knn, param_grid=knn_pgrid, scoring=scoring, 
                                n_jobs=-1, cv=k_fold, verbose=2, refit=False)
    rforest_clf = GridSearchCV(estimator=rforest, param_grid=rforest_pgrid, scoring=scoring, 
                                n_jobs=-1, cv=k_fold, verbose=2, refit=False)
    dtree_clf = GridSearchCV(estimator=dtree, param_grid=dtree_pgrid, scoring=scoring, 
                                n_jobs=-1, cv=k_fold, verbose=2, refit=False)



    # for each classifier
    for clf, clf_name in zip([logreg_clf, knn_clf, rforest_clf, dtree_clf], 
                ['LogReg', 'KNN', 'Ran_For', 'Dec_Tree']):
        # fit to training data of 5000 samples
        clf.fit(X_train, y_train)

        # get parameters for each scoring metric's best
        best_acc_param = clf.cv_results_['params'][ np.argmin(clf.cv_results_['rank_test_accuracy']) ]
        best_f1_param = clf.cv_results_['params'][ np.argmin(clf.cv_results_['rank_test_f1_micro']) ]
        best_roc_param = clf.cv_results_['params'][ np.argmin(clf.cv_results_['rank_test_roc_auc_ovr']) ]

        # get pipeline based on current classifier
        if (clf_name == 'LogReg'):
            pipe = logreg
        elif (clf_name == 'KNN'):
            pipe = knn
        elif (clf_name == 'Ran_For'):
            pipe = rforest
        elif (clf_name == 'Dec_Tree'):
            pipe = dtree

        # set pipeline parameters to the parameters for best accuracy
        pipe.set_params(**best_acc_param)
        # fit classifier with training data and new parameters for scoring metric
        pipe.fit(X_train, y_train)
        # get predictions for both training and testing data
        y_train_pred = pipe.predict(X_train)
        y_test_pred = pipe.predict(X_test)

        # get scores for all metrics from both training and testing data
        acc_train = accuracy_score(y_train, y_train_pred)
        f1_train = f1_score(y_train, y_train_pred)
        roc_auc_train = roc_auc_score(y_train, y_train_pred)

        acc_test = accuracy_score(y_test, y_test_pred)
        f1_test = f1_score(y_test, y_test_pred)
        roc_auc_test = roc_auc_score(y_test, y_test_pred)

        # store all scores into a dictionary for accuracy metric
        acc_dict = {'acc_train': acc_train, 'f1_train': f1_train, 'roc_auc_train': roc_auc_train, 
                    'acc_test': acc_test, 'f1_test': f1_test, 'roc_auc_test': roc_auc_test}

        
        # do ^^^^^ all that for f1 score
        pipe.set_params(**best_f1_param)
        pipe.fit(X_train, y_train)
        y_train_pred = pipe.predict(X_train)
        y_test_pred = pipe.predict(X_test)

        acc_train = accuracy_score(y_train, y_train_pred)
        f1_train = f1_score(y_train, y_train_pred)
        roc_auc_train = roc_auc_score(y_train, y_train_pred)

        acc_test = accuracy_score(y_test, y_test_pred)
        f1_test = f1_score(y_test, y_test_pred)
        roc_auc_test = roc_auc_score(y_test, y_test_pred)

        f1_dict = {'acc_train': acc_train, 'f1_train': f1_train, 'roc_auc_train': roc_auc_train, 
                    'acc_test': acc_test, 'f1_test': f1_test, 'roc_auc_test': roc_auc_test}


        # do ^^^^^ all that for roc_auc score
        pipe.set_params(**best_roc_param)
        pipe.fit(X_train, y_train)
        y_train_pred = pipe.predict(X_train)
        y_test_pred = pipe.predict(X_test)

        acc_train = accuracy_score(y_train, y_train_pred)
        f1_train = f1_score(y_train, y_train_pred)
        roc_auc_train = roc_auc_score(y_train, y_train_pred)

        acc_test = accuracy_score(y_test, y_test_pred)
        f1_test = f1_score(y_test, y_test_pred)
        roc_auc_test = roc_auc_score(y_test, y_test_pred)

        roc_auc_dict = {'acc_train': acc_train, 'f1_train': f1_train, 'roc_auc_train': roc_auc_train, 
                    'acc_test': acc_test, 'f1_test': f1_test, 'roc_auc_test': roc_auc_test}

        # build final dictionary to store all scores from all three models and their best parameters
        score_dict[i][clf_name] = {'acc_dict': acc_dict, 'f1_dict': f1_dict, 'roc_auc_dict': roc_auc_dict}

Fitting 5 folds for each of 26 candidates, totalling 130 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 130 out of 130 | elapsed:    2.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Fitting 5 folds for each of 26 candidates, totalling 130 fits
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 130 out of 130 | elapsed:    2.2s finished
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   35.8s finished
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 26 candidates, totalling 130 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent 

In [4]:
print(score_dict)

[{'LogReg': {'acc_dict': {'acc_train': 0.9826, 'f1_train': 0.8937728937728937, 'roc_auc_train': 0.9195783904677127, 'acc_test': 0.9776709567374787, 'f1_test': 0.8726790450928382, 'roc_auc_test': 0.9065503246436406}, 'f1_dict': {'acc_train': 0.9826, 'f1_train': 0.8937728937728937, 'roc_auc_train': 0.9195783904677127, 'acc_test': 0.9776709567374787, 'f1_test': 0.8726790450928382, 'roc_auc_test': 0.9065503246436406}, 'roc_auc_dict': {'acc_train': 0.9816, 'f1_train': 0.8869778869778869, 'roc_auc_train': 0.9138180218041181, 'acc_test': 0.9766630485346566, 'f1_test': 0.8658047258136424, 'roc_auc_test': 0.9000396023723098}}, 'KNN': {'acc_dict': {'acc_train': 1.0, 'f1_train': 1.0, 'roc_auc_train': 1.0, 'acc_test': 0.9761203287331369, 'f1_test': 0.8628673196794302, 'roc_auc_test': 0.89899592372356}, 'f1_dict': {'acc_train': 1.0, 'f1_train': 1.0, 'roc_auc_train': 1.0, 'acc_test': 0.9761203287331369, 'f1_test': 0.8628673196794302, 'roc_auc_test': 0.89899592372356}, 'roc_auc_dict': {'acc_train': 1