In [25]:
import numpy as np
from joblib import load
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
import math


# Load in naive features train/test data (same known data always, then mirror only or all unknown sets)
# Given which subflow packet size dataset to load
#
# Returns: known train, all unknown train, mirror train, known test, all unknown test, mirror test
def load_naive_train_test(N):
    """Load the naive (basic) feature train/test arrays for the N-packet subflow dataset.

    Six ``.npy`` files are read from the ``Basic-Features/{N}-p-subflows`` directory.

    Returns: known train, all unknown train, mirror train, known test, all unknown test, mirror test
    """
    subflow_dir = f"../Feature-Vectors/train-test-flows/Basic-Features/{N}-p-subflows"

    def _read(group, split):
        # File names follow the pattern "<Group>-<SPLIT>.npy"
        return np.load(f"{subflow_dir}/{group}-{split}.npy")

    # Files are read in the order known, mirror, unknown for each split
    train_known, train_mirror, train_unknown = (
        _read(group, "TRAIN") for group in ("Known", "Mirror", "Unknown")
    )
    test_known, test_mirror, test_unknown = (
        _read(group, "TEST") for group in ("Known", "Mirror", "Unknown")
    )
    return train_known, train_unknown, train_mirror, test_known, test_unknown, test_mirror


# Load in final features train/test data (same known data always, then mirror only or all unknown sets)
# Given which subflow packet size dataset to load
#
# Returns: known train, all unknown train, mirror train, known test, all unknown test, mirror test
def load_final_train_test(N):
    """Load the final feature train/test arrays for the N-packet subflow dataset.

    For N == 100 the known and unknown sets are read from the "Final-80-Random-*" files;
    for every other N they come from the "Final-80-*" files. The mirror sets always use
    the "Final-80-Mirror-*" files.

    Returns: known train, all unknown train, mirror train, known test, all unknown test, mirror test
    """
    subflow_dir = f"../Feature-Vectors/train-test-flows/{N}-p-subflows"
    # Only the known/unknown file names carry the "Random" marker, and only for N == 100
    known_unknown_prefix = "Final-80-Random" if N == 100 else "Final-80"

    def _read(prefix, group, split):
        return np.load(f"{subflow_dir}/{prefix}-{group}-{split}.npy")

    # Train sets (read order: known, mirror, unknown)
    train_known = _read(known_unknown_prefix, "Known", "TRAIN")
    train_mirror = _read("Final-80", "Mirror", "TRAIN")
    train_unknown = _read(known_unknown_prefix, "Unknown", "TRAIN")
    # Test sets (read order: known, mirror, unknown)
    test_known = _read(known_unknown_prefix, "Known", "TEST")
    test_mirror = _read("Final-80", "Mirror", "TEST")
    test_unknown = _read(known_unknown_prefix, "Unknown", "TEST")
    return train_known, train_unknown, train_mirror, test_known, test_unknown, test_mirror


# Combines individual train/test sets into 2 train/test sets and their corresponding labels: 
# Known + All Unknown and Known + Mirror Unknown
#
# Returns all unknown train and labels, all unknown test and labels, 
# mirror unknown train and labels, mirror unknown test and labels
def combine_train_test(known_train, unknown_train, mirror_train, known_test, unknown_test, mirror_test):
    """Build the "Known + All Unknown" and "Known + Mirror Unknown" train/test sets with labels.

    In every combined set the known rows come first and are labelled 1; the unknown
    (or mirror) rows follow and are labelled 0.

    Returns: all unknown train and labels, all unknown test and labels,
    mirror unknown train and labels, mirror unknown test and labels
    """
    def _stack_with_labels(known, other):
        # Known rows first (label 1), then the other rows (label 0)
        data = np.concatenate((known, other), axis=0)
        labels = np.concatenate((np.ones(known.shape[0]), np.zeros(other.shape[0])), axis=0)
        return data, labels

    all_train, all_train_labels = _stack_with_labels(known_train, unknown_train)
    all_test, all_test_labels = _stack_with_labels(known_test, unknown_test)
    mirror_train_set, mirror_train_labels = _stack_with_labels(known_train, mirror_train)
    mirror_test_set, mirror_test_labels = _stack_with_labels(known_test, mirror_test)
    return (
        all_train, all_train_labels, all_test, all_test_labels,
        mirror_train_set, mirror_train_labels, mirror_test_set, mirror_test_labels,
    )


# Loading in naive features test flows dicts
#
# Returns: all unknown test flows dict, mirror test flows dict, known test flows dict
def load_naive_test_dicts(N):
    """Load the naive (basic) feature test flow dictionaries for the N-packet subflow dataset.

    Returns: all unknown test flows dict, mirror test flows dict, known test flows dict
    """
    flows_dir = f"../Feature-Vectors/train-test-flows/Basic-Features/{N}-p-subflows"
    # NOTE: `load` is joblib's loader (imported at the top of the file); it unpickles the
    # files, so it should only be pointed at trusted data.
    unknown_flows, mirror_flows, known_flows = (
        load(f"{flows_dir}/{group}-TEST-Flows-Dict") for group in ("Unknown", "Mirror", "Known")
    )
    return unknown_flows, mirror_flows, known_flows


# 5 Fold CV over tuned_parameters of GBDT
# Folds created over passed in training data/labels
def cv(train, labels):
    """Run a 5-fold grid search over GBDT hyper-parameters and print the results.

    The folds are created over the passed-in training data/labels. This is CV on the
    subflow feature vectors only (no flow classification).

    Parameters:
        train: training feature vectors, passed straight to ``GridSearchCV.fit``.
        labels: training labels matching ``train``.

    Returns: nothing; the best parameter set and the mean/std score for every grid
    point are printed.
    """
    # NOTE: this is an exhaustive grid on the order of 4,500 parameter combinations, each
    # fitted once per fold, so a full run is very expensive.
    tuned_parameters = [{'max_depth': np.arange(1, 6), 'min_samples_leaf': np.arange(1, 11),
                         'min_samples_split': np.arange(2, 11), 'learning_rate': np.arange(0.1, 1.1, 0.1)}]
    # Request 5 folds explicitly: the default number of folds depends on the installed
    # scikit-learn version, so relying on it does not guarantee the documented 5-fold CV.
    search = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=5, n_jobs=-1)
    search.fit(train, labels)
    print("Best parameters set found on development set:")
    print()
    print(search.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = search.cv_results_['mean_test_score']
    stds = search.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, search.cv_results_['params']):
        # Report the mean score with a +/- 2 standard deviation band
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
        
        
# Combines individual train sets into 2 balanced train sets and their corresponding labels: 
# Known + All Unknown and Known + Mirror Unknown
# Each class contributes the same number of randomly selected samples: 50% of the size of the smallest train set
#
# Returns: all unknown train and labels, mirror unknown train and labels (known data the same in both sets)
def cv_data(known_train, unknown_train, mirror_train):
    """Build two balanced, randomly sub-sampled train sets for cross validation.

    ``count`` samples are drawn at random from each input set, where ``count`` is half
    the size (rounded down) of the smallest of the three sets. The same known sample is
    used in both returned sets. Known rows are labelled 1, unknown/mirror rows 0.

    The input arrays are NOT modified: rows are selected through a random permutation
    of indices instead of shuffling the caller's arrays in place.

    Parameters:
        known_train, unknown_train, mirror_train: train sets indexed by sample along
            the first axis.

    Returns: all unknown train and labels, mirror unknown train and labels
    """
    count = min(len(known_train), len(unknown_train), len(mirror_train)) // 2

    def _sample(rows):
        # Indexing with an index array returns a copy, so the caller's data keeps its order
        rows = np.asarray(rows)
        return rows[np.random.permutation(len(rows))[:count]]

    # Draw in the order known, unknown, mirror
    known_sample = _sample(known_train)
    unknown_sample = _sample(unknown_train)
    mirror_sample = _sample(mirror_train)

    all_unknown_train = np.concatenate((known_sample, unknown_sample), axis=0)
    all_unknown_train_l = np.concatenate((np.ones(count), np.zeros(count)), axis=0)
    mirror_unknown_train = np.concatenate((known_sample, mirror_sample), axis=0)
    mirror_unknown_train_l = np.concatenate((np.ones(count), np.zeros(count)), axis=0)
    return all_unknown_train, all_unknown_train_l, mirror_unknown_train, mirror_unknown_train_l


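# Classify every flow in the given flow dictionary by majority vote (win by 2) over the
# predictions the given fitted model makes for the flow's subflows, and score against `label`
#
# Returns: list of flow keys, flow-level accuracy, number of uncertain flows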
def maj_vote_flows(flow_dict, model, label):
    """Classify each flow by majority vote over its subflow predictions and score it.

    Parameters:
        flow_dict: mapping of flow key -> subflow feature vectors for that flow; each
            value is passed as-is to ``model.predict``.
        model: object exposing ``predict``.
        label: the true label of every flow in ``flow_dict``; compared against each
            flow's voted prediction.

    Voting rule:
        * If the subflow predictions contain exactly two distinct values, the smaller
          value is treated as class 0 and the larger as class 1 (``np.unique`` returns
          the distinct values in sorted order). A class wins only when it leads by at
          least 2 votes; otherwise the flow is counted as uncertain and is not scored
          as correct.
        * Otherwise the flow takes the first subflow's prediction, with -1 mapped to 0.

    Returns: ``(flows, acc, uncertain)`` - the list of flow keys, the fraction of ALL
    flows (uncertain ones included in the denominator) whose voted label equals
    ``label``, and the number of uncertain flows. For an empty ``flow_dict`` the
    accuracy is reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    flows = list(flow_dict.keys())
    uncertain = 0
    correct = 0
    for subflows in flow_dict.values():
        # Classify all subflow feature vectors of a flow
        predictions = model.predict(subflows)
        # Classify flow by majority vote (win by 2)
        label_counts = np.unique(predictions, return_counts=True)[1]
        if len(label_counts) == 2:
            margin = int(label_counts[1]) - int(label_counts[0])
            if margin <= -2:
                prediction = 0
            elif margin >= 2:
                prediction = 1
            else:
                # Neither class leads by 2 votes: uncertain
                uncertain += 1
                continue
        else:
            # if only one label for all subflows, label flow as that label
            prediction = predictions[0]
            if prediction == -1:
                prediction = 0
        if label == prediction:
            correct += 1
    # Guard the division so an empty dictionary yields 0.0 accuracy rather than an error
    acc = correct / len(flow_dict) if len(flow_dict) else 0.0
    return flows, acc, uncertain

In [27]:
# Final features: load the N-packet subflow data and build the balanced CV train sets
N = 25
(
    known_train, unknown_train, mirror_train,
    known_test, unknown_test, mirror_test,
) = load_final_train_test(N)
(
    all_unknown_train, all_unknown_train_l,
    mirror_unknown_train, mirror_unknown_train_l,
) = cv_data(known_train, unknown_train, mirror_train)

# Grid-search CV on the subflow feature vectors only (no flow classification)
print("CV on Mirror Unknown")
cv(mirror_unknown_train, mirror_unknown_train_l)

CV on Mirror Unknown


KeyboardInterrupt: 

In [21]:
# Naive features: load the N-packet subflow data and build the combined train/test sets
N = 25
(
    known_train, unknown_train, mirror_train,
    known_test, unknown_test, mirror_test,
) = load_naive_train_test(N)
(
    all_unknown_train, all_unknown_train_l, all_unknown_test, all_unknown_test_l,
    mirror_unknown_train, mirror_unknown_train_l, mirror_unknown_test, mirror_unknown_test_l,
) = combine_train_test(known_train, unknown_train, mirror_train, known_test, unknown_test, mirror_test)

# Grid-search CV on the subflow feature vectors only (no flow classification)
print("CV on Mirror Unknown")
cv(mirror_unknown_train, mirror_unknown_train_l)

CV on Mirror Unknown


KeyboardInterrupt: 