In [4]:
import math
import numpy as np
from joblib import load
from sklearn.ensemble import GradientBoostingClassifier


# Same as above but naive features
def load_naive_test_dicts(N):
    """Load the three naive-feature ("Basic-Features") test-flow files for N-packet subflows.

    Args:
        N: subflow size in packets; selects the ``{N}-p-subflows`` directory.

    Returns:
        Tuple ``(unknown_test_flows, mirror_test_flows, known_test_flows)``, in that
        order. Each element is whatever object was serialized to the corresponding
        ``*-TEST-Flows-Dict`` file (presumably a dict keyed by flow -- confirm
        against the notebook that wrote them).

    Note:
        joblib's ``load`` unpickles the file contents, so only point this at
        trusted files.
    """
    base_dir = f"../Feature-Vectors/train-test-flows/Basic-Features/{N}-p-subflows"
    # Files are read in the same order they are returned: Unknown, Mirror, Known.
    unknown_test_flows, mirror_test_flows, known_test_flows = (
        load(f"{base_dir}/{flow_kind}-TEST-Flows-Dict")
        for flow_kind in ("Unknown", "Mirror", "Known")
    )
    return unknown_test_flows, mirror_test_flows, known_test_flows


# Sequential flow classification: accumulate the joint likelihoods one subflow at a
# time and decide as soon as one label's confidence reaches the threshold.
def classify_accum(flow_dict, known_bin_likelihoods, unknown_bin_likelihoods, gbdt, con_thresh,
                   label, maj=False):
    """Classify each flow incrementally, stopping at the first confident decision.

    For every flow, ``gbdt.predict`` is run on its subflows. Walking the predictions
    in order, the likelihood pair selected by each prediction is added to a running
    "known" sum and a running "unknown" sum. The flow is decided as known (1) or
    unknown (0) as soon as the difference between the sums exceeds
    ``log(con_thresh / (1 - con_thresh))``.

    Args:
        flow_dict: mapping of flow key -> sequence of subflow feature vectors.
        known_bin_likelihoods: indexable pair used when a subflow is predicted 1;
            element [1] is added to the known sum, element [0] to the unknown sum.
            Values are summed directly, so they are presumably log-likelihoods --
            TODO confirm against the code that writes ``*_Known_Bin_Ls.npy``.
        unknown_bin_likelihoods: same layout, used when a subflow is predicted 0.
        gbdt: fitted model exposing ``predict(subflows)`` that returns one 0/1
            prediction per subflow.
        con_thresh: required confidence, strictly between 0 and 1.
        label: true label shared by every flow in ``flow_dict`` (1 known, 0 unknown).
        maj: if True, a flow that never reaches the threshold is additionally
            classified by whichever sum is larger (ties go to 0). It is still
            counted in ``uncertain``.

    Returns:
        Tuple ``(correct, wrong, uncertain, totals, decision_percents)``:
            correct / wrong: number of flows classified correctly / incorrectly.
            uncertain: number of flows that ran out of subflows before a decision
                (flows with no subflows at all are counted here too).
            totals: total subflow count of each flow decided by threshold.
            decision_percents: for those same flows, the fraction (0-1) of subflows
                consumed before the decision.

    Raises:
        ValueError: if ``con_thresh`` is not in (0, 1), or a prediction is neither
            0 nor 1.
    """
    if not 0 < con_thresh < 1:
        raise ValueError(f"con_thresh must be strictly between 0 and 1, got {con_thresh!r}")
    # Find the (log) likelihood ratio that the confidence requires
    needed_ratio = math.log(con_thresh / (1 - con_thresh))

    totals = []
    decision_percents = []
    correct = 0
    wrong = 0
    uncertain = 0

    for flow in flow_dict:
        subflows = flow_dict[flow]
        n_subflows = len(subflows)
        # Nothing to predict on for an empty flow; it falls through to "uncertain"
        # below instead of silently vanishing from the counts.
        predictions = gbdt.predict(subflows) if n_subflows else []
        # The decision fractions below assume one prediction per subflow.
        assert len(predictions) == n_subflows

        known_ll_sum = 0
        unknown_ll_sum = 0
        known_confidence = 0
        unknown_confidence = 0
        # Sum one subflow's likelihoods at a time
        for subflows_seen, p in enumerate(predictions, start=1):
            if p == 1:
                bin_likelihoods = known_bin_likelihoods
            elif p == 0:
                bin_likelihoods = unknown_bin_likelihoods
            else:
                # Previously this silently reused the prior subflow's likelihoods.
                raise ValueError(
                    f"unexpected prediction {p!r} for flow {flow!r}; expected 0 or 1")
            known_ll_sum += bin_likelihoods[1]
            unknown_ll_sum += bin_likelihoods[0]
            # Known confidence is sum of known lls - sum of unknown lls (vice versa for unknown)
            known_confidence = known_ll_sum - unknown_ll_sum
            unknown_confidence = unknown_ll_sum - known_ll_sum

            # If either confidence is past the needed threshold, classify the flow
            if known_confidence > needed_ratio:
                decided = 1
            elif unknown_confidence > needed_ratio:
                decided = 0
            else:
                continue
            if decided == label:
                correct += 1
            else:
                wrong += 1
            decision_percents.append(subflows_seen / n_subflows)
            totals.append(n_subflows)
            break
        else:
            # Ran out of subflows without reaching the threshold
            uncertain += 1
            if maj:
                # Classify the flow as whichever label likelihood is greater
                classification = 1 if known_confidence > unknown_confidence else 0
                if classification == label:
                    correct += 1
                else:
                    wrong += 1
    return correct, wrong, uncertain, totals, decision_percents

In [8]:
############################## 25 PACKET SUBFLOWS - NO CLASSIFICATION SUBFLOW MIN
# Incremental evaluation with no priors: each full test flow is classified subflow by
# subflow, reporting correct/wrong/uncertain counts, accuracy, and the mean fraction
# of subflows seen before a decision.
N = 25
unknown_test_flows, mirror_test_flows, known_test_flows = load_naive_test_dicts(N)
con_thresh = 0.95

# One pass per model variant: (banner, model-name tag, flows evaluated with label 0)
for banner, model_tag, label0_flows in (
    ("MIRROR UNKNOWN ############################################", "M", mirror_test_flows),
    ("\n\nALL UNKNOWN #############################################", "U", unknown_test_flows),
):
    print(banner)
    name = f'{N}_{model_tag}_n'
    # Load models (the mirror_* names are reused for both variants, as before)
    mirror_known_bins = np.load(f"Models/{name}_Known_Bin_Ls.npy")
    mirror_unknown_bins = np.load(f"Models/{name}_Unknown_Bin_Ls.npy")
    gbdt = load(f'Models/{name}_GBDT')

    # Known test flows (label 1): strict run first, then majority fallback
    for mode_name, use_majority in (("STRICT", False), ("MAJORITY", True)):
        k_correct, k_wrong, k_uncertain, k_totals, k_decision_percents = classify_accum(
            known_test_flows, mirror_known_bins, mirror_unknown_bins, gbdt, con_thresh, 1,
            maj=use_majority)
        print(f"KNOWN TEST FLOWS - {mode_name}:")
        print(f"Correct: {k_correct} \t Wrong: {k_wrong} \t Uncertain: {k_uncertain}")
        print(f"Accuracy: {k_correct / len(known_test_flows)}")
        print(f"Mean % of Subflows taken for decision: {np.mean(k_decision_percents)}")
    # Blank line between the known and unknown reports
    print()

    # Unknown test flows (label 0); "UKNOWN" (sic) is kept so output matches earlier runs
    for mode_name, use_majority in (("STRICT", False), ("MAJORITY", True)):
        u_correct, u_wrong, u_uncertain, u_totals, u_decision_percents = classify_accum(
            label0_flows, mirror_known_bins, mirror_unknown_bins, gbdt, con_thresh, 0,
            maj=use_majority)
        print(f"UKNOWN TEST FLOWS - {mode_name}:")
        print(f"Correct: {u_correct} \t Wrong: {u_wrong} \t Uncertain: {u_uncertain}")
        print(f"Accuracy: {u_correct / len(label0_flows)}")
        print(f"Mean % of Subflows taken for decision: {np.mean(u_decision_percents)}")

MIRROR UNKNOWN ############################################
KNOWN TEST FLOWS - STRICT:
Correct: 41 	 Wrong: 0 	 Uncertain: 0
Accuracy: 1.0
Mean % of Subflows taken for decision: 0.010146371673750086
KNOWN TEST FLOWS - MAJORITY:
Correct: 41 	 Wrong: 0 	 Uncertain: 0
Accuracy: 1.0
Mean % of Subflows taken for decision: 0.010146371673750086

UKNOWN TEST FLOWS - STRICT:
Correct: 341 	 Wrong: 0 	 Uncertain: 7
Accuracy: 0.9798850574712644
Mean % of Subflows taken for decision: 0.056018197149412086
UKNOWN TEST FLOWS - MAJORITY:
Correct: 348 	 Wrong: 0 	 Uncertain: 7
Accuracy: 1.0
Mean % of Subflows taken for decision: 0.056018197149412086


ALL UNKNOWN #############################################
KNOWN TEST FLOWS - STRICT:
Correct: 39 	 Wrong: 2 	 Uncertain: 0
Accuracy: 0.9512195121951219
Mean % of Subflows taken for decision: 0.010146371673750086
KNOWN TEST FLOWS - MAJORITY:
Correct: 39 	 Wrong: 2 	 Uncertain: 0
Accuracy: 0.9512195121951219
Mean % of Subflows taken for decision: 0.010146371

In [5]:
############################## 100 PACKET SUBFLOWS 
# Incremental evaluation with no priors: each full test flow is classified subflow by
# subflow, reporting correct/wrong/uncertain counts, accuracy, and the mean fraction
# of subflows seen before a decision.
N = 100
unknown_test_flows, mirror_test_flows, known_test_flows = load_naive_test_dicts(N)
con_thresh = 0.95

# One pass per model variant: (banner, model-name tag, flows evaluated with label 0)
for banner, model_tag, label0_flows in (
    ("MIRROR UNKNOWN ############################################", "M", mirror_test_flows),
    ("\n\nALL UNKNOWN #############################################", "U", unknown_test_flows),
):
    print(banner)
    name = f'{N}_{model_tag}_n'
    # Load models (the mirror_* names are reused for both variants, as before)
    mirror_known_bins = np.load(f"Models/{name}_Known_Bin_Ls.npy")
    mirror_unknown_bins = np.load(f"Models/{name}_Unknown_Bin_Ls.npy")
    gbdt = load(f'Models/{name}_GBDT')

    # Known test flows (label 1): strict run first, then majority fallback
    for mode_name, use_majority in (("STRICT", False), ("MAJORITY", True)):
        k_correct, k_wrong, k_uncertain, k_totals, k_decision_percents = classify_accum(
            known_test_flows, mirror_known_bins, mirror_unknown_bins, gbdt, con_thresh, 1,
            maj=use_majority)
        print(f"KNOWN TEST FLOWS - {mode_name}:")
        print(f"Correct: {k_correct} \t Wrong: {k_wrong} \t Uncertain: {k_uncertain}")
        print(f"Accuracy: {k_correct / len(known_test_flows)}")
        print(f"Mean % of Subflows taken for decision: {np.mean(k_decision_percents)}")
    # Blank line between the known and unknown reports
    print()

    # Unknown test flows (label 0); "UKNOWN" (sic) is kept so output matches earlier runs
    for mode_name, use_majority in (("STRICT", False), ("MAJORITY", True)):
        u_correct, u_wrong, u_uncertain, u_totals, u_decision_percents = classify_accum(
            label0_flows, mirror_known_bins, mirror_unknown_bins, gbdt, con_thresh, 0,
            maj=use_majority)
        print(f"UKNOWN TEST FLOWS - {mode_name}:")
        print(f"Correct: {u_correct} \t Wrong: {u_wrong} \t Uncertain: {u_uncertain}")
        print(f"Accuracy: {u_correct / len(label0_flows)}")
        print(f"Mean % of Subflows taken for decision: {np.mean(u_decision_percents)}")

MIRROR UNKNOWN ############################################
KNOWN TEST FLOWS - STRICT:
Correct: 38 	 Wrong: 0 	 Uncertain: 1
Accuracy: 0.9743589743589743
Mean % of Subflows taken for decision: 0.05417651190757797
KNOWN TEST FLOWS - MAJORITY:
Correct: 39 	 Wrong: 0 	 Uncertain: 1
Accuracy: 1.0
Mean % of Subflows taken for decision: 0.05417651190757797

UKNOWN TEST FLOWS - STRICT:
Correct: 375 	 Wrong: 0 	 Uncertain: 4
Accuracy: 0.9894459102902374
Mean % of Subflows taken for decision: 0.2154937042597428
UKNOWN TEST FLOWS - MAJORITY:
Correct: 379 	 Wrong: 0 	 Uncertain: 4
Accuracy: 1.0
Mean % of Subflows taken for decision: 0.2154937042597428


ALL UNKNOWN #############################################
KNOWN TEST FLOWS - STRICT:
Correct: 38 	 Wrong: 0 	 Uncertain: 1
Accuracy: 0.9743589743589743
Mean % of Subflows taken for decision: 0.05417651190757797
KNOWN TEST FLOWS - MAJORITY:
Correct: 39 	 Wrong: 0 	 Uncertain: 1
Accuracy: 1.0
Mean % of Subflows taken for decision: 0.0541765119075779

In [9]:
############################## 1000 PACKET SUBFLOWS 
# Incremental evaluation with no priors: each full test flow is classified subflow by
# subflow, reporting correct/wrong/uncertain counts, accuracy, and the mean fraction
# of subflows seen before a decision.
N = 1000
unknown_test_flows, mirror_test_flows, known_test_flows = load_naive_test_dicts(N)
con_thresh = 0.95

# One pass per model variant: (banner, model-name tag, flows evaluated with label 0)
for banner, model_tag, label0_flows in (
    ("MIRROR UNKNOWN ############################################", "M", mirror_test_flows),
    ("\n\nALL UNKNOWN #############################################", "U", unknown_test_flows),
):
    print(banner)
    name = f'{N}_{model_tag}_n'
    # Load models (the mirror_* names are reused for both variants, as before)
    mirror_known_bins = np.load(f"Models/{name}_Known_Bin_Ls.npy")
    mirror_unknown_bins = np.load(f"Models/{name}_Unknown_Bin_Ls.npy")
    gbdt = load(f'Models/{name}_GBDT')

    # Known test flows (label 1): strict run first, then majority fallback
    for mode_name, use_majority in (("STRICT", False), ("MAJORITY", True)):
        k_correct, k_wrong, k_uncertain, k_totals, k_decision_percents = classify_accum(
            known_test_flows, mirror_known_bins, mirror_unknown_bins, gbdt, con_thresh, 1,
            maj=use_majority)
        print(f"KNOWN TEST FLOWS - {mode_name}:")
        print(f"Correct: {k_correct} \t Wrong: {k_wrong} \t Uncertain: {k_uncertain}")
        print(f"Accuracy: {k_correct / len(known_test_flows)}")
        print(f"Mean % of Subflows taken for decision: {np.mean(k_decision_percents)}")
    # Blank line between the known and unknown reports
    print()

    # Unknown test flows (label 0); "UKNOWN" (sic) is kept so output matches earlier runs
    for mode_name, use_majority in (("STRICT", False), ("MAJORITY", True)):
        u_correct, u_wrong, u_uncertain, u_totals, u_decision_percents = classify_accum(
            label0_flows, mirror_known_bins, mirror_unknown_bins, gbdt, con_thresh, 0,
            maj=use_majority)
        print(f"UKNOWN TEST FLOWS - {mode_name}:")
        print(f"Correct: {u_correct} \t Wrong: {u_wrong} \t Uncertain: {u_uncertain}")
        print(f"Accuracy: {u_correct / len(label0_flows)}")
        print(f"Mean % of Subflows taken for decision: {np.mean(u_decision_percents)}")

MIRROR UNKNOWN ############################################
KNOWN TEST FLOWS - STRICT:
Correct: 21 	 Wrong: 0 	 Uncertain: 1
Accuracy: 0.9545454545454546
Mean % of Subflows taken for decision: 0.01737202199445888
KNOWN TEST FLOWS - MAJORITY:
Correct: 22 	 Wrong: 0 	 Uncertain: 1
Accuracy: 1.0
Mean % of Subflows taken for decision: 0.01737202199445888

UKNOWN TEST FLOWS - STRICT:
Correct: 179 	 Wrong: 0 	 Uncertain: 238
Accuracy: 0.4292565947242206
Mean % of Subflows taken for decision: 0.6473582645594426
UKNOWN TEST FLOWS - MAJORITY:
Correct: 417 	 Wrong: 0 	 Uncertain: 238
Accuracy: 1.0
Mean % of Subflows taken for decision: 0.6473582645594426


ALL UNKNOWN #############################################
KNOWN TEST FLOWS - STRICT:
Correct: 21 	 Wrong: 0 	 Uncertain: 1
Accuracy: 0.9545454545454546
Mean % of Subflows taken for decision: 0.01737202199445888
KNOWN TEST FLOWS - MAJORITY:
Correct: 22 	 Wrong: 0 	 Uncertain: 1
Accuracy: 1.0
Mean % of Subflows taken for decision: 0.017372021994