In [1]:
import stumpy
import numpy as np
import random
import math
import pickle
import sys
import os

from statistics import mean
from tqdm.auto import tqdm
from multiprocessing import Pool

import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
'''
Collects random samples from trace with id2 and computes the matrix profile of class1 compared with class 2

Input: 
    trace1: packet traces from class 1
    id2: id number for class 2 
    num_traces: number of traces to select from class 2 (should be equal to class 1)
    shapelet_size: length of shapelets
    
Output:
    Matrix profile of trace1 compared with trace2
'''
def compare_profile(trace1, trace2, shapelet_size):
    
    length_diff = len(trace2) - len(trace1)
    if(length_diff < 0):
        trace2 = np.append(trace2, [np.nan] * abs(length_diff))
        
    #print(len(trace1))
    #print(len(trace2))
        
    
    c1_c2 = stumpy.stump(trace1, shapelet_size, trace2, ignore_trivial=False)[:, 0].astype(float)
    c1_c2[c1_c2 == np.inf] = np.nan
    
    return c1_c2

'''
Compares a the matrix profile of a class trace with itself

Input: 
    trace: packet traces from class 1
    shapelet_size: length of shapelets
    
Output:
    Matrix profile of trace compared with trace
'''

def same_profile(trace, shapelet_size):
    
    c1_c1 = stumpy.stump(trace, shapelet_size)[:, 0].astype(float)
    c1_c1[c1_c1 == np.inf] = np.nan
    
    return c1_c1

'''
return indices of shapelet as one-hot encoded list
'''
def generate_shapelet(trace, diff, shapelet_size):
    
    idx = np.argmax(diff)
    shapelet = np.asarray([1 if idx <= i < idx + shapelet_size else 0 for i in range(len(trace))])
    
    return shapelet

'''
Compute shapelet of greatest overlaps
'''
def find_overlap(trace_i, shapelets_i, shapelet_size):
    #print(shapelets_i[0])
    
    merged_shapelets = np.sum(shapelets_i, axis=0)
    
    max_size = 0
    start = 0
    end = 0
    
    for i in range(0, len(merged_shapelets), shapelet_size):
        current_size = np.sum(merged_shapelets[i:i+shapelet_size])
        if current_size > max_size:
            max_size = current_size
            start = i
            end = i + shapelet_size
    
    return trace_i[start:end]

In [3]:
'''
Generates a set of 100 shapelets for each class in samples

Input:
    num_traces = Number of traces per class
    shapelet_size = Size of shapelets
    save: save results to file?
    filename: if save, name & location of output file

Output:
    list object containing shapelets for each class

'''
def generate_shapelets(shapelet_coeff):
    shapelet_storage = []
    
    # loop over all classes (generate shapelet for each class)
    for i in tqdm(range(100)):
        
        # get the chosen sample from trace i
        trace_i = chosen_traces[i].astype('float64')
        shapelet_size = math.floor(shapelet_coeff * len(trace_i))
        
        shapelets_i = np.zeros((100, len(trace_i)))
        #print(shapelets_i.shape)
        
        # generate profile of i compared with itself
        # length of sample is coeff* len*trace_i
        ci_ci = same_profile(trace_i, shapelet_size)
        
        # loop over every other class and generate a profile for each one
        for j in range(100):
            # don't compare i with itself 
            if i == j:
                continue
            
            trace_j = chosen_traces[j].astype('float64')
            
            # compute profile of i compared with j
            ci_cj = compare_profile(trace_i, trace_j, shapelet_size)

            # find largest value gap between other and i
            diff_ci = ci_cj - ci_ci
            
            # generate best shapelet for i compared to j and store it in list
            ci_shape = generate_shapelet(trace_i, diff_ci, shapelet_size)
            shapelets_i[j] = ci_shape
        
        # compare shapelets between all classes and return the one which has the most overlap
        # (i.e.) the shapelet that was chosen most between the 99 other classes
        best_shapelet = find_overlap(trace_i, shapelets_i, shapelet_size)
        # save to list
        shapelet_storage.append(best_shapelet)
    
    return shapelet_storage   

In [4]:
'''
Compute the minimum distance beteen data samples and shapelets
Input:
    data = list of individual packet traces
    shapelets = list of shapelets
Output:
    minimum distance between each sample in data compared with each sample in shapelet
    shape = (len(data),len(shapelets))
'''
def distance_to_shapelet(data, shapelets):
    #data = np.asarray(data)
    #print(len(data))
    
    # processed output data
    data_out = np.zeros((len(data),len(shapelets)))
    
    # loop over each sample in the dataset
    for i,sample in enumerate(tqdm(data)):
        shapelet_score = np.empty(len(shapelets))
        # for each shapelet, calculate distance and assign a score
        for j,shapelet in enumerate(shapelets):
            try:
                dist = stumpy.mass(shapelet, sample)
            except ValueError:
                dist = stumpy.mass(sample, shapelet)
            shapelet_score[j] = dist.min()
        data_out[i] = shapelet_score
    
    return data_out

'''
Computes distances between input samples and shapelets, returns X for classifier
Also cleans data and ensures no random errors due to length, NaN, etc...
Underlying function that performs comparison is distance_to_shapelet
Selects data samples (with replacement)
note: some samples will always be bad so actual length of X is less

Input:
    num_traces = numner of traces to process
    save = save output to file
    filenames = tuple that represents (name of X file, name of y file)

Output:
    X values for classifier of shape (None, 100)
    y values for classifier of shape (None, )
'''

def process_traces(num_traces, shapelets):
    X, y = [], []

    
#     for i in range(num_traces):
#         combo_trace = []
#         combo_trace.append(random.choice(traces[random.randint(50,99)]))
#         y_id = random.randint(0,49)
#         combo_trace.append(random.choice(traces[y_id]))
#         combo_trace.append(random.choice(traces[random.randint(50,99)]))
#         out = np.concatenate((combo_trace[0],combo_trace[1],combo_trace[2]))
        
#         X.append(out)
#         y.append(y_id)

    # iterate over dictionary and re-format into X and y
    for trace_id, trace_vals in traces.items():
        for trace in trace_vals:
            X.append(trace)
            y.append(trace_id)
    
#     for i in range(num_traces):
#         random_id = random.randrange(100)
#         random_trace = random.choice(traces[random_id])
#         X.append([random_trace])
#         y.append(random_id)
    
    print("Size of X: " + str(len(X)))
    
    
    # convert traces into float64 data type
    X = [np.asarray(trace).astype('float64') for trace in X]
    
    # clear empty trace values in data
    X = [trace[~np.isnan(trace)] for trace in X]    

    # compute distance between input trace and shapelet arrays
    # return as new X

    X = distance_to_shapelet(X, shapelets)
    
    return X, y

In [5]:
'''
Evaluate performance of sklearn classifier on data samples - 90/10 training testing split

Input:
    clf: sklearn classifier object
    X: x values
    y: y values
    topk: k values for evaluation metrics
Output:
    list of length topk with accuracy for testing data
'''

def classifier_performance(clf, X, y, topk=[1,3,5]):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_test)
    
    scores = []
    for k in topk:
        correct = 0
        for i in range(len(y_prob)):
            ind = np.argpartition(y_prob[i], -k)[-k:]
            if y_test[i] in ind:
                correct += 1
        scores.append(correct/len(y_prob))
    
    return scores

In [6]:
'''
Utility function for pipeline of evaluating different grid search parameters
Output: a new file located in ../results/param1-val1_param2-val2_param3-val3
        the file contains a pickled python object
        with the scores for top-1, top-3, and top-5 classifier accuracy
'''
# note: python multiprocessing is really annoying to work with
# function needs to be in a separate .py file which is imported
# and function can only have 1 argument
# list input which is immediately used for what would be the arguments
def evaluate_parameters(arr):
    
    num_experiment = arr[0]
    shapelet_coeff = arr[1]
    num_samples = 0
    
    filename = '../results/shapelets/' + 'num=' + str(num_experiment) + 'size=' + str(shapelet_coeff)
    #filename = '../results/data/trace_choice'
    with open(filename, 'rb') as f:
        shapelets = pickle.load(f)
    
    shapelets = [shapelet.astype('float64') for shapelet in shapelets]
    
    X, y = process_traces(num_samples, shapelets)
    
    filename = '../results/data/X/' + 'num=' + str(num_experiment) + 'size=' + str(shapelet_coeff)
    
    with open(filename, 'wb') as f:
        pickle.dump(X, f)
        
    filename = '../results/data/y/' + 'num=' + str(num_experiment) + 'size=' + str(shapelet_coeff)
    
    with open(filename, 'wb') as f:
        pickle.dump(y, f)

In [7]:
# SETUP

global traces

with open('../ds19.npy', 'rb') as f:
    traces = pickle.load(f)

In [11]:
#nums = ['4.1', '4.2', '4.3']
#size = ['0','1']
#parameter_list = [[x,y] for x in nums for y in size]

parameter_list = [['0','0']]

print(parameter_list)

[['0', '0']]


In [None]:
# PART 1

print(parameter_list)

for parameters in parameter_list:
    coeff = parameters[1]
    shapelets = generate_shapelets(coeff)
    
    filename = '../results/shapelets/' + 'num=' + str(parameters[0]) + 'size=' + str(parameters[1])
    
    with open(filename, 'wb') as f:
        pickle.dump(shapelets, f)   

In [9]:
evaluate_parameters(parameter_list)

Size of X: 10000


  0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
## PART 2

if __name__ == '__main__':

    from utils import evaluate_parameters
    print(parameter_list)
    
    with Pool(6) as p:
        p.map(evaluate_parameters, parameter_list)

In [19]:
# merge X values from different datasets

folder_X = "../results/data/X/"
names = ["num=3.0size=1","num=3.1size=1","num=3.2size=1"]

X = ()

for filename in names: #os.listdir(folder_X): (for all files)
    print(filename)
    with open(folder_X + filename, 'rb') as f:
        Xi = pickle.load(f)
    X = X + (Xi,) 
        
X = np.concatenate(X, axis=1)
print(X.shape)

with open("../results/data/y/num=2.0size=1", 'rb') as f:
    y = pickle.load(f)
y = np.array(y) 

print(y.shape)

num=3.0size=1
num=3.1size=1
num=3.2size=1
(450000, 300)
(450000,)


In [20]:
clf = RandomForestClassifier()
scores = classifier_performance(clf, X, y)

print(scores)

[0.8040444444444445, 0.9175777777777778, 0.9481333333333334]


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

print(len(set(y_train)))
print(len(set(y_test)))

100
100


In [15]:
import tensorflow as tf
import tensorflow.keras as keras

from keras.models import Sequential
from keras.losses import SparseCategoricalCrossentropy
from keras.layers import Dense, Dropout

model = Sequential([
    Dense(1024),
    Dense(512),
    Dense(256),
    Dense(128),
    Dense(100)
])

model.compile(optimizer='adam',
              loss=SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

2024-01-15 08:08:52.703154: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-15 08:09:00.880816: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
model.fit(X_train, y_train, epochs=1000)

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'int'>"})

In [38]:
predictions = model.predict(X_test)




In [41]:
print(predictions.shape)
predictions = tf.nn.softmax(predictions)
print(predictions.shape)

(45000, 100)
(45000, 100)


In [None]:
score = 0
for i, pred in enumerate(predictions):
    final_pred = np.argmax(pred, 0)
    
    print(final_pred)
    
    if final_pred == y_test[i]:
        score += 1

print(score/len(y_test))

In [12]:
## PART 3

print(parameter_list)

for parameters in parameter_list:
    
    filename = '../results/data/X/' + 'num=' + str(parameters[0]) + 'size=' + str(parameters[1])
    
    with open(filename, 'rb') as f:
        X = pickle.load(f)
    
    filename = '../results/data/y/' + 'num=' + str(parameters[0]) + 'size=' + str(parameters[1])
    
    with open(filename, 'rb') as f:
        y = pickle.load(f)
    
    clf = RandomForestClassifier()
    scores = classifier_performance(clf, X, y)
    
    print(scores)
    
    outfile_name = "../results/scores/" + 'num=' + str(parameters[0]) + 'size=' + str(parameters[1])
    
    with open(outfile_name, 'wb') as f:
        pickle.dump(scores, f)

[['0', '0']]
[0.871, 0.95, 0.967]


In [None]:
# PART 4

print(parameter_list)

for parameters in parameter_list:
    
    filename = '../results/data/X/' + 'num=' + str(parameters[0]) + 'size=' + str(parameters[1])
    
    with open(filename, 'rb') as f:
        X = pickle.load(f)
        
    filename = '../results/data/y/' + 'num=' + str(parameters[0]) + 'size=' + str(parameters[1])
    
    with open(filename, 'rb') as f:
        y = pickle.load(f)
    
    clf = RandomForestClassifier()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    matrix = confusion_matrix(y_test, y_pred)
    scores = matrix.diagonal()/matrix.sum(axis=1)
    
    outfile_name = "../results/scores_perclass/" + 'num=' + str(parameters[0]) + 'size=' + str(parameters[1])
    
    with open(outfile_name, 'wb') as f:
        pickle.dump(scores, f)