In [21]:
import stumpy
import numpy as np
import random
import pickle

from tqdm.auto import tqdm
from multiprocessing import Pool
from itertools import product

import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [22]:
def distance_to_shapelet(data, shapelets):    
    # processed output data
    data_out = np.zeros((len(data),len(shapelets)))
    
    # loop over each sample in the dataset
    for i,sample in enumerate(tqdm(data)):
        shapelet_score = [None] * len(shapelets)
        # for each shapelet, calculate distance and assign a score
        for j,shapelet in enumerate(shapelets):
            try:
                dist = stumpy.mass(shapelet, sample)
            except ValueError:
                dist = stumpy.mass(sample, shapelet)
            shapelet_score[j] = dist
    
        data_out[i] = shapelet_score
    
    return data_out

def process_traces(shapelets, namestring):
    X, y = [], []
    
    with open("../results/data/distances/" + namestring + "min=1", 'rb') as f:
        min_dist_traceids = pickle.load(f) 
    with open("../results/data/distances/" + namestring + "min=0", 'rb') as f:
        other_traceids = pickle.load(f)
    
    for i in tqdm(range(25000)):
        main_id = random.choice(min_dist_traceids)
        other_id_1 = random.choice(other_traceids)
        other_id_2 = random.choice(other_traceids)
        
        combo_trace = np.concatenate((
            random.choice(traces[other_id_1]),
            random.choice(traces[main_id]),
            random.choice(traces[other_id_2]),
        ))

        X.append(combo_trace)
        y.append(main_id)
    
    
#     iterate over dictionary and re-format into X and y
#     for trace_id, trace_vals in traces.items():
#         for trace in trace_vals:
#             X.append(trace)
#             y.append(trace_id)
    
    print("Processing " + namestring + "... " + "(" + str(len(X)) + " traces)")
    
    # convert traces into float64 data type
    X = [np.asarray(trace).astype('float64') for trace in X]
    # clear empty trace values in data
    X = [trace[~np.isnan(trace)] for trace in X]    
    # compute distance between input trace and shapelet arrays
    # return as new X
    X = distance_to_shapelet(X, shapelets)
    
    return X, y

In [14]:
# note: python multiprocessing is really annoying to work with
# function needs to be in a separate .py file which is imported
# and function can only have 1 argument
# list input which is immediately used for what would be the arguments
def evaluate_parameters(namestring):
    
    print(namestring)
    
    files = {
        'shapelets': folder_shapelets + namestring,
        'X': folder_X + namestring,
        'y': folder_y + namestring
    }
    try:
        with open(files['shapelets'], 'rb') as f:
            shapelets = pickle.load(f)
    except FileNotFoundError:
        print("Shapelet File Missing:" + files['shapelets'] + ", skipping...")
        return
    
    shapelets = [shapelet.astype('float64') for shapelet in shapelets]
    
    X, y = process_traces(shapelets, namestring)
    
    with open(files['X'], 'wb') as f:
        pickle.dump(X, f)
        
    with open(files['y'], 'wb') as f:
        pickle.dump(y, f)

In [15]:
def classifier_performance(X, y, perclass, cv=False):
    
    clf = RandomForestClassifier()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    if cv:
        return np.mean(cross_val_score(clf, X, y, cv=5))
    if perclass:
        matrix = metrics.confusion_matrix(y_test, y_pred)
        scores = matrix.diagonal()/matrix.sum(axis=1)
    else:
        scores = metrics.accuracy_score(y_test, y_pred)
    
    return scores

In [16]:
def make_namestring_list(namestring_dict):
    
    name_components = []
    for cat in namestring_dict:
        values = namestring_dict[cat]
        name_components.append([str(cat) + "=" + str(value) for value in values])
    
    namestring_list = [''.join(item) for item in product(*name_components)]
    
    return namestring_list

In [17]:
def merge_x(namestring_list, verbose=True):
    
    X = ()

    for filename in namestring_list:
        if verbose: print(filename)
        with open(folder_X + filename, 'rb') as f:
            Xi = pickle.load(f)
        X = X + (Xi,) 

    X = np.concatenate(X, axis=1)

    with open(folder_y + namestring_list[0], 'rb') as f:
        y = pickle.load(f)
    y = np.array(y) 
    
    if verbose: print(X.shape);print(y.shape);
    
    return X, y

In [18]:
def batch_classifier(namestring_list, perclass=False, repeat=1):
        
    for name in namestring_list:
        with open(folder_X + name, 'rb') as f:
            X = pickle.load(f)
        with open(folder_y + name, 'rb') as f:
            y = pickle.load(f)
            
        all_scores = []
        for i in range(repeat):
            clf = RandomForestClassifier()
            scores = classifier_performance(X, y, perclass, cv=True)
            all_scores.append(scores)
        
        print(name + ": " + str(all_scores))
        with open(folder_scores + name, 'wb') as f:
            pickle.dump(all_scores, f)

def batch_merged_classifier(listof_namestring_list, perclass=False, repeat=1):
    for namestring_list in listof_namestring_list:
        X, y = merge_x(namestring_list, False)
        
        all_scores = []
        for i in range(repeat):
            clf = RandomForestClassifier()
            scores = classifier_performance(X, y, perclass, cv=True)
            all_scores.append(scores)
        
        outfile = '-'.join(namestring_list) + "_merged"
        print(outfile + ": " + str(all_scores))
        with open(folder_scores + outfile, 'wb') as f:
            pickle.dump(all_scores, f)

In [19]:
# SETUP GLOBAL VARIABLES
# techincally, in jupyter notebook these dont need to be global and everything will run fine
# for the the sake of clarity, I have made them global

global traces
with open("../datasets/ds19.npy", 'rb') as f:
    traces = pickle.load(f)

global folder_scores
folder_scores = "../results/scores/"
global folder_shapelets
folder_shapelets = "../results/shapelets/"
global folder_X
folder_X = "../results/data/X/"
global folder_y
folder_y = "../results/data/y/"


In [None]:
## PART 2

namestring_list = make_namestring_list({'num':[*range(6)]})
print(namestring_list)

if __name__ == '__main__':

    from utils import evaluate_parameters
    
    with Pool(6) as p:
        p.map(evaluate_parameters, namestring_list)

In [23]:
evaluate_parameters("num=1")

num=1


  0%|          | 0/25000 [00:00<?, ?it/s]

Processing num=1... (25000 traces)


  0%|          | 0/25000 [00:00<?, ?it/s]

In [30]:
batch_classifier(["num=1"], repeat=5)

num=1: [0.55676, 0.5526000000000001, 0.55296, 0.55616, 0.5531200000000001]


In [26]:
batch_classifier(make_namestring_list({'num':[15],'size':[0]}))

num=15size=0: [0.857]


In [24]:
for i in range(2,6):
    listof_namestring_list = [make_namestring_list({'num':random.sample(range(36), i), 'size':[0]}) for j in range(1)]
    batch_merged_classifier(listof_namestring_list, repeat=1)

num=26size=0-num=33size=0_merged: [0.8752000000000001]
num=15size=0-num=13size=0-num=28size=0_merged: [0.8764]
num=26size=0-num=33size=0-num=30size=0-num=15size=0_merged: [0.8811]
num=14size=0-num=22size=0-num=6size=0-num=19size=0-num=33size=0_merged: [0.8787]


In [25]:
listof_namestring_list = [
    make_namestring_list({'n_clusters':'2','num':[*range(2)]}),
    make_namestring_list({'n_clusters':'3','num':[*range(3)]}),
    make_namestring_list({'n_clusters':'4','num':[*range(4)]}),
]

print(listof_namestring_list)

batch_merged_classifier(listof_namestring_list, repeat=1)

[['n_clusters=2num=0', 'n_clusters=2num=1'], ['n_clusters=3num=0', 'n_clusters=3num=1', 'n_clusters=3num=2'], ['n_clusters=4num=0', 'n_clusters=4num=1', 'n_clusters=4num=2', 'n_clusters=4num=3']]
n_clusters=2num=0-n_clusters=2num=1_merged: [0.8872]
n_clusters=3num=0-n_clusters=3num=1-n_clusters=3num=2_merged: [0.8913]
n_clusters=4num=0-n_clusters=4num=1-n_clusters=4num=2-n_clusters=4num=3_merged: [0.8943999999999999]


In [25]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK

In [26]:
# Create training, testing, and validation sets
#X,y = merge_x(make_namestring_list({'n_clusters':'4','num':[*range(4)]}), False)
X, y = merge_x(make_namestring_list({'norm':'1'}), False)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1000)

dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dval_reg = xgb.DMatrix(X_val, y_val, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [27]:
def score(params):
    print("Training with: ", end="")
    print(params)
    
    num_boost_round = params['num_boost_round']
    del params['num_boost_round']
    
    model = xgb.train(
        params=params,
        dtrain=dtrain_reg,
        num_boost_round=num_boost_round,
    )
    
    y_pred = model.predict(dval_reg)
    score = metrics.accuracy_score(y_val, y_pred)
    return {'loss': 1 - score, 'status': STATUS_OK,}
    

In [28]:
# XGBoost Hyper-parameter optimization
search_space = {
    "objective": "multi:softmax",
    "num_class": 100,
    "booster": "gbtree",
    "sampling_method": 'uniform',
    "num_boost_round": hp.randint("num_boost_round", 100) + 100,
    "eta": hp.uniform("eta", 0, 1),
    "gamma": hp.lognormal("gamma", 0, 1),
    "max_depth": hp.randint("max_depth", 9) + 1,
    "min_child_weight": hp.lognormal("min_child_weight", 0, 1),
    "max_delta_step": hp.lognormal("max_delta_step", 0, 1),
    "subsample": hp.uniform("subsample", 0.5, 1),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
    "colsample_bylevel": hp.uniform("colsample_bylevel", 0.5, 1),
    "colsample_bynode": hp.uniform("colsample_bynode", 0.5, 1),
    "lambda": hp.lognormal("lambda", 0, 1),
    "alpha": hp.lognormal("alpha", 0, 1),
    "tree_method": hp.choice("tree_method", ['auto', 'exact','approx','hist']),
    "grow_policy": hp.choice("grow_policy", ['depthwise', 'lossguide']),
    "eval_metric": hp.choice("eval_metric", ['rmse','rmsle','mae','mape','mphe','mlogloss','merror','map']) 
}

In [29]:
best = fmin(score, search_space, algo=tpe.suggest, max_evals=150)

Training with:                                                                  
{'alpha': 4.150142439674216, 'booster': 'gbtree', 'colsample_bylevel': 0.8312229885124459, 'colsample_bynode': 0.9095906074940869, 'colsample_bytree': 0.6070647781717211, 'eta': 0.20345209929417551, 'eval_metric': 'mae', 'gamma': 1.2684880352423338, 'grow_policy': 'lossguide', 'lambda': 1.3954498069394867, 'max_delta_step': 0.718899201166303, 'max_depth': 5, 'min_child_weight': 0.6260894282099256, 'num_boost_round': 107, 'num_class': 100, 'objective': 'multi:softmax', 'sampling_method': 'uniform', 'subsample': 0.6481660334006136, 'tree_method': 'exact'}
Training with:                                                                  
{'alpha': 0.2515232267078622, 'booster': 'gbtree', 'colsample_bylevel': 0.6243248890150737, 'colsample_bynode': 0.5408770771041027, 'colsample_bytree': 0.8058410268360856, 'eta': 0.44195745863229896, 'eval_metric': 'mlogloss', 'gamma': 2.9318575170399246, 'grow_policy': 'depthw

Training with:                                                                  
{'alpha': 0.4465885088386293, 'booster': 'gbtree', 'colsample_bylevel': 0.8202055578150104, 'colsample_bynode': 0.763343164278167, 'colsample_bytree': 0.8677233230830127, 'eta': 0.3315741136052358, 'eval_metric': 'mae', 'gamma': 0.29774853586268474, 'grow_policy': 'depthwise', 'lambda': 1.0383597552157648, 'max_delta_step': 1.4390762025905741, 'max_depth': 8, 'min_child_weight': 0.6171764092346121, 'num_boost_round': 178, 'num_class': 100, 'objective': 'multi:softmax', 'sampling_method': 'uniform', 'subsample': 0.8910780333281985, 'tree_method': 'exact'}
Training with:                                                                  
{'alpha': 0.540921000207198, 'booster': 'gbtree', 'colsample_bylevel': 0.908476138576195, 'colsample_bynode': 0.577124889520501, 'colsample_bytree': 0.6386748099256426, 'eta': 0.6014168657205758, 'eval_metric': 'map', 'gamma': 0.1528635017729276, 'grow_policy': 'depthwise', 'l

Training with:                                                                  
{'alpha': 1.0528176919864658, 'booster': 'gbtree', 'colsample_bylevel': 0.8751042564866123, 'colsample_bynode': 0.5856282083138928, 'colsample_bytree': 0.5691633780694163, 'eta': 0.8955488177733926, 'eval_metric': 'map', 'gamma': 0.15355141406485276, 'grow_policy': 'lossguide', 'lambda': 0.6435741894544078, 'max_delta_step': 0.9079499406992828, 'max_depth': 8, 'min_child_weight': 2.9442335699745943, 'num_boost_round': 199, 'num_class': 100, 'objective': 'multi:softmax', 'sampling_method': 'uniform', 'subsample': 0.7797715035618581, 'tree_method': 'auto'}
Training with:                                                                  
{'alpha': 1.2241804062294301, 'booster': 'gbtree', 'colsample_bylevel': 0.7520601014599135, 'colsample_bynode': 0.5412014132167358, 'colsample_bytree': 0.5039448546174674, 'eta': 0.7132547979336399, 'eval_metric': 'merror', 'gamma': 0.329463144379402, 'grow_policy': 'depthwise

Training with:                                                                  
{'alpha': 0.03971983553678097, 'booster': 'gbtree', 'colsample_bylevel': 0.8805670008158659, 'colsample_bynode': 0.669745395997395, 'colsample_bytree': 0.7871055206045668, 'eta': 0.24392515432627349, 'eval_metric': 'rmsle', 'gamma': 0.39737304921065064, 'grow_policy': 'lossguide', 'lambda': 0.4326648034432753, 'max_delta_step': 0.188342864846744, 'max_depth': 2, 'min_child_weight': 6.12027535533452, 'num_boost_round': 138, 'num_class': 100, 'objective': 'multi:softmax', 'sampling_method': 'uniform', 'subsample': 0.8543809949032899, 'tree_method': 'exact'}
Training with:                                                                  
{'alpha': 0.09085172207047666, 'booster': 'gbtree', 'colsample_bylevel': 0.9298863085161849, 'colsample_bynode': 0.6008176179605502, 'colsample_bytree': 0.7262843598940933, 'eta': 0.00394826300386053, 'eval_metric': 'rmsle', 'gamma': 0.06677151098263698, 'grow_policy': 'lossg

Training with:                                                                  
{'alpha': 0.5436473940359631, 'booster': 'gbtree', 'colsample_bylevel': 0.7248277047804789, 'colsample_bynode': 0.6695211862172367, 'colsample_bytree': 0.6517867665953762, 'eta': 0.291234497076035, 'eval_metric': 'rmsle', 'gamma': 0.17772143084142214, 'grow_policy': 'depthwise', 'lambda': 1.6673194343044946, 'max_delta_step': 1.4547110369747602, 'max_depth': 1, 'min_child_weight': 0.3281344988882824, 'num_boost_round': 117, 'num_class': 100, 'objective': 'multi:softmax', 'sampling_method': 'uniform', 'subsample': 0.5444792924943656, 'tree_method': 'hist'}
Training with:                                                                  
{'alpha': 22.53403588858358, 'booster': 'gbtree', 'colsample_bylevel': 0.8450996334347591, 'colsample_bynode': 0.5184945097524408, 'colsample_bytree': 0.8895173318918276, 'eta': 0.09077061901499847, 'eval_metric': 'mphe', 'gamma': 2.1379660066640573, 'grow_policy': 'lossguide

Training with:                                                                  
{'alpha': 0.19308127764660152, 'booster': 'gbtree', 'colsample_bylevel': 0.932086317617308, 'colsample_bynode': 0.5027999057478101, 'colsample_bytree': 0.8020154378300657, 'eta': 0.3800840388398825, 'eval_metric': 'rmsle', 'gamma': 0.10581746902412685, 'grow_policy': 'lossguide', 'lambda': 0.44617607659214975, 'max_delta_step': 0.4350770146698771, 'max_depth': 5, 'min_child_weight': 4.778958811390015, 'num_boost_round': 194, 'num_class': 100, 'objective': 'multi:softmax', 'sampling_method': 'uniform', 'subsample': 0.866532741558256, 'tree_method': 'exact'}
Training with:                                                                  
{'alpha': 0.23239847716572565, 'booster': 'gbtree', 'colsample_bylevel': 0.8343462766434591, 'colsample_bynode': 0.5712284700668653, 'colsample_bytree': 0.8853658621368007, 'eta': 0.4805642132921139, 'eval_metric': 'rmsle', 'gamma': 0.09948982184407484, 'grow_policy': 'lossg

Training with:                                                                  
{'alpha': 0.17611979576476808, 'booster': 'gbtree', 'colsample_bylevel': 0.6263304207862151, 'colsample_bynode': 0.5353072830161054, 'colsample_bytree': 0.7233748845102258, 'eta': 0.7910006234581505, 'eval_metric': 'merror', 'gamma': 0.15971981498446902, 'grow_policy': 'depthwise', 'lambda': 0.1938088007652695, 'max_delta_step': 0.27294467531179945, 'max_depth': 2, 'min_child_weight': 4.727829133583686, 'num_boost_round': 116, 'num_class': 100, 'objective': 'multi:softmax', 'sampling_method': 'uniform', 'subsample': 0.7386792519821856, 'tree_method': 'auto'}
Training with:                                                                  
{'alpha': 0.11714062954102294, 'booster': 'gbtree', 'colsample_bylevel': 0.6861216973005501, 'colsample_bynode': 0.5163065275824009, 'colsample_bytree': 0.7557107891374156, 'eta': 0.5848627524754948, 'eval_metric': 'map', 'gamma': 0.1868892384861302, 'grow_policy': 'lossgu

Training with:                                                                  
{'alpha': 0.28247250216167036, 'booster': 'gbtree', 'colsample_bylevel': 0.777537029522329, 'colsample_bynode': 0.8442252207258251, 'colsample_bytree': 0.5917721669806537, 'eta': 0.31480532472658573, 'eval_metric': 'mae', 'gamma': 0.13597323590713822, 'grow_policy': 'lossguide', 'lambda': 0.9618235031193413, 'max_delta_step': 0.1335971263023791, 'max_depth': 4, 'min_child_weight': 4.932317158757192, 'num_boost_round': 196, 'num_class': 100, 'objective': 'multi:softmax', 'sampling_method': 'uniform', 'subsample': 0.9334893610154323, 'tree_method': 'auto'}
Training with:                                                                  
{'alpha': 0.11944901175321963, 'booster': 'gbtree', 'colsample_bylevel': 0.8723089551898617, 'colsample_bynode': 0.5127866100990287, 'colsample_bytree': 0.7938818922713586, 'eta': 0.282931561038272, 'eval_metric': 'mape', 'gamma': 0.20804296106292597, 'grow_policy': 'depthwise

Training with:                                                                  
{'alpha': 9.49970029272588, 'booster': 'gbtree', 'colsample_bylevel': 0.6244229711848296, 'colsample_bynode': 0.5100038208995687, 'colsample_bytree': 0.5478775656940476, 'eta': 0.9422014766288854, 'eval_metric': 'mape', 'gamma': 0.11679938414235087, 'grow_policy': 'lossguide', 'lambda': 0.45443559608581224, 'max_delta_step': 0.7234180422210956, 'max_depth': 9, 'min_child_weight': 2.9455235646380036, 'num_boost_round': 152, 'num_class': 100, 'objective': 'multi:softmax', 'sampling_method': 'uniform', 'subsample': 0.5901613144724736, 'tree_method': 'hist'}
Training with:                                                                  
{'alpha': 0.06234998662800566, 'booster': 'gbtree', 'colsample_bylevel': 0.7988263824349683, 'colsample_bynode': 0.5897601365420078, 'colsample_bytree': 0.6788615298877774, 'eta': 0.14307697207217204, 'eval_metric': 'mphe', 'gamma': 1.0385052637047978, 'grow_policy': 'lossguid

 77%|█████████████▉    | 116/150 [2:26:02<42:48, 75.54s/trial, best loss: 0.124]


KeyboardInterrupt: 

In [29]:
import tensorflow as tf
import tensorflow.keras as keras

from keras.models import Sequential
from keras.losses import SparseCategoricalCrossentropy
from keras.layers import Dense, Dropout

model = Sequential([
    Dense(1024),
    Dense(512),
    Dense(256),
    Dense(128),
    Dense(100)
])

model.compile(optimizer='adam',
              loss=SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

2024-01-24 14:06:55.169391: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-24 14:07:05.389910: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
model.fit(X_train, y_train, epochs=1000)

In [32]:
predictions = model.predict(X_test)
predictions = tf.nn.softmax(predictions)

score = 0
for i, pred in enumerate(predictions):
    final_pred = np.argmax(pred, 0)
    
    if final_pred == y_test[i]:
        score += 1

print(score/len(y_test))

0.583
