In [1]:
import stumpy
import numpy as np
import random
import pickle

from tqdm.auto import tqdm
from multiprocessing import Pool
from itertools import product

import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
def distance_to_shapelet(data, shapelets):    
    # processed output data
    data_out = np.zeros((len(data),len(shapelets)))
    
    # loop over each sample in the dataset
    for i,sample in enumerate(tqdm(data)):
        shapelet_score = np.empty(len(shapelets))
        # for each shapelet, calculate distance and assign a score
        for j,shapelet in enumerate(shapelets):
            try:
                dist = stumpy.mass(shapelet, sample)
            except ValueError:
                dist = stumpy.mass(sample, shapelet)
            shapelet_score[j] = dist.min()
        data_out[i] = shapelet_score
    
    return data_out

def process_traces(shapelets, name):
    X, y = [], []

    # iterate over dictionary and re-format into X and y
    for trace_id, trace_vals in traces.items():
        for trace in trace_vals:
            X.append(trace)
            y.append(trace_id)
    
    print("Processing" + name + "... " + "(" + str(len(X)) + " traces)")
    
    # convert traces into float64 data type
    X = [np.asarray(trace).astype('float64') for trace in X]
    # clear empty trace values in data
    X = [trace[~np.isnan(trace)] for trace in X]    
    # compute distance between input trace and shapelet arrays
    # return as new X
    X = distance_to_shapelet(X, shapelets)
    
    return X, y

In [3]:
# note: python multiprocessing is really annoying to work with
# function needs to be in a separate .py file which is imported
# and function can only have 1 argument
# list input which is immediately used for what would be the arguments
def evaluate_parameters(namestring):
    
    print(namestring)
    
    files = {
        'shapelets': folder_shapelets + namestring,
        'X': folder_X + namestring,
        'y': folder_y + namestring
    }
    try:
        with open(files['shapelets'], 'rb') as f:
            shapelets = pickle.load(f)
    except FileNotFoundError:
        print("Shapelet File Missing:" + files['shapelets'] + ", skipping...")
        return
    
    shapelets = [shapelet.astype('float64') for shapelet in shapelets]
    
    X, y = process_traces(shapelets, files['shapelets'])
    
    with open(files['X'], 'wb') as f:
        pickle.dump(X, f)
        
    with open(files['y'], 'wb') as f:
        pickle.dump(y, f)

In [4]:
def classifier_performance(X, y, perclass):
    
    clf = RandomForestClassifier()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    if perclass:
        matrix = metrics.confusion_matrix(y_test, y_pred)
        scores = matrix.diagonal()/matrix.sum(axis=1)
    else:
        scores = metrics.accuracy_score(y_test, y_pred)
    
    return scores

In [5]:
def make_namestring_list(namestring_dict):
    
    name_components = []
    for cat in namestring_dict:
        values = namestring_dict[cat]
        name_components.append([str(cat) + "=" + str(value) for value in values])
    
    namestring_list = [''.join(item) for item in product(*name_components)]
    
    return namestring_list

In [6]:
def merge_x(namestring_list, verbose=True):
    
    X = ()

    for filename in namestring_list:
        if verbose: print(filename)
        with open(folder_X + filename, 'rb') as f:
            Xi = pickle.load(f)
        X = X + (Xi,) 

    X = np.concatenate(X, axis=1)

    with open(folder_y + namestring_list[0], 'rb') as f:
        y = pickle.load(f)
    y = np.array(y) 
    
    if verbose: print(X.shape);print(y.shape);
    
    return X, y

In [7]:
def batch_classifier(namestring_list, perclass=False, repeat=1):
        
    for name in namestring_list:
        with open(folder_X + name, 'rb') as f:
            X = pickle.load(f)
        with open(folder_y + name, 'rb') as f:
            y = pickle.load(f)
            
        all_scores = []
        for i in range(repeat):
            clf = RandomForestClassifier()
            scores = classifier_performance(X, y, perclass)
            all_scores.append(scores[0])
        
        print(name + ": " + str(all_scores))
        with open(folder_scores + name, 'wb') as f:
            pickle.dump(all_scores, f)

def batch_merged_classifier(listof_namestring_list, perclass=False, repeat=1):
    for namestring_list in listof_namestring_list:
        X, y = merge_x(namestring_list, False)
        
        all_scores = []
        for i in range(repeat):
            clf = RandomForestClassifier()
            scores = classifier_performance(X, y, perclass)
            all_scores.append(scores[0])
        
        outfile = '-'.join(namestring_list) + "_merged"
        print(outfile + ": " + str(all_scores))
        with open(folder_scores + outfile, 'wb') as f:
            pickle.dump(all_scores, f)

In [8]:
# SETUP

global traces
with open("../ds19.npy", 'rb') as f:
    traces = pickle.load(f)

global folder_scores
folder_scores = "../results/scores/"
global folder_shapelets
folder_shapelets = "../results/shapelets/"
global folder_X
folder_X = "../results/data/X/"
global folder_y
folder_y = "../results/data/y/"

In [9]:
## PART 2

namestring_list = make_namestring_list({'n_clusters':[2,3,4], 'num':[*range(4)]})
print(namestring_list)

if __name__ == '__main__':

    from utils import evaluate_parameters
    
    with Pool(6) as p:
        p.map(evaluate_parameters, namestring_list)

['n_clusters=2num=0', 'n_clusters=2num=1', 'n_clusters=2num=2', 'n_clusters=2num=3', 'n_clusters=3num=0', 'n_clusters=3num=1', 'n_clusters=3num=2', 'n_clusters=3num=3', 'n_clusters=4num=0', 'n_clusters=4num=1', 'n_clusters=4num=2', 'n_clusters=4num=3']
n_clusters=2num=2
Shapelet File Missing:../results/shapelets/n_clusters=2num=2, skipping...
n_clusters=3num=2
Processing../results/shapelets/n_clusters=3num=2... (10000 traces)
n_clusters=2num=0
Processing../results/shapelets/n_clusters=2num=0... (10000 traces)
n_clusters=2num=1
Processing../results/shapelets/n_clusters=2num=1... (10000 traces)
n_clusters=2num=3
Shapelet File Missing:../results/shapelets/n_clusters=2num=3, skipping...
n_clusters=3num=3
Shapelet File Missing:../results/shapelets/n_clusters=3num=3, skipping...
n_clusters=4num=0
Processing../results/shapelets/n_clusters=4num=0... (10000 traces)
n_clusters=3num=1
Processing../results/shapelets/n_clusters=3num=1... (10000 traces)
n_clusters=3num=0
Processing../results/shapele

100%|██████████| 10000/10000 [48:45<00:00,  3.42it/s] 


n_clusters=4num=1
Processing../results/shapelets/n_clusters=4num=1... (10000 traces)


100%|██████████| 10000/10000 [48:45<00:00,  3.42it/s]
  0%|          | 4/10000 [00:00<27:06,  6.15it/s]/s]

n_clusters=4num=2
Processing../results/shapelets/n_clusters=4num=2... (10000 traces)


100%|██████████| 10000/10000 [48:48<00:00,  3.41it/s]
100%|█████████▉| 9988/10000 [48:48<00:04,  2.67it/s]

n_clusters=4num=3
Processing../results/shapelets/n_clusters=4num=3... (10000 traces)


100%|██████████| 10000/10000 [48:50<00:00,  3.41it/s]
100%|██████████| 10000/10000 [48:54<00:00,  3.41it/s]
100%|██████████| 10000/10000 [48:59<00:00,  3.40it/s]
100%|██████████| 10000/10000 [48:30<00:00,  3.44it/s] 
100%|██████████| 10000/10000 [48:32<00:00,  3.43it/s]
100%|██████████| 10000/10000 [48:34<00:00,  3.43it/s]


In [10]:
listof_namestring_list = [
    make_namestring_list({'n_clusters':'2','num':[*range(2)]}),
    make_namestring_list({'n_clusters':'3','num':[*range(3)]}),
    make_namestring_list({'n_clusters':'4','num':[*range(4)]}),
]

print(listof_namestring_list)

batch_merged_classifier(listof_namestring_list, repeat=5)

[['n_clusters=2num=0', 'n_clusters=2num=1'], ['n_clusters=3num=0', 'n_clusters=3num=1', 'n_clusters=3num=2'], ['n_clusters=4num=0', 'n_clusters=4num=1', 'n_clusters=4num=2', 'n_clusters=4num=3']]
n_clusters=2num=0-n_clusters=2num=1_merged: [0.871, 0.895, 0.883, 0.89, 0.905]
n_clusters=3num=0-n_clusters=3num=1-n_clusters=3num=2_merged: [0.903, 0.905, 0.897, 0.883, 0.892]
n_clusters=4num=0-n_clusters=4num=1-n_clusters=4num=2-n_clusters=4num=3_merged: [0.905, 0.9, 0.899, 0.889, 0.909]


In [56]:
for i in range(3,6):
    listof_namestring_list = [make_namestring_list({'num':random.sample(range(36), i), 'size':[0]}) for j in range(5)]
    batch_merged_classifier(listof_namestring_list, repeat=5)

num=32size=0-num=12size=0-num=0size=0_merged: [0.885, 0.884, 0.858, 0.883, 0.892]
num=22size=0-num=2size=0-num=16size=0_merged: [0.877, 0.874, 0.884, 0.886, 0.883]
num=1size=0-num=15size=0-num=4size=0_merged: [0.888, 0.884, 0.881, 0.884, 0.882]
num=32size=0-num=5size=0-num=16size=0_merged: [0.868, 0.888, 0.874, 0.866, 0.875]
num=19size=0-num=27size=0-num=24size=0_merged: [0.886, 0.885, 0.854, 0.877, 0.868]
num=4size=0-num=11size=0-num=2size=0-num=27size=0_merged: [0.88, 0.872, 0.873, 0.887, 0.888]
num=28size=0-num=21size=0-num=20size=0-num=1size=0_merged: [0.882, 0.878, 0.895, 0.892, 0.887]
num=23size=0-num=27size=0-num=16size=0-num=12size=0_merged: [0.876, 0.875, 0.881, 0.882, 0.875]
num=26size=0-num=4size=0-num=24size=0-num=8size=0_merged: [0.896, 0.887, 0.888, 0.893, 0.876]
num=29size=0-num=33size=0-num=4size=0-num=31size=0_merged: [0.882, 0.894, 0.887, 0.891, 0.889]
num=10size=0-num=26size=0-num=18size=0-num=7size=0-num=35size=0_merged: [0.906, 0.882, 0.871, 0.885, 0.875]
num=11siz

In [27]:
import xgboost as xgb

# Create regression matrices
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)


params = {
    "objective": "multi:softprob",
    "num_class": 100,
    "tree_method": "exact",
    "device": "cpu",
}

model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=150,
    verbose_eval = True
)

preds = model.predict(dtest_reg)

In [28]:
scores = []
topk=[1,3,5]

for k in topk:
    correct = 0
    for i in range(len(preds)):
        ind = np.argpartition(preds[i], -k)[-k:]
        if y_test[i] in ind:
            correct += 1
    scores.append(correct/len(preds))
    
print(scores)

[0.855, 0.948, 0.966]


In [29]:
import tensorflow as tf
import tensorflow.keras as keras

from keras.models import Sequential
from keras.losses import SparseCategoricalCrossentropy
from keras.layers import Dense, Dropout

model = Sequential([
    Dense(1024),
    Dense(512),
    Dense(256),
    Dense(128),
    Dense(100)
])

model.compile(optimizer='adam',
              loss=SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

2024-01-24 14:06:55.169391: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-24 14:07:05.389910: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
model.fit(X_train, y_train, epochs=1000)

In [32]:
predictions = model.predict(X_test)
predictions = tf.nn.softmax(predictions)

score = 0
for i, pred in enumerate(predictions):
    final_pred = np.argmax(pred, 0)
    
    if final_pred == y_test[i]:
        score += 1

print(score/len(y_test))

0.583
