In [6]:
import stumpy
import numpy as np
import random
import pickle
import os

from tqdm.auto import tqdm
from multiprocessing import Pool
from itertools import product

import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [7]:
def distance_to_shapelet(data, shapelets, dist_size=2000):
    """Compute a fixed-width distance profile of every sample against every shapelet.

    For each (sample, shapelet) pair the profile returned by ``stumpy.mass``
    is cut to at most ``dist_size`` values and, when shorter, right-padded
    with zeros so that every pair fits into one rectangular array.

    Parameters
    ----------
    data : sequence of 1-D arrays
        Samples to score; iterated with a tqdm progress bar.
    shapelets : sequence of 1-D arrays
        Shapelets each sample is compared against.
    dist_size : int, optional
        Number of distance values stored per pair. Defaults to 2000, the
        value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), len(shapelets), dist_size)``.
    """
    # Zero-initialised, so any profile shorter than dist_size is implicitly
    # zero-padded by only writing its leading part.
    # NOTE(review): 0 is presumably also the distance of a perfect match, so
    # padding cannot be told apart from a match downstream - confirm intent.
    data_out = np.zeros((len(data), len(shapelets), dist_size))

    # loop over each sample in the dataset
    for i, sample in enumerate(tqdm(data)):

        # for each shapelet, calculate its distance profile against the sample
        for j, shapelet in enumerate(shapelets):
            try:
                dist = stumpy.mass(shapelet, sample)
            except ValueError:
                # Retry with the roles swapped (presumably the shapelet was
                # longer than the sample - verify against stumpy's checks).
                dist = stumpy.mass(sample, shapelet)

            kept = min(len(dist), dist_size)
            data_out[i, j, :kept] = dist[:kept]

    return data_out

def process_traces(shapelets, namestring, n_samples=10000):
    """Build a randomly composed trace dataset and score it against shapelets.

    Two pickled id lists are read from ``../results/data/distances/``
    (``<namestring>min=1`` and ``<namestring>min=0``). Each generated sample
    is the concatenation of three traces picked at random from the
    module-level ``traces`` mapping: one for an id from the ``min=0`` list,
    one for an id from the ``min=1`` list, and another for an id from the
    ``min=0`` list. The label of the sample is the ``min=1`` id in the middle.

    Parameters
    ----------
    shapelets : sequence of 1-D float arrays
        Shapelets passed on to ``distance_to_shapelet``.
    namestring : str
        Prefix of the two id files; also printed in the progress message.
    n_samples : int, optional
        Number of samples to generate. Defaults to 10000, the value that
        used to be hard-coded.

    Returns
    -------
    X : numpy.ndarray
        Output of ``distance_to_shapelet`` for the generated samples.
    y : list
        The ``min=1`` id used as the middle segment of each sample.
    """
    distance_dir = "../results/data/distances/"

    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(distance_dir + namestring + "min=1", 'rb') as f:
        min_dist_traceids = pickle.load(f)
    with open(distance_dir + namestring + "min=0", 'rb') as f:
        other_traceids = pickle.load(f)

    X, y = [], []
    for _ in tqdm(range(n_samples)):
        main_id = random.choice(min_dist_traceids)
        other_id_1 = random.choice(other_traceids)
        other_id_2 = random.choice(other_traceids)

        # the labelled trace sits between two traces drawn from the other ids
        combo_trace = np.concatenate((
            random.choice(traces[other_id_1]),
            random.choice(traces[main_id]),
            random.choice(traces[other_id_2]),
        ))

        X.append(combo_trace)
        y.append(main_id)

    print("Processing " + namestring + "... " + "(" + str(len(X)) + " traces)")

    # convert traces into float64 data type
    X = [np.asarray(trace).astype('float64') for trace in X]
    # drop NaN entries from every trace
    X = [trace[~np.isnan(trace)] for trace in X]
    # replace each trace by its distance profiles against the shapelets
    X = distance_to_shapelet(X, shapelets)

    return X, y

In [8]:
# Multiprocessing caveats: a worker function has to live in a separate .py
# file that is imported, and it may only take a single argument. That single
# argument is therefore used right away to derive everything that would
# otherwise have been passed as separate arguments.
def evaluate_parameters(namestring):
    """Create and pickle the X / y data for one shapelet file.

    Loads the pickled shapelets stored under ``folder_shapelets + namestring``,
    casts them to float64, runs ``process_traces`` and pickles the resulting
    X and y under ``folder_X + namestring`` and ``folder_y + namestring``.
    When the shapelet file does not exist a message is printed and the
    function returns None without writing anything.
    """
    print(namestring)

    files = dict(
        shapelets=folder_shapelets + namestring,
        X=folder_X + namestring,
        y=folder_y + namestring,
    )

    shapelet_path = files['shapelets']
    try:
        with open(shapelet_path, 'rb') as handle:
            shapelets = pickle.load(handle)
    except FileNotFoundError:
        print("Shapelet File Missing:" + shapelet_path + ", skipping...")
        return

    shapelets = [entry.astype('float64') for entry in shapelets]

    X, y = process_traces(shapelets, namestring)

    # written in the same order as before: X first, then y
    for key, payload in (('X', X), ('y', y)):
        with open(files[key], 'wb') as handle:
            pickle.dump(payload, handle)

In [9]:
# Generate and pickle the X / y data for the "num=2" shapelet file.
evaluate_parameters("num=2")

num=2


  0%|          | 0/10000 [00:00<?, ?it/s]

Processing num=2... (10000 traces)


  0%|          | 0/10000 [00:00<?, ?it/s]

In [5]:
def classifier_performance(X, y, perclass, cv=False):
    """Score a default ``RandomForestClassifier`` on ``X`` / ``y``.

    Parameters
    ----------
    X, y : array-like
        Features and labels.
    perclass : bool
        Only used when ``cv`` is false. If true, return one score per class
        (diagonal of the confusion matrix divided by its row sums);
        otherwise return the overall accuracy on the held-out split.
    cv : bool, optional
        If true, return the mean of a 5-fold ``cross_val_score`` on the full
        data and ignore ``perclass``.

    Returns
    -------
    float or numpy.ndarray
        Mean CV score, hold-out accuracy, or the per-class score array.
    """
    clf = RandomForestClassifier()

    if cv:
        # Decided first: cross_val_score fits the estimator itself, so the
        # hold-out fit/predict below would be thrown away.
        return np.mean(cross_val_score(clf, X, y, cv=5))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    if perclass:
        matrix = metrics.confusion_matrix(y_test, y_pred)
        # correct predictions per true class / number of samples of that class
        return matrix.diagonal() / matrix.sum(axis=1)

    return metrics.accuracy_score(y_test, y_pred)

In [6]:
def make_namestring_list(namestring_dict):
    """Expand a ``{category: values}`` mapping into every namestring combination.

    Each category contributes fragments of the form ``"<category>=<value>"``;
    one fragment per category is joined, in the mapping's order and without
    a separator, for every combination of values.

    Parameters
    ----------
    namestring_dict : dict
        Maps a category name to an iterable of values. A bare string (or any
        non-iterable scalar) counts as a single value, so ``'10'`` yields
        ``"cat=10"`` rather than ``"cat=1"`` and ``"cat=0"``.

    Returns
    -------
    list of str
        One namestring per combination; ``['']`` for an empty mapping.
    """
    name_components = []
    for cat, values in namestring_dict.items():
        # iterating a string would split it into single characters
        if isinstance(values, (str, bytes)) or not hasattr(values, '__iter__'):
            values = [values]
        name_components.append([str(cat) + "=" + str(value) for value in values])

    return [''.join(item) for item in product(*name_components)]

In [7]:
def merge_x(namestring_list, verbose=True):
    """Load several pickled X arrays and join them along axis 1.

    Every name in ``namestring_list`` is read from ``folder_X``; the loaded
    arrays are concatenated along their second axis. The labels are read
    from ``folder_y`` for the first name only.

    NOTE(review): this assumes the y files of all listed names hold the same
    labels in the same order - confirm against how the files are generated.

    Parameters
    ----------
    namestring_list : list of str
        File names to merge (must not be empty).
    verbose : bool, optional
        Print each file name while loading and both result shapes at the end.

    Returns
    -------
    X : numpy.ndarray
        The concatenated features.
    y : numpy.ndarray
        Labels of the first file.
    """
    parts = []
    for filename in namestring_list:
        if verbose:
            print(filename)
        with open(folder_X + filename, 'rb') as handle:
            parts.append(pickle.load(handle))

    X = np.concatenate(tuple(parts), axis=1)

    with open(folder_y + namestring_list[0], 'rb') as handle:
        y = np.array(pickle.load(handle))

    if verbose:
        print(X.shape)
        print(y.shape)

    return X, y

In [8]:
def batch_classifier(namestring_list, perclass=False, repeat=1):
    """Cross-validate a classifier on each named X / y pair and pickle the scores.

    For every name the pickled X and y are loaded from ``folder_X`` /
    ``folder_y``, ``classifier_performance(..., cv=True)`` is evaluated
    ``repeat`` times, and the list of scores is printed and pickled to
    ``folder_scores + name``.

    Parameters
    ----------
    namestring_list : list of str
        Names of the data files to evaluate.
    perclass : bool, optional
        Forwarded to ``classifier_performance``; it has no effect there
        because ``cv=True`` takes precedence.
    repeat : int, optional
        Number of independent evaluations per name.
    """
    for name in namestring_list:
        with open(folder_X + name, 'rb') as f:
            X = pickle.load(f)
        with open(folder_y + name, 'rb') as f:
            y = pickle.load(f)

        # classifier_performance builds its own estimator, so none is created here
        all_scores = [
            classifier_performance(X, y, perclass, cv=True)
            for _ in range(repeat)
        ]

        print(name + ": " + str(all_scores))
        with open(folder_scores + name, 'wb') as f:
            pickle.dump(all_scores, f)

def batch_merged_classifier(listof_namestring_list, perclass=False, repeat=1):
    """Cross-validate a classifier on merged feature sets and pickle the scores.

    For every list of names, the features are merged with ``merge_x``,
    ``classifier_performance(..., cv=True)`` is evaluated ``repeat`` times,
    and the scores are printed and pickled to ``folder_scores`` under the
    names joined by ``-`` with the suffix ``_merged``.

    Parameters
    ----------
    listof_namestring_list : list of list of str
        Each inner list names the files whose features are merged together.
    perclass : bool, optional
        Forwarded to ``classifier_performance``; it has no effect there
        because ``cv=True`` takes precedence.
    repeat : int, optional
        Number of independent evaluations per merged set.
    """
    for namestring_list in listof_namestring_list:
        X, y = merge_x(namestring_list, False)

        # classifier_performance builds its own estimator, so none is created here
        all_scores = [
            classifier_performance(X, y, perclass, cv=True)
            for _ in range(repeat)
        ]

        outfile = '-'.join(namestring_list) + "_merged"
        print(outfile + ": " + str(all_scores))
        with open(folder_scores + outfile, 'wb') as f:
            pickle.dump(all_scores, f)

In [3]:
# SETUP GLOBAL VARIABLES
# Names assigned at the top level of a notebook cell are already module-level
# globals, so no `global` declarations are needed. They are collected in this
# one cell so the shared state the functions above rely on is easy to find.

# NOTE: despite its .npy extension, this file is read with pickle.
with open("../datasets/ds19.npy", 'rb') as f:
    traces = pickle.load(f)

# output / input folders used by the helper functions
folder_scores = "../results/scores/"
folder_shapelets = "../results/shapelets/"
folder_X = "../results/data/X/"
folder_y = "../results/data/y/"


In [None]:
## PART 2

# one namestring per shapelet file: "num=0" ... "num=5"
namestring_list = make_namestring_list({'num': list(range(6))})
print(namestring_list)

if __name__ == '__main__':

    # Imported from utils.py so the worker processes can import the function.
    # This rebinds the notebook-level name `evaluate_parameters`.
    from utils import evaluate_parameters

    with Pool(processes=6) as pool:
        pool.map(evaluate_parameters, namestring_list)

In [30]:
# Cross-validate on the "num=1" data five times; the five scores are printed and pickled.
batch_classifier(["num=1"], repeat=5)

num=1: [0.55676, 0.5526000000000001, 0.55296, 0.55616, 0.5531200000000001]


In [24]:
# Merge the features of 2, 3, 4 and 5 randomly chosen shapelet files
# (num drawn from 0-35, size fixed to 0) and score each merged set once.
for n_files in range(2, 6):
    sampled_nums = random.sample(range(36), n_files)
    listof_namestring_list = [
        make_namestring_list({'num': sampled_nums, 'size': [0]})
    ]
    batch_merged_classifier(listof_namestring_list, repeat=1)

num=26size=0-num=33size=0_merged: [0.8752000000000001]
num=15size=0-num=13size=0-num=28size=0_merged: [0.8764]
num=26size=0-num=33size=0-num=30size=0-num=15size=0_merged: [0.8811]
num=14size=0-num=22size=0-num=6size=0-num=19size=0-num=33size=0_merged: [0.8787]


In [25]:
# For k = 2, 3, 4 clusters: the k files "n_clusters=<k>num=0" ... "num=<k-1>"
# form one merged feature set each.
listof_namestring_list = [
    make_namestring_list({'n_clusters': [str(k)], 'num': list(range(k))})
    for k in (2, 3, 4)
]

print(listof_namestring_list)

batch_merged_classifier(listof_namestring_list, repeat=1)

[['n_clusters=2num=0', 'n_clusters=2num=1'], ['n_clusters=3num=0', 'n_clusters=3num=1', 'n_clusters=3num=2'], ['n_clusters=4num=0', 'n_clusters=4num=1', 'n_clusters=4num=2', 'n_clusters=4num=3']]
n_clusters=2num=0-n_clusters=2num=1_merged: [0.8872]
n_clusters=3num=0-n_clusters=3num=1-n_clusters=3num=2_merged: [0.8913]
n_clusters=4num=0-n_clusters=4num=1-n_clusters=4num=2-n_clusters=4num=3_merged: [0.8943999999999999]


In [25]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK

In [26]:
# Create training, testing, and validation sets
# Alternative input (disabled): the four merged "n_clusters=4" files.
#X,y = merge_x(make_namestring_list({'n_clusters':'4','num':[*range(4)]}), False)
# Load the single feature file "norm=1" (verbose output switched off).
X, y = merge_x(make_namestring_list({'norm':'1'}), False)

# Hold out 1000 samples for testing, then 1000 of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1000)

# DMatrix wrappers read by score(); y_val is also read there directly.
# NOTE(review): the "_reg" suffix is presumably a leftover - the search space
# uses a multi-class objective; confirm before renaming.
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dval_reg = xgb.DMatrix(X_val, y_val, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [27]:
def score(params):
    """Hyperopt objective: train XGBoost with ``params`` and return the validation loss.

    Parameters
    ----------
    params : dict
        Sampled hyper-parameters. The entry ``'num_boost_round'`` is passed
        to ``xgb.train`` as the number of boosting rounds; all other entries
        are passed as booster parameters. The dict itself is not modified.

    Returns
    -------
    dict
        ``{'loss': 1 - accuracy, 'status': STATUS_OK}`` where the accuracy is
        measured on the module-level validation set (``dval_reg`` / ``y_val``).
    """
    print("Training with: ", end="")
    print(params)

    # work on a copy so the caller's dict keeps its 'num_boost_round' entry
    train_params = dict(params)
    num_boost_round = train_params.pop('num_boost_round')

    model = xgb.train(
        params=train_params,
        dtrain=dtrain_reg,
        num_boost_round=num_boost_round,
    )

    y_pred = model.predict(dval_reg)
    # named `accuracy` so the local does not shadow this function's own name
    accuracy = metrics.accuracy_score(y_val, y_pred)
    return {'loss': 1 - accuracy, 'status': STATUS_OK}
    

In [28]:
# XGBoost hyper-parameter optimization
# Plain values are fixed for every trial; hp.* entries are sampled by hyperopt.
search_space = {
    "objective": "multi:softmax",
    "num_class": 100,
    "booster": "gbtree",
    "sampling_method": 'uniform',
    # 100-199; score() passes this to xgb.train as the number of boosting rounds
    "num_boost_round": hp.randint("num_boost_round", 100) + 100,
    "eta": hp.uniform("eta", 0, 1),
    "gamma": hp.lognormal("gamma", 0, 1),
    # 1-9
    "max_depth": hp.randint("max_depth", 9) + 1,
    "min_child_weight": hp.lognormal("min_child_weight", 0, 1),
    "max_delta_step": hp.lognormal("max_delta_step", 0, 1),
    "subsample": hp.uniform("subsample", 0.5, 1),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
    "colsample_bylevel": hp.uniform("colsample_bylevel", 0.5, 1),
    "colsample_bynode": hp.uniform("colsample_bynode", 0.5, 1),
    "lambda": hp.lognormal("lambda", 0, 1),
    "alpha": hp.lognormal("alpha", 0, 1),
    "tree_method": hp.choice("tree_method", ['auto', 'exact','approx','hist']),
    # NOTE(review): the XGBoost docs describe 'lossguide' as supported only
    # with the hist / approx tree methods - confirm the combination with 'exact'.
    "grow_policy": hp.choice("grow_policy", ['depthwise', 'lossguide']),
    # Restricted to the multi-class metrics: the regression / ranking metrics
    # (rmse, rmsle, mae, mape, mphe, map) do not apply to multi:softmax and
    # only added a meaningless dimension to the search.
    "eval_metric": hp.choice("eval_metric", ['mlogloss', 'merror']),
}

In [None]:
# Minimise score() over search_space with the TPE algorithm, 150 evaluations.
best = fmin(score, search_space, algo=tpe.suggest, max_evals=150)

In [13]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 
import tensorflow as tf
import tensorflow.keras as keras

from keras.models import Sequential
from keras.losses import SparseCategoricalCrossentropy
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, BatchNormalization, Reshape, InputLayer, Flatten

In [14]:
# Per-block settings of the convolutional network.
# NOTE: only entries 0 and 1 of these lists are used by the model below;
# the third entry of each list is currently unused.
filters = [32, 64, 128]
kernels = [5, 5, 5]
pools = [8, 8, 8]
dropout = 0.1

# Two Conv2D -> BatchNormalization -> MaxPooling2D -> Dropout blocks,
# followed by a 100-unit softmax output layer.
model = Sequential([
    InputLayer(input_shape=(100,2000,1)),
    Conv2D(filters[0], kernels[0], activation="relu"),
    BatchNormalization(),
    MaxPooling2D(pools[0]),
    Dropout(dropout),
    Conv2D(filters[1], kernels[1], activation="relu"),
    BatchNormalization(),
    MaxPooling2D(pools[1]),
    Dropout(dropout),
    Flatten(),
    Dense(100, activation='softmax')
])

# The output layer already applies softmax, so the loss is used with its
# default settings.
model.compile(optimizer='adam',
              loss=SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 96, 1996, 32)      832       
                                                                 
 batch_normalization (BatchN  (None, 96, 1996, 32)     128       
 ormalization)                                                   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 12, 249, 32)      0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 12, 249, 32)       0         
                                                                 
 conv2d_1 (Conv2D)           (None, 8, 245, 64)        51264     
                                                                 
 batch_normalization_1 (Batc  (None, 8, 245, 64)       2

In [25]:
# Load the pickled X / y data written for "num=2".
with open(folder_X + 'num=2', 'rb') as f:
    X = pickle.load(f)
with open(folder_y + 'num=2', 'rb') as f:
    y = pickle.load(f)

# y is pickled as a plain list; convert it to an array before splitting
y = np.asarray(y)

# hold out 1000 samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000)


In [26]:
# Quick interactive inspection of the training data. Only the value of the
# last expression is displayed as the cell output.
X_train.shape

X_train[0].shape
X_train[0][0].shape

type(y_train)

numpy.ndarray

In [28]:
# Train the network for up to 1000 epochs in batches of 32 samples.
model.fit(X_train, y_train, epochs=1000, batch_size=32)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000

KeyboardInterrupt: 

In [98]:
# One row of class scores per test sample.
predictions = model.predict(X_test)

print(predictions)

# The model's last layer already applies softmax, so no second softmax is
# applied here; the most likely class is simply the arg-max of each row.
predicted_labels = np.argmax(predictions, axis=1)

# Counted in `n_correct` rather than `score`, which would overwrite the
# hyperopt objective function `score` defined earlier in the notebook.
n_correct = int(np.sum(predicted_labels == y_test))

# fraction of test samples classified correctly
print(n_correct / len(y_test))

[[3.1597254e-05 9.5159578e-08 1.4385321e-04 ... 1.2331117e-10
  1.5448155e-08 7.4265478e-07]
 [7.9201891e-07 2.6188986e-06 3.0598298e-02 ... 6.1860228e-09
  1.8800288e-08 8.1609960e-07]
 [4.5685840e-13 8.4074962e-17 7.4164618e-21 ... 5.3726117e-14
  1.3119306e-13 5.1383545e-08]
 ...
 [2.8086722e-11 2.3903297e-09 5.5739302e-11 ... 6.4042452e-11
  5.9976364e-11 2.8067779e-10]
 [1.3908936e-12 3.4480934e-12 4.9120999e-08 ... 3.1233267e-12
  8.5366394e-15 3.4405909e-12]
 [1.1686200e-12 7.6550977e-10 7.2792709e-11 ... 4.4209175e-10
  3.6672072e-11 3.7610456e-11]]
0.842
