In [1]:
import pandas as pd
import stumpy
import numpy as np
import datetime as dt
import random
import math
import pickle
import sys

from statistics import mean
from tqdm.auto import tqdm
from multiprocessing import Pool

import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
# SETUP

# Load the training traces into the module-level name `traces`.
# The functions defined in the following cells read `traces` directly and
# index it by class id (traces[class_id]), sampling individual traces from it.
#
# A `global` statement is a no-op at module / notebook-cell scope, so a plain
# assignment is all that is needed here.
#
# SECURITY: despite its .npy extension this file is read with pickle, and
# unpickling executes arbitrary code embedded in the file. Only load files
# that come from a trusted source.
with open('../ipt_traces_train_0.9.npy', 'rb') as f:
    traces = pickle.load(f)
    


In [3]:
def compare_profile(trace1, id2, shapelet_size):
    '''
    Matrix profile of trace1 against one randomly sampled trace of class id2.

    Input:
        trace1: packet trace from class 1
        id2: id number for class 2; used to index the module-level `traces`
        shapelet_size: length of shapelets (subsequence window length)

    Output:
        Float array holding, for every subsequence of trace1, the distance to
        its closest subsequence of length shapelet_size in the sampled trace.
        Infinite distances are replaced by NaN.
        Returns np.array([]) if the sampled trace is shorter than shapelet_size.
    '''
    # Draw exactly one trace from the other class and convert it to float64.
    sampled = random.sample(traces[id2], 1)[0]
    trace2 = np.asarray(list(sampled)).astype('float64')

    # A window longer than the sampled trace cannot be compared; signal that
    # to the caller with an empty array.
    if len(trace2) < shapelet_size:
        return np.array([])

    # Column 0 of the stumpy result is the distance column.
    profile = stumpy.stump(trace1, shapelet_size, trace2, ignore_trivial=False)
    c1_c2 = profile[:, 0].astype(float)
    c1_c2[np.isposinf(c1_c2)] = np.nan

    return c1_c2
def same_profile(trace, shapelet_size):
    '''
    Matrix profile of a class trace compared with itself (self-join).

    Input:
        trace: packet trace from class 1
        shapelet_size: length of shapelets (subsequence window length)

    Output:
        Float array in which position i stores the distance from the
        subsequence of length shapelet_size starting at i to its nearest
        neighbour within the same trace. Infinite distances are replaced by
        NaN. The global minimum marks the top pattern and the global maximum
        marks the rarest anomaly.
    '''
    # Column 0 of the stumpy result is the distance column.
    self_join = stumpy.stump(trace, shapelet_size)
    c1_c1 = self_join[:, 0].astype(float)
    c1_c1[np.isposinf(c1_c1)] = np.nan
    return c1_c1


def generate_shapelet(trace, diff, shapelet_size,q):
    '''
    Greedily select up to q peaks of diff and return each one as a one-hot
    mask over trace.

    Input:
        trace: packet trace from class 1; only its length is used
        diff: per-position score (profile against another class minus the
              self profile); NaN entries are scored as 0. Not modified.
        shapelet_size: length of shapelets
        q: number of matches per class j (maximum number of masks returned)

    Output:
        List of integer arrays of length len(trace). Each array is 1 on
        [idx, idx + shapelet_size) for a selected peak idx and 0 elsewhere.
        Fewer than q masks are returned when every position has already been
        excluded; an empty list is returned for an empty diff.
    '''
    # np.nan_to_num returns a fresh array, so the caller's diff stays intact.
    # It also maps -inf to a large finite value, which leaves -inf free to be
    # used below as the "already excluded" marker.
    scores = np.nan_to_num(np.asarray(diff, dtype=float))

    shapelet_list = []
    if scores.size == 0:
        return shapelet_list

    trace_len = len(trace)
    for _ in range(q):
        # Highest remaining peak (first one on ties).
        idx = int(np.argmax(scores))
        if np.isneginf(scores[idx]):
            # Every candidate position has been excluded already.
            break

        mask = np.zeros(trace_len, dtype=int)
        mask[idx:math.ceil(idx + shapelet_size)] = 1
        shapelet_list.append(mask)

        # Exclude the neighbourhood of the chosen peak from later rounds.
        # The window covers the integer positions i with
        #     idx - shapelet_size/4 <= i < idx + shapelet_size - shapelet_size/4
        # NOTE(review): the window is asymmetric around the shapelet; confirm
        # that this is the intended exclusion zone.
        lo = max(0, math.ceil(idx - shapelet_size / 4))
        hi = min(scores.size, math.ceil(idx + shapelet_size - shapelet_size / 4))
        scores[lo:hi] = -np.inf

    return shapelet_list



def find_overlap(trace_i, shapelets_i, shapelet_size,r ):
    '''
    Compute the shapelets of greatest overlap.

    The one-hot masks in shapelets_i are summed into a per-position vote
    count. Up to r times, the window of length shapelet_size with the largest
    vote total is selected (earliest window on ties), the matching slice of
    trace_i is stored, and the votes inside that window are cleared so that
    the next round finds a different region.

    Input:
        trace_i: the trace the masks refer to
        shapelets_i: list of equal-length one-hot masks over trace_i
        shapelet_size: length of shapelets
        r: maximum number of shapelets to return

    Output:
        List of slices trace_i[start:start + shapelet_size], best first.
        Contains fewer than r entries once no window with a positive vote
        total remains, and is empty when shapelets_i is empty.
    '''
    shapelet_list = []

    # With no masks np.sum(..., axis=0) collapses to a scalar, which has no
    # len(); there is nothing to vote on, so return early.
    if len(shapelets_i) == 0:
        return shapelet_list

    # np.sum builds a new array, so clearing votes below leaves the input
    # masks untouched.
    merged_shapelets = np.sum(shapelets_i, axis=0)
    num_positions = len(merged_shapelets)

    while len(shapelet_list) < r:
        # Vote total of the window starting at each position; windows that
        # run past the end are simply shorter.
        window_sums = [
            np.sum(merged_shapelets[i:i + shapelet_size])
            for i in range(num_positions)
        ]
        if not window_sums:
            break

        start = int(np.argmax(window_sums))
        if window_sums[start] <= 0:
            # No votes left anywhere: stop early.
            break
        end = start + shapelet_size

        shapelet_list.append(trace_i[start:end])

        # Exclude the currently selected shapelet from the remaining candidates.
        merged_shapelets[start:end] = 0

    return shapelet_list


In [4]:
train_ratio = 0.9

# Open questions / things to vary:
# !!! Choice of prototype - select min dist from each sample to all others
# ! shapelet size
# ! distance between comparing sample trace and shapelet (DTW? vs euclidean)
# ! cross-validation on classifier (5 or 10 - fold)
# ! classifier parameters
#
# Record the results of changes at each stage for comparison (when writing paper).


def generate_shapelets(shapelet_coeff, p,q,r,num_classes):
    '''
    Generate shapelets for the classes 0 .. num_classes - 1 of the
    module-level `traces`.

    For every class i, p traces are sampled at random. Each sampled trace is
    compared with itself and with one random trace of every other class j.
    The q strongest peaks of (profile vs. j) - (self profile) are collected as
    one-hot masks, and the r windows covered by the most masks are stored as
    shapelets. Each class therefore contributes at most p * r shapelets.

    Input:
        shapelet_coeff: ratio of shapelet size to trace size
        p: number of random sample traces per class i
        q: number of matches per class j
        r: number of overlaps to consider per sampled trace
        num_classes: number of classes to process

    Output:
        list object containing the shapelets (slices of the sampled traces)
        of all classes, in class order

    Raises:
        ValueError: from random.sample when p exceeds the number of traces
        available for a class.
    '''
    shapelet_storage = []

    # loop over all classes (generate shapelets for each class)
    for i in tqdm(range(num_classes)):

        # p randomly chosen sample traces of class i
        trace_i_list = random.sample(traces[i], p)

        for trace_element in trace_i_list:

            trace_i = np.asarray([item for item in trace_element]).astype('float64')
            shapelet_size = math.floor(shapelet_coeff * len(trace_i))

            # Traces that would give a window shorter than 3 are skipped
            # (presumably because stumpy rejects such windows - confirm).
            if shapelet_size < 3:
                print("shapelet_size < 3")
                continue

            # profile of i compared with itself
            ci_ci = same_profile(trace_i, shapelet_size)

            # loop over every other class and collect q peak masks from each
            shapelets_i = []
            for j in range(num_classes):
                # don't compare i with itself
                if i == j:
                    continue

                # profile of i compared with a random trace of class j
                ci_cj = compare_profile(trace_i, j, shapelet_size)

                # empty result: the trace sampled from j was too short
                if ci_cj.size == 0:
                    continue

                # positions where j is far away while i itself is close
                diff_ci = ci_cj - ci_ci

                shapelets_i.extend(generate_shapelet(trace_i, diff_ci, shapelet_size, q))

            # No class produced a comparable profile (e.g. num_classes == 1 or
            # every sampled trace was too short): there is nothing to overlap.
            if not shapelets_i:
                continue

            # keep the r windows that were chosen most often across the
            # other classes
            shapelet_storage.extend(find_overlap(trace_i, shapelets_i, shapelet_size, r))

    print(len(shapelet_storage))

    return shapelet_storage
        
            


In [5]:
# PART 1
# Driver cell: generates shapelets with the parameters below and pickles the
# resulting list to one file per experiment index.
# NOTE(review): `random` is not seeded anywhere in the visible cells, so every
# run samples different traces and the written files are not reproducible.
Num_Experiments = 5

# Upper bound on the total number of shapelets = k * |Number of classes|
# k ~= p * r (shapelets contributed by each class)
# p: number of random samples per class i
p = 1
# q: number of matches per class j
q = 1
# r: number of overlaps to consider
r = 1

# num_classes: Number of classes, this is a testing parameter and should be set to 100
num_classes = 100
# coeff: ratio of shapelet size to trace size (passed as shapelet_coeff)
coeff = 0.25
for i in tqdm(range(Num_Experiments)):
    
    shapelets = generate_shapelets(coeff,p,q,r,num_classes)
    
    # Output path encodes the experiment index and all parameters. The target
    # directory is not created here, so it must already exist.
    filename = '../results/IPT/shapelets_num_random_seed_v3/' + 'exprmnt=' + str(i) + 'shapelet_size=' + str(coeff) + 'p'+\
                str(p) + 'q' + str(q) + 'r' + str(r)
    print(filename)
    #break
    with open(filename, 'wb') as f:
        pickle.dump(shapelets, f)
        
    # NOTE(review): this unconditional break ends the loop after the first
    # iteration, so only exprmnt=0 is produced even though Num_Experiments
    # is 5 - confirm whether it is a leftover from debugging.
    break

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

shapelet_size < 3
99
../results/IPT/shapelets_num_random_seed_v3/exprmnt=0shapelet_size=0.25p1q1r1
