In [1]:
import sys
sys.path.append("../")

In [2]:
import os
import copy
import numpy as np
import pickle
from common.config import subdatasets
from common.dataloader import load_dataset
from IPython import embed
from collections import defaultdict

In [3]:
from matplotlib import pyplot as plt

In [4]:
def prob(p=0.3):
    """Return True with (approximately) probability ``p``, otherwise False.

    One integer is drawn uniformly from 0..99 using NumPy's global random
    state, and the result is True when that integer is strictly below
    ``100 * p``.  Because the draw is an integer, the effective probability
    has a granularity of 1% (e.g. ``p=0.005`` behaves like 1%).  ``p <= 0``
    always yields False and ``p >= 1`` always yields True.
    """
    threshold = 100 * p
    draw = np.random.randint(0, 100)
    return True if draw < threshold else False

In [5]:

# Group name -> {sub-dataset id -> integer}.  The meaning of the integer is
# not evident from this notebook; only the keys are used to build the
# reverse lookup below.
summarize_dict = {
    "HUAWEI_GROUP_A": {
        "e29ca1cd": 3,
        "c23b2b2d": 4,
        "aeb5a1de": 6,
        "2fe95315": 5,
        "0a82a873": 7,
        "af732cc4": 7,
    },
    "HUAWEI_GROUP_B": {
        "b2a04b7f": 20,
        "c2970798": 13,
        "5dafb960": 12,
    },
    "HUAWEI_GROUP_C": {
        "c91f4a07": 24,
        "ca2ae31d": 43,
        "f7958fb7": 37,
    },
}

# Reverse lookup: sub-dataset id -> name of the group that contains it.
mappding_dict = {}
for group_key, member_ids in summarize_dict.items():
    mappding_dict.update(dict.fromkeys(member_ids, group_key))

In [10]:

# Seed NumPy's global RNG so the random draws made further down this cell
# are repeatable from one run to the next.
np.random.seed(2022)

# Sub-dataset id -> list of (start, end) index ranges.  The loop below treats
# each pair as a half-open range of training rows to overwrite with rows
# taken from outside these ranges.  "0a82a873" is intentionally left out.
anomaly_range = {
    # "0a82a873": [(850, 900), (1050, 1150), (1270, 1350), (1400, 1440)],
    "2fe95315": [(600, 700), (800, 1100), (1420, 1440)],
    "5dafb960": [(900, 1000), (1400, 1440)],
    "aeb5a1de": [(370, 410), (580, 670), (780, 870), (1290, 1330)],
    "af732cc4": [(130, 170), (430, 450), (870, 970)],
    "b2a04b7f": [(900, 1000), (1120, 1160), (1400, 1440)],
    "c23b2b2d": [(130, 160), (870, 970), (1400, 1440)],
    "c2970798": [(700, 750), (1130, 1160), (1400, 1440)],
    "c91f4a07": [(400, 470), (740, 840), (850, 900), (1180, 1210), (1380, 1440)],
    "ca2ae31d": [(900, 950), (1300, 1350), (1400, 1440)],
    "e29ca1cd": [(1150, 1200), (1260, 1290), (1400, 1440)],
    "f7958fb7": [(220, 270), (650, 700), (960, 1020), (1080, 1150)],
}

# Same grouping as the earlier cell but keyed by "Group A/B/C" and without
# "0a82a873".  NOTE(review): this rebinds `summarize_dict`; `mappding_dict`
# from the earlier cell is not rebuilt, so it still maps to "HUAWEI_GROUP_*".
summarize_dict = {
    "Group A": {
        "e29ca1cd": 3,
        "c23b2b2d": 4,
        "aeb5a1de": 6,
        "2fe95315": 5,
        # "0a82a873": 7,
        "af732cc4": 7,
    },
    "Group B": {
        "b2a04b7f": 20,
        "c2970798": 13,
        "5dafb960": 12,
    },
    "Group C": {
        "c91f4a07": 24,
        "ca2ae31d": 43,
        "f7958fb7": 37,
    },
}

# Tunables read by the augmentation loop below.
dataset = "HUAWEI_FILTERED"  # modify this to HUAWEI
repeat_pattern_len = 200  # rows per copied segment
repeat_num = 100  # candidate segments drawn for train (1.5x this for test)
drop_prob = 0.99  # chance of skipping a test segment holding more than one positive label
plot_figure = False
group_labels = defaultdict(list)

# Output locations.  `datapath` is created here; the loop below writes its
# pickles under `datapath_root/<group name>` instead.
datapath_root = "../datasets/anomaly/"
datapath = f"../datasets/anomaly/{dataset}_AUG2/processed"
os.makedirs(datapath, exist_ok=True)
# For every annotated sub-dataset: repair the training split, enlarge the
# train and test splits by appending randomly chosen segments, save the
# result under the sub-dataset's group directory and remember it for the
# per-group summary.
for subdataset in subdatasets[dataset]:
    # Only sub-datasets with hand-annotated ranges are processed.
    if subdataset not in anomaly_range:
        continue
    data_dict = load_dataset(dataset, subdataset, nrows=None)
    train_len = len(data_dict["train"])

    # modify wrong labels: collect every training index that falls inside an
    # annotated range, clipped to the length of the training split.
    modify_ranges = set()
    for start, end in anomaly_range[subdataset]:
        if start < train_len:
            modify_ranges.update(range(start, min(end, train_len)))
    # Sorted so that the row pairing below does not depend on the iteration
    # order of a set (the seeded run must be reproducible everywhere).
    modify_indice = sorted(modify_ranges)

    # Pick a run of unflagged rows, as long as the flagged set, to copy from.
    normal_indice = [idx for idx in range(train_len) if idx not in modify_ranges]
    rand_start = np.random.randint(
        0, len(normal_indice) - len(modify_indice), size=1
    )[0]
    rand_selection_indice = normal_indice[rand_start : rand_start + len(modify_indice)]
    assert len(modify_indice) == len(rand_selection_indice)

    # Keep before/after snapshots for the optional diagnostic plot.
    original = copy.deepcopy(data_dict["train"])
    data_dict["train"][modify_indice, :] = data_dict["train"][
        rand_selection_indice, :
    ]
    modified = copy.deepcopy(data_dict["train"])

    # data augmentation for train: append randomly located segments, each
    # kept unless prob(p=0.05) fires.
    rand_start_list = np.random.randint(
        0, len(data_dict["train"]) - repeat_pattern_len, size=repeat_num
    )
    merged = [data_dict["train"]]
    for start in rand_start_list:
        if prob(p=0.05):
            continue
        merged.append(data_dict["train"][start : start + repeat_pattern_len])
    train_aug = np.vstack(merged)
    print("[Train] from {} to {}".format(data_dict["train"].shape, train_aug.shape))

    # data augmentation for test
    # The label array can be longer than the test array (719 vs 718 rows in
    # the recorded output).  Trim it first: concatenating the untrimmed
    # labels would shift the labels of every appended segment relative to
    # its data rows.
    test_labels = data_dict["test_labels"][: len(data_dict["test"])]
    assert len(test_labels) == len(data_dict["test"]), (
        "test_labels has fewer rows than test"
    )
    rand_start_list = np.random.randint(
        0, len(data_dict["test"]) - repeat_pattern_len, size=int(repeat_num * 1.5)
    )
    merged = [data_dict["test"]]
    merged_label = [test_labels]
    for start in rand_start_list:
        range_label = test_labels[start : start + repeat_pattern_len]
        # Segments holding more than one positive label are mostly skipped.
        # prob() is only consulted for such segments, which keeps the
        # sequence of random draws unchanged.
        if np.sum(range_label) > 1 and prob(p=drop_prob):
            continue
        merged.append(data_dict["test"][start : start + repeat_pattern_len])
        merged_label.append(range_label)
    test_aug = np.vstack(merged)
    test_labels_aug = np.concatenate(merged_label)
    assert len(test_labels_aug) == len(test_aug)
    print("[Test] from {} to {}".format(data_dict["test"].shape, test_aug.shape))
    print("[Test Labels] from {} to {}".format(data_dict["test_labels"].shape, test_labels_aug.shape))
    anomaly_ratio = np.sum(test_labels_aug) / test_labels_aug.shape[0]
    print("Anomaly ratio: = {}".format(anomaly_ratio))

    if plot_figure:
        dim = 0
        fig, ax = plt.subplots(4, figsize=(15, 4 * 2))
        ax[0].plot(original[:, dim], label="train (ori)")
        # Red: overwritten rows; green: rows the replacements were copied from.
        ax[0].scatter(modify_indice, [max(original[:, dim])] * len(modify_indice), c="r")
        ax[0].scatter(rand_selection_indice, [max(original[:, dim])] * len(modify_indice), c="g")
        ax[0].legend()
        ax[1].plot(modified[:, dim], label="train (repair)")
        ax[1].legend()
        ax[2].plot(data_dict["test"][:, dim], label="Test (ori)")
        ax[2].plot(data_dict["test_labels"] * max(data_dict["test"][:, dim]), c="r")
        ax[2].legend()
        ax[3].plot(test_aug[:, dim], label="Test (aug)")
        ax[3].plot(test_labels_aug * max(test_aug[:, dim]), c="r")
        ax[3].legend()
        print(modified[:, dim].sum(), original[:, dim].sum())

        plot_figure = False  # plot only once

    result = {
        "train": train_aug,
        "test": test_aug,
        "test_label": test_labels_aug,
    }

    # mappding_dict comes from an earlier cell: sub-dataset id -> group name.
    group_name = mappding_dict[subdataset]

    # One pickle per split, written as <root>/<group>/<subdataset>_<split>.pkl
    os.makedirs(os.path.join(datapath_root, group_name), exist_ok=True)
    for k, v in result.items():
        with open(os.path.join(datapath_root, group_name, f"{subdataset}_{k}.pkl"), "wb") as fw:
            pickle.dump(v, fw)

    group_labels[group_name].append((train_aug, test_aug, test_labels_aug))
    print("Done, TY NB!")

# Per-group summary: fraction of positive test labels plus the total number
# of train and test rows over all sub-datasets collected for that group.
for group, collected in group_labels.items():
    label_all = np.concatenate([labels for _, _, labels in collected])
    train_len = sum(len(train_part) for train_part, _, _ in collected)
    test_len = sum(len(test_part) for _, test_part, _ in collected)
    print(group, np.sum(label_all)/label_all.shape[0])
    print(group, "train len", train_len)
    print(group, "test len", test_len)

Loading 2fe95315 of HUAWEI_FILTERED dataset
.././datasets/anomaly/HUAWEI_FILTERED/2fe95315_train.pkl
Shape of train is (718, 5).
Shape of test is (718, 5).
Shape of test_labels is (719, 1).
[Train] from (718, 5) to (19718, 5)
[Test] from (718, 5) to (7118, 5)
[Test Labels] from (719, 1) to (7118, 1)
Anomaly ratio: = 0.04495644844057319
Done, TY NB!
Loading 5dafb960 of HUAWEI_FILTERED dataset
.././datasets/anomaly/HUAWEI_FILTERED/5dafb960_train.pkl
Shape of train is (718, 12).
Shape of test is (718, 12).
Shape of test_labels is (719, 1).
[Train] from (718, 12) to (20318, 12)
[Test] from (718, 12) to (13918, 12)
[Test Labels] from (719, 1) to (13918, 1)
Anomaly ratio: = 0.011639603391291851
Done, TY NB!
Loading aeb5a1de of HUAWEI_FILTERED dataset
.././datasets/anomaly/HUAWEI_FILTERED/aeb5a1de_train.pkl
Shape of train is (718, 6).
Shape of test is (718, 6).
Shape of test_labels is (719, 1).
[Train] from (718, 6) to (19518, 6)
[Test] from (718, 6) to (14118, 6)
[Test Labels] from (719, 1) 