# ***Libraries***

In [3]:
import os
import random

# ***File Extraction***

In [2]:
def count_files_and_organize_by_subfolder(folder_path):
    subfolder_files = {}

    for subdir, _, files in os.walk(folder_path):
        subfolder_name = os.path.basename(subdir)
        if subfolder_name not in subfolder_files:
            subfolder_files[subfolder_name] = []

        subfolder_files[subfolder_name].extend(files)

    return subfolder_files

def print_subfolder_files_info(subfolder_files):
    for subfolder, files in subfolder_files.items():
        print(f"Subfolder: {subfolder}")
        #for file in files:
            #print(f"  File: {file}")
        print(f"Number of files in '{subfolder}': {len(files)}\n")

tsb_uad_pub_folder_path = '../TSB-UAD-Public/TSB-UAD-Public'
subfolder_files = count_files_and_organize_by_subfolder(tsb_uad_pub_folder_path)
del subfolder_files['TSB-UAD-Public']
print_subfolder_files_info(subfolder_files)

Subfolder: MITDB
Number of files in 'MITDB': 32

Subfolder: NAB
Number of files in 'NAB': 58

Subfolder: ECG
Number of files in 'ECG': 53

Subfolder: GHL
Number of files in 'GHL': 126

Subfolder: NASA-SMAP
Number of files in 'NASA-SMAP': 108

Subfolder: KDD21
Number of files in 'KDD21': 250

Subfolder: NASA-MSL
Number of files in 'NASA-MSL': 54

Subfolder: SensorScope
Number of files in 'SensorScope': 23

Subfolder: Occupancy
Number of files in 'Occupancy': 10

Subfolder: Daphnet
Number of files in 'Daphnet': 45

Subfolder: OPPORTUNITY
Number of files in 'OPPORTUNITY': 465

Subfolder: SVDB
Number of files in 'SVDB': 115

Subfolder: SMD
Number of files in 'SMD': 281

Subfolder: MGAB
Number of files in 'MGAB': 10

Subfolder: IOPS
Number of files in 'IOPS': 58

Subfolder: Genesis
Number of files in 'Genesis': 6

Subfolder: YAHOO
Number of files in 'YAHOO': 367

Subfolder: Dodgers
Number of files in 'Dodgers': 1



In [4]:
def create_normality_dicts(subfolder_files):
    # Create Normality_1
    normality_1 = {}
    for subfolder, files in subfolder_files.items():
        if files:
            num_files_to_take = min(random.randint(1, 3), len(files))
            normality_1[subfolder] = random.sample(files, num_files_to_take)
    
    # Create Normality_2
    normality_2 = {}
    subfolder_list = list(normality_1.keys())
    for i in range(len(subfolder_list)):
        for j in range(i + 1, len(subfolder_list)):
            subfolder_a = subfolder_list[i]
            subfolder_b = subfolder_list[j]
            files_a = normality_1[subfolder_a]
            files_b = normality_1[subfolder_b]
            if files_a and files_b:
                file_a = random.choice(files_a)
                file_b = random.choice(files_b)
                key = f"{file_a}+{file_b}"
                normality_2[key] = (subfolder_a, subfolder_b)
    
    # Create Normality_3
    normality_3 = {}
    for i in range(len(subfolder_list)):
        for j in range(i + 1, len(subfolder_list)):
            for k in range(j + 1, len(subfolder_list)):
                subfolder_a = subfolder_list[i]
                subfolder_b = subfolder_list[j]
                subfolder_c = subfolder_list[k]
                files_a = normality_1[subfolder_a]
                files_b = normality_1[subfolder_b]
                files_c = normality_1[subfolder_c]
                if files_a and files_b and files_c:
                    file_a = random.choice(files_a)
                    file_b = random.choice(files_b)
                    file_c = random.choice(files_c)
                    key = f"{file_a}+{file_b}+{file_c}"
                    normality_3[key] = (subfolder_a, subfolder_b, subfolder_c)

    return normality_1, normality_2, normality_3

In [5]:
normality_1, normality_2, normality_3 = create_normality_dicts(subfolder_files)

In [6]:
# Print the dictionaries
print("Normality_1:")
for subfolder, files in normality_1.items():
    print(f"{subfolder}: {files}")

print("\nNormality_2:")
for key, value in normality_2.items():
    print(f"{key}: {value}")

print("\nNormality_3:")
for key, value in normality_3.items():
    print(f"{key}: {value}")

Normality_1:
MITDB: ['221.test.csv@2.out', '122.test.csv@1.out', '221.test.csv@1.out']
NAB: ['NAB_data_Traffic_5.out', 'NAB_data_Exchange_2.out']
ECG: ['MBA_ECG14046_data_11.out', 'MBA_ECG14046_data_37.out', 'MBA_ECG14046_data_12.out']
GHL: ['15_Lev_fault_Temp_corr_seed_49_vars_23.test.csv@8.out']
NASA-SMAP: ['D-13.train.out', 'D-8.test.out', 'A-8.train.out']
KDD21: ['080_UCR_Anomaly_DISTORTEDresperation2_30000_168250_168251.out', '039_UCR_Anomaly_DISTORTEDLab2Cmac011215EPG3_5000_16390_16420.out', '073_UCR_Anomaly_DISTORTEDpark3m_60000_72150_72495.out']
NASA-MSL: ['M-3.train.out', 'M-6.test.out']
SensorScope: ['stb-7.test.out', 'stb-3.test.out']
Occupancy: ['room-occupancy-0.test.csv@3.out']
Daphnet: ['S03R01E0.test.csv@8.out', 'S03R01E1.test.csv@2.out', 'S09R01E0.test.csv@1.out']
OPPORTUNITY: ['S2-ADL5.test.csv@81.out', 'S4-ADL4.test.csv@108.out']
SVDB: ['870.test.csv@2.out', '806.test.csv@1.out']
SMD: ['machine-1-1.test.csv@19.out', 'machine-3-5.test.csv@31.out', 'machine-2-8.test.cs

In [16]:
def create_combined_dict(normality_1, normality_2, normality_3):
    combined_dict = {}
    counts = {"Normality_1": 0, "Normality_2": 0, "Normality_3": 0}

    # Adding from Normality_1
    for subfolder, files in normality_1.items():
        for file in files:
            if file not in combined_dict:
                combined_dict[file] = ["Normality_1", subfolder]
                counts["Normality_1"] += 1

    # Adding from Normality_2
    for key, subfolders in normality_2.items():
        if key not in combined_dict:
            combined_dict[key] = ["Normality_2", subfolders[0], subfolders[1]]
            counts["Normality_2"] += 1

    # Adding from Normality_3
    for key, subfolders in normality_3.items():
        if key not in combined_dict:
            combined_dict[key] = ["Normality_3", subfolders[0], subfolders[1], subfolders[2]]
            counts["Normality_3"] += 1

    return combined_dict, counts

In [17]:
combined_dict, counts = create_combined_dict(normality_1, normality_2, normality_3)

In [18]:
# Print the counts of filenames from each normality
print("\nCounts of filenames from each normality:")
for normality, count in counts.items():
    print(f"{normality}: {count}")


Counts of filenames from each normality:
Normality_1: 40
Normality_2: 153
Normality_3: 816


In [19]:
# Predefined number of items to take from each normality
normality_1_count = 10
normality_2_count = 10
normality_3_count = 10

In [20]:
def create_predefined_dict(combined_dict, normality_1_count, normality_2_count, normality_3_count):
    predefined_dict = {"Normality_1": {}, "Normality_2": {}, "Normality_3": {}}
    
    # Extracting specific numbers of items from each normality type
    normality_1_items = [item for item in combined_dict.items() if item[1][0] == "Normality_1"]
    normality_2_items = [item for item in combined_dict.items() if item[1][0] == "Normality_2"]
    normality_3_items = [item for item in combined_dict.items() if item[1][0] == "Normality_3"]
    
    # Select predefined numbers of items
    selected_normality_1_items = random.sample(normality_1_items, min(normality_1_count, len(normality_1_items)))
    selected_normality_2_items = random.sample(normality_2_items, min(normality_2_count, len(normality_2_items)))
    selected_normality_3_items = random.sample(normality_3_items, min(normality_3_count, len(normality_3_items)))
    
    # Combine selected items into the predefined_dict
    predefined_dict["Normality_1"] = dict(selected_normality_1_items)
    predefined_dict["Normality_2"] = dict(selected_normality_2_items)
    predefined_dict["Normality_3"] = dict(selected_normality_3_items)
    
    # Combine all selected items into a single dictionary
    all_selected_items = selected_normality_1_items + selected_normality_2_items + selected_normality_3_items
    
    # Shuffle the combined selected items
    random.shuffle(all_selected_items)
    
    # Convert to a dictionary
    shuffled_dict = dict(all_selected_items)
    
    return shuffled_dict

In [21]:
predefined_shuffled_dict = create_predefined_dict(combined_dict, normality_1_count, normality_2_count, normality_3_count)

In [22]:
# Print the shuffled dictionary
print("Predefined Shuffled Dictionary:")
for key, value in predefined_shuffled_dict.items():
    print(f"{key}: {value}")

Predefined Shuffled Dictionary:
D-8.test.out+S4-ADL4.test.csv@108.out: ['Normality_2', 'NASA-SMAP', 'OPPORTUNITY']
039_UCR_Anomaly_DISTORTEDLab2Cmac011215EPG3_5000_16390_16420.out+room-occupancy-0.test.csv@3.out+806.test.csv@1.out: ['Normality_3', 'KDD21', 'Occupancy', 'SVDB']
YahooA3Benchmark-TS29_data.out: ['Normality_1', 'YAHOO']
MBA_ECG14046_data_11.out+M-6.test.out+YahooA3Benchmark-TS29_data.out: ['Normality_3', 'ECG', 'NASA-MSL', 'YAHOO']
A-8.train.out: ['Normality_1', 'NASA-SMAP']
15_Lev_fault_Temp_corr_seed_49_vars_23.test.csv@8.out+039_UCR_Anomaly_DISTORTEDLab2Cmac011215EPG3_5000_16390_16420.out+101-freeway-traffic.test.out: ['Normality_3', 'GHL', 'KDD21', 'Dodgers']
NAB_data_Traffic_5.out+9.test.out: ['Normality_2', 'NAB', 'MGAB']
MBA_ECG14046_data_37.out+KPI-ffb82d38-5f00-37db-abc0-5d2e4e4cb6aa.test.out+101-freeway-traffic.test.out: ['Normality_3', 'ECG', 'IOPS', 'Dodgers']
221.test.csv@1.out+M-3.train.out+3.test.out: ['Normality_3', 'MITDB', 'NASA-MSL', 'MGAB']
M-6.test.out

In [27]:
def create_files_and_new_dict(predefined_shuffled_dict, base_folder):
    new_dict = {}
    id_counter = 1

    for key, value in predefined_shuffled_dict.items():
        new_filename = f"ts{id_counter}"
        new_filepath = os.path.join('Time-Series-Data-Files/', new_filename)
        
        if value[0] == "Normality_1":
            subfolder = value[1]
            original_filepath = os.path.join(base_folder, subfolder, key)
            with open(original_filepath, 'r') as original_file:
                content = original_file.read()
            with open(new_filepath, 'w') as new_file:
                new_file.write(content)
        
        elif value[0] == "Normality_2":
            subfolder1, subfolder2 = value[1], value[2]
            file1, file2 = key.split('+')
            filepath1 = os.path.join(base_folder, subfolder1, file1)
            filepath2 = os.path.join(base_folder, subfolder2, file2)
            with open(filepath1, 'r') as f1, open(filepath2, 'r') as f2:
                content = f1.read() + f2.read()
            with open(new_filepath, 'w') as new_file:
                new_file.write(content)
        
        elif value[0] == "Normality_3":
            subfolder1, subfolder2, subfolder3 = value[1], value[2], value[3]
            file1, file2, file3 = key.split('+')
            filepath1 = os.path.join(base_folder, subfolder1, file1)
            filepath2 = os.path.join(base_folder, subfolder2, file2)
            filepath3 = os.path.join(base_folder, subfolder3, file3)
            with open(filepath1, 'r') as f1, open(filepath2, 'r') as f2, open(filepath3, 'r') as f3:
                content = f1.read() + f2.read() + f3.read()
            with open(new_filepath, 'w') as new_file:
                new_file.write(content)
        
        new_dict[new_filename] = value
        id_counter += 1
    
    return new_dict

In [28]:
# Create files and the new dictionary
new_files_dict = create_files_and_new_dict(predefined_shuffled_dict, '../TSB-UAD-Public/TSB-UAD-Public')

In [29]:
# Print the new dictionary
print("New Dictionary with Generated Files:")
for key, value in new_files_dict.items():
    print(f"{key}: {value}")

New Dictionary with Generated Files:
ts1: ['Normality_2', 'NASA-SMAP', 'OPPORTUNITY']
ts2: ['Normality_3', 'KDD21', 'Occupancy', 'SVDB']
ts3: ['Normality_1', 'YAHOO']
ts4: ['Normality_3', 'ECG', 'NASA-MSL', 'YAHOO']
ts5: ['Normality_1', 'NASA-SMAP']
ts6: ['Normality_3', 'GHL', 'KDD21', 'Dodgers']
ts7: ['Normality_2', 'NAB', 'MGAB']
ts8: ['Normality_3', 'ECG', 'IOPS', 'Dodgers']
ts9: ['Normality_3', 'MITDB', 'NASA-MSL', 'MGAB']
ts10: ['Normality_2', 'NASA-MSL', 'Genesis']
ts11: ['Normality_1', 'ECG']
ts12: ['Normality_3', 'NAB', 'ECG', 'Genesis']
ts13: ['Normality_2', 'IOPS', 'Dodgers']
ts14: ['Normality_1', 'Daphnet']
ts15: ['Normality_2', 'NAB', 'KDD21']
ts16: ['Normality_2', 'GHL', 'MGAB']
ts17: ['Normality_2', 'ECG', 'Daphnet']
ts18: ['Normality_1', 'Genesis']
ts19: ['Normality_1', 'Genesis']
ts20: ['Normality_2', 'MITDB', 'GHL']
ts21: ['Normality_1', 'SensorScope']
ts22: ['Normality_1', 'SMD']
ts23: ['Normality_3', 'MITDB', 'NAB', 'SensorScope']
ts24: ['Normality_2', 'NASA-SMAP', '

# Non-Streaming Methods
We pick two non-streaming methods of our choice as a baseline (e.g., Isolation Forest and a DNN
method). We run the methods on the files of the dictionary we created in an eariler step. In the no streaming setting we use the entire generated files as input. These methods will serve as our offline baselines.
