In [1]:
import os
import sys
import pandas as pd
import numpy as np
import pickle
import json
from folderconstants import *
from shutil import copyfile

def normalize(a):
	a = a / np.maximum(np.absolute(a.max(axis=0)), np.absolute(a.min(axis=0)))
	return (a / 2 + 0.5)

def normalize2(a, min_a = None, max_a = None):
	if min_a is None: min_a, max_a = min(a), max(a)
	return (a - min_a) / (max_a - min_a), min_a, max_a

def normalize2_1(a, min_a = None, max_a = None):
	if min_a is None: 
		min_a, max_a = np.min(a, axis=0), np.max(a, axis=0)
	return (a - min_a) / (max_a - min_a), min_a, max_a

def normalize3(a, min_a = None, max_a = None):
	if min_a is None: min_a, max_a = np.min(a, axis = 0), np.max(a, axis = 0)
	return (a - min_a) / (max_a - min_a + 0.0001), min_a, max_a

def convertNumpy(df):
	x = df[df.columns[3:]].values[::10, :]
	return (x - x.min(0)) / (x.ptp(0) + 1e-4)

In [2]:
def load_and_save(category, filename, dataset, dataset_folder):
    temp = np.genfromtxt(os.path.join(dataset_folder, category, filename),
                         dtype=np.float64,
                         delimiter=',')
    print(dataset, category, filename, temp.shape)
    np.save(os.path.join(output_folder, f"SMD/{dataset}_{category}.npy"), temp)
    return temp.shape

def load_and_save2(category, filename, dataset, dataset_folder, shape):
	temp = np.zeros(shape)
	with open(os.path.join(dataset_folder, 'interpretation_label', filename), "r") as f:
		ls = f.readlines()
	for line in ls:
		pos, values = line.split(':')[0], line.split(':')[1].split(',')
		start, end, indx = int(pos.split('-')[0]), int(pos.split('-')[1]), [int(i)-1 for i in values]
		temp[start-1:end-1, indx] = 1
	print(dataset, category, filename, temp.shape)
	np.save(os.path.join(output_folder, f"SMD/{dataset}_{category}.npy"), temp)

In [None]:
def load_data(dataset):

	folder = os.path.join(output_folder, dataset) # /processed/SMD
	os.makedirs(folder, exist_ok=True)

	if dataset == 'SMD':
		dataset_folder = 'data/SMD'
		file_list = os.listdir(os.path.join(dataset_folder, "train"))
		for filename in file_list:
			if filename.endswith('.txt'):
				load_and_save('train', filename, filename.strip('.txt'), dataset_folder)
				s = load_and_save('test', filename, filename.strip('.txt'), dataset_folder)
				load_and_save2('labels', filename, filename.strip('.txt'), dataset_folder, s)

	elif dataset in ['SMAP', 'MSL']:
		dataset_folder = 'data/SMAP_MSL'
		file = os.path.join(dataset_folder, 'labeled_anomalies.csv')
		values = pd.read_csv(file)
		values = values[values['spacecraft'] == dataset]
		filenames = values['chan_id'].values.tolist()
		for fn in filenames:
			train = np.load(f'{dataset_folder}/train/{fn}.npy')
			test = np.load(f'{dataset_folder}/test/{fn}.npy')
			train, min_a, max_a = normalize3(train)
			test, _, _ = normalize3(test, min_a, max_a)
			np.save(f'{folder}/{fn}_train.npy', train)
			np.save(f'{folder}/{fn}_test.npy', test)
			labels = np.zeros(test.shape)
			indices = values[values['chan_id'] == fn]['anomaly_sequences'].values[0]
			indices = indices.replace(']', '').replace('[', '').split(', ')
			indices = [int(i) for i in indices]
			for i in range(0, len(indices), 2):
				labels[indices[i]:indices[i+1], :] = 1
			np.save(f'{folder}/{fn}_labels.npy', labels)		
			

In [14]:
dataset = 'MSL'
load_data(dataset)

In [15]:
import numpy as np
import pandas as pd
import os

# .npy 파일이 저장된 폴더 경로를 지정해주세요.
dataset = 'SMD'
output_folder = './processed' 

# 각 파일의 전체 경로 생성
train_path = os.path.join(output_folder, dataset, 'machine-1-1_train.npy')
test_path = os.path.join(output_folder, dataset, 'machine-1-1_test.npy')
labels_path = os.path.join(output_folder, dataset, 'machine-1-1_labels.npy')

# .npy 파일 불러오기
train_data = np.load(train_path)
test_data = np.load(test_path)
labels_data = np.load(labels_path)

# Pandas DataFrame으로 변환하여 구조 확인
df_train = pd.DataFrame(train_data)
df_test = pd.DataFrame(test_data)
df_labels = pd.DataFrame(labels_data)

# 각 데이터의 shape과 상위 5개 행 출력
print(f" Train 데이터 shape: {df_train.shape}")
print(df_train.head())

print(f" Test 데이터 shape: {df_test.shape}")
print(df_test.head())

print(f" Labels 데이터 shape: {df_labels.shape}")
print(df_labels.head())

# Labels 데이터에서 실제로 이상(1)으로 표시된 위치 찾아보기
anomaly_points = df_labels[df_labels == 1].stack()

print(anomaly_points.head(10))

 Train 데이터 shape: (28479, 38)
         0         1         2         3    4         5         6    7   \
0  0.032258  0.039195  0.027871  0.024390  0.0  0.915385  0.343691  0.0   
1  0.043011  0.048729  0.033445  0.025552  0.0  0.915385  0.344633  0.0   
2  0.043011  0.034958  0.032330  0.025552  0.0  0.915385  0.344633  0.0   
3  0.032258  0.028602  0.030100  0.024390  0.0  0.912821  0.342750  0.0   
4  0.032258  0.019068  0.026756  0.023229  0.0  0.912821  0.342750  0.0   

         8         9   ...   28        29        30        31   32        33  \
0  0.020011  0.000122  ...  0.0  0.004298  0.029993  0.022131  0.0  0.000045   
1  0.019160  0.001722  ...  0.0  0.004298  0.030041  0.028821  0.0  0.000045   
2  0.020011  0.000122  ...  0.0  0.004298  0.026248  0.021101  0.0  0.000045   
3  0.021289  0.000000  ...  0.0  0.004298  0.030169  0.025733  0.0  0.000022   
4  0.018734  0.000000  ...  0.0  0.004298  0.027240  0.022645  0.0  0.000034   

         34        35   36   37  
0  0

In [16]:
# !pip install gdown
import gdown

train_url = 'https://drive.google.com/uc?id=1IYPnLUjaHMzwefzmxErQpb9UYGW_1dpQ'
train_file = './SWaT_Dataset_Normal_v1.pkl'  
gdown.download(train_url, train_file, quiet=False)

test_url = 'https://drive.google.com/uc?id=17TUUeUgCgEmPoR372yvg41iuOQzAKtUU'
test_file = './SWaT_Dataset_Attack_v0.pkl'  
gdown.download(test_url, test_file, quiet=False)

# 파일 경로 설정
train_path = './data/SWaT/SWaT.A1 _ A2_Dec 2015/Physical/SWaT_Dataset_Normal_v1.pkl'
test_path = './data/SWaT/SWaT.A1 _ A2_Dec 2015/Physical/SWaT_Dataset_Attack_v0.pkl'

Downloading...
From (original): https://drive.google.com/uc?id=1IYPnLUjaHMzwefzmxErQpb9UYGW_1dpQ
From (redirected): https://drive.google.com/uc?id=1IYPnLUjaHMzwefzmxErQpb9UYGW_1dpQ&confirm=t&uuid=fcf45a6b-55fa-41f6-882f-36fb2201c584
To: d:\EUV_Anomaly_Detection\SWaT_Dataset_Normal_v1.pkl
100%|██████████| 217M/217M [00:07<00:00, 27.5MB/s] 
Downloading...
From (original): https://drive.google.com/uc?id=17TUUeUgCgEmPoR372yvg41iuOQzAKtUU
From (redirected): https://drive.google.com/uc?id=17TUUeUgCgEmPoR372yvg41iuOQzAKtUU&confirm=t&uuid=8bf55467-5de0-42cb-8e13-b515bdba0ce7
To: d:\EUV_Anomaly_Detection\SWaT_Dataset_Attack_v0.pkl
100%|██████████| 197M/197M [00:06<00:00, 31.0MB/s] 


In [17]:
# 검증 데이터 분할 비율
valid_split_rate = 0.8

# 훈련 데이터셋 로드 및 전처리
trainset = pd.read_pickle(train_path).drop(['Normal/Attack', ' Timestamp'], axis=1)
valid_split_index = int(len(trainset) * valid_split_rate)
validset = trainset.iloc[valid_split_index:].to_numpy()  # 검증 데이터셋
trainset = trainset.iloc[:valid_split_index].to_numpy()  # 훈련 데이터셋
train_timestamp = np.arange(len(trainset))  # 훈련 데이터 타임스탬프
valid_timestamp = np.arange(len(validset))  # 검증 데이터 타임스탬프

# 테스트 데이터셋 로드 및 전처리
testset = pd.read_pickle(test_path)
test_timestamp = np.arange(len(testset))  # 테스트 데이터 타임스탬프
test_label = testset['Normal/Attack'].copy()
test_label[test_label == 'Normal'] = 0
test_label[test_label != 0] = 1
testset = testset.drop(['Normal/Attack', ' Timestamp'], axis=1)
columns = testset.columns.tolist()  # 컬럼 리스트
testset = testset.to_numpy()

# 결과 출력
print(f"Train set size: {len(trainset)}, Validation set size: {len(validset)}, Test set size: {len(testset)}")
print(f"Columns: {columns}")

FileNotFoundError: [Errno 2] No such file or directory: './data/SWaT/SWaT.A1 _ A2_Dec 2015/Physical/SWaT_Dataset_Normal_v1.pkl'