In [1]:
import os, sys
import argparse
import pandas as pd
import numpy as np
import pickle

In [2]:
from dateutil.parser import parse
from datetime import datetime
import time
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
pd.options.mode.chained_assignment = None #to run loop quicker without warnings

In [3]:
#name = 'bpi_2012'
#name = 'bpi_2013'
name = 'small_log'
#name = 'large_log'


args = {
    'data_dir': '../data/',
    'data_file': name + '.csv',
    'input_dir': '../input/{}/'.format(name),  
    'train_pct': 0.6,
    'val_pct': 0.2,
    'anomaly_pct': 0.1,
    'scaler': 'standardization', 
}

args = argparse.Namespace(**args)

In [4]:
if not os.path.isdir('../input/'):
    os.makedirs('../input/')
    
if not os.path.isdir(args.input_dir):
    os.makedirs(args.input_dir)

In [5]:
sys.path.insert(0, './../utils/')
from utils import *

# Load data

In [6]:
# Only consider Case, Activity, Timestamp
cols = ['CaseID', 'Activity', 'CompleteTimestamp']

# For Timestamp: Convert to time
data = pd.read_csv(args.data_dir + args.data_file, usecols=['Case ID', 'Activity', 'Complete Timestamp'])
data['Case ID'] = data['Case ID'].apply(lambda x: x.split(' ')[1])
    

# Format for each column     
data.columns = cols
data['CompleteTimestamp'] = pd.to_datetime(data['CompleteTimestamp'], errors='coerce')
data['CaseID'] = data['CaseID'].apply(pd.to_numeric)

In [7]:
data.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp
0,1,Activity A,1970-01-01 09:00:00
1,1,Activity B,1970-01-01 10:00:00
2,1,Activity C,1970-01-01 11:00:00
3,1,Activity D,1970-01-01 12:00:00
4,1,Activity E,1970-01-01 13:00:00


In [8]:
#Calculate duration and cumulative duration
groupByCase = data.groupby(['CaseID'])
duration_df = pd.DataFrame(pd.DataFrame(columns=list(data)+['Duration', 'CumDuration']))
                           
for case, group in groupByCase:
    group = calculateDuration(group)
    group = calculateCumDuration(group)
    group['Duration'] = group['Duration'].apply(convert2seconds)
    group['CumDuration'] = group['CumDuration'].apply(convert2seconds)
    duration_df = duration_df.append(group)

In [9]:
duration_df.head(10)

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration
0,1,Activity A,1970-01-01 09:00:00,0.0,0.0
1,1,Activity B,1970-01-01 10:00:00,3600.0,3600.0
2,1,Activity C,1970-01-01 11:00:00,3600.0,7200.0
3,1,Activity D,1970-01-01 12:00:00,3600.0,10800.0
4,1,Activity E,1970-01-01 13:00:00,3600.0,14400.0
5,1,Activity K,1970-01-01 14:00:00,3600.0,18000.0
6,1,Activity G,1970-01-01 14:00:00,0.0,18000.0
7,1,Activity M,1970-01-01 14:00:00,0.0,18000.0
8,1,Activity L,1970-01-01 15:00:00,3600.0,21600.0
9,1,Activity I,1970-01-01 15:00:00,0.0,21600.0


In [10]:
#get statistics storage for activity
groupByActivity = duration_df.groupby(['Activity'])
statistics_storage = {}

for act, act_data in groupByActivity:
    act_storage = {}
    act_storage[act] = {}
    mean_value = act_data['Duration'].mean()
    std_value = act_data['Duration'].std()
    act_storage[act]['mean'] = mean_value
    act_storage[act]['std'] = std_value
    statistics_storage.update(act_storage)

In [11]:
print('Descriptive statistics: \n{}'.format(statistics_storage))

Descriptive statistics: 
{'Activity N': {'std': 1706.9900842048808, 'mean': 1227.5999999999999}, 'Activity D': {'std': 0.0, 'mean': 3600.0}, 'Activity C': {'std': 0.0, 'mean': 3600.0}, 'Activity M': {'std': 1706.9900842048808, 'mean': 1227.5999999999999}, 'Activity B': {'std': 0.0, 'mean': 3600.0}, 'Activity E': {'std': 0.0, 'mean': 3600.0}, 'Activity J': {'std': 1366.6816269104766, 'mean': 628.20000000000005}, 'Activity A': {'std': 0.0, 'mean': 0.0}, 'Activity I': {'std': 1303.1826973111965, 'mean': 558.0}, 'Activity F': {'std': 0.0, 'mean': 3600.0}, 'Activity K': {'std': 1692.537119519237, 'mean': 1186.2}, 'Activity G': {'std': 1692.537119519237, 'mean': 1186.2}, 'Activity L': {'std': 1692.537119519237, 'mean': 1186.2}, 'Activity H': {'std': 0.0, 'mean': 3600.0}}


In [12]:
act_list = data['Activity'].unique()
print('Activity: {}'.format(act_list))

Activity: ['Activity A' 'Activity B' 'Activity C' 'Activity D' 'Activity E'
 'Activity K' 'Activity G' 'Activity M' 'Activity L' 'Activity I'
 'Activity J' 'Activity N' 'Activity H' 'Activity F']


# Introduce anomalous data

In [13]:
data.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp
0,1,Activity A,1970-01-01 09:00:00
1,1,Activity B,1970-01-01 10:00:00
2,1,Activity C,1970-01-01 11:00:00
3,1,Activity D,1970-01-01 12:00:00
4,1,Activity E,1970-01-01 13:00:00


In [14]:
duration_df.head(10)

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration
0,1,Activity A,1970-01-01 09:00:00,0.0,0.0
1,1,Activity B,1970-01-01 10:00:00,3600.0,3600.0
2,1,Activity C,1970-01-01 11:00:00,3600.0,7200.0
3,1,Activity D,1970-01-01 12:00:00,3600.0,10800.0
4,1,Activity E,1970-01-01 13:00:00,3600.0,14400.0
5,1,Activity K,1970-01-01 14:00:00,3600.0,18000.0
6,1,Activity G,1970-01-01 14:00:00,0.0,18000.0
7,1,Activity M,1970-01-01 14:00:00,0.0,18000.0
8,1,Activity L,1970-01-01 15:00:00,3600.0,21600.0
9,1,Activity I,1970-01-01 15:00:00,0.0,21600.0


In [15]:
anomaly_num = int(data.shape[0]*(data.shape[1]-1)*args.anomaly_pct)
anomalous_act_num = int(anomaly_num/2)
anomalous_time_num = anomaly_num - anomalous_act_num

print('Number of anomalous values: {}'.format(anomaly_num))
print('Number of anomalous activities: {}'.format(anomalous_act_num))
print('Number of anomalous time: {}'.format(anomalous_time_num))

Number of anomalous values: 5600
Number of anomalous activities: 2800
Number of anomalous time: 2800


## Activity

**Mutation:**
- Replace an activity by another

In [16]:
temp_act_df = pd.DataFrame({'Activity': duration_df['Activity'].copy(),
                            'AnomalousActivity': duration_df['Activity'].copy(),
                            'ActivityLabel': 0})

In [17]:
temp_act_df.head()

Unnamed: 0,Activity,ActivityLabel,AnomalousActivity
0,Activity A,0,Activity A
1,Activity B,0,Activity B
2,Activity C,0,Activity C
3,Activity D,0,Activity D
4,Activity E,0,Activity E


In [18]:
anomalous_act_index = []

while len(anomalous_act_index) < anomalous_act_num:
    row = np.random.randint(0, temp_act_df.shape[0])
    idx = np.random.randint(0, len(act_list)-1)
    if row not in anomalous_act_index:
        anomalous_act_index.append(row)
        act = temp_act_df.loc[row, 'Activity']
        anomalous_act_list = [i for i in act_list if i != act]
        anomalous_act = anomalous_act_list[idx]                  
        temp_act_df.loc[row, 'AnomalousActivity'] = anomalous_act
        temp_act_df.loc[row, 'ActivityLabel'] = 1

In [19]:
temp_act_df.head(20)

Unnamed: 0,Activity,ActivityLabel,AnomalousActivity
0,Activity A,0,Activity A
1,Activity B,0,Activity B
2,Activity C,0,Activity C
3,Activity D,1,Activity A
4,Activity E,0,Activity E
5,Activity K,0,Activity K
6,Activity G,0,Activity G
7,Activity M,0,Activity M
8,Activity L,0,Activity L
9,Activity I,0,Activity I


In [20]:
temp_act = temp_act_df[['AnomalousActivity', 'ActivityLabel']]

In [21]:
temp_act.head()

Unnamed: 0,AnomalousActivity,ActivityLabel
0,Activity A,0
1,Activity B,0
2,Activity C,0
3,Activity A,1
4,Activity E,0


## Time

**Mutation:**
- Extreme duration

In [22]:
temp_time_df = duration_df.copy()
temp_time_df['AnomalousDuration'] = temp_time_df['Duration'].copy()
temp_time_df['TimeLabel'] = 0

In [23]:
temp_time_df.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel
0,1,Activity A,1970-01-01 09:00:00,0.0,0.0,0.0,0
1,1,Activity B,1970-01-01 10:00:00,3600.0,3600.0,3600.0,0
2,1,Activity C,1970-01-01 11:00:00,3600.0,7200.0,3600.0,0
3,1,Activity D,1970-01-01 12:00:00,3600.0,10800.0,3600.0,0
4,1,Activity E,1970-01-01 13:00:00,3600.0,14400.0,3600.0,0


In [24]:
#get anomalous duration
anomalous_time_index = []

while len(anomalous_time_index) < anomalous_time_num:
    row = np.random.randint(0, temp_time_df.shape[0])
    if row not in anomalous_time_index:
        anomalous_time_index.append(row)
        act = temp_time_df.loc[row, 'Activity']
        if act != 'A_SUBMITTED-COMPLETE' and act != 'Activity A':
            anomalous_value = (np.random.random_sample() + 1)*(statistics_storage[act]['mean'] + statistics_storage[act]['std'])
            temp_time_df.loc[row, 'AnomalousDuration'] = anomalous_value
            temp_time_df.loc[row, 'TimeLabel'] = 1

In [25]:
temp_time_df.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel
0,1,Activity A,1970-01-01 09:00:00,0.0,0.0,0.0,0
1,1,Activity B,1970-01-01 10:00:00,3600.0,3600.0,3766.049539,1
2,1,Activity C,1970-01-01 11:00:00,3600.0,7200.0,4285.335178,1
3,1,Activity D,1970-01-01 12:00:00,3600.0,10800.0,3600.0,0
4,1,Activity E,1970-01-01 13:00:00,3600.0,14400.0,3600.0,0


In [26]:
#get anomalous cumulative duration
temp_cum_time_df = pd.DataFrame(columns=list(temp_time_df)+['AnomalousCompleteTimestamp'])
groupByCase = temp_time_df.groupby(['CaseID'])

for case, group in groupByCase:
    group['AnomalousCompleteTimestamp'] = group['CompleteTimestamp'].copy()
    if group['TimeLabel'].sum() > 0:
        for row in range(group.shape[0]-1):
            previous_timestamp = group['CompleteTimestamp'].iloc[row]
            current_duration = group['AnomalousDuration'].iloc[row+1]
            current_timestamp = previous_timestamp + timedelta(seconds=current_duration)
            group['AnomalousCompleteTimestamp'].iloc[row+1] = current_timestamp
    temp_cum_time_df = temp_cum_time_df.append(group)

In [27]:
temp_cum_time_df.head(20)

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel,AnomalousCompleteTimestamp
0,1,Activity A,1970-01-01 09:00:00,0.0,0.0,0.0,0,1970-01-01 09:00:00.000000
1,1,Activity B,1970-01-01 10:00:00,3600.0,3600.0,3766.049539,1,1970-01-01 10:02:46.049539
2,1,Activity C,1970-01-01 11:00:00,3600.0,7200.0,4285.335178,1,1970-01-01 11:11:25.335178
3,1,Activity D,1970-01-01 12:00:00,3600.0,10800.0,3600.0,0,1970-01-01 12:00:00.000000
4,1,Activity E,1970-01-01 13:00:00,3600.0,14400.0,3600.0,0,1970-01-01 13:00:00.000000
5,1,Activity K,1970-01-01 14:00:00,3600.0,18000.0,3600.0,0,1970-01-01 14:00:00.000000
6,1,Activity G,1970-01-01 14:00:00,0.0,18000.0,0.0,0,1970-01-01 14:00:00.000000
7,1,Activity M,1970-01-01 14:00:00,0.0,18000.0,0.0,0,1970-01-01 14:00:00.000000
8,1,Activity L,1970-01-01 15:00:00,3600.0,21600.0,3600.0,0,1970-01-01 15:00:00.000000
9,1,Activity I,1970-01-01 15:00:00,0.0,21600.0,0.0,0,1970-01-01 15:00:00.000000


In [28]:
groupByCase = temp_cum_time_df.groupby(['CaseID'])
temp_time = pd.DataFrame(pd.DataFrame(columns=list(temp_cum_time_df)+['AnomalousCumDuration']))
                           
for case, group in groupByCase:
    group = calculateAnomalousCumDuration(group)
    group['AnomalousCumDuration'] = group['AnomalousCumDuration'].apply(convert2seconds)
    temp_time = temp_time.append(group)

In [29]:
temp_time.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel,AnomalousCompleteTimestamp,AnomalousCumDuration
0,1,Activity A,1970-01-01 09:00:00,0.0,0.0,0.0,0,1970-01-01 09:00:00.000000,0.0
1,1,Activity B,1970-01-01 10:00:00,3600.0,3600.0,3766.049539,1,1970-01-01 10:02:46.049539,3766.049539
2,1,Activity C,1970-01-01 11:00:00,3600.0,7200.0,4285.335178,1,1970-01-01 11:11:25.335178,7885.335178
3,1,Activity D,1970-01-01 12:00:00,3600.0,10800.0,3600.0,0,1970-01-01 12:00:00.000000,10800.0
4,1,Activity E,1970-01-01 13:00:00,3600.0,14400.0,3600.0,0,1970-01-01 13:00:00.000000,14400.0


## Get full df

In [30]:
full_df = pd.concat([temp_time, temp_act], axis=1)

In [31]:
full_df.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel,AnomalousCompleteTimestamp,AnomalousCumDuration,AnomalousActivity,ActivityLabel
0,1,Activity A,1970-01-01 09:00:00,0.0,0.0,0.0,0,1970-01-01 09:00:00.000000,0.0,Activity A,0
1,1,Activity B,1970-01-01 10:00:00,3600.0,3600.0,3766.049539,1,1970-01-01 10:02:46.049539,3766.049539,Activity B,0
2,1,Activity C,1970-01-01 11:00:00,3600.0,7200.0,4285.335178,1,1970-01-01 11:11:25.335178,7885.335178,Activity C,0
3,1,Activity D,1970-01-01 12:00:00,3600.0,10800.0,3600.0,0,1970-01-01 12:00:00.000000,10800.0,Activity A,1
4,1,Activity E,1970-01-01 13:00:00,3600.0,14400.0,3600.0,0,1970-01-01 13:00:00.000000,14400.0,Activity E,0


In [32]:
normal_df = full_df[['CaseID', 'Activity', 'CompleteTimestamp', 'Duration', 'CumDuration']]
anomalous_df = full_df[['CaseID', 'AnomalousActivity', 'AnomalousCompleteTimestamp', 'AnomalousDuration', 
                        'AnomalousCumDuration', 'ActivityLabel', 'TimeLabel']]

In [33]:
print('Saving dataframes...')
normal_df_name = os.path.join(args.input_dir, 'normal_df_{}.csv'.format(args.anomaly_pct))
normal_df.to_csv(normal_df_name, index=False)

anomalous_df_name = os.path.join(args.input_dir, 'anomolous_df_{}.csv'.format(args.anomaly_pct))
anomalous_df.to_csv(anomalous_df_name, index=False)
print('Done!')

Saving dataframes...
Done!


# Preprocess data

In [34]:
groupByCase = anomalous_df.groupby(['CaseID'])

# Split: 60% train, 20% validate, 20% test
train_case_num = int(len(groupByCase)*args.train_pct)
val_case_num = int(len(groupByCase)*args.val_pct)
test_case_num = len(groupByCase) - train_case_num - val_case_num

In [35]:
anomalous_df_train = pd.DataFrame(columns=list(anomalous_df))
anomalous_df_val = pd.DataFrame(columns=list(anomalous_df))
anomalous_df_test = pd.DataFrame(columns=list(anomalous_df))

for caseid, data_case in groupByCase:
    if caseid <= train_case_num:
        anomalous_df_train = anomalous_df_train.append(data_case)
    elif train_case_num < caseid <= (train_case_num+val_case_num):
        anomalous_df_val = anomalous_df_val.append(data_case)
    else:
        anomalous_df_test = anomalous_df_test.append(data_case)

In [36]:
print('Checking shapes of sub data: ', anomalous_df.shape[0] == anomalous_df_train.shape[0] + anomalous_df_val.shape[0] + anomalous_df_test.shape[0])

Checking shapes of sub data:  True


In [37]:
train_row_num = anomalous_df_train.shape[0]
val_row_num = anomalous_df_val.shape[0]
test_row_num = anomalous_df_test.shape[0]

print('Number of rows for training: {}'.format(train_row_num))
print('Number of rows for val: {}'.format(val_row_num))
print('Number of rows for testing: {}'.format(test_row_num))

Number of rows for training: 16800
Number of rows for val: 5600
Number of rows for testing: 5600


In [38]:
print('Number of anomalous values in train set: {}'.format(anomalous_df_train['ActivityLabel'].sum() + anomalous_df_train['TimeLabel'].sum()))
print('Number of anomalous activities in train set: {}'.format(anomalous_df_train['ActivityLabel'].sum()))
print('Number of anomalous time in train set: {}'.format(anomalous_df_train['TimeLabel'].sum()))
print('\n')
print('Number of anomalous values in validate set: {}'.format(anomalous_df_val['ActivityLabel'].sum() + anomalous_df_val['TimeLabel'].sum()))
print('Number of anomalous activities in validate set: {}'.format(anomalous_df_val['ActivityLabel'].sum()))
print('Number of anomalous time in validate set: {}'.format(anomalous_df_val['TimeLabel'].sum()))
print('\n')
print('Number of anomalous values in test set: {}'.format(anomalous_df_test['ActivityLabel'].sum() + anomalous_df_test['TimeLabel'].sum()))
print('Number of anomalous activities in test set: {}'.format(anomalous_df_test['ActivityLabel'].sum()))
print('Number of anomalous time in test set: {}'.format(anomalous_df_test['TimeLabel'].sum()))

Number of anomalous values in train set: 3255
Number of anomalous activities in train set: 1683
Number of anomalous time in train set: 1572


Number of anomalous values in validate set: 1048
Number of anomalous activities in validate set: 532
Number of anomalous time in validate set: 516


Number of anomalous values in test set: 1099
Number of anomalous activities in test set: 585
Number of anomalous time in test set: 514


# Prepare input

In [39]:
anomalous_df.head()

Unnamed: 0,CaseID,AnomalousActivity,AnomalousCompleteTimestamp,AnomalousDuration,AnomalousCumDuration,ActivityLabel,TimeLabel
0,1,Activity A,1970-01-01 09:00:00.000000,0.0,0.0,0,0
1,1,Activity B,1970-01-01 10:02:46.049539,3766.049539,3766.049539,0,1
2,1,Activity C,1970-01-01 11:11:25.335178,4285.335178,7885.335178,0,1
3,1,Activity A,1970-01-01 12:00:00.000000,3600.0,10800.0,1,0
4,1,Activity E,1970-01-01 13:00:00.000000,3600.0,14400.0,0,0


## Labels

In [40]:
activity_label = anomalous_df['ActivityLabel']
time_label = anomalous_df['TimeLabel']

In [41]:
activity_label_train = activity_label[:train_row_num]
activity_label_val = activity_label[train_row_num:train_row_num+val_row_num]
activity_label_test = activity_label[-test_row_num:]

time_label_train = time_label[:train_row_num]
time_label_val = time_label[train_row_num:train_row_num+val_row_num]
time_label_test = time_label[-test_row_num:]

In [42]:
len(time_label_test)

5600

In [43]:
anomaly = anomalous_df[['CaseID', 'AnomalousActivity', 'AnomalousCumDuration']]

## Activity

In [44]:
cat_var = ['AnomalousActivity']

In [45]:
enc_data = OHE(anomaly, cat_var)

In [46]:
enc_data.head()

Unnamed: 0,CaseID,AnomalousCumDuration,AnomalousActivity_Activity A,AnomalousActivity_Activity B,AnomalousActivity_Activity C,AnomalousActivity_Activity D,AnomalousActivity_Activity E,AnomalousActivity_Activity F,AnomalousActivity_Activity G,AnomalousActivity_Activity H,AnomalousActivity_Activity I,AnomalousActivity_Activity J,AnomalousActivity_Activity K,AnomalousActivity_Activity L,AnomalousActivity_Activity M,AnomalousActivity_Activity N
0,1,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,3766.049539,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,7885.335178,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,1,10800.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,14400.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


## Time

In [47]:
min_value = np.min(enc_data['AnomalousCumDuration'].iloc[:train_row_num])
max_value = np.max(enc_data['AnomalousCumDuration'].iloc[:train_row_num])

In [48]:
print('Min used for normalization: {}'.format(min_value))
print('Max used for normalization: {}'.format(max_value))

Min used for normalization: 0.0
Max used for normalization: 32390.636623000002


In [49]:
mean_value = np.mean(enc_data['AnomalousCumDuration'].iloc[:train_row_num])
std_value = np.std(enc_data['AnomalousCumDuration'].iloc[:train_row_num])

In [50]:
print('Mean used for standardization: {}'.format(mean_value))
print('STD used for standardization: {}'.format(std_value))

Mean used for standardization: 16681.349230560238
STD used for standardization: 8106.74990745998


In [51]:
enc_data['NormalizedCumDuration'] = enc_data['AnomalousCumDuration'].apply(lambda x: (x-min_value)/(max_value-min_value))
enc_data['StandardizedCumDuration'] = enc_data['AnomalousCumDuration'].apply(lambda x: (x-mean_value)/(std_value))

In [52]:
enc_data.head()

Unnamed: 0,CaseID,AnomalousCumDuration,AnomalousActivity_Activity A,AnomalousActivity_Activity B,AnomalousActivity_Activity C,AnomalousActivity_Activity D,AnomalousActivity_Activity E,AnomalousActivity_Activity F,AnomalousActivity_Activity G,AnomalousActivity_Activity H,AnomalousActivity_Activity I,AnomalousActivity_Activity J,AnomalousActivity_Activity K,AnomalousActivity_Activity L,AnomalousActivity_Activity M,AnomalousActivity_Activity N,NormalizedCumDuration,StandardizedCumDuration
0,1,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,-2.057711
1,1,3766.049539,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.11627,-1.593154
2,1,7885.335178,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.243445,-1.085023
3,1,10800.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.33343,-0.725488
4,1,14400.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.444573,-0.281414


In [53]:
if args.scaler == 'standardization':
    scaled_enc_data = enc_data.drop(['AnomalousCumDuration', 'NormalizedCumDuration'], axis=1)
if args.scaler == 'normalization':
    scaled_enc_data = enc_data.drop(['AnomalousCumDuration', 'StandardizedCumDuration'], axis=1)

In [54]:
scaled_enc_data.head()

Unnamed: 0,CaseID,AnomalousActivity_Activity A,AnomalousActivity_Activity B,AnomalousActivity_Activity C,AnomalousActivity_Activity D,AnomalousActivity_Activity E,AnomalousActivity_Activity F,AnomalousActivity_Activity G,AnomalousActivity_Activity H,AnomalousActivity_Activity I,AnomalousActivity_Activity J,AnomalousActivity_Activity K,AnomalousActivity_Activity L,AnomalousActivity_Activity M,AnomalousActivity_Activity N,StandardizedCumDuration
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-2.057711
1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,-1.593154
2,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,-1.085023
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.725488
4,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,-0.281414


## 0-padding

In [55]:
#re arrange cols
cols = list(scaled_enc_data)
cols = ['CaseID', cols[-1]] + cols[1:-1]
scaled_enc_data = scaled_enc_data[cols]

In [56]:
scaled_enc_data.head()

Unnamed: 0,CaseID,StandardizedCumDuration,AnomalousActivity_Activity A,AnomalousActivity_Activity B,AnomalousActivity_Activity C,AnomalousActivity_Activity D,AnomalousActivity_Activity E,AnomalousActivity_Activity F,AnomalousActivity_Activity G,AnomalousActivity_Activity H,AnomalousActivity_Activity I,AnomalousActivity_Activity J,AnomalousActivity_Activity K,AnomalousActivity_Activity L,AnomalousActivity_Activity M,AnomalousActivity_Activity N
0,1,-2.057711,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,-1.593154,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,-1.085023,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,1,-0.725488,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,-0.281414,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [57]:
true_time = scaled_enc_data.iloc[-test_row_num:, 1]
true_act = scaled_enc_data.iloc[-test_row_num:, 2:]

In [58]:
full_true_time = scaled_enc_data.iloc[:, 1]
full_true_act = scaled_enc_data.iloc[:, 2:]

In [59]:
cols = [i for i in list(scaled_enc_data) if i != 'CaseID']
cols

['StandardizedCumDuration',
 'AnomalousActivity_Activity A',
 'AnomalousActivity_Activity B',
 'AnomalousActivity_Activity C',
 'AnomalousActivity_Activity D',
 'AnomalousActivity_Activity E',
 'AnomalousActivity_Activity F',
 'AnomalousActivity_Activity G',
 'AnomalousActivity_Activity H',
 'AnomalousActivity_Activity I',
 'AnomalousActivity_Activity J',
 'AnomalousActivity_Activity K',
 'AnomalousActivity_Activity L',
 'AnomalousActivity_Activity M',
 'AnomalousActivity_Activity N']

In [60]:
pad_index = scaled_enc_data.copy()
pad_index[cols] = 1.0

In [61]:
pad_index.head()

Unnamed: 0,CaseID,StandardizedCumDuration,AnomalousActivity_Activity A,AnomalousActivity_Activity B,AnomalousActivity_Activity C,AnomalousActivity_Activity D,AnomalousActivity_Activity E,AnomalousActivity_Activity F,AnomalousActivity_Activity G,AnomalousActivity_Activity H,AnomalousActivity_Activity I,AnomalousActivity_Activity J,AnomalousActivity_Activity K,AnomalousActivity_Activity L,AnomalousActivity_Activity M,AnomalousActivity_Activity N
0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Vectorize

In [62]:
groupByCase = scaled_enc_data.groupby(['CaseID'])

maxlen = findLongestLength(groupByCase)
print('Maxlen: ', maxlen)

Maxlen:  14


In [63]:
vectorized_data = getInput(groupByCase, cols, maxlen)

pad_index_groupByCase = pad_index.groupby(['CaseID'])
vectorized_pad_index = getInput(pad_index_groupByCase, cols, maxlen)

# Split in to train/val/test

In [64]:
print('Shape of vectorized data: {}'.format(vectorized_data.shape))
print('Shape of vectorized pad index: {}'.format(vectorized_pad_index.shape))
print('\n')
print('Number of case for train: {}'.format(train_case_num))
print('Number of case for validate: {}'.format(val_case_num))
print('Number of case for test: {}'.format(test_case_num))

Shape of vectorized data: (2000, 14, 15)
Shape of vectorized pad index: (2000, 14, 15)


Number of case for train: 1200
Number of case for validate: 400
Number of case for test: 400


In [65]:
input_train = vectorized_data[0:train_case_num]
input_val = vectorized_data[train_case_num:train_case_num+val_case_num]
input_test = vectorized_data[-test_case_num:]

pad_index_train = vectorized_pad_index[0:train_case_num]
pad_index_val = vectorized_pad_index[train_case_num:train_case_num+val_case_num]
pad_index_test = vectorized_pad_index[-test_case_num:]

In [66]:
print('Check shape of input for training: {}'.format(input_train.shape[0]==train_case_num))
print('Check shape of input for validation: {}'.format(input_val.shape[0]==val_case_num))
print('Check shape of input for testing: {}'.format(input_test.shape[0]==test_case_num))

Check shape of input for training: True
Check shape of input for validation: True
Check shape of input for testing: True


# Save data

In [67]:
preprocessed_data_name = os.path.join(args.input_dir, 'preprocessed_data_{}.pkl'.format(args.anomaly_pct))
with open(preprocessed_data_name, 'wb') as f:
    pickle.dump(input_train, f, protocol=2)
    pickle.dump(input_val, f, protocol=2)
    pickle.dump(input_test, f, protocol=2)
    pickle.dump(pad_index_train, f, protocol=2)
    pickle.dump(pad_index_val, f, protocol=2)
    pickle.dump(pad_index_test, f, protocol=2)
    pickle.dump(activity_label_test, f, protocol=2)
    pickle.dump(time_label_test, f, protocol=2)
    pickle.dump(train_case_num, f, protocol=2)
    pickle.dump(val_case_num, f, protocol=2)
    pickle.dump(test_case_num, f, protocol=2)
    pickle.dump(train_row_num, f, protocol=2)
    pickle.dump(val_row_num, f, protocol=2)
    pickle.dump(test_row_num, f, protocol=2)
    pickle.dump(min_value, f, protocol=2)
    pickle.dump(max_value, f, protocol=2)
    pickle.dump(mean_value, f, protocol=2)
    pickle.dump(std_value, f, protocol=2)
    pickle.dump(cols, f, protocol=2)
    pickle.dump(statistics_storage, f, protocol=2)
    pickle.dump(true_time, f, protocol=2)
    pickle.dump(true_act, f, protocol=2)
    pickle.dump(full_true_time, f, protocol=2)
    pickle.dump(full_true_act, f, protocol=2)