DatasetManager_for_colab.py
import sys
import dataset_confs_for_colab as dataset_confs
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from keras_preprocessing.sequence import pad_sequences
from sklearn.utils import resample

class DatasetManager:

    def __init__(self, dataset_name):
        self.dataset_name = dataset_name
        dataset_confs.create_dicts()

        self.case_id_col = dataset_confs.case_id_col[self.dataset_name]
        self.activity_col = dataset_confs.activity_col[self.dataset_name]
        self.timestamp_col = dataset_confs.timestamp_col[self.dataset_name]
        self.label_col = dataset_confs.label_col[self.dataset_name]
        self.pos_label = dataset_confs.pos_label[self.dataset_name]

        self.dynamic_cat_cols = dataset_confs.dynamic_cat_cols[self.dataset_name]
        self.static_cat_cols = dataset_confs.static_cat_cols[self.dataset_name]
        self.dynamic_num_cols = dataset_confs.dynamic_num_cols[self.dataset_name]
        self.static_num_cols = dataset_confs.static_num_cols[self.dataset_name]

        self.sorting_cols = [self.timestamp_col, self.activity_col]

        self.scaler = None
        self.encoded_cols = None
    def read_dataset(self):
        # read dataset: categorical and id columns as strings, numeric columns as floats
        dtypes = {col: "object" for col in self.dynamic_cat_cols + self.static_cat_cols + [self.case_id_col, self.label_col, self.timestamp_col]}
        for col in self.dynamic_num_cols + self.static_num_cols:
            dtypes[col] = "float"

        data = pd.read_csv(dataset_confs.filename[self.dataset_name], sep=";", dtype=dtypes)
        data[self.timestamp_col] = pd.to_datetime(data[self.timestamp_col])
        return data
    def balance_data(self, data):
        # undersample the majority class so that positive and negative cases are equally represented
        y = self.get_label_numeric(data)
        case_ids = self.get_case_ids(data)
        neg_cases = [case_ids[i] for i in range(len(case_ids)) if y[i] == 0]
        pos_cases = [case_ids[i] for i in range(len(case_ids)) if y[i] == 1]
        if len(neg_cases) > len(pos_cases):
            neg_cases = resample(neg_cases, replace=False, n_samples=len(pos_cases))
        elif len(neg_cases) < len(pos_cases):
            pos_cases = resample(pos_cases, replace=False, n_samples=len(neg_cases))
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        bal_data = pd.concat([data.loc[data[self.case_id_col].isin(neg_cases)],
                              data.loc[data[self.case_id_col].isin(pos_cases)]])
        return bal_data
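
    # Note: resample is called above without a fixed random_state, so the
    # undersampling is non-deterministic across runs; pass random_state to
    # resample if reproducible balancing is needed.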
    def split_data(self, data, train_ratio, split="temporal", seed=22):
        # split into train and test: order cases by start timestamp (temporal) or shuffle them (random)
        grouped = data.groupby(self.case_id_col)
        start_timestamps = grouped[self.timestamp_col].min().reset_index()
        if split == "temporal":
            start_timestamps = start_timestamps.sort_values(self.timestamp_col, ascending=True, kind="mergesort")
        elif split == "random":
            np.random.seed(seed)
            start_timestamps = start_timestamps.reindex(np.random.permutation(start_timestamps.index))
        train_ids = list(start_timestamps[self.case_id_col])[:int(train_ratio * len(start_timestamps))]
        train = data[data[self.case_id_col].isin(train_ids)].sort_values(self.timestamp_col, ascending=True, kind="mergesort")
        test = data[~data[self.case_id_col].isin(train_ids)].sort_values(self.timestamp_col, ascending=True, kind="mergesort")
        return (train, test)
    def split_data_strict(self, data, train_ratio, split="temporal"):
        # split into train and test using temporal split and discard events that overlap the periods
        data = data.sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        grouped = data.groupby(self.case_id_col)
        start_timestamps = grouped[self.timestamp_col].min().reset_index()
        start_timestamps = start_timestamps.sort_values(self.timestamp_col, ascending=True, kind="mergesort")
        train_ids = list(start_timestamps[self.case_id_col])[:int(train_ratio * len(start_timestamps))]
        train = data[data[self.case_id_col].isin(train_ids)].sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        test = data[~data[self.case_id_col].isin(train_ids)].sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        split_ts = test[self.timestamp_col].min()
        train = train[train[self.timestamp_col] < split_ts]
        return (train, test)
    def split_data_discard(self, data, train_ratio, split="temporal"):
        # split into train and test using temporal split and discard entire training cases that overlap the test period
        data = data.sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        grouped = data.groupby(self.case_id_col)
        start_timestamps = grouped[self.timestamp_col].min().reset_index()
        start_timestamps = start_timestamps.sort_values(self.timestamp_col, ascending=True, kind="mergesort")
        train_ids = list(start_timestamps[self.case_id_col])[:int(train_ratio * len(start_timestamps))]
        train = data[data[self.case_id_col].isin(train_ids)].sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        test = data[~data[self.case_id_col].isin(train_ids)].sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        split_ts = test[self.timestamp_col].min()
        overlapping_cases = train[train[self.timestamp_col] >= split_ts][self.case_id_col].unique()
        train = train[~train[self.case_id_col].isin(overlapping_cases)]
        return (train, test)
    def split_val(self, data, val_ratio, split="random", seed=22):
        # split off a validation set: by default the last val_ratio share of randomly permuted cases
        grouped = data.groupby(self.case_id_col)
        start_timestamps = grouped[self.timestamp_col].min().reset_index()
        if split == "temporal":
            start_timestamps = start_timestamps.sort_values(self.timestamp_col, ascending=True, kind="mergesort")
        elif split == "random":
            np.random.seed(seed)
            start_timestamps = start_timestamps.reindex(np.random.permutation(start_timestamps.index))
        val_ids = list(start_timestamps[self.case_id_col])[-int(val_ratio * len(start_timestamps)):]
        val = data[data[self.case_id_col].isin(val_ids)].sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        train = data[~data[self.case_id_col].isin(val_ids)].sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        return (train, val)
    def generate_prefix_data(self, data, min_length, max_length, gap=1):
        # generate prefix data (each possible prefix becomes a trace)
        data['case_length'] = data.groupby(self.case_id_col)[self.activity_col].transform(len)

        dt_prefixes = data[data['case_length'] >= min_length].groupby(self.case_id_col).head(min_length)
        dt_prefixes["prefix_nr"] = 1
        dt_prefixes["orig_case_id"] = dt_prefixes[self.case_id_col]
        for nr_events in range(min_length + gap, max_length + 1, gap):
            tmp = data[data['case_length'] >= nr_events].groupby(self.case_id_col).head(nr_events)
            tmp["orig_case_id"] = tmp[self.case_id_col]
            tmp[self.case_id_col] = tmp[self.case_id_col].apply(lambda x: "%s_%s" % (x, nr_events))
            tmp["prefix_nr"] = nr_events
            dt_prefixes = pd.concat([dt_prefixes, tmp], axis=0)

        dt_prefixes['case_length'] = dt_prefixes['case_length'].apply(lambda x: min(max_length, x))

        return dt_prefixes
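
    # Illustration (hypothetical case): with min_length=2, max_length=4, gap=1,
    # a case "A" of 4 events yields three prefix traces. The 2-event prefix keeps
    # the original case id "A" (prefix_nr=1); the 3- and 4-event prefixes get the
    # derived case ids "A_3" and "A_4" (prefix_nr=3 and 4). The original id is
    # preserved in "orig_case_id" for all of them.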
    def get_pos_case_length_quantile(self, data, quantile=0.90):
        return int(np.ceil(data[data[self.label_col] == self.pos_label].groupby(self.case_id_col).size().quantile(quantile)))

    def get_indexes(self, data):
        return data.groupby(self.case_id_col).first().index

    def get_max_case_length(self, data):
        # length of the longest positive-labeled case
        return data[data[self.label_col] == self.pos_label].groupby(self.case_id_col).size().max()

    def get_relevant_data_by_indexes(self, data, indexes):
        return data[data[self.case_id_col].isin(indexes)]

    def get_label(self, data):
        return data.groupby(self.case_id_col).first()[self.label_col]

    def get_prefix_lengths(self, data):
        return data.groupby(self.case_id_col).last()["prefix_nr"]

    def get_case_ids(self, data, nr_events=1):
        case_ids = pd.Series(data.groupby(self.case_id_col).first().index)
        if nr_events > 1:
            # for prefix data, strip the "_<nr_events>" suffix to recover the original case id
            case_ids = case_ids.apply(lambda x: "_".join(x.split("_")[:-1]))
        return case_ids

    def get_label_numeric(self, data):
        y = self.get_label(data)  # one row per case
        return [1 if label == self.pos_label else 0 for label in y]

    def get_class_ratio(self, data):
        class_freqs = data[self.label_col].value_counts()
        return class_freqs[self.pos_label] / class_freqs.sum()
    def get_stratified_split_generator(self, data, n_splits=5, shuffle=True, random_state=22):
        grouped_firsts = data.groupby(self.case_id_col, as_index=False).first()
        skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

        for train_index, test_index in skf.split(grouped_firsts, grouped_firsts[self.label_col]):
            current_train_names = grouped_firsts[self.case_id_col][train_index]
            train_chunk = data[data[self.case_id_col].isin(current_train_names)].sort_values(self.timestamp_col, ascending=True, kind="mergesort")
            test_chunk = data[~data[self.case_id_col].isin(current_train_names)].sort_values(self.timestamp_col, ascending=True, kind="mergesort")
            yield (train_chunk, test_chunk)

    def get_idx_split_generator(self, dt_for_splitting, n_splits=5, shuffle=True, random_state=22):
        skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

        for train_index, test_index in skf.split(dt_for_splitting, dt_for_splitting[self.label_col]):
            current_train_names = dt_for_splitting[self.case_id_col][train_index]
            current_test_names = dt_for_splitting[self.case_id_col][test_index]
            yield (current_train_names, current_test_names)
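
    # Typical consumption (hypothetical variable names): iterate the generator
    # to obtain case-stratified folds, e.g.
    #     for train_chunk, test_chunk in manager.get_stratified_split_generator(data):
    #         ...  # train and evaluate on each fold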
    def encode_data_for_lstm(self, data):
        data = data.sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        num_cols = self.dynamic_num_cols + self.static_num_cols
        cat_cols = self.dynamic_cat_cols + self.static_cat_cols

        # scale numeric cols (fit the scaler on the first call, reuse it afterwards)
        if self.scaler is None:
            self.scaler = MinMaxScaler()
            dt_all = pd.DataFrame(self.scaler.fit_transform(data[num_cols]), index=data.index, columns=num_cols)
        else:
            dt_all = pd.DataFrame(self.scaler.transform(data[num_cols]), index=data.index, columns=num_cols)

        # one-hot encode categorical cols
        dt_cat = pd.get_dummies(data[cat_cols])

        # merge
        dt_all = pd.concat([dt_all, dt_cat], axis=1)
        dt_all[self.case_id_col] = data[self.case_id_col]
        dt_all[self.label_col] = data[self.label_col].apply(lambda x: 1 if x == self.pos_label else 0)
        dt_all[self.timestamp_col] = data[self.timestamp_col]

        # add missing columns if necessary
        if self.encoded_cols is None:
            self.encoded_cols = dt_all.columns
        else:
            for col in self.encoded_cols:
                if col not in dt_all.columns:
                    dt_all[col] = 0

        return dt_all[self.encoded_cols]
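
    # Note: the encoded frame ends with [case_id_col, label_col, timestamp_col],
    # in that order. The 3D builders below rely on this layout when they strip
    # the last three columns via data.shape[1] - 3 and [..., :-3].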
    def generate_3d_data(self, data, max_len):
        data = data.sort_values(self.timestamp_col, ascending=True, kind="mergesort").groupby(self.case_id_col).head(max_len)
        grouped = data.sort_values(self.timestamp_col, ascending=True, kind="mergesort").groupby(self.case_id_col)

        data_dim = data.shape[1] - 3  # drop case id, label and timestamp columns
        n_cases = data.shape[0]  # one instance per prefix, i.e. per event row

        X = np.zeros((n_cases, max_len, data_dim), dtype=np.float32)
        y = np.zeros((n_cases, 2), dtype=np.float32)
        case_ids = []
        idx = 0

        # each prefix will be a separate instance, left-padded to max_len
        for case_id, group in grouped:
            group = group.sort_values(self.timestamp_col, ascending=True, kind="mergesort")
            label = group[self.label_col].iloc[0]  # already 0/1 in the LSTM-encoded frame
            group = group.values
            for i in range(1, len(group) + 1):
                X[idx] = pad_sequences(group[np.newaxis, :i, :-3], maxlen=max_len, dtype=np.float32)
                y[idx, label] = 1
                case_ids.append(case_id)
                idx += 1

        return (X, y, case_ids)
    def generate_3d_data_for_prefix_length(self, data, max_len, nr_events):
        grouped = data.groupby(self.case_id_col)
        data_dim = data.shape[1] - 3
        n_cases = np.sum(grouped.size() >= nr_events)

        # encode only prefixes of this length
        X = np.zeros((n_cases, max_len, data_dim), dtype=np.float32)
        y = np.zeros((n_cases, 2), dtype=np.float32)
        case_ids = []
        idx = 0
        for case_id, group in grouped:
            if len(group) < nr_events:
                continue
            group = group.sort_values(self.timestamp_col, ascending=True, kind="mergesort")
            label = group[self.label_col].iloc[0]
            group = group.values
            X[idx] = pad_sequences(group[np.newaxis, :nr_events, :-3], maxlen=max_len, dtype=np.float32)
            y[idx, label] = 1
            case_ids.append(case_id)
            idx += 1

        return (X, y, case_ids)
    def get_lstm_encoded_cols(self, max_len):
        # encode the full dataset once if the encoded columns are not known yet;
        # the original guard (np.any(self.encoded_cols)) was inverted and crashed
        # on self.encoded_cols[:-3] when encoded_cols was still None
        if self.encoded_cols is None:
            self.encode_data_for_lstm(self.read_dataset())
        features = self.encoded_cols[:-3]
        feature_names = []
        for i in range(max_len):
            end = "_pref_" + str(i)
            row = [feature + end for feature in features]
            feature_names.append(row)
        return feature_names
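

# --- Minimal usage sketch ---
# A hypothetical example of how DatasetManager is typically driven. The dataset
# key "bpic2011_f1" is an assumption and must match a name registered in
# dataset_confs_for_colab; adjust it to your configuration.
if __name__ == "__main__":
    dm = DatasetManager("bpic2011_f1")  # assumed dataset key
    data = dm.read_dataset()

    # temporal 80/20 split, dropping training events past the test-period start
    train, test = dm.split_data_strict(data, train_ratio=0.8)

    # cut sequences at the 90th percentile length of positive cases
    max_len = dm.get_pos_case_length_quantile(data, quantile=0.90)

    # fit the scaler and one-hot columns on train, then reuse them on test
    dt_train = dm.encode_data_for_lstm(train)
    dt_test = dm.encode_data_for_lstm(test)

    X_train, y_train, _ = dm.generate_3d_data(dt_train, max_len)
    X_test, y_test, _ = dm.generate_3d_data(dt_test, max_len)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)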