In [1]:
import pickle
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from utils import encode_map
from sklearn.model_selection import train_test_split

In [2]:
def process(df, att_encode_map):
    """Reduce an event-log frame to three renamed, normalised columns.

    Only the ``case``, ``activity`` and ``timestamp`` columns are kept. They
    are renamed to ``CaseID``, ``ActivityID`` and ``CompleteTimestamp``; the
    timestamps are rewritten as ``YYYY-MM-DD HH:MM:SS`` strings and every
    activity is replaced by its code from ``att_encode_map``.

    Args:
        df: DataFrame containing at least the columns ``case``, ``activity``
            and ``timestamp``. It is not modified.
        att_encode_map: Mapping queried with ``str(activity)`` to obtain the
            activity code.

    Returns:
        A new DataFrame with the columns ``CaseID``, ``ActivityID`` and
        ``CompleteTimestamp``. Activities whose string form is not a key of
        ``att_encode_map`` are encoded as ``-1``.
    """
    def encode_activity(activity):
        # The lookup key is the string form of the activity; -1 marks a miss.
        return att_encode_map.get(str(activity), -1)

    selected = df[["case", "activity", "timestamp"]]
    events = selected.rename(
        columns={
            "case": "CaseID",
            "activity": "ActivityID",
            "timestamp": "CompleteTimestamp",
        }
    )

    # Convert the timestamp column to datetime objects ...
    parsed = pd.to_datetime(events["CompleteTimestamp"])
    # ... then format it back into the required string layout.
    events["CompleteTimestamp"] = parsed.dt.strftime('%Y-%m-%d %H:%M:%S')

    events["ActivityID"] = events["ActivityID"].apply(encode_activity)
    return events


In [3]:

# Event logs to preprocess. Commented-out names are skipped; uncomment an
# entry to include that log in the run.
list_eventlog = [
    'helpdesk',
    # 'bpi13_problems',
    # 'bpi13_closed_problems',
    # 'bpi12_all_complete',
    # 'bpi12w_complete',
    # 'bpic2017_o',
    # 'bpi12_work_all',
    # 'receipt',
    # 'bpic2020',
    # 'bpi13_incidents',
]

# For every log and each of its three folds: read the fold's train/test CSVs,
# shuffle the training cases, cut off the last 20% of rows as validation,
# encode all three parts with one shared activity map, and write a combined
# CSV plus the row positions of each part inside it.
for event_name in list_eventlog:
    for f in tqdm(range(3), desc="Processing fold"):
        # Build the fold-specific path pieces once instead of re-concatenating
        # the same strings for every read and write below.
        fold_name = event_name + "_kfoldcv_" + str(f)
        source_prefix = "raw_dir/three_fold_data/" + event_name + "/" + fold_name
        output_dir = "raw_dir/" + event_name + "/" + fold_name

        df_train = pd.read_csv(source_prefix + "_train.csv",
                               sep=',',
                               header=0, index_col=False)
        df_test = pd.read_csv(source_prefix + "_test.csv",
                              sep=',',
                              header=0, index_col=False)

        # Reseed inside the loop so each fold's shuffle starts from the same
        # generator state, independent of the folds processed before it.
        np.random.seed(133)
        # Shuffle whole cases: the rows of one case stay contiguous and keep
        # their original relative order.
        grouped = df_train.groupby('case')
        new_order = np.random.permutation(list(grouped.groups.keys()))
        new_groups = [grouped.get_group(key) for key in new_order]
        log_shuffled = pd.concat(new_groups, ignore_index=True)

        # NOTE(review): with shuffle=False the cut is made by row position, so
        # the case sitting on the 80% boundary can end up with rows in both
        # train and valid - confirm this is intended.
        train, valid = train_test_split(log_shuffled, test_size=0.2, shuffle=False)

        # One encoding map per fold, built from the activities of all three
        # parts so train, valid and test share the same codes.
        all_df = pd.concat([train, valid, df_test], ignore_index=True)
        att_encode_map = encode_map(set(all_df["activity"].values))
        train_processed = process(train, att_encode_map)
        valid_processed = process(valid, att_encode_map)
        test_processed = process(df_test, att_encode_map)
        all_df_processed = pd.concat([train_processed, valid_processed, test_processed], ignore_index=True)

        # Row positions of each part inside the combined frame
        # (train first, then valid, then test).
        n_train = len(train_processed)
        n_valid = len(valid_processed)
        train_index = list(range(n_train))
        valid_index = list(range(n_train, n_train + n_valid))
        test_index = list(range(n_train + n_valid, len(all_df_processed)))

        # exist_ok avoids the check-then-create race of testing for the
        # directory before making it.
        os.makedirs(output_dir, exist_ok=True)
        all_df_processed.to_csv(output_dir + "/" + event_name + "_all.csv", index=False)

        # NOTE(review): despite the ".npy" extension these files hold pickled
        # Python lists, not NumPy's .npy format; the names are kept unchanged
        # because whatever reads them is not visible here.
        for split_name, split_index in (("train", train_index),
                                        ("valid", valid_index),
                                        ("test", test_index)):
            with open(output_dir + "/" + split_name + "_index.npy", 'wb') as file:
                pickle.dump(split_index, file)

Processing fold: 100%|██████████| 3/3 [00:13<00:00,  4.54s/it]
