In [None]:
import pandas as pd

# Load the raw test/train event tables and the body-predicate lookup table.
test_df = pd.read_csv('StrokeTest.csv')
train_df = pd.read_csv('StrokeTrain.csv')
body_pred_df = pd.read_csv('Stroke_Body_Pred.csv')

# Lookup dictionary: each `body_pred` value -> the `index` value on the same row.
# If a `body_pred` value occurs more than once, the last occurrence wins.
body_pred_to_index = dict(zip(body_pred_df['body_pred'], body_pred_df['index']))

# Helper that resolves one row's predicate name to its body-predicate index.
def find_index(row):
    """Return the entry of ``body_pred_to_index`` for ``row['k']``.

    ``row`` is anything subscriptable by the key ``'k'`` (here: one DataFrame
    row). ``None`` is returned when the predicate name has no entry in the
    module-level ``body_pred_to_index`` dictionary.
    """
    predicate = row['k']
    if predicate in body_pred_to_index:
        return body_pred_to_index[predicate]
    return None

# Add an `index` column to the test and train tables by looking up each row's
# `k` value in body_pred_to_index. Series.map performs the dictionary lookup
# on the whole column at once instead of calling a Python function per row;
# `k` values that have no entry in the dictionary become NaN.
test_df['index'] = test_df['k'].map(body_pred_to_index)
train_df['index'] = train_df['k'].map(body_pred_to_index)

# Persist the annotated tables (the DataFrame's own row labels are not written).
test_df.to_csv('StrokeTestIndex.csv', index=False)
train_df.to_csv('StrokeTrainIndex.csv', index=False)

In [None]:
# Sanity check: are there rows whose `k` is not 'Middle_to_Sever' and whose
# `index` is null?
test_index_df = pd.read_csv('StrokeTestIndex.csv')
train_index_df = pd.read_csv('StrokeTrainIndex.csv')

# Select the rows where `index` is missing while `k` differs from 'Middle_to_Sever'.
test_null_rows = test_index_df.loc[
    test_index_df['k'].ne('Middle_to_Sever') & test_index_df['index'].isna()
]
train_null_rows = train_index_df.loc[
    train_index_df['k'].ne('Middle_to_Sever') & train_index_df['index'].isna()
]

print(f"Test file null rows count: {test_null_rows.shape[0]}")
print(f"Train file null rows count: {train_null_rows.shape[0]}")

In [None]:
import numpy as np
import pandas as pd

# Sentinel time: a very large number stored for every body predicate whose
# event did not occur in a group.
large_number = 1e10

def process_dataframe(file_path, output_path, num_body_predicates=48):
    """Group event rows by head event and save one record per group.

    The CSV at ``file_path`` is scanned in file order using the columns
    ``'id'``, ``'k'``, ``'t'`` and ``'index'``. Rows whose ``k`` is not
    ``'Middle_to_Sever'`` are collected into the current group; every
    ``'Middle_to_Sever'`` row closes the current group. For each non-empty
    closed group one dictionary is produced:

    * ``'id'``: the ``id`` of the first row of the group,
    * ``'body_predicates_time'``: a list of ``num_body_predicates`` values in
      which slot ``int(index)`` holds the ``t`` of the row with that index
      (a later row overwrites an earlier one with the same index) and every
      other slot holds ``large_number``,
    * ``'head_predicate_time'``: a one-element list with the ``t`` of the
      closing ``'Middle_to_Sever'`` row.

    Rows with a missing (NaN) ``index`` cannot be placed in the vector and are
    skipped; the number of skipped rows is printed when it is non-zero.
    Groups in which no slot was filled are dropped. Rows that follow the last
    ``'Middle_to_Sever'`` row are never emitted.

    Args:
        file_path: Path of the input CSV.
        output_path: Path passed to ``np.save`` for the list of records.
        num_body_predicates: Length of each ``body_predicates_time`` list.
            Defaults to 48.

    Returns:
        None. The records are written to ``output_path`` and a summary is
        printed.

    Raises:
        IndexError: If a row's ``index`` is not below ``num_body_predicates``.
    """
    df = pd.read_csv(file_path)

    result = []
    group = []
    skipped_rows = 0
    for _, row in df.iterrows():
        if row['k'] != 'Middle_to_Sever':
            group.append(row)
            continue

        # Head event reached: detach the collected rows and start a fresh
        # group right away, so that no exit path below can leave stale rows
        # behind for the next group.
        pending, group = group, []
        if not pending:
            continue

        # Every slot starts as "event did not occur".
        body_predicates_time = [large_number] * num_body_predicates
        for item in pending:
            if pd.isna(item['index']):
                # int(NaN) would raise; an event without a known index has
                # no slot in the vector.
                skipped_rows += 1
                continue
            body_predicates_time[int(item['index'])] = item['t']

        # A group that filled no slot carries no information; drop it.
        if all(time == large_number for time in body_predicates_time):
            continue

        result.append({
            'id': pending[0]['id'],
            'body_predicates_time': body_predicates_time,
            'head_predicate_time': [row['t']],
        })

    np.save(output_path, result)
    if skipped_rows:
        print(f"{file_path}: skipped {skipped_rows} event rows without a body-predicate index")
    # Report the number of groups.
    print(f"{file_path} 数据集总共有 {len(result)} 组")
    # Show the first two records.
    print(f"前两个结果: {result[:2]}")

# Convert the test set and then the training set into grouped .npy files.
for csv_path, npy_path in (
    ('StrokeTestIndex.csv', 'StrokeTestData.npy'),
    ('StrokeTrainIndex.csv', 'StrokeTrainData.npy'),
):
    process_dataframe(csv_path, npy_path)


In [None]:
import numpy as np

# Load the grouped .npy files. allow_pickle=True is passed because the files
# are expected to hold Python dictionaries (object arrays), which NumPy only
# loads with pickling enabled. Only use this on files you produced yourself.
train_data, test_data = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in ('StrokeTrainData.npy', 'StrokeTestData.npy')
)

def filter_data(data, max_gap=24, missing_time=10000000000.0):
    """Keep the groups whose latest body event is close enough to the head event.

    Each element of ``data`` is expected to be a mapping with the keys
    ``'body_predicates_time'`` (an iterable of times) and
    ``'head_predicate_time'`` (a sequence whose first element is the head
    time). All times are converted with ``float``. Body times equal to
    ``missing_time`` are ignored. A group is kept when::

        max(observed body times) + max_gap >= head time

    A group without any observed body time cannot satisfy the condition and
    is dropped (calling ``max`` on it would otherwise raise ``ValueError``).

    Args:
        data: Iterable of group mappings.
        max_gap: Amount added to the latest body time before comparing it
            with the head time. Defaults to 24 (same unit as the times).
        missing_time: Sentinel value marking a body predicate that did not
            occur. Defaults to 10000000000.0.

    Returns:
        A list with the kept groups, in input order. The group objects
        themselves are not copied.
    """
    filtered_data = []
    for group in data:
        head_predicate_time = float(group['head_predicate_time'][0])

        # Times of the body predicates that actually occurred.
        observed_times = [
            value
            for value in map(float, group['body_predicates_time'])
            if value != missing_time
        ]
        if not observed_times:
            continue

        if max(observed_times) + max_gap >= head_predicate_time:
            filtered_data.append(group)

    return filtered_data

# Run the filter on the training split, then on the test split.
filtered_train_data, filtered_test_data = map(filter_data, (train_data, test_data))

# Write the groups that passed the filter to disk.
np.save('StrokeTrain.npy', filtered_train_data)
np.save('StrokeTest.npy', filtered_test_data)

# Report how many groups each output file contains.
print(f"StrokeTrain.npy 中的组数: {len(filtered_train_data)}")
print(f"StrokeTest.npy 中的组数: {len(filtered_test_data)}")

# Preview the first two groups of each split.
print("StrokeTrain.npy 前两个组:", filtered_train_data[:2])
print("StrokeTest.npy 前两个组:", filtered_test_data[:2])