In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch.utils.data import TensorDataset
import os

In [2]:
# Device selection is currently disabled; the snippet is kept for reference.
# device = torch.device('cuda:1')
# if torch.cuda.is_available():
#     print("Available CUDA devices:", torch.cuda.device_count())
#     for i in range(torch.cuda.device_count()):
#         print(f"Device {i}: {torch.cuda.get_device_name(i)}")
# else:
#     print("CUDA is not available.")

# Number of node ids; construct_missing_data() treats range(node_num) as the
# full set of ids that every kept time step must contain.
node_num = 4634
# Minimum number of rows a time step needs to be kept
# (timestep_construct_optimized() keeps steps with count >= filter_num).
filter_num = 1250
# filter_num = 1500

# Input / output locations, relative to the working directory.
inputdir = '../data/'
spatial_data = '../data/processed/graph_data.npz'
# NOTE: the name is misspelled ("precessed") but is kept unchanged because
# other cells refer to it by this name.
precesseddir = '../data/STGNN_data/'
constructed_dir = '../data/constructed/'
# exist_ok=True replaces the exists()/makedirs() pair, which could fail if the
# directory appeared between the check and the creation.
os.makedirs(constructed_dir, exist_ok=True)

In [3]:
# replace iu_ac with adj_matrix id
def replace_iu_ac(data, graph_path=None):
    """Remap the ``iu_ac`` column of ``data`` through the graph lookup table.

    The lookup is built from two arrays stored in an ``.npz`` archive: entries
    of the archive's ``'values'`` array are the lookup keys and entries of its
    ``'keys'`` array are the replacement values.
    NOTE(review): presumably ``'values'`` holds the raw ``iu_ac`` identifiers
    and ``'keys'`` the adjacency-matrix indices -- confirm against the code
    that writes the archive.

    Args:
        data: DataFrame with an ``iu_ac`` column. It is modified in place.
        graph_path: Path of the ``.npz`` archive. Defaults to the module-level
            ``spatial_data`` path when omitted.

    Returns:
        The same ``data`` object, with ``iu_ac`` replaced. Values that are not
        present in the lookup become NaN (``Series.map`` behaviour).
    """
    if graph_path is None:
        graph_path = spatial_data

    # NOTE(review): allow_pickle=True lets numpy unpickle object arrays, which
    # can execute arbitrary code -- only load archives from a trusted source.
    # The context manager closes the archive's file handle once the arrays
    # have been read (members of an .npz are loaded lazily on access).
    with np.load(graph_path, allow_pickle=True) as graph_data:
        lookup_keys = graph_data['values']
        lookup_values = graph_data['keys']
        iu_ac_to_index = dict(zip(lookup_keys, lookup_values))

    data['iu_ac'] = data['iu_ac'].map(iu_ac_to_index)

    return data

In [4]:
def test_and_add_constructed_features(data):
    data['constructed'] = (data['target'] == -1).astype(int)
    return data

def add_constructed_features(data):
    """Add a ``constructed`` column set to 0 on every row.

    ``data`` is modified in place and also returned.
    """
    data['constructed'] = 0
    return data

In [5]:
from joblib import Parallel, delayed


def construct_missing_data(t_1h, timestep_data, node_num):
    """Build placeholder rows for the node ids absent from one time step.

    Args:
        t_1h: Time-step label. Used as the ``t_1h`` value of the new rows only
            when ``timestep_data`` is empty; otherwise the label is read from
            the frame itself.
        timestep_data: DataFrame with ``iu_ac`` and ``t_1h`` columns holding
            the rows of a single time step.
        node_num: Number of node ids; the full id set is ``range(node_num)``.

    Returns:
        A list of dicts, one per id in ``range(node_num)`` that does not occur
        in ``timestep_data['iu_ac']``, in ascending id order. Each dict has
        ``etat_barre=0``, ``constructed=1`` and ``q=-1``.
    """
    # Reading .iloc[0] on an empty frame raises IndexError, so fall back to the
    # label passed by the caller in that case.
    if len(timestep_data) > 0:
        step_label = timestep_data['t_1h'].iloc[0]
    else:
        step_label = t_1h

    present_ids = set(timestep_data['iu_ac'])
    # Sorted so that the output order does not depend on set iteration order.
    missing_ids = sorted(set(range(node_num)) - present_ids)

    return [
        {
            'iu_ac': iu_ac,
            't_1h': step_label,
            'etat_barre': 0,
            'constructed': 1,
            'q': -1
        }
        for iu_ac in missing_ids
    ]

def timestep_construct_optimized(data):
    """Keep well-populated time steps and pad each one to ``node_num`` rows.

    Steps:
      1. Drop the ``id`` column (if present) and sort ``data`` by
         ``('t_1h', 'iu_ac')`` -- both are done in place on the caller's frame.
      2. Keep only time steps that have at least ``filter_num`` rows.
      3. For every kept time step, call ``construct_missing_data`` (in
         parallel via joblib) to create placeholder rows for the node ids in
         ``range(node_num)`` that are absent from that step.
      4. Mark the original rows with ``constructed = 0``, append the
         placeholder rows and sort the result by ``('t_1h', 'iu_ac')``.

    Reads the module-level ``filter_num`` and ``node_num``.

    Args:
        data: DataFrame with at least ``t_1h`` and ``iu_ac`` columns.

    Returns:
        A new DataFrame containing the kept original rows plus the
        placeholder rows.
    """
    if 'id' in data.columns:
        data.drop('id', axis=1, inplace=True)
    data.sort_values(by=['t_1h', 'iu_ac'], inplace=True)

    counts = data.groupby('t_1h').size()

    valid_time_steps = counts[counts >= filter_num].index
    print(f"数量大于{filter_num}的时间步总数:", valid_time_steps.size)

    valid_rows = data[data['t_1h'].isin(valid_time_steps)]

    print(f"Number of nodes: {node_num}")
    total = node_num*valid_time_steps.size
    print(f"total number of processed data should be: {total}")

    # Build the placeholder rows of every time step in parallel. The frame is
    # split once with groupby instead of re-scanning the whole frame with a
    # boolean mask for each time step.
    per_step_rows = Parallel(n_jobs=-1)(
        delayed(construct_missing_data)(t_1h, step_rows, node_num)
        for t_1h, step_rows in tqdm(
            valid_rows.groupby('t_1h'),
            total=valid_time_steps.size,
            desc="Processing time steps",
        )
    )
    # Flatten the per-step lists into one list of row dicts.
    new_data_rows = [row for step in per_step_rows for row in step]
    new_data = pd.DataFrame(new_data_rows)

    # assign() returns a new frame, so no column is written onto a filtered
    # slice of ``data`` (which would trigger SettingWithCopyWarning).
    observed_rows = valid_rows.assign(constructed=0)

    combined_data = pd.concat([observed_rows, new_data], ignore_index=True)
    combined_data.sort_values(by=['t_1h', 'iu_ac'], inplace=True)

    return combined_data


In [6]:
def to_datetime(data):
    """Convert the ``t_1h`` column of ``data`` to pandas datetimes.

    The column is replaced in place; the same frame is returned.
    """
    parsed = pd.to_datetime(data['t_1h'])
    data['t_1h'] = parsed
    return data

def encode_categorical(data, column_name, num_classes):
    """One-hot encode an integer-coded column.

    Args:
        data: DataFrame containing ``column_name``.
        column_name: Column holding the class indices.
        num_classes: Width of the one-hot encoding.

    Returns:
        Float tensor with one row per row of ``data`` and ``num_classes``
        columns.
    """
    class_ids = torch.tensor(data[column_name].values)
    one_hot = torch.nn.functional.one_hot(class_ids, num_classes=num_classes)
    return one_hot.float()

def process_time_features(data, time_column='t_1h'):
    """Add calendar columns and their sine/cosine encodings to ``data``.

    Adds ``hour``, ``day_of_week`` and ``month`` taken from the datetime
    column ``time_column``, followed by a ``*_sin`` / ``*_cos`` pair for each,
    computed as sin/cos of ``2 * pi * value / period`` with periods 24, 7 and
    12. ``data`` is modified in place and also returned.
    """
    # (new column, datetime accessor attribute, period of the cycle)
    calendar_parts = (
        ('hour', 'hour', 24),
        ('day_of_week', 'weekday', 7),
        ('month', 'month', 12),
    )

    # Extract the raw periodic components first ...
    for column, accessor, _ in calendar_parts:
        data[column] = getattr(data[time_column].dt, accessor)

    # ... then map each one onto the unit circle to capture its periodicity.
    for column, _, period in calendar_parts:
        angle = 2 * np.pi * data[column] / period
        data[f'{column}_sin'] = np.sin(angle)
        data[f'{column}_cos'] = np.cos(angle)

    return data


def data_to_tensor(data, train=1):
    """Turn a prepared DataFrame into a ``TensorDataset``.

    The feature tensor has 12 columns, in this order: ``iu_ac``, ``hour_sin``,
    ``hour_cos``, ``day_of_week_sin``, ``day_of_week_cos``, ``month_sin``,
    ``month_cos``, ``constructed``, followed by the four one-hot columns of
    ``etat_barre``. Any code that labels these columns must use that order.

    Args:
        data: DataFrame with ``t_1h``, ``iu_ac``, ``etat_barre`` and
            ``constructed`` columns (plus ``q`` when ``train`` is truthy).
            It is modified in place: ``t_1h`` is converted to datetime, an
            ``id`` column is dropped if present, and time-feature columns
            are added.
        train: When truthy, the ``q`` column is included as the target tensor.

    Returns:
        ``TensorDataset(features, targets)`` when ``train`` is truthy,
        otherwise ``TensorDataset(features)``.
    """
    to_datetime(data)
    print("Number of rows in data:", len(data))
    if 'id' in data.columns:
        data.drop('id', axis=1, inplace=True)

    # One-hot encode 'etat_barre' into 4 classes.
    etat_barre_encoded = encode_categorical(data, 'etat_barre', 4)
    # Add the cyclic time features.
    data = process_time_features(data, 't_1h')

    numeric_columns = [
        'iu_ac',
        'hour_sin', 'hour_cos',
        'day_of_week_sin', 'day_of_week_cos',
        'month_sin', 'month_cos',
        'constructed',
    ]
    numeric_block = torch.tensor(data[numeric_columns].values).float()

    # Numeric columns first, one-hot columns last.
    features = torch.cat([numeric_block, etat_barre_encoded], dim=1)

    if not train:
        return TensorDataset(features)

    targets = torch.tensor(data['q'].values).float()
    return TensorDataset(features, targets)

In [7]:
def tensor_dataset_to_dataframe(tensor_dataset, feature_names):
    """Convert a ``TensorDataset`` back into a DataFrame.

    Args:
        tensor_dataset: Dataset whose first tensor holds the features and
            whose optional second tensor holds the targets.
        feature_names: Column labels for the feature tensor, in column order.

    Returns:
        DataFrame with one column per feature, plus a ``target`` column when
        the dataset carries a second tensor.
    """
    tensors = tensor_dataset.tensors

    # The first tensor is always the feature matrix.
    frame = pd.DataFrame(tensors[0].numpy(), columns=feature_names)

    # A second tensor, when present, holds the targets.
    if len(tensors) > 1:
        frame['target'] = tensors[1].numpy()

    return frame


# Column labels for the feature tensor built by data_to_tensor(). The order
# must match that tensor exactly: the eight numeric columns (ending with
# 'constructed') come first and the four one-hot 'etat_barre' columns last.
# Listing 'constructed' after the one-hot columns would mislabel the last
# five columns of the exported CSV.
feature_names = [
    'iu_ac',
    'hour_sin', 'hour_cos',
    'day_of_week_sin', 'day_of_week_cos',
    'month_sin', 'month_cos',
    'constructed',
    'etat_barre_0', 'etat_barre_1', 'etat_barre_2', 'etat_barre_3',
]


In [8]:
def process_and_save_data():
    """Run the training-data pipeline and write the result as CSV.

    Reads ``loop_sensor_train.csv`` from ``inputdir``, remaps ``iu_ac``, pads
    the kept time steps with placeholder rows, converts the frame to a
    ``TensorDataset`` and back to a labelled DataFrame, and writes it to
    ``constructed_dir`` as ``train_dataset_constructed_{filter_num}.csv``.
    """
    source_csv = f'{inputdir}loop_sensor_train.csv'
    output_csv = f'{constructed_dir}train_dataset_constructed_{filter_num}.csv'

    raw_train = pd.read_csv(source_csv)
    remapped_train = replace_iu_ac(raw_train)
    completed_train = timestep_construct_optimized(remapped_train)

    train_dataset = data_to_tensor(completed_train)
    train_df = tensor_dataset_to_dataframe(train_dataset, feature_names)
    # Disabled alternatives, kept for reference:
    # train_df = test_and_add_constructed_features(train_data)
    # train_df.to_csv(f'{precesseddir}train_dataset_stgnn.csv', index=False)
    train_df.to_csv(output_csv, index=False)

    # Disabled: evaluation split.
    # eval_data = pd.read_csv(f'{inputdir}loop_sensor_eval.csv')
    # eval_data = replace_iu_ac(eval_data)
    # eval_dataset = data_to_tensor(eval_data)
    # eval_df = tensor_dataset_to_dataframe(eval_dataset, feature_names)
    # eval_df.to_csv(f'{precesseddir}eval_dataset_stgnn.csv', index=False)

    # Disabled: test split (features only).
    # test_data_x = pd.read_csv(f'{inputdir}loop_sensor_test_x.csv')
    # test_data_x = replace_iu_ac(test_data_x)
    # test_dataset_x = data_to_tensor(test_data_x, train=0)
    # test_df = tensor_dataset_to_dataframe(test_dataset_x, feature_names)
    # test_df.to_csv(f'{precesseddir}test_dataset_stgnn_x.csv', index=False)

In [9]:
# Run the full training-data pipeline. This is a long-running cell: the
# progress bar in the recorded output below shows 2:06:10 for 15029 time steps.
process_and_save_data()

数量大于1250的时间步总数: 15029
Number of nodes: 4634
total number of processed data should be: 69644386


Processing time steps: 100%|██████████| 15029/15029 [2:06:10<00:00,  1.99it/s] 


Number of rows in data: 69644386


In [10]:
# def constructed_data():
#     # eval_data = pd.read_csv(f'{precesseddir}eval_dataset_stgnn.csv')
#     # eval_df = add_constructed_features(eval_data)
#     # eval_df.to_csv(f'{constructed_dir}eval_dataset_constructed.csv', index=False)
    
#     # test_data_x = pd.read_csv(f'{precesseddir}test_dataset_stgnn_x.csv')
#     # test_df = add_constructed_features(test_data_x)
#     # test_df.to_csv(f'{constructed_dir}test_dataset_constructed_x.csv', index=False)
    
#     train_data = pd.read_csv(f'{precesseddir}train_dataset_stgnn.csv')
#     train_df = test_and_add_constructed_features(train_data)
#     train_df.to_csv(f'{constructed_dir}train_dataset_constructed.csv', index=False)    
    
# constructed_data()