In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch.utils.data import TensorDataset
import os

In [2]:
# Device selection is currently disabled; kept for reference.
# device = torch.device('cuda:1')
# if torch.cuda.is_available():
#     print("Available CUDA devices:", torch.cuda.device_count())
#     for i in range(torch.cuda.device_count()):
#         print(f"Device {i}: {torch.cuda.get_device_name(i)}")
# else:
#     print("CUDA is not available.")

# Input / output locations. All paths are relative to the notebook's working
# directory, and the directory paths end with '/' because later cells build
# file names by plain string concatenation.
inputdir = '../data/'
spatial_data = '../data/processed/graph_data.npz'
precesseddir = '../data/STGNN_data/'

# exist_ok=True makes re-running this cell idempotent and avoids the
# check-then-create race of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(precesseddir, exist_ok=True)

In [3]:
def replace_iu_ac(data):
    """Replace the values of ``data['iu_ac']`` using the mapping stored in ``spatial_data``.

    The ``.npz`` archive at ``spatial_data`` must contain two parallel arrays,
    ``keys`` and ``values``; each ``iu_ac`` equal to ``keys[i]`` is replaced by
    ``values[i]``. Per the original comment this converts raw ``iu_ac``
    identifiers into adjacency-matrix ids.

    Args:
        data: DataFrame with an ``iu_ac`` column. It is modified in place.

    Returns:
        The same ``data`` object, with ``iu_ac`` remapped. Values that have no
        entry in the mapping become NaN (``Series.map`` semantics).
    """
    # NOTE(review): allow_pickle=True can execute arbitrary code when the
    # archive comes from an untrusted source; keep only if the file is trusted.
    # The context manager closes the underlying zip file handle, which the
    # lazily-loading NpzFile otherwise keeps open.
    with np.load(spatial_data, allow_pickle=True) as graph_data:
        # Arrays must be read while the archive is still open.
        iu_ac_mapping = dict(zip(graph_data['keys'], graph_data['values']))

    data['iu_ac'] = data['iu_ac'].map(iu_ac_mapping)

    return data

In [5]:
from joblib import Parallel, delayed

def construct_missing_data(t_1h, timestep_data, node_num):
    """Build placeholder rows for nodes that have no reading at one time step.

    Every node id in ``range(node_num)`` that does not occur in
    ``timestep_data['iu_ac']`` yields exactly one placeholder record.

    Args:
        t_1h: The time-step value shared by all rows of ``timestep_data``;
            it is copied into each placeholder record.
        timestep_data: DataFrame slice holding the observed rows of this time
            step. Only its ``iu_ac`` column is read.
        node_num: Total number of nodes.
            # assumes iu_ac values are already indices in 0..node_num-1
            # (e.g. after replace_iu_ac) -- TODO confirm against the caller.

    Returns:
        A list of dicts (one per missing node, ordered by node id), each with
        scalar values for ``iu_ac``, ``t_1h``, ``etat_barre`` (0),
        ``constructed`` (1) and ``target`` (-1), suitable for
        ``pd.DataFrame(rows)``.
    """
    observed_nodes = set(timestep_data['iu_ac'])
    # sorted() gives a deterministic row order regardless of set iteration order.
    missing_nodes = sorted(set(range(node_num)) - observed_nodes)

    # One scalar-valued record per missing node. The previous implementation
    # stored a list in 'iu_ac' and an entire Series in 't_1h', which produced
    # object-typed cells once the records were turned into a DataFrame.
    # NOTE(review): real rows appear to carry their label in a 'q' column (see
    # data_to_tensor); confirm that 'target' is the intended key here.
    return [
        {
            'iu_ac': iu_ac,
            't_1h': t_1h,
            'etat_barre': 0,
            'constructed': 1,
            'target': -1,
        }
        for iu_ac in missing_nodes
    ]

def timestep_construct_optimized(data, min_rows_per_step=1250):
    """Pad sufficiently populated time steps with placeholder rows for missing nodes.

    Time steps (distinct ``t_1h`` values) that have at least
    ``min_rows_per_step`` rows are treated as valid; for each of them
    ``construct_missing_data`` generates rows for the nodes that are absent.
    Time steps below the threshold are kept as they are, without padding.

    Args:
        data: DataFrame with at least ``t_1h`` and ``iu_ac`` columns. It is
            not modified.
        min_rows_per_step: Minimum number of rows a time step needs in order
            to be padded. Defaults to 1250, the previously hard-coded value.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the original rows
        sorted by ``t_1h`` (``constructed == 0``) followed by the generated
        placeholder rows.
    """
    # sort_values returns a new frame, so the caller's DataFrame stays untouched.
    sorted_data = data.sort_values(by='t_1h')

    node_num = sorted_data['iu_ac'].nunique()
    print(f"Number of nodes: {node_num}")

    # Split into time steps with a single groupby pass instead of re-scanning
    # the whole frame with a boolean mask once per time step.
    valid_groups = [
        (t_1h, group)
        for t_1h, group in sorted_data.groupby('t_1h')
        if len(group) >= min_rows_per_step
    ]

    # Build the placeholder rows of every valid time step in parallel.
    per_step_rows = Parallel(n_jobs=-1)(
        delayed(construct_missing_data)(t_1h, group, node_num)
        for t_1h, group in tqdm(valid_groups, desc="Processing time steps")
    )

    # Flatten the per-time-step lists into one list of records.
    new_data_rows = [row for step_rows in per_step_rows for row in step_rows]

    sorted_data['constructed'] = 0
    if not new_data_rows:
        # Nothing to append; avoid concatenating with an empty frame.
        return sorted_data.reset_index(drop=True)

    new_data = pd.DataFrame(new_data_rows)
    return pd.concat([sorted_data, new_data], ignore_index=True)


In [6]:
def to_datetime(data):
    """Parse the ``t_1h`` column of ``data`` into pandas datetimes, in place.

    Args:
        data: DataFrame with a ``t_1h`` column accepted by ``pd.to_datetime``.

    Returns:
        The same ``data`` object; its ``t_1h`` column has been overwritten
        with the parsed values.
    """
    parsed_timestamps = pd.to_datetime(data['t_1h'])
    data['t_1h'] = parsed_timestamps
    return data

def encode_categorical(data, column_name, num_classes):
    """One-hot encode an integer-coded column as a float tensor.

    Args:
        data: DataFrame (or mapping of column name to Series) holding the column.
        column_name: Name of the column with class indices; values must be
            non-negative and smaller than ``num_classes``.
        num_classes: Width of the one-hot encoding.

    Returns:
        A float tensor of shape ``(len(data[column_name]), num_classes)``.
    """
    # torch's one_hot only accepts int64 index tensors. Casting explicitly
    # keeps int32/int16-typed columns (e.g. after downcasting) from raising.
    # Float values are truncated toward zero by this cast.
    categories = torch.tensor(data[column_name].values, dtype=torch.long)
    return torch.nn.functional.one_hot(categories, num_classes=num_classes).float()

def process_time_features(data, time_column='t_1h'):
    """Add calendar columns and their sine/cosine encodings to ``data`` in place.

    Adds ``hour``, ``day_of_week`` and ``month`` taken from ``time_column``,
    followed by a ``*_sin`` / ``*_cos`` pair for each of them, computed with
    periods 24, 7 and 12 respectively.

    Args:
        data: DataFrame whose ``time_column`` supports the ``.dt`` accessor.
        time_column: Name of the datetime column. Defaults to ``'t_1h'``.

    Returns:
        The same ``data`` object with the nine new columns.
    """
    timestamps = data[time_column].dt
    # (column name, extracted values, length of one full cycle)
    cyclic_parts = (
        ('hour', timestamps.hour, 24),
        ('day_of_week', timestamps.weekday, 7),
        ('month', timestamps.month, 12),
    )

    # Raw periodic components first, so the column order matches the
    # raw-then-encoded layout.
    for name, values, _ in cyclic_parts:
        data[name] = values

    # The sine/cosine pair places each value on the unit circle, so the end of
    # a cycle sits next to its beginning.
    for name, _, period in cyclic_parts:
        data[f'{name}_sin'] = np.sin(2 * np.pi * data[name] / period)
        data[f'{name}_cos'] = np.cos(2 * np.pi * data[name] / period)

    return data


def data_to_tensor(data, train=1):
    """Convert a sensor DataFrame into a ``TensorDataset`` of model features.

    The feature tensor holds, in order: ``iu_ac``, the six sine/cosine time
    encodings, and a 4-way one-hot encoding of ``etat_barre``.

    Side effects on ``data``: ``t_1h`` is parsed to datetimes, an ``id`` column
    (if present) is dropped, and the time-feature columns are added.

    Args:
        data: DataFrame with ``t_1h``, ``iu_ac`` and ``etat_barre`` columns,
            plus ``q`` when ``train`` is truthy.
        train: When truthy, the dataset also carries ``q`` as its target tensor.

    Returns:
        ``TensorDataset(features, targets)`` when ``train`` is truthy,
        otherwise ``TensorDataset(features)``.
    """
    to_datetime(data)
    print("Number of rows in data:", len(data))

    if 'id' in data.columns:
        data.drop(columns='id', inplace=True)

    # One-hot encode 'etat_barre' (4 classes), then derive the time features.
    one_hot_state = encode_categorical(data, 'etat_barre', 4)
    data = process_time_features(data, 't_1h')

    numeric_columns = [
        'iu_ac',
        'hour_sin', 'hour_cos',
        'day_of_week_sin', 'day_of_week_cos',
        'month_sin', 'month_cos',
    ]
    numeric_block = torch.tensor(data[numeric_columns].values).float()

    # Numeric columns first, one-hot columns last.
    features = torch.cat([numeric_block, one_hot_state], dim=1)

    tensors = [features]
    if train:
        tensors.append(torch.tensor(data['q'].values).float())
    return TensorDataset(*tensors)

In [7]:
def tensor_dataset_to_dataframe(tensor_dataset, feature_names):
    """Turn a ``TensorDataset`` back into a DataFrame.

    Args:
        tensor_dataset: Dataset whose first tensor holds the features and
            whose optional second tensor holds the targets.
        feature_names: Column names for the feature tensor, one per column.

    Returns:
        A DataFrame with one column per feature, plus a ``target`` column when
        the dataset has a second tensor.
    """
    tensors = tensor_dataset.tensors

    # The feature tensor becomes the body of the frame.
    frame = pd.DataFrame(tensors[0].numpy(), columns=feature_names)

    # Targets are only present for datasets built with labels.
    if len(tensors) > 1:
        frame['target'] = tensors[1].numpy()

    return frame


# Column names of the feature tensor, in the order data_to_tensor concatenates
# them: node id, sine/cosine time encodings, then the 4-way one-hot of etat_barre.
feature_names = (
    ['iu_ac']
    + [
        f'{component}_{func}'
        for component in ('hour', 'day_of_week', 'month')
        for func in ('sin', 'cos')
    ]
    + [f'etat_barre_{state}' for state in range(4)]
)


In [8]:
def process_and_save_data():
    """Pad the evaluation split with placeholder rows and write it to CSV.

    Reads ``loop_sensor_eval.csv`` from ``inputdir``, runs it through
    ``timestep_construct_optimized`` and saves the result as
    ``eval_dataset_stgnn.csv`` in ``precesseddir`` (without the index).

    The train and test steps, and the tensor round-trip for the evaluation
    split, are currently disabled and kept below as comments.
    """
    # --- Training split (disabled) ---
    # train_data = pd.read_csv(f'{inputdir}loop_sensor_train.csv')
    # train_data = timestep_construct_optimized(train_data)
    # train_dataset = data_to_tensor(train_data)
    # train_df = tensor_dataset_to_dataframe(train_dataset, feature_names)
    # train_data.to_csv(f'{precesseddir}train_dataset_stgnn.csv', index=False)

    # --- Evaluation split ---
    eval_frame = timestep_construct_optimized(
        pd.read_csv(f'{inputdir}loop_sensor_eval.csv')
    )
    eval_frame.to_csv(f'{precesseddir}eval_dataset_stgnn.csv', index=False)
    # eval_dataset = data_to_tensor(eval_data)
    # eval_df = tensor_dataset_to_dataframe(eval_dataset, feature_names)
    # eval_df.to_csv(f'{precesseddir}eval_dataset_stgnn.csv', index=False)

    # --- Test split (disabled) ---
    # test_data_x = pd.read_csv(f'{inputdir}loop_sensor_test_x.csv')
    # test_dataset_x = data_to_tensor(test_data_x, train=0)
    # test_df = tensor_dataset_to_dataframe(test_dataset_x, feature_names)
    # test_df.to_csv(f'{precesseddir}test_dataset_stgnn_x.csv', index=False)

In [9]:
# Run the preprocessing pipeline; this reads the evaluation CSV from `inputdir`
# and writes the padded result into `precesseddir`.
process_and_save_data()

Number of nodes: 1757


Processing time steps: 100%|██████████| 301/301 [00:13<00:00, 23.10it/s]
