In [9]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset

In [10]:
# Device selection (currently disabled; the block below is kept for reference).
# device = torch.device('cuda:1')
# if torch.cuda.is_available():
#     print("Available CUDA devices:", torch.cuda.device_count())
#     for i in range(torch.cuda.device_count()):
#         print(f"Device {i}: {torch.cuda.get_device_name(i)}")
# else:
#     print("CUDA is not available.")
# Directories: `inputdir` holds the raw input CSVs, `precesseddir` receives the
# processed CSVs. Both end with '/' because file names are appended by plain
# string concatenation. NOTE: `precesseddir` is a misspelling of "processed",
# kept as-is because other cells reference the name.
inputdir = '../data/'
precesseddir = '../data/processed/'

In [11]:
def to_datetime(data, time_column='t_1h'):
    """Parse a timestamp column of ``data`` into pandas datetimes, in place.

    Args:
        data: DataFrame containing ``time_column``.
        time_column: Name of the column to parse. Defaults to ``'t_1h'``.

    Returns:
        The same DataFrame object that was passed in, with ``time_column``
        replaced by its ``pd.to_datetime`` conversion.
    """
    # The assignment mutates the caller's frame on purpose: callers may rely on
    # the in-place effect and ignore the return value.
    data[time_column] = pd.to_datetime(data[time_column])
    return data

def encode_categorical(data, column_name, num_classes):
    """One-hot encode an integer-coded categorical column.

    Args:
        data: DataFrame holding the column.
        column_name: Name of the column to encode. Its values are used as class
            indices and must lie in ``[0, num_classes)``.
        num_classes: Width of the one-hot encoding.

    Returns:
        A float32 tensor of shape ``(len(data), num_classes)``.

    Raises:
        ValueError: If the column contains missing values.
    """
    column = data[column_name]
    if column.isna().any():
        raise ValueError(
            f"Column '{column_name}' contains missing values and cannot be one-hot encoded"
        )
    # one_hot only accepts int64 index tensors, so cast explicitly; otherwise
    # int32/int8 columns (or whole-number floats) fail with a RuntimeError.
    categories = torch.tensor(column.to_numpy(), dtype=torch.long)
    return torch.nn.functional.one_hot(categories, num_classes=num_classes).float()

def process_time_features(data, time_column='t_1h'):
    """Add calendar fields and their sine/cosine encodings to ``data`` in place.

    Reads the datetime column ``time_column`` and writes the integer columns
    ``hour``, ``day_of_week`` and ``month``, followed by a ``*_sin`` / ``*_cos``
    pair for each of them. Returns the same DataFrame object.
    """
    # (raw column, .dt attribute, period, sine column, cosine column)
    cyclic_specs = (
        ('hour', 'hour', 24, 'hour_sin', 'hour_cos'),
        ('day_of_week', 'weekday', 7, 'day_of_week_sin', 'day_of_week_cos'),
        ('month', 'month', 12, 'month_sin', 'month_cos'),
    )

    # First pass: extract the periodic calendar components.
    for raw_col, dt_attr, _period, _sin_col, _cos_col in cyclic_specs:
        data[raw_col] = getattr(data[time_column].dt, dt_attr)

    # Second pass: project each component onto the unit circle so the encoding
    # wraps around at the end of its period.
    for raw_col, _dt_attr, period, sin_col, cos_col in cyclic_specs:
        data[sin_col] = np.sin(2 * np.pi * data[raw_col] / period)
        data[cos_col] = np.cos(2 * np.pi * data[raw_col] / period)

    return data


def data_to_tensor(data, train=1):
    """Turn a raw sensor DataFrame into a ``TensorDataset`` of model inputs.

    The feature tensor has 11 columns: ``iu_ac``, the six sine/cosine time
    encodings derived from ``t_1h``, and a 4-way one-hot of ``etat_barre``.

    Args:
        data: DataFrame with columns ``t_1h``, ``iu_ac`` and ``etat_barre``
            (plus ``q`` when ``train`` is truthy). An ``id`` column, if present,
            is ignored. The caller's frame is not modified.
        train: When truthy, the ``q`` column is attached as the target tensor.

    Returns:
        ``TensorDataset(features, targets)`` when ``train`` is truthy, otherwise
        ``TensorDataset(features)``. All tensors are float32.
    """
    # Work on a private copy so the helper columns added below (and the dropped
    # 'id') do not leak into the DataFrame owned by the caller.
    frame = data.copy()
    to_datetime(frame)
    print("Number of rows in data:", len(frame))
    if 'id' in frame.columns:
        frame = frame.drop(columns='id')

    # One-hot encode 'etat_barre' (4 classes).
    etat_barre_encoded = encode_categorical(frame, 'etat_barre', 4)
    # Derive the cyclic time features from the timestamp column.
    frame = process_time_features(frame, 't_1h')

    # Combine numeric and one-hot features into a single (n_rows, 11) tensor.
    numeric_columns = [
        'iu_ac',
        'hour_sin', 'hour_cos',
        'day_of_week_sin', 'day_of_week_cos',
        'month_sin', 'month_cos',
    ]
    features = torch.cat([
        torch.tensor(frame[numeric_columns].values).float(),
        etat_barre_encoded
    ], dim=1)

    if train:
        targets = torch.tensor(frame['q'].values).float()
        return TensorDataset(features, targets)
    return TensorDataset(features)

In [12]:
def tensor_dataset_to_dataframe(tensor_dataset, feature_names):
    """Convert a ``TensorDataset`` back into a pandas DataFrame.

    Args:
        tensor_dataset: Dataset whose first tensor holds the features and whose
            optional second tensor holds the targets.
        feature_names: Column labels for the feature tensor, one per column.

    Returns:
        A DataFrame with one column per feature and, when the dataset carries a
        second tensor, an extra ``target`` column.
    """
    # Split the dataset into its feature tensor and (optional) target tensor.
    features = tensor_dataset.tensors[0]
    targets = tensor_dataset.tensors[1] if len(tensor_dataset.tensors) > 1 else None

    # detach().cpu() makes the conversion work for tensors that track gradients
    # or live on an accelerator; it is a no-op for plain CPU tensors.
    features_df = pd.DataFrame(features.detach().cpu().numpy(), columns=feature_names)

    # Append the targets only when the dataset actually carries them.
    if targets is not None:
        features_df['target'] = targets.detach().cpu().numpy()

    return features_df


# Column labels for the feature tensor, in the order the columns are laid out:
# the traffic measurement, the cyclic time encodings, then the one-hot of
# 'etat_barre' (4 classes).
feature_names = (
    ['iu_ac']
    + ['hour_sin', 'hour_cos']
    + ['day_of_week_sin', 'day_of_week_cos']
    + ['month_sin', 'month_cos']
    + ['etat_barre_0', 'etat_barre_1', 'etat_barre_2', 'etat_barre_3']
)


In [23]:
def process_and_save_data():
    """Preprocess the raw sensor CSVs and write the feature tables as CSV.

    For every enabled split, reads the raw file from ``inputdir``, converts it
    with ``data_to_tensor`` / ``tensor_dataset_to_dataframe`` and writes the
    result to ``precesseddir`` without the index. The output directory is
    created if it does not exist yet.
    """
    import os  # local import: only needed to prepare the output directory

    # to_csv does not create missing parent directories, so make sure it exists.
    os.makedirs(precesseddir, exist_ok=True)

    # (raw file, processed file, has target column 'q')
    splits = [
        # ('loop_sensor_train.csv', 'train_dataset.csv', 1),  # training split currently disabled
        ('loop_sensor_eval.csv', 'eval_dataset.csv', 1),
        ('loop_sensor_test_x.csv', 'test_dataset_x.csv', 0),
    ]

    for source_name, output_name, has_target in splits:
        raw = pd.read_csv(f'{inputdir}{source_name}')
        dataset = data_to_tensor(raw, train=has_target)
        processed = tensor_dataset_to_dataframe(dataset, feature_names)
        processed.to_csv(f'{precesseddir}{output_name}', index=False)

In [24]:
# Run the pipeline: reads the eval and test CSVs from `inputdir` and writes the
# processed CSVs to `precesseddir`.
process_and_save_data()

Number of rows in data: 439298
Number of rows in data: 439298
