In [1]:
import torch
import numpy as np
from torch import optim as optim
from cmapss import cmapss
from torch.utils.data import DataLoader
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sets = ['FD001', 'FD002', 'FD003', 'FD004']
model_name = 'Hybrid'
feature_extract_mode = 'default'
# feature_extract_mode = 'tsfresh'

iteration_num = 10
window_size = 30

In [3]:
set_name = 'FD001'
trainset = cmapss(mode='train',
                feature_extract_mode=feature_extract_mode,
                dataset=f'./CMAPSSData/train_{set_name}_processed.csv',
                window_size=window_size)
testset = cmapss(mode='test',
            feature_extract_mode=feature_extract_mode,
            dataset=f'./CMAPSSData/test_{set_name}_processed.csv',
            rul_result=f'./CMAPSSData/RUL_{set_name}.txt',
            window_size=window_size)

print('dataset load successfully!')

(20631, 17)
(13096, 17)
dataset load successfully!


In [7]:
# trainset.feature_extract()
# testset.feature_extract()

In [8]:
train_loader = DataLoader(dataset=trainset, batch_size=len(trainset), shuffle=True, num_workers=2)
test_loader = DataLoader(dataset=testset, batch_size=len(testset), shuffle=False, num_workers=2)

ValueError: num_samples should be a positive integer value, but got num_samples=0

In [13]:
for batch_index, data in enumerate(train_loader, 0):
    inputs, handcrafted_feature, labels = data

    print(f"Batch Index: {batch_index}")
    print(f"Inputs Shape: {inputs.shape}")
    # print(f"Handcrafted Features Shape: {handcrafted_feature.shape}")
    print(f"Labels Shape: {labels.shape}")

Batch Index: 0
Inputs Shape: torch.Size([17731, 30, 17])
Labels Shape: torch.Size([17731, 1])


In [8]:
from tsfresh import extract_relevant_features
from tsfresh.utilities.dataframe_functions import impute
import pandas as pd

In [9]:
def extract_and_select_tsfresh_features(inputs, labels, column_id="id", column_sort="time", column_value="value"):
    if isinstance(inputs, torch.Tensor):
        inputs = inputs.numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.numpy()
    
    time_series_df = pd.DataFrame()

    for unit_idx in tqdm(range(inputs.shape[0]), desc="Initialization", unit="sample"):
        for feature_dim in range(inputs.shape[2]):
            ts_data = pd.DataFrame({
                column_id: unit_idx,
                column_sort: range(inputs.shape[1]),
                column_value: inputs[unit_idx, :, feature_dim]
            })
            ts_data['feature_dim'] = feature_dim
            time_series_df = pd.concat([time_series_df, ts_data], ignore_index=True)

    relevant_features = extract_relevant_features(time_series_df, 
                                                  y=pd.Series(labels.flatten()), 
                                                  column_id=column_id, 
                                                  column_sort=column_sort, 
                                                  column_kind='feature_dim', 
                                                  column_value=column_value)
    
    impute(relevant_features)
    return relevant_features

In [10]:
extracted_features = extract_and_select_tsfresh_features(inputs[:100], labels[:100])

Initialization: 100%|██████████| 100/100 [00:00<00:00, 137.53sample/s]
Feature Extraction: 100%|██████████| 40/40 [00:23<00:00,  1.67it/s]


In [11]:
# print(extracted_features.head())

In [12]:
from sklearn.decomposition import PCA

pca = PCA(n_components=20)  # Retain 95% of the variance
reduced_features = pca.fit_transform(extracted_features)
selected_features_pca = pd.DataFrame(reduced_features)

In [13]:
selected_features_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,11.538601,99.788538,-73.616616,-119.998600,7.415103,24.038515,50.551229,8.455851,-0.790781,-2.539496,4.063649,-5.180844,1.795415,3.491117,-1.977848,-1.786578,-1.040339,-2.298895,2.551714,-1.453487
1,-71.885615,-65.216774,-24.882602,71.485807,11.338036,40.986940,49.297086,8.641171,4.214721,-3.046241,0.545859,-6.259857,-0.088238,-2.395807,2.184409,-2.750752,-4.565312,0.807413,-3.259262,2.863426
2,51.063344,-82.136767,-119.916713,-1.531582,-8.413727,-53.257087,-165.592956,5.841684,1.897689,-1.942060,-3.084503,-1.294967,-0.265935,0.345316,-2.764307,-0.606400,1.261577,-0.012268,-1.780252,0.148068
3,-26.795507,-104.203966,-9.189505,110.344172,110.575284,27.534322,-84.109203,16.391240,-4.494083,10.342198,-0.985391,2.905408,0.094030,0.270660,-0.928903,0.182100,1.889517,2.438300,-1.323451,-0.109358
4,98.543128,-7.930621,-90.018536,-81.017968,-63.297021,-123.270113,-167.343144,18.606344,0.235378,2.281100,0.680885,-0.429602,3.211319,2.000023,4.441433,0.479843,-0.586156,-0.148288,4.179987,0.823956
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-114.315709,-179.141293,-75.876045,45.379685,-55.636952,23.665242,-27.415572,0.952180,-2.073161,-6.721170,-1.134203,-0.495388,1.135689,1.402442,-1.179734,2.376745,-3.922307,0.172131,3.941521,1.577354
96,-107.811158,-148.861083,-4.779699,50.691432,119.882942,-6.739749,47.982642,-6.661037,-8.534820,1.427000,-4.487593,1.977316,2.630347,5.517431,-3.852986,-0.569040,1.469857,0.974105,-1.655281,-0.864231
97,-202.506591,57.521400,-60.670800,-27.653339,-58.252775,49.856865,1.768259,15.855072,-5.978626,-1.231114,8.896695,3.416395,-3.592319,1.729327,-3.821998,4.352909,-4.839882,2.713912,0.352079,2.605625
98,-91.710065,-148.780652,31.525771,3.410131,-90.888194,13.072785,-50.229040,-21.725943,-10.576832,-3.985462,-1.231781,-1.414247,2.868721,5.562184,1.252884,-7.759863,-0.768211,-0.479793,2.922469,0.820814


In [14]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(extracted_features, labels[:100])  # y can be constant
feature_importances = model.feature_importances_

  return fit_method(estimator, *args, **kwargs)


In [15]:
importance = pd.DataFrame(feature_importances).T
filtered_importance = importance.loc[:, (importance > 0.01).iloc[0]]
filtered_importance

Unnamed: 0,0,1,15,25,31,58,79,91,95,150,158,633
0,0.038072,0.232923,0.044069,0.019219,0.02169,0.073187,0.010491,0.022166,0.021668,0.022293,0.086632,0.012217


In [16]:
col_indices = filtered_importance.columns
col_indices

Index([0, 1, 15, 25, 31, 58, 79, 91, 95, 150, 158, 633], dtype='int64')

In [None]:
filtered_train = extracted_features.iloc[:, col_indices]

In [None]:
import seaborn as sns
corr_matrix = filtered_train.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Feature Correlation Matrix")
plt.show()