In [1]:
import numpy as np
import pandas as pd

import tsfresh
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

## Prepare data

You first have to bring your data into the following form:

| id | time | time series feature 1 (e.g. speed) | time series feature 2 (e.g. force) | time series feature 3 | time series feature 4 |
|----|------|----------|----------|----------|--------|
| 0  | 0    | 0.1      | 0.2    | 0.3      | 0.4    |
| 0  | 1    | 0.2      | 0.3    | 0.4      | 0.5    |
| 0  | 2    | 0.3      | 0.4    | 0.5      | 0.6    |
| ...| ...  | ...      | ...    | ...      | ...    |
| 1  | 0    | 0.2      | 0.3    | 0.4      | 0.5    |
| 1  | 1    | 0.3      | 0.4    | 0.5      | 0.6    |
| 1  | 2    | 0.4      | 0.5    | 0.6      | 0.7    |
| ...| ...  | ...      | ...    | ...      | ...    |
| 298 | 0   | 0.3      | 0.4    | 0.5      | 0.6    |
| 298 | 1   | 0.4      | 0.5    | 0.6      | 0.7    |
| 298 | 2   | 0.5      | 0.6    | 0.7      | 0.8    |
| ...| ...  | ...      | ...    | ...      | ...    |

The dataframe contains all time series for all samples. The samples are identified by the id column.  
The timestamps for the time series are in the time column.  
The features are identified through the respective columns.  

If your time series are of different lengths, you can also pass them separately instead of in one table, e.g. as a dictionary with one DataFrame per time series kind.

See: https://tsfresh.readthedocs.io/en/latest/text/data_formats.html

The following code is an example for a dataset from a previous project.  
For your use case and data format, you will have to adapt it / write your own code. 

In [None]:
# Placeholder: path to the saved NumPy file that holds the dataset.
filename = '...'

# Number of time steps used per sample, and the names of the channels (columns)
# inside each sample array.
length = 1212
channel_names = ['...']

# NOTE(review): allow_pickle=True unpickles arbitrary Python objects while loading,
# which can execute code -- only open files from a trusted source.
multivar_data_np = np.load(filename, allow_pickle=True)

# Column 0 holds the per-sample time series arrays, column 1 the per-sample label rows.
features, labels = multivar_data_np[:, 0], multivar_data_np[:, 1]
num_samples = len(features)

# Target values: element 4 of every label row, stored on disk for the later
# feature-selection steps.
angles = pd.Series([label_row[4] for label_row in labels], name="...")
angles.to_pickle("...")

In [None]:
# Assemble the long-format dataframe used for feature extraction: one row per
# (sample, timestamp) with an id column ("Index"), a sort column ("Timestamp")
# and one value column per channel.
all_samples_and_time_series = {
    "Index": [sample_id for sample_id in range(num_samples) for _ in range(length)],  # [0, 0, 0, ..., 1, 1, 1, ...]
    "Timestamp": list(range(length)) * num_samples,  # [0, 1, 2, ..., 0, 1, 2, ...]
}

# The first entry of channel_names is skipped, so the k-th name (k >= 1) is read
# from column k of each sample array.
data_channels = channel_names[1:]

# Init lists for the different time series (channels)
for channel in data_channels:
    all_samples_and_time_series[channel] = []

for sample_idx, sample in enumerate(features):
    # Every sample has to provide at least `length` rows. Otherwise the channel
    # columns end up shorter than "Index"/"Timestamp" and pd.DataFrame only fails
    # afterwards with a generic length-mismatch error that does not name the sample.
    if len(sample) < length:
        raise ValueError(
            f"Sample {sample_idx} has only {len(sample)} time steps, "
            f"but length={length} are required."
        )
    for c_idx, channel in enumerate(data_channels, start=1):
        # Keep only the first `length` time steps of this channel.
        all_samples_and_time_series[channel].extend(sample[:length, c_idx])

df = pd.DataFrame(all_samples_and_time_series)

In [None]:
# Display the assembled long-format dataframe (one row per sample id and timestamp).
df

In [None]:
# Per-column summary statistics as a quick sanity check of the assembled data.
df.describe()

In [None]:
# Optional: uncomment to cache the long-format dataframe on disk.
#df.to_pickle("...")

## Feature Extraction

Feature extraction and feature selection with tsfresh.  
Uses the built-in statistical feature selection of tsfresh to determine relevant features.  
tsfresh.extract_relevant_features does the same as extract_features + impute + select_features.  

Extracted features: https://tsfresh.readthedocs.io/en/latest/text/list_of_features.html

In [None]:
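# For reference only (not executed): the manual equivalent of
# tsfresh.extract_relevant_features. It uses generic placeholder names
# (`timeseries`, "id", "time", `y`) rather than this notebook's `df`, "Index",
# "Timestamp" and `angles`.
#
# extracted_features = extract_features(timeseries, column_id="id", column_sort="time")
#
# impute(extracted_features)
# features_filtered = select_features(extracted_features, y)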

In [None]:
# Extract features from every time series in `df` (grouped by "Index", ordered by
# "Timestamp") and keep only those tsfresh selects as relevant for the target `angles`.
# NOTE(review): this rebinds `features`, which until here held the raw per-sample
# arrays iterated by the dataframe-assembly cell. Re-running that cell after this one
# requires re-running the data-loading cell first.
features = tsfresh.extract_relevant_features(df, angles, column_id="Index", column_sort="Timestamp")

For multivariate time series, the features for each time series are named like so:  
"TimeSeriesName__FeatureName"  
e.g.  
"Schleppfehler__abs_energy", "Motormoment__abs_energy", ...

In [None]:
# Display the result of extract_relevant_features.
features

In [None]:
# Save the selected features to disk; the path is a placeholder to fill in.
features.to_pickle("...")

## Recursive Feature Elimination
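Recursive Feature Elimination with cross-validation (RFECV) removes features step by step and uses cross-validation to choose how many to keep, i.e. it tries to eliminate features without a loss in performance.  
This allows for a significant reduction of the large feature dataframe (sometimes >1000 features). 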

Currently, XGBoost is used as the base model for RFECV. You can change this to any other model if wanted.

In [None]:
X = pd.read_pickle("...") # This is the features dataframe from the previous step
y = pd.read_pickle("...") # These are your labels (target variable values for regression in this example)

# Features and target side by side in one frame (column-wise concatenation).
# NOTE(review): `data` is not referenced again in the visible cells.
data = pd.concat([X, y], axis="columns")

RFECV with XGBoost and 5-fold cross-validation.

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

# Base regressor that RFECV refits during elimination, and the 5-fold splitter
# used to score each candidate feature count.
reg = XGBRegressor()
cv = KFold(n_splits=5)

# Remove one feature per step, down to a single feature at most, scoring every
# step by negative mean squared error. fit() returns the fitted selector itself.
rfecv = RFECV(
    reg,
    cv=cv,
    step=1,
    min_features_to_select=1,
    scoring="neg_mean_squared_error",
    n_jobs=8,
).fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 118


The remaining features:

In [None]:
# Names of the features kept by the fitted RFECV selector.
rfecv.get_feature_names_out()

Reduce the dataframe to the remaining features:

In [None]:
# Keep only the columns RFECV selected. By default `rfecv.transform` returns a plain
# NumPy array, so X's row index (the sample ids) is passed through explicitly.
# Without it the rows are relabelled 0..n-1 and lose their link to the samples
# and to `y`.
X_reduced = pd.DataFrame(
    rfecv.transform(X),
    index=X.index,
    columns=rfecv.get_feature_names_out(),
)

X_reduced

In [None]:
# Save the reduced feature dataframe to disk; the path is a placeholder to fill in.
X_reduced.to_pickle("...")

## Feature Extraction without Selection

tsfresh.extract_relevant_features also includes feature selection through statistical tests.  
Here, feature selection is not used, and all features are extracted.  
Then, only Recursive Feature Elimination is used for feature selection.  
This is more exhaustive and can work better for some datasets and use cases. 

In [None]:
from tsfresh.feature_extraction import EfficientFCParameters

# Feature-calculator settings passed to the extraction below.
settings = EfficientFCParameters()

# Extract every configured feature for each sample (grouped by "Index", ordered by
# "Timestamp") without any relevance filtering.
extracted_features = extract_features(
    df,
    column_id="Index",
    column_sort="Timestamp",
    default_fc_parameters=settings,
    n_jobs=8,
)

# Clean up non-finite values in the extracted features. As the last expression of
# the cell, the return value is also displayed.
impute(extracted_features)

In [None]:
# Display the full (unfiltered) extracted feature dataframe.
extracted_features

In [None]:
# Save the extracted features to disk; the path is a placeholder to fill in.
extracted_features.to_pickle("...")

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

# Feature dataframe and target values, loaded from placeholder paths.
# Presumably X is the unfiltered feature set saved above -- adjust to your files.
X = pd.read_pickle("...")
y = pd.read_pickle("...")

# Base regressor refit by RFECV and the 5-fold splitter used for scoring.
reg = XGBRegressor()
cv = KFold(n_splits=5)

# Eliminate one feature per step and score every step by negative mean squared
# error. fit() returns the fitted selector itself.
rfecv = RFECV(
    reg,
    cv=cv,
    step=1,
    min_features_to_select=1,
    scoring="neg_mean_squared_error",
    n_jobs=8,
).fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

In [None]:
# Names of the features kept by the fitted RFECV selector.
rfecv.get_feature_names_out()

In [None]:
# Keep only the columns RFECV selected. By default `rfecv.transform` returns a plain
# NumPy array, so X's row index (the sample ids) is passed through explicitly.
# Without it the rows are relabelled 0..n-1 and lose their link to the samples
# and to `y`.
X_reduced = pd.DataFrame(
    rfecv.transform(X),
    index=X.index,
    columns=rfecv.get_feature_names_out(),
)

X_reduced

In [None]:
# Save the reduced feature dataframe to disk; the path is a placeholder to fill in.
X_reduced.to_pickle("...")