In [1]:
import numpy as np
import pandas as pd

import tsfresh
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

## Daten vorbereiten

Ziel: Dataframe in der Form
| id | time | Schleppfehler | Motormoment | Winkelposition | Geschwindigkeit |
|----|------|----------|----------|----------|--------|
| 0  | 0    | 0.1      | 0.2    | 0.3      | 0.4    |
| 0  | 1    | 0.2      | 0.3    | 0.4      | 0.5    |
| 0  | 2    | 0.3      | 0.4    | 0.5      | 0.6    |
| ...| ...  | ...      | ...    | ...      | ...    |
| 1  | 0    | 0.2      | 0.3    | 0.4      | 0.5    |
| 1  | 1    | 0.3      | 0.4    | 0.5      | 0.6    |
| 1  | 2    | 0.4      | 0.5    | 0.6      | 0.7    |
| ...| ...  | ...      | ...    | ...      | ...    |
| 298 | 0   | 0.3      | 0.4    | 0.5      | 0.6    |
| 298 | 1   | 0.4      | 0.5    | 0.6      | 0.7    |
| 298 | 2   | 0.5      | 0.6    | 0.7      | 0.8    |
| ...| ...  | ...      | ...    | ...      | ...    |

Der Dataframe enthält die Zeitreihen aller Samples. Die Samples sind durch die Spalte id gekennzeichnet (im Code unten: `Index`).  
Die Zeitstempel der Zeitreihen stehen in der Spalte time (im Code unten: `Timestamp`).  
Jeder Kanal (z.B. Schleppfehler, Motormoment) steht in einer eigenen Spalte.

Siehe: https://tsfresh.readthedocs.io/en/latest/text/data_formats.html

In [None]:
filename = '...'

# NOTE(review): allow_pickle=True unpickles arbitrary Python objects -- only
# load files from a trusted source.
multivar_data_np = np.load(filename, allow_pickle=True)

# Column 0 holds the per-sample signal arrays, column 1 the per-sample label records.
features, labels = multivar_data_np[:, 0], multivar_data_np[:, 1]
num_samples = len(features)

# Element 4 of every label record is collected into the `angles` series.
# NOTE(review): the meaning of index 4 is not visible here -- confirm against the data spec.
angles = pd.Series([label[4] for label in labels], name="...")
angles.to_pickle("...")

channel_names = ['...']
# Number of leading time steps taken from every sample when the frame is built.
length = 1212

In [None]:
# Build the long-format frame tsfresh expects: one row per (sample id, time step),
# one column per signal channel.
signal_channels = channel_names[1:]  # the first entry of channel_names is skipped

all_samples_and_time_series = {
    # Sample id, repeated once per time step: [0, 0, 0, ..., 1, 1, 1, ...]
    "Index": [sample_id for sample_id in range(num_samples) for _ in range(length)],
    # Time step within each sample: [0, 1, 2, ..., 0, 1, 2, ...]
    "Timestamp": list(range(length)) * num_samples,
}

# Start every signal channel with an empty value list.
all_samples_and_time_series.update({channel: [] for channel in signal_channels})

for sample in features:
    # Column 0 of a sample is skipped, so the k-th signal channel sits in column k.
    for column, channel in enumerate(signal_channels, start=1):
        # Only the first `length` time steps of each sample are used.
        all_samples_and_time_series[channel].extend(sample[:length, column])

df = pd.DataFrame(all_samples_and_time_series)

In [None]:
# Inspect the assembled long-format frame (one row per sample id and time step).
df

In [None]:
# Per-column summary statistics as a quick sanity check of the assembled frame.
df.describe()

In [None]:
# Optional: uncomment to persist the long-format frame to disk.
#df.to_pickle("...")

## Feature Extraction

Feature Extraction und Feature Selection mit tsfresh.  
tsfresh.extract_relevant_features macht dasselbe wie extract_features + impute + select_features.  

Extrahierte Features: https://tsfresh.readthedocs.io/en/latest/text/list_of_features.html

In [None]:
# Reference only: the manual three-step pipeline that extract_relevant_features
# (next cell) performs in one call, written with this notebook's names.
#extracted_features = extract_features(df, column_id="Index", column_sort="Timestamp")
#
#impute(extracted_features)
#features_filtered = select_features(extracted_features, angles)

In [None]:
# Extract, impute and select features in a single call; `angles` is the target
# the selection is run against.
# NOTE: this rebinds `features`, which until now held the raw per-sample arrays,
# so the frame-building cell cannot be re-run after this one without reloading.
features = tsfresh.extract_relevant_features(
    df,
    angles,
    column_id="Index",
    column_sort="Timestamp",
)

Features bei multivariaten Zeitreihen werden bei tsfresh benannt nach dem Schema:  
"Zeitreihenname__Featurename"  
also z.B.  
"Schleppfehler__abs_energy", "Motormoment__abs_energy", ...

In [None]:
# Show the feature matrix returned by tsfresh.
features

In [None]:
# Persist the selected tsfresh features to disk.
features.to_pickle("...")

## Recursive Feature Elimination

Die Recursive Feature Elimination (hier RFECV) versucht Features "rauszuschmeißen" ohne die Performance zu verschlechtern.  
Damit kann man den großen Dataframe (>1000 Features) deutlich verkleinern.  

Als Basismodell für die RFECV wird aktuell XGBoost verwendet.

In [None]:
# Reload the feature matrix (X) and the target (y) from disk.
# NOTE(review): unpickling can execute arbitrary code -- only read trusted files.
X, y = pd.read_pickle("..."), pd.read_pickle("...")

# Features and target side by side, aligned on their index.
data = pd.concat([X, y], axis=1)

RFECV mit XGBoost und 5-fold Cross Validation.

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

# Base estimator used by the recursive elimination.
reg = XGBRegressor()
# 5 folds without shuffling.
# NOTE(review): unshuffled folds are contiguous blocks of samples -- confirm this
# is intended if the samples are stored in a meaningful order.
cv = KFold(n_splits=5)

rfecv = RFECV(
    estimator=reg,
    cv=cv,
    scoring="neg_mean_squared_error",
    step=1,                    # remove one feature per elimination round
    min_features_to_select=1,  # allow elimination down to a single feature
    n_jobs=8,
)
rfecv.fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

Die verbleibenden Features:

In [None]:
# Names of the features that RFECV kept.
rfecv.get_feature_names_out()

In [None]:
# Keep only the columns RFECV selected. The transformed data is wrapped in a new
# DataFrame, so pass both the selected column names and X's original index;
# without the index the rows would be renumbered 0..n-1 and could no longer be
# matched to their sample ids (or to y) by label.
X_reduced = pd.DataFrame(
    rfecv.transform(X),
    columns=rfecv.get_feature_names_out(),
    index=X.index,
)

X_reduced

In [None]:
# Persist the reduced feature matrix to disk.
X_reduced.to_pickle("...")

## Feature Extraction ohne Selection

tsfresh.extract_relevant_features beinhaltet auch eine Feature Selection durch statistische Tests.  
Hier wird diese Feature Selection nicht benutzt; stattdessen werden alle Features des `EfficientFCParameters`-Sets extrahiert (d.h. ohne die besonders rechenintensiven Feature-Kalkulatoren).  
Anschließend wird zur Feature Selection (nur) die Recursive Feature Elimination verwendet.

In [None]:
from tsfresh.feature_extraction import EfficientFCParameters

# Feature-calculator preset passed to tsfresh below.
settings = EfficientFCParameters()

# Extract features for every sample id and channel, without relevance filtering.
extracted_features = extract_features(
    df,
    column_id="Index",
    column_sort="Timestamp",
    default_fc_parameters=settings,
    n_jobs=8,
)

# The result is not reassigned: per the tsfresh docs impute() replaces NaN/inf
# values in place. Its return value is what this cell displays.
impute(extracted_features)

In [None]:
# Show the full, unfiltered feature matrix (one row per sample id).
extracted_features

Unnamed: 0,Schleppfehler__variance_larger_than_standard_deviation,Schleppfehler__has_duplicate_max,Schleppfehler__has_duplicate_min,Schleppfehler__has_duplicate,Schleppfehler__sum_values,Schleppfehler__abs_energy,Schleppfehler__mean_abs_change,Schleppfehler__mean_change,Schleppfehler__mean_second_derivative_central,Schleppfehler__median,...,Geschwindigkeit__permutation_entropy__dimension_6__tau_1,Geschwindigkeit__permutation_entropy__dimension_7__tau_1,Geschwindigkeit__query_similarity_count__query_None__threshold_0.0,"Geschwindigkeit__matrix_profile__feature_""min""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""max""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""mean""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""median""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""25""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""75""__threshold_0.98",Geschwindigkeit__mean_n_absolute_max__number_of_maxima_7
0,0.0,0.0,1.0,1.0,220.322437,41.918715,0.001148,0.000203,-6.698347e-07,0.191911,...,6.085058,6.825900,0.0,0.819862,2.195672,1.608663,1.631462,1.449151,1.777508,6.164987
1,0.0,0.0,1.0,1.0,218.919271,41.421810,0.001158,0.000203,-2.647107e-07,0.190624,...,6.099968,6.864404,0.0,1.062027,2.745572,2.061609,2.085122,1.889623,2.256906,6.207711
2,0.0,0.0,0.0,1.0,219.675141,41.665743,0.001160,0.000204,-3.860992e-07,0.191287,...,6.057718,6.877742,0.0,1.225015,2.752575,2.073991,2.109529,1.913445,2.267468,6.167603
3,0.0,0.0,1.0,1.0,220.652269,42.045407,0.001153,0.000205,-2.641488e-07,0.192001,...,6.098744,6.871994,0.0,0.652773,2.207524,1.596216,1.606932,1.436306,1.780856,6.205096
4,0.0,0.0,0.0,1.0,220.618642,42.016243,0.001158,0.000203,-3.456116e-07,0.191693,...,6.079461,6.841240,0.0,0.873887,2.271513,1.609895,1.619917,1.449892,1.780274,6.270994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992,0.0,0.0,0.0,1.0,240.123393,49.518120,0.001172,0.000214,-5.480785e-07,0.206122,...,6.144284,6.882001,0.0,0.736885,2.272050,1.623104,1.649816,1.479585,1.785721,6.165723
1993,0.0,0.0,1.0,1.0,237.646545,48.581285,0.001162,0.000212,-3.048843e-07,0.204596,...,6.131274,6.900039,0.0,0.822965,2.216082,1.614130,1.633984,1.476904,1.788359,6.130927
1994,0.0,0.0,0.0,1.0,238.773140,49.012201,0.001177,0.000213,-3.456736e-07,0.205454,...,6.093716,6.865930,0.0,0.840771,2.265572,1.626232,1.644138,1.475412,1.804707,6.132017
1995,0.0,0.0,0.0,1.0,237.576204,48.554987,0.001163,0.000212,-8.721736e-07,0.204348,...,6.114655,6.874293,0.0,0.884326,2.296218,1.603962,1.628663,1.448932,1.770131,6.202153


In [None]:
# Persist the unfiltered feature matrix to disk.
extracted_features.to_pickle("...")

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

# Reload the feature matrix (X) and the target (y) from disk.
# NOTE(review): unpickling can execute arbitrary code -- only read trusted files.
X, y = pd.read_pickle("..."), pd.read_pickle("...")

# Base estimator used by the recursive elimination.
reg = XGBRegressor()
# 5 folds without shuffling.
# NOTE(review): unshuffled folds are contiguous blocks of samples -- confirm this
# is intended if the samples are stored in a meaningful order.
cv = KFold(n_splits=5)

rfecv = RFECV(
    estimator=reg,
    cv=cv,
    scoring="neg_mean_squared_error",
    step=1,                    # remove one feature per elimination round
    min_features_to_select=1,  # allow elimination down to a single feature
    n_jobs=8,
)
rfecv.fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 36


In [None]:
# Names of the features that RFECV kept.
rfecv.get_feature_names_out()

In [None]:
# Keep only the columns RFECV selected. The transformed data is wrapped in a new
# DataFrame, so pass both the selected column names and X's original index;
# without the index the rows would be renumbered 0..n-1 and could no longer be
# matched to their sample ids (or to y) by label.
X_reduced = pd.DataFrame(
    rfecv.transform(X),
    columns=rfecv.get_feature_names_out(),
    index=X.index,
)

X_reduced

In [None]:
# Persist the reduced feature matrix to disk.
X_reduced.to_pickle("...")