In [1]:
import numpy as np
import pandas as pd

import tsfresh
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

## Prepare data

You first have to bring your data into the following form:

| id | time | time series feature 1 (e.g. speed) | time series feature 2 (e.g. force) | time series feature 3 | time series feature 4 |
|----|------|----------|----------|----------|--------|
| 0  | 0    | 0.1      | 0.2    | 0.3      | 0.4    |
| 0  | 1    | 0.2      | 0.3    | 0.4      | 0.5    |
| 0  | 2    | 0.3      | 0.4    | 0.5      | 0.6    |
| ...| ...  | ...      | ...    | ...      | ...    |
| 1  | 0    | 0.2      | 0.3    | 0.4      | 0.5    |
| 1  | 1    | 0.3      | 0.4    | 0.5      | 0.6    |
| 1  | 2    | 0.4      | 0.5    | 0.6      | 0.7    |
| ...| ...  | ...      | ...    | ...      | ...    |
| 298 | 0   | 0.3      | 0.4    | 0.5      | 0.6    |
| 298 | 1   | 0.4      | 0.5    | 0.6      | 0.7    |
| 298 | 2   | 0.5      | 0.6    | 0.7      | 0.8    |
| ...| ...  | ...      | ...    | ...      | ...    |

The dataframe contains all time series for all samples. The samples are identified by the id column.  
The timestamps for the time series are in the time column.  
The features are identified through the respective columns.  

If your time series features have different lengths or sampling times, you can also pass them separately, e.g. as a dictionary of dataframes with one entry per feature (see the link below).

See: https://tsfresh.readthedocs.io/en/latest/text/data_formats.html

The following code is an example for a dataset from a previous project.  
For your use case and data format, you will have to adapt it / write your own code. 

In [None]:
# Load the raw dataset and split it into time series (features) and labels.
filename = './Daten/Dataset_Hauptversuch_outliers_removed.npy'

# NOTE: allow_pickle=True unpickles arbitrary Python objects and can execute
# code while loading -- only use it on dataset files you trust.
multivar_data_np = np.load(filename, allow_pickle=True)
features = multivar_data_np[:, 0]  # column 0: one 2D array (time steps x channels) per sample
labels = multivar_data_np[:, 1]    # column 1: one label record per sample

# The regression target is element 4 of every label record.
angles = pd.Series([row[4] for row in labels], name="Angle")

# Store the labels under the file name that the RFECV cells further down load
# as `y` ("Angles_Hauptversuch.pkl"), so the notebook runs top to bottom.
angles.to_pickle("Angles_Hauptversuch.pkl")

num_samples = features.shape[0]

# Names of the channels in each sample array; channel 0 ('Timestamp') is not
# used as a time series, the remaining four are.
channel_names = ['Timestamp', 'Schleppfehler', 'Motormoment', 'Winkelposition', 'Geschwindigkeit']
# Number of time steps kept per sample (longer samples are truncated to this).
length = 1212

In [None]:
# Assemble one long-format table with one row per (sample, time step).
signal_names = channel_names[1:]  # every channel except the timestamp

all_samples_and_time_series = {
    # Sample id repeated for each of its time steps: [0, 0, 0, ..., 1, 1, 1, ...]
    "Index": [sample_id for sample_id in range(num_samples) for _ in range(length)],
    # Time step counter restarted for every sample: [0, 1, 2, ..., 0, 1, 2, ...]
    "Timestamp": list(range(length)) * num_samples,
}

# One (initially empty) value list per time series channel
all_samples_and_time_series.update({signal: [] for signal in signal_names})

# Append the first `length` values of every channel, sample after sample.
# Column 0 of a sample holds the timestamp, so the signals start at column 1.
for sample in features:
    for column, signal in enumerate(signal_names, start=1):
        all_samples_and_time_series[signal].extend(list(sample[:, column][:length]))

df = pd.DataFrame(all_samples_and_time_series)

In [None]:
# Show the assembled long-format dataframe (pandas abbreviates the display to the first and last rows).
df

Unnamed: 0,Index,Timestamp,Schleppfehler,Motormoment,Winkelposition,Geschwindigkeit
0,0,0,-0.041395,-0.103120,0.040000,-0.244141
1,0,1,-0.041493,-0.089537,0.040000,0.000000
2,0,2,-0.041493,-0.101483,0.040000,0.244141
3,0,3,-0.039878,-0.067971,0.040000,0.491333
4,0,4,-0.034833,-0.063665,0.040000,-0.244141
...,...,...,...,...,...,...
2420359,1996,1207,0.220028,0.482374,59.709999,3.187943
2420360,1996,1208,0.219675,0.503739,59.740002,2.207184
2420361,1996,1209,0.217656,0.482286,59.759998,1.961517
2420362,1996,1210,0.216425,0.483400,59.770000,2.206802


In [None]:
# Summary statistics per column, as a quick sanity check of the value ranges.
df.describe()

Unnamed: 0,Index,Timestamp,Schleppfehler,Motormoment,Winkelposition,Geschwindigkeit
count,2420364.0,2420364.0,2420364.0,2420364.0,2420364.0,2420364.0
mean,998.0,605.5,0.1891558,0.4239702,29.73268,4.931107
std,576.4843,349.8742,0.04009242,0.08840307,17.46529,0.6135825
min,0.0,0.0,-0.04325927,-0.1177569,0.02999999,-1.226807
25%,499.0,302.75,0.176666,0.3951881,14.585,4.660034
50%,998.0,605.5,0.1984691,0.4438265,29.695,4.907227
75%,1497.0,908.25,0.2136876,0.478784,44.8575,5.151367
max,1996.0,1211.0,0.242606,0.5641079,59.79,7.113647


In [None]:
# Optional: uncomment to save the assembled dataframe to disk.
#df.to_pickle("Dataset_Hauptversuch_Dataframe.pkl")

## Feature Extraction

Feature extraction and feature selection with tsfresh.  
Uses the built-in statistical feature selection of tsfresh to determine relevant features.  
tsfresh.extract_relevant_features does the same as extract_features + impute + select_features.  

Extracted features: https://tsfresh.readthedocs.io/en/latest/text/list_of_features.html

In [None]:
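# For reference only (not executed): the manual three-step pipeline that
# extract_relevant_features wraps, for a dataframe `timeseries` in the
# id/time format from the table above and a label series `y`.
#
# extracted_features = extract_features(timeseries, column_id="id", column_sort="time")
#
# impute(extracted_features)
# features_filtered = select_features(extracted_features, y)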

In [None]:
# Extract, impute and statistically filter the features in a single call.
# tsfresh expects the index of `angles` to contain the sample ids that appear
# in the "Index" column. This call took over an hour in the recorded run.
# NOTE: `features` previously held the raw sample arrays; from here on it is
# the selected-feature dataframe.
features = tsfresh.extract_relevant_features(
    df,
    angles,
    column_id="Index",
    column_sort="Timestamp",
)

Feature Extraction: 100%|██████████| 20/20 [1:09:37<00:00, 208.88s/it]


For multivariate time series, the features for each time series are named like so:  
"TimeSeriesName__FeatureName"  
e.g.  
"Schleppfehler__abs_energy", "Motormoment__abs_energy", ...

In [None]:
# Show the features that passed tsfresh's relevance filter (one row per sample id).
features

Unnamed: 0,Schleppfehler__sum_values,Motormoment__index_mass_quantile__q_0.7,"Schleppfehler__fft_coefficient__attr_""imag""__coeff_15","Schleppfehler__fft_coefficient__attr_""imag""__coeff_14","Schleppfehler__fft_coefficient__attr_""imag""__coeff_13","Schleppfehler__fft_coefficient__attr_""imag""__coeff_12","Schleppfehler__fft_coefficient__attr_""imag""__coeff_11","Schleppfehler__fft_coefficient__attr_""imag""__coeff_10","Schleppfehler__fft_coefficient__attr_""imag""__coeff_9","Schleppfehler__fft_coefficient__attr_""imag""__coeff_8",...,"Geschwindigkeit__agg_linear_trend__attr_""stderr""__chunk_len_50__f_agg_""max""","Geschwindigkeit__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.2",Geschwindigkeit__ratio_beyond_r_sigma__r_2,Geschwindigkeit__quantile__q_0.4,Geschwindigkeit__fourier_entropy__bins_100,"Winkelposition__fft_coefficient__attr_""angle""__coeff_61","Geschwindigkeit__fft_coefficient__attr_""angle""__coeff_44","Winkelposition__fft_coefficient__attr_""real""__coeff_80","Motormoment__fft_coefficient__attr_""abs""__coeff_90","Motormoment__fft_coefficient__attr_""imag""__coeff_70"
0,220.322437,0.730198,3.739732,4.323654,4.631744,9.201229,-0.043302,4.625843,4.366958,4.321901,...,0.006821,0.035820,0.023927,4.904175,3.285790,98.958028,161.041567,-29.873948,1.267682,1.742434
1,218.919271,0.730198,3.747181,4.352891,4.681251,9.233408,-0.015549,4.684582,4.371583,4.301454,...,0.005772,0.043346,0.018977,4.904175,3.338978,98.968528,-178.435505,-29.807445,1.583211,1.609254
2,219.675141,0.730198,3.738148,4.350300,4.629616,9.134968,-0.092641,4.545530,4.242234,4.174575,...,0.005193,0.047715,0.014851,4.904175,3.507698,98.951211,145.273494,-29.813241,1.436431,1.811560
3,220.652269,0.729373,3.774863,4.355947,4.659000,9.140078,-0.091475,4.600566,4.294953,4.216055,...,0.006647,0.041993,0.020627,4.904175,3.511007,98.952655,152.776108,-29.914680,1.054469,1.607573
4,220.618642,0.730198,3.763362,4.355907,4.654154,9.151323,-0.080069,4.582855,4.264848,4.260284,...,0.006325,0.050838,0.018977,4.904175,3.646655,98.971978,173.448737,-29.805879,1.923010,1.506088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992,240.123393,0.721122,4.239995,4.865204,5.314487,9.397074,1.610787,4.734154,4.728765,4.734880,...,0.005518,0.046649,0.016502,4.904175,3.389938,98.894496,152.224370,-29.890266,1.333948,1.863885
1993,237.646545,0.722772,4.161695,4.806492,5.231058,9.479094,1.763492,4.922195,4.902022,4.958574,...,0.006339,0.046157,0.016502,4.904175,3.561220,98.954399,172.302901,-29.845336,1.377145,1.487392
1994,238.773140,0.721947,4.199249,4.771502,5.304669,9.422395,1.681399,4.838089,4.850566,4.872793,...,0.007167,0.047570,0.017327,4.904175,3.385679,98.905753,154.803597,-29.780302,1.157336,1.667968
1995,237.576204,0.721122,4.115820,4.761524,5.281440,9.477665,1.781677,4.929700,4.965742,5.039469,...,0.008450,0.032032,0.022277,4.904175,3.513251,98.927842,169.767420,-29.748130,1.241793,1.688521


In [None]:
# Save the selected features; the Recursive Feature Elimination section reloads this file as X.
features.to_pickle("Dataset_Hauptversuch_Features.pkl")

## Recursive Feature Elimination
Recursive Feature Elimination with cross-validation (RFECV) tries to eliminate features without a loss in performance.  
This allows for significant reduction of the large dataframe (sometimes >1000 features). 

Currently, XGBoost is used as the base model for RFECV. You can swap in any other scikit-learn-compatible estimator that exposes `coef_` or `feature_importances_`.

In [None]:
# Feature dataframe produced by the previous (feature extraction) step
X = pd.read_pickle("Dataset_Hauptversuch_Features.pkl")
# Labels: the target variable values (a regression target in this example)
y = pd.read_pickle("Angles_Hauptversuch.pkl")

# Features and labels side by side in a single frame
data = pd.concat((X, y), axis=1)

RFECV with XGBoost and 5-fold cross-validation.

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

# Base model that ranks the features, and the 5-fold splitter used to score
# every candidate number of features.
reg = XGBRegressor()
cv = KFold(n_splits=5)

# Remove one feature per iteration, down to a single feature if necessary,
# scoring each feature count by cross-validated negative mean squared error.
rfecv = RFECV(
    reg,
    cv=cv,
    scoring="neg_mean_squared_error",
    step=1,
    min_features_to_select=1,
    n_jobs=8,
).fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 118


The remaining features:

In [None]:
# Names of the features that RFECV kept.
rfecv.get_feature_names_out()

array(['Schleppfehler__sum_values',
       'Schleppfehler__fft_coefficient__attr_"imag"__coeff_15',
       'Schleppfehler__fft_coefficient__attr_"imag"__coeff_53',
       'Schleppfehler__fft_coefficient__attr_"real"__coeff_22',
       'Schleppfehler__fft_coefficient__attr_"real"__coeff_4',
       'Schleppfehler__fft_coefficient__attr_"abs"__coeff_13',
       'Schleppfehler__fft_coefficient__attr_"abs"__coeff_4',
       'Winkelposition__energy_ratio_by_chunks__num_segments_10__segment_focus_1',
       'Winkelposition__fft_coefficient__attr_"imag"__coeff_11',
       'Schleppfehler__quantile__q_0.1',
       'Winkelposition__energy_ratio_by_chunks__num_segments_10__segment_focus_0',
       'Winkelposition__kurtosis',
       'Schleppfehler__fft_coefficient__attr_"imag"__coeff_42',
       'Schleppfehler__fft_coefficient__attr_"real"__coeff_67',
       'Schleppfehler__cwt_coefficients__coeff_4__w_10__widths_(2, 5, 10, 20)',
       'Schleppfehler__fft_coefficient__attr_"imag"__coeff_3',
      

Reduce the dataframe to the remaining features:

In [None]:
# Keep only the columns RFECV selected. The transformed data is wrapped in a
# new dataframe, so the column names and the sample ids (the index of X) are
# re-attached explicitly; without `index=X.index` the ids would be replaced
# by a fresh 0..n-1 range and could no longer be matched to the labels when
# the ids are not already 0..n-1.
X_reduced = pd.DataFrame(
    rfecv.transform(X),
    columns=rfecv.get_feature_names_out(),
    index=X.index,
)

X_reduced

Unnamed: 0,Schleppfehler__sum_values,"Schleppfehler__fft_coefficient__attr_""imag""__coeff_15","Schleppfehler__fft_coefficient__attr_""imag""__coeff_53","Schleppfehler__fft_coefficient__attr_""real""__coeff_22","Schleppfehler__fft_coefficient__attr_""real""__coeff_4","Schleppfehler__fft_coefficient__attr_""abs""__coeff_13","Schleppfehler__fft_coefficient__attr_""abs""__coeff_4",Winkelposition__energy_ratio_by_chunks__num_segments_10__segment_focus_1,"Winkelposition__fft_coefficient__attr_""imag""__coeff_11",Schleppfehler__quantile__q_0.1,...,Schleppfehler__mean_second_derivative_central,Geschwindigkeit__autocorrelation__lag_9,"Motormoment__fft_coefficient__attr_""imag""__coeff_91","Geschwindigkeit__fft_coefficient__attr_""angle""__coeff_85","Geschwindigkeit__fft_coefficient__attr_""imag""__coeff_71","Geschwindigkeit__cwt_coefficients__coeff_0__w_10__widths_(2, 5, 10, 20)","Motormoment__change_quantiles__f_agg_""mean""__isabs_False__qh_0.8__ql_0.2","Geschwindigkeit__change_quantiles__f_agg_""var""__isabs_True__qh_0.4__ql_0.0","Schleppfehler__fft_coefficient__attr_""angle""__coeff_74",Geschwindigkeit__kurtosis
0,220.322437,3.739732,1.536082,-1.553956,-4.002099,5.028484,7.005428,0.006540,1062.098422,0.140627,...,-6.698347e-07,0.112367,1.083625,-36.499264,0.573182,-7.095395,0.000244,0.072657,104.221254,22.687857
1,218.919271,3.747181,1.541711,-1.553590,-4.066590,5.069325,7.085139,0.006541,1062.014035,0.139263,...,-2.647107e-07,0.079811,1.441080,96.929212,-6.010625,-7.147979,-0.000002,0.084615,103.628748,23.320410
2,219.675141,3.738148,1.539704,-1.584206,-3.956770,5.066454,6.946554,0.006541,1062.054868,0.140256,...,-3.860992e-07,0.065809,1.321183,97.433542,-7.480029,-7.333935,0.000008,0.095779,102.373440,22.264500
3,220.652269,3.774863,1.549263,-1.600499,-3.960379,5.086245,7.018991,0.006540,1062.107663,0.140656,...,-2.641488e-07,0.097781,1.075142,81.253547,-1.213854,-7.224966,0.000220,0.070947,101.899117,24.284135
4,220.618642,3.763362,1.545906,-1.598986,-3.958110,5.076282,6.973022,0.006540,1062.070756,0.141401,...,-3.456116e-07,0.066972,1.014918,75.422208,-11.573503,-7.475037,0.000002,0.082197,102.650938,24.000508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992,240.123393,4.239995,1.325320,-1.095709,-5.093910,5.610124,7.592647,0.006521,1060.325034,0.163819,...,-5.480785e-07,0.052190,1.275931,41.899492,13.253326,-7.521428,-0.000006,0.070621,102.042312,22.184097
1993,237.646545,4.161695,1.302196,-1.126151,-5.212160,5.460003,7.814579,0.006524,1060.304887,0.162158,...,-3.048843e-07,0.064291,1.472559,-10.745621,-2.277243,-7.257869,0.000127,0.083441,101.998330,25.122022
1994,238.773140,4.199249,1.307504,-1.123663,-5.146743,5.549681,7.745723,0.006522,1060.248799,0.163094,...,-3.456736e-07,0.069810,1.044994,155.842305,17.755133,-7.155336,0.000062,0.066044,101.996509,23.408470
1995,237.576204,4.115820,1.297624,-1.147701,-5.239736,5.489109,7.928654,0.006523,1060.252533,0.162408,...,-8.721736e-07,0.075891,1.393384,54.424198,18.102410,-7.090682,0.000083,0.062457,102.104924,24.662490


In [None]:
# Save the RFECV-reduced feature dataframe.
X_reduced.to_pickle("Dataset_Hauptversuch_Features_Reduced.pkl")

## Feature Extraction without Selection

tsfresh.extract_relevant_features also includes feature selection through statistical tests.  
Here, feature selection is not used, and all features are extracted.  
Then, only Recursive Feature Elimination is used for feature selection.  
This keeps every extracted feature available as a candidate for RFECV, which can work better for some datasets and use cases. 

In [None]:
from tsfresh.feature_extraction import EfficientFCParameters

# Feature calculator settings: tsfresh's "efficient" preset
settings = EfficientFCParameters()

# Extract all features of the preset for every sample and channel,
# without any relevance filtering.
extracted_features = extract_features(
    df,
    column_id="Index",
    column_sort="Timestamp",
    default_fc_parameters=settings,
    n_jobs=8,
)

# impute() replaces non-finite values in place; the frame it returns is
# displayed as the cell output.
impute(extracted_features)

Feature Extraction: 100%|██████████| 40/40 [12:10<00:00, 18.26s/it]  
 'Motormoment__query_similarity_count__query_None__threshold_0.0'
 'Winkelposition__query_similarity_count__query_None__threshold_0.0'
 'Geschwindigkeit__friedrich_coefficients__coeff_0__m_3__r_30'
 'Geschwindigkeit__friedrich_coefficients__coeff_1__m_3__r_30'
 'Geschwindigkeit__friedrich_coefficients__coeff_2__m_3__r_30'
 'Geschwindigkeit__friedrich_coefficients__coeff_3__m_3__r_30'
 'Geschwindigkeit__max_langevin_fixed_point__m_3__r_30'
 'Geschwindigkeit__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


Unnamed: 0,Schleppfehler__variance_larger_than_standard_deviation,Schleppfehler__has_duplicate_max,Schleppfehler__has_duplicate_min,Schleppfehler__has_duplicate,Schleppfehler__sum_values,Schleppfehler__abs_energy,Schleppfehler__mean_abs_change,Schleppfehler__mean_change,Schleppfehler__mean_second_derivative_central,Schleppfehler__median,...,Geschwindigkeit__permutation_entropy__dimension_6__tau_1,Geschwindigkeit__permutation_entropy__dimension_7__tau_1,Geschwindigkeit__query_similarity_count__query_None__threshold_0.0,"Geschwindigkeit__matrix_profile__feature_""min""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""max""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""mean""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""median""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""25""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""75""__threshold_0.98",Geschwindigkeit__mean_n_absolute_max__number_of_maxima_7
0,0.0,0.0,1.0,1.0,220.322437,41.918715,0.001148,0.000203,-6.698347e-07,0.191911,...,6.085058,6.825900,0.0,0.819862,2.195672,1.608663,1.631462,1.449151,1.777508,6.164987
1,0.0,0.0,1.0,1.0,218.919271,41.421810,0.001158,0.000203,-2.647107e-07,0.190624,...,6.099968,6.864404,0.0,1.062027,2.745572,2.061609,2.085122,1.889623,2.256906,6.207711
2,0.0,0.0,0.0,1.0,219.675141,41.665743,0.001160,0.000204,-3.860992e-07,0.191287,...,6.057718,6.877742,0.0,1.225015,2.752575,2.073991,2.109529,1.913445,2.267468,6.167603
3,0.0,0.0,1.0,1.0,220.652269,42.045407,0.001153,0.000205,-2.641488e-07,0.192001,...,6.098744,6.871994,0.0,0.652773,2.207524,1.596216,1.606932,1.436306,1.780856,6.205096
4,0.0,0.0,0.0,1.0,220.618642,42.016243,0.001158,0.000203,-3.456116e-07,0.191693,...,6.079461,6.841240,0.0,0.873887,2.271513,1.609895,1.619917,1.449892,1.780274,6.270994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992,0.0,0.0,0.0,1.0,240.123393,49.518120,0.001172,0.000214,-5.480785e-07,0.206122,...,6.144284,6.882001,0.0,0.736885,2.272050,1.623104,1.649816,1.479585,1.785721,6.165723
1993,0.0,0.0,1.0,1.0,237.646545,48.581285,0.001162,0.000212,-3.048843e-07,0.204596,...,6.131274,6.900039,0.0,0.822965,2.216082,1.614130,1.633984,1.476904,1.788359,6.130927
1994,0.0,0.0,0.0,1.0,238.773140,49.012201,0.001177,0.000213,-3.456736e-07,0.205454,...,6.093716,6.865930,0.0,0.840771,2.265572,1.626232,1.644138,1.475412,1.804707,6.132017
1995,0.0,0.0,0.0,1.0,237.576204,48.554987,0.001163,0.000212,-8.721736e-07,0.204348,...,6.114655,6.874293,0.0,0.884326,2.296218,1.603962,1.628663,1.448932,1.770131,6.202153


In [None]:
# Show the full feature dataframe after imputation.
extracted_features

Unnamed: 0,Schleppfehler__variance_larger_than_standard_deviation,Schleppfehler__has_duplicate_max,Schleppfehler__has_duplicate_min,Schleppfehler__has_duplicate,Schleppfehler__sum_values,Schleppfehler__abs_energy,Schleppfehler__mean_abs_change,Schleppfehler__mean_change,Schleppfehler__mean_second_derivative_central,Schleppfehler__median,...,Geschwindigkeit__permutation_entropy__dimension_6__tau_1,Geschwindigkeit__permutation_entropy__dimension_7__tau_1,Geschwindigkeit__query_similarity_count__query_None__threshold_0.0,"Geschwindigkeit__matrix_profile__feature_""min""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""max""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""mean""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""median""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""25""__threshold_0.98","Geschwindigkeit__matrix_profile__feature_""75""__threshold_0.98",Geschwindigkeit__mean_n_absolute_max__number_of_maxima_7
0,0.0,0.0,1.0,1.0,220.322437,41.918715,0.001148,0.000203,-6.698347e-07,0.191911,...,6.085058,6.825900,0.0,0.819862,2.195672,1.608663,1.631462,1.449151,1.777508,6.164987
1,0.0,0.0,1.0,1.0,218.919271,41.421810,0.001158,0.000203,-2.647107e-07,0.190624,...,6.099968,6.864404,0.0,1.062027,2.745572,2.061609,2.085122,1.889623,2.256906,6.207711
2,0.0,0.0,0.0,1.0,219.675141,41.665743,0.001160,0.000204,-3.860992e-07,0.191287,...,6.057718,6.877742,0.0,1.225015,2.752575,2.073991,2.109529,1.913445,2.267468,6.167603
3,0.0,0.0,1.0,1.0,220.652269,42.045407,0.001153,0.000205,-2.641488e-07,0.192001,...,6.098744,6.871994,0.0,0.652773,2.207524,1.596216,1.606932,1.436306,1.780856,6.205096
4,0.0,0.0,0.0,1.0,220.618642,42.016243,0.001158,0.000203,-3.456116e-07,0.191693,...,6.079461,6.841240,0.0,0.873887,2.271513,1.609895,1.619917,1.449892,1.780274,6.270994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992,0.0,0.0,0.0,1.0,240.123393,49.518120,0.001172,0.000214,-5.480785e-07,0.206122,...,6.144284,6.882001,0.0,0.736885,2.272050,1.623104,1.649816,1.479585,1.785721,6.165723
1993,0.0,0.0,1.0,1.0,237.646545,48.581285,0.001162,0.000212,-3.048843e-07,0.204596,...,6.131274,6.900039,0.0,0.822965,2.216082,1.614130,1.633984,1.476904,1.788359,6.130927
1994,0.0,0.0,0.0,1.0,238.773140,49.012201,0.001177,0.000213,-3.456736e-07,0.205454,...,6.093716,6.865930,0.0,0.840771,2.265572,1.626232,1.644138,1.475412,1.804707,6.132017
1995,0.0,0.0,0.0,1.0,237.576204,48.554987,0.001163,0.000212,-8.721736e-07,0.204348,...,6.114655,6.874293,0.0,0.884326,2.296218,1.603962,1.628663,1.448932,1.770131,6.202153


In [None]:
# Save the full (unfiltered) feature dataframe; the next cell reloads this file as X.
extracted_features.to_pickle("Dataset_Hauptversuch_Features_Large.pkl")

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

# Full (unfiltered) feature dataframe saved above, plus the labels
X = pd.read_pickle("Dataset_Hauptversuch_Features_Large.pkl")
y = pd.read_pickle("Angles_Hauptversuch.pkl")

# Base model that ranks the features, and the 5-fold splitter used to score
# every candidate number of features.
reg = XGBRegressor()
cv = KFold(n_splits=5)

# Remove one feature per iteration, down to a single feature if necessary,
# scoring each feature count by cross-validated negative mean squared error.
rfecv = RFECV(
    reg,
    cv=cv,
    scoring="neg_mean_squared_error",
    step=1,
    min_features_to_select=1,
    n_jobs=8,
).fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 36


In [None]:
# Names of the features that RFECV kept from the full feature set.
rfecv.get_feature_names_out()

array(['Schleppfehler__sum_values', 'Schleppfehler__quantile__q_0.1',
       'Schleppfehler__fft_coefficient__attr_"real"__coeff_4',
       'Schleppfehler__fft_coefficient__attr_"real"__coeff_22',
       'Schleppfehler__fft_coefficient__attr_"real"__coeff_67',
       'Schleppfehler__fft_coefficient__attr_"imag"__coeff_15',
       'Schleppfehler__fft_coefficient__attr_"imag"__coeff_26',
       'Schleppfehler__fft_coefficient__attr_"imag"__coeff_33',
       'Schleppfehler__fft_coefficient__attr_"imag"__coeff_42',
       'Schleppfehler__fft_coefficient__attr_"imag"__coeff_53',
       'Schleppfehler__fft_coefficient__attr_"abs"__coeff_4',
       'Schleppfehler__fft_coefficient__attr_"abs"__coeff_13',
       'Schleppfehler__fft_coefficient__attr_"abs"__coeff_43',
       'Schleppfehler__fft_coefficient__attr_"abs"__coeff_69',
       'Schleppfehler__fft_coefficient__attr_"abs"__coeff_86',
       'Schleppfehler__fft_coefficient__attr_"angle"__coeff_2',
       'Schleppfehler__fft_coefficient__a

In [None]:
# Keep only the columns RFECV selected. The transformed data is wrapped in a
# new dataframe, so the column names and the sample ids (the index of X) are
# re-attached explicitly; without `index=X.index` the ids would be replaced
# by a fresh 0..n-1 range and could no longer be matched to the labels when
# the ids are not already 0..n-1.
X_reduced = pd.DataFrame(
    rfecv.transform(X),
    columns=rfecv.get_feature_names_out(),
    index=X.index,
)

X_reduced

Unnamed: 0,Schleppfehler__sum_values,Schleppfehler__quantile__q_0.1,"Schleppfehler__fft_coefficient__attr_""real""__coeff_4","Schleppfehler__fft_coefficient__attr_""real""__coeff_22","Schleppfehler__fft_coefficient__attr_""real""__coeff_67","Schleppfehler__fft_coefficient__attr_""imag""__coeff_15","Schleppfehler__fft_coefficient__attr_""imag""__coeff_26","Schleppfehler__fft_coefficient__attr_""imag""__coeff_33","Schleppfehler__fft_coefficient__attr_""imag""__coeff_42","Schleppfehler__fft_coefficient__attr_""imag""__coeff_53",...,"Winkelposition__fft_coefficient__attr_""real""__coeff_21","Winkelposition__fft_coefficient__attr_""imag""__coeff_11","Winkelposition__fft_coefficient__attr_""imag""__coeff_50","Winkelposition__fft_coefficient__attr_""abs""__coeff_34","Winkelposition__fft_coefficient__attr_""abs""__coeff_36","Winkelposition__fft_coefficient__attr_""angle""__coeff_92",Winkelposition__energy_ratio_by_chunks__num_segments_10__segment_focus_1,"Geschwindigkeit__change_quantiles__f_agg_""var""__isabs_True__qh_0.4__ql_0.0","Geschwindigkeit__fft_coefficient__attr_""real""__coeff_42",Geschwindigkeit__number_crossing_m__m_0
0,220.322437,0.140627,-4.002099,-1.553956,-0.203245,3.739732,1.099135,1.120004,1.999112,1.536082,...,-31.354995,1062.098422,230.341289,342.052817,322.961328,103.703200,0.006540,0.072657,-12.566426,3.0
1,218.919271,0.139263,-4.066590,-1.553590,-0.194408,3.747181,1.056174,1.098998,2.004950,1.541711,...,-31.450564,1062.014035,230.260515,342.054086,322.893801,103.682017,0.006541,0.084615,-37.857860,1.0
2,219.675141,0.140256,-3.956770,-1.584206,-0.204148,3.738148,1.182929,1.136787,1.973232,1.539704,...,-31.284089,1062.054868,230.214631,342.067975,322.958112,103.666182,0.006541,0.095779,-18.981054,1.0
3,220.652269,0.140656,-3.960379,-1.600499,-0.189124,3.774863,1.167669,1.131747,1.983689,1.549263,...,-31.294631,1062.107663,230.316003,341.975572,322.980553,103.687399,0.006540,0.070947,-4.851996,2.0
4,220.618642,0.141401,-3.958110,-1.598986,-0.187481,3.763362,1.161987,1.132210,1.992836,1.545906,...,-31.266391,1062.070756,230.304630,341.993472,323.094710,103.708282,0.006540,0.082197,-29.725162,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992,240.123393,0.163819,-5.093910,-1.095709,-0.239107,4.239995,1.127529,1.102081,2.263185,1.325320,...,-31.591555,1060.325034,230.225358,341.871633,322.611443,103.655188,0.006521,0.070621,-22.582938,3.0
1993,237.646545,0.162158,-5.212160,-1.126151,-0.238043,4.161695,1.030785,1.152905,2.307806,1.302196,...,-31.707649,1060.304887,230.493840,341.915005,322.533807,103.632283,0.006524,0.083441,-23.795427,4.0
1994,238.773140,0.163094,-5.146743,-1.123663,-0.248571,4.199249,1.058006,1.138798,2.301979,1.307504,...,-31.749943,1060.248799,230.353043,341.956415,322.655063,103.610207,0.006522,0.066044,-20.808860,4.0
1995,237.576204,0.162408,-5.239736,-1.147701,-0.242561,4.115820,0.999828,1.169032,2.285854,1.297624,...,-31.742932,1060.252533,230.419158,341.930063,322.569540,103.590841,0.006523,0.062457,-2.010698,4.0


In [None]:
# Save the RFECV-reduced version of the full feature dataframe.
X_reduced.to_pickle("Dataset_Hauptversuch_Features_Large_Reduced.pkl")