In [8]:
import pandas as pd

X = pd.read_csv('x_v1.csv')
y = pd.read_csv('target.csv')

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89213 entries, 0 to 89212
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ProcessingTime            89213 non-null  float64
 1   config                    89213 non-null  float64
 2   is_dep_B                  89213 non-null  float64
 3   is_local                  89213 non-null  float64
 4   departure_equals_checkin  89213 non-null  float64
 5   dep_day                   89213 non-null  float64
 6   dep_hour                  89213 non-null  float64
 7   nflights_next_3_hours     89213 non-null  float64
 8   0                         89213 non-null  float64
 9   1                         89213 non-null  float64
 10  2                         89213 non-null  float64
 11  3                         89213 non-null  float64
 12  4                         89213 non-null  float64
 13  5                         89213 non-null  float64
 14  6     

In [9]:
rolls = [5, 15, 30, 60, 120]
columns_to_roll = ['ProcessingTime', 'config', 'is_dep_B', 'is_local', 'departure_equals_checkin']
rolled = [X]

for roll in rolls:
    rolled.append(X[columns_to_roll].rolling(roll).mean().rename(columns={c: c+f'_{roll}' for c in columns_to_roll}))
    rolled.append(y.rolling(roll).mean().rename(columns={'target': f'target_{roll}'}))
    
X = pd.concat(rolled, axis=1)
    
X = X.iloc[max(rolls):]
y = y.iloc[max(rolls):]

In [13]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_percentage_error

tscv = TimeSeriesSplit(n_splits=5)
for train_idx, val_idx in tscv.split(X):
    train_X = X.iloc[train_idx].to_numpy()
    train_y = y.iloc[train_idx].to_numpy().ravel()
    
    val_X = X.iloc[val_idx].to_numpy()
    val_y = y.iloc[val_idx].to_numpy().ravel()
    
    estimator = LinearRegression()
    estimator.fit(train_X, train_y)
    
    print(f'split: {round(len(train_idx)/len(X), 2)}/{round(len(val_idx)/len(X), 2)}, MAPE: {mean_absolute_percentage_error(estimator.predict(val_X), val_y)}')

split: 0.17/0.17, MAPE: 0.14848510839824922
split: 0.33/0.17, MAPE: 0.15125990890421076
split: 0.5/0.17, MAPE: 0.1493815431546566
split: 0.67/0.17, MAPE: 0.14706455932112145
split: 0.83/0.17, MAPE: 0.14859198489593373


In [14]:
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import numpy as np

selector = RFECV(LinearRegression(), step=1, cv=tscv.split(X))
selector.fit(X.to_numpy(), y.to_numpy())

for c, r in zip(X.columns, selector.ranking_):
    if r > 1:
        print(c, r)