In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from feature_engine.selection import SmartCorrelatedSelection, RecursiveFeatureElimination
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor

In [3]:
data = pd.read_parquet('d:/demand-forecast-SQGroup/data/sales_BYA_v3.parquet')
df = data.copy()

In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
date,2019-08-08 00:00:00,2019-08-10 00:00:00,2019-08-12 00:00:00,2019-08-13 00:00:00,2019-08-14 00:00:00
net_price,5042942.0,2932270.5,1809937.5,3067519.0,5123492.0
qtym,189950.125,130657.28125,79055.164062,122930.046875,211948.609375
date_month,8,8,8,8,8
date_quarter,3,3,3,3,3
date_quarter_start,0,0,0,0,0
date_quarter_end,0,0,0,0,0
date_year_start,0,0,0,0,0
date_year_end,0,0,0,0,0
date_week,32,32,33,33,33


In [5]:
X = df.drop(columns=['net_price', 'qtym', 'date'])
y = df['net_price']

In [6]:
X.shape

(1121, 31)

<center><h1>Feature Selection</h1></center>

In [7]:
def selectBestFeatures(df):
    X = df.drop(columns=['net_price', 'qtym', 'date'])
    y = df['net_price']
    
    scs = SmartCorrelatedSelection(
        variables=None, method='pearson', 
        threshold=0.8, missing_values='ignore', 
        selection_method='model_performance', estimator=Ridge(max_iter=1000), 
        scoring='r2', cv=3, confirm_variables=False
    )
    scs.fit(X, y)
    scs_columns = set(scs.transform(X).columns)
    rfe = RecursiveFeatureElimination(
        DecisionTreeRegressor(max_depth=4), scoring='r2', cv=3, threshold=0.01, 
        variables=None, confirm_variables=False
    )
    rfe.fit(X, y)
    rfe_columns = set(rfe.transform(X).columns)
    return scs_columns.intersection(rfe_columns)

In [8]:
scs = SmartCorrelatedSelection(
    variables=None, method='pearson', 
    threshold=0.8, missing_values='ignore', 
    selection_method='model_performance', estimator=Ridge(max_iter=1000, random_state=33), 
    scoring='r2', cv=3, confirm_variables=False
)
scs.fit(X, y)

In [9]:
scs.transform(X).columns

Index(['date_month', 'date_quarter_start', 'date_quarter_end',
       'date_year_start', 'date_year_end', 'date_day_of_month', 'date_weekend',
       'date_month_start', 'date_month_end', 'net_price_lag_1',
       'net_price_lag_3', 'net_price_lag_7', 'net_price_lag_14',
       'net_price_lag_28', 'net_price_window_7_mean', 'net_price_window_7_std',
       'qtym_expanding_std'],
      dtype='object')

In [10]:
rfe = RecursiveFeatureElimination(
    DecisionTreeRegressor(max_depth=4, random_state=33), scoring='r2', cv=3, threshold=0.01, 
    variables=None, confirm_variables=False
)
rfe.fit(X, y)

In [11]:
rfe.transform(X).columns

Index(['date_quarter', 'date_quarter_start', 'date_quarter_end',
       'date_year_start', 'date_year_end', 'date_week', 'date_day_of_week',
       'date_day_of_year', 'date_weekend', 'date_month_start',
       'date_month_end', 'qtym_lag_1', 'net_price_lag_3', 'qtym_lag_3',
       'net_price_lag_7', 'qtym_lag_7', 'net_price_lag_14', 'qtym_lag_14',
       'net_price_lag_28', 'qtym_lag_28', 'net_price_window_7_std',
       'net_price_window_7_median', 'qtym_window_7_mean', 'qtym_window_7_std',
       'qtym_window_7_median'],
      dtype='object')

In [12]:
selectBestFeatures(df)

{'date_day_of_month',
 'date_month',
 'date_month_end',
 'date_quarter_start',
 'net_price_lag_1',
 'net_price_lag_14',
 'net_price_lag_28',
 'net_price_lag_7',
 'net_price_window_7_mean',
 'net_price_window_7_std',
 'qtym_expanding_std'}

In [13]:
df[list(selectBestFeatures(df))+['net_price', 'date']].to_parquet(
    'd:/demand-forecast-SQGroup/data/saleBYA_bestFeatures.parquet', index=False
)