# Filter Methods - Basics


In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import VarianceThreshold
import pickle

### GLOBAL VARIABLES

In [2]:
# Where the pickled input frames are read from and where the selected
# feature list is written to.
INPUT_PATH = '../../data/train_test'
OUTPUT_PATH = '../../data/features'
OUTPUT_FILE_NAME = 'filter_basic_features_selected_v001'

# Presumably the names of the day-label and calendar-date columns - confirm
# against the input frames.
DAY_COL = 'd'
DATE_COL = 'date'

N_SPLITS = 3  # number of folds
DAYS_PRED = 28

# Day cutoff: 1941 minus two years (2 * 365 = 730 days). Per the original note
# this keeps only two years of training data, from 2014-05-23 to 2016-05-24.
D_THRESH = 1941 - 2 * 365

SEED = 87

### FUNCTIONS

In [3]:
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` to shrink its memory footprint.

    Columns returned by ``select_dtypes(include=["int"])`` are passed through
    ``pd.to_numeric(..., downcast="integer")`` and columns returned by
    ``select_dtypes(include=["float"])`` through ``downcast="float"``.
    Every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned, so the caller's object
        is modified.
    verbose : bool, default False
        When true, print the final size in Mb and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, which lets the function be used with ``.pipe``.
    """
    bytes_per_mb = 1024 ** 2
    mem_before = df.memory_usage().sum() / bytes_per_mb

    # (columns, pd.to_numeric downcast target). Both column sets are resolved
    # up front, before any column is converted.
    plan = [
        (df.select_dtypes(include=[selector]).columns, target)
        for selector, target in (("int", "integer"), ("float", "float"))
    ]
    for columns, target in plan:
        for name in columns:
            df[name] = pd.to_numeric(df[name], downcast=target)

    mem_after = df.memory_usage().sum() / bytes_per_mb
    if not verbose:
        return df

    saved_pct = 100 * (mem_before - mem_after) / mem_before
    print(
        "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
            mem_after, saved_pct
        )
    )
    return df

### LOAD DATASET

In [4]:
# Load the pickled training and validation frames and downcast their numeric
# columns with reduce_mem_usage. The validation pickle (X_val.pkl) is bound to
# the name X_test for the rest of the notebook.
# NOTE(review): the original comment called this the Santander customer
# satisfaction dataset, but the columns shown later (item_id, store_id,
# sell_price, snap_*) look like M5-style retail sales data - confirm the source.

X_train = pd.read_pickle(f'{INPUT_PATH}/X_train.pkl').pipe(reduce_mem_usage)
X_test  = pd.read_pickle(f'{INPUT_PATH}/X_val.pkl').pipe(reduce_mem_usage)

In [5]:
# Row and column counts of both splits right after loading.
X_train.shape, X_test.shape

((20580750, 124), (853720, 124))

### SAMPLE DATASET

In [6]:
# Keep a reproducible 30% sample of the rows of every (item_id, store_id) group.
# group_keys=False preserves the original row index; the default would prepend
# the group keys as extra index levels, so 'item_id' and 'store_id' would exist
# both as index levels and as columns.
X_train = X_train.groupby(['item_id', 'store_id'], group_keys=False).apply(
    lambda grp: grp.sample(frac=.3, random_state=SEED)
)

In [7]:
# Keep a reproducible 30% sample of the rows of every (item_id, store_id) group.
# group_keys=False preserves the original row index; the default would prepend
# the group keys as extra index levels, so 'item_id' and 'store_id' would exist
# both as index levels and as columns (and X_test's index is never reset later).
X_test = X_test.groupby(['item_id', 'store_id'], group_keys=False).apply(
    lambda grp: grp.sample(frac=.3, random_state=SEED)
)

In [8]:
# Shapes after the per-group 30% sampling: rows shrink, columns are unchanged.
X_train.shape, X_test.shape

((6158980, 124), (243920, 124))

In [9]:
# Candidate features: every X_train column whose dtype is not datetime or object.
features = X_train.select_dtypes(exclude=['datetime','object']).columns.tolist()

In [10]:
# Snapshot of the initial candidate features, used at the end to report how
# many were removed. Copy the name list directly; selecting X_train[features]
# would copy the whole frame only to read back the same names.
features_init = list(features)

In [11]:
# Remove constant features: candidate columns whose standard deviation is
# exactly 0. Iterate over the `features` name list directly;
# `X_train[features].columns` would copy the whole frame only to list the
# same names.
constant_features = [feat for feat in features if X_train[feat].std() == 0]

# Drop the same columns from both splits so they keep an identical schema.
# Rebinding (instead of inplace=True) keeps the cell free of in-place mutation.
X_train = X_train.drop(columns=constant_features)
X_test = X_test.drop(columns=constant_features)

X_train.shape, X_test.shape

((6158980, 124), (243920, 124))

In [11]:
# Recompute the candidate feature list from the current X_train columns
# (constant columns, if any were found, have been dropped by now).
features = X_train.select_dtypes(exclude=['datetime','object']).columns.tolist()

In [14]:
# Discard the current index of X_train and renumber the rows 0..n-1, in place.
# Only X_train is reset; X_test keeps whatever index it has at this point.
X_train.reset_index(drop=True,inplace=True)

In [16]:
# Turn +inf / -inf into NaN in X_train (in place) before fitting the variance
# filter below. X_test is not modified here.
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)

In [17]:
# Remove quasi-constant features.
# VarianceThreshold keeps only the features whose training variance is strictly
# greater than `threshold`. For a binary 0/1 feature the variance is
# p * (1 - p), so 0.01 corresponds to roughly 99% of the observations sharing
# the same value; for other features the cut-off depends on the feature's scale.
sel = VarianceThreshold(threshold=0.01)

sel.fit(X_train[features])  # fit computes the per-feature variances

sel.get_support().sum()  # how many features are NOT quasi-constant?

  self.variances_ = np.nanvar(X, axis=0)
  (self.variances_ <= self.threshold)):
  return self.variances_ > self.threshold


117

In [18]:
# Names of the features that passed the variance filter. The boolean mask from
# get_support() follows the column order used in fit (the `features` list), so
# index the names directly instead of copying X_train[features] to read its
# column labels.
features_to_keep = pd.Index(features)[sel.get_support()]

In [19]:
# Keep only the columns that passed the variance filter, in both splits.
# Any column that is not in features_to_keep is dropped from X_train and
# X_test at this point.
X_train = X_train[features_to_keep]
X_test  = X_test[features_to_keep]

X_train.shape, X_test.shape

((6158980, 117), (243920, 117))

In [None]:
# Check for duplicated features in the training set.
# Two columns are duplicates when Series.equals is true for them. Each
# duplicate is recorded exactly once: the later column of a matching pair is
# flagged, and flagged columns are skipped afterwards, both as the reference
# column and as a candidate. Skipping is safe because equality is transitive:
# anything equal to a flagged column is also equal to the earlier column it
# duplicates and was found (or will be found) through that one.
duplicated_feat = []
already_flagged = set()
all_columns = X_train.columns

for i, col_1 in enumerate(all_columns):
    if i % 10 == 0:  # coarse progress indicator for the slow pairwise scan
        print(i)

    if col_1 in already_flagged:
        continue

    for col_2 in all_columns[i + 1:]:
        if col_2 in already_flagged:
            continue
        if X_train[col_1].equals(X_train[col_2]):
            duplicated_feat.append(col_2)
            already_flagged.add(col_2)

len(duplicated_feat)

0
10


In [None]:
# Inspect the names of the columns flagged as duplicates.
duplicated_feat

In [None]:
# Drop the duplicated columns from both splits so they keep the same schema.
# Both frames were produced by column selection; rebinding the result instead
# of using inplace=True avoids mutating such a selection in place (which may
# raise SettingWithCopyWarning on some pandas versions) and keeps the cell
# safe to re-read.
X_train = X_train.drop(columns=duplicated_feat)
X_test = X_test.drop(columns=duplicated_feat)

X_train.shape, X_test.shape

In [None]:
# Report how many of the initial candidate features were filtered out and how
# many columns remain in X_train.
n_final = X_train.shape[1]
n_removed = len(features_init) - n_final
print(n_removed, " were removed. The number of final features is ", n_final)

In [None]:
# Final selected feature names, in X_train column order.
features_final = X_train.columns.tolist()

In [None]:
# Persist the selected feature names as a .npy file so they can be reloaded
# with np.load.
np.save(f'{OUTPUT_PATH}/{OUTPUT_FILE_NAME}.npy',features_final)

In [3]:
# Reload the saved feature names; np.load returns them as a NumPy array.
# NOTE(review): the execution count restarts here, so this cell was presumably
# run in a fresh kernel - it needs numpy, OUTPUT_PATH and OUTPUT_FILE_NAME to
# be defined first.
features_final = np.load(f'{OUTPUT_PATH}/{OUTPUT_FILE_NAME}.npy')

In [4]:
# Display the reloaded feature names.
features_final

array(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'wm_yr_wk',
       'event_name_1', 'event_type_1', 'snap_CA', 'snap_TX', 'snap_WI',
       'sell_price', 'demand_smoothed_shift_t28',
       'demand_smoothed_shift_t29', 'demand_smoothed_shift_t30',
       'demand_smoothed_shift_t31', 'demand_smoothed_shift_t32',
       'demand_smoothed_shift_t33', 'demand_smoothed_rolling_std_t5',
       'demand_smoothed_rolling_std_t10',
       'demand_smoothed_rolling_std_t30',
       'demand_smoothed_rolling_std_t70',
       'demand_smoothed_rolling_std_t90',
       'demand_smoothed_rolling_std_t120',
       'demand_smoothed_rolling_std_t180',
       'demand_smoothed_rolling_mean_t5',
       'demand_smoothed_rolling_mean_t10',
       'demand_smoothed_rolling_mean_t30',
       'demand_smoothed_rolling_mean_t70',
       'demand_smoothed_rolling_mean_t90',
       'demand_smoothed_rolling_mean_t120',
       'demand_smoothed_rolling_mean_t180',
       'demand_smoothed_rolling_acum_mean_t5',
