# Filter Methods - Basics


In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import VarianceThreshold
import pickle

### GLOBAL VARIABLES

In [2]:
# --- I/O locations ---
INPUT_PATH = '../../data/train_test'   # folder the pickled X_train / X_val frames are read from
OUTPUT_PATH = '../../data/features'    # folder the selected-feature list is saved to
OUTPUT_FILE_NAME = 'filter_basic_features_selected_v001'  # file stem; '.npy' is appended when saving

# --- Column names (not referenced by the cells of this notebook shown here) ---
DAY_COL = 'd'
DATE_COL = 'date'

# --- Time window / validation settings ---
N_SPLITS = 3    # number of folds
DAYS_PRED = 28  # presumably the forecast horizon in days -- TODO confirm
# Day-index cutoff that leaves roughly the last two years of training data
# (original note: 2014-05-23 to 2016-05-24). 1941 is presumably the last
# available day index -- TODO confirm.
D_THRESH = 1941 - 2 * 365

# --- Reproducibility ---
SEED = 47       # random_state used when sampling rows

### FUNCTIONS

In [3]:
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Columns selected by ``select_dtypes(include=["int"])`` are passed through
    ``pd.to_numeric(..., downcast="integer")`` and columns selected by
    ``select_dtypes(include=["float"])`` through ``downcast="float"``.
    Other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned on the object passed in,
        so the caller's frame is modified.
    verbose : bool, default False
        When True, print the final memory footprint (in Mb) and the
        percentage saved relative to the starting footprint.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the function can be used in
        ``DataFrame.pipe`` chains.
    """
    bytes_per_mb = 1024 ** 2
    mem_before = df.memory_usage().sum() / bytes_per_mb

    # Resolve both column groups before touching any column, then apply the
    # matching downcast mode to each group in turn (integers first).
    downcast_plan = (
        (df.select_dtypes(include=["int"]).columns, "integer"),
        (df.select_dtypes(include=["float"]).columns, "float"),
    )
    for column_names, mode in downcast_plan:
        for name in column_names:
            df[name] = pd.to_numeric(df[name], downcast=mode)

    mem_after = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        saved_pct = 100 * (mem_before - mem_after) / mem_before
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                mem_after, saved_pct
            )
        )
    return df

### LOAD DATASET

In [4]:
# Load the pickled training and validation frames from INPUT_PATH and downcast
# their numeric columns straight away. The frame named X_test is read from
# X_val.pkl.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# come from a trusted pipeline.
X_train = reduce_mem_usage(pd.read_pickle(f'{INPUT_PATH}/X_train.pkl'))
X_test = reduce_mem_usage(pd.read_pickle(f'{INPUT_PATH}/X_val.pkl'))

In [5]:
# (rows, columns) of the two frames right after loading
X_train.shape, X_test.shape

((29973650, 47), (853720, 47))

### SAMPLE DATASET

In [6]:
# Keep a 30% random sample of the rows inside every (item_id, store_id) group
# so the checks below run on a smaller frame.
# GroupBy.sample performs the per-group draw natively: no Python lambda is
# called per group, the grouping columns stay in the frame (newer pandas
# releases deprecate apply() operating on them), and the original row index is
# kept instead of gaining the group keys as extra index levels.
# The specific rows drawn differ from an apply()-based draw because a single
# random stream is shared across groups; per-group sample sizes should be
# unchanged.
X_train = X_train.groupby(['item_id', 'store_id']).sample(frac=.3, random_state=SEED)

In [7]:
# Same 30% per-(item_id, store_id) sampling for the frame named X_test.
# GroupBy.sample is used rather than apply() with a lambda: it keeps the
# grouping columns and the original row index and avoids a Python call per
# group. The rows drawn differ from an apply()-based draw (one random stream
# is shared across groups); per-group sample sizes should be unchanged.
X_test = X_test.groupby(['item_id', 'store_id']).sample(frac=.3, random_state=SEED)

In [8]:
# (rows, columns) of the two frames after the 30% per-group sampling
X_train.shape, X_test.shape

((8994550, 47), (243920, 47))

In [10]:
# Candidate features: every column whose dtype is neither datetime nor object.
features = (
    X_train
    .select_dtypes(exclude=['datetime', 'object'])
    .columns
    .tolist()
)

In [12]:
# Snapshot of the initial candidate feature names; the final summary compares
# its length with the number of surviving columns. `features` already holds
# exactly these names, so copy the list instead of materialising
# X_train[features] (a full copy of the frame) just to read its column labels.
features_init = list(features)

In [13]:
# Remove constant features: columns whose standard deviation is exactly zero.
# The names in `features` are iterated directly, so no intermediate copy of the
# frame is built only to list its columns.
# NOTE: a column whose std is NaN (e.g. every value missing) compares unequal
# to 0 and is therefore not flagged here.
constant_features = [feat for feat in features if X_train[feat].std() == 0]

# Drop the same columns from both splits. Reassigning the result (rather than
# inplace=True) avoids mutating the frames in place.
X_train = X_train.drop(columns=constant_features)
X_test = X_test.drop(columns=constant_features)

X_train.shape, X_test.shape

((8994550, 32), (243920, 32))

In [14]:
# Refresh the candidate list now that constant columns are gone: again every
# column whose dtype is neither datetime nor object.
features = (
    X_train
    .select_dtypes(exclude=['datetime', 'object'])
    .columns
    .tolist()
)

In [15]:
# Remove quasi-constant features.
# The variance threshold is 0.01: features whose training variance is not above
# it are rejected. For a 0/1 feature that roughly corresponds to ~99% of the
# observations sharing the same value.
quasi_constant_threshold = 0.01
sel = VarianceThreshold(threshold=quasi_constant_threshold)

# Fitting computes the per-feature variances on the training sample.
sel.fit(X_train[features])

# Number of features that are NOT quasi-constant (True entries of the mask).
sel.get_support().sum()

29

In [16]:
# Names of the features that passed the variance filter. The selector was
# fitted on X_train[features], so its boolean mask lines up with `features`;
# index the names directly instead of copying the whole frame only to reach
# its column labels.
features_to_keep = pd.Index(features)[sel.get_support()]

In [17]:
# Restrict both splits to the features that survived the variance filter.
# `features_to_keep` is drawn from `features`, so any column that was excluded
# from `features` (datetime/object dtypes) is dropped here as well.
kept_columns = list(features_to_keep)
X_train, X_test = X_train[kept_columns], X_test[kept_columns]

X_train.shape, X_test.shape

((8994550, 29), (243920, 29))

In [18]:
# Look for duplicated features in the training set: a column is a duplicate
# when Series.equals reports it identical to a column that appears earlier.
# Each duplicate is recorded exactly once, so len(duplicated_feat) is the true
# number of redundant columns even when three or more columns are identical.
duplicated_feat = []
flagged = set()  # same names as duplicated_feat, for O(1) membership tests
all_columns = X_train.columns

for i, col_1 in enumerate(all_columns):
    if i % 10 == 0:  # lightweight progress indicator
        print(i)

    # A column already known to copy an earlier one needs no scan of its own:
    # anything equal to it was recorded when that earlier column was processed.
    if col_1 in flagged:
        continue

    for col_2 in all_columns[i + 1:]:
        # Already-flagged columns are skipped to save a full-column comparison.
        if col_2 not in flagged and X_train[col_1].equals(X_train[col_2]):
            flagged.add(col_2)
            duplicated_feat.append(col_2)

len(duplicated_feat)

0
10
20


0

In [19]:
# Names of the columns flagged as duplicates of an earlier column
duplicated_feat

[]

In [20]:
# Drop the duplicated columns from both splits. The frames were produced by a
# column selection earlier, and an in-place drop on such a frame can raise
# SettingWithCopyWarning on pandas versions without copy-on-write; reassigning
# the result sidesteps that and avoids in-place mutation.
X_train = X_train.drop(columns=duplicated_feat)
X_test = X_test.drop(columns=duplicated_feat)

X_train.shape, X_test.shape

((8994550, 29), (243920, 29))

In [21]:
# Summary: how many of the initial candidate features were filtered out and
# how many columns remain in the training frame.
n_final = X_train.shape[1]
n_removed = len(features_init) - n_final
print(n_removed, " were removed. The number of final features is ", n_final)

17  were removed. The number of final features is  29


In [23]:
# Final list of selected feature names, in the column order of X_train.
features_final = X_train.columns.to_list()

In [24]:
# Persist the selected feature names as a NumPy array file under OUTPUT_PATH.
output_file = f'{OUTPUT_PATH}/{OUTPUT_FILE_NAME}.npy'
np.save(output_file, features_final)