# Feature Selection
- Univariate Feature Selection
1. **binary_log:**

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif

In [4]:
fulldf_binary_log = pd.read_csv("../../../../csv_files/train/full_train/full_train_combinations/fulldf_binary_log.csv", index_col="Unnamed: 0")

In [5]:
testdf_binary_log = pd.read_csv("../../../../csv_files/test/full_test/full_test_combinations/testdf_binary_log.csv", index_col="Unnamed: 0")

In [6]:
feature_train = fulldf_binary_log.columns.drop(['target', 'id'])

# Keep 10 features
selector = SelectKBest(f_classif, k=10)

X_train = selector.fit_transform(fulldf_binary_log[feature_train], fulldf_binary_log['target'])

X_train

array([[0.        , 0.        , 1.        , ..., 0.        , 0.11729722,
        0.63186848],
       [0.        , 0.        , 1.        , ..., 1.        , 0.30559909,
        0.36465054],
       [0.        , 0.        , 1.        , ..., 1.        , 0.37476115,
        0.64927612],
       ...,
       [0.        , 0.        , 1.        , ..., 0.        , 0.38048273,
        0.37120603],
       [0.        , 0.        , 1.        , ..., 1.        , 0.50733541,
        0.30978344],
       [0.        , 0.        , 1.        , ..., 0.        , 0.56176885,
        0.55084162]])

In [7]:
feature_test = testdf_binary_log.columns.drop("id")
X_test = selector.transform(testdf_binary_log[feature_test])
X_test

array([[0.        , 0.        , 1.        , ..., 0.        , 0.39627874,
        0.46719183],
       [0.        , 0.        , 1.        , ..., 1.        , 0.25993962,
        0.39173965],
       [0.        , 0.        , 1.        , ..., 1.        , 0.56580529,
        0.5166454 ],
       ...,
       [0.        , 0.        , 1.        , ..., 1.        , 0.36900393,
        0.35971941],
       [0.        , 0.        , 1.        , ..., 1.        , 0.1764745 ,
        0.60852151],
       [0.        , 0.        , 1.        , ..., 1.        , 0.13215824,
        0.5834791 ]])

In [8]:
# Get back the features we've kept, zero out all other features
selected_features_train = pd.DataFrame(selector.inverse_transform(X_train), 
                                 index=fulldf_binary_log.index, 
                                 columns=feature_train)
selected_features_train

Unnamed: 0,cat0_0,cat0_1,cat1_0,cat1_1,cat2_0,cat2_1,cat3_0,cat3_1,cat3_2,cat4_0,...,cont3_log,cont4_log,cont5_log,cont6_log,cont7_log,cont8_log,cont9_log,cont10_log,cont11_log,cont12_log
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.631868,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.364651,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.649276,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.659662,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.323966,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.238685,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.179995,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.371206,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.309783,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Get back the features we've kept, zero out all other features
selected_features_test = pd.DataFrame(selector.inverse_transform(X_test), 
                                 index=testdf_binary_log.index, 
                                 columns=feature_test)
selected_features_test

Unnamed: 0,cat0_0,cat0_1,cat1_0,cat1_1,cat2_0,cat2_1,cat3_0,cat3_1,cat3_2,cat4_0,...,cont3_log,cont4_log,cont5_log,cont6_log,cont7_log,cont8_log,cont9_log,cont10_log,cont11_log,cont12_log
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.467192,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.391740,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.516645,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.565243,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.190812,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.674819,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.208698,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.359719,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.608522,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns_train = selected_features_train.columns[selected_features_train.var() != 0]

# Get the valid dataset with the selected features.
train_binary_log = fulldf_binary_log[selected_columns_train]
train_binary_log

Unnamed: 0,cat3_0,cat4_1,cat4_2,cat5_0,cat6_0,cat7_0,cat9_2,cat9_3,cont2_log,cont5_log
0,0,0,1,0,0,0,0,0,0.117297,0.631868
1,0,0,1,0,0,0,0,1,0.305599,0.364651
2,0,0,1,0,0,0,0,1,0.374761,0.649276
3,0,0,1,0,0,0,1,0,0.143005,0.659662
4,0,0,1,0,0,0,0,1,0.402937,0.323966
...,...,...,...,...,...,...,...,...,...,...
299878,0,0,1,0,0,0,1,1,0.149851,0.238685
299879,0,0,1,0,0,0,1,1,0.465314,0.179995
299880,0,0,1,0,0,0,1,0,0.380483,0.371206
299881,0,0,1,0,0,0,0,1,0.507335,0.309783


In [11]:
# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns_test = selected_features_test.columns[selected_features_test.var() != 0]

# Get the valid dataset with the selected features.
test_binary_log = testdf_binary_log[selected_columns_test]
test_binary_log

Unnamed: 0,cat3_0,cat4_1,cat4_2,cat5_0,cat7_0,cat9_2,cat9_3,cont2_log,cont5_log
0,0,0,1,0,0,0,0,0.396279,0.467192
1,0,0,1,0,0,0,1,0.259940,0.391740
2,0,0,1,0,0,0,1,0.565805,0.516645
3,0,0,1,0,0,0,1,0.539871,0.565243
4,0,0,1,0,0,1,0,0.274042,0.190812
...,...,...,...,...,...,...,...,...,...
199995,0,0,1,0,0,0,1,0.181974,0.674819
199996,0,0,1,0,0,0,1,0.384599,0.208698
199997,0,0,1,0,0,1,1,0.369004,0.359719
199998,0,0,1,0,0,0,1,0.176475,0.608522


In [12]:
target = fulldf_binary_log[["id", "target"]]

In [13]:
i_d = pd.DataFrame(testdf_binary_log["id"])

In [14]:
train_binary_log = pd.merge(train_binary_log, target, left_index=True, right_index=True)
train_binary_log

Unnamed: 0,cat3_0,cat4_1,cat4_2,cat5_0,cat6_0,cat7_0,cat9_2,cat9_3,cont2_log,cont5_log,id,target
0,0,0,1,0,0,0,0,0,0.117297,0.631868,1,6.994023
1,0,0,1,0,0,0,0,1,0.305599,0.364651,2,8.071256
2,0,0,1,0,0,0,0,1,0.374761,0.649276,3,5.760456
3,0,0,1,0,0,0,1,0,0.143005,0.659662,4,7.806457
4,0,0,1,0,0,0,0,1,0.402937,0.323966,6,6.868974
...,...,...,...,...,...,...,...,...,...,...,...,...
299878,0,0,1,0,0,0,1,1,0.149851,0.238685,499993,8.343538
299879,0,0,1,0,0,0,1,1,0.465314,0.179995,499996,7.851861
299880,0,0,1,0,0,0,1,0,0.380483,0.371206,499997,7.600558
299881,0,0,1,0,0,0,0,1,0.507335,0.309783,499998,8.272095


In [15]:
test_binary_log = pd.merge(test_binary_log, i_d, left_index=True, right_index=True)

In [None]:
test_binary_log

In [None]:
train_binary_log.to_csv("train_binary_log.csv")

In [None]:
test_binary_log.to_csv("test_binary_log.csv")