# Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Feature selection - AUC

### Data

In [2]:
data = pd.read_csv('santander.csv', nrows=20000)

data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


### Variables

In [3]:
X = data.drop('TARGET', axis=1)

y = data['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

print('Initial number of features:', X.shape[1])

Initial number of features: 370


### Constant and quasi-constant features removal

In [4]:
constant_filter = VarianceThreshold(threshold=0.01)

constant_filter.fit(X_train)

X_train_filtered = constant_filter.transform(X_train)
X_test_filtered = constant_filter.transform(X_test)

print('Number of features after remove constant and quasi constant features:', X_train_filtered.shape[1])

Number of features after remove constant and quasi constant features: 247


### Duplicate features removal

In [5]:
X_train_filtered = X_train_filtered.T
X_test_filtered = X_test_filtered.T

X_train_filtered = pd.DataFrame(X_train_filtered)
X_test_filtered = pd.DataFrame(X_test_filtered)

duplicated_features = X_train_filtered.duplicated()

features_to_keep = [not feature for feature in duplicated_features]

X_train_filtered = X_train_filtered[features_to_keep].T
X_test_filtered = X_test_filtered[features_to_keep].T

print('Number of features after remove duplicate features:', X_test_filtered.shape[1])

Number of features after remove duplicate features: 230


### AUC filter

In [6]:
# Getting Area Under the ROC Curve for each feature
roc_auc = []
for feature in X_train_filtered.columns:
    random_forest_model = RandomForestClassifier(n_estimators=100)
    random_forest_model.fit(X_train_filtered[feature].to_frame(), y_train)
    prediction = random_forest_model.predict(X_test_filtered[feature].to_frame())
    roc_auc.append(roc_auc_score(y_test, prediction))

In [7]:
auc_values = pd.Series(roc_auc)

auc_values.index = X_train_filtered.columns

auc_values.sort_values(ascending=False, inplace=True)

auc_values = auc_values[auc_values>0.5] # Using a common threshold for AUC to select the best features

X_train_filtered = X_train_filtered[auc_values.index]
X_test_filtered = X_test_filtered[auc_values.index]

print('Number of features after AUC filter:', X_train_filtered.shape[1])

Number of features after AUC filter: 10


### Model

In [8]:
def random_forest_model(X_train, X_test, y_train, y_test):
    random_forest_model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    random_forest_model.fit(X_train, y_train)
    prediction = random_forest_model.predict(X_test)
    print('Model accuracy:', accuracy_score(y_test, prediction))

### Comparing model performance

In [9]:
%%time

print('- Model with feature selection -')

print('Number of features:', X_train_filtered.shape[1])

random_forest_model(X_train_filtered, X_test_filtered, y_train, y_test)

- Model with feature selection -
Number of features: 10
Model accuracy: 0.9585
Wall time: 378 ms


In [10]:
%%time

print('- Model without feature selection -')

print('Number of features:', X_train.shape[1])

random_forest_model(X_train, X_test, y_train, y_test)

- Model without feature selection -
Number of features: 370
Model accuracy: 0.9575
Wall time: 889 ms


# Feature selection - MSE

### Data

In [11]:
boston_data = load_boston()

### Variables

In [12]:
X = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)

y = boston_data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Initial number of features:', X.shape[1])

Initial number of features: 13


### MSE filter

In [13]:
# Getting MSE for each feature
mse = []
for feature in X_train.columns:
    linear_regression_model = LinearRegression()
    linear_regression_model.fit(X_train[feature].to_frame(), y_train)
    y_pred = linear_regression_model.predict(X_test[feature].to_frame())
    mse.append(mean_squared_error(y_test, y_pred))

In [14]:
mse_values = pd.Series(mse, index=X_train.columns)

mse_values.sort_values(ascending=False, inplace=True)

mse_values

CHAS       98.465044
DIS        95.526284
B          92.248057
AGE        85.570781
NOX        83.531225
CRIM       83.472974
RAD        83.467868
ZN         81.563432
TAX        75.025908
INDUS      71.376093
PTRATIO    69.905120
LSTAT      47.534740
RM         42.725216
dtype: float64

In [15]:
# Selecting the two features with the lowest MSE
X_train_filtered = X_train[['RM', 'LSTAT']]
X_test_filtered = X_test[['RM', 'LSTAT']]

### Model

In [16]:
def linear_regression_model(X_train, X_test, y_train, y_test):
    linear_regression_model = LinearRegression()
    linear_regression_model.fit(X_train, y_train)
    prediction = linear_regression_model.predict(X_test)
    print('R^2:', r2_score(y_test, prediction))
    print('RMSE:', np.sqrt(mean_squared_error(y_test, prediction)))
    print('Std of price:', np.std(y))

### Comparing model performance

In [17]:
%%time

print('- Model with feature selection -')

print('Number of features:', X_train_filtered.shape[1])

linear_regression_model(X_train_filtered, X_test_filtered, y_train, y_test)

- Model with feature selection -
Number of features: 2
R^2: 0.6645590829387986
RMSE: 5.80434588834514
Std of price: 9.188011545278203
Wall time: 6 ms


In [18]:
%%time

print('- Model without feature selection -')

print('Number of features:', X_train.shape[1])

linear_regression_model(X_train, X_test, y_train, y_test)

- Model without feature selection -
Number of features: 13
R^2: 0.7732908322153271
RMSE: 4.771773353893077
Std of price: 9.188011545278203
Wall time: 6 ms
