# Topics
- Constant, Quasi Constant and Duplicate Feature Removal
- Correlated Feature Removal

# Libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import datasets 
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Data

In [2]:
data = pd.read_csv('santander.csv', nrows=20000)

data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


# Variables

In [3]:
X = data.drop('TARGET', axis=1)

y = data['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

print('Initial number of features:', X_train.shape[1])

Initial number of features: 370


# Feature selection

### Constant and quasi-constant features removal

In [4]:
constant_filter = VarianceThreshold(threshold=0.01) # We define the threshold to 0.01 to remove all constant and quasi-constant features

constant_filter.fit(X_train)

X_train_filtered = constant_filter.transform(X_train)
X_test_filtered = constant_filter.transform(X_test)

print('Number of features after remove constant and quasi-constant features:', X_train_filtered.shape[1])

Number of features after remove constant and quasi-constant features: 253


### Duplicate features removal

In [5]:
X_train_filtered = X_train_filtered.T
X_test_filtered = X_test_filtered.T

X_train_filtered = pd.DataFrame(X_train_filtered)
X_test_filtered = pd.DataFrame(X_test_filtered)

duplicated_features = X_train_filtered.duplicated()

features_to_keep = [not index for index in duplicated_features]

X_train_filtered = X_train_filtered[features_to_keep].T
X_test_filtered = X_test_filtered[features_to_keep].T


print('Number of features after remove duplicate features:', X_train_filtered.shape[1])

Number of features after remove duplicate features: 234


### Correlated Feature Removal

In [6]:
features_correlation = X_train_filtered.corr()

features_correlation.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,243,244,245,246,247,248,249,250,251,252
0,1.0,-0.02331,-0.000701,0.002828,0.003067,-0.000179,-0.013214,-0.014766,-0.003753,-0.004696,...,-0.001266,-0.004958,-0.014259,0.003996,0.002911,0.01774,-0.005337,0.019509,0.020112,0.000395
1,-0.02331,1.0,-0.011746,0.001246,0.001816,0.003963,0.001833,0.002136,0.000798,0.000865,...,0.00058,0.000781,0.000579,0.000691,0.000709,0.000714,0.00068,0.000835,0.000943,0.000475
2,-0.000701,-0.011746,1.0,0.036863,0.111251,0.114426,0.053788,0.066757,0.014282,0.013889,...,0.02533,0.024683,0.01672,0.022496,0.02282,0.032614,0.021392,0.039339,0.042717,-0.003368
3,0.002828,0.001246,0.036863,1.0,0.027064,0.0226,0.013511,0.013187,0.003262,0.007257,...,0.012446,-0.000725,-0.000532,-0.000618,-0.000621,-0.000223,-0.000609,-0.000249,-0.000231,0.000884
4,0.003067,0.001816,0.111251,0.027064,1.0,0.87262,0.400361,0.373792,0.037355,0.06727,...,0.002488,0.024184,0.03155,0.023339,0.024863,-0.001728,-0.00055,-0.002575,-0.002846,-0.003201


In [7]:
# Function to select correlated features
def get_correlation(data, threshold):
    correlated_columns = set()
    correlation_matrix = data.corr()
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                column_name = correlation_matrix.columns[i]
                correlated_columns.add(column_name)
    return correlated_columns    

In [8]:
correlated_features = get_correlation(X_train_filtered, 0.85)

X_train_filtered = X_train_filtered.drop(labels=correlated_features, axis=1)
X_test_filtered = X_test_filtered.drop(labels=correlated_features, axis=1)

print('Number of features after correlation filter:', X_train_filtered.shape[1])

Number of features after correlation filter: 113


# Model

In [9]:
# Function to create random forest models
def random_forest_model(X_train, X_test, y_train, y_test):
    random_forest_model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    random_forest_model.fit(X_train, y_train)
    prediction = random_forest_model.predict(X_test)
    print('Model accuracy:', accuracy_score(y_test, prediction))

### Comparing models performance

In [10]:
%%time

print('- Model with feature selection -')

random_forest_model(X_train_filtered, X_test_filtered, y_train, y_test)

- Model with feature selection -
Model accuracy: 0.9585
Wall time: 468 ms


In [11]:
%%time

print('- Model without feature selection -')

random_forest_model(X_train, X_test, y_train, y_test)

- Model without feature selection -
Model accuracy: 0.95875
Wall time: 821 ms
