# Topics
- Constant, Quasi Constant and Duplicate Feature Removal
- Correlated Feature Removal

# Libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import datasets 
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Data

In [2]:
data = pd.read_csv('santander.csv', nrows=20000)

data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


# Variables

In [3]:
X = data.drop('TARGET', axis=1)

y = data['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

print('Initial number of features:', X_train.shape[1])

Initial number of features: 370


# Feature selection

### Constant and quasi-constant features removal

In [4]:
constant_filter = VarianceThreshold(threshold=0.01) # Defining the threshold to 0.01, for remove all constant and quasi-constant features.

constant_filter.fit(X_train)

X_train_filtered = constant_filter.transform(X_train)
X_test_filtered = constant_filter.transform(X_test)

print('Number of features after remove constant and quasi-constant features:', X_train_filtered.shape[1])

Number of features after remove constant and quasi-constant features: 256


### Duplicate features removal

In [5]:
# Transposing the data because the method for identifying duplicates of Pandas, is only able to identify duplicate rows.
X_train_filtered = X_train_filtered.T
X_test_filtered = X_test_filtered.T

X_train_filtered = pd.DataFrame(X_train_filtered)
X_test_filtered = pd.DataFrame(X_test_filtered)

duplicated_features = X_train_filtered.duplicated()

features_to_keep = [not index for index in duplicated_features]

X_train_filtered = X_train_filtered[features_to_keep].T
X_test_filtered = X_test_filtered[features_to_keep].T

print('Number of features after remove duplicate features:', X_train_filtered.shape[1])

Number of features after remove duplicate features: 236


### Correlated Feature Removal

In [6]:
features_correlation = X_train_filtered.corr()

features_correlation.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,1.0,-0.020454,0.001019,0.007351,0.003117,-0.00021,-0.011013,-0.015649,0.000905,-0.000611,...,0.003796,-0.004925,-0.014241,0.004005,0.002906,0.01026,-0.009272,0.012272,0.013223,0.00656
1,-0.020454,1.0,-0.017122,0.001722,0.000997,0.003134,0.001692,0.001925,0.000665,0.000779,...,0.000449,0.000733,0.000544,0.000657,0.000679,0.001023,0.000619,0.000923,0.000985,-7.8e-05
2,0.001019,-0.017122,1.0,0.05243,0.102994,0.110686,0.045704,0.057188,0.004767,0.00665,...,0.022039,0.02484,0.016829,0.022746,0.02312,0.040912,0.016473,0.046729,0.047113,-0.000461
3,0.007351,0.001722,0.05243,1.0,0.065649,0.057302,0.029618,0.029002,0.002229,0.007876,...,-0.000819,-0.001324,-0.000971,-0.001137,-0.001147,0.001407,-0.001091,0.001334,0.001281,-0.001218
4,0.003117,0.000997,0.102994,0.065649,1.0,0.890087,0.376431,0.332854,0.021652,0.041567,...,-0.002236,0.023066,0.030169,0.023807,0.026037,0.001243,0.001622,-0.000974,-0.001282,0.002879


In [7]:
# Function to get correlation between features
def get_correlation(data, threshold):
    correlated_columns = set()
    correlation_matrix = data.corr()
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                column_name = correlation_matrix.columns[i]
                correlated_columns.add(column_name)
    return correlated_columns    

In [8]:
correlated_features = get_correlation(X_train_filtered, 0.85) # Defining the threshold to 0.85, for select the most correlated features.

X_train_filtered = X_train_filtered.drop(labels=correlated_features, axis=1)
X_test_filtered = X_test_filtered.drop(labels=correlated_features, axis=1)

print('Number of features after correlation filter:', X_train_filtered.shape[1])

Number of features after correlation filter: 114


# Model

In [9]:
# Function to create random forest models
def random_forest_model(X_train, X_test, y_train, y_test):
    random_forest_model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    random_forest_model.fit(X_train, y_train)
    prediction = random_forest_model.predict(X_test)
    print('Model accuracy:', accuracy_score(y_test, prediction))

### Comparing models performance

In [10]:
%%time

print('- Model with feature selection -')

random_forest_model(X_train_filtered, X_test_filtered, y_train, y_test)

- Model with feature selection -
Model accuracy: 0.95825
Wall time: 475 ms


In [11]:
%%time

print('- Model without feature selection -')

random_forest_model(X_train, X_test, y_train, y_test)

- Model without feature selection -
Model accuracy: 0.9575
Wall time: 819 ms
