# Libraries

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Data

In [2]:
# Source file and the number of leading rows to read from it.
DATA_PATH = 'santander.csv'
N_ROWS = 20000

data = pd.read_csv(DATA_PATH, nrows=N_ROWS)

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


# Variables

In [3]:
# Separate the label column from the feature matrix.
# NOTE(review): the preview of `data` shows an 'ID' column that remains in X;
# it looks like a row identifier rather than a predictor -- confirm whether it
# should be dropped together with 'TARGET'.
X = data.drop('TARGET', axis=1)

y = data['TARGET']

# Hold out 20% of the rows for testing. stratify=y keeps the TARGET class
# proportions the same in both splits, and the fixed random_state makes the
# split (and every number derived from it below) reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print('Initial number of features:', X_train.shape[1])

Initial number of features: 370


# Feature selection

### Constant and quasi-constant features removal

In [4]:
# Features whose training-set variance does not exceed 0.01 are treated as
# constant or quasi-constant and are removed.
constant_filter = VarianceThreshold(threshold=0.01)

# Learn the variances on the training split only, then apply the same column
# selection to both splits.
X_train_filtered = constant_filter.fit_transform(X_train)
X_test_filtered = constant_filter.transform(X_test)

print(
    'Number of features after remove constant and quasi-constant features:',
    X_train_filtered.shape[1],
)

Number of features after remove constant and quasi-constant features: 251


### Duplicate features removal

In [5]:
# Wrap both splits in DataFrames so columns can be selected by a boolean mask.
X_train_filtered = pd.DataFrame(X_train_filtered)
X_test_filtered = pd.DataFrame(X_test_filtered)

# pandas' duplicated() compares rows, so it is applied to the transposed
# training frame, where every row is one feature.
duplicated_features = X_train_filtered.T.duplicated()

# True for each feature that is not an exact copy of an earlier feature.
features_to_keep = (~duplicated_features).tolist()

# The mask is derived from the training split and applied to both splits.
X_train_filtered = X_train_filtered.loc[:, features_to_keep]
X_test_filtered = X_test_filtered.loc[:, features_to_keep]

print('Number of features after remove duplicate features:', X_train_filtered.shape[1])

Number of features after remove duplicate features: 231


### Correlated features removal

In [6]:
# Pairwise Pearson correlation between the remaining training features.
features_correlation = X_train_filtered.corr(method='pearson')

# Preview the top rows of the correlation matrix.
features_correlation.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,241,242,243,244,245,246,247,248,249,250
0,1.0,-0.027994,-0.001125,0.005969,0.002914,-0.000196,-0.015701,-0.019065,0.001598,-3.7e-05,...,0.005347,-0.000763,-0.011618,0.008501,0.007093,0.018368,0.009669,0.019819,0.020645,-0.006983
1,-0.027994,1.0,-0.007995,0.001172,0.001332,0.003423,0.001692,0.001954,0.000592,0.000717,...,0.000343,0.000588,0.000483,0.000544,0.000579,0.00068,0.000824,0.000768,0.000862,0.000461
2,-0.001125,-0.007995,1.0,0.036739,0.107163,0.110229,0.049204,0.060954,0.001818,0.003969,...,0.019418,0.020371,0.014175,0.020077,0.020749,0.032116,0.030926,0.038207,0.041393,-0.001735
3,0.005969,0.001172,0.036739,1.0,0.021197,0.017282,0.009868,0.006659,0.000979,0.003502,...,-0.00034,-0.000577,-0.000468,-0.000498,-0.000509,-0.000249,-0.000794,-0.00025,-0.000225,-0.000123
4,0.002914,0.001332,0.107163,0.021197,1.0,0.87697,0.38769,0.352906,0.017228,0.038105,...,-0.001693,0.024666,0.031742,0.023198,0.027058,-0.001705,0.001952,-0.002379,-0.002556,-0.000913


In [7]:
# Function to get correlation between features
def get_correlation(data, threshold):
    correlated_columns = set()
    correlation_matrix = data.corr()
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                column_name = correlation_matrix.columns[i]
                correlated_columns.add(column_name)
    return correlated_columns    

In [8]:
# Flag every feature whose absolute correlation with an earlier feature
# exceeds 0.85; the flags are computed on the training split only.
correlated_features = get_correlation(X_train_filtered, threshold=0.85)

# Remove the flagged columns from both splits.
X_train_filtered = X_train_filtered.drop(columns=list(correlated_features))
X_test_filtered = X_test_filtered.drop(columns=list(correlated_features))

print('Number of features after correlation filter:', X_train_filtered.shape[1])

Number of features after correlation filter: 109


# Model

In [9]:
# Function to create random forest models
def random_forest_model(X_train, X_test, y_train, y_test, random_state=42):
    """Fit a 100-tree random forest and print its accuracy on the test split.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting and for prediction respectively.
    y_train, y_test
        Labels matching ``X_train`` and ``X_test``.
    random_state : int or None, default 42
        Seed forwarded to ``RandomForestClassifier`` so that repeated calls
        (e.g. with and without feature selection) are comparable and
        reproducible. Pass ``None`` for an unseeded forest.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    # The estimator gets its own name so it does not shadow this function.
    classifier = RandomForestClassifier(
        n_estimators=100, n_jobs=-1, random_state=random_state
    )
    classifier.fit(X_train, y_train)
    prediction = classifier.predict(X_test)
    # NOTE(review): accuracy can hide poor minority-class performance if
    # TARGET is imbalanced -- confirm whether a metric such as ROC AUC is
    # more appropriate for this comparison.
    print('Model accuracy:', accuracy_score(y_test, prediction))

### Comparing model performance

In [10]:
%%time

# Train and score the forest on the reduced feature set; the %%time cell
# magic on the first line reports how long the whole cell takes to run.
print('- Model with feature selection -')

print('Number of features:', X_train_filtered.shape[1])

random_forest_model(X_train_filtered, X_test_filtered, y_train, y_test)

- Model with feature selection -
Number of features: 109
Model accuracy: 0.9575
Wall time: 494 ms


In [11]:
%%time

# Baseline: train and score the forest on the original, unfiltered features;
# the %%time cell magic on the first line reports how long the cell takes.
print('- Model without feature selection -')

print('Number of features:', X_train.shape[1])

random_forest_model(X_train, X_test, y_train, y_test)

- Model without feature selection -
Number of features: 370
Model accuracy: 0.9575
Wall time: 870 ms
