# Libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold, f_classif
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Data

### Importing data

In [2]:
data = pd.read_csv('santander.csv', nrows=20000)

data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


# Defining variables

In [3]:
X = data.drop('TARGET', axis=1)

y = data['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

print('Initial number of features:', X_train.shape[1])

Initial number of features: 370


# Feature selection

### Constant and quasi-constant features removal

In [4]:
constant_filter = VarianceThreshold(threshold=0.01)

constant_filter.fit(X_train)

X_train_filtered = constant_filter.transform(X_train)
X_test_filtered = constant_filter.transform(X_test)

print('Number of features after remove constant and quasi-constant features:', X_train_filtered.shape[1])

Number of features after remove constant and quasi-constant features: 255


### Duplicate features removal

In [5]:
X_train_filtered = X_train_filtered.T
X_test_filtered = X_test_filtered.T

X_train_filtered = pd.DataFrame(X_train_filtered)
X_test_filtered = pd.DataFrame(X_test_filtered)

duplicated_features = X_train_filtered.duplicated()

features_to_keep = [not feature for feature in duplicated_features]

X_train_filtered = X_train_filtered[features_to_keep].T
X_test_filtered = X_test_filtered[features_to_keep].T

print('Number of features after remove duplicate features:', X_train_filtered.shape[1])

Number of features after remove duplicate features: 235


### ANOVA filter

In [6]:
anova_test = f_classif(X_train_filtered, y_train)

p_values = pd.Series(anova_test[1], index=X_train_filtered.columns) # Selecting just the p-values from ANOVA test

p_values.sort_values(ascending=True, inplace=True)

p_values = p_values[p_values<0.05] # Using a common threshold for p-values (5%) to select the best features

X_train_filtered = X_train_filtered[p_values.index]
X_test_filtered = X_test_filtered[p_values.index]

print('Number of features after ANOVA p-value filter:', X_train_filtered.shape[1])

Number of features after ANOVA p-value filter: 91


# Model

### Creating model

In [7]:
def random_forest_model(X_train, X_test, y_train, y_test):
    random_forest_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
    random_forest_model.fit(X_train, y_train)
    prediction = random_forest_model.predict(X_test)
    print('Model accuracy:', accuracy_score(y_test, prediction))

### Comparing models performance

In [8]:
%%time

print('- Model with feature selection -')

print('Number of features:', X_train_filtered.shape[1])

random_forest_model(X_train_filtered, X_test_filtered, y_train, y_test)

- Model with feature selection -
Number of features: 91
Model accuracy: 0.95375
Wall time: 494 ms


In [9]:
%%time

print('- Model without feature selection -')

print('Number of features:', X_train.shape[1])

random_forest_model(X_train, X_test, y_train, y_test)

- Model without feature selection -
Number of features: 370
Model accuracy: 0.957
Wall time: 890 ms
