In [38]:
from sklearn.feature_selection import VarianceThreshold
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SequentialFeatureSelector

In [27]:
rawFeatureSel_df = pd.read_csv(r'featureIng_dataframe.csv')
featureSel_df = rawFeatureSel_df.drop(columns=['Exited', 'Complain'])
scaler = StandardScaler()
featureSel_df[['CreditScore', 'Balance', 'EstimatedSalary']] = scaler.fit_transform(featureSel_df[['CreditScore', 'Balance', 'EstimatedSalary']])
featureSel_df.head()
featureSel_df.head()

Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Satisfaction Score,Point Earned,Gender_Female,...,Geography_Spain,Card Type_DIAMOND,Card Type_GOLD,Card Type_PLATINUM,Card Type_SILVER,Age_30-40,Age_40-50,Age_50-60,Age_<30,Age_>60
0,-0.326221,2,-1.225848,0,1,1,0.021886,2,464,1,...,0,1,0,0,0,0,1,0,0,0
1,-0.440036,1,0.11735,0,0,1,0.216534,3,456,1,...,1,1,0,0,0,0,1,0,0,0
2,-1.536794,8,1.333053,1,1,0,0.240687,3,377,1,...,0,1,0,0,0,0,1,0,0,0
3,0.501521,1,-1.225848,1,0,0,-0.108918,5,350,1,...,0,0,1,0,0,1,0,0,0,0
4,2.063884,2,0.785728,0,1,1,-0.365276,5,425,1,...,1,0,1,0,0,0,1,0,0,0


### Variance Threshold Feature Selection

In [28]:
featureSel_df.var()

CreditScore               1.000100
Tenure                    8.364673
Balance                   1.000100
NumOfProducts             0.249954
HasCrCard                 0.207791
IsActiveMember            0.249797
EstimatedSalary           1.000100
Satisfaction Score        1.976607
Point Earned          51042.032975
Gender_Female             0.247936
Gender_Male               0.247936
Geography_France          0.250023
Geography_Germany         0.187968
Geography_Spain           0.186363
Card Type_DIAMOND         0.187868
Card Type_GOLD            0.187619
Card Type_PLATINUM        0.187268
Card Type_SILVER          0.187319
Age_30-40                 0.245747
Age_40-50                 0.193280
Age_50-60                 0.079356
Age_<30                   0.137185
Age_>60                   0.049838
dtype: float64

Here we have the variability of each feature, with higher values indicating more variability. Features with very low variance, such as Age_>60 (0.0498), contribute little to differenciate between data points and can usually be removed.

In [29]:
selector = VarianceThreshold(1)
selector.fit(featureSel_df)
featureSel_df.columns[selector.get_support()]

Index(['Tenure', 'Satisfaction Score', 'Point Earned'], dtype='object')

Setting a threshold of 1 selects the three features with variability above that level: Tenure, Satisfaction Score, and Point Earned, indicating these have meaningful variance in the dataset. Lower threshold selections would include more features with smaller variances.

### Univariate Feature Selection with SelectKBest


In [30]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
selector = SelectKBest(mutual_info_regression, k=2)

selector.fit(featureSel_df, rawFeatureSel_df['Exited'])
featureSel_df.columns[selector.get_support()]

Index(['Age_50-60', 'Age_<30'], dtype='object')

Age_50-60 and Age_<30 have the highest relevance for predicting the target. These features were chosen over others as they provide the most useful information for the model.

### Recursive Feature Elimination (RFE)


In [34]:
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
X = featureSel_df
y = rawFeatureSel_df['Exited']

rfe.fit(X, y)

X.columns[rfe.get_support()]

Index(['CreditScore', 'Tenure', 'Balance', 'EstimatedSalary', 'Point Earned'], dtype='object')

We choose a Random Forest for Recursive Feature Elimination (RFE) because it automatically shows which features are most important. The output indicates that CreditScore, Tenure, Balance, EstimatedSalary, and Point Earned are the top five features selected by the RFE method, meaning they have the highest relevance for predicting churn in the dataset. These features are considered the most influential in the model's performance.

### Feature Selection via SelectFromModel

In [37]:
sfm = SelectFromModel(estimator=RandomForestClassifier())
sfm.fit(X, y)

X.columns[sfm.get_support()]

Index(['CreditScore', 'Tenure', 'Balance', 'EstimatedSalary',
       'Satisfaction Score', 'Point Earned', 'Age_50-60'],
      dtype='object')

In the selection via SelectFromModel we also used a Random Forest as estimator but we se how the outcome differs compared to the other methods. In this model we se how 'CreditScore', 'Tenure', 'Balance', 'EstimatedSalary', 'Satisfaction Score', 'Point Earned' and 'Age_50-60' might be the most influential in the model's performance.

### Feature Selection Sequential Feature Selection (SFS)

In [40]:
vt = VarianceThreshold(threshold=0.1)
X_reduced = vt.fit_transform(X)

sfs_selector = SequentialFeatureSelector(estimator=RandomForestClassifier(), n_features_to_select=3, cv=5, direction='backward', n_jobs=-1)
sfs_selector.fit(X_reduced, y)

selected_columns = X.columns[vt.get_support()][sfs_selector.get_support()]
print(selected_columns)


Index(['Age_30-40', 'Age_40-50', 'Age_<30'], dtype='object')


These age-related features were identified as the most relevant for predicting churn in the dataset by SFS, which is a method for selecting features by iteratively adding or removing them based on their importance, using again as a estimator, a RandomForestClassifier