## Feature Selection

1. Identifies the most relevant features for predicting the target variable.
2. Uses statistical methods, correlation analysis, or algorithms (like Recursive Feature Elimination, Lasso regression, tree-based feature importance) to remove irrelevant/redundant features.

On the prepared data from feature engineering i will identify most relevant features

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
dataset = pd.read_csv(r"C:\Users\KOLADE\OneDrive\Documents\Practices\Titanic\data\Train.csv")
train = dataset.copy()
train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Family_size,Cabin_status,Alone,Embarked_nan,Age_nan,...,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Age_scaled,Fare_log_scaled,Survived
0,1,1,54.0,0,0,1,1,1,0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.888773,1.053619,0
1,3,1,26.0,0,0,1,0,1,0,1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.232122,-0.159147,0
2,2,1,25.0,1,2,3,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.307868,0.828292,0
3,3,1,26.0,1,0,1,0,1,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.232122,-0.22735,0
4,3,0,22.0,0,0,1,0,1,0,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,-0.535107,-0.533665,0


In [3]:
X_train = train.drop("Survived", axis=1)
y_train = train["Survived"]

In [4]:
print(f"train: {train.shape}, X_train: {X_train.shape}, y_train: {y_train.shape}")

train: (596, 21), X_train: (596, 20), y_train: (596,)


In [5]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Family_size,Cabin_status,Alone,Embarked_nan,Age_nan,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Age_scaled,Fare_log_scaled
0,1,1,54.0,0,0,1,1,1,0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.888773,1.053619
1,3,1,26.0,0,0,1,0,1,0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.232122,-0.159147
2,2,1,25.0,1,2,3,0,0,0,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.307868,0.828292
3,3,1,26.0,1,0,1,0,1,0,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.232122,-0.22735
4,3,0,22.0,0,0,1,0,1,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,-0.535107,-0.533665


**Dropping Constant Features**

In [6]:
selector = VarianceThreshold(threshold=0)
selector.fit(X_train)

In [7]:
print(selector.get_support().sum())

20


**Information Gain mutual_info_classif**

In [8]:
mutual_info_classif(X_train, y_train)

array([0.0292976 , 0.1319101 , 0.04580449, 0.01290675, 0.02158532,
       0.03733476, 0.03951824, 0.02100797, 0.        , 0.03403781,
       0.02915288, 0.        , 0.        , 0.        , 0.03431083,
       0.11503516, 0.02207744, 0.        , 0.04655562, 0.10729366])

In [9]:
select = SelectKBest(mutual_info_classif)
select.fit(X_train, y_train)

In [10]:
select.get_support()

array([ True,  True, False, False, False,  True,  True, False, False,
       False, False,  True, False, False,  True,  True,  True, False,
        True,  True])

In [11]:
mask = select.get_support()
selected_features = X_train.columns[mask]

In [12]:
print(select.get_support().sum())
print(selected_features.tolist())

10
['Pclass', 'Sex', 'Family_size', 'Cabin_status', 'Embarked_Q', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Age_scaled', 'Fare_log_scaled']


In [13]:
print(X_train.columns.tolist())

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Family_size', 'Cabin_status', 'Alone', 'Embarked_nan', 'Age_nan', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Other', 'Age_scaled', 'Fare_log_scaled']


In [14]:
selector.get_params()

{'threshold': 0}

**Chi-square**

In [15]:
chi2_stats, p_values = chi2(X_train.drop(columns=['Age_scaled', 'Fare_log_scaled']), y_train)

In [16]:
chi2_stats

array([1.67587941e+01, 5.72929115e+01, 1.17795331e+01, 4.55335381e+00,
       4.87230976e+00, 2.91433098e-01, 4.39149476e+01, 9.60417795e+00,
       1.68468468e+00, 7.05866185e+00, 1.77797163e+01, 5.38940717e-03,
       4.36835678e+00, 2.46347060e+00, 4.18775590e+01, 6.67616231e+01,
       6.32060213e+01, 7.00001984e-01])

In [17]:
p_values

array([4.24450830e-05, 3.75506162e-14, 5.98855015e-04, 3.28543326e-02,
       2.72909492e-02, 5.89303628e-01, 3.42959656e-11, 1.94135164e-03,
       1.94303129e-01, 7.88828980e-03, 2.48013235e-05, 9.41477806e-01,
       3.66122818e-02, 1.16521070e-01, 9.71714893e-11, 3.06405458e-16,
       1.86178387e-15, 4.02783028e-01])

In [18]:
X_train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Family_size', 'Cabin_status',
       'Alone', 'Embarked_nan', 'Age_nan', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs',
       'Title_Other', 'Age_scaled', 'Fare_log_scaled'],
      dtype='object')

In [19]:
pd.DataFrame({
    "Feature": X_train.drop(columns=['Age_scaled', 'Fare_log_scaled']).columns,
    "Chi2 Stat": chi2_stats,
    "p-value": p_values
}).sort_values("p-value", ascending=True)

Unnamed: 0,Feature,Chi2 Stat,p-value
15,Title_Mr,66.761623,3.064055e-16
16,Title_Mrs,63.206021,1.861784e-15
1,Sex,57.292912,3.755062e-14
6,Cabin_status,43.914948,3.429597e-11
14,Title_Miss,41.877559,9.717149e-11
10,Embarked_C,17.779716,2.480132e-05
0,Pclass,16.758794,4.244508e-05
2,Age,11.779533,0.000598855
7,Alone,9.604178,0.001941352
9,Age_nan,7.058662,0.00788829


In [20]:
X_train.drop(columns=['Age_scaled', 'Fare_log_scaled']).columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Family_size', 'Cabin_status',
       'Alone', 'Embarked_nan', 'Age_nan', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs',
       'Title_Other'],
      dtype='object')