# Feature Selection

- *Data Preparation*

In [None]:
# Load the raw dataset. A forward-slash relative path is used because the
# previous backslash form (r'src\rawDataset.csv') only resolves on Windows;
# forward slashes work on Windows, Linux and macOS alike.
df = pd.read_csv('src/rawDataset.csv')

# Discard the columns that are not used in the feature-selection experiments.
# Reassigning the result (rather than inplace=True) keeps the statement
# chain-friendly and avoids mutating a frame another name may still reference.
df = df.drop(columns=['RowNumber', 'Surname', 'CustomerId', 'Complain'])

In [None]:
# Age bucket edges. With right=False every bucket is closed on the left and
# open on the right: [0, 30), [30, 40), [40, 50), [50, 60), [60, inf).
# The last edge is unbounded so that an age of 100 or more still falls into
# the '>60' bucket; with the previous upper edge of 100 such rows silently
# became NaN.
bins = [0, 30, 40, 50, 60, float('inf')]
labels = ['<30', '30-40', '40-50', '50-60', '>60']

# Replace the numeric Age column with its categorical bucket label.
df['Age'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

In [None]:
# Codification: one-hot encode the categorical columns.
# drop='first' omits the first level of each feature and dtype=int produces
# integer 0/1 indicator columns; sparse_output=False returns a dense array.
encoder = OneHotEncoder(sparse_output=False, drop='first', dtype=int)

catcols = ['Gender', 'Geography', 'Card Type', 'Age']
encData = encoder.fit_transform(df[catcols])

# Reuse df's index for the encoded frame. pd.concat(axis=1) aligns rows by
# index label, so a fresh RangeIndex on encDF would misalign rows (and inject
# NaNs) whenever df does not carry the default 0..n-1 index.
encDF = pd.DataFrame(
    encData,
    columns=encoder.get_feature_names_out(catcols),
    index=df.index,
)

# Swap the original categorical columns for their encoded counterparts.
df = pd.concat([df.drop(columns=catcols), encDF], axis=1)

In [None]:
# Min-max scale the Balance and EstimatedSalary columns, writing the scaled
# values back into df.
# NOTE(review): the scaler is fitted on the full frame before any
# cross-validation split - confirm this is acceptable for the analysis.
scaler = MinMaxScaler()
columns_to_scale = ['Balance', 'EstimatedSalary']
scaled_values = scaler.fit_transform(df[columns_to_scale])
df[columns_to_scale] = scaled_values

# Preview the prepared frame.
df.head()

**Filter Method**

**Wrapper Method**

- *Forward Selection*

In [None]:
# Forward selection: starting from an empty set, repeatedly add the single
# feature that gives the highest mean 5-fold cross-validated accuracy when
# combined with the features already selected.

# Split the prepared frame into the feature matrix and the 'Exited' target.
X = df.drop(columns=['Exited'])
y = df['Exited']

selected_features = []   # column positions of X, in the order they were picked
feature_names = list(X.columns)
scores_list = []         # best mean CV accuracy recorded at each step

model = RandomForestClassifier(random_state=42)

# Early-stopping configuration: a step counts as "no improvement" when the
# accuracy gain over the previous step is below accuracy_threshold; the search
# stops after max_no_improvement_iterations such steps in a row.
accuracy_threshold = 0.01
max_no_improvement_iterations = 5
no_improvement_count = 0

# Never request more features than X actually has; otherwise the loop below
# would run out of candidates and could never reach its target size.
max_selected_features = min(13, X.shape[1])

# Score of the previous step. -inf makes the very first selected feature
# always count as an improvement.
previous_score = float('-inf')
best_score = -1

while len(selected_features) < max_selected_features:
    best_score = -1
    best_feature = None

    for feature_idx in range(X.shape[1]):
        if feature_idx in selected_features:
            continue

        candidate_features = selected_features + [feature_idx]
        candidate_feature_names = [feature_names[i] for i in candidate_features]

        # cross-validation
        scores = cross_val_score(model, X[candidate_feature_names], y, cv=5, scoring='accuracy', n_jobs=-1)
        mean_score = np.mean(scores)

        # best-performing feature
        if mean_score > best_score:
            best_score = mean_score
            best_feature = feature_idx

    # No candidate could be scored (e.g. every score was NaN): stop instead of
    # looping forever without making progress.
    if best_feature is None:
        break

    selected_features.append(best_feature)
    scores_list.append(best_score)

    print(f"Selected Feature {len(selected_features)}: {feature_names[best_feature]}, Mean Accuracy: {best_score:.4f}")

    # Compare against the PREVIOUS step's score. The earlier version re-scored
    # the feature set that had just been evaluated, so the difference was
    # always ~0 and the counter increased on every step, which forced a stop
    # after max_no_improvement_iterations features regardless of real gains.
    if best_score - previous_score < accuracy_threshold:
        no_improvement_count += 1
    else:
        no_improvement_count = 0
    previous_score = best_score

    # Stop if there's no improvement for a predefined number of iterations
    if no_improvement_count >= max_no_improvement_iterations:
        print("Stopping early due to lack of significant improvement.")
        break
        

In [None]:

# Render the selected features as a two-column table: the 1-based position in
# which each feature was picked, and the feature's column name.
selection_order = list(range(1, len(selected_features) + 1))
selected_names = [feature_names[i] for i in selected_features]

order_table = go.Table(
    header={
        'values': ["<b>Selection Order</b>", "<b>Feature Name</b>"],
        'fill_color': 'lightgrey',
        'align': 'center',
    },
    cells={
        'values': [selection_order, selected_names],
        'fill_color': 'white',
        'align': 'center',
    },
)

table = go.Figure(data=[order_table])
table.update_layout(title="Feature Selection Order")
table.show()

- *Backward Elimination*

In [None]:
# Backward elimination: start from every feature and, at each step, drop the
# one whose removal leaves the HIGHEST mean 5-fold CV accuracy, i.e. the
# feature the model misses the least. The earlier version picked the removal
# with the LOWEST resulting accuracy, which discards the most useful feature
# at every step.
all_features = list(range(X.shape[1]))   # column positions still in play
min_features_to_retain = 5

removed_features = []   # names of dropped features, in removal order
accuracy_scores = []    # mean CV accuracy after each removal

while len(all_features) > min_features_to_retain:
    # -inf (rather than a fixed sentinel such as 1.0) guarantees that the
    # first scored candidate is always accepted.
    best_score_after_removal = float('-inf')
    feature_to_remove = None

    for feature_idx in all_features:

        candidate_features = [f for f in all_features if f != feature_idx]
        candidate_feature_names = [feature_names[i] for i in candidate_features]

        # cross-validation
        scores = cross_val_score(model, X[candidate_feature_names], y, cv=5, scoring='accuracy', n_jobs=-1)
        mean_score = np.mean(scores)

        # least useful feature: removing it keeps accuracy the highest
        if mean_score > best_score_after_removal:
            best_score_after_removal = mean_score
            feature_to_remove = feature_idx

    # No candidate could be scored (e.g. every score was NaN): stop instead of
    # looping forever without making progress.
    if feature_to_remove is None:
        break

    all_features.remove(feature_to_remove)
    removed_features.append(feature_names[feature_to_remove])
    accuracy_scores.append(best_score_after_removal)
    print(f"Removed Feature : {feature_names[feature_to_remove]}, Mean Accuracy: {best_score_after_removal:.4f}")

# Print names (not indices) as an ordered list; the previous set display had
# no stable order and was labelled as "indices".
print("Remaining features:", [feature_names[i] for i in all_features])

In [None]:
# Line chart of the mean CV accuracy recorded after each removal step; the
# hover label shows which feature was removed at that step.
removal_steps = list(range(1, len(removed_features) + 1))
hover_text = '<b>Step %{x}</b><br>Removed Feature: %{text}<br>Mean Accuracy: %{y:.4f}<extra></extra>'

elimination_trace = go.Scatter(
    x=removal_steps,
    y=accuracy_scores,
    mode='lines+markers',
    text=removed_features,
    hovertemplate=hover_text,
    name='Backward Elimination',
)

fig = go.Figure()
fig.add_trace(elimination_trace)

layout_options = {
    'title': "Backward Elimination Process",
    'xaxis_title': "Number of Features Removed",
    'yaxis_title': "Mean Accuracy",
    'hovermode': "closest",
    'template': "plotly_white",
}
fig.update_layout(**layout_options)

fig.show()

- *Model-Based Selection (SelectFromModel)*

In [None]:
from sklearn.feature_selection import SelectFromModel

# Fit SelectFromModel around the current estimator, then keep the names of
# the columns of X that its support mask marks as selected.
sfm_selector = SelectFromModel(estimator=model)
sfm_selector.fit(X, y)

sfm_mask = sfm_selector.get_support()
features = X.columns[sfm_mask]

In [None]:
# Single-column table listing the feature names currently held in `features`.
feature_table = go.Table(
    header={
        'values': ["<b>Feature Name</b>"],
        'fill_color': 'lightgrey',
        'align': 'center',
    },
    cells={
        'values': [features],
        'fill_color': 'white',
        'align': 'center',
    },
)

table = go.Figure(data=[feature_table])
table.update_layout(title="Feature Selection Order")
table.show()

- *Sequential Feature Selection (SFS)*

In [None]:
# Step 1: drop features whose variance does not exceed the threshold.
threshold = 0.01
vt = VarianceThreshold(threshold=threshold)
X_reduced = vt.fit_transform(X)

# Names of the columns that survived the variance filter, in the same order as
# the columns of X_reduced.
retained_columns = X.columns[vt.get_support()]

# Step 2: backward sequential selection down to 3 features, scored with
# 5-fold cross-validation on the variance-filtered matrix.
sfs_selector = SequentialFeatureSelector(estimator=model, n_features_to_select=3, cv=5, direction='backward', n_jobs=-1)
sfs_selector.fit(X_reduced, y)

# The SFS support mask refers to the columns of X_reduced, not of X, so it
# must be applied to the retained column names. Indexing X.columns directly
# returns wrong names (or raises on a length mismatch) as soon as the variance
# filter removes any column.
features = retained_columns[sfs_selector.get_support()]

In [None]:
# Single-column table listing the feature names currently held in `features`.
header_style = {
    'values': ["<b>Feature Name</b>"],
    'fill_color': 'lightgrey',
    'align': 'center',
}
cell_style = {
    'values': [features],
    'fill_color': 'white',
    'align': 'center',
}

table = go.Figure(data=[go.Table(header=header_style, cells=cell_style)])
table.update_layout(title="Feature Selection Order")
table.show()

- *Recursive Feature Elimination (RFE)*

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np


# Recursive feature elimination down to a fixed number of features, followed
# by a 5-fold cross-validated accuracy check on the retained columns.
num_features_to_retain = 5
rfe = RFE(estimator=model, n_features_to_select=num_features_to_retain)
rfe.fit(X, y)

# Positions of the columns that RFE kept.
# NOTE: this rebinds `selected_features`, which the forward-selection cell
# also assigns.
selected_features = np.where(rfe.support_)[0]
rfe_feature_names = [feature_names[i] for i in selected_features]

print("Selected feature indices:", rfe_feature_names)

# cross-validation on the retained columns only
scores = cross_val_score(model, X[rfe_feature_names], y, cv=5, scoring='accuracy')
mean_accuracy = np.mean(scores)
print(f"Mean Accuracy with Selected Features: {mean_accuracy:.4f}")

In [None]:
# Two-column table: the names of the selected features next to the mean
# cross-validated accuracy obtained with them.
selected_names = [feature_names[i] for i in selected_features]
accuracy_text = [f"{mean_accuracy:.4f}"]

summary_table = go.Table(
    header={
        'values': ["<b>Selected Features</b>", "<b>Mean Accuracy</b>"],
        'fill_color': "lightblue",
        'align': "center",
        'font': {'size': 14, 'color': "black"},
    },
    cells={
        'values': [selected_names, accuracy_text],
        'fill_color': "white",
        'align': "center",
        'font': {'size': 12},
    },
)

fig = go.Figure(data=[summary_table])

# Centre the title and fix the figure height.
fig.update_layout(
    title="Selected Features and Model Accuracy",
    title_x=0.5,
    template="plotly_white",
    height=400,
)

fig.show()

- *Exhaustive Search*

In [None]:
from itertools import combinations
import xgboost as xgb

# Exhaustive search: score every combination of exactly `max_features` columns
# with 5-fold cross-validation and keep the best-scoring one.
# NOTE: this rebinds `model`, which earlier cells use for their estimator.
# NOTE(review): tree_method='gpu_hist' and gpu_id look deprecated in
# XGBoost >= 2.0 in favour of tree_method='hist', device='cuda' - confirm
# against the installed version before changing.
model = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=0)

max_features = 5

# Best subset seen so far (as column positions of X) and its mean accuracy.
best_subset = None
best_accuracy = 0.0

# Every combination of `max_features` column positions.
# NOTE(review): only subsets of exactly this size are tried, not smaller ones
# - confirm that is the intent.
n_columns = X.shape[1]
all_feature_combinations = list(combinations(range(n_columns), max_features))

for combo in all_feature_combinations:
    feature_subset = list(combo)

    # Cross-validated accuracy on just these columns.
    X_subset = X.iloc[:, feature_subset]
    scores = cross_val_score(model, X_subset, y, cv=5, scoring='accuracy', n_jobs=-1)
    mean_accuracy = np.mean(scores)

    # Only a strictly better score replaces the current best.
    if not mean_accuracy > best_accuracy:
        continue
    best_accuracy = mean_accuracy
    best_subset = feature_subset

print("Best Feature Subset:", best_subset)
print("Best Accuracy:", best_accuracy)

In [None]:

# Two-column table: the names of the best subset's features next to the mean
# cross-validated accuracy of that subset.
best_subset_names = [feature_names[i] for i in best_subset]
best_accuracy_text = [f"{best_accuracy:.4f}"]

best_subset_table = go.Table(
    header={
        'values': ["<b>Selected Features</b>", "<b>Mean Accuracy</b>"],
        'fill_color': "lightblue",
        'align': "center",
        'font': {'size': 14, 'color': "black"},
    },
    cells={
        'values': [best_subset_names, best_accuracy_text],
        'fill_color': "white",
        'align': "center",
        'font': {'size': 12},
    },
)

fig = go.Figure(data=[best_subset_table])

# Style and display the table
fig.update_layout(
    title="Selected Features and Model Accuracy",
    title_x=0.5,
    template="plotly_white",
    height=400,
)

fig.show()