<a href="https://colab.research.google.com/github/MostafaSamirKamel/NLP_project/blob/MostafaSamir%2FNLP_search%2FMachine_Translation/feature%20selection_wrapper%20forward.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_breast_cancer

# Breast cancer dataset from scikit-learn, used as the example data throughout
data = load_breast_cancer()
# Unpack the feature matrix and the label vector in one step
features, target = data.data, data.target

# Forward selection implementation from scratch
def forward_selection(features, target, model, max_features=None, cv=5, scoring='accuracy', verbose=True):
    """Greedy forward feature selection driven by a cross-validated score.

    Starting from an empty set, each round evaluates every remaining column
    together with the columns chosen so far and keeps the single column that
    yields the highest mean cross-validation score. The search stops when
    ``max_features`` columns are selected, when no columns remain, or when no
    candidate strictly improves on the score of the previous round.

    Parameters
    ----------
    features : array-like, indexable as a 2-D array (rows x columns)
        Feature matrix; converted with ``np.asarray`` so it can be sliced by
        column index.
    target : array-like
        Target values, passed unchanged to ``cross_val_score``.
    model : estimator
        Estimator handed to ``cross_val_score`` for every candidate subset.
    max_features : int or None, default None
        Upper bound on the number of selected columns; ``None`` means all.
    cv : default 5
        Forwarded to ``cross_val_score`` as ``cv``.
    scoring : default 'accuracy'
        Forwarded to ``cross_val_score`` as ``scoring``. Higher values are
        treated as better.
    verbose : bool, default True
        Print one progress line per selected feature.

    Returns
    -------
    tuple
        ``(selected_features, best_score)``: the chosen column indices in
        selection order and the mean score of the final subset.
        ``best_score`` is ``0`` when nothing was selected.
    """
    features = np.asarray(features)  # allows column slicing for DataFrame-like input
    n_features = features.shape[1]
    selected_features = []  # Column indices chosen so far, in selection order
    remaining_features = list(range(n_features))  # Candidates not yet chosen

    if max_features is None:
        max_features = n_features  # No cap requested: allow every column

    best_score = 0  # Reported as-is when nothing ends up being selected
    while len(selected_features) < max_features and remaining_features:
        best_feature = None
        # The first pick has no baseline to beat. Comparing it against 0 would
        # reject every candidate for scorers whose values are <= 0 (for example
        # the negated-error scorers). Later picks must strictly improve.
        best_feature_score = best_score if selected_features else -np.inf

        # Try adding each remaining feature one by one
        for feature in remaining_features:
            # Columns selected so far plus the candidate under evaluation
            feature_subset = features[:, selected_features + [feature]]

            # Mean cross-validation score for this candidate subset
            score = np.mean(cross_val_score(model, feature_subset, target, cv=cv, scoring=scoring))

            # Strict comparison: on ties the earliest candidate is kept
            if score > best_feature_score:
                best_feature_score = score
                best_feature = feature

        if best_feature is None:
            break  # No candidate improved the score: stop the search

        selected_features.append(best_feature)
        remaining_features.remove(best_feature)
        best_score = best_feature_score
        if verbose:
            print(f"Selected feature {best_feature} with cross-validation score: {best_score:.4f}")

    return selected_features, best_score

# Base estimator that is cross-validated for every candidate feature subset
model = RandomForestClassifier(random_state=42)

# Run the greedy forward search, capped at five features
selected_features, best_score = forward_selection(
    features,
    target,
    model,
    max_features=5,
)

# Report the chosen column indices and the score of the final subset
print(f"Selected Features: {selected_features}")
print(f"Best Cross-Validation Score: {best_score:.4f}")



Selected feature 22 with cross-validation score: 0.8875
Selected feature 24 with cross-validation score: 0.9472
Selected feature 21 with cross-validation score: 0.9631
Selected feature 14 with cross-validation score: 0.9683
Selected feature 16 with cross-validation score: 0.9719
Selected Features: [22, 24, 21, 14, 16]
Best Cross-Validation Score: 0.9719


In [15]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score


# Load dataset (breast cancer dataset for this example)
data = load_breast_cancer()
features = data.data   # Feature matrix
target = data.target   # Class labels

# Hold out 20% of the samples; they are only used for the final accuracy check
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)

# Forward search for 5 features, comparing candidates by 5-fold CV accuracy
sfs = SequentialFeatureSelector(model, n_features_to_select=5, direction='forward', scoring='accuracy', cv=5)

# Fit the feature selector to the training data only
sfs.fit(X_train, y_train)

# Column indices of the features the selector kept
selected_features = sfs.get_support(indices=True)

# Reduce both splits to the selected columns
X_train_selected = sfs.transform(X_train)
X_test_selected = sfs.transform(X_test)

# Train the final model on the reduced training set
model.fit(X_train_selected, y_train)

# Make predictions on the test data with selected features
y_pred = model.predict(X_test_selected)

# Accuracy on the held-out test split. This is NOT a cross-validation score,
# so it is labelled accordingly below. The variable name is kept as-is in case
# later cells read it.
best_score = accuracy_score(y_test, y_pred)

print(f"Selected Features: {selected_features}")
print(f"Test Accuracy: {best_score:.4f}")

Selected Features: [ 0  1 23 24 27]
Best Cross-Validation Score: 0.9561
