In [2]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_iris
from sklearn.metrics import f1_score
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_array, check_X_y

In [3]:
class SFS(BaseEstimator, TransformerMixin):
    def __init__(self, n_features_to_select):
        self.n_features_to_select = n_features_to_select

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.n_features_ = X.shape[1]
        self.selected_features_ = []

        # Initialize the list of selected features
        current_features = []
        remaining_features = list(range(X.shape[1]))

        while len(current_features) < self.n_features_to_select:
            best_score = -np.inf
            best_feature = None

            for feature in remaining_features:
                temp_features = current_features + [feature]
                score = self.evaluate_subset(X[:, temp_features], y)
                if score > best_score:
                    best_score = score
                    best_feature = feature

            if best_feature is not None:
                current_features.append(best_feature)
                remaining_features.remove(best_feature)
                self.selected_features_ = current_features
            else:
                break

        return self

    def transform(self, X):
        check_array(X)
        return X[:, self.selected_features_]

    def evaluate_subset(self, X_subset, y):
        # Apply EM clustering
        gmm = GaussianMixture(n_components=len(np.unique(y)), random_state=42)
        gmm.fit(X_subset)
        labels = gmm.predict(X_subset)

        # Compute scatter matrices
        class_means = np.array(
            [X_subset[y == label].mean(axis=0) for label in np.unique(y)]
        )
        overall_mean = X_subset.mean(axis=0)

        # Within-class scatter matrix
        S_W = np.sum(
            [
                np.cov(X_subset[y == label].T, bias=True) * (np.sum(y == label) - 1)
                for label in np.unique(y)
            ],
            axis=0,
        )

        # Between-class scatter matrix
        S_B = np.sum(
            [
                (
                    np.sum(y == label)
                    * np.outer(mean - overall_mean, mean - overall_mean)
                )
                for label, mean in zip(np.unique(y), class_means)
            ],
            axis=0,
        )

        # Scatter Discriminability
        scatter_discriminability = np.trace(np.linalg.inv(S_W).dot(S_B))
        return scatter_discriminability


# Load sample dataset
data = load_iris()
X = data.data
y = data.target

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Initialize and fit SFS
sfs = SFS(n_features_to_select=2)
sfs.fit(X_train, y_train)

# Transform the training and test set
X_train_selected = sfs.transform(X_train)
X_test_selected = sfs.transform(X_test)

# Train a classifier on the selected features
from sklearn.svm import SVC

classifier = SVC()
classifier.fit(X_train_selected, y_train)

# Predict and evaluate
y_pred = classifier.predict(X_test_selected)
score = f1_score(y_test, y_pred, average="weighted")

print(f"Selected Features: {sfs.selected_features_}")
print(f"F1 Score: {score}")

LinAlgError: 0-dimensional array given. Array must be at least two-dimensional