# Unfinished FeatureSelector

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import chi2, f_classif, f_regression, mutual_info_classif, mutual_info_regression
from statsmodels.stats.multitest import multipletests
import numpy as np

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Select DataFrame columns with univariate tests and FDR correction.

    Each configured test is run on its own group of columns. The p-values
    from all p-value-producing tests are pooled, corrected together with the
    Benjamini-Hochberg procedure (``method='fdr_bh'``), and the columns whose
    corrected p-value is below ``alpha`` are kept.

    Mutual-information tests return scores only, so they cannot take part in
    the FDR correction. Their scores are stored in ``mutual_info_scores_``
    for manual thresholding and never select a column by themselves.

    Parameters
    ----------
    tests_to_features : dict
        Maps a test name to an iterable of column names of ``X``. Supported
        names are ``'chi2'``, ``'f_classif'``, ``'f_regression'``,
        ``'mutual_info_classif'`` and ``'mutual_info_regression'``.
    alpha : float, default=0.05
        Threshold applied to the FDR-corrected p-values.

    Attributes
    ----------
    selected_features_ : list of int
        Positional column indices (relative to the ``X`` given to ``fit``)
        of the selected features. Empty until ``fit`` has been called.
    mutual_info_scores_ : dict
        Maps a column name to its mutual-information score. Set by ``fit``.
    """

    def __init__(self, tests_to_features, alpha=0.05):
        self.tests_to_features = tests_to_features
        self.alpha = alpha
        # Kept in __init__ so that transform() before fit() still returns an
        # empty frame, as it always did; fit() resets it on every call.
        self.selected_features_ = []

    def fit(self, X, y=None):
        """Run the configured tests and record the selected column indices.

        Parameters
        ----------
        X : pandas.DataFrame
            Input data; columns are looked up by name via ``X.columns``.
        y : array-like, optional
            Target passed unchanged to every test function.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``tests_to_features`` contains an unsupported test name.
        KeyError
            If a listed feature is not a column of ``X``.
        """
        # Tests that return (scores, p-values).
        p_value_tests = {
            'chi2': chi2,
            'f_classif': f_classif,
            'f_regression': f_regression,
        }
        # Tests that return scores only (no p-values to correct).
        score_only_tests = {
            'mutual_info_classif': mutual_info_classif,
            'mutual_info_regression': mutual_info_regression,
        }

        # Fail loudly on a misspelled test name instead of silently skipping
        # the whole feature group.
        unknown = [
            test for test in self.tests_to_features
            if test not in p_value_tests and test not in score_only_tests
        ]
        if unknown:
            raise ValueError(
                "Unknown test(s) {!r}; supported tests are {!r}".format(
                    unknown, sorted(p_value_tests) + sorted(score_only_tests)
                )
            )

        # Reset learned state so a refit never reuses a previous selection.
        self.selected_features_ = []
        self.mutual_info_scores_ = {}

        p_values = []
        tested_indices = []  # stays aligned one-to-one with p_values

        for test, features in self.tests_to_features.items():
            indices = [X.columns.get_loc(feature) for feature in features]
            if not indices:
                # Nothing to test for this entry.
                continue

            subset = X.iloc[:, indices]
            if test in score_only_tests:
                scores = score_only_tests[test](subset, y)
                # Keep the scores available for manual thresholding; they are
                # deliberately excluded from the p-value bookkeeping below.
                for feature, score in zip(features, scores):
                    self.mutual_info_scores_[feature] = score
                continue

            _, pvals = p_value_tests[test](subset, y)
            p_values.extend(pvals)
            tested_indices.extend(indices)

        if p_values:
            pooled = np.asarray(p_values, dtype=float)
            # Undefined p-values (NaN, e.g. from constant features) are
            # treated as 1.0 so they are never selected and cannot turn the
            # other corrected p-values into NaN.
            pooled = np.where(np.isnan(pooled), 1.0, pooled)
            _, pvals_corrected, _, _ = multipletests(
                pooled, alpha=self.alpha, method='fdr_bh'
            )
            self.selected_features_ = [
                index
                for index, pval in zip(tested_indices, pvals_corrected)
                if pval < self.alpha
            ]

        return self

    def transform(self, X):
        """Return only the columns of ``X`` selected during ``fit``.

        Columns are picked by position, so ``X`` must have the same column
        order as the frame passed to ``fit``.
        """
        return X.iloc[:, self.selected_features_]

# Example usage.
# `data1` is expected to be a DataFrame defined earlier. The column to predict
# is 'Total Cholesterol (mg/dL)', so it must not take part in feature selection.
target_column = 'Total Cholesterol (mg/dL)'
data10 = data1.drop(columns=['Gender', 'Birthplace'])

# Separate the candidate predictors from the target once, up front.
predictors = data10.drop(columns=[target_column])
target = data10[target_column]
features_for_selection = data10.columns.drop(target_column)

# Test every candidate column with f_regression.
selector = FeatureSelector({'f_regression': features_for_selection}, alpha=0.05)

# Fit on the predictors/target and keep only the selected columns.
X_new = selector.fit_transform(predictors, target)