In [None]:
# wine_feature_pipeline.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

class FeatureSelectionPipeline:
    """Compare filter, wrapper and embedded feature-selection methods.

    Typical call order::

        pipeline = FeatureSelectionPipeline(df, target="label")
        pipeline.preprocess()
        pipeline.filter_methods()
        pipeline.wrapper_methods()
        pipeline.embedded_methods()
        pipeline.evaluate_models()
        pipeline.generate_report()
        pipeline.plot_feature_importance()

    The hold-out split is created once in ``preprocess``. The scaler and
    every selector are fitted on the training rows only, so the metrics
    produced by ``evaluate_models`` are not inflated by information leaking
    from the test rows into scaling or feature selection.

    Attributes set by ``__init__``:
        df, target: the inputs as given.
        X, y: feature frame (``df`` without ``target``) and target column.
        random_state: seed shared by the split and the seeded models.
        scaler: the ``StandardScaler`` fitted in ``preprocess``.
        results: metrics table filled by ``evaluate_models``.
        selected_features: mapping of method name -> list of column names.

    Attributes set by ``preprocess`` (``None`` until then):
        X_scaled: all rows, scaled with the training-set statistics.
        X_train, X_test, y_train, y_test: the scaled hold-out split.
    """

    def __init__(self, df: pd.DataFrame, target: str, random_state=42):
        """Store the data and split it into features and target.

        Args:
            df: Data set containing the feature columns and ``target``.
            target: Name of the target column in ``df``.
            random_state: Seed for the train/test split, the random forests
                and XGBoost. The default of 42 reproduces the split the
                pipeline has always used; pass ``None`` for unseeded runs.
        """
        self.df = df
        self.target = target
        self.X = df.drop(columns=[target])
        self.y = df[target]
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.results = pd.DataFrame()
        self.selected_features = {}
        # Populated by preprocess(); None marks "not preprocessed yet".
        self.X_scaled = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def _scale(self, frame):
        """Apply the fitted scaler to ``frame``, keeping its labels and index."""
        return pd.DataFrame(
            self.scaler.transform(frame),
            columns=frame.columns,
            index=frame.index,
        )

    def _training_data(self):
        """Return ``(X_train, y_train)``.

        Raises:
            AttributeError: If ``preprocess()`` has not been called yet.
        """
        X_train = getattr(self, "X_train", None)
        if X_train is None:
            raise AttributeError(
                "preprocess() must be called before selecting or "
                "evaluating features."
            )
        return X_train, self.y_train

    @staticmethod
    def _rank_by_importance(model, X, y, top_k):
        """Fit ``model`` and return the ``top_k`` columns by feature importance."""
        model.fit(X, y)
        importances = pd.Series(model.feature_importances_, index=X.columns)
        return importances.sort_values(ascending=False).head(top_k).index.tolist()

    def preprocess(self, test_size=0.2):
        """Create the hold-out split and standardise the features.

        The scaler is fitted on the training rows only and then applied to
        the training rows, the test rows and the full feature frame.

        Args:
            test_size: Fraction (or absolute number) of rows held out for
                ``evaluate_models``; forwarded to ``train_test_split``.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=self.random_state
        )
        self.scaler.fit(X_train)
        self.X_train = self._scale(X_train)
        self.X_test = self._scale(X_test)
        self.y_train = y_train
        self.y_test = y_test
        # Full scaled frame, index-aligned with self.y.
        self.X_scaled = self._scale(self.X)
        print("Preprocessing done. Features scaled.")

    def filter_methods(self, top_k: int = 8):
        """Select features with a univariate ANOVA F-test (``SelectKBest``).

        Args:
            top_k: Number of features to keep; capped at the number of
                available columns.

        Raises:
            AttributeError: If ``preprocess()`` has not been called yet.
        """
        X_train, y_train = self._training_data()
        k = min(top_k, X_train.shape[1])
        selector = SelectKBest(score_func=f_classif, k=k)
        selector.fit(X_train, y_train)
        selected = X_train.columns[selector.get_support()].tolist()
        self.selected_features['Filter_SelectKBest'] = selected
        print(f"Filter method selected features: {selected}")

    def wrapper_methods(self, top_k: int = 8):
        """Select features by recursive feature elimination around logistic regression.

        Args:
            top_k: Number of features to keep; capped at the number of
                available columns.

        Raises:
            AttributeError: If ``preprocess()`` has not been called yet.
        """
        X_train, y_train = self._training_data()
        k = min(top_k, X_train.shape[1])
        model = LogisticRegression(max_iter=1000)
        rfe = RFE(model, n_features_to_select=k)
        rfe.fit(X_train, y_train)
        selected = X_train.columns[rfe.support_].tolist()
        self.selected_features['Wrapper_RFE'] = selected
        print(f"Wrapper method selected features: {selected}")

    def embedded_methods(self, top_k: int = 8):
        """Select features by random-forest and XGBoost feature importances.

        Args:
            top_k: Number of top-ranked features to keep for each model.

        Raises:
            AttributeError: If ``preprocess()`` has not been called yet.
        """
        X_train, y_train = self._training_data()

        # Random Forest feature importance (seeded so reruns agree).
        forest = RandomForestClassifier(
            n_estimators=100, random_state=self.random_state
        )
        selected = self._rank_by_importance(forest, X_train, y_train, top_k)
        self.selected_features['Embedded_RandomForest'] = selected
        print(f"Embedded method selected features: {selected}")

        # XGBoost feature importance.
        # NOTE(review): use_label_encoder is kept as in the original call;
        # confirm against the pinned xgboost version whether it is still needed.
        model_xgb = xgb.XGBClassifier(
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=self.random_state,
        )
        selected_xgb = self._rank_by_importance(model_xgb, X_train, y_train, top_k)
        self.selected_features['Embedded_XGBoost'] = selected_xgb
        print(f"Embedded XGBoost selected features: {selected_xgb}")

    def evaluate_models(self):
        """Score each selected feature subset with a random forest on the hold-out rows.

        For every entry in ``selected_features`` a fresh classifier is
        trained on the training split restricted to those columns and scored
        on the test split. The metrics table is stored in ``self.results``.

        Raises:
            AttributeError: If ``preprocess()`` has not been called yet.
        """
        X_train, y_train = self._training_data()
        X_test, y_test = self.X_test, self.y_test

        metrics_list = []
        for method, features in self.selected_features.items():
            clf = RandomForestClassifier(
                n_estimators=100, random_state=self.random_state
            )
            clf.fit(X_train[features], y_train)
            y_pred = clf.predict(X_test[features])
            metrics_list.append({
                "Method": method,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred, average='weighted'),
                # zero_division=0 yields the same value as the default but
                # without a warning when a class is never predicted.
                "Precision": precision_score(
                    y_test, y_pred, average='weighted', zero_division=0
                ),
                "Recall": recall_score(y_test, y_pred, average='weighted'),
                "Selected_Features": ", ".join(features),
            })

        self.results = pd.DataFrame(metrics_list)
        print("Evaluation completed. Metrics stored.")

    def generate_report(self, output_path="wine_feature_report.xlsx"):
        """Write the metrics and the selected features to an Excel workbook.

        Args:
            output_path: Destination file. It receives a ``Metrics`` sheet
                and a ``Selected_Features`` sheet with one column per method.
        """
        with pd.ExcelWriter(output_path) as writer:
            self.results.to_excel(writer, index=False, sheet_name='Metrics')

            # Wrapping each list in a Series lets methods that picked
            # different numbers of features share one frame (NaN-padded).
            features_df = pd.DataFrame(
                {method: pd.Series(names) for method, names in self.selected_features.items()}
            )
            features_df.to_excel(writer, index=False, sheet_name='Selected_Features')

        print(f"Report generated at {output_path}")

    def selection_frequency(self):
        """Count how many selection methods picked each feature.

        Returns:
            An integer ``pd.Series`` indexed by feature name, ordered by
            descending count and then by feature name, so that ties come out
            in the same order on every run.
        """
        counts = {}
        for features in self.selected_features.values():
            # dict.fromkeys de-duplicates, so a method counts at most once
            # per feature even if its list repeats a name.
            for feat in dict.fromkeys(features):
                counts[feat] = counts.get(feat, 0) + 1
        ordered = sorted(counts.items(), key=lambda item: (-item[1], str(item[0])))
        return pd.Series(dict(ordered), dtype="int64")

    def plot_feature_importance(self):
        """Plot how often each feature was selected across the methods."""
        importance_series = self.selection_frequency()
        if importance_series.empty:
            print("No selected features to plot.")
            return

        plt.figure(figsize=(10, 6))
        sns.barplot(x=importance_series.values, y=importance_series.index)
        plt.title("Feature Selection Frequency Across Methods")
        plt.xlabel("Number of Methods Selected")
        plt.ylabel("Feature")
        plt.show()

In [None]:
# Red-wine quality data, read as a semicolon-delimited CSV from the UCI archive.
WINE_DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(WINE_DATA_URL, sep=';')

# Binarise the target: a quality score of 6 or more becomes 1 ("good"), anything lower 0.
df['quality'] = df['quality'].ge(6).astype(int)

pipeline = FeatureSelectionPipeline(df, target='quality')

# Run the stages in order: scaling, the three selection families, evaluation,
# the Excel report and finally the selection-frequency plot.
for run_stage in (
    pipeline.preprocess,
    pipeline.filter_methods,
    pipeline.wrapper_methods,
    pipeline.embedded_methods,
    pipeline.evaluate_models,
    pipeline.generate_report,
    pipeline.plot_feature_importance,
):
    run_stage()