In [None]:
!pip install openml

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes, fetch_california_housing
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.ensemble import StackingRegressor, StackingClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
import openml
import time

# Regression

In [None]:
# Install a global "ignore" filter: every warning raised after this point is
# suppressed, which keeps the printed result tables free of library chatter.
# NOTE(review): this also hides warnings that could matter (for example
# invalid-value warnings from the normalizations below) — consider narrowing.
warnings.filterwarnings('ignore')

def compute_stacking_feature_importance(X, y, X_test, y_test, base_models, meta_model, epsilon=1):
    """
    Estimate feature importances for a stacked regressor from its parts.

    Each base model's ``feature_importances_`` vector is normalized to sum to
    one. The vectors are blended using the normalized absolute coefficients
    of ``meta_model`` (which is fitted here on the base models' predictions
    for ``X``), and the blend is multiplied by a per-feature stability factor
    ``1 / (std_across_models + epsilon)`` before a final normalization.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features passed to every base model's ``predict``.
    y : array-like of shape (n_samples,)
        Training targets used to fit ``meta_model``.
    X_test, y_test
        Unused. Accepted only so the signature matches
        ``compute_permutation_feature_importance``.
    base_models : sequence
        Already fitted models exposing ``predict`` and
        ``feature_importances_``.
    meta_model
        Linear model exposing ``fit`` and, once fitted, a 1-D ``coef_`` with
        one entry per base model. It is fitted in place.
    epsilon : float, default=1
        Added to the cross-model standard deviation before inversion. It
        prevents division by zero and sets how strongly disagreement between
        models penalizes a feature. For non-negative importances the
        normalized values lie in [0, 1], so the standard deviation is at most
        0.5 and the default of 1 keeps the adjustment mild.

    Returns
    -------
    numpy.ndarray
        One importance value per feature, normalized to sum to one.
    """
    # Level-1 design matrix: one column of training predictions per base model.
    # Test-set predictions are not needed for the importance estimate, so they
    # are not computed (they would only inflate the measured run time).
    base_train_preds = np.column_stack([model.predict(X) for model in base_models])

    # Fit the meta-model on the stacked predictions.
    meta_model.fit(base_train_preds, y)

    # Relative contribution of each base model: |coef| rescaled to sum to one.
    meta_model_weights = np.abs(meta_model.coef_)
    meta_model_weights = meta_model_weights / np.sum(meta_model_weights)

    # One normalized importance vector per base model, stacked row-wise.
    base_importances = np.array([
        model.feature_importances_ / np.sum(model.feature_importances_)
        for model in base_models
    ])

    # Features on which the base models disagree (high std) are down-weighted.
    importance_std = np.std(base_importances, axis=0)
    stability_adjustment = 1 / (importance_std + epsilon)
    stability_adjustment = stability_adjustment / np.sum(stability_adjustment)

    # Meta-weighted blend of the per-model importance vectors.
    overall_importance = np.dot(meta_model_weights, base_importances)

    # Apply the stability factor and renormalize so the result sums to one.
    adjusted_importance = overall_importance * stability_adjustment
    return adjusted_importance / np.sum(adjusted_importance)

def compute_permutation_feature_importance(X, y, X_test, y_test, base_models, meta_model):
    """
    Compute normalized permutation feature importance of a StackingRegressor.

    A ``StackingRegressor`` is built from ``base_models`` and ``meta_model``,
    fitted on ``(X, y)``, and scored with ``permutation_importance`` on
    ``(X_test, y_test)`` using 10 repeats and a fixed random state.

    Parameters
    ----------
    X, y
        Training data used to fit the stacking model.
    X_test, y_test
        Held-out data on which features are permuted.
    base_models : sequence
        Base estimators; any number is accepted.
    meta_model
        Final estimator of the stack.

    Returns
    -------
    numpy.ndarray
        Mean permutation importances divided by their sum.
    """
    # Labels for the six models this notebook uses; any further model gets a
    # positional fallback name so the function works for any ensemble size.
    preferred_names = ('xgb', 'lgb', 'rf', 'et', 'dt', 'ada')
    named_estimators = [
        (preferred_names[idx] if idx < len(preferred_names) else f'model_{idx}', model)
        for idx, model in enumerate(base_models)
    ]

    # NOTE: per the scikit-learn documentation cv=None selects the default
    # 5-fold cross-validation for building the meta-features, and the base
    # estimators are cloned and refitted, so models fitted by the caller are
    # not reused. passthrough=False keeps the original features out of the
    # meta-model's input.
    stacking_model = StackingRegressor(
        estimators=named_estimators,
        final_estimator=meta_model,
        cv=None,
        passthrough=False
    )

    # Train the stacking model
    stacking_model.fit(X, y)

    # Score drop caused by shuffling each feature, averaged over 10 shuffles.
    result = permutation_importance(
        estimator=stacking_model,
        X=X_test,
        y=y_test,
        n_repeats=10,
        random_state=42
    )

    # Rescale so the values sum to one.
    # NOTE(review): permutation importances can be negative; this assumes the
    # total is positive — confirm the intended handling of negative values.
    perm_importance = result.importances_mean
    perm_importance = perm_importance / np.sum(perm_importance)

    return perm_importance

def load_dataset(dataset_name):
    """Load dataset by name"""
    if dataset_name == "california":
        data = fetch_california_housing()
        X, y = data.data, data.target
        feature_names = data.feature_names
    elif dataset_name == "diabetes":
        data = load_diabetes()
        X, y = data.data, data.target
        feature_names = data.feature_names
    elif dataset_name == "concrete":
        # Concrete Compressive Strength from OpenML
        dataset = openml.datasets.get_dataset(44959)
        data = dataset.get_data(dataset_format="dataframe")
        X = data[0].drop('strength', axis=1)
        y = data[0]['strength']
        feature_names = X.columns.tolist()
        X = X.values
    elif dataset_name == "air":
        # Energy Efficiency from OpenML (heating load)
        dataset = openml.datasets.get_dataset(547)
        data = dataset.get_data(dataset_format="dataframe")
        X = data[0].drop(['no2_concentration'], axis=1)
        y = data[0]['no2_concentration']  # Heating Load
        feature_names = X.columns.tolist()
        X = X.values
    elif dataset_name == "water":
        !kaggle datasets download -d devanshibavaria/water-potability-dataset-with-10-parameteres
        !unzip -q water-potability-dataset-with-10-parameteres.zip
        df = pd.read_csv("water_potability.csv")
        X = df.drop('Potability', axis=1)  # Features (10 parameters)
        y = df['Potability']               # Target (binary: 0/1)
        X = X.fillna(X.mean())
        X = X.values
        feature_names = df.drop('Potability', axis=1).columns.tolist()
    else:
        raise ValueError(f"Unknown dataset: {dataset_name}")
    
    return X, y, feature_names

def compare_feature_importance_methods(dataset_name):
    """
    Compare "Method 3" with permutation importance on one regression dataset.

    The dataset is loaded, standardized and split 80/20. Six base regressors
    are fitted, both importance methods are computed and timed, and every
    base model plus a ``StackingRegressor`` is scored on the test split.

    Parameters
    ----------
    dataset_name : str
        Name understood by ``load_dataset``.

    Returns
    -------
    dict
        'comparison'       : DataFrame with columns 'Feature', 'Method 3',
                             'Permutation Importance', sorted by 'Method 3'
                             in descending order.
        'correlation'      : Pearson correlation between the two importance
                             vectors.
        'method3_time'     : seconds spent in
                             ``compute_stacking_feature_importance``.
        'perm_method_time' : seconds spent in
                             ``compute_permutation_feature_importance``.
        'model_scores'     : {model name: {'MSE': ..., 'R2': ...}}, with the
                             stack stored under 'Stacking'.
    """
    print(f"\nAnalyzing dataset: {dataset_name}")

    # Load dataset
    X, y, feature_names = load_dataset(dataset_name)

    # Scale features
    # NOTE(review): the scaler is fitted on all rows before the split, so
    # test-set statistics leak into the transform — confirm this is acceptable.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Initialize models
    base_models = [
        xgb.XGBRegressor(random_state=42),
        lgb.LGBMRegressor(random_state=42, verbose=-1),
        RandomForestRegressor(random_state=42),
        ExtraTreesRegressor(random_state=42),
        DecisionTreeRegressor(random_state=42),
        AdaBoostRegressor(random_state=42)
    ]

    meta_model = Ridge()

    # Train base models (outside the timed sections below)
    for model in base_models:
        model.fit(X_train, y_train)

    # Time and compute Method 3
    start_time = time.time()
    method3_importance = compute_stacking_feature_importance(
        X_train, y_train, X_test, y_test, base_models, meta_model
    )
    method3_method_time = time.time() - start_time

    # Time and compute permutation importance with StackingRegressor
    start_time = time.time()
    perm_importance = compute_permutation_feature_importance(
        X_train, y_train, X_test, y_test, base_models, meta_model
    )
    perm_method_time = time.time() - start_time

    # Side-by-side table. The original referenced the undefined names
    # `your_importance` / `your_method_time` here, which raised NameError.
    comparison_df = pd.DataFrame({
        'Feature': feature_names,
        'Method 3': method3_importance,
        'Permutation Importance': perm_importance
    })

    # Sort by Method 3's importance
    comparison_df = comparison_df.sort_values('Method 3', ascending=False)

    # Calculate correlation between methods
    correlation = np.corrcoef(method3_importance, perm_importance)[0, 1]

    # Base model performance metrics on the held-out split
    model_scores = {}
    for model in base_models:
        model_name = model.__class__.__name__
        y_pred = model.predict(X_test)
        model_scores[model_name] = {
            'MSE': mean_squared_error(y_test, y_pred),
            'R2': r2_score(y_test, y_pred)
        }

    # Stacking model for performance evaluation
    named_estimators = [
        ('xgb', base_models[0]),
        ('lgb', base_models[1]),
        ('rf', base_models[2]),
        ('et', base_models[3]),
        ('dt', base_models[4]),
        ('ada', base_models[5])
    ]

    stacking_model = StackingRegressor(
        estimators=named_estimators,
        final_estimator=meta_model,
        cv=None,
        passthrough=False
    )

    stacking_model.fit(X_train, y_train)
    stacking_pred = stacking_model.predict(X_test)
    model_scores['Stacking'] = {
        'MSE': mean_squared_error(y_test, stacking_pred),
        'R2': r2_score(y_test, stacking_pred)
    }

    return {
        'comparison': comparison_df,
        'correlation': correlation,
        'method3_time': method3_method_time,
        'perm_method_time': perm_method_time,
        'model_scores': model_scores
    }

def visualize_comparison(results, dataset_name):
    """
    Draw a grouped bar chart of both importance measures, one group per feature.

    ``results`` must provide the 'comparison' DataFrame (columns 'Feature',
    'Method 3', 'Permutation Importance') and the 'correlation' value shown
    in the title. Returns the matplotlib figure that was drawn.
    """
    table = results['comparison']
    x = np.arange(len(table))
    width = 0.35

    figure = plt.figure(figsize=(12, 8))

    # Left bar of each pair is Method 3, right bar is permutation importance.
    for offset, column in ((-width/2, 'Method 3'), (width/2, 'Permutation Importance')):
        plt.bar(x + offset, table[column], width, label=column)

    plt.xlabel('Features')
    plt.ylabel('Importance (normalized)')
    plt.title(f'Feature Importance Comparison - {dataset_name.title()} Dataset\nCorrelation: {results["correlation"]:.4f}')
    plt.xticks(x, table['Feature'], rotation=90)
    plt.legend()
    plt.tight_layout()

    return figure

def analyze_all_datasets():
    """
    Run the regression comparison on every configured dataset.

    Returns ``(all_results, summary_df)``: a dict mapping dataset name to the
    result dict of ``compare_feature_importance_methods``, and a DataFrame
    with one summary row per dataset that completed. A failing dataset is
    reported on stdout and skipped in the summary.
    """
    all_results = {}
    summary_rows = []

    for dataset in ("california", "diabetes", "air", "concrete"):
        try:
            outcome = compare_feature_importance_methods(dataset)
            all_results[dataset] = outcome

            # Build the figure, then close it straight away.
            plt.close(visualize_comparison(outcome, dataset))

            # Names of the three highest-ranked features under each method.
            table = outcome['comparison']
            best_method3 = table.nlargest(3, 'Method 3')['Feature'].tolist()
            best_perm = table.nlargest(3, 'Permutation Importance')['Feature'].tolist()

            row = {
                'Dataset': dataset.title(),
                'Correlation': outcome['correlation'],
                'Method 3 Time (s)': outcome['method3_time'],
                'Permutation Time (s)': outcome['perm_method_time'],
                'Top-3 Features (Method 3)': ', '.join(best_method3),
                'Top-3 Features (Perm)': ', '.join(best_perm),
                'Stacking R2': outcome['model_scores']['Stacking']['R2'],
                'Speedup Factor': outcome['perm_method_time'] / outcome['method3_time']
            }
            summary_rows.append(row)
        except Exception as e:
            print(f"Error processing dataset {dataset}: {str(e)}")

    return all_results, pd.DataFrame(summary_rows)

# Run the regression analysis on all configured datasets. `all_results` maps
# dataset name -> result dict; `summary_df` has one row per completed dataset.
all_results, summary_df = analyze_all_datasets()

# Print the one-row-per-dataset summary table
print("\n===== SUMMARY OF RESULTS =====")
print(summary_df.to_string(index=False))

# Detailed report for every dataset present in all_results
for dataset_name, results in all_results.items():
    print(f"\n\n===== DETAILED RESULTS FOR {dataset_name.upper()} =====")
    print("\nFeature Importance Comparison:")
    print(results['comparison'].to_string(index=False))

    print(f"\nCorrelation between methods: {results['correlation']:.4f}")
    print(f"method 3 execution time: {results['method3_time']:.4f} seconds")
    print(f"Permutation importance execution time: {results['perm_method_time']:.4f} seconds")
    # Speedup = permutation time divided by Method 3 time
    print(f"Speedup factor: {results['perm_method_time'] / results['method3_time']:.2f}x")

    # Per-model test metrics, one row per model (the stack is 'Stacking')
    print("\nModel Performance:")
    model_df = pd.DataFrame.from_dict(results['model_scores'], orient='index')
    print(model_df.to_string())

# Classification

In [None]:
# Install a global "ignore" filter: every warning raised after this point is
# suppressed, which keeps the printed result tables free of library chatter.
# NOTE(review): this also hides warnings that could matter (for example
# invalid-value warnings from the normalizations below) — consider narrowing.
warnings.filterwarnings('ignore')

def compute_stacking_feature_importance(X, y, X_test, y_test, base_models, meta_model, epsilon=1):
    """
    Estimate feature importances for a stacked classifier from its parts.

    Each base model contributes a normalized importance vector. The vectors
    are blended using the normalized absolute coefficients of ``meta_model``
    (which is fitted here on the base models' outputs for ``X``), and the
    blend is multiplied by a per-feature stability factor
    ``1 / (std_across_models + epsilon)`` before a final normalization.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features passed to every base model.
    y : array-like of shape (n_samples,)
        Training labels used to fit ``meta_model``.
    X_test, y_test
        Unused. Accepted only so the signature matches
        ``compute_permutation_feature_importance``.
    base_models : sequence
        Already fitted models. Column 1 of ``predict_proba`` is used as the
        meta-feature when available, otherwise ``predict``. Importances come
        from ``feature_importances_``, else ``|coef_[0]|``, else a uniform
        vector.
    meta_model
        Model exposing ``fit``; it is fitted in place. If it exposes
        ``coef_`` after fitting, ``|coef_[0]|`` weights the base models;
        otherwise all base models get equal weight.
    epsilon : float, default=1
        Added to the cross-model standard deviation before inversion. It
        prevents division by zero and sets how strongly disagreement between
        models penalizes a feature. For non-negative importances the
        normalized values lie in [0, 1], so the standard deviation is at most
        0.5 and the default of 1 keeps the adjustment mild.

    Returns
    -------
    numpy.ndarray
        One importance value per feature, normalized to sum to one.
    """
    # Level-1 design matrix: one column per base model. Test-set predictions
    # are not needed for the importance estimate, so they are not computed
    # (they would only inflate the measured run time).
    base_train_preds = np.column_stack([
        model.predict_proba(X)[:, 1] if hasattr(model, 'predict_proba') else model.predict(X)
        for model in base_models
    ])

    # Fit the meta-model on the stacked outputs.
    meta_model.fit(base_train_preds, y)

    # Relative contribution of each base model, rescaled to sum to one.
    if hasattr(meta_model, 'coef_'):
        meta_model_weights = np.abs(meta_model.coef_[0])
    else:
        meta_model_weights = np.ones(len(base_models))
    meta_model_weights = meta_model_weights / np.sum(meta_model_weights)

    # One normalized importance vector per base model, stacked row-wise.
    base_importances = []
    for model in base_models:
        if hasattr(model, 'feature_importances_'):
            importance = model.feature_importances_
        elif hasattr(model, 'coef_'):
            importance = np.abs(model.coef_[0])
        else:
            # No importance information: treat every feature equally.
            importance = np.ones(X.shape[1])
        base_importances.append(importance / np.sum(importance))
    base_importances = np.array(base_importances)

    # Features on which the base models disagree (high std) are down-weighted.
    importance_std = np.std(base_importances, axis=0)
    stability_adjustment = 1 / (importance_std + epsilon)
    stability_adjustment = stability_adjustment / np.sum(stability_adjustment)

    # Meta-weighted blend of the per-model importance vectors.
    overall_importance = np.dot(meta_model_weights, base_importances)

    # Apply the stability factor and renormalize so the result sums to one.
    adjusted_importance = overall_importance * stability_adjustment
    return adjusted_importance / np.sum(adjusted_importance)

def compute_permutation_feature_importance(X, y, X_test, y_test, base_models, meta_model):
    """
    Compute normalized permutation feature importance of a StackingClassifier.

    A ``StackingClassifier`` is built from ``base_models`` and ``meta_model``,
    fitted on ``(X, y)``, and scored with ``permutation_importance`` on
    ``(X_test, y_test)`` using 10 repeats and a fixed random state.

    Parameters
    ----------
    X, y
        Training data used to fit the stacking model.
    X_test, y_test
        Held-out data on which features are permuted.
    base_models : sequence
        Base estimators; any number is accepted.
    meta_model
        Final estimator of the stack.

    Returns
    -------
    numpy.ndarray
        Mean permutation importances divided by their sum.
    """
    # Labels for the six models this notebook uses; any further model gets a
    # positional fallback name so the function works for any ensemble size.
    preferred_names = ('xgb', 'lgb', 'rf', 'et', 'dt', 'ada')
    named_estimators = [
        (preferred_names[idx] if idx < len(preferred_names) else f'model_{idx}', model)
        for idx, model in enumerate(base_models)
    ]

    # NOTE: per the scikit-learn documentation cv=None selects the default
    # 5-fold cross-validation for building the meta-features, and the base
    # estimators are cloned and refitted, so models fitted by the caller are
    # not reused. passthrough=False keeps the original features out of the
    # meta-model's input.
    stacking_model = StackingClassifier(
        estimators=named_estimators,
        final_estimator=meta_model,
        cv=None,
        passthrough=False
    )

    # Train the stacking model
    stacking_model.fit(X, y)

    # Score drop caused by shuffling each feature, averaged over 10 shuffles.
    result = permutation_importance(
        estimator=stacking_model,
        X=X_test,
        y=y_test,
        n_repeats=10,
        random_state=42
    )

    # Rescale so the values sum to one.
    # NOTE(review): permutation importances can be negative; this assumes the
    # total is positive — confirm the intended handling of negative values.
    perm_importance = result.importances_mean
    perm_importance = perm_importance / np.sum(perm_importance)

    return perm_importance

def load_dataset(dataset_name):
    """Load dataset by name"""
    if dataset_name == "california":
        data = fetch_california_housing()
        X, y = data.data, (data.target > data.target.mean()).astype(int)  # Binary classification
        feature_names = data.feature_names
    elif dataset_name == "diabetes":
        data = load_diabetes()
        X, y = data.data, (data.target > data.target.mean()).astype(int)  # Binary classification
        feature_names = data.feature_names
    elif dataset_name == "concrete":
        # Concrete Compressive Strength from OpenML
        dataset = openml.datasets.get_dataset(44959)
        data = dataset.get_data(dataset_format="dataframe")
        X = data[0].drop('strength', axis=1)
        y = (data[0]['strength'] > data[0]['strength'].mean()).astype(int)  # Binary classification
        feature_names = X.columns.tolist()
        X = X.values
    elif dataset_name == "parkinson":
        # Parkinson's dataset from OpenML
        dataset = openml.datasets.get_dataset(1488)
        data = dataset.get_data(dataset_format="dataframe")
        X = data[0].drop(['Class'], axis=1)
        y = data[0]['Class']  # Already categorical

        y_labels = data[0]['Class']
        label_map = {label: i for i, label in enumerate(y_labels.unique())}
        y = y_labels.map(label_map)
        
        feature_names = X.columns.tolist()
        X = X.values
    elif dataset_name == "air":
        # Air Quality dataset from OpenML
        dataset = openml.datasets.get_dataset(547)
        data = dataset.get_data(dataset_format="dataframe")
        X = data[0].drop(['no2_concentration'], axis=1)
        y = (data[0]['no2_concentration'] > data[0]['no2_concentration'].mean()).astype(int)  # Binary classification
        feature_names = X.columns.tolist()
        X = X.values
    elif dataset_name == "water":
        # Water Potability dataset (already binary classification)
        !kaggle datasets download -d devanshibavaria/water-potability-dataset-with-10-parameteres
        !unzip -q water-potability-dataset-with-10-parameteres.zip
        df = pd.read_csv("water_potability.csv")
        X = df.drop('Potability', axis=1)  # Features (10 parameters)
        y = df['Potability']               # Target (binary: 0/1)
        X = X.fillna(X.mean())
        X = X.values
        feature_names = df.drop('Potability', axis=1).columns.tolist()
    else:
        raise ValueError(f"Unknown dataset: {dataset_name}")
    
    return X, y, feature_names

def compare_feature_importance_methods(dataset_name):
    """
    Compare the stacking-based importance with permutation importance on one
    classification dataset.

    The dataset is loaded, standardized and split 80/20. Six base classifiers
    are fitted, both importance methods are computed and timed, and every
    base model plus a ``StackingClassifier`` is scored on the test split.

    Returns a dict with keys 'comparison' (DataFrame sorted by 'Your Method'
    in descending order), 'correlation', 'your_method_time',
    'perm_method_time' and 'model_scores'
    ({model name: {'Accuracy': ..., 'F1 Score': ...}}).
    """
    print(f"\nAnalyzing dataset: {dataset_name}")

    features, target, feature_names = load_dataset(dataset_name)

    # Standardize all rows, then hold out 20% for evaluation.
    standardized = StandardScaler().fit_transform(features)
    X_train, X_test, y_train, y_test = train_test_split(
        standardized, target, test_size=0.2, random_state=42
    )

    # Base learners, in the order matching the estimator labels used below.
    learners = [
        xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        lgb.LGBMClassifier(random_state=42, verbose=-1),
        RandomForestClassifier(random_state=42),
        ExtraTreesClassifier(random_state=42),
        DecisionTreeClassifier(random_state=42),
        AdaBoostClassifier(random_state=42)
    ]
    blender = LogisticRegression(random_state=42)

    # Fit the base learners before any timing starts.
    for learner in learners:
        learner.fit(X_train, y_train)

    # Timed section 1: stacking-based importance.
    tick = time.time()
    stacking_importance = compute_stacking_feature_importance(
        X_train, y_train, X_test, y_test, learners, blender
    )
    your_method_time = time.time() - tick

    # Timed section 2: permutation importance of a StackingClassifier.
    tick = time.time()
    perm_importance = compute_permutation_feature_importance(
        X_train, y_train, X_test, y_test, learners, blender
    )
    perm_method_time = time.time() - tick

    # Side-by-side table, most important feature (by the stacking method) first.
    comparison_df = pd.DataFrame({
        'Feature': feature_names,
        'Your Method': stacking_importance,
        'Permutation Importance': perm_importance
    }).sort_values('Your Method', ascending=False)

    # Pearson correlation between the two importance vectors.
    correlation = np.corrcoef(stacking_importance, perm_importance)[0, 1]

    # Test-set metrics for every base learner, keyed by class name.
    model_scores = {}
    for learner in learners:
        predicted = learner.predict(X_test)
        model_scores[type(learner).__name__] = {
            'Accuracy': accuracy_score(y_test, predicted),
            'F1 Score': f1_score(y_test, predicted, average='weighted')
        }

    # Test-set metrics for the full stack.
    labels = ('xgb', 'lgb', 'rf', 'et', 'dt', 'ada')
    stacking_model = StackingClassifier(
        estimators=[(labels[position], learners[position]) for position in range(6)],
        final_estimator=blender,
        cv=None,
        passthrough=False
    )
    stacking_model.fit(X_train, y_train)
    stacking_pred = stacking_model.predict(X_test)
    model_scores['Stacking'] = {
        'Accuracy': accuracy_score(y_test, stacking_pred),
        'F1 Score': f1_score(y_test, stacking_pred, average='weighted')
    }

    return {
        'comparison': comparison_df,
        'correlation': correlation,
        'your_method_time': your_method_time,
        'perm_method_time': perm_method_time,
        'model_scores': model_scores
    }

def visualize_comparison(results, dataset_name):
    """
    Draw a grouped bar chart of both importance measures, one group per feature.

    ``results`` must provide the 'comparison' DataFrame (columns 'Feature',
    'Your Method', 'Permutation Importance') and the 'correlation' value
    shown in the title. Returns the matplotlib figure that was drawn.
    """
    table = results['comparison']
    x = np.arange(len(table))
    width = 0.35

    figure = plt.figure(figsize=(12, 8))

    # Left bar of each pair is the stacking method, right bar is permutation.
    for offset, column in ((-width/2, 'Your Method'), (width/2, 'Permutation Importance')):
        plt.bar(x + offset, table[column], width, label=column)

    plt.xlabel('Features')
    plt.ylabel('Importance (normalized)')
    plt.title(f'Feature Importance Comparison - {dataset_name.title()} Dataset\nCorrelation: {results["correlation"]:.4f}')
    plt.xticks(x, table['Feature'], rotation=90)
    plt.legend()
    plt.tight_layout()

    return figure

def analyze_all_datasets():
    """
    Run the classification comparison on every configured dataset.

    Returns ``(all_results, summary_df)``: a dict mapping dataset name to the
    result dict of ``compare_feature_importance_methods``, and a DataFrame
    with one summary row per dataset that completed. A failing dataset is
    reported on stdout and skipped in the summary.
    """
    all_results = {}
    summary_rows = []

    for dataset in ("water",):
        try:
            outcome = compare_feature_importance_methods(dataset)
            all_results[dataset] = outcome

            # Build the figure, then close it straight away.
            plt.close(visualize_comparison(outcome, dataset))

            # Names of the three highest-ranked features under each method.
            table = outcome['comparison']
            best_stacking = table.nlargest(3, 'Your Method')['Feature'].tolist()
            best_perm = table.nlargest(3, 'Permutation Importance')['Feature'].tolist()

            stack_scores = outcome['model_scores']['Stacking']
            row = {
                'Dataset': dataset.title(),
                'Correlation': outcome['correlation'],
                'Your Method Time (s)': outcome['your_method_time'],
                'Permutation Time (s)': outcome['perm_method_time'],
                'Top-3 Features (Your)': ', '.join(best_stacking),
                'Top-3 Features (Perm)': ', '.join(best_perm),
                'Stacking Accuracy': stack_scores['Accuracy'],
                'Stacking F1 Score': stack_scores['F1 Score'],
                'Speedup Factor': outcome['perm_method_time'] / outcome['your_method_time']
            }
            summary_rows.append(row)
        except Exception as e:
            print(f"Error processing dataset {dataset}: {str(e)}")

    return all_results, pd.DataFrame(summary_rows)

# Run the classification analysis on all configured datasets. `all_results`
# maps dataset name -> result dict; `summary_df` has one row per completed
# dataset.
all_results, summary_df = analyze_all_datasets()

# Print the one-row-per-dataset summary table
print("\n===== SUMMARY OF RESULTS =====")
print(summary_df.to_string(index=False))

# Detailed report for every dataset present in all_results
for dataset_name, results in all_results.items():
    print(f"\n\n===== DETAILED RESULTS FOR {dataset_name.upper()} =====")
    print("\nFeature Importance Comparison:")
    print(results['comparison'].to_string(index=False))

    print(f"\nCorrelation between methods: {results['correlation']:.4f}")
    print(f"Your method execution time: {results['your_method_time']:.4f} seconds")
    print(f"Permutation importance execution time: {results['perm_method_time']:.4f} seconds")
    # Speedup = permutation time divided by the stacking method's time
    print(f"Speedup factor: {results['perm_method_time'] / results['your_method_time']:.2f}x")

    # Per-model test metrics, one row per model (the stack is 'Stacking')
    print("\nModel Performance:")
    model_df = pd.DataFrame.from_dict(results['model_scores'], orient='index')
    print(model_df.to_string())