In [1]:
import time
import warnings
import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.datasets import make_classification
import threading
import os

# --- Plotting Libraries ---
import matplotlib.pyplot as plt
import seaborn as sns

# --- Profiling and Estimator Libraries ---
from memory_profiler import memory_usage
try:
    # Import only the NVML names this notebook actually uses, rather than
    # `from pynvml import *`, so it is clear where each name comes from.
    from pynvml import (
        NVMLError,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlInit,
        nvmlShutdown,
    )
    pynvml_available = True
except ImportError:
    pynvml_available = False

    class NVMLError(Exception):
        """Placeholder so `except NVMLError` clauses stay valid when pynvml is not installed."""

from skrebate import ReliefF, SURF, SURFstar, MultiSURF as SkrebateMultiSURF, MultiSURFstar
# Assuming your local implementations are in a 'src' directory
from src.fast_select.ReliefF import ReliefF as FastReliefF
from src.fast_select.SURF import SURF as FastSURF
from src.fast_select.MultiSURF import MultiSURF as FastMultiSURF

# Detect a CUDA-capable GPU through numba; without numba there is no GPU backend.
try:
    from numba import cuda
    GPU_AVAILABLE = cuda.is_available()
except ImportError:
    GPU_AVAILABLE = False

# Initialise NVML only when both pynvml and a GPU are present. NVMLError is
# referenced solely inside this guarded branch, so a missing pynvml can no
# longer turn a numba ImportError into a NameError while evaluating the
# except clause.
if pynvml_available and GPU_AVAILABLE:
    try:
        nvmlInit()
    except NVMLError:
        # Same outcome as before: an NVML failure disables the GPU benchmarks.
        GPU_AVAILABLE = False

# --- Benchmark Configuration ---
# "p >> n" sweep: the sample count stays fixed while the feature count grows.
P_DOMINANT_SCENARIOS = dict(
    n_samples=100,
    n_features_range=[100_000, 200_000, 300_000, 400_000, 500_000],
)
# "n >> p" sweep: the feature count stays fixed while the sample count grows.
N_DOMINANT_SCENARIOS = dict(
    n_features=100,
    n_samples_range=[10_000, 20_000, 30_000, 40_000, 50_000],
)
# Passed to every estimator as `n_features_to_select`.
N_FEATURES_TO_SELECT = 10
# Timed runs per estimator/dataset pair; raise above 1 for more stable averages in the plots.
N_REPEATS = 1

# --- Estimators to Test ---
# Registry of display label -> unfitted estimator prototype.
# The "(CPU)" / "(GPU)" suffix in the label is how the benchmark loop decides
# which memory profiler to use, so keep it in any label added here.
estimators = {
    # skrebate baselines are currently disabled; uncomment to include them.
    #"skrebate.ReliefF": ReliefF(n_features_to_select=N_FEATURES_TO_SELECT, n_jobs=-1),
    #"skrebate.MultiSURF": SkrebateMultiSURF(n_features_to_select=N_FEATURES_TO_SELECT, n_jobs=-1),
    "fast_relief.ReliefF (CPU)": FastReliefF(
        n_features_to_select=N_FEATURES_TO_SELECT,
        backend='cpu',
        verbose=True,
    ),
    "fast_relief.MultiSURF (CPU)": FastMultiSURF(
        n_features_to_select=N_FEATURES_TO_SELECT,
        backend='cpu',
    ),
}
if not GPU_AVAILABLE:
    print("No NVIDIA GPU detected. Skipping GPU benchmarks.")
else:
    print("NVIDIA GPU detected. Including GPU benchmarks.")
    gpu_estimators = {
        "fast_relief.ReliefF (GPU)": FastReliefF(
            n_features_to_select=N_FEATURES_TO_SELECT,
            backend='gpu',
        ),
        "fast_relief.MultiSURF (GPU)": FastMultiSURF(
            n_features_to_select=N_FEATURES_TO_SELECT,
            backend='gpu',
        ),
    }
    estimators.update(gpu_estimators)


# --- CORE BENCHMARKING FUNCTIONS (with memory profiling) ---
def run_single_benchmark(estimator, X, y, is_gpu=False):
    """Fit ``estimator`` on ``(X, y)`` and report wall time and peak memory growth.

    The wall time is measured around ``estimator.fit`` only, so profiler
    start-up/tear-down (spawning the ``memory_profiler`` monitor process or
    joining the NVML polling thread) is not charged to the estimator and the
    CPU and GPU paths are timed the same way.

    Parameters
    ----------
    estimator : object
        Object exposing ``fit(X, y)``; it is fitted in place.
    X, y : array-like
        Training data forwarded unchanged to ``fit``.
    is_gpu : bool, default False
        When true (and both a GPU and pynvml are available) device memory is
        tracked through NVML on device index 0; otherwise host memory is
        tracked with ``memory_profiler.memory_usage``.

    Returns
    -------
    tuple of (float, float)
        ``(runtime_seconds, memory_increase_mb)`` where the memory figure is
        the peak observed usage minus the usage observed before the fit.

    Raises
    ------
    Exception
        Whatever ``estimator.fit`` raises is propagated to the caller.
    """
    timing = {}

    def fit_estimator():
        # Time the fit itself. memory_usage may invoke this callable more than
        # once to gather enough samples (see its `max_iterations` parameter);
        # the stored value is then the duration of the most recent call.
        fit_start = time.perf_counter()
        try:
            estimator.fit(X, y)
        finally:
            timing["runtime"] = time.perf_counter() - fit_start

    if is_gpu and GPU_AVAILABLE and pynvml_available:
        handle = nvmlDeviceGetHandleByIndex(0)

        class MemTracker(threading.Thread):
            """Background thread polling NVML for the peak device-memory increase."""

            def __init__(self):
                super().__init__()
                # Read the baseline before the thread (and the fit) starts so
                # allocations made early in the fit are not absorbed into it.
                self.baseline = nvmlDeviceGetMemoryInfo(handle).used
                self.peak_mem = 0
                self._halt = threading.Event()

            def _sample(self):
                used_delta = nvmlDeviceGetMemoryInfo(handle).used - self.baseline
                self.peak_mem = max(self.peak_mem, used_delta)

            def run(self):
                self._sample()
                # Event.wait doubles as the 10 ms poll interval and returns
                # immediately once stop() is called.
                while not self._halt.wait(0.01):
                    self._sample()
                # One last reading so a fit shorter than the poll interval is
                # still observed at least once after it completes.
                self._sample()

            def stop(self):
                self._halt.set()

        tracker = MemTracker()
        tracker.start()
        try:
            fit_estimator()
        finally:
            # Always stop the poller, even when the fit raises.
            tracker.stop()
            tracker.join()
        mem_increase_mb = tracker.peak_mem / (1024**2)  # NVML reports bytes
    else:
        mem_profile, _ = memory_usage((fit_estimator,), retval=True, interval=0.1)
        mem_increase_mb = max(mem_profile) - mem_profile[0]

    return timing["runtime"], mem_increase_mb

def warmup_jit_compilers(estimators_dict):
    """Fit a clone of every ``fast_relief`` estimator once on a tiny dataset.

    Only entries whose label contains ``"fast_relief"`` are exercised; the
    registered estimator objects themselves are left untouched because a
    clone is fitted. A failing warm-up is reported as a warning and does not
    stop the remaining warm-ups.

    Parameters
    ----------
    estimators_dict : dict
        Mapping of display label to estimator prototype.
    """
    print("\n--- Warming up JIT compilers ---")
    tiny_X, tiny_y = make_classification(n_samples=10, n_features=10, random_state=42)
    jit_backed = (
        (label, prototype)
        for label, prototype in estimators_dict.items()
        if "fast_relief" in label
    )
    for label, prototype in jit_backed:
        print(f"  Warming up {label}...")
        try:
            clone(prototype).fit(tiny_X, tiny_y)
        except Exception as err:
            warnings.warn(f"  > Warm-up FAILED for {label}. Reason: {err}")
    print("--- Warm-up complete ---")


def plot_scenario(df, scenario_name, x_axis, y_axis, title, filename):
    """Draw, save and show one line plot for a single benchmark scenario.

    Rows of ``df`` whose ``scenario`` column equals ``scenario_name`` are
    plotted with one line per ``algorithm``; seaborn aggregates repeated
    measurements at the same x value and draws a 95% confidence interval.

    Parameters
    ----------
    df : pandas.DataFrame
        Benchmark results with at least the columns ``scenario``,
        ``algorithm``, ``x_axis`` and ``y_axis``.
    scenario_name : str
        Value of the ``scenario`` column to plot.
    x_axis, y_axis : str
        Column names for the two axes; also used (title-cased) as axis labels.
    title : str
        Figure title.
    filename : str
        Path the PNG is written to.

    Returns
    -------
    None
        When there are no rows for the scenario (including a results frame
        without a ``scenario`` column, e.g. when every run failed) a message
        is printed and nothing is drawn or saved.
    """
    if 'scenario' in df.columns:
        scenario_df = df[df['scenario'] == scenario_name]
    else:
        scenario_df = df.iloc[0:0]
    if scenario_df.empty:
        print(f"No results for scenario '{scenario_name}'; skipping plot '{filename}'")
        return None

    # Explicit figure/axes instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(14, 8))
    sns.lineplot(
        data=scenario_df,
        x=x_axis,
        y=y_axis,
        hue='algorithm',
        marker='o',
        linestyle='-',
        errorbar=('ci', 95),  # Show 95% confidence interval
        ax=ax,
    )

    ax.set_title(title, fontsize=18, fontweight='bold')
    ax.set_xlabel(x_axis.replace('_', ' ').title(), fontsize=14)
    ax.set_ylabel(y_axis.replace('_', ' ').title(), fontsize=14)
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    ax.legend(title='Algorithm', bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout(rect=[0, 0, 0.85, 1])  # Leave room on the right for the legend

    fig.savefig(filename, dpi=300, bbox_inches='tight')
    print(f"Plot saved to '{filename}'")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    return None


# --- MAIN EXECUTION BLOCK ---
def _run_scenario(name, params, independent_var, n_redundant):
    """Benchmark every registered estimator across one parameter sweep and return the result rows."""
    rows = []
    for value in params[f"{independent_var}_range"]:
        # Start from the scenario's fixed dimension and fill in the swept one.
        shape = {key: val for key, val in params.items() if not key.endswith("_range")}
        shape[independent_var] = value
        n_samples, n_features = shape["n_samples"], shape["n_features"]

        print(f"\nGenerating data: {n_samples} samples, {n_features} features")
        X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=20, n_redundant=n_redundant, random_state=42)
        for est_name, estimator in estimators.items():
            is_gpu = "(GPU)" in est_name
            for i in range(N_REPEATS):
                print(f"  Benchmarking {est_name} (Run {i+1}/{N_REPEATS})...")
                try:
                    # Fit a fresh clone so no state leaks between repeats.
                    runtime, memory_mb = run_single_benchmark(clone(estimator), X, y, is_gpu)
                except Exception as e:
                    # Best effort: one failing estimator must not abort the sweep.
                    warnings.warn(f"  > FAILED: {est_name}. Reason: {e}")
                    continue
                rows.append({"scenario": name, "algorithm": est_name, "n_samples": n_samples, "n_features": n_features, "runtime": runtime, "memory_increase_mb": memory_mb})
    return rows


def main():
    """Run all benchmark scenarios, write the results to CSV and generate plots.

    Side effects: creates the ``benchmark_plots`` directory, writes
    ``benchmark_results_with_memory.csv`` in the working directory and saves
    one runtime and one memory plot per scenario that produced results.
    When no benchmark run succeeded the CSV is still written but plotting is
    skipped.
    """
    results = []
    output_dir = "benchmark_plots"
    os.makedirs(output_dir, exist_ok=True)  # Create directory for plots

    warmup_jit_compilers(estimators)

    # --- Run Benchmark Scenarios ---
    # scenario label -> (sweep config, swept dimension, n_redundant for make_classification)
    scenarios = {
        "p >> n": (P_DOMINANT_SCENARIOS, "n_features", 100),
        "n >> p": (N_DOMINANT_SCENARIOS, "n_samples", 50),
    }
    for name, (params, independent_var, n_redundant) in scenarios.items():
        print(f"\n--- Running Scenario: {name} ---")
        results.extend(_run_scenario(name, params, independent_var, n_redundant))

    # --- Save results to CSV ---
    df = pd.DataFrame(results)
    output_file = "benchmark_results_with_memory.csv"
    df.to_csv(output_file, index=False)
    print(f"\nBenchmarking complete. Results saved to '{output_file}'")

    # --- Generate and Save Plots ---
    print("\n--- Generating Plots ---")
    if df.empty:
        # Every run failed, so the frame has no columns to filter or plot.
        print("No successful benchmark runs; skipping plots.")
        return

    # (scenario, x column, y column, title, output file name)
    plot_specs = [
        ('p >> n', 'n_features', 'runtime',
         'Runtime Performance (Many Features, p >> n)', 'p_dominant_runtime.png'),
        ('p >> n', 'n_features', 'memory_increase_mb',
         'Memory Usage (Many Features, p >> n)', 'p_dominant_memory.png'),
        ('n >> p', 'n_samples', 'runtime',
         'Runtime Performance (Many Samples, n >> p)', 'n_dominant_runtime.png'),
        ('n >> p', 'n_samples', 'memory_increase_mb',
         'Memory Usage (Many Samples, n >> p)', 'n_dominant_memory.png'),
    ]
    for scenario, x_axis, y_axis, title, png_name in plot_specs:
        if not (df['scenario'] == scenario).any():
            print(f"No results for scenario '{scenario}'; skipping '{png_name}'")
            continue
        plot_scenario(df, scenario, x_axis, y_axis, title,
                      os.path.join(output_dir, png_name))

if __name__ == "__main__":
    # Run the benchmarks through the main() entry point.
    try:
        main()
    finally:
        # Clean up NVML even when main() raises or the run is interrupted.
        if GPU_AVAILABLE and pynvml_available:
            try:
                nvmlShutdown()
            except NVMLError:
                # Shutdown is best effort; there is nothing useful to do if it fails.
                pass



NVIDIA GPU detected. Including GPU benchmarks.

--- Warming up JIT compilers ---
  Warming up fast_relief.ReliefF (CPU)...
Running ReliefF on the CPU now...
  Warming up fast_relief.MultiSURF (CPU)...
  Warming up fast_relief.ReliefF (GPU)...
  Warming up fast_relief.MultiSURF (GPU)...
--- Warm-up complete ---

--- Running Scenario: p >> n ---

Generating data: 100 samples, 100000 features
  Benchmarking fast_relief.ReliefF (CPU) (Run 1/1)...
Running ReliefF on the CPU now...
  Benchmarking fast_relief.MultiSURF (CPU) (Run 1/1)...
  Benchmarking fast_relief.ReliefF (GPU) (Run 1/1)...
  Benchmarking fast_relief.MultiSURF (GPU) (Run 1/1)...

Generating data: 100 samples, 200000 features
  Benchmarking fast_relief.ReliefF (CPU) (Run 1/1)...
Running ReliefF on the CPU now...
  Benchmarking fast_relief.MultiSURF (CPU) (Run 1/1)...
  Benchmarking fast_relief.ReliefF (GPU) (Run 1/1)...
  Benchmarking fast_relief.MultiSURF (GPU) (Run 1/1)...

Generating data: 100 samples, 300000 features
  Ben

Process MemTimer-5:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/galynch/.local/lib/python3.10/site-packages/memory_profiler.py", line 262, in run
    stop = self.pipe.poll(self.interval)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 257, in poll
    return self._poll(timeout)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 424, in _poll
    r = wait([self], timeout)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/usr/lib/python3.10/selectors.py", line 416, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt


KeyboardInterrupt: 

In [12]:
# Commit all modified tracked files with the given message, then push.
!git commit -a -m "getting tests and CI/CD working"
!git push

[main 4cf720a] getting tests and CI/CD working
 1 file changed, 2 deletions(-)
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 20 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 452 bytes | 452.00 KiB/s, done.
Total 5 (delta 3), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To https://github.com/GavinLynch04/FastSelect.git
   4020dd9..4cf720a  main -> main


In [9]:
# Show the staged and unstaged changes in the repository.
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	[32mmodified:   BenchmarkingRelief.ipynb[m
	[32mmodified:   __init__.py[m
	[32mmodified:   tests/test_chi2.py[m
	[32mmodified:   tests/test_relieff.py[m
	[32mmodified:   tests/test_surf.py[m

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   BenchmarkingRelief.ipynb[m



In [11]:
# Stage every change under the current directory.
!git add .


In [2]:
# Fetch upstream changes and integrate them into the current branch.
!git pull

remote: Enumerating objects: 6, done.[K
remote: Counting objects:  16% (1/6)[Kremote: Counting objects:  33% (2/6)[Kremote: Counting objects:  50% (3/6)[Kremote: Counting objects:  66% (4/6)[Kremote: Counting objects:  83% (5/6)[Kremote: Counting objects: 100% (6/6)[Kremote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects:  33% (1/3)[Kremote: Compressing objects:  66% (2/3)[Kremote: Compressing objects: 100% (3/3)[Kremote: Compressing objects: 100% (3/3), done.[K
remote: Total 5 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Unpacking objects:  20% (1/5)Unpacking objects:  40% (2/5)Unpacking objects:  60% (3/5)Unpacking objects:  80% (4/5)Unpacking objects: 100% (5/5)Unpacking objects: 100% (5/5), 1.94 KiB | 991.00 KiB/s, done.
From https://github.com/GavinLynch04/FastSelect
   43df5fb..55dbf04  main       -> origin/main
Updating 43df5fb..55dbf04
Fast-forward
 .github/workflows/python-tests.yml | 61 [32m+++++++++++++++++++