In [None]:
import timeit
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import chi2 as chi2_sklearn
from chi2 import chi2_numba

# Set up the test parameters
N_SAMPLES = 2000
N_FEATURES = 2000
N_CLASSES = 5
RANDOM_STATE = 42

print("Chi-Squared Implementation Benchmark")
print("-" * 40)
print(f"Dataset shape: Samples={N_SAMPLES}, Features={N_FEATURES}, Classes={N_CLASSES}")
print("-" * 40)

# 1. Generate synthetic data
X, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=500,
    n_redundant=500,
    n_classes=N_CLASSES,
    n_clusters_per_class=1,
    random_state=RANDOM_STATE
)
# The Chi-squared test requires non-negative features (e.g., counts)
X = np.abs(X * 100).astype(np.int64)

# 2. Run the Numba implementation
# First run is for JIT compilation ("warm-up") and is not timed.
print("Compiling Numba function...")
chi2_numba(X, y)
print("Compilation complete.\n")

# Time the Numba implementation
print("Timing Numba implementation...")
numba_time = timeit.timeit(lambda: chi2_numba(X, y), number=10)
print(f"Done.")

# 3. Run the scikit-learn implementation
print("\nTiming scikit-learn implementation...")
sklearn_time = timeit.timeit(lambda: chi2_sklearn(X, y), number=10)
print(f"Done.")

# 4. Verify that the results are the same
chi2_n, p_n = chi2_numba(X, y)
chi2_s, p_s = chi2_sklearn(X, y)

assert np.allclose(chi2_n, chi2_s), "Chi2 statistics do not match!"
assert np.allclose(p_n, p_s), "P-values do not match!"
print("\nCorrectness check passed: Results are identical.")

# 5. Report the results
print("\n\n--- Benchmark Results ---")
print(f"Scikit-learn time: {sklearn_time:.4f} seconds")
print(f"Numba time:        {numba_time:.4f} seconds")

speedup = sklearn_time / numba_time
print(f"\nNumba implementation is {speedup:.2f}x faster.")

In [21]:
!git commit -a -m "editing documentation"
!git push

[main f8861a9] editing documentation
 2 files changed, 60 insertions(+), 24 deletions(-)
Enumerating objects: 13, done.
Counting objects: 100% (13/13), done.
Delta compression using up to 20 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (7/7), 1.40 KiB | 1.40 MiB/s, done.
Total 7 (delta 5), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (5/5), completed with 5 local objects.[K
To https://github.com/GavinLynch04/FastSelect.git
   bdf68e1..f8861a9  main -> main


In [20]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   BenchmarkingChi2.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
# benchmark_mrmr.py
import time
import numpy as np
import pandas as pd
from memory_profiler import memory_usage

from src.fast_select.mRMR import mRMR as My_mRMR

from mrmr import mrmr_classif

def generate_data(n_samples, n_features, n_states):
    """Creates a random discrete integer dataset."""
    print(f"Generating data: {n_samples} samples, {n_features} features, {n_states} states")
    X = np.random.randint(0, n_states, size=(n_samples, n_features), dtype=np.int8)
    y = np.random.randint(0, 2, size=n_samples, dtype=np.int8)
    return X, y

def benchmark_estimator(name, estimator, X, y):
    """Measures runtime and peak memory for an estimator's fit method."""
    print(f"\n--- Benchmarking: {name} ---")

    fit_func = lambda: estimator.fit(X, y)

    mem_samples = memory_usage(fit_func, interval=0.01)
    peak_mem = max(mem_samples)

    start_time = time.perf_counter()
    fit_func()
    end_time = time.perf_counter()
    runtime = end_time - start_time

    print(f"Runtime:         {runtime:.4f} seconds")
    print(f"Peak Memory:     {peak_mem:.2f} MiB")
    print(f"Selected Indicies: {estimator.top_features_}")
    return runtime, peak_mem

if __name__ == "__main__":
    # --- Configuration ---
    N_SAMPLES = 1600
    N_FEATURES = 30000
    N_STATES = 3  # Number of unique values per feature
    K_FEATURES_TO_SELECT = 100

    X_data, y_data = generate_data(N_SAMPLES, N_FEATURES, N_STATES)
    feature_names = [f'feature_{i}' for i in range(N_FEATURES)]

    X_pandas = pd.DataFrame(X_data, columns=feature_names)

    y_pandas = pd.Series(y_data, name='target')

    # --- Define Estimators ---
    estimators = {
        "My mRMR (Numba)": My_mRMR(
            n_features_to_select=K_FEATURES_TO_SELECT,
            method='MID',
            backend='gpu'
        ),
        "mrmr_selection": mrmr_classif(
            X=X_pandas, y=y_pandas, K=K_FEATURES_TO_SELECT
        )
    }
    
    benchmark_estimator("My mRMR (Numba)", estimators["My mRMR (Numba)"], X_data, y_data)
    
    print("\n--- Benchmarking: mrmr_selection ---")
    
    target_func = lambda: mrmr_classif(X=X_pandas, y=y_pandas, K=K_FEATURES_TO_SELECT)
    
    mem_samples = memory_usage(target_func, interval=0.01)
    peak_mem = max(mem_samples)
    
    start_time = time.perf_counter()
    selected_features = target_func()
    end_time = time.perf_counter()
    runtime = end_time - start_time
    
    print(f"Runtime:         {runtime:.4f} seconds")
    print(f"Peak Memory:     {peak_mem:.2f} MiB")
    print(f"Selected features: {selected_features}")


Generating data: 1600 samples, 30000 features, 3 states


100%|██████████| 100/100 [01:38<00:00,  1.02it/s]


--- Benchmarking: My mRMR (Numba) ---





0.517378568649292
0.779015302658081


Process MemTimer-21:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/galynch/.local/lib/python3.10/site-packages/memory_profiler.py", line 262, in run
    stop = self.pipe.poll(self.interval)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 257, in poll
    return self._poll(timeout)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 424, in _poll
    r = wait([self], timeout)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/usr/lib/python3.10/selectors.py", line 416, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt


In [1]:
%cd ..

/home/galynch/snap/snapd-desktop-integration/253/Desktop/FastSelect


In [2]:
import numpy as np
import time
import tracemalloc  # <-- Import the memory tracking module

from sklearn.datasets import make_classification
from sklearn.preprocessing import KBinsDiscretizer

# Assume your CFS class and its helper functions are in a file named `my_cfs.py`
from src.fast_select.CFS import CFS


# =============================================================================
# Helper function for memory profiling
# =============================================================================
def profile_memory(func, *args, **kwargs):
    """
    Profiles the peak memory usage of a function call.

    Returns:
        (any, float): A tuple containing the function's return value
                      and the peak memory used in MiB.
    """
    tracemalloc.start()
    
    # Run the function and get its result
    result = func(*args, **kwargs)
    
    # Get memory usage statistics
    current, peak = tracemalloc.get_traced_memory()
    
    tracemalloc.stop()
    
    # Convert peak memory from bytes to Mebibytes (MiB) for readability
    peak_mem_mib = peak / 1024**2
    
    return result, peak_mem_mib

# =============================================================================
# Main benchmark
# =============================================================================

# --- 1. Setup a challenging dataset ---
print("Generating a synthetic dataset...")
n_samples = 100
n_features = 500
n_informative = 15
n_redundant = 50

X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=n_redundant,
    n_classes=3,
    flip_y=0.05,
    random_state=42
)

print("Discretizing data...")
discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
X_discrete = discretizer.fit_transform(X).astype(np.int8)

print("-" * 50)
print(f"Benchmark running on data with shape: {X_discrete.shape}")
print("-" * 50)


# --- 2. Benchmark your implementation ---
print("Benchmarking YOUR Numba-based CFS implementation...")
my_cfs_selector = CFS(n_jobs=-1)

start_time = time.perf_counter()
# Use the memory profiler to run the fit method
_, my_cfs_mem_peak = profile_memory(my_cfs_selector.fit, X_discrete, y)
duration_my_cfs = time.perf_counter() - start_time

my_cfs_indices = my_cfs_selector.selected_indices_
print(f"Time taken: {duration_my_cfs:.4f} seconds")
print(f"Peak memory usage: {my_cfs_mem_peak:.2f} MiB") # <-- New output
print(f"Number of features selected: {len(my_cfs_indices)}")
print(f"Selected indices: {my_cfs_indices}")
print("-" * 50)



Generating a synthetic dataset...
Discretizing data...
--------------------------------------------------
Benchmark running on data with shape: (100, 500)
--------------------------------------------------
Benchmarking YOUR Numba-based CFS implementation...




Time taken: 22.1363 seconds
Peak memory usage: 36.14 MiB
Number of features selected: 16
Selected indices: [ 11  53  54  61  73  93 110 111 131 139 148 159 188 215 295 490]
--------------------------------------------------
