In [None]:
import timeit
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import chi2 as chi2_sklearn
from chi2 import chi2_numba

# Set up the test parameters
N_SAMPLES = 2000
N_FEATURES = 2000
N_CLASSES = 5
RANDOM_STATE = 42

print("Chi-Squared Implementation Benchmark")
print("-" * 40)
print(f"Dataset shape: Samples={N_SAMPLES}, Features={N_FEATURES}, Classes={N_CLASSES}")
print("-" * 40)

# 1. Generate synthetic data
X, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=500,
    n_redundant=500,
    n_classes=N_CLASSES,
    n_clusters_per_class=1,
    random_state=RANDOM_STATE
)
# The Chi-squared test requires non-negative features (e.g., counts)
X = np.abs(X * 100).astype(np.int64)

# 2. Run the Numba implementation
# First run is for JIT compilation ("warm-up") and is not timed.
print("Compiling Numba function...")
chi2_numba(X, y)
print("Compilation complete.\n")

# Time the Numba implementation
print("Timing Numba implementation...")
numba_time = timeit.timeit(lambda: chi2_numba(X, y), number=10)
print(f"Done.")

# 3. Run the scikit-learn implementation
print("\nTiming scikit-learn implementation...")
sklearn_time = timeit.timeit(lambda: chi2_sklearn(X, y), number=10)
print(f"Done.")

# 4. Verify that the results are the same
chi2_n, p_n = chi2_numba(X, y)
chi2_s, p_s = chi2_sklearn(X, y)

assert np.allclose(chi2_n, chi2_s), "Chi2 statistics do not match!"
assert np.allclose(p_n, p_s), "P-values do not match!"
print("\nCorrectness check passed: Results are identical.")

# 5. Report the results
print("\n\n--- Benchmark Results ---")
print(f"Scikit-learn time: {sklearn_time:.4f} seconds")
print(f"Numba time:        {numba_time:.4f} seconds")

speedup = sklearn_time / numba_time
print(f"\nNumba implementation is {speedup:.2f}x faster.")

In [21]:
!git commit -a -m "editing documentation"
!git push

[main f8861a9] editing documentation
 2 files changed, 60 insertions(+), 24 deletions(-)
Enumerating objects: 13, done.
Counting objects: 100% (13/13), done.
Delta compression using up to 20 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (7/7), 1.40 KiB | 1.40 MiB/s, done.
Total 7 (delta 5), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (5/5), completed with 5 local objects.[K
To https://github.com/GavinLynch04/FastSelect.git
   bdf68e1..f8861a9  main -> main


In [20]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   BenchmarkingChi2.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [2]:
# benchmark_mrmr.py
import time
import numpy as np
import pandas as pd
from memory_profiler import memory_usage

# --- Import your local implementation ---
# Adjust this path if your file structure is different
from src.fast_select.mRMR import mRMR as My_mRMR

# --- Import the benchmark target from its library ---
from mrmr import mrmr_classif

def generate_data(n_samples, n_features, n_states):
    """Creates a random discrete integer dataset."""
    print(f"Generating data: {n_samples} samples, {n_features} features, {n_states} states")
    X = np.random.randint(0, n_states, size=(n_samples, n_features), dtype=np.int8)
    y = np.random.randint(0, 2, size=n_samples, dtype=np.int8)
    return X, y

def benchmark_estimator(name, estimator, X, y):
    """Measures runtime and peak memory for an estimator's fit method."""
    print(f"\n--- Benchmarking: {name} ---")

    # Wrap the .fit() method in a lambda for the profiler
    fit_func = lambda: estimator.fit(X, y)

    # 1. Measure Peak Memory Usage
    # memory_usage runs the function and returns a list of memory samples
    mem_samples = memory_usage(fit_func, interval=0.01)
    peak_mem = max(mem_samples)

    # 2. Measure Runtime
    # Time a second run to get a more stable execution time after any caching
    start_time = time.perf_counter()
    fit_func()
    end_time = time.perf_counter()
    runtime = end_time - start_time

    print(f"Runtime:         {runtime:.4f} seconds")
    print(f"Peak Memory:     {peak_mem:.2f} MiB")
    return runtime, peak_mem

if __name__ == "__main__":
    # --- Configuration ---
    N_SAMPLES = 200
    N_FEATURES = 30000
    N_STATES = 10  # Number of unique values per feature
    K_FEATURES_TO_SELECT = 100

    # --- Data Generation ---
    X_data, y_data = generate_data(N_SAMPLES, N_FEATURES, N_STATES)
    feature_names = [f'feature_{i}' for i in range(N_FEATURES)]

    X_pandas = pd.DataFrame(X_data, columns=feature_names)

    y_pandas = pd.Series(y_data, name='target')

    # --- Define Estimators ---
    estimators = {
        "My mRMR (Numba)": My_mRMR(
            n_features_to_select=K_FEATURES_TO_SELECT,
            method='MID',
            backend='gpu'
        ),
        "mrmr_selection": mrmr_classif(
            X=X_pandas, y=y_pandas, K=K_FEATURES_TO_SELECT
        )
    }
    
    # --- Run Benchmarks ---
    # Note: mrmr_selection's fit is done during initialization
    # We will benchmark its core selection logic if possible, or just the init.
    # For a fair comparison, we benchmark the fit() method where it exists.
    
    # Benchmark your implementation
    benchmark_estimator("My mRMR (Numba)", estimators["My mRMR (Numba)"], X_data, y_data)
    
    # Benchmark mrmr_selection
    # Its main work is done in the function call, not a separate .fit()
    print("\n--- Benchmarking: mrmr_selection ---")
    
    target_func = lambda: mrmr_classif(X=X_pandas, y=y_pandas, K=K_FEATURES_TO_SELECT)
    
    mem_samples = memory_usage(target_func, interval=0.01)
    peak_mem = max(mem_samples)
    
    start_time = time.perf_counter()
    selected_features = target_func()
    end_time = time.perf_counter()
    runtime = end_time - start_time
    
    print(f"Runtime:         {runtime:.4f} seconds")
    print(f"Peak Memory:     {peak_mem:.2f} MiB")
    print(f"Selected features: {selected_features}")


Generating data: 200 samples, 30000 features, 10 states


100%|██████████| 100/100 [01:17<00:00,  1.30it/s]


--- Benchmarking: My mRMR (Numba) ---





0.09353780746459961
0.1338510513305664
14.787329912185669
13.700662612915039
0.0888822078704834
0.005830049514770508
14.403152704238892
13.65813136100769
Runtime:         28.1610 seconds
Peak Memory:     3919.29 MiB

--- Benchmarking: mrmr_selection ---


100%|██████████| 100/100 [01:16<00:00,  1.30it/s]
100%|██████████| 100/100 [01:17<00:00,  1.30it/s]

Runtime:         80.4362 seconds
Peak Memory:     10783.03 MiB
Selected features: ['feature_14359', 'feature_8572', 'feature_27641', 'feature_8815', 'feature_750', 'feature_25619', 'feature_23986', 'feature_779', 'feature_15863', 'feature_6542', 'feature_26689', 'feature_24495', 'feature_21456', 'feature_20010', 'feature_15369', 'feature_2547', 'feature_28715', 'feature_5952', 'feature_18604', 'feature_18134', 'feature_26530', 'feature_24761', 'feature_14483', 'feature_2903', 'feature_18104', 'feature_2456', 'feature_15887', 'feature_2395', 'feature_26080', 'feature_22716', 'feature_29384', 'feature_9407', 'feature_26671', 'feature_6528', 'feature_21765', 'feature_25982', 'feature_8532', 'feature_13764', 'feature_5235', 'feature_25017', 'feature_2024', 'feature_22234', 'feature_20758', 'feature_26367', 'feature_13031', 'feature_3449', 'feature_12657', 'feature_2460', 'feature_4322', 'feature_21733', 'feature_27963', 'feature_8873', 'feature_16673', 'feature_22962', 'feature_29013', 'fe




In [1]:
%cd ..

/home/galynch/snap/snapd-desktop-integration/253/Desktop/FastSelect
