diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5d5568f..34045be 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,10 +19,10 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} cache: 'pip' @@ -30,8 +30,8 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e . - pip install pytest pytest-cov flake8 + pip install -e ".[dev]" + pip install flake8 - name: Lint with flake8 run: | @@ -43,13 +43,12 @@ jobs: - name: Run tests run: | - pytest --cov=qbiocode --cov-report=xml --cov-report=term - continue-on-error: true + python -m pytest --cov=qbiocode --cov-report=xml --cov-report=term - name: Upload coverage to Codecov - uses: codecov/codecov-action@v4 + uses: codecov/codecov-action@v6 with: - file: ./coverage.xml + files: ./coverage.xml flags: unittests name: codecov-${{ matrix.os }}-py${{ matrix.python-version }} continue-on-error: true @@ -60,18 +59,19 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: - python-version: '3.10' + python-version: '3.12' cache: 'pip' - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8 black isort mypy + pip install -e ".[dev]" + pip install isort - name: Check code formatting with black run: black --check --diff qbiocode/ @@ -91,10 +91,10 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.10' cache: 'pip' @@ -102,8 +102,12 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e . - pip install sphinx sphinx-rtd-theme + pip install -e ".[docs]" + + - name: Install pandoc + run: | + sudo apt-get update + sudo apt-get install -y pandoc - name: Build documentation run: | @@ -112,7 +116,7 @@ jobs: continue-on-error: true - name: Upload documentation artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: documentation path: docs/build/html/ diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 83105c1..c239af3 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,5 +1,8 @@ name: Release +permissions: + contents: write + on: release: types: [published] @@ -12,10 +15,10 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.10' @@ -39,14 +42,9 @@ jobs: - name: Upload release assets if: github.event_name == 'release' - uses: actions/upload-release-asset@v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ github.event.release.upload_url }} - asset_path: ./dist/*.whl - asset_name: qbiocode-${{ github.event.release.tag_name }}-py3-none-any.whl - asset_content_type: application/zip + run: gh release upload "${{ github.event.release.tag_name }}" dist/*.whl --clobber continue-on-error: true create-zenodo-release: @@ -57,7 +55,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Zenodo release notification run: | diff --git a/.gitignore b/.gitignore index 20cdcf4..f051c60 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,13 @@ dist/ build/ *.whl +# Testing results +.coverage +htmlcov/ +.coverage.* +nosetests.xml +coverage.xml + # IDE .DS_Store .vscode/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6f67e07..436e2ff 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -56,7 +56,7 @@ This project adheres to a [Code of Conduct](CODE_OF_CONDUCT.md). By participatin 4. **Install Development Dependencies** (optional) ```bash - pip install pytest pytest-cov black flake8 mypy + pip install -e ".[dev]" ``` 5. **Verify Installation** @@ -164,7 +164,7 @@ from qbiocode.learning import compute_qsvc 3. **Test Your Changes** ```bash # Run existing tests - pytest tests/ + python -m pytest # Check code style black qbiocode/ diff --git a/README.md b/README.md index 7cd90ab..986644c 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,18 @@ pip install --force-reinstall xgboost For detailed installation instructions, see the [Installation Guide](https://ibm.github.io/QBioCode/installation.html). +### Running Tests + +```bash +# Install the package with development dependencies +pip install -e ".[dev]" + +# Run the test suite +python -m pytest +``` + +The current test suite focuses on utility modules and data-generation helpers that do not require a full runtime setup for all optional quantum workflows. + ### Basic Usage ```python diff --git a/docs/source/api_overview.rst b/docs/source/api_overview.rst index cddbb32..3d9feb3 100644 --- a/docs/source/api_overview.rst +++ b/docs/source/api_overview.rst @@ -45,7 +45,7 @@ Multiple models can be run via the following Classical Models """""""""""""""" -QBioCode provides classical machine learning models from `scikit-learn `_ for baseline comparisons and benchmarking against quantum models. +QBioCode provides classical machine learning models from `scikit-learn `__ for baseline comparisons and benchmarking against quantum models. .. autosummary:: ~qbiocode.learning.compute_dt.compute_dt @@ -291,4 +291,3 @@ Generated datasets are saved with: References ^^^^^^^^^^ - diff --git a/docs/source/apps/sage.rst b/docs/source/apps/sage.rst index 808e065..46eb756 100644 --- a/docs/source/apps/sage.rst +++ b/docs/source/apps/sage.rst @@ -154,10 +154,11 @@ This trains QSage on historical QProfiler data and generates predictions for all Train with Random Forest only: .. code-block:: bash -qsage --input qprofiler_results.csv --output results/ --model-type rf -# Or train MLP sub-sages -qsage --input qprofiler_results.csv --output results/ --model-type mlp + qsage --input qprofiler_results.csv --output results/ --model-type rf + + # Or train MLP sub-sages + qsage --input qprofiler_results.csv --output results/ --model-type mlp Train with custom seed and test size: @@ -166,11 +167,12 @@ Train with custom seed and test size: qsage --input data.csv --output results/ --seed 123 --test-size 0.3 -Train both Random Forest and MLP: +Train with a custom MLP iteration count: .. code-block:: bash -# Train MLP with more epochs -qsage --input data.csv --output results/ --model-type mlp --n-iter 2000 + + # Train MLP with more epochs + qsage --input data.csv --output results/ --model-type mlp --n-iter 2000 **Output Files** @@ -439,4 +441,4 @@ QSage can reveal which complexity features are most predictive of model performa .. admonition:: Reference :class: tip - For implementation details, see ``apps/sage/sage.py`` in the QBioCode repository. \ No newline at end of file + For implementation details, see ``apps/sage/sage.py`` in the QBioCode repository. diff --git a/docs/source/citing.rst b/docs/source/citing.rst index bf4e2ff..1490b7a 100644 --- a/docs/source/citing.rst +++ b/docs/source/citing.rst @@ -1,10 +1,7 @@ .. _citing: -Citing -=============== - Citing qbiocode --------------- +=============== If qbiocode is integral to a scientific publication, please cite it. A paper describing qbiocode has been published in the : diff --git a/docs/source/conf.py b/docs/source/conf.py index a232e56..469cac9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -142,7 +142,8 @@ def run_apidoc(app): html_show_sourcelink = False html_logo = "_static/QBioCode_logo.png" -html_favicon = "_static/favicon.ico" +if os.path.exists(os.path.join(os.path.dirname(__file__), "_static", "favicon.ico")): + html_favicon = "_static/favicon.ico" html_theme_options = { "icon_links": [ diff --git a/pyproject.toml b/pyproject.toml index a50a5c4..18f6565 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,8 +84,10 @@ dev = [ "pytest>=7.0", "pytest-cov>=4.0", "black>=23.0", + "isort>=5.0", "flake8>=6.0", "mypy>=1.0", + "types-PyYAML", ] all = [ "hydra-core", @@ -104,8 +106,10 @@ all = [ "pytest>=7.0", "pytest-cov>=4.0", "black>=23.0", + "isort>=5.0", "flake8>=6.0", "mypy>=1.0", + "types-PyYAML", ] [project.scripts] @@ -134,6 +138,10 @@ line-length = 100 target-version = ['py310', 'py311', 'py312'] include = '\.pyi?$' +[tool.isort] +profile = "black" +line_length = 100 + [tool.pytest.ini_options] testpaths = ["tests"] python_files = ["test_*.py"] @@ -146,4 +154,4 @@ python_version = "3.10" warn_return_any = true warn_unused_configs = true disallow_untyped_defs = false -ignore_missing_imports = true \ No newline at end of file +ignore_missing_imports = true diff --git a/qbiocode/__init__.py b/qbiocode/__init__.py index b1c9dac..67dff52 100644 --- a/qbiocode/__init__.py +++ b/qbiocode/__init__.py @@ -25,109 +25,102 @@ >>> results = compute_rf(X_train, y_train, X_test, y_test) """ -from .version import __version__ +# ====== Import data generation functions ====== +from .data_generation import ( + generate_circles_datasets, + generate_classification_datasets, + generate_moons_datasets, + generate_s_curve_datasets, + generate_spheres_datasets, + generate_spirals_datasets, + generate_swiss_roll_datasets, +) +from .data_generation.generator import generate_data + +# ====== Import embedding functions ====== +from .embeddings.embed import get_embeddings, pqk + +# ====== Import evaluation functions ====== +from .evaluation.dataset_evaluation import evaluate +from .evaluation.model_evaluation import modeleval +from .evaluation.model_run import model_run # ====== Import learning functions ====== -from .learning.compute_svc import compute_svc, compute_svc_opt from .learning.compute_dt import compute_dt, compute_dt_opt -from .learning.compute_nb import compute_nb, compute_nb_opt from .learning.compute_lr import compute_lr, compute_lr_opt +from .learning.compute_mlp import compute_mlp, compute_mlp_opt +from .learning.compute_nb import compute_nb, compute_nb_opt +from .learning.compute_pqk import compute_pqk +from .learning.compute_qnn import compute_qnn +from .learning.compute_qsvc import compute_qsvc from .learning.compute_rf import compute_rf, compute_rf_opt +from .learning.compute_svc import compute_svc, compute_svc_opt +from .learning.compute_vqc import compute_vqc + try: from .learning.compute_xgb import compute_xgb, compute_xgb_opt except Exception: # XGBoost not available (e.g., OpenMP not installed on macOS) compute_xgb = None # type: ignore compute_xgb_opt = None # type: ignore -from .learning.compute_mlp import compute_mlp, compute_mlp_opt -from .learning.compute_qnn import compute_qnn -from .learning.compute_qsvc import compute_qsvc -from .learning.compute_vqc import compute_vqc -from .learning.compute_pqk import compute_pqk - -# ====== Import embedding functions ====== -from .embeddings.embed import get_embeddings, pqk # ====== Import helper functions ====== -from .utils.helper_fn import scaler_fn, feature_encoding -from .utils.qc_winner_finder import qml_winner from .utils.dataset_checkpoint import checkpoint_restart - -# ====== Import evaluation functions ====== -from .evaluation.model_evaluation import modeleval -from .evaluation.dataset_evaluation import evaluate -from .evaluation.model_run import model_run +from .utils.helper_fn import feature_encoding, scaler_fn +from .utils.qc_winner_finder import qml_winner +from .version import __version__ # ====== Import visualization functions ====== from .visualization.visualize_correlation import ( + compute_results_correlation, plot_results_correlation, - compute_results_correlation -) - -# ====== Import data generation functions ====== -from .data_generation.generator import generate_data -from .data_generation import ( - generate_circles_datasets, - generate_moons_datasets, - generate_classification_datasets, - generate_s_curve_datasets, - generate_spheres_datasets, - generate_spirals_datasets, - generate_swiss_roll_datasets, ) __all__ = [ # Version - '__version__', - + "__version__", # Classical ML algorithms - 'compute_svc', - 'compute_svc_opt', - 'compute_dt', - 'compute_dt_opt', - 'compute_nb', - 'compute_nb_opt', - 'compute_lr', - 'compute_lr_opt', - 'compute_rf', - 'compute_rf_opt', - 'compute_xgb', - 'compute_xgb_opt', - 'compute_mlp', - 'compute_mlp_opt', - + "compute_svc", + "compute_svc_opt", + "compute_dt", + "compute_dt_opt", + "compute_nb", + "compute_nb_opt", + "compute_lr", + "compute_lr_opt", + "compute_rf", + "compute_rf_opt", + "compute_xgb", + "compute_xgb_opt", + "compute_mlp", + "compute_mlp_opt", # Quantum ML algorithms - 'compute_qnn', - 'compute_qsvc', - 'compute_vqc', - 'compute_pqk', - + "compute_qnn", + "compute_qsvc", + "compute_vqc", + "compute_pqk", # Embeddings - 'get_embeddings', - 'pqk', - + "get_embeddings", + "pqk", # Utilities - 'scaler_fn', - 'feature_encoding', - 'qml_winner', - 'checkpoint_restart', - + "scaler_fn", + "feature_encoding", + "qml_winner", + "checkpoint_restart", # Evaluation - 'modeleval', - 'evaluate', - 'model_run', - + "modeleval", + "evaluate", + "model_run", # Visualization - 'plot_results_correlation', - 'compute_results_correlation', - + "plot_results_correlation", + "compute_results_correlation", # Data generation - 'generate_data', - 'generate_circles_datasets', - 'generate_moons_datasets', - 'generate_classification_datasets', - 'generate_s_curve_datasets', - 'generate_spheres_datasets', - 'generate_spirals_datasets', - 'generate_swiss_roll_datasets', + "generate_data", + "generate_circles_datasets", + "generate_moons_datasets", + "generate_classification_datasets", + "generate_s_curve_datasets", + "generate_spheres_datasets", + "generate_spirals_datasets", + "generate_swiss_roll_datasets", ] diff --git a/qbiocode/data_generation/__init__.py b/qbiocode/data_generation/__init__.py index 2b292d6..1afc139 100644 --- a/qbiocode/data_generation/__init__.py +++ b/qbiocode/data_generation/__init__.py @@ -16,19 +16,19 @@ """ from .make_circles import generate_circles_datasets -from .make_moons import generate_moons_datasets from .make_class import generate_classification_datasets +from .make_moons import generate_moons_datasets from .make_s_curve import generate_s_curve_datasets from .make_spheres import generate_spheres_datasets from .make_spirals import generate_spirals_datasets from .make_swiss_roll import generate_swiss_roll_datasets __all__ = [ - 'generate_circles_datasets', - 'generate_moons_datasets', - 'generate_classification_datasets', - 'generate_s_curve_datasets', - 'generate_spheres_datasets', - 'generate_spirals_datasets', - 'generate_swiss_roll_datasets', + "generate_circles_datasets", + "generate_moons_datasets", + "generate_classification_datasets", + "generate_s_curve_datasets", + "generate_spheres_datasets", + "generate_spirals_datasets", + "generate_swiss_roll_datasets", ] diff --git a/qbiocode/data_generation/generator.py b/qbiocode/data_generation/generator.py index 36fa75b..f78f3a7 100644 --- a/qbiocode/data_generation/generator.py +++ b/qbiocode/data_generation/generator.py @@ -8,8 +8,8 @@ ### Imports ### import qbiocode.data_generation.make_circles as circles -import qbiocode.data_generation.make_moons as moons import qbiocode.data_generation.make_class as make_class +import qbiocode.data_generation.make_moons as moons import qbiocode.data_generation.make_s_curve as s_curve import qbiocode.data_generation.make_spheres as spheres import qbiocode.data_generation.make_spirals as spirals @@ -32,28 +32,28 @@ def generate_data( - type_of_data=None, - save_path=None, - n_samples=N_SAMPLES, - noise=NOISE, - hole=HOLE, - n_classes=N_CLASSES, - dim=DIM, - rad=RAD, - n_features=N_FEATURES, - n_informative=N_INFORMATIVE, - n_redundant=N_REDUNDANT, - n_clusters_per_class=N_CLUSTERS_PER_CLASS, - weights=WEIGHTS, - random_state=42, + type_of_data=None, + save_path=None, + n_samples=N_SAMPLES, + noise=NOISE, + hole=HOLE, + n_classes=N_CLASSES, + dim=DIM, + rad=RAD, + n_features=N_FEATURES, + n_informative=N_INFORMATIVE, + n_redundant=N_REDUNDANT, + n_clusters_per_class=N_CLUSTERS_PER_CLASS, + weights=WEIGHTS, + random_state=42, ): """ Generate synthetic datasets for machine learning benchmarking. - + Unified interface to generate various types of synthetic datasets with configurable parameters. Each dataset type creates multiple configurations by varying the specified parameters. - + Parameters ---------- type_of_data : str @@ -85,17 +85,17 @@ def generate_data( Class weight distributions (for classes only). random_state : int, default=42 Random seed for reproducibility. - + Returns ------- None Saves generated datasets to the specified path. - + Raises ------ ValueError If type_of_data is not one of the supported types. - + Examples -------- >>> from qbiocode.data_generation import generate_data @@ -104,65 +104,62 @@ def generate_data( Dataset generation complete. """ - if type_of_data == 'circles': + if type_of_data == "circles": # Generate circles dataset - circles.generate_circles_datasets(n_samples=n_samples, - noise=noise, - save_path=save_path, - random_state=random_state) - elif type_of_data == 'moons': + circles.generate_circles_datasets( + n_samples=n_samples, noise=noise, save_path=save_path, random_state=random_state + ) + elif type_of_data == "moons": # Generate moons dataset - moons.generate_moons_datasets(n_samples=n_samples, - noise=noise, - save_path=save_path, - random_state=random_state) - elif type_of_data == 'classes': + moons.generate_moons_datasets( + n_samples=n_samples, noise=noise, save_path=save_path, random_state=random_state + ) + elif type_of_data == "classes": # Generate higher-dimensional classification dataset - make_class.generate_classification_datasets(n_samples=n_samples, - n_features=n_features, - n_informative=n_informative, - n_redundant=n_redundant, - n_classes=n_classes, - n_clusters_per_class=n_clusters_per_class, - weights=weights, - save_path=save_path, - random_state=random_state + make_class.generate_classification_datasets( + n_samples=n_samples, + n_features=n_features, + n_informative=n_informative, + n_redundant=n_redundant, + n_classes=n_classes, + n_clusters_per_class=n_clusters_per_class, + weights=weights, + save_path=save_path, + random_state=random_state, ) - elif type_of_data == 's_curve': + elif type_of_data == "s_curve": # Generate S-curve dataset - s_curve.generate_s_curve_datasets(n_samples=n_samples, - noise=noise, - save_path=save_path, - random_state=random_state - ) - elif type_of_data == 'spheres': + s_curve.generate_s_curve_datasets( + n_samples=n_samples, noise=noise, save_path=save_path, random_state=random_state + ) + elif type_of_data == "spheres": # Generate spheres dataset - spheres.generate_spheres_datasets(n_s=n_samples, - dim=dim, - radius=rad, - save_path=save_path, - random_state=random_state - ) - elif type_of_data == 'spirals': + spheres.generate_spheres_datasets( + n_s=n_samples, dim=dim, radius=rad, save_path=save_path, random_state=random_state + ) + elif type_of_data == "spirals": # Generate spirals dataset - spirals.generate_spirals_datasets(n_s=n_samples, - n_c=n_classes, - n_n=noise, - n_d=dim, - save_path=save_path, - random_state=random_state - ) - elif type_of_data == 'swiss_roll': + spirals.generate_spirals_datasets( + n_s=n_samples, + n_c=n_classes, + n_n=noise, + n_d=dim, + save_path=save_path, + random_state=random_state, + ) + elif type_of_data == "swiss_roll": # Generate Swiss roll dataset - swiss_roll.generate_swiss_roll_datasets(n_samples=n_samples, - noise=noise, - hole=hole, - save_path=save_path, - random_state=random_state - ) + swiss_roll.generate_swiss_roll_datasets( + n_samples=n_samples, + noise=noise, + hole=hole, + save_path=save_path, + random_state=random_state, + ) else: - raise ValueError("Invalid type_of_data. Choose from 'circles', 'moons', 'classes', 's_curve', 'spheres', 'spirals', or 'swiss_roll'.") + raise ValueError( + "Invalid type_of_data. Choose from 'circles', 'moons', 'classes', 's_curve', 'spheres', 'spirals', or 'swiss_roll'." + ) print("Dataset generation complete.") return - diff --git a/qbiocode/data_generation/make_circles.py b/qbiocode/data_generation/make_circles.py index cdcfcb3..1eacb8d 100644 --- a/qbiocode/data_generation/make_circles.py +++ b/qbiocode/data_generation/make_circles.py @@ -6,18 +6,19 @@ algorithms on non-linearly separable data. """ -from sklearn.datasets import make_circles -import pandas as pd -import numpy as np import itertools import json import os +import numpy as np +import pandas as pd +from sklearn.datasets import make_circles # parameters to vary across the configurations N_SAMPLES = list(range(100, 300, 20)) NOISE = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + def generate_circles_datasets( n_samples=N_SAMPLES, noise=NOISE, @@ -26,11 +27,11 @@ def generate_circles_datasets( ): """ Generate multiple concentric circles datasets with varying parameters. - + Creates a series of 2D datasets where samples form two concentric circles, providing a classic non-linearly separable binary classification problem. Each configuration varies the number of samples and noise level. - + Parameters ---------- n_samples : list of int, default=range(100, 300, 20) @@ -41,19 +42,19 @@ def generate_circles_datasets( Directory path where datasets and configuration files will be saved. random_state : int, default=42 Random seed for reproducibility. - + Returns ------- None Saves CSV files for each dataset configuration and a JSON file with all configuration parameters. - + Notes ----- - Each dataset is saved as 'circles_data-{i}.csv' where i is the configuration number - Configuration parameters are saved in 'dataset_config.json' - The last column 'class' contains binary labels (0 or 1) - + Examples -------- >>> from qbiocode.data_generation import generate_circles_datasets @@ -61,12 +62,12 @@ def generate_circles_datasets( Generating circles dataset... """ print("Generating circles dataset...") - + np.random.seed(random_state) if save_path is None: - save_path = 'circles_data' - + save_path = "circles_data" + if not os.path.exists(save_path): os.makedirs(save_path) @@ -78,29 +79,30 @@ def generate_circles_datasets( # populate all the configs with the corresponding argument values for n_s, n_n in configurations: - config = "n_samples={}, noise={}".format( - n_s, n_n, - ) - # print(count_configs) - - + config = "n_samples={}, noise={}".format( + n_s, + n_n, + ) + # print(count_configs) + # iteratively run the function for each combination of arguments - X, y = make_circles( - n_samples=n_s, - noise=n_n, - random_state=random_state, + X, y = make_circles( + n_samples=n_s, + noise=n_n, + random_state=random_state, + ) + # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) + dataset = pd.DataFrame(X) + dataset["class"] = y + with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile: + dataset_config.update( + {"ld_data-{}.csv".format(count_configs): {"n_samples": n_s, "noise": n_n}} ) - # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) - dataset = pd.DataFrame(X) - dataset['class'] = y - with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile: - dataset_config.update({'ld_data-{}.csv'.format(count_configs): - {'n_samples': n_s, - 'noise': n_n}}) - json.dump(dataset_config, outfile, indent=4) - new_dataset = dataset.to_csv( os.path.join( save_path, 'circles_data-{}.csv'.format(count_configs)), index=False) - count_configs += 1 - # print(X.shape) - # print(y.shape) + json.dump(dataset_config, outfile, indent=4) + new_dataset = dataset.to_csv( + os.path.join(save_path, "circles_data-{}.csv".format(count_configs)), index=False + ) + count_configs += 1 + # print(X.shape) + # print(y.shape) return - diff --git a/qbiocode/data_generation/make_class.py b/qbiocode/data_generation/make_class.py index ced8627..c61a05c 100644 --- a/qbiocode/data_generation/make_class.py +++ b/qbiocode/data_generation/make_class.py @@ -6,25 +6,26 @@ useful for testing machine learning algorithms on high-dimensional data. """ -from sklearn.datasets import make_classification -import pandas as pd -import numpy as np -import json import itertools +import json import os +import numpy as np +import pandas as pd +from sklearn.datasets import make_classification dataset_config = {} # parameters to vary across the configurations N_SAMPLES = list(range(100, 300, 50)) -N_FEATURES = list(range(10,60,10)) -N_INFORMATIVE = list(range(2,8,4)) -N_REDUNDANT = list(range(2,8,4)) +N_FEATURES = list(range(10, 60, 10)) +N_INFORMATIVE = list(range(2, 8, 4)) +N_REDUNDANT = list(range(2, 8, 4)) N_CLASSES = list(range(2, 4, 6)) N_CLUSTERS_PER_CLASS = list(range(1, 2, 3)) WEIGHTS = [[0.3, 0.7], [0.4, 0.6], [0.5, 0.5]] + def generate_classification_datasets( n_samples, n_features, @@ -38,11 +39,11 @@ def generate_classification_datasets( ): """ Generate multiple high-dimensional classification datasets with varying parameters. - + Creates a series of synthetic datasets for multi-class classification problems with configurable feature characteristics including informative features, redundant features, and class distributions. - + Parameters ---------- n_samples : list of int @@ -63,20 +64,20 @@ def generate_classification_datasets( Directory path where datasets and configuration files will be saved. random_state : int, default=42 Random seed for reproducibility. - + Returns ------- None Saves CSV files for each dataset configuration and a JSON file with all configuration parameters. - + Notes ----- - Each dataset is saved as 'class_data-{i}.csv' where i is the configuration number - Configuration parameters are saved in 'dataset_config.json' - The last column 'class' contains class labels - Only valid configurations where (n_informative + n_redundant) <= n_features are generated - + Examples -------- >>> from qbiocode.data_generation import generate_classification_datasets @@ -88,55 +89,72 @@ def generate_classification_datasets( Generating classes dataset... """ print("Generating classes dataset...") - + np.random.seed(random_state) if save_path is None: - save_path = 'class_data' - + save_path = "class_data" + if not os.path.exists(save_path): os.makedirs(save_path) # enumerate all possible combinations of parameters based on ranges above - configurations = list(itertools.product(*[n_samples, n_features, n_informative, n_redundant, n_classes, n_clusters_per_class, weights])) + configurations = list( + itertools.product( + *[ + n_samples, + n_features, + n_informative, + n_redundant, + n_classes, + n_clusters_per_class, + weights, + ] + ) + ) count_configs = 1 # populate all the configs with the corresponding argument values for n_s, n_f, n_i, n_r, n_cla, n_clu, weights in configurations: - if (n_i + n_r) <= n_f: - config = "n_samples={}, n_features={}, n_informative={}, n_redundant={}, n_classes={}, n_clusters_per_class={}, weights={}".format( - n_s, n_f, n_i, n_r, n_cla, n_clu, weights - ) - # print(count_configs) - - + if (n_i + n_r) <= n_f: + config = "n_samples={}, n_features={}, n_informative={}, n_redundant={}, n_classes={}, n_clusters_per_class={}, weights={}".format( + n_s, n_f, n_i, n_r, n_cla, n_clu, weights + ) + # print(count_configs) + # iteratively run the function for each combination of arguments - X, y = make_classification( - n_samples=n_s, - n_features=n_f, - n_informative=n_i, - n_redundant=n_r, - n_classes=n_cla, - n_clusters_per_class=n_clu, - weights=weights, - random_state=random_state, + X, y = make_classification( + n_samples=n_s, + n_features=n_f, + n_informative=n_i, + n_redundant=n_r, + n_classes=n_cla, + n_clusters_per_class=n_clu, + weights=weights, + random_state=random_state, + ) + # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) + dataset = pd.DataFrame(X) + dataset["class"] = y + with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile: + dataset_config.update( + { + "hd_data-{}.csv".format(count_configs): { + "n_samples": n_s, + "n_features": n_f, + "n_informative": n_i, + "n_redundant": n_r, + "n_classes": n_cla, + "n_clusters_per_class": n_clu, + "weights": weights, + } + } ) - # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) - dataset = pd.DataFrame(X) - dataset['class'] = y - with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile: - dataset_config.update({'hd_data-{}.csv'.format(count_configs): - {'n_samples': n_s, - 'n_features': n_f, - 'n_informative': n_i, - 'n_redundant': n_r, - 'n_classes': n_cla, - 'n_clusters_per_class': n_clu, - 'weights': weights}}) - json.dump(dataset_config, outfile, indent=4) - new_dataset = dataset.to_csv( os.path.join( save_path, 'class_data-{}.csv'.format(count_configs)), index=False) - count_configs += 1 - # print(X.shape) - # print(y.shape) + json.dump(dataset_config, outfile, indent=4) + new_dataset = dataset.to_csv( + os.path.join(save_path, "class_data-{}.csv".format(count_configs)), index=False + ) + count_configs += 1 + # print(X.shape) + # print(y.shape) return - \ No newline at end of file diff --git a/qbiocode/data_generation/make_moons.py b/qbiocode/data_generation/make_moons.py index b9341a6..f392438 100644 --- a/qbiocode/data_generation/make_moons.py +++ b/qbiocode/data_generation/make_moons.py @@ -6,18 +6,19 @@ algorithms on non-linearly separable data with interleaving classes. """ -from sklearn.datasets import make_moons -import pandas as pd -import numpy as np import itertools import json import os +import numpy as np +import pandas as pd +from sklearn.datasets import make_moons # parameters to vary across the configurations N_SAMPLES = list(range(100, 300, 20)) NOISE = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + def generate_moons_datasets( n_samples=N_SAMPLES, noise=NOISE, @@ -26,11 +27,11 @@ def generate_moons_datasets( ): """ Generate multiple two-moons datasets with varying parameters. - + Creates a series of 2D datasets where samples form two interleaving half-circles (moons), providing a challenging non-linearly separable binary classification problem. Each configuration varies the number of samples and noise level. - + Parameters ---------- n_samples : list of int, default=range(100, 300, 20) @@ -41,20 +42,20 @@ def generate_moons_datasets( Directory path where datasets and configuration files will be saved. random_state : int, default=42 Random seed for reproducibility. - + Returns ------- None Saves CSV files for each dataset configuration and a JSON file with all configuration parameters. - + Notes ----- - Each dataset is saved as 'moons_data-{i}.csv' where i is the configuration number - Configuration parameters are saved in 'dataset_config.json' - The last column 'class' contains binary labels (0 or 1) - Two-moons datasets are commonly used to evaluate algorithms on interleaving patterns - + Examples -------- >>> from qbiocode.data_generation import generate_moons_datasets @@ -62,12 +63,12 @@ def generate_moons_datasets( Generating moons dataset... """ print("Generating moons dataset...") - + np.random.seed(random_state) if save_path is None: - save_path = 'moons_data' - + save_path = "moons_data" + if not os.path.exists(save_path): os.makedirs(save_path) @@ -81,28 +82,30 @@ def generate_moons_datasets( # populate all the configs with the corresponding argument values for n_s, n_n in configurations: - config = "n_samples={}, noise={}".format( - n_s, n_n, - ) - # print(count_configs) - - + config = "n_samples={}, noise={}".format( + n_s, + n_n, + ) + # print(count_configs) + # iteratively run the function for each combination of arguments - X, y = make_moons( - n_samples=n_s, - noise=n_n, - random_state=random_state, + X, y = make_moons( + n_samples=n_s, + noise=n_n, + random_state=random_state, + ) + # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) + dataset = pd.DataFrame(X) + dataset["class"] = y + with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile: + dataset_config.update( + {"moons_data-{}.csv".format(count_configs): {"n_samples": n_s, "noise": n_n}} ) - # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) - dataset = pd.DataFrame(X) - dataset['class'] = y - with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile: - dataset_config.update({'moons_data-{}.csv'.format(count_configs): - {'n_samples': n_s, - 'noise': n_n}}) - json.dump(dataset_config, outfile, indent=4) - new_dataset = dataset.to_csv( os.path.join( save_path, 'moons_data-{}.csv'.format(count_configs)), index=False) - count_configs += 1 - # print(X.shape) - # print(y.shape) + json.dump(dataset_config, outfile, indent=4) + new_dataset = dataset.to_csv( + os.path.join(save_path, "moons_data-{}.csv".format(count_configs)), index=False + ) + count_configs += 1 + # print(X.shape) + # print(y.shape) return diff --git a/qbiocode/data_generation/make_s_curve.py b/qbiocode/data_generation/make_s_curve.py index 9ee3f83..e1be9a5 100644 --- a/qbiocode/data_generation/make_s_curve.py +++ b/qbiocode/data_generation/make_s_curve.py @@ -6,18 +6,19 @@ reduction and manifold learning algorithms. """ -from sklearn.datasets import make_s_curve -import pandas as pd -import numpy as np import itertools import json import os +import numpy as np +import pandas as pd +from sklearn.datasets import make_s_curve # parameters to vary across the configurations N_SAMPLES = list(range(100, 300, 20)) NOISE = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + def generate_s_curve_datasets( n_samples=N_SAMPLES, noise=NOISE, @@ -26,11 +27,11 @@ def generate_s_curve_datasets( ): """ Generate multiple 3D S-curve datasets with varying parameters. - + Creates a series of 3D datasets where samples lie on an S-shaped manifold, a classic benchmark for manifold learning and dimensionality reduction algorithms. Each configuration varies the number of samples and noise level. - + Parameters ---------- n_samples : list of int, default=range(100, 300, 20) @@ -41,20 +42,20 @@ def generate_s_curve_datasets( Directory path where datasets and configuration files will be saved. random_state : int, default=42 Random seed for reproducibility. - + Returns ------- None Saves CSV files for each dataset configuration and a JSON file with all configuration parameters. - + Notes ----- - Each dataset is saved as 's_curve_data-{i}.csv' where i is the configuration number - Configuration parameters are saved in 'dataset_config.json' - The last column 'class' contains the position along the manifold (continuous values) - S-curve is a standard benchmark for testing manifold learning algorithms - + Examples -------- >>> from qbiocode.data_generation import generate_s_curve_datasets @@ -62,12 +63,12 @@ def generate_s_curve_datasets( Generating S Curve dataset... """ print("Generating S Curve dataset...") - + np.random.seed(random_state) if save_path is None: - save_path = 's_curve_data' - + save_path = "s_curve_data" + if not os.path.exists(save_path): os.makedirs(save_path) @@ -81,29 +82,30 @@ def generate_s_curve_datasets( # populate all the configs with the corresponding argument values for n_s, n_n in configurations: - config = "n_samples={}, noise={}".format( - n_s, n_n, - ) - # print(count_configs) - - + config = "n_samples={}, noise={}".format( + n_s, + n_n, + ) + # print(count_configs) + # iteratively run the function for each combination of arguments - X, y = make_s_curve( - n_samples=n_s, - noise=n_n, - random_state=random_state, + X, y = make_s_curve( + n_samples=n_s, + noise=n_n, + random_state=random_state, + ) + # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) + dataset = pd.DataFrame(X) + dataset["class"] = y + with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile: + dataset_config.update( + {"s_curve_data-{}.csv".format(count_configs): {"n_samples": n_s, "noise": n_n}} ) - # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) - dataset = pd.DataFrame(X) - dataset['class'] = y - with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile: - dataset_config.update({'s_curve_data-{}.csv'.format(count_configs): - {'n_samples': n_s, - 'noise': n_n}}) - json.dump(dataset_config, outfile, indent=4) - new_dataset = dataset.to_csv( os.path.join( save_path, 's_curve_data-{}.csv'.format(count_configs)), index=False) - count_configs += 1 - # print(X.shape) - # print(y.shape) + json.dump(dataset_config, outfile, indent=4) + new_dataset = dataset.to_csv( + os.path.join(save_path, "s_curve_data-{}.csv".format(count_configs)), index=False + ) + count_configs += 1 + # print(X.shape) + # print(y.shape) return - diff --git a/qbiocode/data_generation/make_spheres.py b/qbiocode/data_generation/make_spheres.py index 16bc33a..22e2aea 100644 --- a/qbiocode/data_generation/make_spheres.py +++ b/qbiocode/data_generation/make_spheres.py @@ -6,19 +6,19 @@ machine learning algorithms on high-dimensional non-linearly separable data. """ -import numpy as np -import matplotlib.pyplot as plt -import pandas as pd import itertools import json import os +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd -def generate_points_in_nd_sphere(n_s, dim = 3, radius=1, thresh = 0.9): +def generate_points_in_nd_sphere(n_s, dim=3, radius=1, thresh=0.9): """ Generate random points within an n-dimensional spherical shell. - + Parameters ---------- n_s : int @@ -29,7 +29,7 @@ def generate_points_in_nd_sphere(n_s, dim = 3, radius=1, thresh = 0.9): Outer radius of the spherical shell. thresh : float, default=0.9 Inner radius threshold as fraction of outer radius (creates shell). - + Returns ------- points : ndarray of shape (n_s, dim) @@ -40,17 +40,19 @@ def generate_points_in_nd_sphere(n_s, dim = 3, radius=1, thresh = 0.9): while cnt < n_s: pnts = np.random.rand(dim) * 2 * radius - radius pnts_nrm = np.linalg.norm(pnts) - if (pnts_nrm <= radius) & (pnts_nrm >= radius*thresh): + if (pnts_nrm <= radius) & (pnts_nrm >= radius * thresh): points.append(pnts) cnt += 1 points = np.asarray(points) return points + # parameters to vary across the configurations N_SAMPLES = list(range(100, 300, 25)) DIM = list(range(5, 15, 5)) RAD = list(range(5, 20, 5)) + def generate_spheres_datasets( n_s=N_SAMPLES, dim=DIM, @@ -60,12 +62,12 @@ def generate_spheres_datasets( ): """ Generate multiple concentric n-dimensional spheres datasets with varying parameters. - + Creates a series of high-dimensional datasets where samples form two concentric spherical shells, providing a challenging non-linearly separable binary classification problem in high dimensions. Each configuration varies the number of samples, dimensionality, and sphere radii. - + Parameters ---------- n_s : list of int, default=range(100, 300, 25) @@ -78,20 +80,20 @@ def generate_spheres_datasets( Directory path where datasets and configuration files will be saved. random_state : int, default=42 Random seed for reproducibility. - + Returns ------- None Saves CSV files for each dataset configuration and a JSON file with all configuration parameters. - + Notes ----- - Each dataset is saved as 'spheres_data-{i}.csv' where i is the configuration number - Configuration parameters are saved in 'dataset_config.json' - The last column 'class' contains binary labels (0 for outer, 1 for inner sphere) - Samples are generated in spherical shells (not solid spheres) for better separation - + Examples -------- >>> from qbiocode.data_generation import generate_spheres_datasets @@ -99,12 +101,12 @@ def generate_spheres_datasets( Generating spheres dataset... """ print("Generating spheres dataset...") - + np.random.seed(random_state) if save_path is None: - save_path = 'spheres_data' - + save_path = "spheres_data" + if not os.path.exists(save_path): os.makedirs(save_path) @@ -118,38 +120,41 @@ def generate_spheres_datasets( # populate all the configs with the corresponding argument values for n_s, n_d, n_r in configurations: - config = "samples={}, dimensions={}, radius={}".format( - n_s, n_d, n_r - ) - # print(count_configs) - radius1 = n_r - radius2 = radius1 * 0.5 - Xa = generate_points_in_nd_sphere(n_s, dim = n_d, radius=radius1, thresh = 0.9) - Xb = generate_points_in_nd_sphere(n_s, dim = n_d, radius=radius2, thresh = 0.9) - X = np.concatenate((Xa, Xb)) - y = [0]*len(Xa) + [1]*len(Xb) - - # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) - X_df = pd.DataFrame(X) - y_dict = {'class':y} - y_df = pd.DataFrame(y_dict) - df = pd.concat([X_df, y_df], axis=1) - with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile: - dataset_config.update({'spheres_data-{}.csv'.format(count_configs): + config = "samples={}, dimensions={}, radius={}".format(n_s, n_d, n_r) + # print(count_configs) + radius1 = n_r + radius2 = radius1 * 0.5 + Xa = generate_points_in_nd_sphere(n_s, dim=n_d, radius=radius1, thresh=0.9) + Xb = generate_points_in_nd_sphere(n_s, dim=n_d, radius=radius2, thresh=0.9) + X = np.concatenate((Xa, Xb)) + y = [0] * len(Xa) + [1] * len(Xb) + + # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) + X_df = pd.DataFrame(X) + y_dict = {"class": y} + y_df = pd.DataFrame(y_dict) + df = pd.concat([X_df, y_df], axis=1) + with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile: + dataset_config.update( { - 'n_samples':n_s, - 'dimensions': n_d, - 'radius': n_r}}) - json.dump(dataset_config, outfile, indent=4) - new_dataset = df.to_csv( os.path.join( save_path, 'spheres_data-{}.csv'.format(count_configs)), index=False) - count_configs += 1 - - # fig = plt.figure() - # ax = fig.add_subplot(111, projection='3d') - # # ax.scatter(X[:, 0], X[:, 1],X[:,2], c= y, cmap='viridis') - # ax.scatter(X[:, n_d-3], X[:, n_d-2],X[:, n_d-1], c=y, cmap='viridis') - # plt.savefig('spheres_data/spheres_data-{}.png'.format(count_configs)) - # print(X.shape) - # print(y.shape) - return + "spheres_data-{}.csv".format(count_configs): { + "n_samples": n_s, + "dimensions": n_d, + "radius": n_r, + } + } + ) + json.dump(dataset_config, outfile, indent=4) + new_dataset = df.to_csv( + os.path.join(save_path, "spheres_data-{}.csv".format(count_configs)), index=False + ) + count_configs += 1 + # fig = plt.figure() + # ax = fig.add_subplot(111, projection='3d') + # # ax.scatter(X[:, 0], X[:, 1],X[:,2], c= y, cmap='viridis') + # ax.scatter(X[:, n_d-3], X[:, n_d-2],X[:, n_d-1], c=y, cmap='viridis') + # plt.savefig('spheres_data/spheres_data-{}.png'.format(count_configs)) + # print(X.shape) + # print(y.shape) + return diff --git a/qbiocode/data_generation/make_spirals.py b/qbiocode/data_generation/make_spirals.py index 8f9c433..9857d0c 100644 --- a/qbiocode/data_generation/make_spirals.py +++ b/qbiocode/data_generation/make_spirals.py @@ -6,21 +6,22 @@ machine learning algorithms on complex non-linearly separable patterns. """ -import numpy as np -import matplotlib.pyplot as plt -import pandas as pd import itertools import json import os +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + def make_spirals(n_samples=5000, n_classes=2, noise=0.3, dim=3): """ Generate an n-dimensional dataset of intertwined spirals. - + Creates spiral patterns in n-dimensional space where each class forms a distinct spiral arm. Supports dimensions 3, 6, 9, and 12. - + Parameters ---------- n_samples : int, default=5000 @@ -31,7 +32,7 @@ def make_spirals(n_samples=5000, n_classes=2, noise=0.3, dim=3): Standard deviation of Gaussian noise added to each dimension. dim : int, default=3 Dimensionality of the output space (must be 3, 6, 9, or 12). - + Returns ------- X : ndarray of shape (n_samples, dim) @@ -48,30 +49,36 @@ def make_spirals(n_samples=5000, n_classes=2, noise=0.3, dim=3): x = t * np.cos(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes) y_ = t * np.sin(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes) z = t + np.random.normal(0, noise, n_samples // n_classes) - if dim==3: - X.append(np.column_stack([x, y_, z])) # any new dimensions need to be added to this list - - # to add more dimensions, apparently you would just keep adding 't' variable from above, to each new dimension, + if dim == 3: + X.append( + np.column_stack([x, y_, z]) + ) # any new dimensions need to be added to this list + + # to add more dimensions, apparently you would just keep adding 't' variable from above, to each new dimension, # as seen below. The question is, how can we iteratively do this while maintaining the binary classification - # that this for loop is creating? + # that this for loop is creating? # nesting a loop iterating over the number of dimensions doesn't really work from what I'm seeing. so far # However, manually adding repeats of the same 3Ds, does work, as seen below -- is this correct? - - # for j in range(dim-3): # for anything above the first 3D - if dim==6: + + # for j in range(dim-3): # for anything above the first 3D + if dim == 6: new_d1 = t * np.cos(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes) new_d2 = t * np.sin(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes) new_d3 = t + np.random.normal(0, noise, n_samples // n_classes) - X.append(np.column_stack([x, y_, z, new_d1, new_d2, new_d3])) # any new dimensions need to be added to this list - if dim==9: + X.append( + np.column_stack([x, y_, z, new_d1, new_d2, new_d3]) + ) # any new dimensions need to be added to this list + if dim == 9: new_d1 = t * np.cos(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes) new_d2 = t * np.sin(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes) new_d3 = t + np.random.normal(0, noise, n_samples // n_classes) new_d4 = t * np.cos(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes) new_d5 = t * np.sin(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes) new_d6 = t + np.random.normal(0, noise, n_samples // n_classes) - X.append(np.column_stack([x, y_, z, new_d1, new_d2, new_d3, new_d4, new_d5, new_d6])) # any new dimensions need to be added to this list - if dim==12: + X.append( + np.column_stack([x, y_, z, new_d1, new_d2, new_d3, new_d4, new_d5, new_d6]) + ) # any new dimensions need to be added to this list + if dim == 12: new_d1 = t * np.cos(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes) new_d2 = t * np.sin(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes) new_d3 = t + np.random.normal(0, noise, n_samples // n_classes) @@ -81,7 +88,24 @@ def make_spirals(n_samples=5000, n_classes=2, noise=0.3, dim=3): new_d7 = t * np.cos(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes) new_d8 = t * np.sin(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes) new_d9 = t + np.random.normal(0, noise, n_samples // n_classes) - X.append(np.column_stack([x, y_, z, new_d1, new_d2, new_d3, new_d4, new_d5, new_d6, new_d7, new_d8, new_d9])) + X.append( + np.column_stack( + [ + x, + y_, + z, + new_d1, + new_d2, + new_d3, + new_d4, + new_d5, + new_d6, + new_d7, + new_d8, + new_d9, + ] + ) + ) y.extend([i] * (n_samples // n_classes)) return np.vstack(X), np.array(y) @@ -93,6 +117,7 @@ def make_spirals(n_samples=5000, n_classes=2, noise=0.3, dim=3): NOISE = [0.3, 0.6, 0.9] DIM = [3, 6, 9, 12] + def generate_spirals_datasets( n_s=N_SAMPLES, n_c=N_CLASSES, @@ -103,12 +128,12 @@ def generate_spirals_datasets( ): """ Generate multiple n-dimensional spiral datasets with varying parameters. - + Creates a series of high-dimensional datasets where samples form intertwined spiral patterns, providing challenging non-linearly separable multi-class classification problems. Each configuration varies the number of samples, classes, noise level, and dimensionality. - + Parameters ---------- n_s : list of int, default=range(100, 300, 50) @@ -123,20 +148,20 @@ def generate_spirals_datasets( Directory path where datasets and configuration files will be saved. random_state : int, default=42 Random seed for reproducibility. - + Returns ------- None Saves CSV files for each dataset configuration and a JSON file with all configuration parameters. - + Notes ----- - Each dataset is saved as 'spirals_data-{i}.csv' where i is the configuration number - Configuration parameters are saved in 'dataset_config.json' - The last column 'class' contains class labels - Spiral patterns become increasingly complex in higher dimensions - + Examples -------- >>> from qbiocode.data_generation import generate_spirals_datasets @@ -144,12 +169,12 @@ def generate_spirals_datasets( Generating spirals dataset... """ print("Generating spirals dataset...") - + np.random.seed(random_state) if save_path is None: - save_path = 'spirals_data' - + save_path = "spirals_data" + if not os.path.exists(save_path): os.makedirs(save_path) @@ -161,38 +186,36 @@ def generate_spirals_datasets( dataset_config = {} - # populate all the configs with the corresponding argument values + # populate all the configs with the corresponding argument values for n_s, n_c, n_n, n_d in configurations: - config = "samples={}, classes={}, noise={}, dimensions={}".format( - n_s, n_c, n_n, n_d - ) - # print(count_configs) - - X, y = make_spirals( - n_samples=n_s, - n_classes=n_c, - noise=n_n, - dim=n_d + config = "samples={}, classes={}, noise={}, dimensions={}".format(n_s, n_c, n_n, n_d) + # print(count_configs) + + X, y = make_spirals(n_samples=n_s, n_classes=n_c, noise=n_n, dim=n_d) + # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) + dataset = pd.DataFrame(X) + dataset["class"] = y + with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile: + dataset_config.update( + { + "spirals_data-{}.csv".format(count_configs): { + "n_samples": n_s, + "noise": n_n, + "dimensions": n_d, + } + } ) - # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) - dataset = pd.DataFrame(X) - dataset['class'] = y - with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile: - dataset_config.update({'spirals_data-{}.csv'.format(count_configs): - {'n_samples': n_s, - 'noise': n_n, - 'dimensions': n_d - }}) - json.dump(dataset_config, outfile, indent=4) - new_dataset = dataset.to_csv( os.path.join( save_path, 'spirals_data-{}.csv'.format(count_configs)), index=False) - count_configs += 1 - - # plot the last 3 dimensions in each case - # fig = plt.figure() - # ax = fig.add_subplot(111, projection='3d') - # ax.scatter(X[:, n_d-3], X[:, n_d-2],X[:, n_d-1], c=y, cmap='viridis') - # plt.savefig('spirals_data/spirals_data-{}.png'.format(count_configs)) - #print(X.shape) - #print(y.shape) - return + json.dump(dataset_config, outfile, indent=4) + new_dataset = dataset.to_csv( + os.path.join(save_path, "spirals_data-{}.csv".format(count_configs)), index=False + ) + count_configs += 1 + # plot the last 3 dimensions in each case + # fig = plt.figure() + # ax = fig.add_subplot(111, projection='3d') + # ax.scatter(X[:, n_d-3], X[:, n_d-2],X[:, n_d-1], c=y, cmap='viridis') + # plt.savefig('spirals_data/spirals_data-{}.png'.format(count_configs)) + # print(X.shape) + # print(y.shape) + return diff --git a/qbiocode/data_generation/make_swiss_roll.py b/qbiocode/data_generation/make_swiss_roll.py index 0c9b6f4..824aa21 100644 --- a/qbiocode/data_generation/make_swiss_roll.py +++ b/qbiocode/data_generation/make_swiss_roll.py @@ -6,19 +6,20 @@ dimensionality reduction and manifold learning algorithms. """ -from sklearn.datasets import make_swiss_roll -import pandas as pd -import numpy as np import itertools import json import os +import numpy as np +import pandas as pd +from sklearn.datasets import make_swiss_roll # parameters to vary across the configurations N_SAMPLES = list(range(100, 300, 20)) NOISE = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] HOLE = [True, False] + def generate_swiss_roll_datasets( n_samples=N_SAMPLES, noise=NOISE, @@ -28,12 +29,12 @@ def generate_swiss_roll_datasets( ): """ Generate multiple 3D Swiss roll datasets with varying parameters. - + Creates a series of 3D datasets where samples lie on a Swiss roll manifold, a classic benchmark for manifold learning and dimensionality reduction algorithms. Each configuration varies the number of samples, noise level, and whether the roll has a hole in the center. - + Parameters ---------- n_samples : list of int, default=range(100, 300, 20) @@ -46,20 +47,20 @@ def generate_swiss_roll_datasets( Directory path where datasets and configuration files will be saved. random_state : int, default=42 Random seed for reproducibility. - + Returns ------- None Saves CSV files for each dataset configuration and a JSON file with all configuration parameters. - + Notes ----- - Each dataset is saved as 'swiss_roll_data-{i}.csv' where i is the configuration number - Configuration parameters are saved in 'dataset_config.json' - The last column 'class' contains the position along the manifold (continuous values) - Swiss roll is a standard benchmark for testing manifold learning algorithms - + Examples -------- >>> from qbiocode.data_generation import generate_swiss_roll_datasets @@ -67,12 +68,12 @@ def generate_swiss_roll_datasets( Generating swiss roll dataset... """ print("Generating swiss roll dataset...") - + np.random.seed(random_state) if save_path is None: - save_path = 'swiss_roll_data' - + save_path = "swiss_roll_data" + if not os.path.exists(save_path): os.makedirs(save_path) @@ -86,31 +87,34 @@ def generate_swiss_roll_datasets( # populate all the configs with the corresponding argument values for n_s, n_n, n_h in configurations: - config = "n_samples={}, noise={}, hole={}".format( - n_s, n_n, n_h - ) - # print(count_configs) - - + config = "n_samples={}, noise={}, hole={}".format(n_s, n_n, n_h) + # print(count_configs) + # iteratively run the function for each combination of arguments - X, y = make_swiss_roll( - n_samples=n_s, - noise=n_n, - hole=n_h, - random_state=random_state, + X, y = make_swiss_roll( + n_samples=n_s, + noise=n_n, + hole=n_h, + random_state=random_state, + ) + # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) + dataset = pd.DataFrame(X) + dataset["class"] = y + with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile: + dataset_config.update( + { + "swiss_roll_data-{}.csv".format(count_configs): { + "n_samples": n_s, + "noise": n_n, + "hole": n_h, + } + } ) - # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config)) - dataset = pd.DataFrame(X) - dataset['class'] = y - with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile: - dataset_config.update({'swiss_roll_data-{}.csv'.format(count_configs): - {'n_samples': n_s, - 'noise': n_n, - 'hole': n_h}}) - json.dump(dataset_config, outfile, indent=4) - new_dataset = dataset.to_csv( os.path.join( save_path, 'swiss_roll_data-{}.csv'.format(count_configs)), index=False) - count_configs += 1 - # print(X.shape) - # print(y.shape) + json.dump(dataset_config, outfile, indent=4) + new_dataset = dataset.to_csv( + os.path.join(save_path, "swiss_roll_data-{}.csv".format(count_configs)), index=False + ) + count_configs += 1 + # print(X.shape) + # print(y.shape) return - diff --git a/qbiocode/embeddings/__init__.py b/qbiocode/embeddings/__init__.py index af0664a..bd6f300 100644 --- a/qbiocode/embeddings/__init__.py +++ b/qbiocode/embeddings/__init__.py @@ -7,12 +7,12 @@ and quantum feature maps. Available Functions ------------------- +------------------- - get_embeddings: Compute embeddings using various methods (PCA, t-SNE, UMAP, etc.) - pqk: Projected Quantum Kernel embedding Available Classes ----------------- +----------------- - ConvAutoencoder: Convolutional autoencoder for dimensionality reduction Usage @@ -24,11 +24,11 @@ >>> X_pqk = pqk(X, n_components=4) """ -from .embed import get_embeddings, pqk from .compute_autoencoder import ConvAutoencoder +from .embed import get_embeddings, pqk __all__ = [ - 'get_embeddings', - 'pqk', - 'ConvAutoencoder', + "get_embeddings", + "pqk", + "ConvAutoencoder", ] diff --git a/qbiocode/embeddings/compute_autoencoder.py b/qbiocode/embeddings/compute_autoencoder.py index 6653fcf..32b7470 100644 --- a/qbiocode/embeddings/compute_autoencoder.py +++ b/qbiocode/embeddings/compute_autoencoder.py @@ -2,11 +2,12 @@ import torch.nn as nn import torch.optim as optim + # Define the Autoencoder Model class ConvAutoencoder(nn.Module): def __init__(self): super(ConvAutoencoder, self).__init__() - + # Encoder self.encoder = nn.Sequential( nn.Conv2d(7, 64, kernel_size=3, stride=2, padding=1), # (64, 192, 192) @@ -17,25 +18,35 @@ def __init__(self): nn.ReLU(), nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1), # (512, 24, 24) nn.ReLU(), - nn.Conv2d(512, 7, kernel_size=3, stride=2, padding=1), # (7, 16, 16) - nn.ReLU() + nn.Conv2d(512, 7, kernel_size=3, stride=2, padding=1), # (7, 16, 16) + nn.ReLU(), ) - + # Decoder self.decoder = nn.Sequential( - nn.ConvTranspose2d(7, 512, kernel_size=3, stride=2, padding=1, output_padding=1), # (512, 24, 24) - nn.ReLU(), - nn.ConvTranspose2d(512, 256, kernel_size=3, stride=2, padding=1, output_padding=1), # (256, 48, 48) - nn.ReLU(), - nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1, output_padding=1), # (128, 96, 96) - nn.ReLU(), - nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1), # (64, 192, 192) - nn.ReLU(), - nn.ConvTranspose2d(64, 7, kernel_size=3, stride=2, padding=1, output_padding=1), # (7, 384, 384) - nn.Sigmoid() + nn.ConvTranspose2d( + 7, 512, kernel_size=3, stride=2, padding=1, output_padding=1 + ), # (512, 24, 24) + nn.ReLU(), + nn.ConvTranspose2d( + 512, 256, kernel_size=3, stride=2, padding=1, output_padding=1 + ), # (256, 48, 48) + nn.ReLU(), + nn.ConvTranspose2d( + 256, 128, kernel_size=3, stride=2, padding=1, output_padding=1 + ), # (128, 96, 96) + nn.ReLU(), + nn.ConvTranspose2d( + 128, 64, kernel_size=3, stride=2, padding=1, output_padding=1 + ), # (64, 192, 192) + nn.ReLU(), + nn.ConvTranspose2d( + 64, 7, kernel_size=3, stride=2, padding=1, output_padding=1 + ), # (7, 384, 384) + nn.Sigmoid(), ) - + def forward(self, x): latent = self.encoder(x) reconstructed = self.decoder(latent) - return reconstructed \ No newline at end of file + return reconstructed diff --git a/qbiocode/embeddings/embed.py b/qbiocode/embeddings/embed.py index a4b5391..3b6b087 100644 --- a/qbiocode/embeddings/embed.py +++ b/qbiocode/embeddings/embed.py @@ -1,34 +1,40 @@ -import numpy as np import os +from functools import reduce + +import numpy as np +# ====== Qiskit imports ====== +from qiskit import QuantumCircuit +from qiskit.quantum_info import Pauli # ====== Embedding functions imports ====== -from sklearn.decomposition import PCA -from sklearn.decomposition import NMF -from sklearn.manifold import ( - Isomap, - LocallyLinearEmbedding, - SpectralEmbedding, -) +from sklearn.decomposition import NMF, PCA +from sklearn.manifold import Isomap, LocallyLinearEmbedding, SpectralEmbedding from umap import UMAP -from functools import reduce - -# ====== Qiskit imports ====== -from qiskit import QuantumCircuit import qbiocode.utils.qutils as qutils -from qiskit.quantum_info import Pauli -def pqk(X_train, X_test, args, store = False, data_key = '', - encoding = 'Z', data_map=True, primitive = 'estimator', entanglement = 'linear', reps= 2): + +def pqk( + X_train, + X_test, + args, + store=False, + data_key="", + encoding="Z", + data_map=True, + primitive="estimator", + entanglement="linear", + reps=2, +): """ This function generates quantum circuits, computes projections of the data onto these circuits. It uses a feature map to encode the data into quantum states and then measures the expectation values - of Pauli operators to obtain the features. + of Pauli operators to obtain the features. This function requires a quantum backend (simulator or real quantum hardware) for execution. It supports various configurations such as encoding methods, entanglement strategies, and repetitions of the feature map. Optionally the results are saved to files for training and test projections. - + Args: X_train (np.ndarray): Training data features. X_test (np.ndarray): Test data features. @@ -47,8 +53,8 @@ def pqk(X_train, X_test, args, store = False, data_key = '', feat_dimension = X_train.shape[1] - if data_map: - # This function ensures that all multiplicative factors of data features inside single qubit gates are 1.0 + if data_map: + # This function ensures that all multiplicative factors of data features inside single qubit gates are 1.0 def data_map_func(x: np.ndarray) -> float: """ Define a function map from R^n to R. @@ -60,70 +66,82 @@ def data_map_func(x: np.ndarray) -> float: float: the mapped value """ coeff = x[0] / 2 if len(x) == 1 else reduce(lambda m, n: (m * n) / 2, x) - return coeff + return float(coeff) + else: data_map_func = None - - # choose a method for mapping your features onto the circuit - feature_map, _ = qutils.get_feature_map(feature_map=encoding, - feat_dimension=X_train.shape[1], - reps = reps, - entanglement=entanglement, - data_map_func = data_map_func) + + # choose a method for mapping your features onto the circuit + feature_map, _ = qutils.get_feature_map( + feature_map=encoding, + feat_dimension=X_train.shape[1], + reps=reps, + entanglement=entanglement, + data_map_func=data_map_func, + ) # Build quantum circuit circuit = QuantumCircuit(feature_map.num_qubits) circuit.compose(feature_map, inplace=True) num_qubits = circuit.num_qubits - # Generate the backend, session and primitive - backend, session, prim = qutils.get_backend_session(args, - 'estimator', - num_qubits=num_qubits) + backend, session, prim = qutils.get_backend_session(args, "estimator", num_qubits=num_qubits) # Transpile - if args['backend'] != 'simulator': - circuit = qutils.transpile_circuit( circuit, opt_level=3, backend = backend, - PT = True, initial_layout = None) + if args["backend"] != "simulator": + circuit = qutils.transpile_circuit( + circuit, opt_level=3, backend=backend, PT=True, initial_layout=None + ) - for f_tr in ['train', 'test']: - - if 'train' in f_tr: + for f_tr in ["train", "test"]: + + if "train" in f_tr: dat = X_train.copy() else: dat = X_test.copy() - + # Identity operator on all qubits - id = 'I' * feat_dimension + id = "I" * feat_dimension # We group all commuting observables # These groups are the Pauli X, Y and Z operators on individual qubits # Apply the circuit layout to the observable if mapped to device - if args['backend'] != 'simulator': - observables_x =[] - observables_y =[] - observables_z =[] + if args["backend"] != "simulator": + observables_x = [] + observables_y = [] + observables_z = [] for i in range(feat_dimension): - observables_x.append( Pauli(id[:i] + 'X' + id[(i + 1):]).apply_layout(circuit.layout, num_qubits=backend.num_qubits) ) - observables_y.append( Pauli(id[:i] + 'Y' + id[(i + 1):]).apply_layout(circuit.layout, num_qubits=backend.num_qubits) ) - observables_z.append( Pauli(id[:i] + 'Z' + id[(i + 1):]).apply_layout(circuit.layout, num_qubits=backend.num_qubits) ) + observables_x.append( + Pauli(id[:i] + "X" + id[(i + 1) :]).apply_layout( + circuit.layout, num_qubits=backend.num_qubits + ) + ) + observables_y.append( + Pauli(id[:i] + "Y" + id[(i + 1) :]).apply_layout( + circuit.layout, num_qubits=backend.num_qubits + ) + ) + observables_z.append( + Pauli(id[:i] + "Z" + id[(i + 1) :]).apply_layout( + circuit.layout, num_qubits=backend.num_qubits + ) + ) else: - observables_x = [Pauli(id[:i] + 'X' + id[(i + 1):]) for i in range(feat_dimension)] - observables_y = [Pauli(id[:i] + 'Y' + id[(i + 1):]) for i in range(feat_dimension)] - observables_z = [Pauli(id[:i] + 'Z' + id[(i + 1):]) for i in range(feat_dimension)] - - + observables_x = [Pauli(id[:i] + "X" + id[(i + 1) :]) for i in range(feat_dimension)] + observables_y = [Pauli(id[:i] + "Y" + id[(i + 1) :]) for i in range(feat_dimension)] + observables_z = [Pauli(id[:i] + "Z" + id[(i + 1) :]) for i in range(feat_dimension)] + # projections[i][j][k] will be the expectation value of the j-th Pauli operator (0: X, 1: Y, 2: Z) # of datapoint i on qubit k projections = [] for i in range(len(dat)): - - # Get training sample + + # Get training sample parameters = dat[i] - # We define the primitive unified blocs (PUBs) consisting of the embedding circuit, + # We define the primitive unified blocs (PUBs) consisting of the embedding circuit, # set of observables and the circuit parameters pub_x = (circuit, observables_x, parameters) pub_y = (circuit, observables_y, parameters) @@ -135,28 +153,30 @@ def data_map_func(x: np.ndarray) -> float: job_result_z = job.result()[2].data.evs # Record , and on all qubits for the current datapoint - projections.append([job_result_x, job_result_y, job_result_z]) - + projections.append([job_result_x, job_result_y, job_result_z]) + if store: - if not os.path.exists( 'pqk_projections'): - os.makedirs('pqk_projections') + if not os.path.exists("pqk_projections"): + os.makedirs("pqk_projections") - file_projection = os.path.join( 'pqk_projections', 'pqk_projection_' + data_key + '_'+f_tr+'.npy') - - np.save( file_projection, projections ) + file_projection = os.path.join( + "pqk_projections", "pqk_projection_" + data_key + "_" + f_tr + ".npy" + ) - if 'train' in f_tr: + np.save(file_projection, projections) + + if "train" in f_tr: X_train_prj = np.array(projections.copy()).reshape(len(projections), -1) else: X_test_prj = np.array(projections.copy()).reshape(len(projections), -1) - + if not isinstance(session, type(None)): session.close() return X_train_prj, X_test_prj -def get_embeddings(embedding: str, X_train, X_test, n_neighbors=30, n_components=None, method=None): +def get_embeddings(embedding: str, X_train, X_test, n_neighbors=30, n_components=None, method=None): """This function applies the specified embedding technique to the training and test datasets. Args: @@ -166,55 +186,50 @@ def get_embeddings(embedding: str, X_train, X_test, n_neighbors=30, n_components n_neighbors (int, optional): Number of neighbors for certain embeddings. Defaults to 30. n_components (int, optional): Number of components for the embedding. If None, it defaults to the number of features in X_train. method (str, optional): Method for Locally Linear Embedding. Defaults to None. - + Returns: tuple: Transformed training and test datasets. """ - embedding = embedding.lower() - valid_modes = ['none', 'pca', 'lle', 'isomap', 'spectral', 'umap', 'nmf'] + embedding = embedding.lower() + valid_modes = ["none", "pca", "lle", "isomap", "spectral", "umap", "nmf"] if embedding not in valid_modes: raise ValueError(f"Invalid mode: {embedding}. Mode must be one of {valid_modes}") - - assert n_components <= X_train.shape[1], "number of components greater than number of feature in the dataset" - if 'none' == embedding: + assert ( + n_components <= X_train.shape[1] + ), "number of components greater than number of feature in the dataset" + if "none" == embedding: return X_train, X_test else: embedding_model = None - if 'pca' == embedding: - embedding_model = PCA( - n_components=n_components) - elif 'nmf' == embedding: - embedding_model = NMF( - n_components=n_components) - elif 'lle' == embedding: - if method==None: + if "pca" == embedding: + embedding_model = PCA(n_components=n_components) + elif "nmf" == embedding: + embedding_model = NMF(n_components=n_components) + elif "lle" == embedding: + if method == None: embedding_model = LocallyLinearEmbedding( - n_neighbors=n_neighbors, - n_components=n_components, - method='standard') - else: + n_neighbors=n_neighbors, n_components=n_components, method="standard" + ) + else: embedding_model = LocallyLinearEmbedding( - n_neighbors=n_neighbors, - n_components=n_components, - method='modified') - elif 'isomap' == embedding: + n_neighbors=n_neighbors, n_components=n_components, method="modified" + ) + elif "isomap" == embedding: embedding_model = Isomap( - n_neighbors=n_neighbors, - n_components=n_components, - ) - elif 'spectral' == embedding: - embedding_model = SpectralEmbedding( - n_components=n_components, - eigen_solver="arpack") - elif 'umap' == embedding: + n_neighbors=n_neighbors, + n_components=n_components, + ) + elif "spectral" == embedding: + embedding_model = SpectralEmbedding(n_components=n_components, eigen_solver="arpack") + elif "umap" == embedding: embedding_model = UMAP( - n_neighbors=n_neighbors, - n_components=n_components, - ) + n_neighbors=n_neighbors, + n_components=n_components, + ) X_train = embedding_model.fit_transform(X_train) X_test = embedding_model.transform(X_test) - - return X_train, X_test \ No newline at end of file + + return X_train, X_test diff --git a/qbiocode/evaluation/__init__.py b/qbiocode/evaluation/__init__.py index bdeb755..eddfc06 100644 --- a/qbiocode/evaluation/__init__.py +++ b/qbiocode/evaluation/__init__.py @@ -7,7 +7,7 @@ dataset complexity analysis, and automated model execution. Available Functions ------------------- +------------------- - modeleval: Evaluate model performance with multiple metrics - evaluate: Comprehensive dataset complexity evaluation - model_run: Automated model training and evaluation pipeline @@ -21,12 +21,12 @@ >>> complexity_metrics = evaluate(X, y) """ -from .model_evaluation import modeleval from .dataset_evaluation import evaluate +from .model_evaluation import modeleval from .model_run import model_run __all__ = [ - 'modeleval', - 'evaluate', - 'model_run', + "modeleval", + "evaluate", + "model_run", ] diff --git a/qbiocode/evaluation/dataset_evaluation.py b/qbiocode/evaluation/dataset_evaluation.py index 61647d0..b86b5d7 100644 --- a/qbiocode/evaluation/dataset_evaluation.py +++ b/qbiocode/evaluation/dataset_evaluation.py @@ -1,23 +1,24 @@ # ====== Base class imports ====== +import warnings + +import hfda import numpy as np import pandas as pd -import hfda +from scipy.linalg import eigvals, inv, norm +from scipy.spatial import ConvexHull as CH # ====== Scipy imports ====== from scipy.stats import entropy -from scipy.linalg import norm, inv, eigvals -from scipy.spatial import ConvexHull as CH +from skdim import id +from skdim.id import lPCA # ====== Scikit-learn imports ====== from sklearn import datasets -from skdim import id -from skdim.id import lPCA from sklearn.discriminant_analysis import LinearDiscriminantAnalysis -from sklearn.feature_selection import mutual_info_classif, VarianceThreshold -from sklearn.neighbors import KernelDensity +from sklearn.feature_selection import VarianceThreshold, mutual_info_classif from sklearn.manifold import Isomap +from sklearn.neighbors import KernelDensity -import warnings # df = pd.DataFrame(X) def get_dimensions(df): @@ -29,15 +30,16 @@ def get_dimensions(df): - num_features (int): Number of features in the DataFrame - num_samples (int): Number of samples in the DataFrame - ratio (float): Feature-to-sample ratio - """ + """ # number of features num_features = df.shape[1] # of samples num_samples = df.shape[0] - # feature-to-sample ratio - ratio = num_features/num_samples - - return num_features, num_samples, ratio + # feature-to-sample ratio + ratio = num_features / num_samples + + return num_features, num_samples, ratio + def get_intrinsic_dim(df): """Get intrinsic dimension of the data using lPCA from skdim. @@ -45,79 +47,85 @@ def get_intrinsic_dim(df): df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns Returns: float: Intrinsic dimension of the data - """ + """ # Intrinsic dimension, calculated via scikit-dimension's PCA method - pca = id.lPCA() # Initialize the PCA estimator from skdim - pca.fit(df) # Fit the estimator to your data - return pca.dimension_ + pca = id.lPCA() # Initialize the PCA estimator from skdim + pca.fit(df) # Fit the estimator to your data + return pca.dimension_ + def get_condition_number(df): - """Get condition number of a matrix. - A function with a high condition number is said to be ill-conditioned. - Ill conditioned matrices produce large errors in its output even with small errors in its input. - Low condition number means more stable errors. + """Get the condition number of a matrix. + + A high condition number indicates that the matrix is ill-conditioned and + can produce large output errors even for small input perturbations. A low + condition number indicates a more stable matrix. + Args: df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns Returns: - float: condition number of the matrix represented in df + float: Condition number of the matrix represented in ``df``. """ - # In general, - # meaning that it can produce large errors in its output even with small errors in its input. + # In general, + # meaning that it can produce large errors in its output even with small errors in its input. # Conversely, a function with a low condition number is well-conditioned and more stable in terms of its output. return np.linalg.cond(df) -def get_fdr(df,y): - """Calculate Fisher Discriminant Ratio for a given dataset. + +def get_fdr(df, y): + """Calculate Fisher Discriminant Ratio for a given dataset. Args: df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns y (int): supervised binary class label - + Returns: float: Fisher Discriminant ratio """ X = df.values class_labels = np.unique(y) n_classes = len(class_labels) - FDR = 0 - - if n_classes != 2: + FDR = 0 + + if n_classes != 2: warnings.warn("WARNING: Fisher Discriminant Ratio is only defined for binary classes. ") - else: - mean1 = np.mean(X[y == class_labels[0]], axis=0) #mean for class1 - mean2 = np.mean(X[y == class_labels[1]], axis=0) #mean for class2 - - #calculate within-class scatter matrices + else: + mean1 = np.mean(X[y == class_labels[0]], axis=0) # mean for class1 + mean2 = np.mean(X[y == class_labels[1]], axis=0) # mean for class2 + + # calculate within-class scatter matrices scatter_within = np.zeros((X.shape[1], X.shape[1])) - for label in class_labels: + for label in class_labels: X_class = X[y == label] scatter_within += np.cov(X_class.T) - - #calculate between-class scatter matrix + + # calculate between-class scatter matrix scatter_between = np.outer(mean1 - mean2, mean1 - mean2) - - #compute FDR - FDR = np.trace(scatter_between)/np.trace(scatter_within) - - return FDR - + + # compute FDR + FDR = np.trace(scatter_between) / np.trace(scatter_within) + + return FDR + + def get_total_correlation(df): - """Calculate Total Correlation - + """Calculate Total Correlation + Args: df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns Returns: float: Total correlation """ - corr_matrix = df.corr() #correlation matrix - #total correlation by subtracting diagonal values to remove self-correlation - total_correlation = corr_matrix.abs().sum().sum() - len(df.columns) - + corr_matrix = df.corr() # correlation matrix + # total correlation by subtracting diagonal values to remove self-correlation + total_correlation = corr_matrix.abs().sum().sum() - len(df.columns) + return total_correlation -def get_mutual_information(df, y): + +def get_mutual_information(df, y): """Calculate mutual information via sklearn Args: @@ -128,10 +136,11 @@ def get_mutual_information(df, y): float: Mutual information """ mutual_info = np.mean(mutual_info_classif(df, y)) - + return mutual_info -def get_variance(df): + +def get_variance(df): """Get variance Args: @@ -144,10 +153,11 @@ def get_variance(df): variations = round(df.var(), 2) avg_var = variations.mean() std_var = variations.std() - + return avg_var, std_var -def get_coefficient_var(df): + +def get_coefficient_var(df): """Get coefficient of variance Args: @@ -160,45 +170,48 @@ def get_coefficient_var(df): co_of_v = (df.std() / df.mean()) * 100 avg_co_of_v = co_of_v.mean() std_co_of_v = co_of_v.std() - + return avg_co_of_v, std_co_of_v -def get_nnz(df): + +def get_nnz(df): """Calculate nonzero values in the data Args: df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns Returns: - int: nonzero count + int: nonzero count """ return np.count_nonzero(df.values) -def get_low_var_features(df, num_features): + +def get_low_var_features(df, num_features): """Calculate get count of low variance features Args: df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns num_features (int): number of features in the dataset - + Raises: ValueError: If no feature is strong enough to keep Returns: int: count of features with low variance """ - + threshold = np.percentile(df.var(), 25) - + try: - low_var_features = num_features - VarianceThreshold(threshold).fit(df).get_support().sum() + low_var_features = num_features - VarianceThreshold(threshold).fit(df).get_support().sum() except ValueError: print("No feature is strong enough to keep") low_var_features = None - + return low_var_features -def get_log_density(df): + +def get_log_density(df): """Calculate the mean log density of the data Args: @@ -207,28 +220,31 @@ def get_log_density(df): Returns: float: mean log kernel density """ - kde = KernelDensity(bandwidth=0.2, kernel='gaussian').fit(df) # Create a KernelDensity estimator and fit the estimator to the data + kde = KernelDensity(bandwidth=0.2, kernel="gaussian").fit( + df + ) # Create a KernelDensity estimator and fit the estimator to the data log_density = kde.score_samples(df) - + return log_density.mean() + def get_fractal_dim(df, k_max): """Calculate the fractal dimension of the data using Higuchi's method - + Args: df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns k_max (int): Maximum number of k values to use in the calculation - + Returns: float: Fractal dimension of the data """ FD = hfda.measure(df, k_max) - - return FD + return FD -def get_moments(df): - """Compute third and fourth order moments of the data + +def get_moments(df): + """Compute third and fourth order moments of the data Args: df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns @@ -247,80 +263,83 @@ def get_moments(df): kurt = df.kurtosis() avg_kurt = kurt.mean() std_kurt = kurt.std() - - return avg_skew, std_skew, avg_kurt, std_kurt -def get_entropy(y): + return avg_skew, std_skew, avg_kurt, std_kurt + + +def get_entropy(y): """Calculate entropy of the target variable Args: - y (int): supervised binary class label - - Returns: - avg_y_entropy (float): mean entropy - std_y_entropy (flat): standard deviation of entropy + y (int): supervised binary class label + + Returns: + avg_y_entropy (float): mean entropy + std_y_entropy (flat): standard deviation of entropy """ - y_entropy = entropy(np.bincount(y), base=2) # Compute the entropy of the target variable (y) + y_entropy = entropy(np.bincount(y), base=2) # Compute the entropy of the target variable (y) avg_y_entropy = y_entropy.mean() std_y_entropy = y_entropy.std() - + return avg_y_entropy, std_y_entropy -def get_volume(df): - """Get volume of the data from Convex Hull + +def get_volume(df): + """Get volume of the data from Convex Hull Args: df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns - - Returns: - volume (float): Volume of the space spanned by the features of the data + + Returns: + volume (float): Volume of the space spanned by the features of the data """ - - vol = 0 - if df.shape[0] <= df.shape[1]: + + vol = 0 + if df.shape[0] <= df.shape[1]: warnings.warn("Convex Hull requires number of observations > number of features") - else: - vol = CH(df, qhull_options='QJ').volume - + else: + vol = CH(df, qhull_options="QJ").volume + return vol -def get_complexity(df, n_neighbors=10, n_components=2): - """ Measure the manifold complexity by fitting Isomap and analyzing the geodesic vs. Euclidean distances. + +def get_complexity(df, n_neighbors=10, n_components=2): + """Measure the manifold complexity by fitting Isomap and analyzing the geodesic vs. Euclidean distances. This function computes the reconstruction error of the Isomap algorithm, which serves as an indicator of the complexity of the manifold represented by the data. Args: df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns n_neighbors: Number of neighbors for the Isomap algorithm. Default value 10 n_components: Number of components (dimensions) for Isomap projection. Default value 2 - + Returns: - reconstruction_error: float The reconstruction error of the Isomap model, which indicates the complexity of the manifold. - reconstruction_error: The residual error of geodesic distances """ - + isomap = Isomap(n_neighbors=10, n_components=2) isomap.fit(df.values) - - #reconstruction error - an indicator of complexity + + # reconstruction error - an indicator of complexity reconstruction_error = isomap.reconstruction_error() - + return reconstruction_error - + def evaluate(df, y, file): """This function evaluates a dataset and returns a transposed summary DataFrame with various statistical measures, derived from the dataset. - Using the functions defined above, it computes intrinsic dimension, condition number, Fisher Discriminant Ratio, total correlation, mutual information, variance, coefficient of variation, + Using the functions defined above, it computes intrinsic dimension, condition number, Fisher Discriminant Ratio, total correlation, mutual information, variance, coefficient of variation, data sparsity, low variance features, data density, fractal dimension, data distributions (skewness and kurtosis), entropy of the target variable, and manifold complexity. The summary DataFrame is transposed for easier readability and contains the dataset name, number of features, number of samples, feature-to-sample ratio, and various statistical measures. - This function is useful for quickly summarizing the characteristics of a dataset, especially in the context of machine learning and data analysis, allowing you to correlate the dataset's + This function is useful for quickly summarizing the characteristics of a dataset, especially in the context of machine learning and data analysis, allowing you to correlate the dataset's properties with its performance in predictive modeling tasks. - + Args: df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns y (int): supervised binary class label file (str): Name of the dataset file for identification in the summary DataFrame - + Returns: transposed (pandas.DataFrame): Summary DataFrame containing various statistical measures of the dataset """ @@ -329,17 +348,17 @@ def evaluate(df, y, file): # Calculate statistical measures n_features, n_samples, feature_sample_ratio = get_dimensions(df_numeric) - - # get intrinsic dimension + + # get intrinsic dimension intrinsic_dim = get_intrinsic_dim(df_numeric) - + # Condition number condition_number = get_condition_number(df_numeric) # Class imbalance ratio via Fischer Discriminant fdr = get_fdr(df_numeric, y) - # Total correlation + # Total correlation total_correlation = get_total_correlation(df_numeric) # Mutual information @@ -348,12 +367,12 @@ def evaluate(df, y, file): # Variance avg_var, std_var = get_variance(df_numeric) - # Coefficient of variance + # Coefficient of variance avg_co_of_v, std_co_of_v = get_coefficient_var(df_numeric) - + # Data sparsity count_nonzero = get_nnz(df) - + # Get the number of low variance features num_low_variance_features = get_low_var_features(df_numeric, n_features) @@ -366,81 +385,69 @@ def evaluate(df, y, file): # Data distributions avg_skew, std_skew, avg_kurt, std_kurt = get_moments(df_numeric) - + # entropy avg_y_entropy, std_y_entropy = get_entropy(y) - #volume of data + # volume of data # volume = get_volume(df_numeric) - - #manifold complexity + + # manifold complexity complexity = get_complexity(df_numeric) - + # Create summary DataFrame - summary_df = pd.DataFrame.from_dict({ - # Data set - 'Dataset': file, - - # Dimensions - '# Features': n_features, - '# Samples': n_samples, - 'Feature_Samples_ratio': feature_sample_ratio, - - # Intrinsic dimension - 'Intrinsic_Dimension': intrinsic_dim, - - # Condition number - 'Condition number': condition_number, - - # Class imbalance ratio - 'Fisher Discriminant Ratio': fdr, - - # Feature Correlations - 'Total Correlations': total_correlation, # Total Correlations - 'Mutual information': mutual_info,# Mutual information - - # Data sparsity - '# Non-zero entries': count_nonzero, - '# Low variance features': num_low_variance_features, - - #'Variation': variations, - 'Variation': avg_var, - 'std_var': std_var, - - #'Coefficient of Variation %': co_of_v, - 'Coefficient of Variation %': avg_co_of_v, - 'std_co_of_v': std_co_of_v, - - # Data distributions - #'Skewness': skew, - 'Skewness': avg_skew, - 'std_skew': std_skew, - - #'Kurtosis': kurt, - 'Kurtosis': avg_kurt, - 'std_kurt': std_kurt, - - # Data density - 'Mean Log Kernel Density': mean_log_density, - - # volume of feature space - #'Volume': volume, - - # Manifold complexity - 'Isomap Reconstruction Error': complexity, - - # Fractal dimension - 'Fractal dimension': fractal_dim, # calculated via Higuchi Dimension - - #'Entropy': y_entropy, - 'Entropy': avg_y_entropy, - 'std_entropy': std_y_entropy - }, - orient='index') + summary_df = pd.DataFrame.from_dict( + { + # Data set + "Dataset": file, + # Dimensions + "# Features": n_features, + "# Samples": n_samples, + "Feature_Samples_ratio": feature_sample_ratio, + # Intrinsic dimension + "Intrinsic_Dimension": intrinsic_dim, + # Condition number + "Condition number": condition_number, + # Class imbalance ratio + "Fisher Discriminant Ratio": fdr, + # Feature Correlations + "Total Correlations": total_correlation, # Total Correlations + "Mutual information": mutual_info, # Mutual information + # Data sparsity + "# Non-zero entries": count_nonzero, + "# Low variance features": num_low_variance_features, + #'Variation': variations, + "Variation": avg_var, + "std_var": std_var, + #'Coefficient of Variation %': co_of_v, + "Coefficient of Variation %": avg_co_of_v, + "std_co_of_v": std_co_of_v, + # Data distributions + #'Skewness': skew, + "Skewness": avg_skew, + "std_skew": std_skew, + #'Kurtosis': kurt, + "Kurtosis": avg_kurt, + "std_kurt": std_kurt, + # Data density + "Mean Log Kernel Density": mean_log_density, + # volume of feature space + #'Volume': volume, + # Manifold complexity + "Isomap Reconstruction Error": complexity, + # Fractal dimension + "Fractal dimension": fractal_dim, # calculated via Higuchi Dimension + #'Entropy': y_entropy, + "Entropy": avg_y_entropy, + "std_entropy": std_y_entropy, + }, + orient="index", + ) transposed = summary_df.T - #transposed.to_csv('DataSetEvaluation.csv', sep='\t', index=False) - #print(transposed) + # transposed.to_csv('DataSetEvaluation.csv', sep='\t', index=False) + # print(transposed) return transposed -# evaluate(df,y) \ No newline at end of file + +# evaluate(df,y) diff --git a/qbiocode/evaluation/model_evaluation.py b/qbiocode/evaluation/model_evaluation.py index d510cbe..ed493a6 100644 --- a/qbiocode/evaluation/model_evaluation.py +++ b/qbiocode/evaluation/model_evaluation.py @@ -2,17 +2,19 @@ import time from typing import Literal + import pandas as pd +from sklearn.metrics import accuracy_score, f1_score, roc_auc_score +from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler -# ====== Scikit-learn imports ====== +from qbiocode.utils.helper_fn import print_results -from sklearn.preprocessing import StandardScaler, MinMaxScaler -from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder -from sklearn.metrics import f1_score, accuracy_score, roc_auc_score +# ====== Scikit-learn imports ====== -from qbiocode.utils.helper_fn import print_results -def modeleval(y_test, y_predicted, beg_time, params, args, model:str, verbose = True, average='weighted'): +def modeleval( + y_test, y_predicted, beg_time, params, args, model: str, verbose=True, average="weighted" +): """ Evaluates the model performance using accuracy, F1 score, and AUC. @@ -36,14 +38,42 @@ def modeleval(y_test, y_predicted, beg_time, params, args, model:str, verbose = f1 = f1_score(y_test, y_predicted, average=average) compile_time = time.time() - beg_time params = params - if verbose==True: + if verbose == True: print_results(model, accuracy, f1, compile_time, params) - - if args['grid_search'] == True: - return pd.DataFrame({'y_test_' + model: [y_test], - 'y_predicted_' + model: [y_predicted], - 'results_' + model: [{'model':model,'accuracy': accuracy, 'f1_score': f1,'time': compile_time, 'auc': auc, 'BestParams_GridSearch': params}]}) - else: - return pd.DataFrame({'y_test_' + model: [y_test], - 'y_predicted_' + model: [y_predicted], - 'results_' + model: [{'model':model,'accuracy': accuracy, 'f1_score': f1,'time': compile_time, 'auc': auc, 'Model_Parameters': params}]}) \ No newline at end of file + + if args["grid_search"] == True: + return pd.DataFrame( + { + "y_test_" + model: [y_test], + "y_predicted_" + model: [y_predicted], + "results_" + + model: [ + { + "model": model, + "accuracy": accuracy, + "f1_score": f1, + "time": compile_time, + "auc": auc, + "BestParams_GridSearch": params, + } + ], + } + ) + else: + return pd.DataFrame( + { + "y_test_" + model: [y_test], + "y_predicted_" + model: [y_predicted], + "results_" + + model: [ + { + "model": model, + "accuracy": accuracy, + "f1_score": f1, + "time": compile_time, + "auc": auc, + "Model_Parameters": params, + } + ], + } + ) diff --git a/qbiocode/evaluation/model_run.py b/qbiocode/evaluation/model_run.py index 132681c..5e8843b 100644 --- a/qbiocode/evaluation/model_run.py +++ b/qbiocode/evaluation/model_run.py @@ -1,20 +1,22 @@ # ====== Base class imports ====== -import os, json +import json +import os + import pandas as pd # ======= Parallelization ===== from joblib import Parallel, delayed - current_dir = os.getcwd() + def model_run(X_train, X_test, y_train, y_test, data_key, args): """This function runs the ML methods, with or without a grid search, as specified in the config.yaml file. It returns a python dictionary contatining these results, which can then be parsed out. It is designed to run - each of the ML methods in parallel, for each data set (this is done by calling the Parallel module in results below). - The arguments X_train, X_test, y_train, y_test are all passed in from the main script (qmlbench.py) as the input - datasets are processed, while the remaining arguments are passed from the config.yaml file. - + each of the ML methods in parallel, for each data set (this is done by calling the Parallel module in results below). + The arguments X_train, X_test, y_train, y_test are all passed in from the main script (qmlbench.py) as the input + datasets are processed, while the remaining arguments are passed from the config.yaml file. + Args: X_train (pd.DataFrame): Training features. X_test (pd.DataFrame): Testing features. @@ -28,69 +30,69 @@ def model_run(X_train, X_test, y_train, y_test, data_key, args): - cross_validation: Cross-validation strategy. - gridsearch__args: Arguments for grid search for each model. - _args: Additional arguments for each model. - + Returns: model_total_result (dict): A dictionary containing the results of the models run, with keys as model names and values as their respective results. This dictionary can readily be converted to a Pandas Dataframe, as seen in the 'ModelResults.csv' files that are produced in the results directory when the main profiler is run (qbiocode-profiler.py). - + """ - + # Lazy imports to avoid circular dependency # These imports happen inside the function, not at module level - from qbiocode.learning.compute_svc import compute_svc, compute_svc_opt from qbiocode.learning.compute_dt import compute_dt, compute_dt_opt - from qbiocode.learning.compute_nb import compute_nb, compute_nb_opt from qbiocode.learning.compute_lr import compute_lr, compute_lr_opt - from qbiocode.learning.compute_rf import compute_rf, compute_rf_opt - from qbiocode.learning.compute_xgb import compute_xgb, compute_xgb_opt from qbiocode.learning.compute_mlp import compute_mlp, compute_mlp_opt + from qbiocode.learning.compute_nb import compute_nb, compute_nb_opt + from qbiocode.learning.compute_pqk import compute_pqk from qbiocode.learning.compute_qnn import compute_qnn from qbiocode.learning.compute_qsvc import compute_qsvc + from qbiocode.learning.compute_rf import compute_rf, compute_rf_opt + from qbiocode.learning.compute_svc import compute_svc, compute_svc_opt from qbiocode.learning.compute_vqc import compute_vqc - from qbiocode.learning.compute_pqk import compute_pqk - + from qbiocode.learning.compute_xgb import compute_xgb, compute_xgb_opt + # Build model dictionary compute_ml_dict = { - 'svc_opt': compute_svc_opt, - 'svc': compute_svc, - 'dt_opt': compute_dt_opt, - 'dt': compute_dt, - 'lr_opt': compute_lr_opt, - 'lr': compute_lr, - 'nb_opt': compute_nb_opt, - 'nb': compute_nb, - 'rf_opt': compute_rf_opt, - 'rf': compute_rf, - 'xgb_opt': compute_xgb_opt, - 'xgb': compute_xgb, - 'mlp_opt': compute_mlp_opt, - 'mlp': compute_mlp, - 'qsvc': compute_qsvc, - 'vqc': compute_vqc, - 'qnn': compute_qnn, - 'pqk': compute_pqk + "svc_opt": compute_svc_opt, + "svc": compute_svc, + "dt_opt": compute_dt_opt, + "dt": compute_dt, + "lr_opt": compute_lr_opt, + "lr": compute_lr, + "nb_opt": compute_nb_opt, + "nb": compute_nb, + "rf_opt": compute_rf_opt, + "rf": compute_rf, + "xgb_opt": compute_xgb_opt, + "xgb": compute_xgb, + "mlp_opt": compute_mlp_opt, + "mlp": compute_mlp, + "qsvc": compute_qsvc, + "vqc": compute_vqc, + "qnn": compute_qnn, + "pqk": compute_pqk, } # Quantum models don't have _opt versions (use separate configs for hyperparameter tuning) - quantum_models = {'qsvc', 'qnn', 'vqc', 'pqk'} - + quantum_models = {"qsvc", "qnn", "vqc", "pqk"} + # Run classical and quantum models - n_jobs = len(args['model']) - if 'n_jobs' in args.keys(): - n_jobs = min(args['n_jobs'], len(args['model'])) - + n_jobs = len(args["model"]) + if "n_jobs" in args.keys(): + n_jobs = min(args["n_jobs"], len(args["model"])) + grid_search = False - if 'grid_search' in args.keys(): - grid_search = args['grid_search'] - + if "grid_search" in args.keys(): + grid_search = args["grid_search"] + # Check if any quantum models are in the model list when grid_search is enabled if grid_search: - quantum_in_models = [m for m in args['model'] if m in quantum_models] + quantum_in_models = [m for m in args["model"] if m in quantum_models] if quantum_in_models: - print("\n" + "="*80) + print("\n" + "=" * 80) print("WARNING: Grid search is enabled with quantum models:", quantum_in_models) - print("="*80) + print("=" * 80) print("Quantum models do not support automated grid search.") print("For hyperparameter tuning of quantum models, you should:") print(" 1. Create multiple configuration files with different hyperparameters") @@ -104,38 +106,56 @@ def model_run(X_train, X_test, y_train, y_test, data_key, args): print(" data_dirs=['data/your_data_dir']") print(" )") print("\nSee documentation: qbiocode.utils.generate_qml_experiment_configs") - print("="*80 + "\n") - + print("=" * 80 + "\n") + if grid_search: results = [] - for method in args['model']: + for method in args["model"]: if method in quantum_models: # Quantum models don't have _opt versions, use regular function result = delayed(compute_ml_dict[method])( - X_train, X_test, y_train, y_test, args, + X_train, + X_test, + y_train, + y_test, + args, model=method, data_key=data_key, - **args.get(method + '_args', {}), - verbose=False + **args.get(method + "_args", {}), + verbose=False, ) else: # Classical models have _opt versions with grid search - result = delayed(compute_ml_dict[method + '_opt'])( - X_train, X_test, y_train, y_test, args, - model=method + '_opt', - cv=args['cross_validation'], - **args.get('gridsearch_' + method + '_args', {}), - verbose=False + result = delayed(compute_ml_dict[method + "_opt"])( + X_train, + X_test, + y_train, + y_test, + args, + model=method + "_opt", + cv=args["cross_validation"], + **args.get("gridsearch_" + method + "_args", {}), + verbose=False, ) results.append(result) results = Parallel(n_jobs=n_jobs)(results) else: - results = Parallel(n_jobs=n_jobs)(delayed(compute_ml_dict[method])(X_train, X_test, y_train, y_test, args, model=method, data_key = data_key, - **args[method+'_args'], verbose=False) - for method in args['model']) - + results = Parallel(n_jobs=n_jobs)( + delayed(compute_ml_dict[method])( + X_train, + X_test, + y_train, + y_test, + args, + model=method, + data_key=data_key, + **args[method + "_args"], + verbose=False, + ) + for method in args["model"] + ) + model_total_result = pd.melt(pd.concat(results)).dropna() # type: ignore - model_total_result['i'] = 0 + model_total_result["i"] = 0 model_total_result = model_total_result.pivot(columns="variable", values="value", index="i") return model_total_result.to_dict() - diff --git a/qbiocode/learning/__init__.py b/qbiocode/learning/__init__.py index 85da6c3..47c4cdd 100644 --- a/qbiocode/learning/__init__.py +++ b/qbiocode/learning/__init__.py @@ -7,7 +7,7 @@ optimized versions (where applicable) with hyperparameter tuning. Classical Algorithms -------------------- +-------------------- - Decision Tree (DT) - Logistic Regression (LR) - Multi-Layer Perceptron (MLP) @@ -17,7 +17,7 @@ - XGBoost (XGB) Quantum Algorithms ------------------ +------------------ - Quantum Neural Network (QNN) - Quantum Support Vector Classifier (QSVC) - Variational Quantum Classifier (VQC) @@ -39,6 +39,7 @@ from .compute_nb import compute_nb, compute_nb_opt from .compute_rf import compute_rf, compute_rf_opt from .compute_svc import compute_svc, compute_svc_opt + try: from .compute_xgb import compute_xgb, compute_xgb_opt except Exception: @@ -46,32 +47,32 @@ compute_xgb = None # type: ignore compute_xgb_opt = None # type: ignore +from .compute_pqk import compute_pqk + # Quantum ML algorithms from .compute_qnn import compute_qnn from .compute_qsvc import compute_qsvc from .compute_vqc import compute_vqc -from .compute_pqk import compute_pqk __all__ = [ # Classical algorithms - 'compute_dt', - 'compute_dt_opt', - 'compute_lr', - 'compute_lr_opt', - 'compute_mlp', - 'compute_mlp_opt', - 'compute_nb', - 'compute_nb_opt', - 'compute_rf', - 'compute_rf_opt', - 'compute_svc', - 'compute_svc_opt', - 'compute_xgb', - 'compute_xgb_opt', - + "compute_dt", + "compute_dt_opt", + "compute_lr", + "compute_lr_opt", + "compute_mlp", + "compute_mlp_opt", + "compute_nb", + "compute_nb_opt", + "compute_rf", + "compute_rf_opt", + "compute_svc", + "compute_svc_opt", + "compute_xgb", + "compute_xgb_opt", # Quantum algorithms - 'compute_qnn', - 'compute_qsvc', - 'compute_vqc', - 'compute_pqk', + "compute_qnn", + "compute_qsvc", + "compute_vqc", + "compute_pqk", ] diff --git a/qbiocode/learning/compute_dt.py b/qbiocode/learning/compute_dt.py index 241fd76..5babec6 100644 --- a/qbiocode/learning/compute_dt.py +++ b/qbiocode/learning/compute_dt.py @@ -2,26 +2,46 @@ import time -# ====== Scikit-learn imports ====== - -from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import GridSearchCV from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier +from sklearn.tree import DecisionTreeClassifier # ====== Additional local imports ====== from qbiocode.evaluation.model_evaluation import modeleval +# ====== Scikit-learn imports ====== + + # ====== Begin functions ====== -def compute_dt(X_train, X_test, y_train, y_test, args, verbose=False, model='Decision Tree', data_key = '',criterion='gini', splitter='best', - max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, - random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, - monotonic_cst=None): - + +def compute_dt( + X_train, + X_test, + y_train, + y_test, + args, + verbose=False, + model="Decision Tree", + data_key="", + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + class_weight=None, + ccp_alpha=0.0, + monotonic_cst=None, +): """This function generates a model using a Decision Tree (DT) Classifier method as implemented in - `scikit-learn `_. + `scikit-learn `__. It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed. - The model is trained on the training dataset and validated on the test dataset. The model is trained on the training dataset and validated on the test dataset. + The model is trained on the training dataset and validated on the test dataset. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model. This function is designed to be used in a supervised learning context, where the goal is to classify data points. @@ -49,34 +69,60 @@ def compute_dt(X_train, X_test, y_train, y_test, args, verbose=False, model='Dec monotonic_cst: Monotonic constraints for tree nodes, if applicable. Default is None. Returns: modeleval (dict): A dictionary containing the evaluation metrics, model parameters, and time taken for training and validation. - """ - + """ + beg_time = time.time() - dt = OneVsOneClassifier(DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, - min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, - random_state=random_state, max_leaf_nodes=max_leaf_nodes, - min_impurity_decrease=min_impurity_decrease, class_weight=class_weight, - ccp_alpha=ccp_alpha, monotonic_cst=monotonic_cst)) + dt = OneVsOneClassifier( + DecisionTreeClassifier( + criterion=criterion, + splitter=splitter, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + random_state=random_state, + max_leaf_nodes=max_leaf_nodes, + min_impurity_decrease=min_impurity_decrease, + class_weight=class_weight, + ccp_alpha=ccp_alpha, + monotonic_cst=monotonic_cst, + ) + ) # Fit the training datset model_fit = dt.fit(X_train, y_train) model_params = model_fit.get_params() # Validate the model in test dataset and calculate accuracy - y_predicted = dt.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose)) + y_predicted = dt.predict(X_test) + return modeleval( + y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose + ) + -def compute_dt_opt(X_train, X_test, y_train, y_test, args, verbose=False, model='Decision Tree', cv=5, - criterion=[], max_depth=[], min_samples_split=[], min_samples_leaf=[], max_features=[]): - +def compute_dt_opt( + X_train, + X_test, + y_train, + y_test, + args, + verbose=False, + model="Decision Tree", + cv=5, + criterion=[], + max_depth=[], + min_samples_split=[], + min_samples_leaf=[], + max_features=[], +): """This function also generates a model using a Decision Tree (DT) Classifier method as implemented in - `scikit-learn `_. + `scikit-learn `__. The difference here is that this function runs a grid search. The range of the grid search for each parameter is specified in the config.yaml file. The combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar - datasets, without having to run the grid search. - The model is trained on the training dataset and validated on the test dataset. The model is trained on the training dataset and validated on the test dataset. + datasets, without having to run the grid search. + The model is trained on the training dataset and validated on the test dataset. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search. This function is designed to be used in a supervised learning context, where the goal is to classify data points. - + Args: X_train (array-like): Training data features. X_test (array-like): Test data features. @@ -91,18 +137,19 @@ def compute_dt_opt(X_train, X_test, y_train, y_test, args, verbose=False, model= min_samples_split (list): List of minimum samples required to split an internal node. Default is empty list. min_samples_leaf (list): List of minimum samples required to be at a leaf node. Default is empty list. max_features (list): List of maximum features to consider when looking for the best split. Default is empty list. - + Returns: modeleval (dict): A dictionary containing the evaluation metrics, best parameters, and time taken for training and validation. - """ - + """ + beg_time = time.time() - params = {'criterion': criterion, - 'max_depth': max_depth, - 'min_samples_split': min_samples_split, - 'min_samples_leaf': min_samples_leaf, - 'max_features': max_features - } + params = { + "criterion": criterion, + "max_depth": max_depth, + "min_samples_split": min_samples_split, + "min_samples_leaf": min_samples_leaf, + "max_features": max_features, + } # Perform Grid Search to find the best parameters grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=params, cv=cv) grid_search.fit(X_train, y_train) @@ -114,4 +161,4 @@ def compute_dt_opt(X_train, X_test, y_train, y_test, args, verbose=False, model= # Make predictions and calculate accuracy y_predicted = best_dt.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)) + return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose) diff --git a/qbiocode/learning/compute_lr.py b/qbiocode/learning/compute_lr.py index 8da039b..d25f0f2 100644 --- a/qbiocode/learning/compute_lr.py +++ b/qbiocode/learning/compute_lr.py @@ -1,10 +1,8 @@ # ====== Base class imports ====== import time -import numpy as np - -# ====== Scikit-learn imports ====== +import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier @@ -12,15 +10,39 @@ # ====== Additional local imports ====== from qbiocode.evaluation.model_evaluation import modeleval +# ====== Scikit-learn imports ====== + + # ====== Begin functions ====== - -def compute_lr(X_train, X_test, y_train, y_test, args, model='Logistic Regression', data_key = '', - penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, - class_weight=None, random_state=None, solver='saga', max_iter=10000, multi_class='deprecated', - verbose=False, warm_start=False, n_jobs=None, l1_ratio=None): - + + +def compute_lr( + X_train, + X_test, + y_train, + y_test, + args, + model="Logistic Regression", + data_key="", + penalty="l2", + *, + dual=False, + tol=0.0001, + C=1.0, + fit_intercept=True, + intercept_scaling=1, + class_weight=None, + random_state=None, + solver="saga", + max_iter=10000, + multi_class="deprecated", + verbose=False, + warm_start=False, + n_jobs=None, + l1_ratio=None, +): """This function generates a model using a Logistic Regression (LR) method as implemented in - `scikit-learn `_. + `scikit-learn `__. It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model on the test dataset, including accuracy, @@ -49,36 +71,62 @@ def compute_lr(X_train, X_test, y_train, y_test, args, model='Logistic Regressio verbose (bool): Whether to print detailed logs, default is False. warm_start (bool): Whether to reuse the solution of the previous call to fit as initialization, default is False. - n_jobs (int or None): Number of jobs to run in parallel for both `fit` and `predict`, + n_jobs (int or None): Number of jobs to run in parallel for both `fit` and `predict`, default is None which means 1 unless in a joblib.parallel_backend context. - l1_ratio (float or None): The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1. + l1_ratio (float or None): The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1. Only used if penalty='elasticnet', default is None. - + Returns: modeleval (dict): A dictionary containing the evaluation metrics, model parameters, and time taken for training and validation. - """ - + """ + beg_time = time.time() - logres = OneVsOneClassifier(LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C, fit_intercept=fit_intercept, - intercept_scaling=intercept_scaling, class_weight=class_weight, random_state=random_state, - solver=solver, max_iter=max_iter, - warm_start=warm_start, n_jobs=n_jobs, l1_ratio=l1_ratio)) + logres = OneVsOneClassifier( + LogisticRegression( + penalty=penalty, + dual=dual, + tol=tol, + C=C, + fit_intercept=fit_intercept, + intercept_scaling=intercept_scaling, + class_weight=class_weight, + random_state=random_state, + solver=solver, + max_iter=max_iter, + warm_start=warm_start, + n_jobs=n_jobs, + l1_ratio=l1_ratio, + ) + ) # Fit the training datset model_fit = logres.fit(X_train, y_train) model_params = model_fit.get_params() # Validate the model in test dataset and calculate accuracy - y_predicted = logres.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose)) + y_predicted = logres.predict(X_test) + return modeleval( + y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose + ) -def compute_lr_opt(X_train, X_test, y_train, y_test, args, model='Logistic Regression', cv=5, - penalty=[], C=[], - solver=[], verbose=False, max_iter=[]): - + +def compute_lr_opt( + X_train, + X_test, + y_train, + y_test, + args, + model="Logistic Regression", + cv=5, + penalty=[], + C=[], + solver=[], + verbose=False, + max_iter=[], +): """This function also generates a model using a Logistic Regression (LR) method as implemented in - `scikit-learn `_. + `scikit-learn `__. The difference here is that this function runs a grid search. The range of the grid search for each parameter is specified in the config.yaml file. The combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar - datasets, without having to run the grid search. The function returns the evaluation of the model + datasets, without having to run the grid search. The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search. This function is designed to be used in a supervised learning context, where the goal is to classify data points. @@ -95,17 +143,13 @@ def compute_lr_opt(X_train, X_test, y_train, y_test, args, model='Logistic Regre solver (list): List of solvers to try, default is an empty list. verbose (bool): Whether to print detailed logs, default is False. max_iter (list): List of maximum iterations to try, default is an empty list. - + Returns: modeleval (dict): A dictionary containing the evaluation metrics, best parameters, and time taken for training and validation. - """ - + """ + beg_time = time.time() - params = {'penalty': penalty, - 'C': C, - 'solver':solver, - 'max_iter':max_iter - } + params = {"penalty": penalty, "C": C, "solver": solver, "max_iter": max_iter} # Perform Grid Search to find the best parameters grid_search = GridSearchCV(LogisticRegression(), param_grid=params, cv=cv) grid_search.fit(X_train, y_train) @@ -117,4 +161,4 @@ def compute_lr_opt(X_train, X_test, y_train, y_test, args, model='Logistic Regre # Make predictions and calculate accuracy y_predicted = best_logres.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)) \ No newline at end of file + return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose) diff --git a/qbiocode/learning/compute_mlp.py b/qbiocode/learning/compute_mlp.py index fc6c676..375f81b 100644 --- a/qbiocode/learning/compute_mlp.py +++ b/qbiocode/learning/compute_mlp.py @@ -1,125 +1,188 @@ # ====== Base class imports ====== import time -import numpy as np - -# ====== Scikit-learn imports ====== -from sklearn.neural_network import MLPClassifier +import numpy as np from sklearn.model_selection import GridSearchCV from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier +from sklearn.neural_network import MLPClassifier # ====== Additional local imports ====== from qbiocode.evaluation.model_evaluation import modeleval +# ====== Scikit-learn imports ====== + + # ====== Begin functions ====== -def compute_mlp(X_train, X_test, y_train, y_test, args, verbose=False, model='Multi-layer Perceptron', data_key = '', - hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', - learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=10000, shuffle=True, - random_state=None, tol=0.0001, warm_start=False, momentum=0.9, nesterovs_momentum=True, - early_stopping=False, validation_fraction=0.1, beta_1=0.9, - beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000): - + +def compute_mlp( + X_train, + X_test, + y_train, + y_test, + args, + verbose=False, + model="Multi-layer Perceptron", + data_key="", + hidden_layer_sizes=(100,), + activation="relu", + solver="adam", + alpha=0.0001, + batch_size="auto", + learning_rate="constant", + learning_rate_init=0.001, + power_t=0.5, + max_iter=10000, + shuffle=True, + random_state=None, + tol=0.0001, + warm_start=False, + momentum=0.9, + nesterovs_momentum=True, + early_stopping=False, + validation_fraction=0.1, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-08, + n_iter_no_change=10, + max_fun=15000, +): + """ + This function generates a model using a Multi-layer Perceptron (mlp), a neural network, method as implemented in + `scikit-learn `__. It takes in parameter + arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed. + The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model + on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model. + This function is designed to be used in a supervised learning context, where the goal is to classify data points. + + Args: + X_train (numpy.ndarray): Training features. + X_test (numpy.ndarray): Test features. + y_train (numpy.ndarray): Training labels. + y_test (numpy.ndarray): Test labels. + args (dict): Additional arguments, such as config parameters. + verbose (bool): If True, prints additional information during execution. + model (str): Name of the model being used. + data_key (str): Key for the dataset, if applicable. + hidden_layer_sizes (tuple): The ith element represents the number of neurons in the ith hidden layer. + activation (str): Activation function for the hidden layer. + solver (str): The solver for weight optimization. + alpha (float): L2 penalty (regularization term) parameter. + batch_size (int or str): Size of minibatches for stochastic optimizers. + learning_rate (str): Learning rate schedule for weight updates. + learning_rate_init (float): Initial learning rate used. + power_t (float): The exponent for inverse scaling learning rate. + max_iter (int): Maximum number of iterations. + shuffle (bool): Whether to shuffle samples in each iteration. + random_state (int or None): Random seed for reproducibility. + tol (float): Tolerance for stopping criteria. + warm_start (bool): If True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. + momentum (float): Momentum for gradient descent update. + nesterovs_momentum (bool): Whether to use Nesterov's momentum or not. + early_stopping (bool): Whether to use early stopping to terminate training when validation score is not improving. + validation_fraction (float): Proportion of training data to set aside as validation set for early stopping. + beta_1, beta_2, epsilon: Parameters for Adam optimizer. + n_iter_no_change: Number of iterations with no improvement after which training will be stopped. + max_fun: Maximum number of function evaluations. + + Returns: + modeleval (dict): A dictionary containing the evaluation metrics of the model on the test dataset, including accuracy, AUC, F1 score, + and the time taken to train and validate the model, along with the model parameters. """ - This function generates a model using a Multi-layer Perceptron (mlp), a neural network, method as implemented in - `scikit-learn `_. It takes in parameter - arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed. - The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model - on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model. - This function is designed to be used in a supervised learning context, where the goal is to classify data points. - - Args: + + beg_time = time.time() + mlp = OneVsOneClassifier( + MLPClassifier( + hidden_layer_sizes=hidden_layer_sizes, + activation=activation, + solver=solver, + alpha=alpha, + batch_size=batch_size, + learning_rate=learning_rate, + learning_rate_init=learning_rate_init, + power_t=power_t, + max_iter=max_iter, + shuffle=shuffle, + random_state=random_state, + tol=tol, + warm_start=warm_start, + momentum=momentum, + nesterovs_momentum=nesterovs_momentum, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + beta_1=beta_1, + beta_2=beta_2, + epsilon=epsilon, + n_iter_no_change=n_iter_no_change, + max_fun=max_fun, + ) + ) + # Fit the training datset + model_fit = mlp.fit(X_train, y_train) + model_params = model_fit.get_params() + # Validate the model in test dataset and calculate accuracy + y_predicted = mlp.predict(X_test) + return modeleval( + y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose + ) + + +def compute_mlp_opt( + X_train, + X_test, + y_train, + y_test, + args, + verbose=False, + cv=5, + model="Multi-layer Perceptron", + hidden_layer_sizes=[], + activation=[], + max_iter=[], + solver=[], + alpha=[], + learning_rate=[], +): + """ + This function also generates a model using a Multi-layer Perceptron (mlp), a neural network, as implemented in scikit-learn + (https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html). The difference here is that + this function runs a grid search. The range of the grid search for each parameter is specified in the config.yaml file. The + combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar + datasets, without having to run the grid search. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model + on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search. + This function is designed to be used in a supervised learning context, where the goal is to classify data points. + + Args: X_train (numpy.ndarray): Training features. X_test (numpy.ndarray): Test features. y_train (numpy.ndarray): Training labels. y_test (numpy.ndarray): Test labels. args (dict): Additional arguments, such as config parameters. verbose (bool): If True, prints additional information during execution. + cv (int): Number of cross-validation folds. model (str): Name of the model being used. - data_key (str): Key for the dataset, if applicable. - hidden_layer_sizes (tuple): The ith element represents the number of neurons in the ith hidden layer. - activation (str): Activation function for the hidden layer. - solver (str): The solver for weight optimization. - alpha (float): L2 penalty (regularization term) parameter. - batch_size (int or str): Size of minibatches for stochastic optimizers. - learning_rate (str): Learning rate schedule for weight updates. - learning_rate_init (float): Initial learning rate used. - power_t (float): The exponent for inverse scaling learning rate. - max_iter (int): Maximum number of iterations. - shuffle (bool): Whether to shuffle samples in each iteration. - random_state (int or None): Random seed for reproducibility. - tol (float): Tolerance for stopping criteria. - warm_start (bool): If True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. - momentum (float): Momentum for gradient descent update. - nesterovs_momentum (bool): Whether to use Nesterov's momentum or not. - early_stopping (bool): Whether to use early stopping to terminate training when validation score is not improving. - validation_fraction (float): Proportion of training data to set aside as validation set for early stopping. - beta_1, beta_2, epsilon: Parameters for Adam optimizer. - n_iter_no_change: Number of iterations with no improvement after which training will be stopped. - max_fun: Maximum number of function evaluations. - - Returns: - modeleval (dict): A dictionary containing the evaluation metrics of the model on the test dataset, including accuracy, AUC, F1 score, - and the time taken to train and validate the model, along with the model parameters. - """ - - beg_time = time.time() - mlp = OneVsOneClassifier(MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, alpha=alpha, - batch_size=batch_size, learning_rate=learning_rate, learning_rate_init=learning_rate_init, - power_t=power_t, max_iter=max_iter, shuffle=shuffle, random_state=random_state, tol=tol, - warm_start=warm_start, momentum=momentum, nesterovs_momentum=nesterovs_momentum, - early_stopping=early_stopping, validation_fraction=validation_fraction, beta_1=beta_1, - beta_2=beta_2, epsilon=epsilon, n_iter_no_change=n_iter_no_change, max_fun=max_fun)) - # Fit the training datset - model_fit = mlp.fit(X_train, y_train) - model_params = model_fit.get_params() - # Validate the model in test dataset and calculate accuracy - y_predicted = mlp.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose)) - -def compute_mlp_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, model='Multi-layer Perceptron', - hidden_layer_sizes= [], activation = [], max_iter= [], - solver = [], alpha = [], learning_rate= []): - + hidden_layer_sizes (tuple or list): The ith element represents the number of neurons in the ith hidden layer. + activation (str or list): Activation function for the hidden layer. + max_iter (int or list): Maximum number of iterations. + solver (str or list): The solver for weight optimization. + alpha (float or list): L2 penalty (regularization term) parameter. + learning_rate (str or list): Learning rate schedule for weight updates. + Returns: + modeleval (dict): A dictionary containing the evaluation metrics of the model on the test dataset, including accuracy, AUC, F1 score, + and the time taken to train and validate the model, along with the best parameters found during grid search. """ - This function also generates a model using a Multi-layer Perceptron (mlp), a neural network, as implemented in scikit-learn - (https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html). The difference here is that - this function runs a grid search. The range of the grid search for each parameter is specified in the config.yaml file. The - combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar - datasets, without having to run the grid search. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model - on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search. - This function is designed to be used in a supervised learning context, where the goal is to classify data points. - - Args: - X_train (numpy.ndarray): Training features. - X_test (numpy.ndarray): Test features. - y_train (numpy.ndarray): Training labels. - y_test (numpy.ndarray): Test labels. - args (dict): Additional arguments, such as config parameters. - verbose (bool): If True, prints additional information during execution. - cv (int): Number of cross-validation folds. - model (str): Name of the model being used. - hidden_layer_sizes (tuple or list): The ith element represents the number of neurons in the ith hidden layer. - activation (str or list): Activation function for the hidden layer. - max_iter (int or list): Maximum number of iterations. - solver (str or list): The solver for weight optimization. - alpha (float or list): L2 penalty (regularization term) parameter. - learning_rate (str or list): Learning rate schedule for weight updates. - Returns: - modeleval (dict): A dictionary containing the evaluation metrics of the model on the test dataset, including accuracy, AUC, F1 score, - and the time taken to train and validate the model, along with the best parameters found during grid search. - """ - + beg_time = time.time() - params={'hidden_layer_sizes': hidden_layer_sizes, - 'activation': activation, - 'max_iter': max_iter, - 'solver': solver, - 'alpha': alpha, - 'learning_rate': learning_rate, - } - + params = { + "hidden_layer_sizes": hidden_layer_sizes, + "activation": activation, + "max_iter": max_iter, + "solver": solver, + "alpha": alpha, + "learning_rate": learning_rate, + } + # Pemlporm Grid Search to find the best parameters grid_search = GridSearchCV(MLPClassifier(), param_grid=params, cv=cv) grid_search.fit(X_train, y_train) @@ -131,4 +194,4 @@ def compute_mlp_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, # Make predictions and calculate accuracy y_predicted = best_mlp.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)) + return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose) diff --git a/qbiocode/learning/compute_nb.py b/qbiocode/learning/compute_nb.py index 43ff692..c8ac20f 100644 --- a/qbiocode/learning/compute_nb.py +++ b/qbiocode/learning/compute_nb.py @@ -2,24 +2,34 @@ import time -# ====== Scikit-learn imports ====== - -from sklearn.naive_bayes import GaussianNB from sklearn.model_selection import GridSearchCV from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier +from sklearn.naive_bayes import GaussianNB # ====== Additional local imports ====== from qbiocode.evaluation.model_evaluation import modeleval -def compute_nb(X_train, X_test, y_train, y_test, args, verbose=False, model='Naive Bayes', data_key = '', var_smoothing=1e-09): - +# ====== Scikit-learn imports ====== + + +def compute_nb( + X_train, + X_test, + y_train, + y_test, + args, + verbose=False, + model="Naive Bayes", + data_key="", + var_smoothing=1e-09, +): """This function generates a model using a Gaussian Naive Bayes (NB) Classifier method as implemented in - `scikit-learn `_. + `scikit-learn `__. It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed. - The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model + The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model. This function is designed to be used in a supervised learning context, where the goal is to classify data points. - + Args: X_train (numpy.ndarray): Training features. X_test (numpy.ndarray): Test features. @@ -32,9 +42,9 @@ def compute_nb(X_train, X_test, y_train, y_test, args, verbose=False, model='Nai var_smoothing (float): Portion of the largest variance of all features added to variances for calculation stability. Returns: modeleval (dict): A dictionary containing the evaluation metrics of the model on the test dataset, including accuracy, AUC, F1 score, - and the time taken to train and validate the model, along with the model parameters. - """ - + and the time taken to train and validate the model, along with the model parameters. + """ + beg_time = time.time() nb = OneVsOneClassifier(GaussianNB(var_smoothing=var_smoothing)) # Fit the training datset @@ -42,16 +52,27 @@ def compute_nb(X_train, X_test, y_train, y_test, args, verbose=False, model='Nai model_params = model_fit.get_params() # Validate the model in test dataset and calculate accuracy y_predicted = nb.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose)) + return modeleval( + y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose + ) -def compute_nb_opt(X_train, X_test, y_train, y_test, args, verbose=False, model='Naive Bayes', cv=5, - var_smoothing = [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 1e-02]): - - """ This function generates a model using a Gaussian Naive Bayes (NB) Classifier method as implemented in - `scikit-learn `_. + +def compute_nb_opt( + X_train, + X_test, + y_train, + y_test, + args, + verbose=False, + model="Naive Bayes", + cv=5, + var_smoothing=[1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 1e-02], +): + """This function generates a model using a Gaussian Naive Bayes (NB) Classifier method as implemented in + `scikit-learn `__. It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed. The combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar - datasets, without having to run the grid search. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model + datasets, without having to run the grid search. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search. This function is designed to be used in a supervised learning context, where the goal is to classify data points. Args: @@ -67,10 +88,10 @@ def compute_nb_opt(X_train, X_test, y_train, y_test, args, verbose=False, model= Returns: modeleval (dict): A dictionary containing the evaluation metrics of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model, along with the best parameters found during grid search. - """ - + """ + beg_time = time.time() - params={'var_smoothing': var_smoothing} + params = {"var_smoothing": var_smoothing} # Perform Grid Search to find the best parameters grid_search = GridSearchCV(GaussianNB(), param_grid=params, cv=cv) grid_search.fit(X_train, y_train) @@ -82,4 +103,4 @@ def compute_nb_opt(X_train, X_test, y_train, y_test, args, verbose=False, model= # Make predictions and calculate accuracy y_predicted = best_nb.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)) \ No newline at end of file + return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose) diff --git a/qbiocode/learning/compute_pqk.py b/qbiocode/learning/compute_pqk.py index d9c4803..a822c28 100644 --- a/qbiocode/learning/compute_pqk.py +++ b/qbiocode/learning/compute_pqk.py @@ -1,38 +1,55 @@ # ====== Base class imports ====== +import os import time +import warnings + import numpy as np -import os import pandas as pd -import warnings -from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score, auc -from sklearn.model_selection import RandomizedSearchCV, GridSearchCV -from sklearn.neural_network import MLPClassifier from sklearn.ensemble import RandomForestClassifier -from sklearn.svm import SVC from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, f1_score +from sklearn.model_selection import GridSearchCV, RandomizedSearchCV +from sklearn.neural_network import MLPClassifier +from sklearn.svm import SVC + try: from xgboost import XGBClassifier + XGBOOST_AVAILABLE = True except Exception: XGBOOST_AVAILABLE = False XGBClassifier = None # type: ignore -# ====== Additional local imports ====== -from qbiocode.evaluation.model_evaluation import modeleval -import qbiocode.utils.qutils as qutils -from sklearn.model_selection import GridSearchCV +# from qiskit.primitives import Sampler +from functools import reduce # ====== Qiskit imports ====== from qiskit import QuantumCircuit - -#from qiskit.primitives import Sampler -from functools import reduce from qiskit.quantum_info import Pauli from sklearn import svm +from sklearn.model_selection import GridSearchCV + +import qbiocode.utils.qutils as qutils + +# ====== Additional local imports ====== +from qbiocode.evaluation.model_evaluation import modeleval -def compute_pqk(X_train, X_test, y_train, y_test, args, model='PQK', data_key = '', verbose=False, - encoding = 'Z', primitive = 'estimator', entanglement = 'linear', reps= 2, - classical_models=None): + +def compute_pqk( + X_train, + X_test, + y_train, + y_test, + args, + model="PQK", + data_key="", + verbose=False, + encoding="Z", + primitive="estimator", + entanglement="linear", + reps=2, + classical_models=None, +): """ This function generates quantum circuits, computes projections of the data onto these circuits, and evaluates the performance of classical machine learning models on the projected data. @@ -46,7 +63,7 @@ def compute_pqk(X_train, X_test, y_train, y_test, args, model='PQK', data_key = This function is part of the main quantum machine learning pipeline (QProfiler.py) and is intended for use in supervised learning tasks. It leverages quantum computing to enhance feature extraction and classification performance on complex datasets. The function returns the performance results, including accuracy, F1-score, AUC, runtime, as well as model parameters, and other relevant metrics. - + Args: X_train (np.ndarray): Training data features. X_test (np.ndarray): Test data features. @@ -67,21 +84,24 @@ def compute_pqk(X_train, X_test, y_train, y_test, args, model='PQK', data_key = Returns: modeleval (pd.DataFrame): A DataFrame containing evaluation metrics and model parameters for all models. """ - + # Set default classical models if not provided if classical_models is None: - classical_models = ['rf', 'mlp', 'svc', 'lr', 'xgb'] + classical_models = ["rf", "mlp", "svc", "lr", "xgb"] beg_time = time.time() feat_dimension = X_train.shape[1] - if not os.path.exists( 'pqk_projections'): - os.makedirs('pqk_projections') + if not os.path.exists("pqk_projections"): + os.makedirs("pqk_projections") - file_projection_train = os.path.join( 'pqk_projections', 'pqk_projection_' + data_key + '_train.npy') - file_projection_test = os.path.join( 'pqk_projections', 'pqk_projection_' + data_key + '_test.npy') + file_projection_train = os.path.join( + "pqk_projections", "pqk_projection_" + data_key + "_train.npy" + ) + file_projection_test = os.path.join( + "pqk_projections", "pqk_projection_" + data_key + "_test.npy" + ) - # This function ensures that all multiplicative factors of data features inside single qubit gates are 1.0 def data_map_func(x: np.ndarray) -> float: """ @@ -94,73 +114,92 @@ def data_map_func(x: np.ndarray) -> float: float: the mapped value """ coeff = x[0] / 2 if len(x) == 1 else reduce(lambda m, n: (m * n) / 2, x) - return coeff - - # choose a method for mapping your features onto the circuit - feature_map, _ = qutils.get_feature_map(feature_map=encoding, - feat_dimension=X_train.shape[1], - reps = reps, - entanglement=entanglement, - data_map_func = data_map_func) + return float(coeff) + + # choose a method for mapping your features onto the circuit + feature_map, _ = qutils.get_feature_map( + feature_map=encoding, + feat_dimension=X_train.shape[1], + reps=reps, + entanglement=entanglement, + data_map_func=data_map_func, + ) # Build quantum circuit circuit = QuantumCircuit(feature_map.num_qubits) circuit.compose(feature_map, inplace=True) num_qubits = circuit.num_qubits - if (not os.path.exists( file_projection_train ) ) | (not os.path.exists( file_projection_test ) ): + if (not os.path.exists(file_projection_train)) | (not os.path.exists(file_projection_test)): # Generate the backend, session and primitive - backend, session, prim = qutils.get_backend_session(args, - 'estimator', - num_qubits=num_qubits) + backend, session, prim = qutils.get_backend_session( + args, "estimator", num_qubits=num_qubits + ) # Transpile - if args['backend'] != 'simulator': - circuit = qutils.transpile_circuit( circuit, opt_level=3, backend = backend, - PT = True, initial_layout = None) - + if args["backend"] != "simulator": + circuit = qutils.transpile_circuit( + circuit, opt_level=3, backend=backend, PT=True, initial_layout=None + ) for f_tr in [file_projection_train, file_projection_test]: - if not os.path.exists( f_tr ): + if not os.path.exists(f_tr): projections = [] - if 'train' in f_tr: + if "train" in f_tr: dat = X_train.copy() else: dat = X_test.copy() - + # Identity operator on all qubits - id = 'I' * feat_dimension + id = "I" * feat_dimension # We group all commuting observables # These groups are the Pauli X, Y and Z operators on individual qubits # Apply the circuit layout to the observable if mapped to device - if args['backend'] != 'simulator': - observables_x =[] - observables_y =[] - observables_z =[] + if args["backend"] != "simulator": + observables_x = [] + observables_y = [] + observables_z = [] for i in range(feat_dimension): - observables_x.append( Pauli(id[:i] + 'X' + id[(i + 1):]).apply_layout(circuit.layout, num_qubits=backend.num_qubits) ) - observables_y.append( Pauli(id[:i] + 'Y' + id[(i + 1):]).apply_layout(circuit.layout, num_qubits=backend.num_qubits) ) - observables_z.append( Pauli(id[:i] + 'Z' + id[(i + 1):]).apply_layout(circuit.layout, num_qubits=backend.num_qubits) ) + observables_x.append( + Pauli(id[:i] + "X" + id[(i + 1) :]).apply_layout( + circuit.layout, num_qubits=backend.num_qubits + ) + ) + observables_y.append( + Pauli(id[:i] + "Y" + id[(i + 1) :]).apply_layout( + circuit.layout, num_qubits=backend.num_qubits + ) + ) + observables_z.append( + Pauli(id[:i] + "Z" + id[(i + 1) :]).apply_layout( + circuit.layout, num_qubits=backend.num_qubits + ) + ) else: - observables_x = [Pauli(id[:i] + 'X' + id[(i + 1):]) for i in range(feat_dimension)] - observables_y = [Pauli(id[:i] + 'Y' + id[(i + 1):]) for i in range(feat_dimension)] - observables_z = [Pauli(id[:i] + 'Z' + id[(i + 1):]) for i in range(feat_dimension)] - - + observables_x = [ + Pauli(id[:i] + "X" + id[(i + 1) :]) for i in range(feat_dimension) + ] + observables_y = [ + Pauli(id[:i] + "Y" + id[(i + 1) :]) for i in range(feat_dimension) + ] + observables_z = [ + Pauli(id[:i] + "Z" + id[(i + 1) :]) for i in range(feat_dimension) + ] + # projections[i][j][k] will be the expectation value of the j-th Pauli operator (0: X, 1: Y, 2: Z) # of datapoint i on qubit k projections = [] for i in range(len(dat)): if i % 100 == 0: - print('at datapoint {}'.format(i)) + print("at datapoint {}".format(i)) - # Get training sample + # Get training sample parameters = dat[i] - # We define the primitive unified blocs (PUBs) consisting of the embedding circuit, + # We define the primitive unified blocs (PUBs) consisting of the embedding circuit, # set of observables and the circuit parameters pub_x = (circuit, observables_x, parameters) pub_y = (circuit, observables_y, parameters) @@ -172,20 +211,20 @@ def data_map_func(x: np.ndarray) -> float: job_result_z = job.result()[2].data.evs # Record , and on all qubits for the current datapoint - projections.append([job_result_x, job_result_y, job_result_z]) - np.save( f_tr, projections ) + projections.append([job_result_x, job_result_y, job_result_z]) + np.save(f_tr, projections) if not isinstance(session, type(None)): session.close() # Load computed projections - projections_train = np.load( file_projection_train ) + projections_train = np.load(file_projection_train) projections_train = np.array(projections_train).reshape(len(projections_train), -1) - projections_test = np.load( file_projection_test ) + projections_test = np.load(file_projection_test) projections_test = np.array(projections_test).reshape(len(projections_test), -1) - + # Check if XGBoost is requested but not available - if 'xgb' in classical_models and not XGBOOST_AVAILABLE: + if "xgb" in classical_models and not XGBOOST_AVAILABLE: warnings.warn( "XGBoost is not properly installed or configured and will be skipped.\n" "On macOS, you may need to install OpenMP:\n" @@ -194,50 +233,58 @@ def data_map_func(x: np.ndarray) -> float: " pip install --force-reinstall xgboost\n" "See installation documentation for more details.\n" f"Continuing with other models: {[m for m in classical_models if m != 'xgb']}", - UserWarning + UserWarning, ) # Remove xgb from the list - classical_models = [m for m in classical_models if m != 'xgb'] - + classical_models = [m for m in classical_models if m != "xgb"] + # If no models remain after filtering, raise an error if not classical_models: - raise ValueError("No valid classical models specified. Please provide at least one model from: 'rf', 'mlp', 'svc', 'lr', 'xgb'") - + raise ValueError( + "No valid classical models specified. Please provide at least one model from: 'rf', 'mlp', 'svc', 'lr', 'xgb'" + ) + model_res = [] for method in classical_models: - if method == 'rf': - model = create_rf_model(args['seed']) - elif method == 'svc': - model = create_svc_model(args['seed']) - elif method == 'mlp': - model = create_mlp_model(args['seed']) - elif method == 'lr': - model = create_lr_model(args['seed']) - elif method == 'xgb': - model = create_xgb_model(args['seed']) + if method == "rf": + model = create_rf_model(args["seed"]) + elif method == "svc": + model = create_svc_model(args["seed"]) + elif method == "mlp": + model = create_mlp_model(args["seed"]) + elif method == "lr": + model = create_lr_model(args["seed"]) + elif method == "xgb": + model = create_xgb_model(args["seed"]) else: - warnings.warn(f"Unknown model type '{method}' skipped. Valid options: 'rf', 'mlp', 'svc', 'lr', 'xgb'", UserWarning) + warnings.warn( + f"Unknown model type '{method}' skipped. Valid options: 'rf', 'mlp', 'svc', 'lr', 'xgb'", + UserWarning, + ) continue - - method_pqk = 'pqk_' + method + + method_pqk = "pqk_" + method print(method_pqk) model.fit(projections_train, y_train) y_predicted = model.predict(projections_test) hyperparameters = { - 'feature_map': feature_map.__class__.__name__, - 'feature_map_reps': reps, - 'entanglement' : entanglement, - 'best_params': model.best_params_, - # Add other hyperparameters as needed - } + "feature_map": feature_map.__class__.__name__, + "feature_map_reps": reps, + "entanglement": entanglement, + "best_params": model.best_params_, + # Add other hyperparameters as needed + } model_params = hyperparameters - model_res.append(modeleval(y_test, y_predicted, beg_time, model_params, args, model=method_pqk, verbose=verbose)) + model_res.append( + modeleval( + y_test, y_predicted, beg_time, model_params, args, model=method_pqk, verbose=verbose + ) + ) model_res = pd.concat(model_res) - return(model_res) - + return model_res def create_xgb_model(seed): @@ -251,45 +298,50 @@ def create_xgb_model(seed): " pip install --force-reinstall xgboost\n\n" "See installation documentation for more details." ) - xgb = XGBClassifier(objective='binary:logistic', eval_metric='logloss') # type: ignore + xgb = XGBClassifier(objective="binary:logistic", eval_metric="logloss") # type: ignore xgb_param_distributions = { - 'n_estimators': [100, 200, 300], - 'learning_rate': [0.01, 0.1, 0.2], - 'max_depth': [3, 5, 7], - 'subsample': [0.7, 0.8, 1.0], - 'colsample_bytree': [0.7, 0.8, 1.0], - 'min_child_weight': [1, 3, 5] + "n_estimators": [100, 200, 300], + "learning_rate": [0.01, 0.1, 0.2], + "max_depth": [3, 5, 7], + "subsample": [0.7, 0.8, 1.0], + "colsample_bytree": [0.7, 0.8, 1.0], + "min_child_weight": [1, 3, 5], } # Initialize RandomizedSearchCV - xgb_model = RandomizedSearchCV(estimator=xgb, - param_distributions=xgb_param_distributions, - n_iter=40, - cv=5, - random_state=seed, - n_jobs=-1) - + xgb_model = RandomizedSearchCV( + estimator=xgb, + param_distributions=xgb_param_distributions, + n_iter=40, + cv=5, + random_state=seed, + n_jobs=-1, + ) + return xgb_model + def create_lr_model(seed): # Initialize the Logistic Regression Classifier lr = LogisticRegression(random_state=seed, max_iter=1000) lr_param_distributions = { - 'C': [0.001, 0.01, 0.1, 1, 10, 100], - 'penalty': ['l1', 'l2'], - 'solver': ['liblinear', 'saga'] + "C": [0.001, 0.01, 0.1, 1, 10, 100], + "penalty": ["l1", "l2"], + "solver": ["liblinear", "saga"], } # Initialize RandomizedSearchCV - lr_model = RandomizedSearchCV(estimator=lr, - param_distributions=lr_param_distributions, - n_iter=40, - cv=5, - random_state=seed, - n_jobs=-1) - + lr_model = RandomizedSearchCV( + estimator=lr, + param_distributions=lr_param_distributions, + n_iter=40, + cv=5, + random_state=seed, + n_jobs=-1, + ) + return lr_model @@ -298,58 +350,68 @@ def create_rf_model(seed): rf = RandomForestClassifier(random_state=seed) rf_param_distributions = { - 'n_estimators': np.arange(100, 1000, 100), - 'max_depth': np.arange(5, 20), - 'min_samples_split': np.arange(2, 10), - 'min_samples_leaf': np.arange(1, 5), - 'bootstrap': [True, False] + "n_estimators": np.arange(100, 1000, 100), + "max_depth": np.arange(5, 20), + "min_samples_split": np.arange(2, 10), + "min_samples_leaf": np.arange(1, 5), + "bootstrap": [True, False], } # Initialize RandomizedSearchCV - rf_model = RandomizedSearchCV(estimator=rf, - param_distributions=rf_param_distributions, - n_iter=40, - cv=5, - random_state=seed, - n_jobs=-1) - + rf_model = RandomizedSearchCV( + estimator=rf, + param_distributions=rf_param_distributions, + n_iter=40, + cv=5, + random_state=seed, + n_jobs=-1, + ) + return rf_model + def create_mlp_model(seed): - mlp_param_distributions = {"hidden_layer_sizes": [(128,64,32,10), (64,32,10), (128,64,32)], - "activation": ["identity", "logistic", "tanh", "relu"], - "solver": ["lbfgs", "sgd", "adam"], - "alpha": [0.00005,0.0005]} + mlp_param_distributions = { + "hidden_layer_sizes": [(128, 64, 32, 10), (64, 32, 10), (128, 64, 32)], + "activation": ["identity", "logistic", "tanh", "relu"], + "solver": ["lbfgs", "sgd", "adam"], + "alpha": [0.00005, 0.0005], + } # Initialize the MLP Classifier mlp = MLPClassifier(random_state=seed) # Initialize RandomizedSearchCV - mlp_model = RandomizedSearchCV(estimator=mlp, - param_distributions=mlp_param_distributions, - n_iter=40, - cv=5, - random_state=seed, - n_jobs=-1) + mlp_model = RandomizedSearchCV( + estimator=mlp, + param_distributions=mlp_param_distributions, + n_iter=40, + cv=5, + random_state=seed, + n_jobs=-1, + ) return mlp_model + def create_svc_model(seed): - svc_param_distributions={ - 'C': [0.1, 1, 10, 100], - 'gamma': [0.001, 0.01, 0.1, 1], - 'kernel': ['linear', 'rbf', 'poly','sigmoid'] - } + svc_param_distributions = { + "C": [0.1, 1, 10, 100], + "gamma": [0.001, 0.01, 0.1, 1], + "kernel": ["linear", "rbf", "poly", "sigmoid"], + } # Initialize the SVC svc = SVC(random_state=seed) # Initialize RandomizedSearchCV - svc_model = RandomizedSearchCV(estimator=svc, - param_distributions=svc_param_distributions, - n_iter=40, - cv=5, - random_state=seed, - n_jobs=-1) - - return svc_model \ No newline at end of file + svc_model = RandomizedSearchCV( + estimator=svc, + param_distributions=svc_param_distributions, + n_iter=40, + cv=5, + random_state=seed, + n_jobs=-1, + ) + + return svc_model diff --git a/qbiocode/learning/compute_qnn.py b/qbiocode/learning/compute_qnn.py index 4bed244..9257ed1 100644 --- a/qbiocode/learning/compute_qnn.py +++ b/qbiocode/learning/compute_qnn.py @@ -2,23 +2,38 @@ import time from typing import Literal -# ====== Additional local imports ====== -from qbiocode.evaluation.model_evaluation import modeleval -import qbiocode.utils.qutils as qutils +# from qiskit.primitives import Sampler +from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager +from qiskit_algorithms.utils import algorithm_globals # ====== Qiskit imports ====== from qiskit_machine_learning.algorithms.classifiers import NeuralNetworkClassifier -from qiskit_machine_learning.neural_networks import SamplerQNN, EstimatorQNN -from qiskit_machine_learning.circuit.library import qnn_circuit as QNNCircuit +from qiskit_machine_learning.circuit.library import qnn_circuit as QNNCircuit +from qiskit_machine_learning.neural_networks import EstimatorQNN, SamplerQNN -from qiskit_algorithms.utils import algorithm_globals -#from qiskit.primitives import Sampler -from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager +import qbiocode.utils.qutils as qutils + +# ====== Additional local imports ====== +from qbiocode.evaluation.model_evaluation import modeleval -def compute_qnn(X_train, X_test, y_train, y_test, args, model='QNN', data_key = '', - primitive: Literal['estimator', 'sampler'] = 'sampler', verbose=False, - local_optimizer: Literal['COBYLA', 'L_BFGS_B', 'GradientDescent']='COBYLA', - maxiter=100, encoding = 'Z', entanglement = 'linear', reps= 2, ansatz_type = 'amp'): + +def compute_qnn( + X_train, + X_test, + y_train, + y_test, + args, + model="QNN", + data_key="", + primitive: Literal["estimator", "sampler"] = "sampler", + verbose=False, + local_optimizer: Literal["COBYLA", "L_BFGS_B", "GradientDescent"] = "COBYLA", + maxiter=100, + encoding="Z", + entanglement="linear", + reps=2, + ansatz_type="amp", +): """ This function computes a Quantum Neural Network (QNN) model on the provided training data and evaluates it on the test data. It constructs a QNN circuit with a specified feature map and ansatz, optimizes it using a chosen optimizer, and fits the model to the training data. @@ -41,31 +56,33 @@ def compute_qnn(X_train, X_test, y_train, y_test, args, model='QNN', data_key = entanglement (str, optional): Entanglement strategy for the circuit. Defaults to 'linear'. reps (int, optional): Number of repetitions for the feature map and ansatz. Defaults to 2. ansatz_type (str, optional): Type of ansatz to use. Defaults to 'amp'. - + Returns: modeleval (dict): A dictionary containing the evaluation results, including accuracy, runtime, model parameters, and other relevant metrics. - """ + """ beg_time = time.time() - - - # choose a method for mapping your features onto the circuit - feature_map, _ = qutils.get_feature_map(feature_map=encoding, - feat_dimension=X_train.shape[1], - reps = reps, - entanglement=entanglement) + + # choose a method for mapping your features onto the circuit + feature_map, _ = qutils.get_feature_map( + feature_map=encoding, feat_dimension=X_train.shape[1], reps=reps, entanglement=entanglement + ) # get ansatz - ansatz= qutils.get_ansatz( ansatz_type=ansatz_type, feat_dimension = feature_map.num_qubits, reps=reps, entanglement=entanglement) + ansatz = qutils.get_ansatz( + ansatz_type=ansatz_type, + feat_dimension=feature_map.num_qubits, + reps=reps, + entanglement=entanglement, + ) # Generate the backend, session and primitive - backend, session, prim = qutils.get_backend_session(args, - primitive, - num_qubits=feature_map.num_qubits) + backend, session, prim = qutils.get_backend_session( + args, primitive, num_qubits=feature_map.num_qubits + ) - # Get Optimizer - optimizer = qutils.get_optimizer( local_optimizer, max_iter=maxiter) + # Get Optimizer + optimizer = qutils.get_optimizer(local_optimizer, max_iter=maxiter) - # qc, input_params, weight_params = QNNCircuit(num_qubits=X_train.shape[1], feature_map=feature_map, ansatz=ansatz) qc, _, _ = QNNCircuit(num_qubits=X_train.shape[1], feature_map=feature_map, ansatz=ansatz) @@ -74,55 +91,75 @@ def compute_qnn(X_train, X_test, y_train, y_test, args, model='QNN', data_key = print(f"The number of parameters in your circuit is: {feature_map.num_parameters}") print(f"The number of ansatz parameters in your circuit is: {ansatz.num_parameters}") - if primitive == 'estimator': - if args['backend'] == 'simulator': - qnn = EstimatorQNN(circuit=qc, - input_params=feature_map.parameters, - weight_params=ansatz.parameters) + neural_network: EstimatorQNN | SamplerQNN + + if primitive == "estimator": + if args["backend"] == "simulator": + neural_network = EstimatorQNN( + circuit=qc, input_params=feature_map.parameters, weight_params=ansatz.parameters + ) else: pm = generate_preset_pass_manager(backend=backend, optimization_level=3) - qnn = EstimatorQNN(circuit=qc, - estimator=prim, - pass_manager=pm, - input_params=feature_map.parameters, - weight_params=ansatz.parameters) + neural_network = EstimatorQNN( + circuit=qc, + estimator=prim, + pass_manager=pm, + input_params=feature_map.parameters, + weight_params=ansatz.parameters, + ) # QNN maps inputs to [-1, +1] - qnn.forward(X_train[0, :], algorithm_globals.random.random(qnn.num_weights)) + neural_network.forward( + X_train[0, :], algorithm_globals.random.random(neural_network.num_weights) + ) else: # sampler=Sampler(backend=backend) # parity maps bitstrings to 0 or 1 def parity(x): return "{:b}".format(x).count("1") % 2 - output_shape = 2 # corresponds to the number of classes, possible outcomes of the (parity) mapping + + output_shape = ( + 2 # corresponds to the number of classes, possible outcomes of the (parity) mapping + ) # construct QNN - if 'simulator' in args['backend']: - qnn = SamplerQNN(circuit=qc, interpret=parity, output_shape=output_shape, - input_params=feature_map.parameters, - weight_params=ansatz.parameters) + if "simulator" in args["backend"]: + neural_network = SamplerQNN( + circuit=qc, + interpret=parity, + output_shape=output_shape, + input_params=feature_map.parameters, + weight_params=ansatz.parameters, + ) else: - pm = generate_preset_pass_manager(backend=backend, optimization_level=3) - qnn = SamplerQNN(circuit=qc, sampler=prim, - interpret=parity, output_shape=output_shape, - pass_manager=pm, input_params=feature_map.parameters, - weight_params=ansatz.parameters) - + pm = generate_preset_pass_manager(backend=backend, optimization_level=3) + neural_network = SamplerQNN( + circuit=qc, + sampler=prim, + interpret=parity, + output_shape=output_shape, + pass_manager=pm, + input_params=feature_map.parameters, + weight_params=ansatz.parameters, + ) + # construct classifier - qnn = NeuralNetworkClassifier(neural_network=qnn, optimizer=optimizer) - - # fit classifier to data + qnn = NeuralNetworkClassifier(neural_network=neural_network, optimizer=optimizer) + + # fit classifier to data model_fit = qnn.fit(X_train, y_train) hyperparameters = { - 'feature_map': feature_map.__class__.__name__, - 'ansatz': ansatz.__class__.__name__, - 'optimizer': optimizer.__class__.__name__, - 'optimizer_params': optimizer.settings, - # Add other hyperparameters as needed - } + "feature_map": feature_map.__class__.__name__, + "ansatz": ansatz.__class__.__name__, + "optimizer": optimizer.__class__.__name__, + "optimizer_params": optimizer.settings, + # Add other hyperparameters as needed + } model_params = hyperparameters y_predicted = qnn.predict(X_test) - + if not isinstance(session, type(None)): session.close() - - return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose)) \ No newline at end of file + + return modeleval( + y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose + ) diff --git a/qbiocode/learning/compute_qsvc.py b/qbiocode/learning/compute_qsvc.py index a3d05f7..7421afe 100644 --- a/qbiocode/learning/compute_qsvc.py +++ b/qbiocode/learning/compute_qsvc.py @@ -1,37 +1,55 @@ -import time -import numpy as np +import time from typing import Literal +import numpy as np +from qiskit.circuit.library import PauliFeatureMap, ZFeatureMap, ZZFeatureMap +from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager +from qiskit_aer import AerSimulator +from qiskit_ibm_runtime import QiskitRuntimeService +from qiskit_ibm_runtime import SamplerV2 as Sampler +from qiskit_machine_learning.algorithms import QSVC, PegasosQSVC +from qiskit_machine_learning.kernels import FidelityQuantumKernel + +# from qiskit.primitives import Sampler +from qiskit_machine_learning.state_fidelities import ComputeUncompute +from sklearn.model_selection import GridSearchCV + +import qbiocode.utils.qutils as qutils + # ====== Additional local imports ====== from qbiocode.evaluation.model_evaluation import modeleval -import qbiocode.utils.qutils as qutils # ====== Scikit-learn imports ====== -from sklearn.model_selection import GridSearchCV # ====== Qiskit imports ====== -from qiskit.circuit.library import ZZFeatureMap -from qiskit.circuit.library import ZZFeatureMap, ZFeatureMap, PauliFeatureMap -from qiskit_aer import AerSimulator -#from qiskit.primitives import Sampler -from qiskit_machine_learning.state_fidelities import ComputeUncompute -from qiskit_machine_learning.kernels import FidelityQuantumKernel -from qiskit_machine_learning.algorithms import QSVC, PegasosQSVC -from qiskit_ibm_runtime import QiskitRuntimeService, SamplerV2 as Sampler -from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager -def compute_qsvc(X_train, X_test, y_train, y_test, args, model='QSVC', data_key = '', - C=1, gamma='scale', pegasos=False, encoding: Literal['ZZ', 'Z', 'P']="ZZ", - entanglement='linear', primitive = 'sampler', reps = 2, verbose=False, local_optimizer = ''): +def compute_qsvc( + X_train, + X_test, + y_train, + y_test, + args, + model="QSVC", + data_key="", + C=1, + gamma="scale", + pegasos=False, + encoding: Literal["ZZ", "Z", "P"] = "ZZ", + entanglement="linear", + primitive="sampler", + reps=2, + verbose=False, + local_optimizer="", +): """ This function computes a quantum support vector classifier (QSVC) using the Qiskit Machine Learning library. It takes training and testing datasets, along with various parameters to configure the QSVC model. It initializes the quantum feature map, sets up the backend and session, and fits the QSVC model to the training data. It then predicts the labels for the test data and evaluates the model's performance. The function returns the performance results, including accuracy, F1-score, AUC, runtime, as well as model parameters, and other relevant metrics. - + Args: X_train (np.ndarray): Training feature set. X_test (np.ndarray): Testing feature set. @@ -53,48 +71,50 @@ def compute_qsvc(X_train, X_test, y_train, y_test, args, model='QSVC', data_key modeleval (dict): A dictionary containing the evaluation results, including accuracy, runtime, model parameters, and other relevant metrics. """ beg_time = time.time() - - - # choose a method for mapping your features onto the circuit - feature_map, _ = qutils.get_feature_map(feature_map=encoding, - feat_dimension=X_train.shape[1], - reps = reps, - entanglement=entanglement) + # choose a method for mapping your features onto the circuit + feature_map, _ = qutils.get_feature_map( + feature_map=encoding, feat_dimension=X_train.shape[1], reps=reps, entanglement=entanglement + ) # Generate the backend, session and primitive - backend, session, prim = qutils.get_backend_session(args, - primitive, - num_qubits=feature_map.num_qubits) - + backend, session, prim = qutils.get_backend_session( + args, primitive, num_qubits=feature_map.num_qubits + ) + print(f"Currently running a quantum support vector classifier (QSVC) on this dataset.") print(f"The number of qubits in your circuit is: {feature_map.num_qubits}") print(f"The number of parameters in your circuit is: {feature_map.num_parameters}") - - if 'simulator' == args['backend']: + + if "simulator" == args["backend"]: fidelity = ComputeUncompute(sampler=prim) - else: + else: # Need to instatiate a basic pass manager to store the chosen hardware backend - pm = generate_preset_pass_manager(backend=backend, optimization_level=3) - fidelity = ComputeUncompute(sampler=prim, pass_manager=pm) #, num_virtual_qubits = feature_map.num_qubits ) - + pm = generate_preset_pass_manager(backend=backend, optimization_level=3) + fidelity = ComputeUncompute( + sampler=prim, pass_manager=pm + ) # , num_virtual_qubits = feature_map.num_qubits ) + Qkernel = FidelityQuantumKernel(fidelity=fidelity, feature_map=feature_map) if pegasos == True: - qsvc = PegasosQSVC(C=C, gamma=gamma, quantum_kernel=Qkernel) + qsvc = PegasosQSVC(C=C, quantum_kernel=Qkernel) else: qsvc = QSVC(C=C, gamma=gamma, quantum_kernel=Qkernel) - + model_fit = qsvc.fit(X_train, y_train) # model_params = model_fit.get_params() - hyperparameters = {'feature_map': feature_map.__class__.__name__, - 'quantum_kernel': Qkernel.__class__.__name__, - 'C': C, - 'gamma': gamma, - } + hyperparameters = { + "feature_map": feature_map.__class__.__name__, + "quantum_kernel": Qkernel.__class__.__name__, + "C": C, + "gamma": gamma, + } model_params = hyperparameters - y_predicted = qsvc.predict(X_test) + y_predicted = qsvc.predict(X_test) if not isinstance(session, type(None)): session.close() - return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose)) + return modeleval( + y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose + ) diff --git a/qbiocode/learning/compute_rf.py b/qbiocode/learning/compute_rf.py index 8abc0e1..24e1cbd 100644 --- a/qbiocode/learning/compute_rf.py +++ b/qbiocode/learning/compute_rf.py @@ -1,10 +1,8 @@ # ====== Base class imports ====== import time -import numpy as np - -# ====== Scikit-learn imports ====== +import numpy as np from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import GridSearchCV from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier @@ -12,19 +10,46 @@ # ====== Additional local imports ====== from qbiocode.evaluation.model_evaluation import modeleval +# ====== Scikit-learn imports ====== + + # ====== Begin functions ====== -def compute_rf(X_train, X_test, y_train, y_test, args, verbose=False, model='Random Forest', data_key = '', - n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, - min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, - bootstrap=True, oob_score=False, n_jobs=None, random_state=None, warm_start=False, - class_weight=None, ccp_alpha=0.0, max_samples=None, monotonic_cst=None): - - """ + +def compute_rf( + X_train, + X_test, + y_train, + y_test, + args, + verbose=False, + model="Random Forest", + data_key="", + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + monotonic_cst=None, +): + """ This function generates a model using a Random Forest (RF) Classifier method as implemented in - `scikit-learn `_. + `scikit-learn `__. It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed. - The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model + The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model. This function is designed to be used in a supervised learning context, where the goal is to classify data points. @@ -56,34 +81,64 @@ def compute_rf(X_train, X_test, y_train, y_test, args, verbose=False, model='Ran Returns: modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken for training and validation. - """ - + """ + beg_time = time.time() - rf = OneVsOneClassifier(RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, - min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, - max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease, - bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, - warm_start=warm_start, class_weight=class_weight, - ccp_alpha=ccp_alpha, max_samples=max_samples, monotonic_cst=monotonic_cst)) + rf = OneVsOneClassifier( + RandomForestClassifier( + n_estimators=n_estimators, + criterion=criterion, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + max_leaf_nodes=max_leaf_nodes, + min_impurity_decrease=min_impurity_decrease, + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + warm_start=warm_start, + class_weight=class_weight, + ccp_alpha=ccp_alpha, + max_samples=max_samples, + monotonic_cst=monotonic_cst, + ) + ) # Fit the training datset model_fit = rf.fit(X_train, y_train) model_params = model_fit.get_params() # Validate the model in test dataset and calculate accuracy - y_predicted = rf.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose)) - -def compute_rf_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, model='Random Forest', - bootstrap= [], max_depth= [], max_features= [], - min_samples_leaf= [], min_samples_split= [], n_estimators= []): - - """ + y_predicted = rf.predict(X_test) + return modeleval( + y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose + ) + + +def compute_rf_opt( + X_train, + X_test, + y_train, + y_test, + args, + verbose=False, + cv=5, + model="Random Forest", + bootstrap=[], + max_depth=[], + max_features=[], + min_samples_leaf=[], + min_samples_split=[], + n_estimators=[], +): + """ This function also generates a model using a Random Forest (RF) Classifier method as implemented in - `scikit-learn `_. + `scikit-learn `__. The difference here is that this function runs a grid search. The range of the grid search for each parameter is specified in the config.yaml file. The combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar datasets, without having to run the grid search. - The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model + The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search. This function is designed to be used in a supervised learning context, where the goal is to classify data points. @@ -104,19 +159,20 @@ def compute_rf_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, n_estimators (list): List of number of estimators options for grid search. Returns: - modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken for training and validation. + modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken for training and validation. + + """ - """ - beg_time = time.time() - params={'n_estimators': n_estimators, - 'max_features': max_features, - 'max_depth': max_depth, - 'min_samples_split': min_samples_split, - 'min_samples_leaf': min_samples_leaf, - 'bootstrap': bootstrap - } - + params = { + "n_estimators": n_estimators, + "max_features": max_features, + "max_depth": max_depth, + "min_samples_split": min_samples_split, + "min_samples_leaf": min_samples_leaf, + "bootstrap": bootstrap, + } + # Perform Grid Search to find the best parameters grid_search = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=cv) grid_search.fit(X_train, y_train) @@ -128,4 +184,4 @@ def compute_rf_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, # Make predictions and calculate accuracy y_predicted = best_rf.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)) \ No newline at end of file + return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose) diff --git a/qbiocode/learning/compute_svc.py b/qbiocode/learning/compute_svc.py index 4dcc8ed..fd47c7b 100644 --- a/qbiocode/learning/compute_svc.py +++ b/qbiocode/learning/compute_svc.py @@ -2,23 +2,44 @@ import time -# ====== Scikit-learn imports ====== - -from sklearn.svm import SVC from sklearn.model_selection import GridSearchCV from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier +from sklearn.svm import SVC # ====== Additional local imports ====== from qbiocode.evaluation.model_evaluation import modeleval - -def compute_svc(X_train, X_test, y_train, y_test, args, model='SVC', data_key = '', C=1.0, kernel='rbf', - degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, - class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None): - - """ This function generates a model using a Support Vector Classifier (SVC) method as implemented in - `scikit-learn `_. + +# ====== Scikit-learn imports ====== + + +def compute_svc( + X_train, + X_test, + y_train, + y_test, + args, + model="SVC", + data_key="", + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=0.001, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, +): + """This function generates a model using a Support Vector Classifier (SVC) method as implemented in + `scikit-learn `__. It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed. - The model is trained on the training dataset and validated on the test dataset. The model is trained on the training dataset and validated on the test dataset. + The model is trained on the training dataset and validated on the test dataset. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model. This function is designed to be used in a supervised learning context, where the goal is to classify data points. @@ -47,29 +68,56 @@ def compute_svc(X_train, X_test, y_train, y_test, args, model='SVC', data_key = random_state (int or None): Controls the randomness of the estimator, default is None. Returns: modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken to train and validate the model. - """ - + """ + beg_time = time.time() - svc = OneVsOneClassifier(SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, - probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight, - max_iter=max_iter, decision_function_shape=decision_function_shape, - break_ties=break_ties, random_state=random_state)) + svc = OneVsOneClassifier( + SVC( + C=C, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + shrinking=shrinking, + probability=probability, + tol=tol, + cache_size=cache_size, + class_weight=class_weight, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) + ) # Fit the training datset model_fit = svc.fit(X_train, y_train) model_params = model_fit.get_params() # Validate the model in test dataset and calculate accuracy - y_predicted = svc.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose)) + y_predicted = svc.predict(X_test) + return modeleval( + y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose + ) + -def compute_svc_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, model='SVC', - C=[], gamma=[], kernel=[]): - - """ This function generates a model using a Support Vector Classifier (SVC) method as implemented in - `scikit-learn `_. +def compute_svc_opt( + X_train, + X_test, + y_train, + y_test, + args, + verbose=False, + cv=5, + model="SVC", + C=[], + gamma=[], + kernel=[], +): + """This function generates a model using a Support Vector Classifier (SVC) method as implemented in + `scikit-learn `__. It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed. The combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar datasets, without having to run the grid search. - The model is trained on the training dataset and validated on the test dataset. The model is trained on the training dataset and validated on the test dataset. + The model is trained on the training dataset and validated on the test dataset. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search. This function is designed to be used in a supervised learning context, where the goal is to classify data points. @@ -87,13 +135,10 @@ def compute_svc_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, kernel (list or str): Specifies the kernel type(s) to be used in the algorithm, default is an empty list. Returns: modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search. - """ + """ beg_time = time.time() - params={'C': C, - 'gamma': gamma, - 'kernel': kernel - } + params = {"C": C, "gamma": gamma, "kernel": kernel} # Perform Grid Search to find the best parameters grid_search = GridSearchCV(SVC(), param_grid=params, cv=cv) grid_search.fit(X_train, y_train) @@ -105,4 +150,4 @@ def compute_svc_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, # Make predictions and calculate accuracy y_predicted = best_svc.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)) \ No newline at end of file + return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose) diff --git a/qbiocode/learning/compute_vqc.py b/qbiocode/learning/compute_vqc.py index 2719515..da21f82 100644 --- a/qbiocode/learning/compute_vqc.py +++ b/qbiocode/learning/compute_vqc.py @@ -2,18 +2,35 @@ import time from typing import Literal -# ====== Additional local imports ====== -from qbiocode.evaluation.model_evaluation import modeleval -import qbiocode.utils.qutils as qutils +# from qiskit.primitives import Sampler +from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager # ====== Qiskit imports ====== from qiskit_machine_learning.algorithms.classifiers import VQC -#from qiskit.primitives import Sampler -from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager -def compute_vqc(X_train, X_test, y_train, y_test, args, verbose=False, model='VQC', data_key = '', - local_optimizer: Literal['COBYLA', 'L_BFGS_B', 'GradientDescent']="COBYLA", maxiter=100, - encoding = 'Z', entanglement = 'linear', reps= 2,primitive = 'sampler', ansatz_type='amp'): +import qbiocode.utils.qutils as qutils + +# ====== Additional local imports ====== +from qbiocode.evaluation.model_evaluation import modeleval + + +def compute_vqc( + X_train, + X_test, + y_train, + y_test, + args, + verbose=False, + model="VQC", + data_key="", + local_optimizer: Literal["COBYLA", "L_BFGS_B", "GradientDescent"] = "COBYLA", + maxiter=100, + encoding="Z", + entanglement="linear", + reps=2, + primitive="sampler", + ansatz_type="amp", +): """ This function computes a Variational Quantum Classifier (VQC) using the Qiskit Machine Learning library. It takes training and testing datasets, along with various parameters to configure the VQC model. @@ -41,48 +58,59 @@ def compute_vqc(X_train, X_test, y_train, y_test, args, verbose=False, model='VQ dict: Evaluation results including accuracy, time taken, and model parameters. """ beg_time = time.time() - # choose a method for mapping your features onto the circuit - feature_map, _ = qutils.get_feature_map(feature_map=encoding, - feat_dimension=X_train.shape[1], - reps = reps, - entanglement=entanglement) + # choose a method for mapping your features onto the circuit + feature_map, _ = qutils.get_feature_map( + feature_map=encoding, feat_dimension=X_train.shape[1], reps=reps, entanglement=entanglement + ) # get ansatz - ansatz= qutils.get_ansatz( ansatz_type=ansatz_type, feat_dimension = feature_map.num_qubits, reps=reps, entanglement=entanglement) - + ansatz = qutils.get_ansatz( + ansatz_type=ansatz_type, + feat_dimension=feature_map.num_qubits, + reps=reps, + entanglement=entanglement, + ) # Generate the backend, session and primitive - backend, session, prim = qutils.get_backend_session(args, - primitive, - num_qubits=feature_map.num_qubits) - - # Get Optimizer - optimizer = qutils.get_optimizer( local_optimizer, max_iter=maxiter) - - # instantiate the primitive - if 'simulator' == args['backend']: - vqc= VQC(sampler=prim, feature_map=feature_map, ansatz=ansatz, optimizer=optimizer) + backend, session, prim = qutils.get_backend_session( + args, primitive, num_qubits=feature_map.num_qubits + ) + + # Get Optimizer + optimizer = qutils.get_optimizer(local_optimizer, max_iter=maxiter) + + # instantiate the primitive + if "simulator" == args["backend"]: + vqc = VQC(sampler=prim, feature_map=feature_map, ansatz=ansatz, optimizer=optimizer) else: pm = generate_preset_pass_manager(backend=backend, optimization_level=3) - vqc= VQC(sampler=prim, feature_map=feature_map, ansatz=ansatz, optimizer=optimizer, pass_manager=pm) + vqc = VQC( + sampler=prim, + feature_map=feature_map, + ansatz=ansatz, + optimizer=optimizer, + pass_manager=pm, + ) print(f"Currently running a variational quantum classifer (VQC) on this dataset.") print(f"The number of qubits in your circuit is: {feature_map.num_qubits}") print(f"The number of parameters in your circuit is: {feature_map.num_parameters}") - + # fit classifier to data model_fit = vqc.fit(X_train, y_train) hyperparameters = { - 'feature_map': feature_map.__class__.__name__, - 'ansatz': ansatz.__class__.__name__, - 'optimizer': optimizer.__class__.__name__, - 'optimizer_params': optimizer.settings, - # Add other hyperparameters as needed - } + "feature_map": feature_map.__class__.__name__, + "ansatz": ansatz.__class__.__name__, + "optimizer": optimizer.__class__.__name__, + "optimizer_params": optimizer.settings, + # Add other hyperparameters as needed + } model_params = hyperparameters y_predicted = vqc.predict(X_test) if not isinstance(session, type(None)): session.close() - return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose)) \ No newline at end of file + return modeleval( + y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose + ) diff --git a/qbiocode/learning/compute_xgb.py b/qbiocode/learning/compute_xgb.py index 2ae0227..ef446eb 100644 --- a/qbiocode/learning/compute_xgb.py +++ b/qbiocode/learning/compute_xgb.py @@ -1,12 +1,14 @@ # ====== Base class imports ====== import time + import numpy as np # ====== Scikit-learn imports ====== try: from xgboost import XGBClassifier + XGBOOST_AVAILABLE = True _XGBOOST_ERROR = None except Exception as e: @@ -23,10 +25,25 @@ # ====== Begin functions ====== -def compute_xgb(X_train, X_test, y_train, y_test, args, verbose=False, model='xgb', data_key = '', - n_estimators=100, *, criterion='gini', max_depth=None, subsample=0.5, learning_rate=0.5, - colsample_bytree=1, min_child_weight=1): - + +def compute_xgb( + X_train, + X_test, + y_train, + y_test, + args, + verbose=False, + model="xgb", + data_key="", + n_estimators=100, + *, + criterion="gini", + max_depth=None, + subsample=0.5, + learning_rate=0.5, + colsample_bytree=1, + min_child_weight=1, +): """ This function generates a model using an Extreme Gradient Boositing (xgb) Classifier method as implemented in xgboost. It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed. @@ -51,12 +68,12 @@ def compute_xgb(X_train, X_test, y_train, y_test, args, verbose=False, model='xg min_child_weight (int) : Minimum sum of instance weight (hessian) needed in a child. Default is 1 Returns: modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken for training and validation. - + Raises: ImportError: If XGBoost is not properly installed or configured. """ - + if not XGBOOST_AVAILABLE: error_msg = ( "XGBoost is not properly installed or configured.\n" @@ -68,28 +85,53 @@ def compute_xgb(X_train, X_test, y_train, y_test, args, verbose=False, model='xg "See installation documentation for more details." ) raise ImportError(error_msg) - + beg_time = time.time() - xgb = OneVsOneClassifier(XGBClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, # type: ignore - subsample=subsample, learning_rate=learning_rate, colsample_bytree=colsample_bytree, - min_child_weight=min_child_weight)) + xgb = OneVsOneClassifier( + XGBClassifier( + n_estimators=n_estimators, + criterion=criterion, + max_depth=max_depth, # type: ignore + subsample=subsample, + learning_rate=learning_rate, + colsample_bytree=colsample_bytree, + min_child_weight=min_child_weight, + ) + ) # Fit the training datset model_fit = xgb.fit(X_train, y_train) model_params = model_fit.get_params() # Validate the model in test dataset and calculate accuracy - y_predicted = xgb.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose)) - -def compute_xgb_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, model='xgb', - bootstrap= [], max_depth= [], max_features= [],learning_rate=[],subsample = [], colsample_bytree = [] - , n_estimators= [], min_child_weight = []): - - """ + y_predicted = xgb.predict(X_test) + return modeleval( + y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose + ) + + +def compute_xgb_opt( + X_train, + X_test, + y_train, + y_test, + args, + verbose=False, + cv=5, + model="xgb", + bootstrap=[], + max_depth=[], + max_features=[], + learning_rate=[], + subsample=[], + colsample_bytree=[], + n_estimators=[], + min_child_weight=[], +): + """ This function generates a model using an Extreme Gradient Boositing (xgb) Classifier method as implemented in xgboost. The difference here is that this function runs a grid search. The range of the grid search for each parameter is specified in the config.yaml file. The combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar datasets, without having to run the grid search. - The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model + The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search. This function is designed to be used in a supervised learning context, where the goal is to classify data points. @@ -111,12 +153,12 @@ def compute_xgb_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, min_child_weight (list): List of minimum sum of instance weight (hessian) needed in a childoptions for grid search. Returns: - modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken for training and validation. + modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken for training and validation. Raises: ImportError: If XGBoost is not properly installed or configured. """ - + if not XGBOOST_AVAILABLE: error_msg = ( "XGBoost is not properly installed or configured.\n" @@ -128,17 +170,18 @@ def compute_xgb_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, "See installation documentation for more details." ) raise ImportError(error_msg) - + beg_time = time.time() - params={'n_estimators': n_estimators, - 'max_depth': max_depth, - 'learning_rate' : learning_rate, - 'subsample' : subsample, - 'colsample_bytree' : colsample_bytree, - 'min_child_weight' : min_child_weight, - 'bootstrap': bootstrap - } - + params = { + "n_estimators": n_estimators, + "max_depth": max_depth, + "learning_rate": learning_rate, + "subsample": subsample, + "colsample_bytree": colsample_bytree, + "min_child_weight": min_child_weight, + "bootstrap": bootstrap, + } + # Perform Grid Search to find the best parameters grid_search = GridSearchCV(XGBClassifier(), param_grid=params, cv=cv) # type: ignore grid_search.fit(X_train, y_train) @@ -150,4 +193,4 @@ def compute_xgb_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, # Make predictions and calculate accuracy y_predicted = best_xgb.predict(X_test) - return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)) \ No newline at end of file + return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose) diff --git a/qbiocode/utils/__init__.py b/qbiocode/utils/__init__.py index 40c0fe5..cdd23d9 100644 --- a/qbiocode/utils/__init__.py +++ b/qbiocode/utils/__init__.py @@ -6,7 +6,7 @@ model management, IBM Quantum account handling, and result analysis. Available Functions ------------------- +------------------- - scaler_fn: Data scaling and normalization - feature_encoding: Encode features for quantum circuits - qml_winner: Identify best performing quantum model @@ -34,52 +34,46 @@ >>> X_encoded = feature_encoding(X, feature_encoding='OneHotEncoder') """ -from .helper_fn import scaler_fn, feature_encoding -from .qc_winner_finder import qml_winner +from .combine_evals_results import combine_results, track_progress from .dataset_checkpoint import checkpoint_restart -from .combine_evals_results import track_progress, combine_results from .find_duplicates import find_duplicate_files from .find_string import find_string_in_files from .generate_qml_configs import generate_qml_experiment_configs +from .helper_fn import feature_encoding, scaler_fn from .ibm_account import get_creds, instantiate_runtime_service +from .qc_winner_finder import qml_winner from .qutils import ( + get_ansatz, get_backend_session, - get_sampler, get_estimator, - get_ansatz, get_feature_map, get_optimizer, + get_sampler, ) __all__ = [ # Data preprocessing - 'scaler_fn', - 'feature_encoding', - + "scaler_fn", + "feature_encoding", # Model management - 'qml_winner', - 'checkpoint_restart', - + "qml_winner", + "checkpoint_restart", # Results management - 'track_progress', - 'combine_results', - + "track_progress", + "combine_results", # Configuration generation - 'generate_qml_experiment_configs', - + "generate_qml_experiment_configs", # File utilities - 'find_duplicate_files', - 'find_string_in_files', - + "find_duplicate_files", + "find_string_in_files", # IBM Quantum utilities - 'get_creds', - 'instantiate_runtime_service', - + "get_creds", + "instantiate_runtime_service", # Quantum utilities - 'get_backend_session', - 'get_sampler', - 'get_estimator', - 'get_ansatz', - 'get_feature_map', - 'get_optimizer', + "get_backend_session", + "get_sampler", + "get_estimator", + "get_ansatz", + "get_feature_map", + "get_optimizer", ] diff --git a/qbiocode/utils/combine_evals_results.py b/qbiocode/utils/combine_evals_results.py index 16ed1ac..7e09e9b 100644 --- a/qbiocode/utils/combine_evals_results.py +++ b/qbiocode/utils/combine_evals_results.py @@ -8,25 +8,26 @@ """ import os +from typing import List, Optional, Tuple + import pandas as pd -from typing import List, Tuple, Optional def track_progress( input_dataset_dir: str, current_results_dir: str, - completion_marker: str = 'RawDataEvaluation.csv', + completion_marker: str = "RawDataEvaluation.csv", prefix_length: int = 8, - input_extension: str = 'csv', - verbose: bool = True + input_extension: str = "csv", + verbose: bool = True, ) -> Tuple[List[str], int, int]: """ Track progress of a computational job by checking for completed datasets. - + This function scans the results directory for completed datasets (identified by the presence of a specific marker file) and compares against the total number of input datasets to determine how many remain to be processed. - + Parameters ---------- input_dataset_dir : str @@ -38,13 +39,13 @@ def track_progress( Default is 'RawDataEvaluation.csv'. prefix_length : int, optional Number of characters to skip from the beginning of directory names - when extracting dataset identifiers. Default is 8 (e.g., skips 'dataset_' + when extracting dataset identifiers. Default is 8 (e.g., skips ``dataset_`` prefix). input_extension : str, optional File extension of input datasets (without dot). Default is 'csv'. verbose : bool, optional If True, prints progress information. Default is True. - + Returns ------- completed_datasets : List[str] @@ -53,7 +54,7 @@ def track_progress( Number of completed datasets. num_remaining : int Number of datasets remaining to be processed. - + Examples -------- >>> from qbiocode.utils import track_progress @@ -64,7 +65,7 @@ def track_progress( The completed datasets are: ['dataset1', 'dataset2'] You have finished running program on 2 out of a total of 10 input datasets. You have 8 input datasets left before program finishes. - + >>> # Custom completion marker >>> completed, done, remaining = track_progress( ... input_dataset_dir='data/inputs', @@ -74,7 +75,7 @@ def track_progress( ... ) """ completed_files = [] - + # Scan results directory for completed datasets for entry in os.scandir(current_results_dir): if entry.is_dir(): @@ -83,42 +84,44 @@ def track_progress( # Extract dataset identifier by skipping prefix dataset_id = entry.name[prefix_length:] if prefix_length > 0 else entry.name completed_files.append(dataset_id) - + # Count total input datasets num_input_datasets = [] for file in os.listdir(input_dataset_dir): if file.endswith(input_extension): num_input_datasets.append(file) - + num_completed = len(completed_files) num_total = len(num_input_datasets) num_remaining = num_total - num_completed - + if verbose: - print(f'The completed datasets are: {completed_files}') - print(f'You have finished running program on {num_completed} out of a total of {num_total} input datasets.') - print(f'You have {num_remaining} input datasets left before program finishes.') - + print(f"The completed datasets are: {completed_files}") + print( + f"You have finished running program on {num_completed} out of a total of {num_total} input datasets." + ) + print(f"You have {num_remaining} input datasets left before program finishes.") + return completed_files, num_completed, num_remaining def combine_results( prev_results_dir: str, recent_results_dir: str, - eval_file_prefix: str = 'Raw', - results_file_prefix: str = 'Model', - output_eval_file: str = 'RawDataEvaluation_Combined.csv', - output_results_file: str = 'ModelResults_Combined.csv', + eval_file_prefix: str = "Raw", + results_file_prefix: str = "Model", + output_eval_file: str = "RawDataEvaluation_Combined.csv", + output_results_file: str = "ModelResults_Combined.csv", save_intermediate: bool = True, - verbose: bool = True + verbose: bool = True, ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Combine results from interrupted and resumed computational jobs. - + This function merges CSV files from a previous (interrupted) job run with files from a recent (resumed) job run. It's useful when a long-running computational job needs to be restarted and you want to combine all results. - + Parameters ---------- prev_results_dir : str @@ -132,7 +135,7 @@ def combine_results( results_file_prefix : str, optional Prefix of model results files to combine. Default is 'Model'. output_eval_file : str, optional - Name of the combined evaluation output file. + Name of the combined evaluation output file. Default is 'RawDataEvaluation_Combined.csv'. output_results_file : str, optional Name of the combined results output file. @@ -142,14 +145,14 @@ def combine_results( Default is True. verbose : bool, optional If True, prints shape information during processing. Default is True. - + Returns ------- combined_eval_df : pd.DataFrame Combined dataframe of all evaluation/assessment data. combined_results_df : pd.DataFrame Combined dataframe of all model results. - + Examples -------- >>> from qbiocode.utils import combine_results @@ -159,7 +162,7 @@ def combine_results( ... ) >>> print(f"Combined {len(eval_df)} evaluation records") >>> print(f"Combined {len(results_df)} result records") - + >>> # Custom file prefixes and output names >>> eval_df, results_df = combine_results( ... prev_results_dir='results/old', @@ -169,7 +172,7 @@ def combine_results( ... output_eval_file='AllEvaluations.csv', ... output_results_file='AllResults.csv' ... ) - + Notes ----- The function expects: @@ -182,7 +185,7 @@ def combine_results( previous_combined_eval_df = [] results_dfs = [] previous_combined_result_df = [] - + # Collect all individual CSV files from previous run subdirectories for entry in os.scandir(prev_results_dir): if entry.is_dir(): @@ -193,30 +196,32 @@ def combine_results( if file.startswith(results_file_prefix): results_csv_files = os.path.join(entry, file) results_dfs.append(results_csv_files) - + # Read and collect all previous evaluation dataframes for evalfile in eval_dfs: df1 = pd.read_csv(evalfile) previous_combined_eval_df.append(df1) - + # Read and collect all previous results dataframes for resultsfile in results_dfs: df2 = pd.read_csv(resultsfile) previous_combined_result_df.append(df2) - + # Concatenate all previous dataframes concat_previous_eval_df = pd.concat(previous_combined_eval_df, ignore_index=True) concat_previous_result_df = pd.concat(previous_combined_result_df, ignore_index=True) - + # Optionally save intermediate combined files if save_intermediate: - concat_previous_eval_df.to_csv(f'{eval_file_prefix}DataEvaluation_previous.csv', index=False) - concat_previous_result_df.to_csv(f'{results_file_prefix}Results_previous.csv', index=False) - + concat_previous_eval_df.to_csv( + f"{eval_file_prefix}DataEvaluation_previous.csv", index=False + ) + concat_previous_result_df.to_csv(f"{results_file_prefix}Results_previous.csv", index=False) + # Read recent (resumed run) dataframes recent_eval_df = None recent_results_df = None - + for file in os.listdir(recent_results_dir): if file.startswith(eval_file_prefix): recent_eval_csv_file = os.path.join(recent_results_dir, file) @@ -226,7 +231,7 @@ def combine_results( recent_results_csv_file = os.path.join(recent_results_dir, file) recent_results_df = pd.read_csv(recent_results_csv_file, index_col=0) recent_results_df.reset_index(drop=True, inplace=True) - + # Verify that recent dataframes were found if recent_eval_df is None: raise FileNotFoundError( @@ -236,39 +241,41 @@ def combine_results( raise FileNotFoundError( f"No results file starting with '{results_file_prefix}' found in {recent_results_dir}" ) - + if verbose: print(f"Recent evaluation dataframe shape: {recent_eval_df.shape}") print(f"Previous evaluation dataframe shape: {concat_previous_eval_df.shape}") print(f"Recent results dataframe shape: {recent_results_df.shape}") print(f"Previous results dataframe shape: {concat_previous_result_df.shape}") - + # Combine previous and recent dataframes new_combined_eval_df = pd.concat([concat_previous_eval_df, recent_eval_df], ignore_index=True) - new_combined_result_df = pd.concat([concat_previous_result_df, recent_results_df], ignore_index=True) - + new_combined_result_df = pd.concat( + [concat_previous_result_df, recent_results_df], ignore_index=True + ) + # Save final combined dataframes new_combined_eval_df.to_csv(output_eval_file, index=False) new_combined_result_df.to_csv(output_results_file, index=False) - + if verbose: print(f"\nCombined evaluation dataframe shape: {new_combined_eval_df.shape}") print(f"Combined results dataframe shape: {new_combined_result_df.shape}") print(f"\nSaved combined files:") print(f" - {output_eval_file}") print(f" - {output_results_file}") - + return new_combined_eval_df, new_combined_result_df # Example usage (commented out to prevent execution at import time): -# +# # # Track progress of current job # completed, done, remaining = track_progress( # input_dataset_dir='data/inputs', # current_results_dir='results/current_run' # ) -# +# # # Combine results from interrupted and resumed runs # eval_df, results_df = combine_results( # prev_results_dir='results/run1_interrupted', diff --git a/qbiocode/utils/dataset_checkpoint.py b/qbiocode/utils/dataset_checkpoint.py index c034a49..14a0fde 100644 --- a/qbiocode/utils/dataset_checkpoint.py +++ b/qbiocode/utils/dataset_checkpoint.py @@ -11,22 +11,22 @@ def checkpoint_restart( previous_results_dir: str, - completion_marker: str = 'RawDataEvaluation.csv', + completion_marker: str = "RawDataEvaluation.csv", prefix_length: int = 8, - verbose: bool = False + verbose: bool = False, ) -> List[str]: """ Identify completed datasets from a previous run to enable checkpoint restart. - + This function scans a results directory to find which datasets were fully processed in a previous run by checking for the presence of a completion marker file. This allows you to resume interrupted batch processing jobs without reprocessing completed datasets. - + The function assumes that each dataset has its own subdirectory in the results directory, and that a specific file (completion marker) is created when processing completes successfully. - + Parameters ---------- previous_results_dir : str @@ -37,43 +37,43 @@ def checkpoint_restart( Default is 'RawDataEvaluation.csv' (used by QProfiler). prefix_length : int, optional Number of characters to strip from the beginning of directory names to get - the dataset name. Default is 8 (strips 'dataset_' prefix used by QProfiler). + the dataset name. Default is 8 (strips ``dataset_`` prefix used by QProfiler). Set to 0 to use the full directory name. verbose : bool, optional If True, print the list of completed datasets and count. Default is False. - + Returns ------- List[str] List of dataset names that were fully processed in the previous run. These can be excluded when restarting the batch job. - + Examples -------- Basic usage with QProfiler default settings: - + >>> completed = checkpoint_restart('/path/to/previous_results') >>> print(f"Found {len(completed)} completed datasets") - + Resume processing only incomplete datasets: - + >>> import os >>> all_datasets = [f for f in os.listdir('/path/to/data') if f.endswith('.csv')] >>> completed = checkpoint_restart('/path/to/previous_results') >>> remaining = [d for d in all_datasets if d not in completed] >>> print(f"Need to process {len(remaining)} more datasets") - + Custom completion marker and no prefix stripping: - + >>> completed = checkpoint_restart( ... '/path/to/results', ... completion_marker='ModelResults.csv', ... prefix_length=0, ... verbose=True ... ) - + Integration with QProfiler batch processing: - + >>> from qbiocode.utils.dataset_checkpoint import checkpoint_restart >>> >>> # Get list of completed datasets from previous run @@ -91,34 +91,30 @@ def checkpoint_restart( >>> >>> # Run QProfiler only on remaining datasets >>> # (use datasets_to_process in your batch processing loop) - + Notes ----- - The function only checks for the presence of the completion marker file, not its contents or validity - When restarting, you may need to manually combine results from the previous and current runs - - Directory names are expected to have a consistent prefix (e.g., 'dataset_') + - Directory names are expected to have a consistent prefix (e.g., ``dataset_``) that can be stripped using the prefix_length parameter - Non-directory entries in previous_results_dir are ignored - + See Also -------- qbiocode.evaluation.model_run : Main QProfiler batch processing function """ completed_files = [] - + # Validate input directory if not os.path.exists(previous_results_dir): - raise FileNotFoundError( - f"Previous results directory not found: {previous_results_dir}" - ) - + raise FileNotFoundError(f"Previous results directory not found: {previous_results_dir}") + if not os.path.isdir(previous_results_dir): - raise NotADirectoryError( - f"Path is not a directory: {previous_results_dir}" - ) - + raise NotADirectoryError(f"Path is not a directory: {previous_results_dir}") + # Scan for completed datasets for entry in os.scandir(previous_results_dir): if entry.is_dir(): @@ -131,10 +127,10 @@ def checkpoint_restart( else: dataset_name = entry.name completed_files.append(dataset_name) - + if verbose: print(f"Found {len(completed_files)} completed datasets:") for dataset in sorted(completed_files): print(f" - {dataset}") - + return completed_files diff --git a/qbiocode/utils/find_duplicates.py b/qbiocode/utils/find_duplicates.py index 31efe30..9e695d1 100644 --- a/qbiocode/utils/find_duplicates.py +++ b/qbiocode/utils/find_duplicates.py @@ -5,9 +5,9 @@ useful for cleaning up redundant configuration files or identifying duplicate datasets. """ -import os import itertools -from typing import List, Tuple, Optional +import os +from typing import List, Optional, Tuple def find_duplicate_files( @@ -15,22 +15,22 @@ def find_duplicate_files( file_pattern: Optional[str] = None, ignore_empty_lines: bool = True, case_sensitive: bool = True, - verbose: bool = False + verbose: bool = False, ) -> List[Tuple[str, str]]: """ Find files with identical content in a directory. - + Scans the specified directory for files and compares their content line by line. Identifies files that have identical content, even if they have different names. Optionally filters files by pattern and provides various comparison options. - + This is particularly useful for: - + - Finding duplicate configuration files (e.g., YAML, JSON) - Identifying redundant experiment configurations - Cleaning up duplicate datasets before batch processing - Validating file uniqueness in automated workflows - + Parameters ---------- directory : str @@ -44,13 +44,13 @@ def find_duplicate_files( If True, comparison is case-sensitive. Default is True. verbose : bool, optional If True, print progress information during comparison. Default is False. - + Returns ------- List[Tuple[str, str]] List of tuples, where each tuple contains paths of two duplicate files. Returns empty list if no duplicates are found. - + Raises ------ FileNotFoundError @@ -59,17 +59,17 @@ def find_duplicate_files( If the specified path is not a directory. PermissionError If files cannot be read due to permission issues. - + Examples -------- Find all duplicate files in a directory: - + >>> duplicates = find_duplicate_files("configs/") >>> if duplicates: ... print(f"Found {len(duplicates)} duplicate pairs") - + Find duplicate YAML configuration files: - + >>> duplicates = find_duplicate_files( ... "configs/qml_gridsearch/", ... file_pattern='.yaml', @@ -77,17 +77,17 @@ def find_duplicate_files( ... ) >>> for file1, file2 in duplicates: ... print(f"Duplicate: {file1} == {file2}") - + Case-insensitive comparison: - + >>> duplicates = find_duplicate_files( ... "data/", ... file_pattern='.txt', ... case_sensitive=False ... ) - + Integration with QProfiler workflow: - + >>> # Check for duplicate configs before batch processing >>> config_dir = "configs/experiments/" >>> duplicates = find_duplicate_files(config_dir, file_pattern='.yaml') @@ -97,7 +97,7 @@ def find_duplicate_files( ... for f1, f2 in duplicates: ... print(f" {os.path.basename(f1)} == {os.path.basename(f2)}") ... # Optionally remove duplicates or warn user - + Notes ----- - Files are compared line by line after sorting (order-independent) @@ -105,7 +105,7 @@ def find_duplicate_files( - Large files may consume significant memory during comparison - Symbolic links are followed and treated as regular files - Hidden files (starting with '.') are included in comparison - + See Also -------- find_string_in_files : Search for specific strings across multiple files @@ -114,10 +114,10 @@ def find_duplicate_files( # Validate input directory if not os.path.exists(directory): raise FileNotFoundError(f"Directory not found: {directory}") - + if not os.path.isdir(directory): raise NotADirectoryError(f"Path is not a directory: {directory}") - + # Collect files to compare files = [] for entry in os.scandir(directory): @@ -125,53 +125,55 @@ def find_duplicate_files( # Apply file pattern filter if specified if file_pattern is None or entry.name.endswith(file_pattern): files.append(entry.path) - + if verbose: print(f"Comparing {len(files)} files in {directory}") if file_pattern: print(f"Filtering by pattern: {file_pattern}") - + # Find duplicates by comparing all pairs duplicates = [] total_comparisons = len(list(itertools.combinations(files, 2))) - + for idx, (file1, file2) in enumerate(itertools.combinations(files, 2)): if verbose and idx % 100 == 0: print(f"Progress: {idx}/{total_comparisons} comparisons") - + try: # Read and process file contents - with open(file1, 'r', encoding='utf-8') as f1: + with open(file1, "r", encoding="utf-8") as f1: content1 = f1.readlines() - with open(file2, 'r', encoding='utf-8') as f2: + with open(file2, "r", encoding="utf-8") as f2: content2 = f2.readlines() - + # Filter empty lines if requested if ignore_empty_lines: content1 = [line for line in content1 if line.strip()] content2 = [line for line in content2 if line.strip()] - + # Apply case sensitivity if not case_sensitive: content1 = [line.lower() for line in content1] content2 = [line.lower() for line in content2] - + # Sort for order-independent comparison content1_sorted = sorted(content1) content2_sorted = sorted(content2) - + # Compare contents if content1_sorted == content2_sorted: duplicates.append((file1, file2)) if verbose: - print(f" Duplicate found: {os.path.basename(file1)} == {os.path.basename(file2)}") - + print( + f" Duplicate found: {os.path.basename(file1)} == {os.path.basename(file2)}" + ) + except (UnicodeDecodeError, PermissionError) as e: if verbose: print(f" Warning: Could not read {file1} or {file2}: {e}") continue - + if verbose: print(f"\nFound {len(duplicates)} duplicate file pairs") - - return duplicates \ No newline at end of file + + return duplicates diff --git a/qbiocode/utils/find_string.py b/qbiocode/utils/find_string.py index 2ef4ad3..08568fb 100644 --- a/qbiocode/utils/find_string.py +++ b/qbiocode/utils/find_string.py @@ -7,7 +7,7 @@ """ import os -from typing import List, Dict, Optional, Tuple +from typing import Dict, List, Optional, Tuple def find_string_in_files( @@ -16,16 +16,16 @@ def find_string_in_files( file_pattern: Optional[str] = None, case_sensitive: bool = True, return_lines: bool = False, - verbose: bool = True + verbose: bool = True, ) -> Dict[str, List[Tuple[int, str]]]: """ Search for a specific string in all files within a directory. - + Scans files in the specified directory and identifies which files contain the search string. Optionally returns the matching lines with line numbers. Useful for auditing configurations, finding specific parameters, or validating settings across multiple files. - + Parameters ---------- directory : str @@ -41,33 +41,33 @@ def find_string_in_files( If True, return matching lines with line numbers. Default is False. verbose : bool, optional If True, print progress and results. Default is True. - + Returns ------- Dict[str, List[Tuple[int, str]]] Dictionary mapping file paths to list of (line_number, line_content) tuples for files containing the search string. If return_lines is False, the list contains empty tuples. - + Raises ------ FileNotFoundError If the specified directory does not exist. NotADirectoryError If the specified path is not a directory. - + Examples -------- Basic search for a string: - + >>> results = find_string_in_files( ... 'configs/', ... 'embeddings: none' ... ) >>> print(f"Found in {len(results)} files") - + Search with line numbers returned: - + >>> results = find_string_in_files( ... 'configs/qml_gridsearch/', ... 'n_qubits: 4', @@ -78,18 +78,18 @@ def find_string_in_files( ... print(f"{filepath}:") ... for line_num, line_content in matches: ... print(f" Line {line_num}: {line_content.strip()}") - + Case-insensitive search: - + >>> results = find_string_in_files( ... 'logs/', ... 'error', ... file_pattern='.log', ... case_sensitive=False ... ) - + Integration with QProfiler workflow: - + >>> # Find all configs using a specific embedding >>> config_dir = "configs/experiments/" >>> results = find_string_in_files( @@ -103,14 +103,14 @@ def find_string_in_files( ... print(f"Found {len(results)} configs using PCA embedding") ... for config_file in results.keys(): ... print(f" - {os.path.basename(config_file)}") - + Notes ----- - Only text files are supported; binary files will be skipped - Large files may consume significant memory if return_lines=True - Symbolic links are followed and treated as regular files - Hidden files (starting with '.') are included in search - + See Also -------- find_duplicate_files : Find files with identical content @@ -119,44 +119,44 @@ def find_string_in_files( # Validate input directory if not os.path.exists(directory): raise FileNotFoundError(f"Directory not found: {directory}") - + if not os.path.isdir(directory): raise NotADirectoryError(f"Path is not a directory: {directory}") - + # Prepare search string for case-insensitive search search_str = search_string if case_sensitive else search_string.lower() - + # Results dictionary results = {} total_files = 0 files_with_match = 0 - + # Scan directory for entry in os.scandir(directory): if entry.is_file(): # Apply file pattern filter if specified if file_pattern is not None and not entry.name.endswith(file_pattern): continue - + total_files += 1 - + try: - with open(entry.path, 'r', encoding='utf-8') as f: + with open(entry.path, "r", encoding="utf-8") as f: matches = [] for line_num, line in enumerate(f, start=1): # Apply case sensitivity line_to_search = line if case_sensitive else line.lower() - + if search_str in line_to_search: if return_lines: matches.append((line_num, line)) else: - matches.append((0, '')) # Placeholder - + matches.append((0, "")) # Placeholder + if matches: results[entry.path] = matches files_with_match += 1 - + if verbose: if return_lines: print(f"\n{entry.path} contains '{search_string}':") @@ -164,12 +164,12 @@ def find_string_in_files( print(f" Line {line_num}: {line_content.rstrip()}") else: print(f"{entry.path} contains '{search_string}'") - + except (UnicodeDecodeError, PermissionError) as e: if verbose: print(f"Warning: Could not read {entry.path}: {e}") continue - + # Print summary if verbose: print(f"\n{'='*60}") @@ -179,5 +179,5 @@ def find_string_in_files( if file_pattern: print(f" File pattern filter: {file_pattern}") print(f"{'='*60}") - + return results diff --git a/qbiocode/utils/generate_qml_configs.py b/qbiocode/utils/generate_qml_configs.py index 203d253..b901ce6 100644 --- a/qbiocode/utils/generate_qml_configs.py +++ b/qbiocode/utils/generate_qml_configs.py @@ -5,13 +5,14 @@ for systematic hyperparameter tuning of quantum machine learning models. """ +import itertools import os import re -import itertools -from typing import List, Dict, Any, Optional, Tuple -import yaml -import pandas as pd +from typing import Any, Dict, List, Optional, Tuple, cast + import numpy as np +import pandas as pd +import yaml def generate_qml_experiment_configs( @@ -30,15 +31,15 @@ def generate_qml_experiment_configs( embeddings: Optional[List[str]] = None, data_sample_fraction: float = 1.0, used_files_path: Optional[str] = None, - random_seed: Optional[int] = None + random_seed: Optional[int] = None, ) -> Tuple[int, str]: """ Generate YAML configuration files for quantum ML hyperparameter grid search. - + This function creates multiple configuration files by combining different hyperparameter values for quantum machine learning models (QNN, VQC, QSVC). Each configuration file can be used with QProfiler to run systematic experiments. - + Parameters ---------- template_config_path : str @@ -73,16 +74,16 @@ def generate_qml_experiment_configs( Path to CSV file tracking previously used data files. random_seed : int, optional Random seed for reproducible file sampling. - + Returns ------- Tuple[int, str] Number of configuration files generated and path to used files CSV. - + Examples -------- >>> from qbiocode.utils import generate_qml_experiment_configs - >>> + >>> >>> # Generate configs for quantum model grid search >>> num_configs, used_files = generate_qml_experiment_configs( ... template_config_path='configs/config.yaml', @@ -94,7 +95,7 @@ def generate_qml_experiment_configs( ... data_sample_fraction=0.1 # Use 10% of files for testing ... ) >>> print(f"Generated {num_configs} configuration files") - + Notes ----- - Quantum models (QNN, VQC, QSVC) don't support automated grid search @@ -104,24 +105,24 @@ def generate_qml_experiment_configs( * QSVC uses only 'amp' ansatz and 'COBYLA' optimizer * QNN/VQC don't use the C parameter - Embedding is set to 'none' when n_components >= original feature count - + See Also -------- qbiocode.apps.qprofiler : Main profiling application """ # Set default hyperparameter values if qmethods is None: - qmethods = ['qnn', 'vqc', 'qsvc'] + qmethods = ["qnn", "vqc", "qsvc"] if reps is None: reps = [1, 2] if optimizers is None: - optimizers = ['COBYLA', 'SPSA'] + optimizers = ["COBYLA", "SPSA"] if entanglements is None: - entanglements = ['linear', 'full'] + entanglements = ["linear", "full"] if feature_maps is None: - feature_maps = ['Z', 'ZZ'] + feature_maps = ["Z", "ZZ"] if ansatz_types is None: - ansatz_types = ['amp', 'esu2'] + ansatz_types = ["amp", "esu2"] if n_components is None: n_components = [5, 10] if Cs is None: @@ -129,132 +130,153 @@ def generate_qml_experiment_configs( if max_iters is None: max_iters = [100, 500] if embeddings is None: - embeddings = ['none', 'pca', 'lle', 'isomap', 'spectral', 'umap', 'nmf'] - + embeddings = ["none", "pca", "lle", "isomap", "spectral", "umap", "nmf"] + # Set random seed if provided if random_seed is not None: np.random.seed(random_seed) - + # Create output directory if it doesn't exist os.makedirs(output_dir, exist_ok=True) - + # Set up used files tracking if used_files_path is None: - used_files_path = os.path.join(output_dir, 'used_data_files.csv') - + used_files_path = os.path.join(output_dir, "used_data_files.csv") + # Generate all hyperparameter combinations - param_grid = [qmethods, reps, optimizers, entanglements, feature_maps, - ansatz_types, n_components, Cs, max_iters, embeddings] - + param_grid = [ + qmethods, + reps, + optimizers, + entanglements, + feature_maps, + ansatz_types, + n_components, + Cs, + max_iters, + embeddings, + ] + param_combinations = pd.DataFrame( - list(itertools.product(*param_grid)), - columns=['method', 'reps', 'local_optimizer', 'entanglement', - 'feature_map', 'ansatz_type', 'n_components', 'C', - 'max_iter', 'embedding'] + list(itertools.product(*(cast(List[Any], values) for values in param_grid))), + columns=[ + "method", + "reps", + "local_optimizer", + "entanglement", + "feature_map", + "ansatz_type", + "n_components", + "C", + "max_iter", + "embedding", + ], ) - + # Apply model-specific constraints - param_combinations.loc[param_combinations['method'].isin(['qnn', 'vqc']), 'C'] = 1 - param_combinations.loc[param_combinations['method'].isin(['qsvc']), 'ansatz_type'] = 'amp' - param_combinations.loc[param_combinations['method'].isin(['qsvc']), 'max_iter'] = 100 - param_combinations.loc[param_combinations['method'].isin(['qsvc']), 'local_optimizer'] = 'COBYLA' - + param_combinations.loc[param_combinations["method"].isin(["qnn", "vqc"]), "C"] = 1 + param_combinations.loc[param_combinations["method"].isin(["qsvc"]), "ansatz_type"] = "amp" + param_combinations.loc[param_combinations["method"].isin(["qsvc"]), "max_iter"] = 100 + param_combinations.loc[param_combinations["method"].isin(["qsvc"]), "local_optimizer"] = ( + "COBYLA" + ) + # Remove duplicates and apply filtering rules param_combinations = param_combinations.drop_duplicates() param_combinations = param_combinations[ - ~((param_combinations['n_components'] >= 10) & (param_combinations['max_iter'] < 500)) + ~((param_combinations["n_components"] >= 10) & (param_combinations["max_iter"] < 500)) ] param_combinations = param_combinations[ - ~((param_combinations['reps'] > 1) & (param_combinations['n_components'] <= 10)) + ~((param_combinations["reps"] > 1) & (param_combinations["n_components"] <= 10)) ] - + # Load template configuration - with open(template_config_path, 'r') as f: + with open(template_config_path, "r") as f: cfg_template = yaml.safe_load(f) - + # Load or initialize used files list if os.path.exists(used_files_path): used_files = pd.read_csv(used_files_path).iloc[:, 0].tolist() else: used_files = [] - + # Generate configuration files config_idx = 1 - + for data_dir in data_dirs: # Get all CSV files in directory - csv_files = [f for f in os.listdir(data_dir) if f.endswith('.csv')] + csv_files = [f for f in os.listdir(data_dir) if f.endswith(".csv")] csv_files.sort() - + # Remove previously used files csv_files = list(set(csv_files) - set(used_files)) - + # Sample files if requested if data_sample_fraction < 1.0: n_files = max(1, int(len(csv_files) * data_sample_fraction)) csv_files = list(np.random.choice(csv_files, n_files, replace=False)) - + # Update used files list used_files.extend(csv_files) - + # Filter parameter combinations based on data type param_subset = param_combinations.copy() - if ('moons' in data_dir) or ('circles' in data_dir): - param_subset = param_subset[param_subset['embedding'] == 'none'] + if ("moons" in data_dir) or ("circles" in data_dir): + param_subset = param_subset[param_subset["embedding"] == "none"] else: - param_subset = param_subset[param_subset['embedding'] != 'none'] - + param_subset = param_subset[param_subset["embedding"] != "none"] + # Generate config for each combination and file for _, params in param_subset.iterrows(): for csv_file in csv_files: - config_path = os.path.join(output_dir, f'exp_{config_idx}.yaml') + config_path = os.path.join(output_dir, f"exp_{config_idx}.yaml") key = f"{params['method']}_{csv_file.replace('.csv', '')}" - + # Create config from template config = cfg_template.copy() - config['yaml'] = config_path - config['model'] = [params['method']] - config['file_dataset'] = csv_file - config['folder_path'] = data_dir.replace('data/', '') - config['hydra'] = config.get('hydra', {}) - config['hydra']['run'] = config['hydra'].get('run', {}) - config['hydra']['run']['dir'] = os.path.join('results', f'qmlgridsearch_{key}') - + config["yaml"] = config_path + config["model"] = [params["method"]] + config["file_dataset"] = csv_file + config["folder_path"] = data_dir.replace("data/", "") + config["hydra"] = config.get("hydra", {}) + config["hydra"]["run"] = config["hydra"].get("run", {}) + config["hydra"]["run"]["dir"] = os.path.join("results", f"qmlgridsearch_{key}") + # Check if embedding should be 'none' based on feature count df = pd.read_csv(os.path.join(data_dir, csv_file)) orig_features = df.shape[1] - 1 # Subtract label column - - if params['n_components'] >= orig_features: - config['embeddings'] = ['none'] + + if params["n_components"] >= orig_features: + config["embeddings"] = ["none"] else: - config['embeddings'] = [params['embedding']] - - config['n_components'] = params['n_components'] - + config["embeddings"] = [params["embedding"]] + + config["n_components"] = params["n_components"] + # Set method-specific parameters method_args_key = f"{params['method']}_args" if method_args_key not in config: config[method_args_key] = {} - - config[method_args_key]['reps'] = int(params['reps']) - config[method_args_key]['entanglement'] = params['entanglement'] - config[method_args_key]['encoding'] = params['feature_map'] - - if params['method'] != 'qsvc': - config[method_args_key]['ansatz_type'] = params['ansatz_type'] - config[method_args_key]['maxiter'] = int(params['max_iter']) + + config[method_args_key]["reps"] = int(params["reps"]) + config[method_args_key]["entanglement"] = params["entanglement"] + config[method_args_key]["encoding"] = params["feature_map"] + + if params["method"] != "qsvc": + config[method_args_key]["ansatz_type"] = params["ansatz_type"] + config[method_args_key]["maxiter"] = int(params["max_iter"]) else: - config[method_args_key]['C'] = float(params['C']) - config[method_args_key]['local_optimizer'] = params['local_optimizer'] - + config[method_args_key]["C"] = float(params["C"]) + config[method_args_key]["local_optimizer"] = params["local_optimizer"] + # Write configuration file - with open(config_path, 'w') as f: + with open(config_path, "w") as f: yaml.dump(config, f, default_flow_style=False) - + config_idx += 1 - + # Save used files list - pd.Series(used_files).to_csv(used_files_path, index=False, header=['filename']) - + pd.Series(used_files).to_csv(used_files_path, index=False, header=["filename"]) + num_configs = config_idx - 1 return num_configs, used_files_path diff --git a/qbiocode/utils/helper_fn.py b/qbiocode/utils/helper_fn.py index 8d25dc7..7eeb932 100644 --- a/qbiocode/utils/helper_fn.py +++ b/qbiocode/utils/helper_fn.py @@ -11,47 +11,46 @@ import time from typing import Literal -# ====== Scikit-learn imports ====== +from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler -from sklearn.preprocessing import StandardScaler, MinMaxScaler -from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder +# ====== Scikit-learn imports ====== -def scaler_fn(X, scaling: Literal['None', 'StandardScaler', 'MinMaxScaler'] = "None"): +def scaler_fn(X, scaling: Literal["None", "StandardScaler", "MinMaxScaler"] = "None"): """ Apply scaling transformation to input data. - + Scales the input data using one of three methods: no scaling, standard scaling (z-score normalization), or min-max scaling to [0, 1] range. - + Parameters ---------- X : array-like of shape (n_samples, n_features) Input data to be scaled. scaling : {'None', 'StandardScaler', 'MinMaxScaler'}, default='None' Scaling method to apply: - + - 'None': No scaling, returns original data - 'StandardScaler': Standardize features by removing mean and scaling to unit variance - 'MinMaxScaler': Scale features to [0, 1] range - + Returns ------- X_scaled : array-like of shape (n_samples, n_features) Scaled data. If scaling='None', returns original data unchanged. - + Notes ----- StandardScaler transforms data to have mean=0 and variance=1: - + .. math:: z = \\frac{x - \\mu}{\\sigma} - + MinMaxScaler transforms data to [0, 1] range: - + .. math:: x_{scaled} = \\frac{x - x_{min}}{x_{max} - x_{min}} - + Examples -------- >>> import numpy as np @@ -59,16 +58,16 @@ def scaler_fn(X, scaling: Literal['None', 'StandardScaler', 'MinMaxScaler'] = "N >>> X = np.array([[1, 2], [3, 4], [5, 6]]) >>> X_scaled = scaler_fn(X, scaling='StandardScaler') >>> X_minmax = scaler_fn(X, scaling='MinMaxScaler') - + See Also -------- sklearn.preprocessing.StandardScaler : Standardize features sklearn.preprocessing.MinMaxScaler : Scale features to a range """ - if scaling == 'MinMaxScaler': + if scaling == "MinMaxScaler": scaler = MinMaxScaler() return scaler.fit_transform(X) - elif scaling == 'StandardScaler': + elif scaling == "StandardScaler": scaler = StandardScaler() return scaler.fit_transform(X) else: # scaling == 'None' @@ -78,15 +77,15 @@ def scaler_fn(X, scaling: Literal['None', 'StandardScaler', 'MinMaxScaler'] = "N def feature_encoding( feature1, sparse_output=False, - feature_encoding: Literal['None', 'OneHotEncoder', 'OrdinalEncoder'] = "None" + feature_encoding: Literal["None", "OneHotEncoder", "OrdinalEncoder"] = "None", ): """ Encode categorical features using various encoding strategies. - + Transforms categorical features into numerical representations suitable for machine learning algorithms. Supports one-hot encoding, ordinal encoding, or no encoding. - + Parameters ---------- feature1 : array-like of shape (n_samples,) @@ -96,29 +95,29 @@ def feature_encoding( If False, returns a dense array. Ignored for other encoding methods. feature_encoding : {'None', 'OneHotEncoder', 'OrdinalEncoder'}, default='None' Encoding method to apply: - + - 'None': No encoding, returns original feature - 'OneHotEncoder': Create binary columns for each category - 'OrdinalEncoder': Map categories to integer values - + Returns ------- feature1_encoded : array-like Encoded feature. Shape depends on encoding method: - + - 'None': shape (n_samples, 1) - 'OrdinalEncoder': shape (n_samples, 1) - 'OneHotEncoder': shape (n_samples, n_categories) - + Notes ----- One-hot encoding creates a binary column for each unique category, useful when categories have no ordinal relationship. Ordinal encoding assigns integer values, suitable when categories have a natural order. - + The function automatically reshapes the input to (-1, 1) format required by scikit-learn encoders. - + Examples -------- >>> import numpy as np @@ -128,16 +127,16 @@ def feature_encoding( >>> encoded_onehot = feature_encoding(categories, feature_encoding='OneHotEncoder') >>> # Ordinal encoding >>> encoded_ordinal = feature_encoding(categories, feature_encoding='OrdinalEncoder') - + See Also -------- sklearn.preprocessing.OneHotEncoder : Encode categorical features as one-hot sklearn.preprocessing.OrdinalEncoder : Encode categorical features as integers """ - if feature_encoding == 'OrdinalEncoder': + if feature_encoding == "OrdinalEncoder": encoder = OrdinalEncoder() return encoder.fit_transform(feature1.reshape(-1, 1)) - elif feature_encoding == 'OneHotEncoder': + elif feature_encoding == "OneHotEncoder": encoder = OneHotEncoder(sparse_output=sparse_output) return encoder.fit_transform(feature1.reshape(-1, 1)) else: # feature_encoding == 'None' @@ -147,11 +146,11 @@ def feature_encoding( def print_results(model, accuracy, f1, compile_time, params): """ Print formatted machine learning model evaluation results. - + Displays model performance metrics and parameters in a consistent, readable format. Useful for comparing multiple models during experimentation and benchmarking. - + Parameters ---------- model : str @@ -164,17 +163,17 @@ def print_results(model, accuracy, f1, compile_time, params): Time taken to train/compile the model, in seconds. params : dict Dictionary of model hyperparameters and configuration settings. - + Returns ------- None Prints results to stdout. - + Notes ----- The function formats floating-point numbers to 4 decimal places for consistency. All metrics are printed with descriptive labels. - + Examples -------- >>> from qbiocode.utils import print_results @@ -184,7 +183,7 @@ def print_results(model, accuracy, f1, compile_time, params): RandomForest Model F1 score: 0.9156 Time taken for RandomForest Model (secs): 2.3450 RandomForest Model Params: {'n_estimators': 100, 'max_depth': 10} - + See Also -------- sklearn.metrics.accuracy_score : Compute accuracy @@ -195,4 +194,5 @@ def print_results(model, accuracy, f1, compile_time, params): print(f"Time taken for {model} Model (secs): {compile_time:.4f}") print(f"{model} Model Params: ", params) + # Made with Bob diff --git a/qbiocode/utils/ibm_account.py b/qbiocode/utils/ibm_account.py index a1bd109..9464c79 100644 --- a/qbiocode/utils/ibm_account.py +++ b/qbiocode/utils/ibm_account.py @@ -1,8 +1,11 @@ # This will be a simple function to extract information from a user's qiskit-json file -import json, os +import json +import os + from qiskit_ibm_runtime import QiskitRuntimeService + def get_creds(args): """This function determines the user's IBM Quantum channel, instance, and token, using values provided within the config.yaml file or as defined within the user's qiskit configuration from provided qiskit_json_path @@ -19,34 +22,42 @@ def get_creds(args): Returns: rval (dict): A dictionary containing the IBM Quantum credentials, including 'channel', 'instance', 'token', and 'url'. """ - cred_source_dict = {'channel':'ibm_channel', 'instance':'ibm_instance', 'token':'ibm_token', 'url':'ibm_url'} + cred_source_dict = { + "channel": "ibm_channel", + "instance": "ibm_instance", + "token": "ibm_token", + "url": "ibm_url", + } rval = {} for ibm_name, yaml_name in cred_source_dict.items(): value = args.get(yaml_name, None) if value: rval[ibm_name] = value - qiskit_json_path = args.get('qiskit_json_path', None) + qiskit_json_path = args.get("qiskit_json_path", None) if qiskit_json_path: qiskit_json_path = os.path.expanduser(qiskit_json_path) if os.path.exists(qiskit_json_path): # load the qiskit json file - with open(qiskit_json_path, 'r') as jfile: + with open(qiskit_json_path, "r") as jfile: creds = json.load(jfile) # Access keys and values - # The items we want are actually in a nested dictionary, so we have to loop through the outer dictionary first, then the - # nested one. This nested dictionary (outer_value) is actually the value for the key in the parent dictionary. + # The items we want are actually in a nested dictionary, so we have to loop through the outer dictionary first, then the + # nested one. This nested dictionary (outer_value) is actually the value for the key in the parent dictionary. for outer_key, outer_value in creds.items(): - if 'name' in rval.keys() and outer_key == rval['name']: + if "name" in rval.keys() and outer_key == rval["name"]: for ibm_name in cred_source_dict.keys(): if ibm_name not in rval: value = outer_value.get(ibm_name, None) if value: rval[ibm_name] = value else: - print('IBM credentials not found! Please verify that the path to your qiskit-ibm.json file is correct.') + print( + "IBM credentials not found! Please verify that the path to your qiskit-ibm.json file is correct." + ) return rval + def instantiate_runtime_service(args): """This function provides a quick way to instantiate QiskitRuntimeService in one place. A basic call to this function can then be done in anywhere else. It uses the get_creds function to retrieve the necessary credentials from the qiskit-ibm.json file, with the file path specified in the config.yaml file. @@ -55,7 +66,7 @@ def instantiate_runtime_service(args): Args: args (dict): This passes the arguments from the config.yaml file. In this particular case, it is importing the path to the qiskit-ibm.json file (qiskit_json_path) and the credentials defined in this json file (ibm_channel, ibm_instance, ibm_token, ibm_url). - + Returns: QiskitRuntimeService: An instance of the QiskitRuntimeService class, initialized with the credentials from the qiskit-ibm.json file or the provided arguments. """ diff --git a/qbiocode/utils/qc_winner_finder.py b/qbiocode/utils/qc_winner_finder.py index 12c15df..ae3d040 100644 --- a/qbiocode/utils/qc_winner_finder.py +++ b/qbiocode/utils/qc_winner_finder.py @@ -1,94 +1,114 @@ ## function to find datasets where QML methods did better than classical -import pandas as pd -import numpy as np +import os + import matplotlib.pyplot as plt +import numpy as np import pandas as pd -import os + def qml_winner(results_df, rawevals_df, output_dir, tag): """This function finds data sets where QML was beneficial (higher F1 scores than CML) and create new .csv files - with the relevant evaluation and performance for these specific datasets, for further analysis. + with the relevant evaluation and performance for these specific datasets, for further analysis. It also computes the best results per method across all splits and the best results per dataset. It returns two DataFrames: one with the datasets where QML methods outperformed CML methods, and another with the evaluation scores for the best QML method for each of these datasets. It also saves these DataFrames as .csv files in the specified output directory. - + Args: results_df (pandas.DataFrame): Dataset in pandas corresponding to 'ModelResults.csv' rawevals_df (pandas.DataFrame): Dataset in pandas corresponding to 'RawDataEvaluation.csv' - Returns: + Returns: qml_winners (pandas.DataFrame): contais the input datasets for which at least one QML method performed better than CML. DataFrame contains the scores of all - the methods. - winner_eval_score (pandas.DataFrame): contains the input datasets, their evaluation, and scores for the + the methods. + winner_eval_score (pandas.DataFrame): contains the input datasets, their evaluation, and scores for the specific qml method that yielded the best score. """ - + # pass in the ML results df = results_df.copy() # pull in the raw evaluations rawevals = rawevals_df.copy() - #first, compute mean across all splits - if 'Model_Parameters' in df.columns: - df_across_split= df.groupby(['Dataset', 'embeddings', 'model', 'Model_Parameters'])['f1_score'].mean().reset_index() - else: + # first, compute mean across all splits + if "Model_Parameters" in df.columns: + df_across_split = ( + df.groupby(["Dataset", "embeddings", "model", "Model_Parameters"])["f1_score"] + .mean() + .reset_index() + ) + else: # if 'Model_Parameters' is not present, this means you ran a grid search and this column will be named 'BestParams_GridSearch' instead - df_across_split= df.groupby(['Dataset', 'embeddings', 'model', 'BestParams_GridSearch'])['f1_score'].mean().reset_index() - #now, extract the best results per method across embedding and iteration - df_best = df_across_split.groupby(['Dataset', 'model'])['f1_score'].max().reset_index() - #df_best = df_across_split.groupby(['Dataset', 'model', 'Model_Parameters'])['f1_score'].max().reset_index() - df_best.to_csv(( os.path.join( output_dir, tag +'_best_across_split.csv')), index=False) + df_across_split = ( + df.groupby(["Dataset", "embeddings", "model", "BestParams_GridSearch"])["f1_score"] + .mean() + .reset_index() + ) + # now, extract the best results per method across embedding and iteration + df_best = df_across_split.groupby(["Dataset", "model"])["f1_score"].max().reset_index() + # df_best = df_across_split.groupby(['Dataset', 'model', 'Model_Parameters'])['f1_score'].max().reset_index() + df_best.to_csv((os.path.join(output_dir, tag + "_best_across_split.csv")), index=False) # get summary accross all datasets - df_best_model_mean = df_best.groupby('model')['f1_score'].mean() - df_best_model_median = df_best.groupby('model')['f1_score'].median() - df_best_model_max = df_best.groupby('model')['f1_score'].max() - df_best_model_std = df_best.groupby('model')['f1_score'].std() - df_best_permodel_summary = pd.concat([df_best_model_mean, df_best_model_median, df_best_model_max, df_best_model_std], axis=1) - df_best_permodel_summary.columns = ['Mean_F1_Score', 'Median_F1_Score', 'Max_F1_Score', 'StandardDev_F1_Score'] - df_best_permodel_summary.to_csv(( os.path.join( output_dir, tag +'_best_permodel_summary.csv'))) + df_best_model_mean = df_best.groupby("model")["f1_score"].mean() + df_best_model_median = df_best.groupby("model")["f1_score"].median() + df_best_model_max = df_best.groupby("model")["f1_score"].max() + df_best_model_std = df_best.groupby("model")["f1_score"].std() + df_best_permodel_summary = pd.concat( + [df_best_model_mean, df_best_model_median, df_best_model_max, df_best_model_std], axis=1 + ) + df_best_permodel_summary.columns = [ + "Mean_F1_Score", + "Median_F1_Score", + "Max_F1_Score", + "StandardDev_F1_Score", + ] + df_best_permodel_summary.to_csv((os.path.join(output_dir, tag + "_best_permodel_summary.csv"))) # print(df_best_permodel_summary) - + # extract the best results per dataset - best_per_dataset = df_best.loc[df_best.groupby('Dataset')['f1_score'].idxmax()] + best_per_dataset = df_best.loc[df_best.groupby("Dataset")["f1_score"].idxmax()] # best_per_dataset = df_across_split.loc[df_across_split.groupby('Dataset')['f1_score'].idxmax()] # create list of qml methods - qml_list = ['QSVC', 'QNN', 'VQC', 'PQK'] + qml_list = ["QSVC", "QNN", "VQC", "PQK"] # qml_winner = df_best[df_best['Dataset'].isin(best_per_dataset[best_per_dataset['model'].isin(qml_list)]['Dataset'])] - qml_winner = df_across_split[df_across_split['Dataset'].isin(best_per_dataset[best_per_dataset['model'].isin(qml_list)]['Dataset'])] + qml_winner = df_across_split[ + df_across_split["Dataset"].isin( + best_per_dataset[best_per_dataset["model"].isin(qml_list)]["Dataset"] + ) + ] if not qml_winner.empty: - bestmethod = qml_winner.groupby('Dataset')['f1_score'].idxmax() + bestmethod = qml_winner.groupby("Dataset")["f1_score"].idxmax() qc_method_and_score = qml_winner.loc[bestmethod] - qml_winner.to_csv(( os.path.join( output_dir, tag +'_qml_winners.csv')), index=False) - dataset = list(qml_winner['Dataset'].unique()) - + qml_winner.to_csv((os.path.join(output_dir, tag + "_qml_winners.csv")), index=False) + dataset = list(qml_winner["Dataset"].unique()) + ####### # now let's find the raw data evaluations for the qml winner data sets # this wil produce another csv file that contains scores, evaluation, and qml method # for these "qml winners". winner_evals = [] for file in dataset: - eval = rawevals.loc[rawevals['Dataset'] == file] + eval = rawevals.loc[rawevals["Dataset"] == file] # print(eval) winner_evals.append(eval) winner_evals_df = pd.concat(winner_evals) - winner_evals_df.to_csv(( os.path.join( output_dir, tag +'_winner_evals.csv')), index=False) + winner_evals_df.to_csv((os.path.join(output_dir, tag + "_winner_evals.csv")), index=False) winner_scores_df = qc_method_and_score.iloc[:, -3:] - winner_scores_df.to_csv(( os.path.join( output_dir, tag +'_winner_score.csv')), index=False) + winner_scores_df.to_csv((os.path.join(output_dir, tag + "_winner_score.csv")), index=False) print(winner_scores_df) winner_eval_score = pd.concat([winner_evals_df, winner_scores_df], axis=1) - winner_eval_score.to_csv(( os.path.join( output_dir, tag +'_winner_eval_score.csv')), index=False) # contains dataset, evaluation, qml method, and average f1 score + winner_eval_score.to_csv( + (os.path.join(output_dir, tag + "_winner_eval_score.csv")), index=False + ) # contains dataset, evaluation, qml method, and average f1 score ####### - + # optional print statements - print('*** The number of qml winners is', len(dataset)) - print('*** The qml winners are:', dataset) - + print("*** The number of qml winners is", len(dataset)) + print("*** The qml winners are:", dataset) + return qml_winner, winner_eval_score, df_best - - else: - print('*** QML methods were outperformed by CML methods in all datasets ***') - - return + else: + print("*** QML methods were outperformed by CML methods in all datasets ***") + return diff --git a/qbiocode/utils/qutils.py b/qbiocode/utils/qutils.py index cdc333f..b36f760 100644 --- a/qbiocode/utils/qutils.py +++ b/qbiocode/utils/qutils.py @@ -6,15 +6,20 @@ import numpy as np import pandas as pd from qiskit.circuit.equivalence_library import SessionEquivalenceLibrary as sel -from qiskit.circuit.library import (EfficientSU2, PauliFeatureMap, - RealAmplitudes, TwoLocal, XGate, YGate, - ZFeatureMap, ZZFeatureMap) -from qiskit.primitives import StatevectorEstimator -from qiskit.primitives import StatevectorSampler +from qiskit.circuit.library import ( + EfficientSU2, + PauliFeatureMap, + RealAmplitudes, + TwoLocal, + XGate, + YGate, + ZFeatureMap, + ZZFeatureMap, +) +from qiskit.primitives import StatevectorEstimator, StatevectorSampler from qiskit.quantum_info import SparsePauliOp from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager -from qiskit_algorithms.optimizers import (COBYLA, L_BFGS_B, NFT, SPSA, - GradientDescent, spsa) +from qiskit_algorithms.optimizers import COBYLA, L_BFGS_B, NFT, SPSA, GradientDescent, spsa from qiskit_ibm_runtime import EstimatorOptions from qiskit_ibm_runtime import EstimatorV2 as Estimator from qiskit_ibm_runtime import SamplerOptions @@ -25,7 +30,7 @@ from qbiocode.utils.ibm_account import instantiate_runtime_service -def get_backend_session( args: dict, primitive : str, num_qubits : int ): +def get_backend_session(args: dict, primitive: str, num_qubits: int): """ This function to get the backend and session for the specified primitive. @@ -42,35 +47,33 @@ def get_backend_session( args: dict, primitive : str, num_qubits : int ): backend = None session = None prim = None - - if args['backend'] == 'simulator': - if primitive == 'estimator': + if args["backend"] == "simulator": + + if primitive == "estimator": # Estimator primitive - prim = StatevectorEstimator(seed=args['seed']) + prim = StatevectorEstimator(seed=args["seed"]) else: - prim = StatevectorSampler(seed = args['seed'], default_shots=args['shots']) - elif 'ibm' in args['backend']: + prim = StatevectorSampler(seed=args["seed"], default_shots=args["shots"]) + elif "ibm" in args["backend"]: service = instantiate_runtime_service(args) - if args['backend'] == 'ibm_least': - backend = service.least_busy(simulator=False, operational=True, min_num_qubits=num_qubits) + if args["backend"] == "ibm_least": + backend = service.least_busy( + simulator=False, operational=True, min_num_qubits=num_qubits + ) else: - backend = service.backend(name=args['backend']) + backend = service.backend(name=args["backend"]) session = Session(backend=backend) - - if primitive == 'sampler': - prim = get_sampler(mode = session, - shots = args['shots']) + + if primitive == "sampler": + prim = get_sampler(mode=session, shots=args["shots"]) else: - prim = get_estimator(mode = session, - shots = args['shots'], - resil_level=args['resil_level']) + prim = get_estimator(mode=session, shots=args["shots"], resil_level=args["resil_level"]) return backend, session, prim -def transpile_circuit( circuit, opt_level, backend, initial_layout, PT = False, - dd_sequence = 'XpXm'): +def transpile_circuit(circuit, opt_level, backend, initial_layout, PT=False, dd_sequence="XpXm"): """ This function transpiles the given quantum circuit based on the optimization level and backend. @@ -85,7 +88,7 @@ def transpile_circuit( circuit, opt_level, backend, initial_layout, PT = False, Returns: t_qc (QuantumCircuit): The transpiled quantum circuit. """ - if str(opt_level) == 'AI': + if str(opt_level) == "AI": pm = TranspilerService( backend_name=backend, ai="true", @@ -100,23 +103,24 @@ def transpile_circuit( circuit, opt_level, backend, initial_layout, PT = False, ) t_qc = pm.run(circuit) - return( t_qc) + return t_qc + + def get_observable(circuit, backend): observable = SparsePauliOp.from_list([("Z" * circuit.num_qubits, 1)]) # observable = SparsePauliOp.from_list([("Z" + "I" * (int(circuit.num_qubits) - 1), 0.5)]) - if 'ibm' in backend.name: - observable = observable.apply_layout(circuit.layout)#, num_qubits=backend.num_qubits) + if "ibm" in backend.name: + observable = observable.apply_layout(circuit.layout) # , num_qubits=backend.num_qubits) return observable def get_sampler( - mode = None, - shots = 1024, - dd = True, - dd_seq = 'XpXm', - PT = True, - ): - + mode=None, + shots=1024, + dd=True, + dd_seq="XpXm", + PT=True, +): """ This function creates a Sampler instance with specified options. @@ -130,7 +134,7 @@ def get_sampler( Returns: Sampler: An instance of the Sampler with the specified options. """ - + sampler_options = SamplerOptions() ## ERROR SUPPRESSION TESTING ### @@ -138,30 +142,28 @@ def get_sampler( if dd: sampler_options.dynamical_decoupling.enable = dd sampler_options.dynamical_decoupling.sequence_type = dd_seq - sampler_options.dynamical_decoupling.extra_slack_distribution = 'middle' - sampler_options.dynamical_decoupling.scheduling_method = 'alap' + sampler_options.dynamical_decoupling.extra_slack_distribution = "middle" + sampler_options.dynamical_decoupling.scheduling_method = "alap" if PT: sampler_options.twirling.enable_gates = True sampler_options.twirling.enable_measure = False - sampler_options.twirling.num_randomizations = 'auto' - sampler_options.twirling.shots_per_randomization = 'auto' - sampler_options.twirling.strategy = ( - "active-accum" ### TRY VARYING THIS ### - ) - + sampler_options.twirling.num_randomizations = "auto" + sampler_options.twirling.shots_per_randomization = "auto" + sampler_options.twirling.strategy = "active-accum" ### TRY VARYING THIS ### sampler = Sampler(mode=mode, options=sampler_options) - + return sampler + def get_estimator( - mode = None, - shots = 1024, - resil_level = 2, - dd = True, - dd_seq = 'XpXm', - PT = True, - ): + mode=None, + shots=1024, + resil_level=2, + dd=True, + dd_seq="XpXm", + PT=True, +): """ This function creates an Estimator instance with specified options. @@ -175,11 +177,11 @@ def get_estimator( Returns: Estimator: An instance of the Estimator with the specified options. """ - + experimental_opts = {} # experimental_opts["execution_path"] = "gen3-turbo" - estimator_options = EstimatorOptions(experimental = experimental_opts) + estimator_options = EstimatorOptions(experimental=experimental_opts) ## ERROR SUPPRESSION TESTING ### estimator_options.default_shots = shots @@ -187,24 +189,23 @@ def get_estimator( if dd: estimator_options.dynamical_decoupling.enable = dd estimator_options.dynamical_decoupling.sequence_type = dd_seq - estimator_options.dynamical_decoupling.extra_slack_distribution = 'middle' - estimator_options.dynamical_decoupling.scheduling_method = 'alap' + estimator_options.dynamical_decoupling.extra_slack_distribution = "middle" + estimator_options.dynamical_decoupling.scheduling_method = "alap" if PT: estimator_options.twirling.enable_gates = True estimator_options.twirling.enable_measure = False - estimator_options.twirling.num_randomizations = 'auto' - estimator_options.twirling.shots_per_randomization = 'auto' - estimator_options.twirling.strategy = ( - "active-accum" ### TRY VARYING THIS ### - ) - + estimator_options.twirling.num_randomizations = "auto" + estimator_options.twirling.shots_per_randomization = "auto" + estimator_options.twirling.strategy = "active-accum" ### TRY VARYING THIS ### + estimator = Estimator(mode=mode, options=estimator_options) return estimator -def get_ansatz( ansatz_type, feat_dimension, reps = 1, entanglement = 'linear'): + +def get_ansatz(ansatz_type, feat_dimension, reps=1, entanglement="linear"): """ This function returns an ansatz based on the specified type and parameters. - It supports 'esu2', 'amp', and 'twolocal' ansatz types, constructing it using the specified feature dimension, + It supports 'esu2', 'amp', and 'twolocal' ansatz types, constructing it using the specified feature dimension, number of repetitions, and entanglement type. Args: @@ -215,16 +216,16 @@ def get_ansatz( ansatz_type, feat_dimension, reps = 1, entanglement = 'linear'): Returns: ansatz: An instance of the specified ansatz type. """ - if(ansatz_type=='esu2'): - ansatz = EfficientSU2(feat_dimension, ['ry', 'rz'], entanglement, reps=reps) - elif ansatz_type == 'amp': - ansatz = RealAmplitudes(num_qubits=feat_dimension, reps=reps) - elif ansatz_type == 'twolocal': - ansatz = TwoLocal(feat_dimension, ['ry', 'rz'], 'cz', entanglement, reps=reps) + if ansatz_type == "esu2": + ansatz = EfficientSU2(feat_dimension, ["ry", "rz"], entanglement, reps=reps) + elif ansatz_type == "amp": + ansatz = RealAmplitudes(num_qubits=feat_dimension, reps=reps) + elif ansatz_type == "twolocal": + ansatz = TwoLocal(feat_dimension, ["ry", "rz"], "cz", entanglement, reps=reps) return ansatz -def get_feature_map( feature_map, feat_dimension, reps = 1, entanglement = 'linear', data_map_func = None ): +def get_feature_map(feature_map, feat_dimension, reps=1, entanglement="linear", data_map_func=None): """ This function returns a feature map based on the specified type and parameters. It supports 'Z', 'ZZ', and 'P' feature maps, constructing it using the specified feature dimension, @@ -240,29 +241,35 @@ def get_feature_map( feature_map, feat_dimension, reps = 1, entanglement = 'line feat_dimension (int): The number of qubits in the feature map. """ # Get Feature Map - if feature_map == 'Z': - feature_map = ZFeatureMap(feat_dimension,reps=reps, parameter_prefix='a', data_map_func = data_map_func) - elif feature_map == 'ZZ': - feature_map = ZZFeatureMap(feature_dimension=feat_dimension, - reps=reps, - entanglement=entanglement, - parameter_prefix='a', - data_map_func = data_map_func) - elif feature_map == 'P': - feature_map = PauliFeatureMap(feature_dimension=feat_dimension, - reps=reps, - entanglement=entanglement, - data_map_func = data_map_func) + if feature_map == "Z": + feature_map = ZFeatureMap( + feat_dimension, reps=reps, parameter_prefix="a", data_map_func=data_map_func + ) + elif feature_map == "ZZ": + feature_map = ZZFeatureMap( + feature_dimension=feat_dimension, + reps=reps, + entanglement=entanglement, + parameter_prefix="a", + data_map_func=data_map_func, + ) + elif feature_map == "P": + feature_map = PauliFeatureMap( + feature_dimension=feat_dimension, + reps=reps, + entanglement=entanglement, + data_map_func=data_map_func, + ) # print("The number of qubits is:", feature_map.num_qubits) # print("The number of parameters is:", feature_map.num_parameters) - - return feature_map, feat_dimension + return feature_map, feat_dimension -def get_optimizer( type = 'COBYLA', max_iter = 100, learning_rate_a = None, - perturbation_gamma = None, prior_iter = 0 ): +def get_optimizer( + type="COBYLA", max_iter=100, learning_rate_a=None, perturbation_gamma=None, prior_iter=0 +): """ This function returns an optimizer based on the specified type and parameters. It supports 'SPSA', 'COBYLA', 'GradientDescent', and 'L_BFGS_B' optimizer types, @@ -278,31 +285,33 @@ def get_optimizer( type = 'COBYLA', max_iter = 100, learning_rate_a = None, Returns: optimizer: An instance of the specified optimizer type. """ - if type == 'SPSA': + if type == "SPSA": if (learning_rate_a != None) & (perturbation_gamma != None): # set up the power series def learning_rate(): return spsa.powerseries(learning_rate_a, 0.602, 0) + gen = learning_rate() learning_rates = np.array([next(gen) for _ in range(max_iter + prior_iter)]) - learning_rates = learning_rates[prior_iter:(max_iter + prior_iter)] + learning_rates = learning_rates[prior_iter : (max_iter + prior_iter)] def perturbation(): return spsa.powerseries(0.2, perturbation_gamma) + gen = perturbation() perturbations = np.array([next(gen) for _ in range(max_iter + prior_iter)]) - perturbations = perturbations[prior_iter:(max_iter + prior_iter)] + perturbations = perturbations[prior_iter : (max_iter + prior_iter)] - optimizer=SPSA(maxiter=max_iter, - learning_rate= learning_rates, - perturbation= perturbations) + optimizer = SPSA( + maxiter=max_iter, learning_rate=learning_rates, perturbation=perturbations + ) else: - optimizer=SPSA(maxiter=max_iter) - elif type == 'COBYLA': - optimizer=COBYLA(maxiter=max_iter) - elif type == 'GradientDescent': - optimizer=GradientDescent(maxiter=max_iter) - elif type == 'L_BFGS_B': + optimizer = SPSA(maxiter=max_iter) + elif type == "COBYLA": + optimizer = COBYLA(maxiter=max_iter) + elif type == "GradientDescent": + optimizer = GradientDescent(maxiter=max_iter) + elif type == "L_BFGS_B": optimizer == L_BFGS_B(maxiter=max_iter) - + return optimizer diff --git a/qbiocode/visualization/__init__.py b/qbiocode/visualization/__init__.py index a833ded..741526f 100644 --- a/qbiocode/visualization/__init__.py +++ b/qbiocode/visualization/__init__.py @@ -7,7 +7,7 @@ comparisons between classical and quantum models. Available Functions ------------------- +------------------- - compute_results_correlation: Compute Spearman correlation between metrics - plot_results_correlation: Create correlation plots and visualizations @@ -21,6 +21,6 @@ from .visualize_correlation import compute_results_correlation, plot_results_correlation __all__ = [ - 'compute_results_correlation', - 'plot_results_correlation', + "compute_results_correlation", + "plot_results_correlation", ] diff --git a/qbiocode/visualization/visualize_correlation.py b/qbiocode/visualization/visualize_correlation.py index cd1bd4e..581865c 100644 --- a/qbiocode/visualization/visualize_correlation.py +++ b/qbiocode/visualization/visualize_correlation.py @@ -1,46 +1,45 @@ +import re +import matplotlib.colors as mcolors +import matplotlib.pyplot as plt +import numpy as np import pandas as pd +import seaborn as sns from scipy.stats import spearmanr from sklearn.metrics import r2_score -import re -import seaborn as sns -import matplotlib.pyplot as plt -import matplotlib.colors as mcolors -import numpy as np from sklearn.preprocessing import MinMaxScaler # Set publication-quality defaults for scientific journals -plt.rcParams['font.family'] = 'sans-serif' -plt.rcParams['font.sans-serif'] = ['Arial', 'DejaVu Sans', 'Helvetica', 'Liberation Sans'] -plt.rcParams['font.size'] = 11 -plt.rcParams['axes.labelsize'] = 12 -plt.rcParams['axes.titlesize'] = 13 -plt.rcParams['xtick.labelsize'] = 10 -plt.rcParams['ytick.labelsize'] = 10 -plt.rcParams['legend.fontsize'] = 10 -plt.rcParams['figure.titlesize'] = 13 -plt.rcParams['axes.linewidth'] = 1.2 -plt.rcParams['xtick.major.width'] = 1.2 -plt.rcParams['ytick.major.width'] = 1.2 -plt.rcParams['xtick.minor.width'] = 0.8 -plt.rcParams['ytick.minor.width'] = 0.8 -plt.rcParams['xtick.major.size'] = 5 -plt.rcParams['ytick.major.size'] = 5 -plt.rcParams['xtick.minor.size'] = 3 -plt.rcParams['ytick.minor.size'] = 3 -plt.rcParams['savefig.dpi'] = 600 -plt.rcParams['savefig.bbox'] = 'tight' -plt.rcParams['savefig.pad_inches'] = 0.05 -plt.rcParams['axes.spines.top'] = False -plt.rcParams['axes.spines.right'] = False -plt.rcParams['axes.grid'] = False -plt.rcParams['grid.alpha'] = 0.3 -plt.rcParams['grid.linestyle'] = '--' -plt.rcParams['grid.linewidth'] = 0.5 - - -def compute_results_correlation( results_df, correlation = 'spearman', thresh = 0.7 ): - +plt.rcParams["font.family"] = "sans-serif" +plt.rcParams["font.sans-serif"] = ["Arial", "DejaVu Sans", "Helvetica", "Liberation Sans"] +plt.rcParams["font.size"] = 11 +plt.rcParams["axes.labelsize"] = 12 +plt.rcParams["axes.titlesize"] = 13 +plt.rcParams["xtick.labelsize"] = 10 +plt.rcParams["ytick.labelsize"] = 10 +plt.rcParams["legend.fontsize"] = 10 +plt.rcParams["figure.titlesize"] = 13 +plt.rcParams["axes.linewidth"] = 1.2 +plt.rcParams["xtick.major.width"] = 1.2 +plt.rcParams["ytick.major.width"] = 1.2 +plt.rcParams["xtick.minor.width"] = 0.8 +plt.rcParams["ytick.minor.width"] = 0.8 +plt.rcParams["xtick.major.size"] = 5 +plt.rcParams["ytick.major.size"] = 5 +plt.rcParams["xtick.minor.size"] = 3 +plt.rcParams["ytick.minor.size"] = 3 +plt.rcParams["savefig.dpi"] = 600 +plt.rcParams["savefig.bbox"] = "tight" +plt.rcParams["savefig.pad_inches"] = 0.05 +plt.rcParams["axes.spines.top"] = False +plt.rcParams["axes.spines.right"] = False +plt.rcParams["axes.grid"] = False +plt.rcParams["grid.alpha"] = 0.3 +plt.rcParams["grid.linestyle"] = "--" +plt.rcParams["grid.linewidth"] = 0.5 + + +def compute_results_correlation(results_df, correlation="spearman", thresh=0.7): """This function takes in as input a Pandas Dataframe containing the results and data evaluations for a given dataset. It then produces a spearman correlation between the data evaluation characteristics (features) and instances where an F1 score was observed above a certain threshold (thresh). @@ -57,7 +56,7 @@ def compute_results_correlation( results_df, correlation = 'spearman', thresh = results_df (pd.DataFrame): A DataFrame containing the results and data evaluations. correlation (str): The type of correlation to compute, default is 'spearman'. thresh (float): The threshold for F1 score to consider, default is 0.7. - + Returns: results_df (pd.DataFrame): The input DataFrame with additional columns for datatype and model_embed_datatype. correlations_df (pd.DataFrame): A DataFrame containing the computed correlations between metrics and features. @@ -65,37 +64,89 @@ def compute_results_correlation( results_df, correlation = 'spearman', thresh = """ # Refining datasrame - results_df['datatype'] = [ re.sub( '\.csv', '', re.sub( '-.*', '', x ) ) for x in results_df['Dataset'] ] - results_df[ 'model_embed_datatype'] = [ '_'.join( [str(row.model), str(row.embeddings), str(row.datatype)] ) for idx, row in results_df.iterrows() ] + results_df["datatype"] = [ + re.sub(r"\.csv", "", re.sub(r"-.*", "", x)) for x in results_df["Dataset"] + ] + results_df["model_embed_datatype"] = [ + "_".join([str(row.model), str(row.embeddings), str(row.datatype)]) + for idx, row in results_df.iterrows() + ] correlations = [] - features = ['Feature_Samples_ratio', 'Intrinsic_Dimension', 'Condition number', - 'Fisher Discriminant Ratio', 'Total Correlations', 'Mutual information', - '# Non-zero entries', '# Low variance features', 'Variation', 'std_var', - 'Coefficient of Variation %', 'std_co_of_v', 'Skewness', 'std_skew', - 'Kurtosis', 'std_kurt', 'Mean Log Kernel Density', - 'Isomap Reconstruction Error', 'Fractal dimension', 'Entropy', - 'std_entropy'] - metrics = ['accuracy', 'f1_score', 'time', 'auc'] - - keys = list(set(results_df['model_embed_datatype'])) + features = [ + "Feature_Samples_ratio", + "Intrinsic_Dimension", + "Condition number", + "Fisher Discriminant Ratio", + "Total Correlations", + "Mutual information", + "# Non-zero entries", + "# Low variance features", + "Variation", + "std_var", + "Coefficient of Variation %", + "std_co_of_v", + "Skewness", + "std_skew", + "Kurtosis", + "std_kurt", + "Mean Log Kernel Density", + "Isomap Reconstruction Error", + "Fractal dimension", + "Entropy", + "std_entropy", + ] + metrics = ["accuracy", "f1_score", "time", "auc"] + + keys = list(set(results_df["model_embed_datatype"])) for m in keys: - dat_temp_m = results_df[results_df['model_embed_datatype'] == m] + dat_temp_m = results_df[results_df["model_embed_datatype"] == m] if len(dat_temp_m) > 0: for s in metrics: for f in features: if f in dat_temp_m.columns: - if correlation == 'spearman': - correlations.append( [m, s, f, np.median(dat_temp_m[s]), sum(dat_temp_m[s]>thresh)/len(dat_temp_m[s]), spearmanr( dat_temp_m[s], dat_temp_m[f] )[0] ] ) - - correlations_df = pd.DataFrame(correlations, columns = ['model_embed_datatype', 'metric', 'feature', 'median_metric', 'frac_gt_thresh', 'correlation'] ) + if correlation == "spearman": + correlations.append( + [ + m, + s, + f, + np.median(dat_temp_m[s]), + sum(dat_temp_m[s] > thresh) / len(dat_temp_m[s]), + spearmanr(dat_temp_m[s], dat_temp_m[f])[0], + ] + ) + + correlations_df = pd.DataFrame( + correlations, + columns=[ + "model_embed_datatype", + "metric", + "feature", + "median_metric", + "frac_gt_thresh", + "correlation", + ], + ) return results_df, correlations_df -def plot_results_correlation( correlations_df, metric = 'f1_score', title = '', correlation_type = 'Spearman ρ', figsize=(6.5,10), save_file_path = '', size = 'median_metric', - xticks = True, key = 'model_embed_datatype', legend_offset = 1.0, show_plots = True, - colorbar_label = 'Correlation coefficient', size_label = 'Median metric value'): - + +def plot_results_correlation( + correlations_df, + metric="f1_score", + title="", + correlation_type="Spearman ρ", + figsize=(6.5, 10), + save_file_path="", + size="median_metric", + xticks=True, + key="model_embed_datatype", + legend_offset=1.0, + show_plots=True, + colorbar_label="Correlation coefficient", + size_label="Median metric value", +): """This function plots publication-quality correlation dot plots using the previously generated correlations_df dataframe. The larger the circle, the higher the metric value for that particular data set. The circle colors correspond to the correlations between the data characteristics (evaluations) and the metric. Red corresponds to a positive @@ -114,41 +165,71 @@ def plot_results_correlation( correlations_df, metric = 'f1_score', title = '', show_plots (bool): Whether to display plots, default is True. colorbar_label (str): Label for the colorbar, default is 'Correlation coefficient'. size_label (str): Label for the size legend, default is 'Median metric value'. - + Returns: None: Displays the plot and saves it to the specified file path if provided. """ # Use enhanced professional diverging colormap from matplotlib.colors import LinearSegmentedColormap - colors_custom = ['#053061', '#2166ac', '#4393c3', '#92c5de', '#d1e5f0', - '#f7f7f7', '#fddbc7', '#f4a582', '#d6604d', '#b2182b', '#67001f'] - cmap_custom = LinearSegmentedColormap.from_list('custom_diverging', colors_custom, N=256) + + colors_custom = [ + "#053061", + "#2166ac", + "#4393c3", + "#92c5de", + "#d1e5f0", + "#f7f7f7", + "#fddbc7", + "#f4a582", + "#d6604d", + "#b2182b", + "#67001f", + ] + cmap_custom = LinearSegmentedColormap.from_list("custom_diverging", colors_custom, N=256) norm = mcolors.TwoSlopeNorm(vmin=-1.0, vcenter=0.0, vmax=1.0) # Sample data - data = correlations_df[correlations_df['metric'] == metric].copy() - data['feature'] = [ re.sub( 'std', 'Std. dev. of', - re.sub( 'co of v', 'coefficient of variation', - re.sub( 'kurt$' ,'kurtosis', - re.sub( 'skew$', 'skewness', - re.sub( 'var$', 'variation', - re.sub( '%', '', - re.sub( '_', ' ', x ) ) ) ) ) ) ) for x in data['feature']] - - if key == 'model_datatype': - data['datatype'] = [ '_'.join( x.split('_')[1:] ) for x in data[key]] - key_column = 'Model / Dataset' + data = correlations_df[correlations_df["metric"] == metric].copy() + data["feature"] = [ + re.sub( + "std", + "Std. dev. of", + re.sub( + "co of v", + "coefficient of variation", + re.sub( + "kurt$", + "kurtosis", + re.sub( + "skew$", + "skewness", + re.sub("var$", "variation", re.sub("%", "", re.sub("_", " ", x))), + ), + ), + ), + ) + for x in data["feature"] + ] + + if key == "model_datatype": + data["datatype"] = ["_".join(x.split("_")[1:]) for x in data[key]] + key_column = "Model / Dataset" else: - data['datatype'] = [ '_'.join( x.split('_')[2:] ) for x in data[key]] - key_column = 'Model / Embedding / Dataset' - - data = data.sort_values( ['feature','datatype'], ascending = False ) - data['model'] = [ re.sub( '_.*', '', x ) for x in data[key]] - data['model'] = [x.upper() for x in data['model']] - data = pd.concat( [data[ ~data['model'].isin( ['QSVC', 'QNN', 'VQC', 'PQK']) ], data[ data['model'].isin( ['QSVC', 'QNN', 'VQC', 'PQK']) ] ] ) - fm = dict(zip( list(set(data['feature'])), range(len(set(data['feature']))) ) ) - data['feature_map'] = [ fm[x] for x in data['feature']] + data["datatype"] = ["_".join(x.split("_")[2:]) for x in data[key]] + key_column = "Model / Embedding / Dataset" + + data = data.sort_values(["feature", "datatype"], ascending=False) + data["model"] = [re.sub("_.*", "", x) for x in data[key]] + data["model"] = [x.upper() for x in data["model"]] + data = pd.concat( + [ + data[~data["model"].isin(["QSVC", "QNN", "VQC", "PQK"])], + data[data["model"].isin(["QSVC", "QNN", "VQC", "PQK"])], + ] + ) + fm = dict(zip(list(set(data["feature"])), range(len(set(data["feature"]))))) + data["feature_map"] = [fm[x] for x in data["feature"]] # Fill NaN values before scaling to avoid errors data = data.fillna(0) @@ -156,225 +237,310 @@ def plot_results_correlation( correlations_df, metric = 'f1_score', title = '', # Scale dot size based on actual data range for meaningful representation # Reduced sizes to minimize overlap epsilon = 25 - + # Get actual min/max from the data to scale appropriately min_val = data[size].min() max_val = data[size].max() - + # Normalize to 0-1 based on actual data range, then scale to pixel sizes if max_val > min_val: normalized_values = (data[size] - min_val) / (max_val - min_val) else: normalized_values = np.ones_like(data[size]) * 0.5 - + # Size formula: normalized value in [0,1] → size in [epsilon, 150+epsilon] (reduced from 200) - data['norm_size'] = (normalized_values * 150 + epsilon).astype(float) + data["norm_size"] = (normalized_values * 150 + epsilon).astype(float) + + data[key] = [re.sub("_", " / ", x) for x in data[key]] - data[key] = [ re.sub( '_', ' / ', x ) for x in data[key]] - # Create figure with very compact design - fig, ax = plt.subplots(figsize=figsize, facecolor='white', dpi=100) - ax.set_facecolor('white') - + fig, ax = plt.subplots(figsize=figsize, facecolor="white", dpi=100) + ax.set_facecolor("white") + # Create scatter plot with enhanced professional styling - scatter = ax.scatter(data[key], data['feature'], s=data['norm_size'], - c=data['correlation'], cmap=cmap_custom, norm=norm, - alpha=0.92, edgecolors='#34495E', linewidths=1.2, - zorder=3) - + scatter = ax.scatter( + data[key], + data["feature"], + s=data["norm_size"], + c=data["correlation"], + cmap=cmap_custom, + norm=norm, + alpha=0.92, + edgecolors="#34495E", + linewidths=1.2, + zorder=3, + ) + # Add colorbar with enhanced professional styling cbar = plt.colorbar(scatter, ax=ax, pad=0.018, aspect=28, shrink=0.88) - cbar.set_label(colorbar_label, rotation=270, labelpad=22, fontsize=11, fontweight='bold') + cbar.set_label(colorbar_label, rotation=270, labelpad=22, fontsize=11, fontweight="bold") cbar.ax.tick_params(labelsize=10, width=1.3, length=5, pad=4) for spine in cbar.ax.spines.values(): spine.set_linewidth(1.3) - spine.set_edgecolor('#34495E') - + spine.set_edgecolor("#34495E") + # Set labels with clean formatting - ax.set_xlabel(key_column, fontweight='bold', fontsize=13, labelpad=10) - ax.set_ylabel('Data Feature', fontweight='bold', fontsize=13, labelpad=10) - + ax.set_xlabel(key_column, fontweight="bold", fontsize=13, labelpad=10) + ax.set_ylabel("Data Feature", fontweight="bold", fontsize=13, labelpad=10) + # Add title if provided if title: - ax.set_title(title, fontweight='bold', pad=20, fontsize=14) - + ax.set_title(title, fontweight="bold", pad=20, fontsize=14) + # Rotate x-axis labels for better readability - plt.setp(ax.xaxis.get_majorticklabels(), rotation=90, ha='right', va='top', fontsize=10) + plt.setp(ax.xaxis.get_majorticklabels(), rotation=90, ha="right", va="top", fontsize=10) plt.setp(ax.yaxis.get_majorticklabels(), fontsize=10) - + # Add professional grid for better readability - ax.grid(True, alpha=0.18, linestyle='--', linewidth=0.8, color='#95A5A6', zorder=0) + ax.grid(True, alpha=0.18, linestyle="--", linewidth=0.8, color="#95A5A6", zorder=0) ax.set_axisbelow(True) - + # Proper margins to prevent cropping while keeping columns close ax.margins(x=0.025, y=0.035) - + # Clean tick parameters - ax.tick_params(axis='both', which='major', labelsize=11, width=1.2, length=5) - + ax.tick_params(axis="both", which="major", labelsize=11, width=1.2, length=5) + # Remove top and right spines for cleaner look sns.despine(ax=ax) - + # Create size legend with 4 dots showing ACTUAL median metric values from data - handles_size, labels_size = scatter.legend_elements(prop="sizes", alpha=0.75, num=4, - markeredgecolor='#34495E', markeredgewidth=1.2) - + handles_size, labels_size = scatter.legend_elements( + prop="sizes", alpha=0.75, num=4, markeredgecolor="#34495E", markeredgewidth=1.2 + ) + # Use REAL median metric values from the data smin = np.min(data[size]) smax = np.max(data[size]) - labels_size = [f'{x:.2f}' for x in np.linspace(smin, smax, 4)] - + labels_size = [f"{x:.2f}" for x in np.linspace(smin, smax, 4)] + # Position legend on the right side, well below the colorbar with proper spacing - legend = ax.legend(handles_size, labels_size, title=size_label, - loc='upper left', bbox_to_anchor=(1.15, -0.05), - ncol=1, frameon=True, fancybox=False, - title_fontsize=9, fontsize=8, - edgecolor='#34495E', framealpha=0.98, - labelspacing=0.8, handletextpad=0.5) + legend = ax.legend( + handles_size, + labels_size, + title=size_label, + loc="upper left", + bbox_to_anchor=(1.15, -0.05), + ncol=1, + frameon=True, + fancybox=False, + title_fontsize=9, + fontsize=8, + edgecolor="#34495E", + framealpha=0.98, + labelspacing=0.8, + handletextpad=0.5, + ) legend.get_frame().set_linewidth(1.2) - legend.get_frame().set_facecolor('white') - legend.get_title().set_fontweight('bold') - + legend.get_frame().set_facecolor("white") + legend.get_title().set_fontweight("bold") + # Adjust layout with reduced horizontal spacing between subplots plt.tight_layout(pad=0.8, w_pad=1.8) - - if save_file_path != '': - plt.savefig(save_file_path, dpi=600, bbox_inches='tight', facecolor='white', - edgecolor='none', format='pdf' if save_file_path.endswith('.pdf') else None) + + if save_file_path != "": + plt.savefig( + save_file_path, + dpi=600, + bbox_inches="tight", + facecolor="white", + edgecolor="none", + format="pdf" if save_file_path.endswith(".pdf") else None, + ) print(f"Scatter plot saved to: {save_file_path}") - + if show_plots: plt.show() plt.close() + model_qml = ["QNN", "PQK", "VQC", "QSVC"] - model_qml = ['QNN', 'PQK', 'VQC' ,'QSVC'] - data[key_column] = data[key] - data['Data feature'] = data['feature'] - to_plot = data.pivot_table(columns = key_column, index = 'Data feature', values = 'correlation') - + data["Data feature"] = data["feature"] + to_plot = data.pivot_table(columns=key_column, index="Data feature", values="correlation") + # Define professional color scheme for model types - ccolors = ['#7B68EE' if re.sub(' .*', '', x) in model_qml else '#FF8C00' for x in to_plot.columns] + ccolors = [ + "#7B68EE" if re.sub(" .*", "", x) in model_qml else "#FF8C00" for x in to_plot.columns + ] # Create custom diverging colormap from matplotlib.colors import LinearSegmentedColormap - colors_heatmap = ['#2166ac', '#4393c3', '#92c5de', '#d1e5f0', '#f7f7f7', - '#fddbc7', '#f4a582', '#d6604d', '#b2182b'] - cmap_heatmap = LinearSegmentedColormap.from_list('custom_heatmap', colors_heatmap, N=256) + + colors_heatmap = [ + "#2166ac", + "#4393c3", + "#92c5de", + "#d1e5f0", + "#f7f7f7", + "#fddbc7", + "#f4a582", + "#d6604d", + "#b2182b", + ] + cmap_heatmap = LinearSegmentedColormap.from_list("custom_heatmap", colors_heatmap, N=256) # Create professional heatmap with better proportions heatmap_height = figsize[1] * 0.95 # Much taller to reduce space above colorbar heatmap_width = min(figsize[0] * 0.9, 10) # Narrower columns - - g = sns.clustermap(to_plot.fillna(0), - figsize=(heatmap_width, heatmap_height), - col_colors=ccolors, - cmap=cmap_heatmap, - method='average', - metric='euclidean', - center=0, - xticklabels=xticks, - yticklabels=True, - cbar_kws={'label': colorbar_label, 'orientation': 'horizontal'}, - linewidths=1.0, - linecolor='white', - vmin=-1, vmax=1, - dendrogram_ratio=0.05, - cbar_pos=(0.55, 0.01, 0.4, 0.015)) - + + g = sns.clustermap( + to_plot.fillna(0), + figsize=(heatmap_width, heatmap_height), + col_colors=ccolors, + cmap=cmap_heatmap, + method="average", + metric="euclidean", + center=0, + xticklabels=xticks, + yticklabels=True, + cbar_kws={"label": colorbar_label, "orientation": "horizontal"}, + linewidths=1.0, + linecolor="white", + vmin=-1, + vmax=1, + dendrogram_ratio=0.05, + cbar_pos=(0.55, 0.01, 0.4, 0.015), + ) + # Hide dendrograms for cleaner appearance g.ax_row_dendrogram.set_visible(False) g.ax_col_dendrogram.set_visible(False) - + # Improve axis labels with better styling - g.ax_heatmap.set_xlabel(key_column, fontweight='bold', fontsize=11, labelpad=12, color='#2C3E50') - g.ax_heatmap.set_ylabel('Data Feature', fontweight='bold', fontsize=11, labelpad=12, color='#2C3E50') - + g.ax_heatmap.set_xlabel( + key_column, fontweight="bold", fontsize=11, labelpad=12, color="#2C3E50" + ) + g.ax_heatmap.set_ylabel( + "Data Feature", fontweight="bold", fontsize=11, labelpad=12, color="#2C3E50" + ) + # Rotate x-labels 45 degrees for readability - plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, ha='right', fontsize=9, color='#2C3E50') - plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0, fontsize=9, color='#2C3E50') - + plt.setp( + g.ax_heatmap.xaxis.get_majorticklabels(), + rotation=45, + ha="right", + fontsize=9, + color="#2C3E50", + ) + plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0, fontsize=9, color="#2C3E50") + # Improve tick parameters with better styling - g.ax_heatmap.tick_params(axis='both', which='major', width=1.2, length=5, pad=4, colors='#2C3E50') - + g.ax_heatmap.tick_params( + axis="both", which="major", width=1.2, length=5, pad=4, colors="#2C3E50" + ) + # Style heatmap spines for spine in g.ax_heatmap.spines.values(): spine.set_linewidth(1.5) - spine.set_edgecolor('#34495E') - + spine.set_edgecolor("#34495E") + # Enhance horizontal colorbar styling at bottom if g.cax is not None: - g.cax.set_xlabel(colorbar_label, fontsize=10, fontweight='bold', labelpad=10, color='#2C3E50') - g.cax.tick_params(labelsize=9, width=1.2, length=4, colors='#2C3E50') + g.cax.set_xlabel( + colorbar_label, fontsize=10, fontweight="bold", labelpad=10, color="#2C3E50" + ) + g.cax.tick_params(labelsize=9, width=1.2, length=4, colors="#2C3E50") for spine in g.cax.spines.values(): spine.set_linewidth(1.2) - spine.set_edgecolor('#34495E') + spine.set_edgecolor("#34495E") - if save_file_path != '': - heatmap_path = re.sub('.pdf', '_heatmap.pdf', save_file_path) - plt.savefig(heatmap_path, dpi=600, bbox_inches='tight', facecolor='white', - edgecolor='none', format='pdf' if heatmap_path.endswith('.pdf') else None) + if save_file_path != "": + heatmap_path = re.sub(".pdf", "_heatmap.pdf", save_file_path) + plt.savefig( + heatmap_path, + dpi=600, + bbox_inches="tight", + facecolor="white", + edgecolor="none", + format="pdf" if heatmap_path.endswith(".pdf") else None, + ) print(f"Clustered heatmap saved to: {heatmap_path}") - + if show_plots: plt.show() plt.close() # Create non-clustered heatmap with quantum models first - qml_col = [x for x in to_plot.columns if re.sub(' .*', '', x) in model_qml] - cml_col = [x for x in to_plot.columns if re.sub(' .*', '', x) not in model_qml] + qml_col = [x for x in to_plot.columns if re.sub(" .*", "", x) in model_qml] + cml_col = [x for x in to_plot.columns if re.sub(" .*", "", x) not in model_qml] to_plot_ordered = to_plot.loc[:, qml_col + cml_col] - ccolors_ordered = ['#7B68EE' if re.sub(' .*', '', x) in model_qml else '#FF8C00' for x in to_plot_ordered.columns] - - g2 = sns.clustermap(to_plot_ordered.fillna(0), - figsize=(heatmap_width, heatmap_height), - col_colors=ccolors_ordered, - col_cluster=False, - row_cluster=True, - cmap=cmap_heatmap, - center=0, - xticklabels=xticks, - yticklabels=True, - cbar_kws={'label': colorbar_label, 'orientation': 'horizontal'}, - linewidths=1.0, - linecolor='white', - vmin=-1, vmax=1, - dendrogram_ratio=0.05, - cbar_pos=(0.55, 0.01, 0.4, 0.015), - method='average', - metric='euclidean') - + ccolors_ordered = [ + "#7B68EE" if re.sub(" .*", "", x) in model_qml else "#FF8C00" + for x in to_plot_ordered.columns + ] + + g2 = sns.clustermap( + to_plot_ordered.fillna(0), + figsize=(heatmap_width, heatmap_height), + col_colors=ccolors_ordered, + col_cluster=False, + row_cluster=True, + cmap=cmap_heatmap, + center=0, + xticklabels=xticks, + yticklabels=True, + cbar_kws={"label": colorbar_label, "orientation": "horizontal"}, + linewidths=1.0, + linecolor="white", + vmin=-1, + vmax=1, + dendrogram_ratio=0.05, + cbar_pos=(0.55, 0.01, 0.4, 0.015), + method="average", + metric="euclidean", + ) + # Improve axis labels with better styling - g2.ax_heatmap.set_xlabel(key_column, fontweight='bold', fontsize=11, labelpad=12, color='#2C3E50') - g2.ax_heatmap.set_ylabel('Data Feature', fontweight='bold', fontsize=11, labelpad=12, color='#2C3E50') - + g2.ax_heatmap.set_xlabel( + key_column, fontweight="bold", fontsize=11, labelpad=12, color="#2C3E50" + ) + g2.ax_heatmap.set_ylabel( + "Data Feature", fontweight="bold", fontsize=11, labelpad=12, color="#2C3E50" + ) + # Rotate x-labels 45 degrees for readability - plt.setp(g2.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, ha='right', fontsize=9, color='#2C3E50') - plt.setp(g2.ax_heatmap.yaxis.get_majorticklabels(), rotation=0, fontsize=9, color='#2C3E50') - + plt.setp( + g2.ax_heatmap.xaxis.get_majorticklabels(), + rotation=45, + ha="right", + fontsize=9, + color="#2C3E50", + ) + plt.setp(g2.ax_heatmap.yaxis.get_majorticklabels(), rotation=0, fontsize=9, color="#2C3E50") + # Improve tick parameters with better styling - g2.ax_heatmap.tick_params(axis='both', which='major', width=1.2, length=5, pad=4, colors='#2C3E50') - + g2.ax_heatmap.tick_params( + axis="both", which="major", width=1.2, length=5, pad=4, colors="#2C3E50" + ) + # Style heatmap spines for spine in g2.ax_heatmap.spines.values(): spine.set_linewidth(1.5) - spine.set_edgecolor('#34495E') - + spine.set_edgecolor("#34495E") + # Enhance horizontal colorbar styling at bottom if g2.cax is not None: - g2.cax.set_xlabel(colorbar_label, fontsize=10, fontweight='bold', labelpad=10, color='#2C3E50') - g2.cax.tick_params(labelsize=9, width=1.2, length=4, colors='#2C3E50') + g2.cax.set_xlabel( + colorbar_label, fontsize=10, fontweight="bold", labelpad=10, color="#2C3E50" + ) + g2.cax.tick_params(labelsize=9, width=1.2, length=4, colors="#2C3E50") for spine in g2.cax.spines.values(): spine.set_linewidth(1.2) - spine.set_edgecolor('#34495E') + spine.set_edgecolor("#34495E") - if save_file_path != '': - noncluster_path = re.sub('.pdf', '_noncluster_heatmap.pdf', save_file_path) - plt.savefig(noncluster_path, dpi=600, bbox_inches='tight', facecolor='white', - edgecolor='none', format='pdf' if noncluster_path.endswith('.pdf') else None) + if save_file_path != "": + noncluster_path = re.sub(".pdf", "_noncluster_heatmap.pdf", save_file_path) + plt.savefig( + noncluster_path, + dpi=600, + bbox_inches="tight", + facecolor="white", + edgecolor="none", + format="pdf" if noncluster_path.endswith(".pdf") else None, + ) print(f"Non-clustered heatmap saved to: {noncluster_path}") - + if show_plots: plt.show() - plt.close() \ No newline at end of file + plt.close() diff --git a/setup.py b/setup.py index d1588f0..d6d7a54 100644 --- a/setup.py +++ b/setup.py @@ -107,6 +107,7 @@ def read_requirements(): 'black>=23.0', 'flake8>=6.0', 'mypy>=1.0', + 'types-PyYAML', ], 'all': docs_require + [ 'hydra-core', @@ -116,6 +117,7 @@ def read_requirements(): 'black>=23.0', 'flake8>=6.0', 'mypy>=1.0', + 'types-PyYAML', ], }, diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..e1ee87a --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,27 @@ +from importlib.util import module_from_spec, spec_from_file_location +from pathlib import Path +import sys +import types + + +REPO_ROOT = Path(__file__).resolve().parents[1] + + +def load_module(module_name: str, relative_path: str): + spec = spec_from_file_location(module_name, REPO_ROOT / relative_path) + if spec is None or spec.loader is None: + raise ImportError(f"Could not load module {module_name} from {relative_path}") + + module = module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + +def ensure_package(package_name: str, relative_path: str): + package = sys.modules.get(package_name) + if package is None: + package = types.ModuleType(package_name) + package.__path__ = [str(REPO_ROOT / relative_path)] + sys.modules[package_name] = package + return package diff --git a/tests/test_data_generation.py b/tests/test_data_generation.py new file mode 100644 index 0000000..18d9b3c --- /dev/null +++ b/tests/test_data_generation.py @@ -0,0 +1,92 @@ +import json + +import pandas as pd + +from conftest import load_module + + +make_circles = load_module( + "tests._make_circles", + "qbiocode/data_generation/make_circles.py", +) +make_class = load_module( + "tests._make_class", + "qbiocode/data_generation/make_class.py", +) +make_spheres = load_module( + "tests._make_spheres", + "qbiocode/data_generation/make_spheres.py", +) + + +def test_generate_circles_datasets_writes_expected_files(tmp_path): + make_circles.generate_circles_datasets( + n_samples=[12], + noise=[0.15], + save_path=str(tmp_path), + random_state=7, + ) + + dataset_path = tmp_path / "circles_data-1.csv" + config_path = tmp_path / "dataset_config.json" + + assert dataset_path.exists() + assert config_path.exists() + + dataset = pd.read_csv(dataset_path) + with config_path.open(encoding="utf-8") as handle: + config = json.load(handle) + + assert list(dataset.columns) == ["0", "1", "class"] + assert len(dataset) == 12 + assert dataset["class"].isin([0, 1]).all() + assert list(config.values()) == [{"n_samples": 12, "noise": 0.15}] + + +def test_generate_classification_datasets_only_writes_valid_configurations(tmp_path): + make_class.generate_classification_datasets( + n_samples=[10], + n_features=[4, 3], + n_informative=[2], + n_redundant=[2], + n_classes=[2], + n_clusters_per_class=[1], + weights=[[0.5, 0.5]], + save_path=str(tmp_path), + random_state=11, + ) + + csv_files = sorted(tmp_path.glob("class_data-*.csv")) + config_path = tmp_path / "dataset_config.json" + + assert [path.name for path in csv_files] == ["class_data-1.csv"] + + with config_path.open(encoding="utf-8") as handle: + config = json.load(handle) + + assert list(config.values()) == [ + { + "n_samples": 10, + "n_features": 4, + "n_informative": 2, + "n_redundant": 2, + "n_classes": 2, + "n_clusters_per_class": 1, + "weights": [0.5, 0.5], + } + ] + + +def test_generate_points_in_nd_sphere_respects_radius_threshold(): + points = make_spheres.generate_points_in_nd_sphere( + n_s=25, + dim=4, + radius=3, + thresh=0.6, + ) + + norms = (points ** 2).sum(axis=1) ** 0.5 + + assert points.shape == (25, 4) + assert (norms <= 3).all() + assert (norms >= 1.8).all() diff --git a/tests/test_file_utilities.py b/tests/test_file_utilities.py new file mode 100644 index 0000000..d3c34a1 --- /dev/null +++ b/tests/test_file_utilities.py @@ -0,0 +1,82 @@ +from pathlib import Path + +import pytest + +from conftest import load_module + + +find_duplicates = load_module( + "tests._find_duplicates", + "qbiocode/utils/find_duplicates.py", +) +find_string = load_module( + "tests._find_string", + "qbiocode/utils/find_string.py", +) + + +def write_text(path: Path, content: str) -> None: + path.write_text(content, encoding="utf-8") + + +def normalize_pairs(pairs): + return {tuple(sorted(pair)) for pair in pairs} + + +def test_find_duplicate_files_detects_matches_ignoring_empty_lines(tmp_path): + write_text(tmp_path / "one.txt", "alpha\n\nbeta\n") + write_text(tmp_path / "two.txt", "beta\nalpha\n") + write_text(tmp_path / "three.txt", "alpha\ngamma\n") + + duplicates = find_duplicates.find_duplicate_files(str(tmp_path)) + + assert normalize_pairs(duplicates) == { + tuple(sorted((str(tmp_path / "one.txt"), str(tmp_path / "two.txt")))) + } + + +def test_find_duplicate_files_honors_case_sensitivity_setting(tmp_path): + write_text(tmp_path / "upper.txt", "Alpha\n") + write_text(tmp_path / "lower.txt", "alpha\n") + + duplicates = find_duplicates.find_duplicate_files( + str(tmp_path), + case_sensitive=False, + ) + + assert normalize_pairs(duplicates) == { + tuple(sorted((str(tmp_path / "upper.txt"), str(tmp_path / "lower.txt")))) + } + + +def test_find_duplicate_files_raises_for_missing_directory(tmp_path): + missing_dir = tmp_path / "missing" + + with pytest.raises(FileNotFoundError): + find_duplicates.find_duplicate_files(str(missing_dir)) + + +def test_find_string_in_files_returns_matching_lines_and_filters_by_pattern(tmp_path): + write_text(tmp_path / "config.yaml", "mode: fast\nEmbedding: PCA\n") + write_text(tmp_path / "notes.txt", "embedding: pca\n") + + results = find_string.find_string_in_files( + str(tmp_path), + "embedding: pca", + file_pattern=".yaml", + case_sensitive=False, + return_lines=True, + verbose=False, + ) + + assert results == { + str(tmp_path / "config.yaml"): [(2, "Embedding: PCA\n")], + } + + +def test_find_string_in_files_raises_for_non_directory(tmp_path): + file_path = tmp_path / "data.txt" + write_text(file_path, "content\n") + + with pytest.raises(NotADirectoryError): + find_string.find_string_in_files(str(file_path), "content") diff --git a/tests/test_generator_dispatch.py b/tests/test_generator_dispatch.py new file mode 100644 index 0000000..8f187ed --- /dev/null +++ b/tests/test_generator_dispatch.py @@ -0,0 +1,114 @@ +import pytest + +from conftest import ensure_package, load_module + + +def load_generator_module(): + ensure_package("qbiocode", "qbiocode") + ensure_package("qbiocode.data_generation", "qbiocode/data_generation") + + for module_name in [ + "make_circles", + "make_moons", + "make_class", + "make_s_curve", + "make_spheres", + "make_spirals", + "make_swiss_roll", + ]: + load_module( + f"qbiocode.data_generation.{module_name}", + f"qbiocode/data_generation/{module_name}.py", + ) + + return load_module( + "qbiocode.data_generation.generator", + "qbiocode/data_generation/generator.py", + ) + + +@pytest.mark.parametrize( + ("dataset_type", "module_attr", "function_name", "expected_kwargs"), + [ + ( + "circles", + "circles", + "generate_circles_datasets", + {"n_samples": [9], "noise": [0.2], "save_path": "out", "random_state": 5}, + ), + ( + "classes", + "make_class", + "generate_classification_datasets", + { + "n_samples": [9], + "n_features": [6], + "n_informative": [2], + "n_redundant": [1], + "n_classes": [2], + "n_clusters_per_class": [1], + "weights": [[0.5, 0.5]], + "save_path": "out", + "random_state": 5, + }, + ), + ( + "spheres", + "spheres", + "generate_spheres_datasets", + {"n_s": [9], "dim": [6], "radius": [4], "save_path": "out", "random_state": 5}, + ), + ( + "swiss_roll", + "swiss_roll", + "generate_swiss_roll_datasets", + { + "n_samples": [9], + "noise": [0.2], + "hole": [True], + "save_path": "out", + "random_state": 5, + }, + ), + ], +) +def test_generate_data_dispatches_to_expected_backend( + monkeypatch, + dataset_type, + module_attr, + function_name, + expected_kwargs, +): + generator = load_generator_module() + captured = {} + + def fake_backend(**kwargs): + captured.update(kwargs) + + monkeypatch.setattr(getattr(generator, module_attr), function_name, fake_backend) + + generator.generate_data( + type_of_data=dataset_type, + save_path="out", + n_samples=[9], + noise=[0.2], + hole=[True], + n_classes=[2], + dim=[6], + rad=[4], + n_features=[6], + n_informative=[2], + n_redundant=[1], + n_clusters_per_class=[1], + weights=[[0.5, 0.5]], + random_state=5, + ) + + assert captured == expected_kwargs + + +def test_generate_data_rejects_unknown_dataset_type(): + generator = load_generator_module() + + with pytest.raises(ValueError, match="Invalid type_of_data"): + generator.generate_data(type_of_data="unknown", save_path="out")