diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5d5568f..34045be 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -19,10 +19,10 @@ jobs:
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
@@ -30,8 +30,8 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install -e .
- pip install pytest pytest-cov flake8
+ pip install -e ".[dev]"
+ pip install flake8
- name: Lint with flake8
run: |
@@ -43,13 +43,12 @@ jobs:
- name: Run tests
run: |
- pytest --cov=qbiocode --cov-report=xml --cov-report=term
- continue-on-error: true
+ python -m pytest --cov=qbiocode --cov-report=xml --cov-report=term
- name: Upload coverage to Codecov
- uses: codecov/codecov-action@v4
+ uses: codecov/codecov-action@v6
with:
- file: ./coverage.xml
+ files: ./coverage.xml
flags: unittests
name: codecov-${{ matrix.os }}-py${{ matrix.python-version }}
continue-on-error: true
@@ -60,18 +59,19 @@ jobs:
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
- python-version: '3.10'
+ python-version: '3.12'
cache: 'pip'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install flake8 black isort mypy
+ pip install -e ".[dev]"
+ pip install isort
- name: Check code formatting with black
run: black --check --diff qbiocode/
@@ -91,10 +91,10 @@ jobs:
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: '3.10'
cache: 'pip'
@@ -102,8 +102,12 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install -e .
- pip install sphinx sphinx-rtd-theme
+ pip install -e ".[docs]"
+
+ - name: Install pandoc
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y pandoc
- name: Build documentation
run: |
@@ -112,7 +116,7 @@ jobs:
continue-on-error: true
- name: Upload documentation artifacts
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
name: documentation
path: docs/build/html/
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 83105c1..c239af3 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,5 +1,8 @@
name: Release
+permissions:
+ contents: write
+
on:
release:
types: [published]
@@ -12,10 +15,10 @@ jobs:
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: '3.10'
@@ -39,14 +42,9 @@ jobs:
- name: Upload release assets
if: github.event_name == 'release'
- uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- with:
- upload_url: ${{ github.event.release.upload_url }}
- asset_path: ./dist/*.whl
- asset_name: qbiocode-${{ github.event.release.tag_name }}-py3-none-any.whl
- asset_content_type: application/zip
+ run: gh release upload "$GITHUB_REF_NAME" dist/*.whl --clobber  # tag comes from the runner env rather than being interpolated into the script
continue-on-error: true
create-zenodo-release:
@@ -57,7 +55,7 @@ jobs:
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Zenodo release notification
run: |
diff --git a/.gitignore b/.gitignore
index 20cdcf4..f051c60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,13 @@ dist/
build/
*.whl
+# Testing results
+.coverage
+htmlcov/
+.coverage.*
+nosetests.xml
+coverage.xml
+
# IDE
.DS_Store
.vscode/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6f67e07..436e2ff 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -56,7 +56,7 @@ This project adheres to a [Code of Conduct](CODE_OF_CONDUCT.md). By participatin
4. **Install Development Dependencies** (optional)
```bash
- pip install pytest pytest-cov black flake8 mypy
+ pip install -e ".[dev]"
```
5. **Verify Installation**
@@ -164,7 +164,7 @@ from qbiocode.learning import compute_qsvc
3. **Test Your Changes**
```bash
# Run existing tests
- pytest tests/
+ python -m pytest
# Check code style
black qbiocode/
diff --git a/README.md b/README.md
index 7cd90ab..986644c 100644
--- a/README.md
+++ b/README.md
@@ -90,6 +90,18 @@ pip install --force-reinstall xgboost
For detailed installation instructions, see the [Installation Guide](https://ibm.github.io/QBioCode/installation.html).
+### Running Tests
+
+```bash
+# Install the package with development dependencies
+pip install -e ".[dev]"
+
+# Run the test suite
+python -m pytest
+```
+
+The current test suite focuses on utility modules and data-generation helpers, which can be tested without a full runtime setup for the optional quantum workflows.
+
### Basic Usage
```python
diff --git a/docs/source/api_overview.rst b/docs/source/api_overview.rst
index cddbb32..3d9feb3 100644
--- a/docs/source/api_overview.rst
+++ b/docs/source/api_overview.rst
@@ -45,7 +45,7 @@ Multiple models can be run via the following
Classical Models
""""""""""""""""
-QBioCode provides classical machine learning models from `scikit-learn <https://scikit-learn.org/stable/>`_ for baseline comparisons and benchmarking against quantum models.
+QBioCode provides classical machine learning models from `scikit-learn <https://scikit-learn.org/stable/>`__ for baseline comparisons and benchmarking against quantum models.
.. autosummary::
~qbiocode.learning.compute_dt.compute_dt
@@ -291,4 +291,3 @@ Generated datasets are saved with:
References
^^^^^^^^^^
-
diff --git a/docs/source/apps/sage.rst b/docs/source/apps/sage.rst
index 808e065..46eb756 100644
--- a/docs/source/apps/sage.rst
+++ b/docs/source/apps/sage.rst
@@ -154,10 +154,11 @@ This trains QSage on historical QProfiler data and generates predictions for all
Train with Random Forest only:
.. code-block:: bash
-qsage --input qprofiler_results.csv --output results/ --model-type rf
-# Or train MLP sub-sages
-qsage --input qprofiler_results.csv --output results/ --model-type mlp
+ qsage --input qprofiler_results.csv --output results/ --model-type rf
+
+ # Or train MLP sub-sages
+ qsage --input qprofiler_results.csv --output results/ --model-type mlp
Train with custom seed and test size:
@@ -166,11 +167,12 @@ Train with custom seed and test size:
qsage --input data.csv --output results/ --seed 123 --test-size 0.3
-Train both Random Forest and MLP:
+Train with a custom MLP iteration count:
.. code-block:: bash
-# Train MLP with more epochs
-qsage --input data.csv --output results/ --model-type mlp --n-iter 2000
+
+ # Train MLP with more epochs
+ qsage --input data.csv --output results/ --model-type mlp --n-iter 2000
**Output Files**
@@ -439,4 +441,4 @@ QSage can reveal which complexity features are most predictive of model performa
.. admonition:: Reference
:class: tip
- For implementation details, see ``apps/sage/sage.py`` in the QBioCode repository.
\ No newline at end of file
+ For implementation details, see ``apps/sage/sage.py`` in the QBioCode repository.
diff --git a/docs/source/citing.rst b/docs/source/citing.rst
index bf4e2ff..1490b7a 100644
--- a/docs/source/citing.rst
+++ b/docs/source/citing.rst
@@ -1,10 +1,7 @@
.. _citing:
-Citing
-===============
-
Citing qbiocode
---------------
+===============
If qbiocode is integral to a scientific publication, please cite it.
A paper describing qbiocode has been published in the :
diff --git a/docs/source/conf.py b/docs/source/conf.py
index a232e56..469cac9 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -142,7 +142,8 @@ def run_apidoc(app):
html_show_sourcelink = False
html_logo = "_static/QBioCode_logo.png"
-html_favicon = "_static/favicon.ico"
+if os.path.exists(os.path.join(os.path.dirname(__file__), "_static", "favicon.ico")):
+ html_favicon = "_static/favicon.ico"
html_theme_options = {
"icon_links": [
diff --git a/pyproject.toml b/pyproject.toml
index a50a5c4..18f6565 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -84,8 +84,10 @@ dev = [
"pytest>=7.0",
"pytest-cov>=4.0",
"black>=23.0",
+ "isort>=5.0",
"flake8>=6.0",
"mypy>=1.0",
+ "types-PyYAML",
]
all = [
"hydra-core",
@@ -104,8 +106,10 @@ all = [
"pytest>=7.0",
"pytest-cov>=4.0",
"black>=23.0",
+ "isort>=5.0",
"flake8>=6.0",
"mypy>=1.0",
+ "types-PyYAML",
]
[project.scripts]
@@ -134,6 +138,10 @@ line-length = 100
target-version = ['py310', 'py311', 'py312']
include = '\.pyi?$'
+[tool.isort]
+profile = "black"
+line_length = 100
+
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
@@ -146,4 +154,4 @@ python_version = "3.10"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = false
-ignore_missing_imports = true
\ No newline at end of file
+ignore_missing_imports = true
diff --git a/qbiocode/__init__.py b/qbiocode/__init__.py
index b1c9dac..67dff52 100644
--- a/qbiocode/__init__.py
+++ b/qbiocode/__init__.py
@@ -25,109 +25,102 @@
>>> results = compute_rf(X_train, y_train, X_test, y_test)
"""
-from .version import __version__
+# ====== Import data generation functions ======
+from .data_generation import (
+ generate_circles_datasets,
+ generate_classification_datasets,
+ generate_moons_datasets,
+ generate_s_curve_datasets,
+ generate_spheres_datasets,
+ generate_spirals_datasets,
+ generate_swiss_roll_datasets,
+)
+from .data_generation.generator import generate_data
+
+# ====== Import embedding functions ======
+from .embeddings.embed import get_embeddings, pqk
+
+# ====== Import evaluation functions ======
+from .evaluation.dataset_evaluation import evaluate
+from .evaluation.model_evaluation import modeleval
+from .evaluation.model_run import model_run
# ====== Import learning functions ======
-from .learning.compute_svc import compute_svc, compute_svc_opt
from .learning.compute_dt import compute_dt, compute_dt_opt
-from .learning.compute_nb import compute_nb, compute_nb_opt
from .learning.compute_lr import compute_lr, compute_lr_opt
+from .learning.compute_mlp import compute_mlp, compute_mlp_opt
+from .learning.compute_nb import compute_nb, compute_nb_opt
+from .learning.compute_pqk import compute_pqk
+from .learning.compute_qnn import compute_qnn
+from .learning.compute_qsvc import compute_qsvc
from .learning.compute_rf import compute_rf, compute_rf_opt
+from .learning.compute_svc import compute_svc, compute_svc_opt
+from .learning.compute_vqc import compute_vqc
+
try:
from .learning.compute_xgb import compute_xgb, compute_xgb_opt
except Exception:
# XGBoost not available (e.g., OpenMP not installed on macOS)
compute_xgb = None # type: ignore
compute_xgb_opt = None # type: ignore
-from .learning.compute_mlp import compute_mlp, compute_mlp_opt
-from .learning.compute_qnn import compute_qnn
-from .learning.compute_qsvc import compute_qsvc
-from .learning.compute_vqc import compute_vqc
-from .learning.compute_pqk import compute_pqk
-
-# ====== Import embedding functions ======
-from .embeddings.embed import get_embeddings, pqk
# ====== Import helper functions ======
-from .utils.helper_fn import scaler_fn, feature_encoding
-from .utils.qc_winner_finder import qml_winner
from .utils.dataset_checkpoint import checkpoint_restart
-
-# ====== Import evaluation functions ======
-from .evaluation.model_evaluation import modeleval
-from .evaluation.dataset_evaluation import evaluate
-from .evaluation.model_run import model_run
+from .utils.helper_fn import feature_encoding, scaler_fn
+from .utils.qc_winner_finder import qml_winner
+from .version import __version__
# ====== Import visualization functions ======
from .visualization.visualize_correlation import (
+ compute_results_correlation,
plot_results_correlation,
- compute_results_correlation
-)
-
-# ====== Import data generation functions ======
-from .data_generation.generator import generate_data
-from .data_generation import (
- generate_circles_datasets,
- generate_moons_datasets,
- generate_classification_datasets,
- generate_s_curve_datasets,
- generate_spheres_datasets,
- generate_spirals_datasets,
- generate_swiss_roll_datasets,
)
__all__ = [
# Version
- '__version__',
-
+ "__version__",
# Classical ML algorithms
- 'compute_svc',
- 'compute_svc_opt',
- 'compute_dt',
- 'compute_dt_opt',
- 'compute_nb',
- 'compute_nb_opt',
- 'compute_lr',
- 'compute_lr_opt',
- 'compute_rf',
- 'compute_rf_opt',
- 'compute_xgb',
- 'compute_xgb_opt',
- 'compute_mlp',
- 'compute_mlp_opt',
-
+ "compute_svc",
+ "compute_svc_opt",
+ "compute_dt",
+ "compute_dt_opt",
+ "compute_nb",
+ "compute_nb_opt",
+ "compute_lr",
+ "compute_lr_opt",
+ "compute_rf",
+ "compute_rf_opt",
+ "compute_xgb",
+ "compute_xgb_opt",
+ "compute_mlp",
+ "compute_mlp_opt",
# Quantum ML algorithms
- 'compute_qnn',
- 'compute_qsvc',
- 'compute_vqc',
- 'compute_pqk',
-
+ "compute_qnn",
+ "compute_qsvc",
+ "compute_vqc",
+ "compute_pqk",
# Embeddings
- 'get_embeddings',
- 'pqk',
-
+ "get_embeddings",
+ "pqk",
# Utilities
- 'scaler_fn',
- 'feature_encoding',
- 'qml_winner',
- 'checkpoint_restart',
-
+ "scaler_fn",
+ "feature_encoding",
+ "qml_winner",
+ "checkpoint_restart",
# Evaluation
- 'modeleval',
- 'evaluate',
- 'model_run',
-
+ "modeleval",
+ "evaluate",
+ "model_run",
# Visualization
- 'plot_results_correlation',
- 'compute_results_correlation',
-
+ "plot_results_correlation",
+ "compute_results_correlation",
# Data generation
- 'generate_data',
- 'generate_circles_datasets',
- 'generate_moons_datasets',
- 'generate_classification_datasets',
- 'generate_s_curve_datasets',
- 'generate_spheres_datasets',
- 'generate_spirals_datasets',
- 'generate_swiss_roll_datasets',
+ "generate_data",
+ "generate_circles_datasets",
+ "generate_moons_datasets",
+ "generate_classification_datasets",
+ "generate_s_curve_datasets",
+ "generate_spheres_datasets",
+ "generate_spirals_datasets",
+ "generate_swiss_roll_datasets",
]
diff --git a/qbiocode/data_generation/__init__.py b/qbiocode/data_generation/__init__.py
index 2b292d6..1afc139 100644
--- a/qbiocode/data_generation/__init__.py
+++ b/qbiocode/data_generation/__init__.py
@@ -16,19 +16,19 @@
"""
from .make_circles import generate_circles_datasets
-from .make_moons import generate_moons_datasets
from .make_class import generate_classification_datasets
+from .make_moons import generate_moons_datasets
from .make_s_curve import generate_s_curve_datasets
from .make_spheres import generate_spheres_datasets
from .make_spirals import generate_spirals_datasets
from .make_swiss_roll import generate_swiss_roll_datasets
__all__ = [
- 'generate_circles_datasets',
- 'generate_moons_datasets',
- 'generate_classification_datasets',
- 'generate_s_curve_datasets',
- 'generate_spheres_datasets',
- 'generate_spirals_datasets',
- 'generate_swiss_roll_datasets',
+ "generate_circles_datasets",
+ "generate_moons_datasets",
+ "generate_classification_datasets",
+ "generate_s_curve_datasets",
+ "generate_spheres_datasets",
+ "generate_spirals_datasets",
+ "generate_swiss_roll_datasets",
]
diff --git a/qbiocode/data_generation/generator.py b/qbiocode/data_generation/generator.py
index 36fa75b..f78f3a7 100644
--- a/qbiocode/data_generation/generator.py
+++ b/qbiocode/data_generation/generator.py
@@ -8,8 +8,8 @@
### Imports ###
import qbiocode.data_generation.make_circles as circles
-import qbiocode.data_generation.make_moons as moons
import qbiocode.data_generation.make_class as make_class
+import qbiocode.data_generation.make_moons as moons
import qbiocode.data_generation.make_s_curve as s_curve
import qbiocode.data_generation.make_spheres as spheres
import qbiocode.data_generation.make_spirals as spirals
@@ -32,28 +32,28 @@
def generate_data(
- type_of_data=None,
- save_path=None,
- n_samples=N_SAMPLES,
- noise=NOISE,
- hole=HOLE,
- n_classes=N_CLASSES,
- dim=DIM,
- rad=RAD,
- n_features=N_FEATURES,
- n_informative=N_INFORMATIVE,
- n_redundant=N_REDUNDANT,
- n_clusters_per_class=N_CLUSTERS_PER_CLASS,
- weights=WEIGHTS,
- random_state=42,
+ type_of_data=None,
+ save_path=None,
+ n_samples=N_SAMPLES,
+ noise=NOISE,
+ hole=HOLE,
+ n_classes=N_CLASSES,
+ dim=DIM,
+ rad=RAD,
+ n_features=N_FEATURES,
+ n_informative=N_INFORMATIVE,
+ n_redundant=N_REDUNDANT,
+ n_clusters_per_class=N_CLUSTERS_PER_CLASS,
+ weights=WEIGHTS,
+ random_state=42,
):
"""
Generate synthetic datasets for machine learning benchmarking.
-
+
Unified interface to generate various types of synthetic datasets with
configurable parameters. Each dataset type creates multiple configurations
by varying the specified parameters.
-
+
Parameters
----------
type_of_data : str
@@ -85,17 +85,17 @@ def generate_data(
Class weight distributions (for classes only).
random_state : int, default=42
Random seed for reproducibility.
-
+
Returns
-------
None
Saves generated datasets to the specified path.
-
+
Raises
------
ValueError
If type_of_data is not one of the supported types.
-
+
Examples
--------
>>> from qbiocode.data_generation import generate_data
@@ -104,65 +104,62 @@ def generate_data(
Dataset generation complete.
"""
- if type_of_data == 'circles':
+ if type_of_data == "circles":
# Generate circles dataset
- circles.generate_circles_datasets(n_samples=n_samples,
- noise=noise,
- save_path=save_path,
- random_state=random_state)
- elif type_of_data == 'moons':
+ circles.generate_circles_datasets(
+ n_samples=n_samples, noise=noise, save_path=save_path, random_state=random_state
+ )
+ elif type_of_data == "moons":
# Generate moons dataset
- moons.generate_moons_datasets(n_samples=n_samples,
- noise=noise,
- save_path=save_path,
- random_state=random_state)
- elif type_of_data == 'classes':
+ moons.generate_moons_datasets(
+ n_samples=n_samples, noise=noise, save_path=save_path, random_state=random_state
+ )
+ elif type_of_data == "classes":
# Generate higher-dimensional classification dataset
- make_class.generate_classification_datasets(n_samples=n_samples,
- n_features=n_features,
- n_informative=n_informative,
- n_redundant=n_redundant,
- n_classes=n_classes,
- n_clusters_per_class=n_clusters_per_class,
- weights=weights,
- save_path=save_path,
- random_state=random_state
+ make_class.generate_classification_datasets(
+ n_samples=n_samples,
+ n_features=n_features,
+ n_informative=n_informative,
+ n_redundant=n_redundant,
+ n_classes=n_classes,
+ n_clusters_per_class=n_clusters_per_class,
+ weights=weights,
+ save_path=save_path,
+ random_state=random_state,
)
- elif type_of_data == 's_curve':
+ elif type_of_data == "s_curve":
# Generate S-curve dataset
- s_curve.generate_s_curve_datasets(n_samples=n_samples,
- noise=noise,
- save_path=save_path,
- random_state=random_state
- )
- elif type_of_data == 'spheres':
+ s_curve.generate_s_curve_datasets(
+ n_samples=n_samples, noise=noise, save_path=save_path, random_state=random_state
+ )
+ elif type_of_data == "spheres":
# Generate spheres dataset
- spheres.generate_spheres_datasets(n_s=n_samples,
- dim=dim,
- radius=rad,
- save_path=save_path,
- random_state=random_state
- )
- elif type_of_data == 'spirals':
+ spheres.generate_spheres_datasets(
+ n_s=n_samples, dim=dim, radius=rad, save_path=save_path, random_state=random_state
+ )
+ elif type_of_data == "spirals":
# Generate spirals dataset
- spirals.generate_spirals_datasets(n_s=n_samples,
- n_c=n_classes,
- n_n=noise,
- n_d=dim,
- save_path=save_path,
- random_state=random_state
- )
- elif type_of_data == 'swiss_roll':
+ spirals.generate_spirals_datasets(
+ n_s=n_samples,
+ n_c=n_classes,
+ n_n=noise,
+ n_d=dim,
+ save_path=save_path,
+ random_state=random_state,
+ )
+ elif type_of_data == "swiss_roll":
# Generate Swiss roll dataset
- swiss_roll.generate_swiss_roll_datasets(n_samples=n_samples,
- noise=noise,
- hole=hole,
- save_path=save_path,
- random_state=random_state
- )
+ swiss_roll.generate_swiss_roll_datasets(
+ n_samples=n_samples,
+ noise=noise,
+ hole=hole,
+ save_path=save_path,
+ random_state=random_state,
+ )
else:
- raise ValueError("Invalid type_of_data. Choose from 'circles', 'moons', 'classes', 's_curve', 'spheres', 'spirals', or 'swiss_roll'.")
+ raise ValueError(
+ "Invalid type_of_data. Choose from 'circles', 'moons', 'classes', 's_curve', 'spheres', 'spirals', or 'swiss_roll'."
+ )
print("Dataset generation complete.")
return
-
diff --git a/qbiocode/data_generation/make_circles.py b/qbiocode/data_generation/make_circles.py
index cdcfcb3..1eacb8d 100644
--- a/qbiocode/data_generation/make_circles.py
+++ b/qbiocode/data_generation/make_circles.py
@@ -6,18 +6,19 @@
algorithms on non-linearly separable data.
"""
-from sklearn.datasets import make_circles
-import pandas as pd
-import numpy as np
import itertools
import json
import os
+import numpy as np
+import pandas as pd
+from sklearn.datasets import make_circles
# parameters to vary across the configurations
N_SAMPLES = list(range(100, 300, 20))
NOISE = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+
def generate_circles_datasets(
n_samples=N_SAMPLES,
noise=NOISE,
@@ -26,11 +27,11 @@ def generate_circles_datasets(
):
"""
Generate multiple concentric circles datasets with varying parameters.
-
+
Creates a series of 2D datasets where samples form two concentric circles,
providing a classic non-linearly separable binary classification problem.
Each configuration varies the number of samples and noise level.
-
+
Parameters
----------
n_samples : list of int, default=range(100, 300, 20)
@@ -41,19 +42,19 @@ def generate_circles_datasets(
Directory path where datasets and configuration files will be saved.
random_state : int, default=42
Random seed for reproducibility.
-
+
Returns
-------
None
Saves CSV files for each dataset configuration and a JSON file with
all configuration parameters.
-
+
Notes
-----
- Each dataset is saved as 'circles_data-{i}.csv' where i is the configuration number
- Configuration parameters are saved in 'dataset_config.json'
- The last column 'class' contains binary labels (0 or 1)
-
+
Examples
--------
>>> from qbiocode.data_generation import generate_circles_datasets
@@ -61,12 +62,12 @@ def generate_circles_datasets(
Generating circles dataset...
"""
print("Generating circles dataset...")
-
+
np.random.seed(random_state)
if save_path is None:
- save_path = 'circles_data'
-
+ save_path = "circles_data"
+
if not os.path.exists(save_path):
os.makedirs(save_path)
@@ -78,29 +79,30 @@ def generate_circles_datasets(
# populate all the configs with the corresponding argument values
for n_s, n_n in configurations:
- config = "n_samples={}, noise={}".format(
- n_s, n_n,
- )
- # print(count_configs)
-
-
+ config = "n_samples={}, noise={}".format(
+ n_s,
+ n_n,
+ )
+ # print(count_configs)
+
# iteratively run the function for each combination of arguments
- X, y = make_circles(
- n_samples=n_s,
- noise=n_n,
- random_state=random_state,
+ X, y = make_circles(
+ n_samples=n_s,
+ noise=n_n,
+ random_state=random_state,
+ )
+ # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
+ dataset = pd.DataFrame(X)
+ dataset["class"] = y
+ with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile:
+ dataset_config.update(
+ {"ld_data-{}.csv".format(count_configs): {"n_samples": n_s, "noise": n_n}}
)
- # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
- dataset = pd.DataFrame(X)
- dataset['class'] = y
- with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile:
- dataset_config.update({'ld_data-{}.csv'.format(count_configs):
- {'n_samples': n_s,
- 'noise': n_n}})
- json.dump(dataset_config, outfile, indent=4)
- new_dataset = dataset.to_csv( os.path.join( save_path, 'circles_data-{}.csv'.format(count_configs)), index=False)
- count_configs += 1
- # print(X.shape)
- # print(y.shape)
+ json.dump(dataset_config, outfile, indent=4)
+ new_dataset = dataset.to_csv(
+ os.path.join(save_path, "circles_data-{}.csv".format(count_configs)), index=False
+ )
+ count_configs += 1
+ # print(X.shape)
+ # print(y.shape)
return
-
diff --git a/qbiocode/data_generation/make_class.py b/qbiocode/data_generation/make_class.py
index ced8627..c61a05c 100644
--- a/qbiocode/data_generation/make_class.py
+++ b/qbiocode/data_generation/make_class.py
@@ -6,25 +6,26 @@
useful for testing machine learning algorithms on high-dimensional data.
"""
-from sklearn.datasets import make_classification
-import pandas as pd
-import numpy as np
-import json
import itertools
+import json
import os
+import numpy as np
+import pandas as pd
+from sklearn.datasets import make_classification
dataset_config = {}
# parameters to vary across the configurations
N_SAMPLES = list(range(100, 300, 50))
-N_FEATURES = list(range(10,60,10))
-N_INFORMATIVE = list(range(2,8,4))
-N_REDUNDANT = list(range(2,8,4))
+N_FEATURES = list(range(10, 60, 10))
+N_INFORMATIVE = list(range(2, 8, 4))
+N_REDUNDANT = list(range(2, 8, 4))
N_CLASSES = list(range(2, 4, 6))
N_CLUSTERS_PER_CLASS = list(range(1, 2, 3))
WEIGHTS = [[0.3, 0.7], [0.4, 0.6], [0.5, 0.5]]
+
def generate_classification_datasets(
n_samples,
n_features,
@@ -38,11 +39,11 @@ def generate_classification_datasets(
):
"""
Generate multiple high-dimensional classification datasets with varying parameters.
-
+
Creates a series of synthetic datasets for multi-class classification problems
with configurable feature characteristics including informative features,
redundant features, and class distributions.
-
+
Parameters
----------
n_samples : list of int
@@ -63,20 +64,20 @@ def generate_classification_datasets(
Directory path where datasets and configuration files will be saved.
random_state : int, default=42
Random seed for reproducibility.
-
+
Returns
-------
None
Saves CSV files for each dataset configuration and a JSON file with
all configuration parameters.
-
+
Notes
-----
- Each dataset is saved as 'class_data-{i}.csv' where i is the configuration number
- Configuration parameters are saved in 'dataset_config.json'
- The last column 'class' contains class labels
- Only valid configurations where (n_informative + n_redundant) <= n_features are generated
-
+
Examples
--------
>>> from qbiocode.data_generation import generate_classification_datasets
@@ -88,55 +89,72 @@ def generate_classification_datasets(
Generating classes dataset...
"""
print("Generating classes dataset...")
-
+
np.random.seed(random_state)
if save_path is None:
- save_path = 'class_data'
-
+ save_path = "class_data"
+
if not os.path.exists(save_path):
os.makedirs(save_path)
# enumerate all possible combinations of parameters based on ranges above
- configurations = list(itertools.product(*[n_samples, n_features, n_informative, n_redundant, n_classes, n_clusters_per_class, weights]))
+ configurations = list(
+ itertools.product(
+ *[
+ n_samples,
+ n_features,
+ n_informative,
+ n_redundant,
+ n_classes,
+ n_clusters_per_class,
+ weights,
+ ]
+ )
+ )
count_configs = 1
# populate all the configs with the corresponding argument values
for n_s, n_f, n_i, n_r, n_cla, n_clu, weights in configurations:
- if (n_i + n_r) <= n_f:
- config = "n_samples={}, n_features={}, n_informative={}, n_redundant={}, n_classes={}, n_clusters_per_class={}, weights={}".format(
- n_s, n_f, n_i, n_r, n_cla, n_clu, weights
- )
- # print(count_configs)
-
-
+ if (n_i + n_r) <= n_f:
+ config = "n_samples={}, n_features={}, n_informative={}, n_redundant={}, n_classes={}, n_clusters_per_class={}, weights={}".format(
+ n_s, n_f, n_i, n_r, n_cla, n_clu, weights
+ )
+ # print(count_configs)
+
# iteratively run the function for each combination of arguments
- X, y = make_classification(
- n_samples=n_s,
- n_features=n_f,
- n_informative=n_i,
- n_redundant=n_r,
- n_classes=n_cla,
- n_clusters_per_class=n_clu,
- weights=weights,
- random_state=random_state,
+ X, y = make_classification(
+ n_samples=n_s,
+ n_features=n_f,
+ n_informative=n_i,
+ n_redundant=n_r,
+ n_classes=n_cla,
+ n_clusters_per_class=n_clu,
+ weights=weights,
+ random_state=random_state,
+ )
+ # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
+ dataset = pd.DataFrame(X)
+ dataset["class"] = y
+ with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile:
+ dataset_config.update(
+ {
+ "hd_data-{}.csv".format(count_configs): {
+ "n_samples": n_s,
+ "n_features": n_f,
+ "n_informative": n_i,
+ "n_redundant": n_r,
+ "n_classes": n_cla,
+ "n_clusters_per_class": n_clu,
+ "weights": weights,
+ }
+ }
)
- # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
- dataset = pd.DataFrame(X)
- dataset['class'] = y
- with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile:
- dataset_config.update({'hd_data-{}.csv'.format(count_configs):
- {'n_samples': n_s,
- 'n_features': n_f,
- 'n_informative': n_i,
- 'n_redundant': n_r,
- 'n_classes': n_cla,
- 'n_clusters_per_class': n_clu,
- 'weights': weights}})
- json.dump(dataset_config, outfile, indent=4)
- new_dataset = dataset.to_csv( os.path.join( save_path, 'class_data-{}.csv'.format(count_configs)), index=False)
- count_configs += 1
- # print(X.shape)
- # print(y.shape)
+ json.dump(dataset_config, outfile, indent=4)
+ new_dataset = dataset.to_csv(
+ os.path.join(save_path, "class_data-{}.csv".format(count_configs)), index=False
+ )
+ count_configs += 1
+ # print(X.shape)
+ # print(y.shape)
return
-
\ No newline at end of file
diff --git a/qbiocode/data_generation/make_moons.py b/qbiocode/data_generation/make_moons.py
index b9341a6..f392438 100644
--- a/qbiocode/data_generation/make_moons.py
+++ b/qbiocode/data_generation/make_moons.py
@@ -6,18 +6,19 @@
algorithms on non-linearly separable data with interleaving classes.
"""
-from sklearn.datasets import make_moons
-import pandas as pd
-import numpy as np
import itertools
import json
import os
+import numpy as np
+import pandas as pd
+from sklearn.datasets import make_moons
# parameters to vary across the configurations
N_SAMPLES = list(range(100, 300, 20))
NOISE = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+
def generate_moons_datasets(
n_samples=N_SAMPLES,
noise=NOISE,
@@ -26,11 +27,11 @@ def generate_moons_datasets(
):
"""
Generate multiple two-moons datasets with varying parameters.
-
+
Creates a series of 2D datasets where samples form two interleaving half-circles
(moons), providing a challenging non-linearly separable binary classification problem.
Each configuration varies the number of samples and noise level.
-
+
Parameters
----------
n_samples : list of int, default=range(100, 300, 20)
@@ -41,20 +42,20 @@ def generate_moons_datasets(
Directory path where datasets and configuration files will be saved.
random_state : int, default=42
Random seed for reproducibility.
-
+
Returns
-------
None
Saves CSV files for each dataset configuration and a JSON file with
all configuration parameters.
-
+
Notes
-----
- Each dataset is saved as 'moons_data-{i}.csv' where i is the configuration number
- Configuration parameters are saved in 'dataset_config.json'
- The last column 'class' contains binary labels (0 or 1)
- Two-moons datasets are commonly used to evaluate algorithms on interleaving patterns
-
+
Examples
--------
>>> from qbiocode.data_generation import generate_moons_datasets
@@ -62,12 +63,12 @@ def generate_moons_datasets(
Generating moons dataset...
"""
print("Generating moons dataset...")
-
+
np.random.seed(random_state)
if save_path is None:
- save_path = 'moons_data'
-
+ save_path = "moons_data"
+
if not os.path.exists(save_path):
os.makedirs(save_path)
@@ -81,28 +82,30 @@ def generate_moons_datasets(
# populate all the configs with the corresponding argument values
for n_s, n_n in configurations:
- config = "n_samples={}, noise={}".format(
- n_s, n_n,
- )
- # print(count_configs)
-
-
+ config = "n_samples={}, noise={}".format(
+ n_s,
+ n_n,
+ )
+ # print(count_configs)
+
# iteratively run the function for each combination of arguments
- X, y = make_moons(
- n_samples=n_s,
- noise=n_n,
- random_state=random_state,
+ X, y = make_moons(
+ n_samples=n_s,
+ noise=n_n,
+ random_state=random_state,
+ )
+ # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
+ dataset = pd.DataFrame(X)
+ dataset["class"] = y
+ with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile:
+ dataset_config.update(
+ {"moons_data-{}.csv".format(count_configs): {"n_samples": n_s, "noise": n_n}}
)
- # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
- dataset = pd.DataFrame(X)
- dataset['class'] = y
- with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile:
- dataset_config.update({'moons_data-{}.csv'.format(count_configs):
- {'n_samples': n_s,
- 'noise': n_n}})
- json.dump(dataset_config, outfile, indent=4)
- new_dataset = dataset.to_csv( os.path.join( save_path, 'moons_data-{}.csv'.format(count_configs)), index=False)
- count_configs += 1
- # print(X.shape)
- # print(y.shape)
+ json.dump(dataset_config, outfile, indent=4)
+ new_dataset = dataset.to_csv(
+ os.path.join(save_path, "moons_data-{}.csv".format(count_configs)), index=False
+ )
+ count_configs += 1
+ # print(X.shape)
+ # print(y.shape)
return
diff --git a/qbiocode/data_generation/make_s_curve.py b/qbiocode/data_generation/make_s_curve.py
index 9ee3f83..e1be9a5 100644
--- a/qbiocode/data_generation/make_s_curve.py
+++ b/qbiocode/data_generation/make_s_curve.py
@@ -6,18 +6,19 @@
reduction and manifold learning algorithms.
"""
-from sklearn.datasets import make_s_curve
-import pandas as pd
-import numpy as np
import itertools
import json
import os
+import numpy as np
+import pandas as pd
+from sklearn.datasets import make_s_curve
# parameters to vary across the configurations
N_SAMPLES = list(range(100, 300, 20))
NOISE = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+
def generate_s_curve_datasets(
n_samples=N_SAMPLES,
noise=NOISE,
@@ -26,11 +27,11 @@ def generate_s_curve_datasets(
):
"""
Generate multiple 3D S-curve datasets with varying parameters.
-
+
Creates a series of 3D datasets where samples lie on an S-shaped manifold,
a classic benchmark for manifold learning and dimensionality reduction algorithms.
Each configuration varies the number of samples and noise level.
-
+
Parameters
----------
n_samples : list of int, default=range(100, 300, 20)
@@ -41,20 +42,20 @@ def generate_s_curve_datasets(
Directory path where datasets and configuration files will be saved.
random_state : int, default=42
Random seed for reproducibility.
-
+
Returns
-------
None
Saves CSV files for each dataset configuration and a JSON file with
all configuration parameters.
-
+
Notes
-----
- Each dataset is saved as 's_curve_data-{i}.csv' where i is the configuration number
- Configuration parameters are saved in 'dataset_config.json'
- The last column 'class' contains the position along the manifold (continuous values)
- S-curve is a standard benchmark for testing manifold learning algorithms
-
+
Examples
--------
>>> from qbiocode.data_generation import generate_s_curve_datasets
@@ -62,12 +63,12 @@ def generate_s_curve_datasets(
Generating S Curve dataset...
"""
print("Generating S Curve dataset...")
-
+
np.random.seed(random_state)
if save_path is None:
- save_path = 's_curve_data'
-
+ save_path = "s_curve_data"
+
if not os.path.exists(save_path):
os.makedirs(save_path)
@@ -81,29 +82,30 @@ def generate_s_curve_datasets(
# populate all the configs with the corresponding argument values
for n_s, n_n in configurations:
- config = "n_samples={}, noise={}".format(
- n_s, n_n,
- )
- # print(count_configs)
-
-
+ config = "n_samples={}, noise={}".format(
+ n_s,
+ n_n,
+ )
+ # print(count_configs)
+
# iteratively run the function for each combination of arguments
- X, y = make_s_curve(
- n_samples=n_s,
- noise=n_n,
- random_state=random_state,
+ X, y = make_s_curve(
+ n_samples=n_s,
+ noise=n_n,
+ random_state=random_state,
+ )
+ # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
+ dataset = pd.DataFrame(X)
+ dataset["class"] = y
+ with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile:
+ dataset_config.update(
+ {"s_curve_data-{}.csv".format(count_configs): {"n_samples": n_s, "noise": n_n}}
)
- # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
- dataset = pd.DataFrame(X)
- dataset['class'] = y
- with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile:
- dataset_config.update({'s_curve_data-{}.csv'.format(count_configs):
- {'n_samples': n_s,
- 'noise': n_n}})
- json.dump(dataset_config, outfile, indent=4)
- new_dataset = dataset.to_csv( os.path.join( save_path, 's_curve_data-{}.csv'.format(count_configs)), index=False)
- count_configs += 1
- # print(X.shape)
- # print(y.shape)
+ json.dump(dataset_config, outfile, indent=4)
+ new_dataset = dataset.to_csv(
+ os.path.join(save_path, "s_curve_data-{}.csv".format(count_configs)), index=False
+ )
+ count_configs += 1
+ # print(X.shape)
+ # print(y.shape)
return
-
diff --git a/qbiocode/data_generation/make_spheres.py b/qbiocode/data_generation/make_spheres.py
index 16bc33a..22e2aea 100644
--- a/qbiocode/data_generation/make_spheres.py
+++ b/qbiocode/data_generation/make_spheres.py
@@ -6,19 +6,19 @@
machine learning algorithms on high-dimensional non-linearly separable data.
"""
-import numpy as np
-import matplotlib.pyplot as plt
-import pandas as pd
import itertools
import json
import os
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
-def generate_points_in_nd_sphere(n_s, dim = 3, radius=1, thresh = 0.9):
+def generate_points_in_nd_sphere(n_s, dim=3, radius=1, thresh=0.9):
"""
Generate random points within an n-dimensional spherical shell.
-
+
Parameters
----------
n_s : int
@@ -29,7 +29,7 @@ def generate_points_in_nd_sphere(n_s, dim = 3, radius=1, thresh = 0.9):
Outer radius of the spherical shell.
thresh : float, default=0.9
Inner radius threshold as fraction of outer radius (creates shell).
-
+
Returns
-------
points : ndarray of shape (n_s, dim)
@@ -40,17 +40,19 @@ def generate_points_in_nd_sphere(n_s, dim = 3, radius=1, thresh = 0.9):
while cnt < n_s:
pnts = np.random.rand(dim) * 2 * radius - radius
pnts_nrm = np.linalg.norm(pnts)
- if (pnts_nrm <= radius) & (pnts_nrm >= radius*thresh):
+ if (pnts_nrm <= radius) & (pnts_nrm >= radius * thresh):
points.append(pnts)
cnt += 1
points = np.asarray(points)
return points
+
# parameters to vary across the configurations
N_SAMPLES = list(range(100, 300, 25))
DIM = list(range(5, 15, 5))
RAD = list(range(5, 20, 5))
+
def generate_spheres_datasets(
n_s=N_SAMPLES,
dim=DIM,
@@ -60,12 +62,12 @@ def generate_spheres_datasets(
):
"""
Generate multiple concentric n-dimensional spheres datasets with varying parameters.
-
+
Creates a series of high-dimensional datasets where samples form two concentric
spherical shells, providing a challenging non-linearly separable binary classification
problem in high dimensions. Each configuration varies the number of samples,
dimensionality, and sphere radii.
-
+
Parameters
----------
n_s : list of int, default=range(100, 300, 25)
@@ -78,20 +80,20 @@ def generate_spheres_datasets(
Directory path where datasets and configuration files will be saved.
random_state : int, default=42
Random seed for reproducibility.
-
+
Returns
-------
None
Saves CSV files for each dataset configuration and a JSON file with
all configuration parameters.
-
+
Notes
-----
- Each dataset is saved as 'spheres_data-{i}.csv' where i is the configuration number
- Configuration parameters are saved in 'dataset_config.json'
- The last column 'class' contains binary labels (0 for outer, 1 for inner sphere)
- Samples are generated in spherical shells (not solid spheres) for better separation
-
+
Examples
--------
>>> from qbiocode.data_generation import generate_spheres_datasets
@@ -99,12 +101,12 @@ def generate_spheres_datasets(
Generating spheres dataset...
"""
print("Generating spheres dataset...")
-
+
np.random.seed(random_state)
if save_path is None:
- save_path = 'spheres_data'
-
+ save_path = "spheres_data"
+
if not os.path.exists(save_path):
os.makedirs(save_path)
@@ -118,38 +120,41 @@ def generate_spheres_datasets(
# populate all the configs with the corresponding argument values
for n_s, n_d, n_r in configurations:
- config = "samples={}, dimensions={}, radius={}".format(
- n_s, n_d, n_r
- )
- # print(count_configs)
- radius1 = n_r
- radius2 = radius1 * 0.5
- Xa = generate_points_in_nd_sphere(n_s, dim = n_d, radius=radius1, thresh = 0.9)
- Xb = generate_points_in_nd_sphere(n_s, dim = n_d, radius=radius2, thresh = 0.9)
- X = np.concatenate((Xa, Xb))
- y = [0]*len(Xa) + [1]*len(Xb)
-
- # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
- X_df = pd.DataFrame(X)
- y_dict = {'class':y}
- y_df = pd.DataFrame(y_dict)
- df = pd.concat([X_df, y_df], axis=1)
- with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile:
- dataset_config.update({'spheres_data-{}.csv'.format(count_configs):
+ config = "samples={}, dimensions={}, radius={}".format(n_s, n_d, n_r)
+ # print(count_configs)
+ radius1 = n_r
+ radius2 = radius1 * 0.5
+ Xa = generate_points_in_nd_sphere(n_s, dim=n_d, radius=radius1, thresh=0.9)
+ Xb = generate_points_in_nd_sphere(n_s, dim=n_d, radius=radius2, thresh=0.9)
+ X = np.concatenate((Xa, Xb))
+ y = [0] * len(Xa) + [1] * len(Xb)
+
+ # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
+ X_df = pd.DataFrame(X)
+ y_dict = {"class": y}
+ y_df = pd.DataFrame(y_dict)
+ df = pd.concat([X_df, y_df], axis=1)
+ with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile:
+ dataset_config.update(
{
- 'n_samples':n_s,
- 'dimensions': n_d,
- 'radius': n_r}})
- json.dump(dataset_config, outfile, indent=4)
- new_dataset = df.to_csv( os.path.join( save_path, 'spheres_data-{}.csv'.format(count_configs)), index=False)
- count_configs += 1
-
- # fig = plt.figure()
- # ax = fig.add_subplot(111, projection='3d')
- # # ax.scatter(X[:, 0], X[:, 1],X[:,2], c= y, cmap='viridis')
- # ax.scatter(X[:, n_d-3], X[:, n_d-2],X[:, n_d-1], c=y, cmap='viridis')
- # plt.savefig('spheres_data/spheres_data-{}.png'.format(count_configs))
- # print(X.shape)
- # print(y.shape)
- return
+ "spheres_data-{}.csv".format(count_configs): {
+ "n_samples": n_s,
+ "dimensions": n_d,
+ "radius": n_r,
+ }
+ }
+ )
+ json.dump(dataset_config, outfile, indent=4)
+ new_dataset = df.to_csv(
+ os.path.join(save_path, "spheres_data-{}.csv".format(count_configs)), index=False
+ )
+ count_configs += 1
+ # fig = plt.figure()
+ # ax = fig.add_subplot(111, projection='3d')
+ # # ax.scatter(X[:, 0], X[:, 1],X[:,2], c= y, cmap='viridis')
+ # ax.scatter(X[:, n_d-3], X[:, n_d-2],X[:, n_d-1], c=y, cmap='viridis')
+ # plt.savefig('spheres_data/spheres_data-{}.png'.format(count_configs))
+ # print(X.shape)
+ # print(y.shape)
+ return
diff --git a/qbiocode/data_generation/make_spirals.py b/qbiocode/data_generation/make_spirals.py
index 8f9c433..9857d0c 100644
--- a/qbiocode/data_generation/make_spirals.py
+++ b/qbiocode/data_generation/make_spirals.py
@@ -6,21 +6,22 @@
machine learning algorithms on complex non-linearly separable patterns.
"""
-import numpy as np
-import matplotlib.pyplot as plt
-import pandas as pd
import itertools
import json
import os
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+
def make_spirals(n_samples=5000, n_classes=2, noise=0.3, dim=3):
"""
Generate an n-dimensional dataset of intertwined spirals.
-
+
Creates spiral patterns in n-dimensional space where each class forms
a distinct spiral arm. Supports dimensions 3, 6, 9, and 12.
-
+
Parameters
----------
n_samples : int, default=5000
@@ -31,7 +32,7 @@ def make_spirals(n_samples=5000, n_classes=2, noise=0.3, dim=3):
Standard deviation of Gaussian noise added to each dimension.
dim : int, default=3
Dimensionality of the output space (must be 3, 6, 9, or 12).
-
+
Returns
-------
X : ndarray of shape (n_samples, dim)
@@ -48,30 +49,36 @@ def make_spirals(n_samples=5000, n_classes=2, noise=0.3, dim=3):
x = t * np.cos(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes)
y_ = t * np.sin(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes)
z = t + np.random.normal(0, noise, n_samples // n_classes)
- if dim==3:
- X.append(np.column_stack([x, y_, z])) # any new dimensions need to be added to this list
-
- # to add more dimensions, apparently you would just keep adding 't' variable from above, to each new dimension,
+ if dim == 3:
+ X.append(
+ np.column_stack([x, y_, z])
+ ) # any new dimensions need to be added to this list
+
+ # to add more dimensions, apparently you would just keep adding 't' variable from above, to each new dimension,
# as seen below. The question is, how can we iteratively do this while maintaining the binary classification
- # that this for loop is creating?
+ # that this for loop is creating?
# nesting a loop iterating over the number of dimensions doesn't really work from what I'm seeing. so far
# However, manually adding repeats of the same 3Ds, does work, as seen below -- is this correct?
-
- # for j in range(dim-3): # for anything above the first 3D
- if dim==6:
+
+ # for j in range(dim-3): # for anything above the first 3D
+ if dim == 6:
new_d1 = t * np.cos(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes)
new_d2 = t * np.sin(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes)
new_d3 = t + np.random.normal(0, noise, n_samples // n_classes)
- X.append(np.column_stack([x, y_, z, new_d1, new_d2, new_d3])) # any new dimensions need to be added to this list
- if dim==9:
+ X.append(
+ np.column_stack([x, y_, z, new_d1, new_d2, new_d3])
+ ) # any new dimensions need to be added to this list
+ if dim == 9:
new_d1 = t * np.cos(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes)
new_d2 = t * np.sin(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes)
new_d3 = t + np.random.normal(0, noise, n_samples // n_classes)
new_d4 = t * np.cos(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes)
new_d5 = t * np.sin(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes)
new_d6 = t + np.random.normal(0, noise, n_samples // n_classes)
- X.append(np.column_stack([x, y_, z, new_d1, new_d2, new_d3, new_d4, new_d5, new_d6])) # any new dimensions need to be added to this list
- if dim==12:
+ X.append(
+ np.column_stack([x, y_, z, new_d1, new_d2, new_d3, new_d4, new_d5, new_d6])
+ ) # any new dimensions need to be added to this list
+ if dim == 12:
new_d1 = t * np.cos(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes)
new_d2 = t * np.sin(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes)
new_d3 = t + np.random.normal(0, noise, n_samples // n_classes)
@@ -81,7 +88,24 @@ def make_spirals(n_samples=5000, n_classes=2, noise=0.3, dim=3):
new_d7 = t * np.cos(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes)
new_d8 = t * np.sin(t + i * np.pi) + np.random.normal(0, noise, n_samples // n_classes)
new_d9 = t + np.random.normal(0, noise, n_samples // n_classes)
- X.append(np.column_stack([x, y_, z, new_d1, new_d2, new_d3, new_d4, new_d5, new_d6, new_d7, new_d8, new_d9]))
+ X.append(
+ np.column_stack(
+ [
+ x,
+ y_,
+ z,
+ new_d1,
+ new_d2,
+ new_d3,
+ new_d4,
+ new_d5,
+ new_d6,
+ new_d7,
+ new_d8,
+ new_d9,
+ ]
+ )
+ )
y.extend([i] * (n_samples // n_classes))
return np.vstack(X), np.array(y)
@@ -93,6 +117,7 @@ def make_spirals(n_samples=5000, n_classes=2, noise=0.3, dim=3):
NOISE = [0.3, 0.6, 0.9]
DIM = [3, 6, 9, 12]
+
def generate_spirals_datasets(
n_s=N_SAMPLES,
n_c=N_CLASSES,
@@ -103,12 +128,12 @@ def generate_spirals_datasets(
):
"""
Generate multiple n-dimensional spiral datasets with varying parameters.
-
+
Creates a series of high-dimensional datasets where samples form intertwined
spiral patterns, providing challenging non-linearly separable multi-class
classification problems. Each configuration varies the number of samples,
classes, noise level, and dimensionality.
-
+
Parameters
----------
n_s : list of int, default=range(100, 300, 50)
@@ -123,20 +148,20 @@ def generate_spirals_datasets(
Directory path where datasets and configuration files will be saved.
random_state : int, default=42
Random seed for reproducibility.
-
+
Returns
-------
None
Saves CSV files for each dataset configuration and a JSON file with
all configuration parameters.
-
+
Notes
-----
- Each dataset is saved as 'spirals_data-{i}.csv' where i is the configuration number
- Configuration parameters are saved in 'dataset_config.json'
- The last column 'class' contains class labels
- Spiral patterns become increasingly complex in higher dimensions
-
+
Examples
--------
>>> from qbiocode.data_generation import generate_spirals_datasets
@@ -144,12 +169,12 @@ def generate_spirals_datasets(
Generating spirals dataset...
"""
print("Generating spirals dataset...")
-
+
np.random.seed(random_state)
if save_path is None:
- save_path = 'spirals_data'
-
+ save_path = "spirals_data"
+
if not os.path.exists(save_path):
os.makedirs(save_path)
@@ -161,38 +186,36 @@ def generate_spirals_datasets(
dataset_config = {}
- # populate all the configs with the corresponding argument values
+ # populate all the configs with the corresponding argument values
for n_s, n_c, n_n, n_d in configurations:
- config = "samples={}, classes={}, noise={}, dimensions={}".format(
- n_s, n_c, n_n, n_d
- )
- # print(count_configs)
-
- X, y = make_spirals(
- n_samples=n_s,
- n_classes=n_c,
- noise=n_n,
- dim=n_d
+ config = "samples={}, classes={}, noise={}, dimensions={}".format(n_s, n_c, n_n, n_d)
+ # print(count_configs)
+
+ X, y = make_spirals(n_samples=n_s, n_classes=n_c, noise=n_n, dim=n_d)
+ # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
+ dataset = pd.DataFrame(X)
+ dataset["class"] = y
+ with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile:
+ dataset_config.update(
+ {
+ "spirals_data-{}.csv".format(count_configs): {
+ "n_samples": n_s,
+ "noise": n_n,
+ "dimensions": n_d,
+ }
+ }
)
- # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
- dataset = pd.DataFrame(X)
- dataset['class'] = y
- with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile:
- dataset_config.update({'spirals_data-{}.csv'.format(count_configs):
- {'n_samples': n_s,
- 'noise': n_n,
- 'dimensions': n_d
- }})
- json.dump(dataset_config, outfile, indent=4)
- new_dataset = dataset.to_csv( os.path.join( save_path, 'spirals_data-{}.csv'.format(count_configs)), index=False)
- count_configs += 1
-
- # plot the last 3 dimensions in each case
- # fig = plt.figure()
- # ax = fig.add_subplot(111, projection='3d')
- # ax.scatter(X[:, n_d-3], X[:, n_d-2],X[:, n_d-1], c=y, cmap='viridis')
- # plt.savefig('spirals_data/spirals_data-{}.png'.format(count_configs))
- #print(X.shape)
- #print(y.shape)
- return
+ json.dump(dataset_config, outfile, indent=4)
+ new_dataset = dataset.to_csv(
+ os.path.join(save_path, "spirals_data-{}.csv".format(count_configs)), index=False
+ )
+ count_configs += 1
+ # plot the last 3 dimensions in each case
+ # fig = plt.figure()
+ # ax = fig.add_subplot(111, projection='3d')
+ # ax.scatter(X[:, n_d-3], X[:, n_d-2],X[:, n_d-1], c=y, cmap='viridis')
+ # plt.savefig('spirals_data/spirals_data-{}.png'.format(count_configs))
+ # print(X.shape)
+ # print(y.shape)
+ return
diff --git a/qbiocode/data_generation/make_swiss_roll.py b/qbiocode/data_generation/make_swiss_roll.py
index 0c9b6f4..824aa21 100644
--- a/qbiocode/data_generation/make_swiss_roll.py
+++ b/qbiocode/data_generation/make_swiss_roll.py
@@ -6,19 +6,20 @@
dimensionality reduction and manifold learning algorithms.
"""
-from sklearn.datasets import make_swiss_roll
-import pandas as pd
-import numpy as np
import itertools
import json
import os
+import numpy as np
+import pandas as pd
+from sklearn.datasets import make_swiss_roll
# parameters to vary across the configurations
N_SAMPLES = list(range(100, 300, 20))
NOISE = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
HOLE = [True, False]
+
def generate_swiss_roll_datasets(
n_samples=N_SAMPLES,
noise=NOISE,
@@ -28,12 +29,12 @@ def generate_swiss_roll_datasets(
):
"""
Generate multiple 3D Swiss roll datasets with varying parameters.
-
+
Creates a series of 3D datasets where samples lie on a Swiss roll manifold,
a classic benchmark for manifold learning and dimensionality reduction algorithms.
Each configuration varies the number of samples, noise level, and whether the
roll has a hole in the center.
-
+
Parameters
----------
n_samples : list of int, default=range(100, 300, 20)
@@ -46,20 +47,20 @@ def generate_swiss_roll_datasets(
Directory path where datasets and configuration files will be saved.
random_state : int, default=42
Random seed for reproducibility.
-
+
Returns
-------
None
Saves CSV files for each dataset configuration and a JSON file with
all configuration parameters.
-
+
Notes
-----
- Each dataset is saved as 'swiss_roll_data-{i}.csv' where i is the configuration number
- Configuration parameters are saved in 'dataset_config.json'
- The last column 'class' contains the position along the manifold (continuous values)
- Swiss roll is a standard benchmark for testing manifold learning algorithms
-
+
Examples
--------
>>> from qbiocode.data_generation import generate_swiss_roll_datasets
@@ -67,12 +68,12 @@ def generate_swiss_roll_datasets(
Generating swiss roll dataset...
"""
print("Generating swiss roll dataset...")
-
+
np.random.seed(random_state)
if save_path is None:
- save_path = 'swiss_roll_data'
-
+ save_path = "swiss_roll_data"
+
if not os.path.exists(save_path):
os.makedirs(save_path)
@@ -86,31 +87,34 @@ def generate_swiss_roll_datasets(
# populate all the configs with the corresponding argument values
for n_s, n_n, n_h in configurations:
- config = "n_samples={}, noise={}, hole={}".format(
- n_s, n_n, n_h
- )
- # print(count_configs)
-
-
+ config = "n_samples={}, noise={}, hole={}".format(n_s, n_n, n_h)
+ # print(count_configs)
+
# iteratively run the function for each combination of arguments
- X, y = make_swiss_roll(
- n_samples=n_s,
- noise=n_n,
- hole=n_h,
- random_state=random_state,
+ X, y = make_swiss_roll(
+ n_samples=n_s,
+ noise=n_n,
+ hole=n_h,
+ random_state=random_state,
+ )
+ # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
+ dataset = pd.DataFrame(X)
+ dataset["class"] = y
+ with open(os.path.join(save_path, "dataset_config.json"), "w") as outfile:
+ dataset_config.update(
+ {
+ "swiss_roll_data-{}.csv".format(count_configs): {
+ "n_samples": n_s,
+ "noise": n_n,
+ "hole": n_h,
+ }
+ }
)
- # print("Configuration {}/{}: {}".format(count_configs, len(configurations), config))
- dataset = pd.DataFrame(X)
- dataset['class'] = y
- with open( os.path.join( save_path, 'dataset_config.json' ), 'w') as outfile:
- dataset_config.update({'swiss_roll_data-{}.csv'.format(count_configs):
- {'n_samples': n_s,
- 'noise': n_n,
- 'hole': n_h}})
- json.dump(dataset_config, outfile, indent=4)
- new_dataset = dataset.to_csv( os.path.join( save_path, 'swiss_roll_data-{}.csv'.format(count_configs)), index=False)
- count_configs += 1
- # print(X.shape)
- # print(y.shape)
+ json.dump(dataset_config, outfile, indent=4)
+ new_dataset = dataset.to_csv(
+ os.path.join(save_path, "swiss_roll_data-{}.csv".format(count_configs)), index=False
+ )
+ count_configs += 1
+ # print(X.shape)
+ # print(y.shape)
return
-
diff --git a/qbiocode/embeddings/__init__.py b/qbiocode/embeddings/__init__.py
index af0664a..bd6f300 100644
--- a/qbiocode/embeddings/__init__.py
+++ b/qbiocode/embeddings/__init__.py
@@ -7,12 +7,12 @@
and quantum feature maps.
Available Functions
-------------------
+-------------------
- get_embeddings: Compute embeddings using various methods (PCA, t-SNE, UMAP, etc.)
- pqk: Projected Quantum Kernel embedding
Available Classes
-----------------
+-----------------
- ConvAutoencoder: Convolutional autoencoder for dimensionality reduction
Usage
@@ -24,11 +24,11 @@
>>> X_pqk = pqk(X, n_components=4)
"""
-from .embed import get_embeddings, pqk
from .compute_autoencoder import ConvAutoencoder
+from .embed import get_embeddings, pqk
__all__ = [
- 'get_embeddings',
- 'pqk',
- 'ConvAutoencoder',
+ "get_embeddings",
+ "pqk",
+ "ConvAutoencoder",
]
diff --git a/qbiocode/embeddings/compute_autoencoder.py b/qbiocode/embeddings/compute_autoencoder.py
index 6653fcf..32b7470 100644
--- a/qbiocode/embeddings/compute_autoencoder.py
+++ b/qbiocode/embeddings/compute_autoencoder.py
@@ -2,11 +2,12 @@
import torch.nn as nn
import torch.optim as optim
+
# Define the Autoencoder Model
class ConvAutoencoder(nn.Module):
def __init__(self):
super(ConvAutoencoder, self).__init__()
-
+
# Encoder
self.encoder = nn.Sequential(
nn.Conv2d(7, 64, kernel_size=3, stride=2, padding=1), # (64, 192, 192)
@@ -17,25 +18,35 @@ def __init__(self):
nn.ReLU(),
nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1), # (512, 24, 24)
nn.ReLU(),
- nn.Conv2d(512, 7, kernel_size=3, stride=2, padding=1), # (7, 16, 16)
- nn.ReLU()
+ nn.Conv2d(512, 7, kernel_size=3, stride=2, padding=1), # (7, 16, 16)
+ nn.ReLU(),
)
-
+
# Decoder
self.decoder = nn.Sequential(
- nn.ConvTranspose2d(7, 512, kernel_size=3, stride=2, padding=1, output_padding=1), # (512, 24, 24)
- nn.ReLU(),
- nn.ConvTranspose2d(512, 256, kernel_size=3, stride=2, padding=1, output_padding=1), # (256, 48, 48)
- nn.ReLU(),
- nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1, output_padding=1), # (128, 96, 96)
- nn.ReLU(),
- nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1), # (64, 192, 192)
- nn.ReLU(),
- nn.ConvTranspose2d(64, 7, kernel_size=3, stride=2, padding=1, output_padding=1), # (7, 384, 384)
- nn.Sigmoid()
+ nn.ConvTranspose2d(
+ 7, 512, kernel_size=3, stride=2, padding=1, output_padding=1
+ ), # (512, 24, 24)
+ nn.ReLU(),
+ nn.ConvTranspose2d(
+ 512, 256, kernel_size=3, stride=2, padding=1, output_padding=1
+ ), # (256, 48, 48)
+ nn.ReLU(),
+ nn.ConvTranspose2d(
+ 256, 128, kernel_size=3, stride=2, padding=1, output_padding=1
+ ), # (128, 96, 96)
+ nn.ReLU(),
+ nn.ConvTranspose2d(
+ 128, 64, kernel_size=3, stride=2, padding=1, output_padding=1
+ ), # (64, 192, 192)
+ nn.ReLU(),
+ nn.ConvTranspose2d(
+ 64, 7, kernel_size=3, stride=2, padding=1, output_padding=1
+ ), # (7, 384, 384)
+ nn.Sigmoid(),
)
-
+
def forward(self, x):
latent = self.encoder(x)
reconstructed = self.decoder(latent)
- return reconstructed
\ No newline at end of file
+ return reconstructed
diff --git a/qbiocode/embeddings/embed.py b/qbiocode/embeddings/embed.py
index a4b5391..3b6b087 100644
--- a/qbiocode/embeddings/embed.py
+++ b/qbiocode/embeddings/embed.py
@@ -1,34 +1,40 @@
-import numpy as np
import os
+from functools import reduce
+
+import numpy as np
+# ====== Qiskit imports ======
+from qiskit import QuantumCircuit
+from qiskit.quantum_info import Pauli
# ====== Embedding functions imports ======
-from sklearn.decomposition import PCA
-from sklearn.decomposition import NMF
-from sklearn.manifold import (
- Isomap,
- LocallyLinearEmbedding,
- SpectralEmbedding,
-)
+from sklearn.decomposition import NMF, PCA
+from sklearn.manifold import Isomap, LocallyLinearEmbedding, SpectralEmbedding
from umap import UMAP
-from functools import reduce
-
-# ====== Qiskit imports ======
-from qiskit import QuantumCircuit
import qbiocode.utils.qutils as qutils
-from qiskit.quantum_info import Pauli
-def pqk(X_train, X_test, args, store = False, data_key = '',
- encoding = 'Z', data_map=True, primitive = 'estimator', entanglement = 'linear', reps= 2):
+
+def pqk(
+ X_train,
+ X_test,
+ args,
+ store=False,
+ data_key="",
+ encoding="Z",
+ data_map=True,
+ primitive="estimator",
+ entanglement="linear",
+ reps=2,
+):
"""
This function generates quantum circuits, computes projections of the data onto these circuits.
It uses a feature map to encode the data into quantum states and then measures the expectation values
- of Pauli operators to obtain the features.
+ of Pauli operators to obtain the features.
This function requires a quantum backend (simulator or real quantum hardware) for execution.
It supports various configurations such as encoding methods, entanglement strategies, and repetitions
of the feature map. Optionally the results are saved to files for training and test projections.
-
+
Args:
X_train (np.ndarray): Training data features.
X_test (np.ndarray): Test data features.
@@ -47,8 +53,8 @@ def pqk(X_train, X_test, args, store = False, data_key = '',
feat_dimension = X_train.shape[1]
- if data_map:
- # This function ensures that all multiplicative factors of data features inside single qubit gates are 1.0
+ if data_map:
+ # This function ensures that all multiplicative factors of data features inside single qubit gates are 1.0
def data_map_func(x: np.ndarray) -> float:
"""
Define a function map from R^n to R.
@@ -60,70 +66,82 @@ def data_map_func(x: np.ndarray) -> float:
float: the mapped value
"""
coeff = x[0] / 2 if len(x) == 1 else reduce(lambda m, n: (m * n) / 2, x)
- return coeff
+ return float(coeff)
+
else:
data_map_func = None
-
- # choose a method for mapping your features onto the circuit
- feature_map, _ = qutils.get_feature_map(feature_map=encoding,
- feat_dimension=X_train.shape[1],
- reps = reps,
- entanglement=entanglement,
- data_map_func = data_map_func)
+
+ # choose a method for mapping your features onto the circuit
+ feature_map, _ = qutils.get_feature_map(
+ feature_map=encoding,
+ feat_dimension=X_train.shape[1],
+ reps=reps,
+ entanglement=entanglement,
+ data_map_func=data_map_func,
+ )
# Build quantum circuit
circuit = QuantumCircuit(feature_map.num_qubits)
circuit.compose(feature_map, inplace=True)
num_qubits = circuit.num_qubits
-
# Generate the backend, session and primitive
- backend, session, prim = qutils.get_backend_session(args,
- 'estimator',
- num_qubits=num_qubits)
+ backend, session, prim = qutils.get_backend_session(args, "estimator", num_qubits=num_qubits)
# Transpile
- if args['backend'] != 'simulator':
- circuit = qutils.transpile_circuit( circuit, opt_level=3, backend = backend,
- PT = True, initial_layout = None)
+ if args["backend"] != "simulator":
+ circuit = qutils.transpile_circuit(
+ circuit, opt_level=3, backend=backend, PT=True, initial_layout=None
+ )
- for f_tr in ['train', 'test']:
-
- if 'train' in f_tr:
+ for f_tr in ["train", "test"]:
+
+ if "train" in f_tr:
dat = X_train.copy()
else:
dat = X_test.copy()
-
+
# Identity operator on all qubits
- id = 'I' * feat_dimension
+ id = "I" * feat_dimension
# We group all commuting observables
# These groups are the Pauli X, Y and Z operators on individual qubits
# Apply the circuit layout to the observable if mapped to device
- if args['backend'] != 'simulator':
- observables_x =[]
- observables_y =[]
- observables_z =[]
+ if args["backend"] != "simulator":
+ observables_x = []
+ observables_y = []
+ observables_z = []
for i in range(feat_dimension):
- observables_x.append( Pauli(id[:i] + 'X' + id[(i + 1):]).apply_layout(circuit.layout, num_qubits=backend.num_qubits) )
- observables_y.append( Pauli(id[:i] + 'Y' + id[(i + 1):]).apply_layout(circuit.layout, num_qubits=backend.num_qubits) )
- observables_z.append( Pauli(id[:i] + 'Z' + id[(i + 1):]).apply_layout(circuit.layout, num_qubits=backend.num_qubits) )
+ observables_x.append(
+ Pauli(id[:i] + "X" + id[(i + 1) :]).apply_layout(
+ circuit.layout, num_qubits=backend.num_qubits
+ )
+ )
+ observables_y.append(
+ Pauli(id[:i] + "Y" + id[(i + 1) :]).apply_layout(
+ circuit.layout, num_qubits=backend.num_qubits
+ )
+ )
+ observables_z.append(
+ Pauli(id[:i] + "Z" + id[(i + 1) :]).apply_layout(
+ circuit.layout, num_qubits=backend.num_qubits
+ )
+ )
else:
- observables_x = [Pauli(id[:i] + 'X' + id[(i + 1):]) for i in range(feat_dimension)]
- observables_y = [Pauli(id[:i] + 'Y' + id[(i + 1):]) for i in range(feat_dimension)]
- observables_z = [Pauli(id[:i] + 'Z' + id[(i + 1):]) for i in range(feat_dimension)]
-
-
+ observables_x = [Pauli(id[:i] + "X" + id[(i + 1) :]) for i in range(feat_dimension)]
+ observables_y = [Pauli(id[:i] + "Y" + id[(i + 1) :]) for i in range(feat_dimension)]
+ observables_z = [Pauli(id[:i] + "Z" + id[(i + 1) :]) for i in range(feat_dimension)]
+
# projections[i][j][k] will be the expectation value of the j-th Pauli operator (0: X, 1: Y, 2: Z)
# of datapoint i on qubit k
projections = []
for i in range(len(dat)):
-
- # Get training sample
+
+ # Get training sample
parameters = dat[i]
- # We define the primitive unified blocs (PUBs) consisting of the embedding circuit,
+ # We define the primitive unified blocs (PUBs) consisting of the embedding circuit,
# set of observables and the circuit parameters
pub_x = (circuit, observables_x, parameters)
pub_y = (circuit, observables_y, parameters)
@@ -135,28 +153,30 @@ def data_map_func(x: np.ndarray) -> float:
job_result_z = job.result()[2].data.evs
# Record the X, Y and Z expectation values on all qubits for the current datapoint
- projections.append([job_result_x, job_result_y, job_result_z])
-
+ projections.append([job_result_x, job_result_y, job_result_z])
+
if store:
- if not os.path.exists( 'pqk_projections'):
- os.makedirs('pqk_projections')
+ if not os.path.exists("pqk_projections"):
+ os.makedirs("pqk_projections")
- file_projection = os.path.join( 'pqk_projections', 'pqk_projection_' + data_key + '_'+f_tr+'.npy')
-
- np.save( file_projection, projections )
+ file_projection = os.path.join(
+ "pqk_projections", "pqk_projection_" + data_key + "_" + f_tr + ".npy"
+ )
- if 'train' in f_tr:
+ np.save(file_projection, projections)
+
+ if "train" in f_tr:
X_train_prj = np.array(projections.copy()).reshape(len(projections), -1)
else:
X_test_prj = np.array(projections.copy()).reshape(len(projections), -1)
-
+
if not isinstance(session, type(None)):
session.close()
return X_train_prj, X_test_prj
-def get_embeddings(embedding: str, X_train, X_test, n_neighbors=30, n_components=None, method=None):
+def get_embeddings(embedding: str, X_train, X_test, n_neighbors=30, n_components=None, method=None):
"""This function applies the specified embedding technique to the training and test datasets.
Args:
@@ -166,55 +186,50 @@ def get_embeddings(embedding: str, X_train, X_test, n_neighbors=30, n_components
n_neighbors (int, optional): Number of neighbors for certain embeddings. Defaults to 30.
n_components (int, optional): Number of components for the embedding. If None, it defaults to the number of features in X_train.
method (str, optional): Method for Locally Linear Embedding. Defaults to None.
-
+
Returns:
tuple: Transformed training and test datasets.
"""
- embedding = embedding.lower()
- valid_modes = ['none', 'pca', 'lle', 'isomap', 'spectral', 'umap', 'nmf']
+ embedding = embedding.lower()
+ valid_modes = ["none", "pca", "lle", "isomap", "spectral", "umap", "nmf"]
if embedding not in valid_modes:
raise ValueError(f"Invalid mode: {embedding}. Mode must be one of {valid_modes}")
-
- assert n_components <= X_train.shape[1], "number of components greater than number of feature in the dataset"
- if 'none' == embedding:
+ assert (
+ n_components <= X_train.shape[1]
+ ), "number of components greater than number of feature in the dataset"
+ if "none" == embedding:
return X_train, X_test
else:
embedding_model = None
- if 'pca' == embedding:
- embedding_model = PCA(
- n_components=n_components)
- elif 'nmf' == embedding:
- embedding_model = NMF(
- n_components=n_components)
- elif 'lle' == embedding:
- if method==None:
+ if "pca" == embedding:
+ embedding_model = PCA(n_components=n_components)
+ elif "nmf" == embedding:
+ embedding_model = NMF(n_components=n_components)
+ elif "lle" == embedding:
+ if method == None:
embedding_model = LocallyLinearEmbedding(
- n_neighbors=n_neighbors,
- n_components=n_components,
- method='standard')
- else:
+ n_neighbors=n_neighbors, n_components=n_components, method="standard"
+ )
+ else:
embedding_model = LocallyLinearEmbedding(
- n_neighbors=n_neighbors,
- n_components=n_components,
- method='modified')
- elif 'isomap' == embedding:
+ n_neighbors=n_neighbors, n_components=n_components, method="modified"
+ )
+ elif "isomap" == embedding:
embedding_model = Isomap(
- n_neighbors=n_neighbors,
- n_components=n_components,
- )
- elif 'spectral' == embedding:
- embedding_model = SpectralEmbedding(
- n_components=n_components,
- eigen_solver="arpack")
- elif 'umap' == embedding:
+ n_neighbors=n_neighbors,
+ n_components=n_components,
+ )
+ elif "spectral" == embedding:
+ embedding_model = SpectralEmbedding(n_components=n_components, eigen_solver="arpack")
+ elif "umap" == embedding:
embedding_model = UMAP(
- n_neighbors=n_neighbors,
- n_components=n_components,
- )
+ n_neighbors=n_neighbors,
+ n_components=n_components,
+ )
X_train = embedding_model.fit_transform(X_train)
X_test = embedding_model.transform(X_test)
-
- return X_train, X_test
\ No newline at end of file
+
+ return X_train, X_test
diff --git a/qbiocode/evaluation/__init__.py b/qbiocode/evaluation/__init__.py
index bdeb755..eddfc06 100644
--- a/qbiocode/evaluation/__init__.py
+++ b/qbiocode/evaluation/__init__.py
@@ -7,7 +7,7 @@
dataset complexity analysis, and automated model execution.
Available Functions
-------------------
+-------------------
- modeleval: Evaluate model performance with multiple metrics
- evaluate: Comprehensive dataset complexity evaluation
- model_run: Automated model training and evaluation pipeline
@@ -21,12 +21,12 @@
>>> complexity_metrics = evaluate(X, y)
"""
-from .model_evaluation import modeleval
from .dataset_evaluation import evaluate
+from .model_evaluation import modeleval
from .model_run import model_run
__all__ = [
- 'modeleval',
- 'evaluate',
- 'model_run',
+ "modeleval",
+ "evaluate",
+ "model_run",
]
diff --git a/qbiocode/evaluation/dataset_evaluation.py b/qbiocode/evaluation/dataset_evaluation.py
index 61647d0..b86b5d7 100644
--- a/qbiocode/evaluation/dataset_evaluation.py
+++ b/qbiocode/evaluation/dataset_evaluation.py
@@ -1,23 +1,24 @@
# ====== Base class imports ======
+import warnings
+
+import hfda
import numpy as np
import pandas as pd
-import hfda
+from scipy.linalg import eigvals, inv, norm
+from scipy.spatial import ConvexHull as CH
# ====== Scipy imports ======
from scipy.stats import entropy
-from scipy.linalg import norm, inv, eigvals
-from scipy.spatial import ConvexHull as CH
+from skdim import id
+from skdim.id import lPCA
# ====== Scikit-learn imports ======
from sklearn import datasets
-from skdim import id
-from skdim.id import lPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-from sklearn.feature_selection import mutual_info_classif, VarianceThreshold
-from sklearn.neighbors import KernelDensity
+from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
from sklearn.manifold import Isomap
+from sklearn.neighbors import KernelDensity
-import warnings
# df = pd.DataFrame(X)
def get_dimensions(df):
@@ -29,15 +30,16 @@ def get_dimensions(df):
- num_features (int): Number of features in the DataFrame
- num_samples (int): Number of samples in the DataFrame
- ratio (float): Feature-to-sample ratio
- """
+ """
# number of features
num_features = df.shape[1]
# of samples
num_samples = df.shape[0]
- # feature-to-sample ratio
- ratio = num_features/num_samples
-
- return num_features, num_samples, ratio
+ # feature-to-sample ratio
+ ratio = num_features / num_samples
+
+ return num_features, num_samples, ratio
+
def get_intrinsic_dim(df):
"""Get intrinsic dimension of the data using lPCA from skdim.
@@ -45,79 +47,85 @@ def get_intrinsic_dim(df):
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
float: Intrinsic dimension of the data
- """
+ """
# Intrinsic dimension, calculated via scikit-dimension's PCA method
- pca = id.lPCA() # Initialize the PCA estimator from skdim
- pca.fit(df) # Fit the estimator to your data
- return pca.dimension_
+ pca = id.lPCA() # Initialize the PCA estimator from skdim
+ pca.fit(df) # Fit the estimator to your data
+ return pca.dimension_
+
def get_condition_number(df):
- """Get condition number of a matrix.
- A function with a high condition number is said to be ill-conditioned.
- Ill conditioned matrices produce large errors in its output even with small errors in its input.
- Low condition number means more stable errors.
+ """Get the condition number of a matrix.
+
+ A high condition number indicates that the matrix is ill-conditioned and
+ can produce large output errors even for small input perturbations. A low
+ condition number indicates a more stable matrix.
+
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
- float: condition number of the matrix represented in df
+ float: Condition number of the matrix represented in ``df``.
"""
- # In general,
- # meaning that it can produce large errors in its output even with small errors in its input.
+ # A function with a high condition number is said to be ill-conditioned,
+ # meaning that it can produce large errors in its output even with small errors in its input.
# Conversely, a function with a low condition number is well-conditioned and more stable in terms of its output.
return np.linalg.cond(df)
-def get_fdr(df,y):
- """Calculate Fisher Discriminant Ratio for a given dataset.
+
+def get_fdr(df, y):
+ """Calculate Fisher Discriminant Ratio for a given dataset.
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
y (int): supervised binary class label
-
+
Returns:
float: Fisher Discriminant ratio
"""
X = df.values
class_labels = np.unique(y)
n_classes = len(class_labels)
- FDR = 0
-
- if n_classes != 2:
+ FDR = 0
+
+ if n_classes != 2:
warnings.warn("WARNING: Fisher Discriminant Ratio is only defined for binary classes. ")
- else:
- mean1 = np.mean(X[y == class_labels[0]], axis=0) #mean for class1
- mean2 = np.mean(X[y == class_labels[1]], axis=0) #mean for class2
-
- #calculate within-class scatter matrices
+ else:
+ mean1 = np.mean(X[y == class_labels[0]], axis=0) # mean for class1
+ mean2 = np.mean(X[y == class_labels[1]], axis=0) # mean for class2
+
+ # calculate within-class scatter matrices
scatter_within = np.zeros((X.shape[1], X.shape[1]))
- for label in class_labels:
+ for label in class_labels:
X_class = X[y == label]
scatter_within += np.cov(X_class.T)
-
- #calculate between-class scatter matrix
+
+ # calculate between-class scatter matrix
scatter_between = np.outer(mean1 - mean2, mean1 - mean2)
-
- #compute FDR
- FDR = np.trace(scatter_between)/np.trace(scatter_within)
-
- return FDR
-
+
+ # compute FDR
+ FDR = np.trace(scatter_between) / np.trace(scatter_within)
+
+ return FDR
+
+
def get_total_correlation(df):
- """Calculate Total Correlation
-
+ """Calculate Total Correlation
+
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
float: Total correlation
"""
- corr_matrix = df.corr() #correlation matrix
- #total correlation by subtracting diagonal values to remove self-correlation
- total_correlation = corr_matrix.abs().sum().sum() - len(df.columns)
-
+ corr_matrix = df.corr() # correlation matrix
+ # total correlation by subtracting diagonal values to remove self-correlation
+ total_correlation = corr_matrix.abs().sum().sum() - len(df.columns)
+
return total_correlation
-def get_mutual_information(df, y):
+
+def get_mutual_information(df, y):
"""Calculate mutual information via sklearn
Args:
@@ -128,10 +136,11 @@ def get_mutual_information(df, y):
float: Mutual information
"""
mutual_info = np.mean(mutual_info_classif(df, y))
-
+
return mutual_info
-def get_variance(df):
+
+def get_variance(df):
"""Get variance
Args:
@@ -144,10 +153,11 @@ def get_variance(df):
variations = round(df.var(), 2)
avg_var = variations.mean()
std_var = variations.std()
-
+
return avg_var, std_var
-def get_coefficient_var(df):
+
+def get_coefficient_var(df):
"""Get coefficient of variation
Args:
@@ -160,45 +170,48 @@ def get_coefficient_var(df):
co_of_v = (df.std() / df.mean()) * 100
avg_co_of_v = co_of_v.mean()
std_co_of_v = co_of_v.std()
-
+
return avg_co_of_v, std_co_of_v
-def get_nnz(df):
+
+def get_nnz(df):
"""Calculate nonzero values in the data
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
Returns:
- int: nonzero count
+ int: nonzero count
"""
return np.count_nonzero(df.values)
-def get_low_var_features(df, num_features):
+
+def get_low_var_features(df, num_features):
"""Count the low-variance features in the data
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
num_features (int): number of features in the dataset
-
+
Raises:
ValueError: If no feature is strong enough to keep
Returns:
int: count of features with low variance
"""
-
+
threshold = np.percentile(df.var(), 25)
-
+
try:
- low_var_features = num_features - VarianceThreshold(threshold).fit(df).get_support().sum()
+ low_var_features = num_features - VarianceThreshold(threshold).fit(df).get_support().sum()
except ValueError:
print("No feature is strong enough to keep")
low_var_features = None
-
+
return low_var_features
-def get_log_density(df):
+
+def get_log_density(df):
"""Calculate the mean log density of the data
Args:
@@ -207,28 +220,31 @@ def get_log_density(df):
Returns:
float: mean log kernel density
"""
- kde = KernelDensity(bandwidth=0.2, kernel='gaussian').fit(df) # Create a KernelDensity estimator and fit the estimator to the data
+ kde = KernelDensity(bandwidth=0.2, kernel="gaussian").fit(
+ df
+ ) # Create a KernelDensity estimator and fit the estimator to the data
log_density = kde.score_samples(df)
-
+
return log_density.mean()
+
def get_fractal_dim(df, k_max):
"""Calculate the fractal dimension of the data using Higuchi's method
-
+
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
k_max (int): Maximum number of k values to use in the calculation
-
+
Returns:
float: Fractal dimension of the data
"""
FD = hfda.measure(df, k_max)
-
- return FD
+ return FD
-def get_moments(df):
- """Compute third and fourth order moments of the data
+
+def get_moments(df):
+ """Compute third and fourth order moments of the data
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
@@ -247,80 +263,83 @@ def get_moments(df):
kurt = df.kurtosis()
avg_kurt = kurt.mean()
std_kurt = kurt.std()
-
- return avg_skew, std_skew, avg_kurt, std_kurt
-def get_entropy(y):
+ return avg_skew, std_skew, avg_kurt, std_kurt
+
+
+def get_entropy(y):
"""Calculate entropy of the target variable
Args:
- y (int): supervised binary class label
-
- Returns:
- avg_y_entropy (float): mean entropy
- std_y_entropy (flat): standard deviation of entropy
+ y (int): supervised binary class label
+
+ Returns:
+ avg_y_entropy (float): mean entropy
+ std_y_entropy (float): standard deviation of entropy
"""
- y_entropy = entropy(np.bincount(y), base=2) # Compute the entropy of the target variable (y)
+ y_entropy = entropy(np.bincount(y), base=2) # Compute the entropy of the target variable (y)
avg_y_entropy = y_entropy.mean()
std_y_entropy = y_entropy.std()
-
+
return avg_y_entropy, std_y_entropy
-def get_volume(df):
- """Get volume of the data from Convex Hull
+
+def get_volume(df):
+ """Get volume of the data from Convex Hull
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
-
- Returns:
- volume (float): Volume of the space spanned by the features of the data
+
+ Returns:
+ volume (float): Volume of the space spanned by the features of the data
"""
-
- vol = 0
- if df.shape[0] <= df.shape[1]:
+
+ vol = 0
+ if df.shape[0] <= df.shape[1]:
warnings.warn("Convex Hull requires number of observations > number of features")
- else:
- vol = CH(df, qhull_options='QJ').volume
-
+ else:
+ vol = CH(df, qhull_options="QJ").volume
+
return vol
-def get_complexity(df, n_neighbors=10, n_components=2):
- """ Measure the manifold complexity by fitting Isomap and analyzing the geodesic vs. Euclidean distances.
+
+def get_complexity(df, n_neighbors=10, n_components=2):
+ """Measure the manifold complexity by fitting Isomap and analyzing the geodesic vs. Euclidean distances.
This function computes the reconstruction error of the Isomap algorithm, which serves as an indicator of the complexity of the manifold represented by the data.
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
n_neighbors: Number of neighbors for the Isomap algorithm. Default value 10
n_components: Number of components (dimensions) for Isomap projection. Default value 2
-
+
Returns:
- reconstruction_error: float
The reconstruction error of the Isomap model, which indicates the complexity of the manifold.
- reconstruction_error: The residual error of geodesic distances
"""
-
+
isomap = Isomap(n_neighbors=10, n_components=2)
isomap.fit(df.values)
-
- #reconstruction error - an indicator of complexity
+
+ # reconstruction error - an indicator of complexity
reconstruction_error = isomap.reconstruction_error()
-
+
return reconstruction_error
-
+
def evaluate(df, y, file):
"""This function evaluates a dataset and returns a transposed summary DataFrame with various statistical measures, derived from the dataset.
- Using the functions defined above, it computes intrinsic dimension, condition number, Fisher Discriminant Ratio, total correlation, mutual information, variance, coefficient of variation,
+ Using the functions defined above, it computes intrinsic dimension, condition number, Fisher Discriminant Ratio, total correlation, mutual information, variance, coefficient of variation,
data sparsity, low variance features, data density, fractal dimension, data distributions (skewness and kurtosis), entropy of the target variable, and manifold complexity.
The summary DataFrame is transposed for easier readability and contains the dataset name, number of features, number of samples, feature-to-sample ratio, and various statistical measures.
- This function is useful for quickly summarizing the characteristics of a dataset, especially in the context of machine learning and data analysis, allowing you to correlate the dataset's
+ This function is useful for quickly summarizing the characteristics of a dataset, especially in the context of machine learning and data analysis, allowing you to correlate the dataset's
properties with its performance in predictive modeling tasks.
-
+
Args:
df (pandas.DataFrame): Dataset in pandas with observation in rows, features in columns
y (int): supervised binary class label
file (str): Name of the dataset file for identification in the summary DataFrame
-
+
Returns:
transposed (pandas.DataFrame): Summary DataFrame containing various statistical measures of the dataset
"""
@@ -329,17 +348,17 @@ def evaluate(df, y, file):
# Calculate statistical measures
n_features, n_samples, feature_sample_ratio = get_dimensions(df_numeric)
-
- # get intrinsic dimension
+
+ # get intrinsic dimension
intrinsic_dim = get_intrinsic_dim(df_numeric)
-
+
# Condition number
condition_number = get_condition_number(df_numeric)
# Class imbalance ratio via Fisher Discriminant
fdr = get_fdr(df_numeric, y)
- # Total correlation
+ # Total correlation
total_correlation = get_total_correlation(df_numeric)
# Mutual information
@@ -348,12 +367,12 @@ def evaluate(df, y, file):
# Variance
avg_var, std_var = get_variance(df_numeric)
- # Coefficient of variance
+ # Coefficient of variance
avg_co_of_v, std_co_of_v = get_coefficient_var(df_numeric)
-
+
# Data sparsity
count_nonzero = get_nnz(df)
-
+
# Get the number of low variance features
num_low_variance_features = get_low_var_features(df_numeric, n_features)
@@ -366,81 +385,69 @@ def evaluate(df, y, file):
# Data distributions
avg_skew, std_skew, avg_kurt, std_kurt = get_moments(df_numeric)
-
+
# entropy
avg_y_entropy, std_y_entropy = get_entropy(y)
- #volume of data
+ # volume of data
# volume = get_volume(df_numeric)
-
- #manifold complexity
+
+ # manifold complexity
complexity = get_complexity(df_numeric)
-
+
# Create summary DataFrame
- summary_df = pd.DataFrame.from_dict({
- # Data set
- 'Dataset': file,
-
- # Dimensions
- '# Features': n_features,
- '# Samples': n_samples,
- 'Feature_Samples_ratio': feature_sample_ratio,
-
- # Intrinsic dimension
- 'Intrinsic_Dimension': intrinsic_dim,
-
- # Condition number
- 'Condition number': condition_number,
-
- # Class imbalance ratio
- 'Fisher Discriminant Ratio': fdr,
-
- # Feature Correlations
- 'Total Correlations': total_correlation, # Total Correlations
- 'Mutual information': mutual_info,# Mutual information
-
- # Data sparsity
- '# Non-zero entries': count_nonzero,
- '# Low variance features': num_low_variance_features,
-
- #'Variation': variations,
- 'Variation': avg_var,
- 'std_var': std_var,
-
- #'Coefficient of Variation %': co_of_v,
- 'Coefficient of Variation %': avg_co_of_v,
- 'std_co_of_v': std_co_of_v,
-
- # Data distributions
- #'Skewness': skew,
- 'Skewness': avg_skew,
- 'std_skew': std_skew,
-
- #'Kurtosis': kurt,
- 'Kurtosis': avg_kurt,
- 'std_kurt': std_kurt,
-
- # Data density
- 'Mean Log Kernel Density': mean_log_density,
-
- # volume of feature space
- #'Volume': volume,
-
- # Manifold complexity
- 'Isomap Reconstruction Error': complexity,
-
- # Fractal dimension
- 'Fractal dimension': fractal_dim, # calculated via Higuchi Dimension
-
- #'Entropy': y_entropy,
- 'Entropy': avg_y_entropy,
- 'std_entropy': std_y_entropy
- },
- orient='index')
+ summary_df = pd.DataFrame.from_dict(
+ {
+ # Data set
+ "Dataset": file,
+ # Dimensions
+ "# Features": n_features,
+ "# Samples": n_samples,
+ "Feature_Samples_ratio": feature_sample_ratio,
+ # Intrinsic dimension
+ "Intrinsic_Dimension": intrinsic_dim,
+ # Condition number
+ "Condition number": condition_number,
+ # Class imbalance ratio
+ "Fisher Discriminant Ratio": fdr,
+ # Feature Correlations
+ "Total Correlations": total_correlation, # Total Correlations
+ "Mutual information": mutual_info, # Mutual information
+ # Data sparsity
+ "# Non-zero entries": count_nonzero,
+ "# Low variance features": num_low_variance_features,
+ #'Variation': variations,
+ "Variation": avg_var,
+ "std_var": std_var,
+ #'Coefficient of Variation %': co_of_v,
+ "Coefficient of Variation %": avg_co_of_v,
+ "std_co_of_v": std_co_of_v,
+ # Data distributions
+ #'Skewness': skew,
+ "Skewness": avg_skew,
+ "std_skew": std_skew,
+ #'Kurtosis': kurt,
+ "Kurtosis": avg_kurt,
+ "std_kurt": std_kurt,
+ # Data density
+ "Mean Log Kernel Density": mean_log_density,
+ # volume of feature space
+ #'Volume': volume,
+ # Manifold complexity
+ "Isomap Reconstruction Error": complexity,
+ # Fractal dimension
+ "Fractal dimension": fractal_dim, # calculated via Higuchi Dimension
+ #'Entropy': y_entropy,
+ "Entropy": avg_y_entropy,
+ "std_entropy": std_y_entropy,
+ },
+ orient="index",
+ )
transposed = summary_df.T
- #transposed.to_csv('DataSetEvaluation.csv', sep='\t', index=False)
- #print(transposed)
+ # transposed.to_csv('DataSetEvaluation.csv', sep='\t', index=False)
+ # print(transposed)
return transposed
-# evaluate(df,y)
\ No newline at end of file
+
+# evaluate(df,y)
diff --git a/qbiocode/evaluation/model_evaluation.py b/qbiocode/evaluation/model_evaluation.py
index d510cbe..ed493a6 100644
--- a/qbiocode/evaluation/model_evaluation.py
+++ b/qbiocode/evaluation/model_evaluation.py
@@ -2,17 +2,19 @@
import time
from typing import Literal
+
import pandas as pd
+from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
+from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler
-# ====== Scikit-learn imports ======
+from qbiocode.utils.helper_fn import print_results
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
-from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
-from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
+# ====== Scikit-learn imports ======
-from qbiocode.utils.helper_fn import print_results
-def modeleval(y_test, y_predicted, beg_time, params, args, model:str, verbose = True, average='weighted'):
+def modeleval(
+ y_test, y_predicted, beg_time, params, args, model: str, verbose=True, average="weighted"
+):
"""
Evaluates the model performance using accuracy, F1 score, and AUC.
@@ -36,14 +38,42 @@ def modeleval(y_test, y_predicted, beg_time, params, args, model:str, verbose =
f1 = f1_score(y_test, y_predicted, average=average)
compile_time = time.time() - beg_time
params = params
- if verbose==True:
+ if verbose == True:
print_results(model, accuracy, f1, compile_time, params)
-
- if args['grid_search'] == True:
- return pd.DataFrame({'y_test_' + model: [y_test],
- 'y_predicted_' + model: [y_predicted],
- 'results_' + model: [{'model':model,'accuracy': accuracy, 'f1_score': f1,'time': compile_time, 'auc': auc, 'BestParams_GridSearch': params}]})
- else:
- return pd.DataFrame({'y_test_' + model: [y_test],
- 'y_predicted_' + model: [y_predicted],
- 'results_' + model: [{'model':model,'accuracy': accuracy, 'f1_score': f1,'time': compile_time, 'auc': auc, 'Model_Parameters': params}]})
\ No newline at end of file
+
+ if args["grid_search"] == True:
+ return pd.DataFrame(
+ {
+ "y_test_" + model: [y_test],
+ "y_predicted_" + model: [y_predicted],
+ "results_"
+ + model: [
+ {
+ "model": model,
+ "accuracy": accuracy,
+ "f1_score": f1,
+ "time": compile_time,
+ "auc": auc,
+ "BestParams_GridSearch": params,
+ }
+ ],
+ }
+ )
+ else:
+ return pd.DataFrame(
+ {
+ "y_test_" + model: [y_test],
+ "y_predicted_" + model: [y_predicted],
+ "results_"
+ + model: [
+ {
+ "model": model,
+ "accuracy": accuracy,
+ "f1_score": f1,
+ "time": compile_time,
+ "auc": auc,
+ "Model_Parameters": params,
+ }
+ ],
+ }
+ )
diff --git a/qbiocode/evaluation/model_run.py b/qbiocode/evaluation/model_run.py
index 132681c..5e8843b 100644
--- a/qbiocode/evaluation/model_run.py
+++ b/qbiocode/evaluation/model_run.py
@@ -1,20 +1,22 @@
# ====== Base class imports ======
-import os, json
+import json
+import os
+
import pandas as pd
# ======= Parallelization =====
from joblib import Parallel, delayed
-
current_dir = os.getcwd()
+
def model_run(X_train, X_test, y_train, y_test, data_key, args):
"""This function runs the ML methods, with or without a grid search, as specified in the config.yaml file.
It returns a Python dictionary containing these results, which can then be parsed out. It is designed to run
- each of the ML methods in parallel, for each data set (this is done by calling the Parallel module in results below).
- The arguments X_train, X_test, y_train, y_test are all passed in from the main script (qmlbench.py) as the input
- datasets are processed, while the remaining arguments are passed from the config.yaml file.
-
+ each of the ML methods in parallel, for each data set (this is done by calling the Parallel module in results below).
+ The arguments X_train, X_test, y_train, y_test are all passed in from the main script (qmlbench.py) as the input
+ datasets are processed, while the remaining arguments are passed from the config.yaml file.
+
Args:
X_train (pd.DataFrame): Training features.
X_test (pd.DataFrame): Testing features.
@@ -28,69 +30,69 @@ def model_run(X_train, X_test, y_train, y_test, data_key, args):
- cross_validation: Cross-validation strategy.
- gridsearch__args: Arguments for grid search for each model.
- _args: Additional arguments for each model.
-
+
Returns:
model_total_result (dict): A dictionary containing the results of the models run, with keys as model names and values as their respective results.
This dictionary can readily be converted to a Pandas Dataframe, as seen in the 'ModelResults.csv' files that are produced in the results directory
when the main profiler is run (qbiocode-profiler.py).
-
+
"""
-
+
# Lazy imports to avoid circular dependency
# These imports happen inside the function, not at module level
- from qbiocode.learning.compute_svc import compute_svc, compute_svc_opt
from qbiocode.learning.compute_dt import compute_dt, compute_dt_opt
- from qbiocode.learning.compute_nb import compute_nb, compute_nb_opt
from qbiocode.learning.compute_lr import compute_lr, compute_lr_opt
- from qbiocode.learning.compute_rf import compute_rf, compute_rf_opt
- from qbiocode.learning.compute_xgb import compute_xgb, compute_xgb_opt
from qbiocode.learning.compute_mlp import compute_mlp, compute_mlp_opt
+ from qbiocode.learning.compute_nb import compute_nb, compute_nb_opt
+ from qbiocode.learning.compute_pqk import compute_pqk
from qbiocode.learning.compute_qnn import compute_qnn
from qbiocode.learning.compute_qsvc import compute_qsvc
+ from qbiocode.learning.compute_rf import compute_rf, compute_rf_opt
+ from qbiocode.learning.compute_svc import compute_svc, compute_svc_opt
from qbiocode.learning.compute_vqc import compute_vqc
- from qbiocode.learning.compute_pqk import compute_pqk
-
+ from qbiocode.learning.compute_xgb import compute_xgb, compute_xgb_opt
+
# Build model dictionary
compute_ml_dict = {
- 'svc_opt': compute_svc_opt,
- 'svc': compute_svc,
- 'dt_opt': compute_dt_opt,
- 'dt': compute_dt,
- 'lr_opt': compute_lr_opt,
- 'lr': compute_lr,
- 'nb_opt': compute_nb_opt,
- 'nb': compute_nb,
- 'rf_opt': compute_rf_opt,
- 'rf': compute_rf,
- 'xgb_opt': compute_xgb_opt,
- 'xgb': compute_xgb,
- 'mlp_opt': compute_mlp_opt,
- 'mlp': compute_mlp,
- 'qsvc': compute_qsvc,
- 'vqc': compute_vqc,
- 'qnn': compute_qnn,
- 'pqk': compute_pqk
+ "svc_opt": compute_svc_opt,
+ "svc": compute_svc,
+ "dt_opt": compute_dt_opt,
+ "dt": compute_dt,
+ "lr_opt": compute_lr_opt,
+ "lr": compute_lr,
+ "nb_opt": compute_nb_opt,
+ "nb": compute_nb,
+ "rf_opt": compute_rf_opt,
+ "rf": compute_rf,
+ "xgb_opt": compute_xgb_opt,
+ "xgb": compute_xgb,
+ "mlp_opt": compute_mlp_opt,
+ "mlp": compute_mlp,
+ "qsvc": compute_qsvc,
+ "vqc": compute_vqc,
+ "qnn": compute_qnn,
+ "pqk": compute_pqk,
}
# Quantum models don't have _opt versions (use separate configs for hyperparameter tuning)
- quantum_models = {'qsvc', 'qnn', 'vqc', 'pqk'}
-
+ quantum_models = {"qsvc", "qnn", "vqc", "pqk"}
+
# Run classical and quantum models
- n_jobs = len(args['model'])
- if 'n_jobs' in args.keys():
- n_jobs = min(args['n_jobs'], len(args['model']))
-
+ n_jobs = len(args["model"])
+ if "n_jobs" in args.keys():
+ n_jobs = min(args["n_jobs"], len(args["model"]))
+
grid_search = False
- if 'grid_search' in args.keys():
- grid_search = args['grid_search']
-
+ if "grid_search" in args.keys():
+ grid_search = args["grid_search"]
+
# Check if any quantum models are in the model list when grid_search is enabled
if grid_search:
- quantum_in_models = [m for m in args['model'] if m in quantum_models]
+ quantum_in_models = [m for m in args["model"] if m in quantum_models]
if quantum_in_models:
- print("\n" + "="*80)
+ print("\n" + "=" * 80)
print("WARNING: Grid search is enabled with quantum models:", quantum_in_models)
- print("="*80)
+ print("=" * 80)
print("Quantum models do not support automated grid search.")
print("For hyperparameter tuning of quantum models, you should:")
print(" 1. Create multiple configuration files with different hyperparameters")
@@ -104,38 +106,56 @@ def model_run(X_train, X_test, y_train, y_test, data_key, args):
print(" data_dirs=['data/your_data_dir']")
print(" )")
print("\nSee documentation: qbiocode.utils.generate_qml_experiment_configs")
- print("="*80 + "\n")
-
+ print("=" * 80 + "\n")
+
if grid_search:
results = []
- for method in args['model']:
+ for method in args["model"]:
if method in quantum_models:
# Quantum models don't have _opt versions, use regular function
result = delayed(compute_ml_dict[method])(
- X_train, X_test, y_train, y_test, args,
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
model=method,
data_key=data_key,
- **args.get(method + '_args', {}),
- verbose=False
+ **args.get(method + "_args", {}),
+ verbose=False,
)
else:
# Classical models have _opt versions with grid search
- result = delayed(compute_ml_dict[method + '_opt'])(
- X_train, X_test, y_train, y_test, args,
- model=method + '_opt',
- cv=args['cross_validation'],
- **args.get('gridsearch_' + method + '_args', {}),
- verbose=False
+ result = delayed(compute_ml_dict[method + "_opt"])(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ model=method + "_opt",
+ cv=args["cross_validation"],
+ **args.get("gridsearch_" + method + "_args", {}),
+ verbose=False,
)
results.append(result)
results = Parallel(n_jobs=n_jobs)(results)
else:
- results = Parallel(n_jobs=n_jobs)(delayed(compute_ml_dict[method])(X_train, X_test, y_train, y_test, args, model=method, data_key = data_key,
- **args[method+'_args'], verbose=False)
- for method in args['model'])
-
+ results = Parallel(n_jobs=n_jobs)(
+ delayed(compute_ml_dict[method])(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ model=method,
+ data_key=data_key,
+ **args[method + "_args"],
+ verbose=False,
+ )
+ for method in args["model"]
+ )
+
model_total_result = pd.melt(pd.concat(results)).dropna() # type: ignore
- model_total_result['i'] = 0
+ model_total_result["i"] = 0
model_total_result = model_total_result.pivot(columns="variable", values="value", index="i")
return model_total_result.to_dict()
-
diff --git a/qbiocode/learning/__init__.py b/qbiocode/learning/__init__.py
index 85da6c3..47c4cdd 100644
--- a/qbiocode/learning/__init__.py
+++ b/qbiocode/learning/__init__.py
@@ -7,7 +7,7 @@
optimized versions (where applicable) with hyperparameter tuning.
Classical Algorithms
--------------------
+--------------------
- Decision Tree (DT)
- Logistic Regression (LR)
- Multi-Layer Perceptron (MLP)
@@ -17,7 +17,7 @@
- XGBoost (XGB)
Quantum Algorithms
------------------
+------------------
- Quantum Neural Network (QNN)
- Quantum Support Vector Classifier (QSVC)
- Variational Quantum Classifier (VQC)
@@ -39,6 +39,7 @@
from .compute_nb import compute_nb, compute_nb_opt
from .compute_rf import compute_rf, compute_rf_opt
from .compute_svc import compute_svc, compute_svc_opt
+
try:
from .compute_xgb import compute_xgb, compute_xgb_opt
except Exception:
@@ -46,32 +47,32 @@
compute_xgb = None # type: ignore
compute_xgb_opt = None # type: ignore
+from .compute_pqk import compute_pqk
+
# Quantum ML algorithms
from .compute_qnn import compute_qnn
from .compute_qsvc import compute_qsvc
from .compute_vqc import compute_vqc
-from .compute_pqk import compute_pqk
__all__ = [
# Classical algorithms
- 'compute_dt',
- 'compute_dt_opt',
- 'compute_lr',
- 'compute_lr_opt',
- 'compute_mlp',
- 'compute_mlp_opt',
- 'compute_nb',
- 'compute_nb_opt',
- 'compute_rf',
- 'compute_rf_opt',
- 'compute_svc',
- 'compute_svc_opt',
- 'compute_xgb',
- 'compute_xgb_opt',
-
+ "compute_dt",
+ "compute_dt_opt",
+ "compute_lr",
+ "compute_lr_opt",
+ "compute_mlp",
+ "compute_mlp_opt",
+ "compute_nb",
+ "compute_nb_opt",
+ "compute_rf",
+ "compute_rf_opt",
+ "compute_svc",
+ "compute_svc_opt",
+ "compute_xgb",
+ "compute_xgb_opt",
# Quantum algorithms
- 'compute_qnn',
- 'compute_qsvc',
- 'compute_vqc',
- 'compute_pqk',
+ "compute_qnn",
+ "compute_qsvc",
+ "compute_vqc",
+ "compute_pqk",
]
diff --git a/qbiocode/learning/compute_dt.py b/qbiocode/learning/compute_dt.py
index 241fd76..5babec6 100644
--- a/qbiocode/learning/compute_dt.py
+++ b/qbiocode/learning/compute_dt.py
@@ -2,26 +2,46 @@
import time
-# ====== Scikit-learn imports ======
-
-from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
+from sklearn.tree import DecisionTreeClassifier
# ====== Additional local imports ======
from qbiocode.evaluation.model_evaluation import modeleval
+# ====== Scikit-learn imports ======
+
+
# ====== Begin functions ======
-def compute_dt(X_train, X_test, y_train, y_test, args, verbose=False, model='Decision Tree', data_key = '',criterion='gini', splitter='best',
- max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None,
- random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0,
- monotonic_cst=None):
-
+
+def compute_dt(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ verbose=False,
+ model="Decision Tree",
+ data_key="",
+ criterion="gini",
+ splitter="best",
+ max_depth=None,
+ min_samples_split=2,
+ min_samples_leaf=1,
+ min_weight_fraction_leaf=0.0,
+ max_features=None,
+ random_state=None,
+ max_leaf_nodes=None,
+ min_impurity_decrease=0.0,
+ class_weight=None,
+ ccp_alpha=0.0,
+ monotonic_cst=None,
+):
"""This function generates a model using a Decision Tree (DT) Classifier method as implemented in
- `scikit-learn `_.
+ `scikit-learn <https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html>`__.
It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed.
- The model is trained on the training dataset and validated on the test dataset. The model is trained on the training dataset and validated on the test dataset.
+ The model is trained on the training dataset and validated on the test dataset.
The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model.
This function is designed to be used in a supervised learning context, where the goal is to classify data points.
@@ -49,34 +69,60 @@ def compute_dt(X_train, X_test, y_train, y_test, args, verbose=False, model='Dec
monotonic_cst: Monotonic constraints for tree nodes, if applicable. Default is None.
Returns:
modeleval (dict): A dictionary containing the evaluation metrics, model parameters, and time taken for training and validation.
- """
-
+ """
+
beg_time = time.time()
- dt = OneVsOneClassifier(DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth,
- min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
- min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features,
- random_state=random_state, max_leaf_nodes=max_leaf_nodes,
- min_impurity_decrease=min_impurity_decrease, class_weight=class_weight,
- ccp_alpha=ccp_alpha, monotonic_cst=monotonic_cst))
+ dt = OneVsOneClassifier(
+ DecisionTreeClassifier(
+ criterion=criterion,
+ splitter=splitter,
+ max_depth=max_depth,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ min_weight_fraction_leaf=min_weight_fraction_leaf,
+ max_features=max_features,
+ random_state=random_state,
+ max_leaf_nodes=max_leaf_nodes,
+ min_impurity_decrease=min_impurity_decrease,
+ class_weight=class_weight,
+ ccp_alpha=ccp_alpha,
+ monotonic_cst=monotonic_cst,
+ )
+ )
# Fit the training datset
model_fit = dt.fit(X_train, y_train)
model_params = model_fit.get_params()
# Validate the model in test dataset and calculate accuracy
- y_predicted = dt.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose))
+ y_predicted = dt.predict(X_test)
+ return modeleval(
+ y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose
+ )
+
-def compute_dt_opt(X_train, X_test, y_train, y_test, args, verbose=False, model='Decision Tree', cv=5,
- criterion=[], max_depth=[], min_samples_split=[], min_samples_leaf=[], max_features=[]):
-
+def compute_dt_opt(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ verbose=False,
+ model="Decision Tree",
+ cv=5,
+ criterion=[],
+ max_depth=[],
+ min_samples_split=[],
+ min_samples_leaf=[],
+ max_features=[],
+):
"""This function also generates a model using a Decision Tree (DT) Classifier method as implemented in
- `scikit-learn `_.
+ `scikit-learn <https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html>`__.
The difference here is that this function runs a grid search. The range of the grid search for each parameter is specified in the config.yaml file. The
combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar
- datasets, without having to run the grid search.
- The model is trained on the training dataset and validated on the test dataset. The model is trained on the training dataset and validated on the test dataset.
+ datasets, without having to run the grid search.
+ The model is trained on the training dataset and validated on the test dataset.
The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search.
This function is designed to be used in a supervised learning context, where the goal is to classify data points.
-
+
Args:
X_train (array-like): Training data features.
X_test (array-like): Test data features.
@@ -91,18 +137,19 @@ def compute_dt_opt(X_train, X_test, y_train, y_test, args, verbose=False, model=
min_samples_split (list): List of minimum samples required to split an internal node. Default is empty list.
min_samples_leaf (list): List of minimum samples required to be at a leaf node. Default is empty list.
max_features (list): List of maximum features to consider when looking for the best split. Default is empty list.
-
+
Returns:
modeleval (dict): A dictionary containing the evaluation metrics, best parameters, and time taken for training and validation.
- """
-
+ """
+
beg_time = time.time()
- params = {'criterion': criterion,
- 'max_depth': max_depth,
- 'min_samples_split': min_samples_split,
- 'min_samples_leaf': min_samples_leaf,
- 'max_features': max_features
- }
+ params = {
+ "criterion": criterion,
+ "max_depth": max_depth,
+ "min_samples_split": min_samples_split,
+ "min_samples_leaf": min_samples_leaf,
+ "max_features": max_features,
+ }
# Perform Grid Search to find the best parameters
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=params, cv=cv)
grid_search.fit(X_train, y_train)
@@ -114,4 +161,4 @@ def compute_dt_opt(X_train, X_test, y_train, y_test, args, verbose=False, model=
# Make predictions and calculate accuracy
y_predicted = best_dt.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose))
+ return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)
diff --git a/qbiocode/learning/compute_lr.py b/qbiocode/learning/compute_lr.py
index 8da039b..d25f0f2 100644
--- a/qbiocode/learning/compute_lr.py
+++ b/qbiocode/learning/compute_lr.py
@@ -1,10 +1,8 @@
# ====== Base class imports ======
import time
-import numpy as np
-
-# ====== Scikit-learn imports ======
+import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
@@ -12,15 +10,39 @@
# ====== Additional local imports ======
from qbiocode.evaluation.model_evaluation import modeleval
+# ====== Scikit-learn imports ======
+
+
# ====== Begin functions ======
-
-def compute_lr(X_train, X_test, y_train, y_test, args, model='Logistic Regression', data_key = '',
- penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1,
- class_weight=None, random_state=None, solver='saga', max_iter=10000, multi_class='deprecated',
- verbose=False, warm_start=False, n_jobs=None, l1_ratio=None):
-
+
+
+def compute_lr(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ model="Logistic Regression",
+ data_key="",
+ penalty="l2",
+ *,
+ dual=False,
+ tol=0.0001,
+ C=1.0,
+ fit_intercept=True,
+ intercept_scaling=1,
+ class_weight=None,
+ random_state=None,
+ solver="saga",
+ max_iter=10000,
+ multi_class="deprecated",
+ verbose=False,
+ warm_start=False,
+ n_jobs=None,
+ l1_ratio=None,
+):
"""This function generates a model using a Logistic Regression (LR) method as implemented in
- `scikit-learn `_.
+ `scikit-learn <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`__.
It takes in parameter arguments specified in the config.yaml file, but will use the default parameters
specified above if none are passed. The model is trained on the training dataset and validated on the
test dataset. The function returns the evaluation of the model on the test dataset, including accuracy,
@@ -49,36 +71,62 @@ def compute_lr(X_train, X_test, y_train, y_test, args, model='Logistic Regressio
verbose (bool): Whether to print detailed logs, default is False.
warm_start (bool): Whether to reuse the solution of the previous call to fit as initialization,
default is False.
- n_jobs (int or None): Number of jobs to run in parallel for both `fit` and `predict`,
+ n_jobs (int or None): Number of jobs to run in parallel for both `fit` and `predict`,
default is None which means 1 unless in a joblib.parallel_backend context.
- l1_ratio (float or None): The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1.
+ l1_ratio (float or None): The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1.
Only used if penalty='elasticnet', default is None.
-
+
Returns:
modeleval (dict): A dictionary containing the evaluation metrics, model parameters, and time taken for training and validation.
- """
-
+ """
+
beg_time = time.time()
- logres = OneVsOneClassifier(LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C, fit_intercept=fit_intercept,
- intercept_scaling=intercept_scaling, class_weight=class_weight, random_state=random_state,
- solver=solver, max_iter=max_iter,
- warm_start=warm_start, n_jobs=n_jobs, l1_ratio=l1_ratio))
+ logres = OneVsOneClassifier(
+ LogisticRegression(
+ penalty=penalty,
+ dual=dual,
+ tol=tol,
+ C=C,
+ fit_intercept=fit_intercept,
+ intercept_scaling=intercept_scaling,
+ class_weight=class_weight,
+ random_state=random_state,
+ solver=solver,
+ max_iter=max_iter,
+ warm_start=warm_start,
+ n_jobs=n_jobs,
+ l1_ratio=l1_ratio,
+ )
+ )
# Fit the training datset
model_fit = logres.fit(X_train, y_train)
model_params = model_fit.get_params()
# Validate the model in test dataset and calculate accuracy
- y_predicted = logres.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose))
+ y_predicted = logres.predict(X_test)
+ return modeleval(
+ y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose
+ )
-def compute_lr_opt(X_train, X_test, y_train, y_test, args, model='Logistic Regression', cv=5,
- penalty=[], C=[],
- solver=[], verbose=False, max_iter=[]):
-
+
+def compute_lr_opt(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ model="Logistic Regression",
+ cv=5,
+ penalty=[],
+ C=[],
+ solver=[],
+ verbose=False,
+ max_iter=[],
+):
"""This function also generates a model using a Logistic Regression (LR) method as implemented in
- `scikit-learn `_.
+ `scikit-learn <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`__.
The difference here is that this function runs a grid search. The range of the grid search for each parameter is specified in the config.yaml file. The
combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar
- datasets, without having to run the grid search. The function returns the evaluation of the model
+ datasets, without having to run the grid search. The function returns the evaluation of the model
on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search.
This function is designed to be used in a supervised learning context, where the goal is to classify data points.
@@ -95,17 +143,13 @@ def compute_lr_opt(X_train, X_test, y_train, y_test, args, model='Logistic Regre
solver (list): List of solvers to try, default is an empty list.
verbose (bool): Whether to print detailed logs, default is False.
max_iter (list): List of maximum iterations to try, default is an empty list.
-
+
Returns:
modeleval (dict): A dictionary containing the evaluation metrics, best parameters, and time taken for training and validation.
- """
-
+ """
+
beg_time = time.time()
- params = {'penalty': penalty,
- 'C': C,
- 'solver':solver,
- 'max_iter':max_iter
- }
+ params = {"penalty": penalty, "C": C, "solver": solver, "max_iter": max_iter}
# Perform Grid Search to find the best parameters
grid_search = GridSearchCV(LogisticRegression(), param_grid=params, cv=cv)
grid_search.fit(X_train, y_train)
@@ -117,4 +161,4 @@ def compute_lr_opt(X_train, X_test, y_train, y_test, args, model='Logistic Regre
# Make predictions and calculate accuracy
y_predicted = best_logres.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose))
\ No newline at end of file
+ return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)
diff --git a/qbiocode/learning/compute_mlp.py b/qbiocode/learning/compute_mlp.py
index fc6c676..375f81b 100644
--- a/qbiocode/learning/compute_mlp.py
+++ b/qbiocode/learning/compute_mlp.py
@@ -1,125 +1,188 @@
# ====== Base class imports ======
import time
-import numpy as np
-
-# ====== Scikit-learn imports ======
-from sklearn.neural_network import MLPClassifier
+import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
+from sklearn.neural_network import MLPClassifier
# ====== Additional local imports ======
from qbiocode.evaluation.model_evaluation import modeleval
+# ====== Scikit-learn imports ======
+
+
# ====== Begin functions ======
-def compute_mlp(X_train, X_test, y_train, y_test, args, verbose=False, model='Multi-layer Perceptron', data_key = '',
- hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size='auto',
- learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=10000, shuffle=True,
- random_state=None, tol=0.0001, warm_start=False, momentum=0.9, nesterovs_momentum=True,
- early_stopping=False, validation_fraction=0.1, beta_1=0.9,
- beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000):
-
+
+def compute_mlp(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ verbose=False,
+ model="Multi-layer Perceptron",
+ data_key="",
+ hidden_layer_sizes=(100,),
+ activation="relu",
+ solver="adam",
+ alpha=0.0001,
+ batch_size="auto",
+ learning_rate="constant",
+ learning_rate_init=0.001,
+ power_t=0.5,
+ max_iter=10000,
+ shuffle=True,
+ random_state=None,
+ tol=0.0001,
+ warm_start=False,
+ momentum=0.9,
+ nesterovs_momentum=True,
+ early_stopping=False,
+ validation_fraction=0.1,
+ beta_1=0.9,
+ beta_2=0.999,
+ epsilon=1e-08,
+ n_iter_no_change=10,
+ max_fun=15000,
+):
+ """
+ This function generates a model using the Multi-layer Perceptron (MLP) neural-network method as implemented in
+ `scikit-learn <https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html>`__. It takes in parameter
+ arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed.
+ The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
+ on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model.
+ This function is designed to be used in a supervised learning context, where the goal is to classify data points.
+
+ Args:
+ X_train (numpy.ndarray): Training features.
+ X_test (numpy.ndarray): Test features.
+ y_train (numpy.ndarray): Training labels.
+ y_test (numpy.ndarray): Test labels.
+ args (dict): Additional arguments, such as config parameters.
+ verbose (bool): If True, prints additional information during execution.
+ model (str): Name of the model being used.
+ data_key (str): Key for the dataset, if applicable.
+ hidden_layer_sizes (tuple): The ith element represents the number of neurons in the ith hidden layer.
+ activation (str): Activation function for the hidden layer.
+ solver (str): The solver for weight optimization.
+ alpha (float): L2 penalty (regularization term) parameter.
+ batch_size (int or str): Size of minibatches for stochastic optimizers.
+ learning_rate (str): Learning rate schedule for weight updates.
+ learning_rate_init (float): Initial learning rate used.
+ power_t (float): The exponent for inverse scaling learning rate.
+ max_iter (int): Maximum number of iterations.
+ shuffle (bool): Whether to shuffle samples in each iteration.
+ random_state (int or None): Random seed for reproducibility.
+ tol (float): Tolerance for stopping criteria.
+ warm_start (bool): If True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.
+ momentum (float): Momentum for gradient descent update.
+ nesterovs_momentum (bool): Whether to use Nesterov's momentum or not.
+ early_stopping (bool): Whether to use early stopping to terminate training when validation score is not improving.
+ validation_fraction (float): Proportion of training data to set aside as validation set for early stopping.
+ beta_1, beta_2, epsilon: Parameters for Adam optimizer.
+ n_iter_no_change: Number of iterations with no improvement after which training will be stopped.
+ max_fun: Maximum number of function evaluations.
+
+ Returns:
+ modeleval (dict): A dictionary containing the evaluation metrics of the model on the test dataset, including accuracy, AUC, F1 score,
+ and the time taken to train and validate the model, along with the model parameters.
"""
- This function generates a model using a Multi-layer Perceptron (mlp), a neural network, method as implemented in
- `scikit-learn `_. It takes in parameter
- arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed.
- The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
- on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model.
- This function is designed to be used in a supervised learning context, where the goal is to classify data points.
-
- Args:
+
+ beg_time = time.time()
+ mlp = OneVsOneClassifier(
+ MLPClassifier(
+ hidden_layer_sizes=hidden_layer_sizes,
+ activation=activation,
+ solver=solver,
+ alpha=alpha,
+ batch_size=batch_size,
+ learning_rate=learning_rate,
+ learning_rate_init=learning_rate_init,
+ power_t=power_t,
+ max_iter=max_iter,
+ shuffle=shuffle,
+ random_state=random_state,
+ tol=tol,
+ warm_start=warm_start,
+ momentum=momentum,
+ nesterovs_momentum=nesterovs_momentum,
+ early_stopping=early_stopping,
+ validation_fraction=validation_fraction,
+ beta_1=beta_1,
+ beta_2=beta_2,
+ epsilon=epsilon,
+ n_iter_no_change=n_iter_no_change,
+ max_fun=max_fun,
+ )
+ )
+ # Fit the training dataset
+ model_fit = mlp.fit(X_train, y_train)
+ model_params = model_fit.get_params()
+ # Validate the model in test dataset and calculate accuracy
+ y_predicted = mlp.predict(X_test)
+ return modeleval(
+ y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose
+ )
+
+
+def compute_mlp_opt(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ verbose=False,
+ cv=5,
+ model="Multi-layer Perceptron",
+ hidden_layer_sizes=[],
+ activation=[],
+ max_iter=[],
+ solver=[],
+ alpha=[],
+ learning_rate=[],
+):
+ """
+ This function also generates a model using a Multi-layer Perceptron (mlp), a neural network, as implemented in scikit-learn
+ (https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html). The difference here is that
+ this function runs a grid search. The range of the grid search for each parameter is specified in the config.yaml file. The
+ combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar
+ datasets, without having to run the grid search. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
+ on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search.
+ This function is designed to be used in a supervised learning context, where the goal is to classify data points.
+
+ Args:
X_train (numpy.ndarray): Training features.
X_test (numpy.ndarray): Test features.
y_train (numpy.ndarray): Training labels.
y_test (numpy.ndarray): Test labels.
args (dict): Additional arguments, such as config parameters.
verbose (bool): If True, prints additional information during execution.
+ cv (int): Number of cross-validation folds.
model (str): Name of the model being used.
- data_key (str): Key for the dataset, if applicable.
- hidden_layer_sizes (tuple): The ith element represents the number of neurons in the ith hidden layer.
- activation (str): Activation function for the hidden layer.
- solver (str): The solver for weight optimization.
- alpha (float): L2 penalty (regularization term) parameter.
- batch_size (int or str): Size of minibatches for stochastic optimizers.
- learning_rate (str): Learning rate schedule for weight updates.
- learning_rate_init (float): Initial learning rate used.
- power_t (float): The exponent for inverse scaling learning rate.
- max_iter (int): Maximum number of iterations.
- shuffle (bool): Whether to shuffle samples in each iteration.
- random_state (int or None): Random seed for reproducibility.
- tol (float): Tolerance for stopping criteria.
- warm_start (bool): If True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.
- momentum (float): Momentum for gradient descent update.
- nesterovs_momentum (bool): Whether to use Nesterov's momentum or not.
- early_stopping (bool): Whether to use early stopping to terminate training when validation score is not improving.
- validation_fraction (float): Proportion of training data to set aside as validation set for early stopping.
- beta_1, beta_2, epsilon: Parameters for Adam optimizer.
- n_iter_no_change: Number of iterations with no improvement after which training will be stopped.
- max_fun: Maximum number of function evaluations.
-
- Returns:
- modeleval (dict): A dictionary containing the evaluation metrics of the model on the test dataset, including accuracy, AUC, F1 score,
- and the time taken to train and validate the model, along with the model parameters.
- """
-
- beg_time = time.time()
- mlp = OneVsOneClassifier(MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, alpha=alpha,
- batch_size=batch_size, learning_rate=learning_rate, learning_rate_init=learning_rate_init,
- power_t=power_t, max_iter=max_iter, shuffle=shuffle, random_state=random_state, tol=tol,
- warm_start=warm_start, momentum=momentum, nesterovs_momentum=nesterovs_momentum,
- early_stopping=early_stopping, validation_fraction=validation_fraction, beta_1=beta_1,
- beta_2=beta_2, epsilon=epsilon, n_iter_no_change=n_iter_no_change, max_fun=max_fun))
- # Fit the training datset
- model_fit = mlp.fit(X_train, y_train)
- model_params = model_fit.get_params()
- # Validate the model in test dataset and calculate accuracy
- y_predicted = mlp.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose))
-
-def compute_mlp_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, model='Multi-layer Perceptron',
- hidden_layer_sizes= [], activation = [], max_iter= [],
- solver = [], alpha = [], learning_rate= []):
-
+ hidden_layer_sizes (tuple or list): The ith element represents the number of neurons in the ith hidden layer.
+ activation (str or list): Activation function for the hidden layer.
+ max_iter (int or list): Maximum number of iterations.
+ solver (str or list): The solver for weight optimization.
+ alpha (float or list): L2 penalty (regularization term) parameter.
+ learning_rate (str or list): Learning rate schedule for weight updates.
+ Returns:
+ modeleval (dict): A dictionary containing the evaluation metrics of the model on the test dataset, including accuracy, AUC, F1 score,
+ and the time taken to train and validate the model, along with the best parameters found during grid search.
"""
- This function also generates a model using a Multi-layer Perceptron (mlp), a neural network, as implemented in scikit-learn
- (https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html). The difference here is that
- this function runs a grid search. The range of the grid search for each parameter is specified in the config.yaml file. The
- combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar
- datasets, without having to run the grid search. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
- on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search.
- This function is designed to be used in a supervised learning context, where the goal is to classify data points.
-
- Args:
- X_train (numpy.ndarray): Training features.
- X_test (numpy.ndarray): Test features.
- y_train (numpy.ndarray): Training labels.
- y_test (numpy.ndarray): Test labels.
- args (dict): Additional arguments, such as config parameters.
- verbose (bool): If True, prints additional information during execution.
- cv (int): Number of cross-validation folds.
- model (str): Name of the model being used.
- hidden_layer_sizes (tuple or list): The ith element represents the number of neurons in the ith hidden layer.
- activation (str or list): Activation function for the hidden layer.
- max_iter (int or list): Maximum number of iterations.
- solver (str or list): The solver for weight optimization.
- alpha (float or list): L2 penalty (regularization term) parameter.
- learning_rate (str or list): Learning rate schedule for weight updates.
- Returns:
- modeleval (dict): A dictionary containing the evaluation metrics of the model on the test dataset, including accuracy, AUC, F1 score,
- and the time taken to train and validate the model, along with the best parameters found during grid search.
- """
-
+
beg_time = time.time()
- params={'hidden_layer_sizes': hidden_layer_sizes,
- 'activation': activation,
- 'max_iter': max_iter,
- 'solver': solver,
- 'alpha': alpha,
- 'learning_rate': learning_rate,
- }
-
+ params = {
+ "hidden_layer_sizes": hidden_layer_sizes,
+ "activation": activation,
+ "max_iter": max_iter,
+ "solver": solver,
+ "alpha": alpha,
+ "learning_rate": learning_rate,
+ }
+
# Pemlporm Grid Search to find the best parameters
grid_search = GridSearchCV(MLPClassifier(), param_grid=params, cv=cv)
grid_search.fit(X_train, y_train)
@@ -131,4 +194,4 @@ def compute_mlp_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5,
# Make predictions and calculate accuracy
y_predicted = best_mlp.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose))
+ return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)
diff --git a/qbiocode/learning/compute_nb.py b/qbiocode/learning/compute_nb.py
index 43ff692..c8ac20f 100644
--- a/qbiocode/learning/compute_nb.py
+++ b/qbiocode/learning/compute_nb.py
@@ -2,24 +2,34 @@
import time
-# ====== Scikit-learn imports ======
-
-from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
+from sklearn.naive_bayes import GaussianNB
# ====== Additional local imports ======
from qbiocode.evaluation.model_evaluation import modeleval
-def compute_nb(X_train, X_test, y_train, y_test, args, verbose=False, model='Naive Bayes', data_key = '', var_smoothing=1e-09):
-
+# ====== Scikit-learn imports ======
+
+
+def compute_nb(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ verbose=False,
+ model="Naive Bayes",
+ data_key="",
+ var_smoothing=1e-09,
+):
"""This function generates a model using a Gaussian Naive Bayes (NB) Classifier method as implemented in
- `scikit-learn `_.
+ `scikit-learn <https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html>`__.
It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed.
- The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
+ The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model.
This function is designed to be used in a supervised learning context, where the goal is to classify data points.
-
+
Args:
X_train (numpy.ndarray): Training features.
X_test (numpy.ndarray): Test features.
@@ -32,9 +42,9 @@ def compute_nb(X_train, X_test, y_train, y_test, args, verbose=False, model='Nai
var_smoothing (float): Portion of the largest variance of all features added to variances for calculation stability.
Returns:
modeleval (dict): A dictionary containing the evaluation metrics of the model on the test dataset, including accuracy, AUC, F1 score,
- and the time taken to train and validate the model, along with the model parameters.
- """
-
+ and the time taken to train and validate the model, along with the model parameters.
+ """
+
beg_time = time.time()
nb = OneVsOneClassifier(GaussianNB(var_smoothing=var_smoothing))
# Fit the training datset
@@ -42,16 +52,27 @@ def compute_nb(X_train, X_test, y_train, y_test, args, verbose=False, model='Nai
model_params = model_fit.get_params()
# Validate the model in test dataset and calculate accuracy
y_predicted = nb.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose))
+ return modeleval(
+ y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose
+ )
-def compute_nb_opt(X_train, X_test, y_train, y_test, args, verbose=False, model='Naive Bayes', cv=5,
- var_smoothing = [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 1e-02]):
-
- """ This function generates a model using a Gaussian Naive Bayes (NB) Classifier method as implemented in
- `scikit-learn `_.
+
+def compute_nb_opt(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ verbose=False,
+ model="Naive Bayes",
+ cv=5,
+ var_smoothing=[1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 1e-02],
+):
+ """This function generates a model using a Gaussian Naive Bayes (NB) Classifier method as implemented in
+ `scikit-learn `__.
It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed. The
combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar
- datasets, without having to run the grid search. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
+ datasets, without having to run the grid search. The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search.
This function is designed to be used in a supervised learning context, where the goal is to classify data points.
Args:
@@ -67,10 +88,10 @@ def compute_nb_opt(X_train, X_test, y_train, y_test, args, verbose=False, model=
Returns:
modeleval (dict): A dictionary containing the evaluation metrics of the model on the test dataset, including accuracy, AUC, F1 score,
and the time taken to train and validate the model, along with the best parameters found during grid search.
- """
-
+ """
+
beg_time = time.time()
- params={'var_smoothing': var_smoothing}
+ params = {"var_smoothing": var_smoothing}
# Perform Grid Search to find the best parameters
grid_search = GridSearchCV(GaussianNB(), param_grid=params, cv=cv)
grid_search.fit(X_train, y_train)
@@ -82,4 +103,4 @@ def compute_nb_opt(X_train, X_test, y_train, y_test, args, verbose=False, model=
# Make predictions and calculate accuracy
y_predicted = best_nb.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose))
\ No newline at end of file
+ return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)
diff --git a/qbiocode/learning/compute_pqk.py b/qbiocode/learning/compute_pqk.py
index d9c4803..a822c28 100644
--- a/qbiocode/learning/compute_pqk.py
+++ b/qbiocode/learning/compute_pqk.py
@@ -1,38 +1,55 @@
# ====== Base class imports ======
+import os
import time
+import warnings
+
import numpy as np
-import os
import pandas as pd
-import warnings
-from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score, auc
-from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
-from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
-from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, f1_score
+from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
+from sklearn.neural_network import MLPClassifier
+from sklearn.svm import SVC
+
try:
from xgboost import XGBClassifier
+
XGBOOST_AVAILABLE = True
except Exception:
XGBOOST_AVAILABLE = False
XGBClassifier = None # type: ignore
-# ====== Additional local imports ======
-from qbiocode.evaluation.model_evaluation import modeleval
-import qbiocode.utils.qutils as qutils
-from sklearn.model_selection import GridSearchCV
+# from qiskit.primitives import Sampler
+from functools import reduce
# ====== Qiskit imports ======
from qiskit import QuantumCircuit
-
-#from qiskit.primitives import Sampler
-from functools import reduce
from qiskit.quantum_info import Pauli
from sklearn import svm
+from sklearn.model_selection import GridSearchCV
+
+import qbiocode.utils.qutils as qutils
+
+# ====== Additional local imports ======
+from qbiocode.evaluation.model_evaluation import modeleval
-def compute_pqk(X_train, X_test, y_train, y_test, args, model='PQK', data_key = '', verbose=False,
- encoding = 'Z', primitive = 'estimator', entanglement = 'linear', reps= 2,
- classical_models=None):
+
+def compute_pqk(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ model="PQK",
+ data_key="",
+ verbose=False,
+ encoding="Z",
+ primitive="estimator",
+ entanglement="linear",
+ reps=2,
+ classical_models=None,
+):
"""
This function generates quantum circuits, computes projections of the data onto these circuits,
and evaluates the performance of classical machine learning models on the projected data.
@@ -46,7 +63,7 @@ def compute_pqk(X_train, X_test, y_train, y_test, args, model='PQK', data_key =
This function is part of the main quantum machine learning pipeline (QProfiler.py) and is intended for use in supervised learning tasks.
It leverages quantum computing to enhance feature extraction and classification performance on complex datasets.
The function returns the performance results, including accuracy, F1-score, AUC, runtime, as well as model parameters, and other relevant metrics.
-
+
Args:
X_train (np.ndarray): Training data features.
X_test (np.ndarray): Test data features.
@@ -67,21 +84,24 @@ def compute_pqk(X_train, X_test, y_train, y_test, args, model='PQK', data_key =
Returns:
modeleval (pd.DataFrame): A DataFrame containing evaluation metrics and model parameters for all models.
"""
-
+
# Set default classical models if not provided
if classical_models is None:
- classical_models = ['rf', 'mlp', 'svc', 'lr', 'xgb']
+ classical_models = ["rf", "mlp", "svc", "lr", "xgb"]
beg_time = time.time()
feat_dimension = X_train.shape[1]
- if not os.path.exists( 'pqk_projections'):
- os.makedirs('pqk_projections')
+ if not os.path.exists("pqk_projections"):
+ os.makedirs("pqk_projections")
- file_projection_train = os.path.join( 'pqk_projections', 'pqk_projection_' + data_key + '_train.npy')
- file_projection_test = os.path.join( 'pqk_projections', 'pqk_projection_' + data_key + '_test.npy')
+ file_projection_train = os.path.join(
+ "pqk_projections", "pqk_projection_" + data_key + "_train.npy"
+ )
+ file_projection_test = os.path.join(
+ "pqk_projections", "pqk_projection_" + data_key + "_test.npy"
+ )
-
# This function ensures that all multiplicative factors of data features inside single qubit gates are 1.0
def data_map_func(x: np.ndarray) -> float:
"""
@@ -94,73 +114,92 @@ def data_map_func(x: np.ndarray) -> float:
float: the mapped value
"""
coeff = x[0] / 2 if len(x) == 1 else reduce(lambda m, n: (m * n) / 2, x)
- return coeff
-
- # choose a method for mapping your features onto the circuit
- feature_map, _ = qutils.get_feature_map(feature_map=encoding,
- feat_dimension=X_train.shape[1],
- reps = reps,
- entanglement=entanglement,
- data_map_func = data_map_func)
+ return float(coeff)
+
+ # choose a method for mapping your features onto the circuit
+ feature_map, _ = qutils.get_feature_map(
+ feature_map=encoding,
+ feat_dimension=X_train.shape[1],
+ reps=reps,
+ entanglement=entanglement,
+ data_map_func=data_map_func,
+ )
# Build quantum circuit
circuit = QuantumCircuit(feature_map.num_qubits)
circuit.compose(feature_map, inplace=True)
num_qubits = circuit.num_qubits
- if (not os.path.exists( file_projection_train ) ) | (not os.path.exists( file_projection_test ) ):
+ if (not os.path.exists(file_projection_train)) | (not os.path.exists(file_projection_test)):
# Generate the backend, session and primitive
- backend, session, prim = qutils.get_backend_session(args,
- 'estimator',
- num_qubits=num_qubits)
+ backend, session, prim = qutils.get_backend_session(
+ args, "estimator", num_qubits=num_qubits
+ )
# Transpile
- if args['backend'] != 'simulator':
- circuit = qutils.transpile_circuit( circuit, opt_level=3, backend = backend,
- PT = True, initial_layout = None)
-
+ if args["backend"] != "simulator":
+ circuit = qutils.transpile_circuit(
+ circuit, opt_level=3, backend=backend, PT=True, initial_layout=None
+ )
for f_tr in [file_projection_train, file_projection_test]:
- if not os.path.exists( f_tr ):
+ if not os.path.exists(f_tr):
projections = []
- if 'train' in f_tr:
+ if "train" in f_tr:
dat = X_train.copy()
else:
dat = X_test.copy()
-
+
# Identity operator on all qubits
- id = 'I' * feat_dimension
+ id = "I" * feat_dimension
# We group all commuting observables
# These groups are the Pauli X, Y and Z operators on individual qubits
# Apply the circuit layout to the observable if mapped to device
- if args['backend'] != 'simulator':
- observables_x =[]
- observables_y =[]
- observables_z =[]
+ if args["backend"] != "simulator":
+ observables_x = []
+ observables_y = []
+ observables_z = []
for i in range(feat_dimension):
- observables_x.append( Pauli(id[:i] + 'X' + id[(i + 1):]).apply_layout(circuit.layout, num_qubits=backend.num_qubits) )
- observables_y.append( Pauli(id[:i] + 'Y' + id[(i + 1):]).apply_layout(circuit.layout, num_qubits=backend.num_qubits) )
- observables_z.append( Pauli(id[:i] + 'Z' + id[(i + 1):]).apply_layout(circuit.layout, num_qubits=backend.num_qubits) )
+ observables_x.append(
+ Pauli(id[:i] + "X" + id[(i + 1) :]).apply_layout(
+ circuit.layout, num_qubits=backend.num_qubits
+ )
+ )
+ observables_y.append(
+ Pauli(id[:i] + "Y" + id[(i + 1) :]).apply_layout(
+ circuit.layout, num_qubits=backend.num_qubits
+ )
+ )
+ observables_z.append(
+ Pauli(id[:i] + "Z" + id[(i + 1) :]).apply_layout(
+ circuit.layout, num_qubits=backend.num_qubits
+ )
+ )
else:
- observables_x = [Pauli(id[:i] + 'X' + id[(i + 1):]) for i in range(feat_dimension)]
- observables_y = [Pauli(id[:i] + 'Y' + id[(i + 1):]) for i in range(feat_dimension)]
- observables_z = [Pauli(id[:i] + 'Z' + id[(i + 1):]) for i in range(feat_dimension)]
-
-
+ observables_x = [
+ Pauli(id[:i] + "X" + id[(i + 1) :]) for i in range(feat_dimension)
+ ]
+ observables_y = [
+ Pauli(id[:i] + "Y" + id[(i + 1) :]) for i in range(feat_dimension)
+ ]
+ observables_z = [
+ Pauli(id[:i] + "Z" + id[(i + 1) :]) for i in range(feat_dimension)
+ ]
+
# projections[i][j][k] will be the expectation value of the j-th Pauli operator (0: X, 1: Y, 2: Z)
# of datapoint i on qubit k
projections = []
for i in range(len(dat)):
if i % 100 == 0:
- print('at datapoint {}'.format(i))
+ print("at datapoint {}".format(i))
- # Get training sample
+ # Get training sample
parameters = dat[i]
- # We define the primitive unified blocs (PUBs) consisting of the embedding circuit,
+ # We define the primitive unified blocs (PUBs) consisting of the embedding circuit,
# set of observables and the circuit parameters
pub_x = (circuit, observables_x, parameters)
pub_y = (circuit, observables_y, parameters)
@@ -172,20 +211,20 @@ def data_map_func(x: np.ndarray) -> float:
job_result_z = job.result()[2].data.evs
# Record <X>, <Y> and <Z> on all qubits for the current datapoint
- projections.append([job_result_x, job_result_y, job_result_z])
- np.save( f_tr, projections )
+ projections.append([job_result_x, job_result_y, job_result_z])
+ np.save(f_tr, projections)
if not isinstance(session, type(None)):
session.close()
# Load computed projections
- projections_train = np.load( file_projection_train )
+ projections_train = np.load(file_projection_train)
projections_train = np.array(projections_train).reshape(len(projections_train), -1)
- projections_test = np.load( file_projection_test )
+ projections_test = np.load(file_projection_test)
projections_test = np.array(projections_test).reshape(len(projections_test), -1)
-
+
# Check if XGBoost is requested but not available
- if 'xgb' in classical_models and not XGBOOST_AVAILABLE:
+ if "xgb" in classical_models and not XGBOOST_AVAILABLE:
warnings.warn(
"XGBoost is not properly installed or configured and will be skipped.\n"
"On macOS, you may need to install OpenMP:\n"
@@ -194,50 +233,58 @@ def data_map_func(x: np.ndarray) -> float:
" pip install --force-reinstall xgboost\n"
"See installation documentation for more details.\n"
f"Continuing with other models: {[m for m in classical_models if m != 'xgb']}",
- UserWarning
+ UserWarning,
)
# Remove xgb from the list
- classical_models = [m for m in classical_models if m != 'xgb']
-
+ classical_models = [m for m in classical_models if m != "xgb"]
+
# If no models remain after filtering, raise an error
if not classical_models:
- raise ValueError("No valid classical models specified. Please provide at least one model from: 'rf', 'mlp', 'svc', 'lr', 'xgb'")
-
+ raise ValueError(
+ "No valid classical models specified. Please provide at least one model from: 'rf', 'mlp', 'svc', 'lr', 'xgb'"
+ )
+
model_res = []
for method in classical_models:
- if method == 'rf':
- model = create_rf_model(args['seed'])
- elif method == 'svc':
- model = create_svc_model(args['seed'])
- elif method == 'mlp':
- model = create_mlp_model(args['seed'])
- elif method == 'lr':
- model = create_lr_model(args['seed'])
- elif method == 'xgb':
- model = create_xgb_model(args['seed'])
+ if method == "rf":
+ model = create_rf_model(args["seed"])
+ elif method == "svc":
+ model = create_svc_model(args["seed"])
+ elif method == "mlp":
+ model = create_mlp_model(args["seed"])
+ elif method == "lr":
+ model = create_lr_model(args["seed"])
+ elif method == "xgb":
+ model = create_xgb_model(args["seed"])
else:
- warnings.warn(f"Unknown model type '{method}' skipped. Valid options: 'rf', 'mlp', 'svc', 'lr', 'xgb'", UserWarning)
+ warnings.warn(
+ f"Unknown model type '{method}' skipped. Valid options: 'rf', 'mlp', 'svc', 'lr', 'xgb'",
+ UserWarning,
+ )
continue
-
- method_pqk = 'pqk_' + method
+
+ method_pqk = "pqk_" + method
print(method_pqk)
model.fit(projections_train, y_train)
y_predicted = model.predict(projections_test)
hyperparameters = {
- 'feature_map': feature_map.__class__.__name__,
- 'feature_map_reps': reps,
- 'entanglement' : entanglement,
- 'best_params': model.best_params_,
- # Add other hyperparameters as needed
- }
+ "feature_map": feature_map.__class__.__name__,
+ "feature_map_reps": reps,
+ "entanglement": entanglement,
+ "best_params": model.best_params_,
+ # Add other hyperparameters as needed
+ }
model_params = hyperparameters
- model_res.append(modeleval(y_test, y_predicted, beg_time, model_params, args, model=method_pqk, verbose=verbose))
+ model_res.append(
+ modeleval(
+ y_test, y_predicted, beg_time, model_params, args, model=method_pqk, verbose=verbose
+ )
+ )
model_res = pd.concat(model_res)
- return(model_res)
-
+ return model_res
def create_xgb_model(seed):
@@ -251,45 +298,50 @@ def create_xgb_model(seed):
" pip install --force-reinstall xgboost\n\n"
"See installation documentation for more details."
)
- xgb = XGBClassifier(objective='binary:logistic', eval_metric='logloss') # type: ignore
+ xgb = XGBClassifier(objective="binary:logistic", eval_metric="logloss") # type: ignore
xgb_param_distributions = {
- 'n_estimators': [100, 200, 300],
- 'learning_rate': [0.01, 0.1, 0.2],
- 'max_depth': [3, 5, 7],
- 'subsample': [0.7, 0.8, 1.0],
- 'colsample_bytree': [0.7, 0.8, 1.0],
- 'min_child_weight': [1, 3, 5]
+ "n_estimators": [100, 200, 300],
+ "learning_rate": [0.01, 0.1, 0.2],
+ "max_depth": [3, 5, 7],
+ "subsample": [0.7, 0.8, 1.0],
+ "colsample_bytree": [0.7, 0.8, 1.0],
+ "min_child_weight": [1, 3, 5],
}
# Initialize RandomizedSearchCV
- xgb_model = RandomizedSearchCV(estimator=xgb,
- param_distributions=xgb_param_distributions,
- n_iter=40,
- cv=5,
- random_state=seed,
- n_jobs=-1)
-
+ xgb_model = RandomizedSearchCV(
+ estimator=xgb,
+ param_distributions=xgb_param_distributions,
+ n_iter=40,
+ cv=5,
+ random_state=seed,
+ n_jobs=-1,
+ )
+
return xgb_model
+
def create_lr_model(seed):
# Initialize the Logistic Regression Classifier
lr = LogisticRegression(random_state=seed, max_iter=1000)
lr_param_distributions = {
- 'C': [0.001, 0.01, 0.1, 1, 10, 100],
- 'penalty': ['l1', 'l2'],
- 'solver': ['liblinear', 'saga']
+ "C": [0.001, 0.01, 0.1, 1, 10, 100],
+ "penalty": ["l1", "l2"],
+ "solver": ["liblinear", "saga"],
}
# Initialize RandomizedSearchCV
- lr_model = RandomizedSearchCV(estimator=lr,
- param_distributions=lr_param_distributions,
- n_iter=40,
- cv=5,
- random_state=seed,
- n_jobs=-1)
-
+ lr_model = RandomizedSearchCV(
+ estimator=lr,
+ param_distributions=lr_param_distributions,
+ n_iter=40,
+ cv=5,
+ random_state=seed,
+ n_jobs=-1,
+ )
+
return lr_model
@@ -298,58 +350,68 @@ def create_rf_model(seed):
rf = RandomForestClassifier(random_state=seed)
rf_param_distributions = {
- 'n_estimators': np.arange(100, 1000, 100),
- 'max_depth': np.arange(5, 20),
- 'min_samples_split': np.arange(2, 10),
- 'min_samples_leaf': np.arange(1, 5),
- 'bootstrap': [True, False]
+ "n_estimators": np.arange(100, 1000, 100),
+ "max_depth": np.arange(5, 20),
+ "min_samples_split": np.arange(2, 10),
+ "min_samples_leaf": np.arange(1, 5),
+ "bootstrap": [True, False],
}
# Initialize RandomizedSearchCV
- rf_model = RandomizedSearchCV(estimator=rf,
- param_distributions=rf_param_distributions,
- n_iter=40,
- cv=5,
- random_state=seed,
- n_jobs=-1)
-
+ rf_model = RandomizedSearchCV(
+ estimator=rf,
+ param_distributions=rf_param_distributions,
+ n_iter=40,
+ cv=5,
+ random_state=seed,
+ n_jobs=-1,
+ )
+
return rf_model
+
def create_mlp_model(seed):
- mlp_param_distributions = {"hidden_layer_sizes": [(128,64,32,10), (64,32,10), (128,64,32)],
- "activation": ["identity", "logistic", "tanh", "relu"],
- "solver": ["lbfgs", "sgd", "adam"],
- "alpha": [0.00005,0.0005]}
+ mlp_param_distributions = {
+ "hidden_layer_sizes": [(128, 64, 32, 10), (64, 32, 10), (128, 64, 32)],
+ "activation": ["identity", "logistic", "tanh", "relu"],
+ "solver": ["lbfgs", "sgd", "adam"],
+ "alpha": [0.00005, 0.0005],
+ }
# Initialize the MLP Classifier
mlp = MLPClassifier(random_state=seed)
# Initialize RandomizedSearchCV
- mlp_model = RandomizedSearchCV(estimator=mlp,
- param_distributions=mlp_param_distributions,
- n_iter=40,
- cv=5,
- random_state=seed,
- n_jobs=-1)
+ mlp_model = RandomizedSearchCV(
+ estimator=mlp,
+ param_distributions=mlp_param_distributions,
+ n_iter=40,
+ cv=5,
+ random_state=seed,
+ n_jobs=-1,
+ )
return mlp_model
+
def create_svc_model(seed):
- svc_param_distributions={
- 'C': [0.1, 1, 10, 100],
- 'gamma': [0.001, 0.01, 0.1, 1],
- 'kernel': ['linear', 'rbf', 'poly','sigmoid']
- }
+ svc_param_distributions = {
+ "C": [0.1, 1, 10, 100],
+ "gamma": [0.001, 0.01, 0.1, 1],
+ "kernel": ["linear", "rbf", "poly", "sigmoid"],
+ }
# Initialize the SVC
svc = SVC(random_state=seed)
# Initialize RandomizedSearchCV
- svc_model = RandomizedSearchCV(estimator=svc,
- param_distributions=svc_param_distributions,
- n_iter=40,
- cv=5,
- random_state=seed,
- n_jobs=-1)
-
- return svc_model
\ No newline at end of file
+ svc_model = RandomizedSearchCV(
+ estimator=svc,
+ param_distributions=svc_param_distributions,
+ n_iter=40,
+ cv=5,
+ random_state=seed,
+ n_jobs=-1,
+ )
+
+ return svc_model
diff --git a/qbiocode/learning/compute_qnn.py b/qbiocode/learning/compute_qnn.py
index 4bed244..9257ed1 100644
--- a/qbiocode/learning/compute_qnn.py
+++ b/qbiocode/learning/compute_qnn.py
@@ -2,23 +2,38 @@
import time
from typing import Literal
-# ====== Additional local imports ======
-from qbiocode.evaluation.model_evaluation import modeleval
-import qbiocode.utils.qutils as qutils
+# from qiskit.primitives import Sampler
+from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
+from qiskit_algorithms.utils import algorithm_globals
# ====== Qiskit imports ======
from qiskit_machine_learning.algorithms.classifiers import NeuralNetworkClassifier
-from qiskit_machine_learning.neural_networks import SamplerQNN, EstimatorQNN
-from qiskit_machine_learning.circuit.library import qnn_circuit as QNNCircuit
+from qiskit_machine_learning.circuit.library import qnn_circuit as QNNCircuit
+from qiskit_machine_learning.neural_networks import EstimatorQNN, SamplerQNN
-from qiskit_algorithms.utils import algorithm_globals
-#from qiskit.primitives import Sampler
-from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
+import qbiocode.utils.qutils as qutils
+
+# ====== Additional local imports ======
+from qbiocode.evaluation.model_evaluation import modeleval
-def compute_qnn(X_train, X_test, y_train, y_test, args, model='QNN', data_key = '',
- primitive: Literal['estimator', 'sampler'] = 'sampler', verbose=False,
- local_optimizer: Literal['COBYLA', 'L_BFGS_B', 'GradientDescent']='COBYLA',
- maxiter=100, encoding = 'Z', entanglement = 'linear', reps= 2, ansatz_type = 'amp'):
+
+def compute_qnn(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ model="QNN",
+ data_key="",
+ primitive: Literal["estimator", "sampler"] = "sampler",
+ verbose=False,
+ local_optimizer: Literal["COBYLA", "L_BFGS_B", "GradientDescent"] = "COBYLA",
+ maxiter=100,
+ encoding="Z",
+ entanglement="linear",
+ reps=2,
+ ansatz_type="amp",
+):
"""
This function computes a Quantum Neural Network (QNN) model on the provided training data and evaluates it on the test data.
It constructs a QNN circuit with a specified feature map and ansatz, optimizes it using a chosen optimizer, and fits the model to the training data.
@@ -41,31 +56,33 @@ def compute_qnn(X_train, X_test, y_train, y_test, args, model='QNN', data_key =
entanglement (str, optional): Entanglement strategy for the circuit. Defaults to 'linear'.
reps (int, optional): Number of repetitions for the feature map and ansatz. Defaults to 2.
ansatz_type (str, optional): Type of ansatz to use. Defaults to 'amp'.
-
+
Returns:
modeleval (dict): A dictionary containing the evaluation results, including accuracy, runtime, model parameters, and other relevant metrics.
- """
+ """
beg_time = time.time()
-
-
- # choose a method for mapping your features onto the circuit
- feature_map, _ = qutils.get_feature_map(feature_map=encoding,
- feat_dimension=X_train.shape[1],
- reps = reps,
- entanglement=entanglement)
+
+ # choose a method for mapping your features onto the circuit
+ feature_map, _ = qutils.get_feature_map(
+ feature_map=encoding, feat_dimension=X_train.shape[1], reps=reps, entanglement=entanglement
+ )
# get ansatz
- ansatz= qutils.get_ansatz( ansatz_type=ansatz_type, feat_dimension = feature_map.num_qubits, reps=reps, entanglement=entanglement)
+ ansatz = qutils.get_ansatz(
+ ansatz_type=ansatz_type,
+ feat_dimension=feature_map.num_qubits,
+ reps=reps,
+ entanglement=entanglement,
+ )
# Generate the backend, session and primitive
- backend, session, prim = qutils.get_backend_session(args,
- primitive,
- num_qubits=feature_map.num_qubits)
+ backend, session, prim = qutils.get_backend_session(
+ args, primitive, num_qubits=feature_map.num_qubits
+ )
- # Get Optimizer
- optimizer = qutils.get_optimizer( local_optimizer, max_iter=maxiter)
+ # Get Optimizer
+ optimizer = qutils.get_optimizer(local_optimizer, max_iter=maxiter)
-
# qc, input_params, weight_params = QNNCircuit(num_qubits=X_train.shape[1], feature_map=feature_map, ansatz=ansatz)
qc, _, _ = QNNCircuit(num_qubits=X_train.shape[1], feature_map=feature_map, ansatz=ansatz)
@@ -74,55 +91,75 @@ def compute_qnn(X_train, X_test, y_train, y_test, args, model='QNN', data_key =
print(f"The number of parameters in your circuit is: {feature_map.num_parameters}")
print(f"The number of ansatz parameters in your circuit is: {ansatz.num_parameters}")
- if primitive == 'estimator':
- if args['backend'] == 'simulator':
- qnn = EstimatorQNN(circuit=qc,
- input_params=feature_map.parameters,
- weight_params=ansatz.parameters)
+ neural_network: EstimatorQNN | SamplerQNN
+
+ if primitive == "estimator":
+ if args["backend"] == "simulator":
+ neural_network = EstimatorQNN(
+ circuit=qc, input_params=feature_map.parameters, weight_params=ansatz.parameters
+ )
else:
pm = generate_preset_pass_manager(backend=backend, optimization_level=3)
- qnn = EstimatorQNN(circuit=qc,
- estimator=prim,
- pass_manager=pm,
- input_params=feature_map.parameters,
- weight_params=ansatz.parameters)
+ neural_network = EstimatorQNN(
+ circuit=qc,
+ estimator=prim,
+ pass_manager=pm,
+ input_params=feature_map.parameters,
+ weight_params=ansatz.parameters,
+ )
# QNN maps inputs to [-1, +1]
- qnn.forward(X_train[0, :], algorithm_globals.random.random(qnn.num_weights))
+ neural_network.forward(
+ X_train[0, :], algorithm_globals.random.random(neural_network.num_weights)
+ )
else:
# sampler=Sampler(backend=backend)
# parity maps bitstrings to 0 or 1
def parity(x):
return "{:b}".format(x).count("1") % 2
- output_shape = 2 # corresponds to the number of classes, possible outcomes of the (parity) mapping
+
+ output_shape = (
+ 2 # corresponds to the number of classes, possible outcomes of the (parity) mapping
+ )
# construct QNN
- if 'simulator' in args['backend']:
- qnn = SamplerQNN(circuit=qc, interpret=parity, output_shape=output_shape,
- input_params=feature_map.parameters,
- weight_params=ansatz.parameters)
+ if "simulator" in args["backend"]:
+ neural_network = SamplerQNN(
+ circuit=qc,
+ interpret=parity,
+ output_shape=output_shape,
+ input_params=feature_map.parameters,
+ weight_params=ansatz.parameters,
+ )
else:
- pm = generate_preset_pass_manager(backend=backend, optimization_level=3)
- qnn = SamplerQNN(circuit=qc, sampler=prim,
- interpret=parity, output_shape=output_shape,
- pass_manager=pm, input_params=feature_map.parameters,
- weight_params=ansatz.parameters)
-
+ pm = generate_preset_pass_manager(backend=backend, optimization_level=3)
+ neural_network = SamplerQNN(
+ circuit=qc,
+ sampler=prim,
+ interpret=parity,
+ output_shape=output_shape,
+ pass_manager=pm,
+ input_params=feature_map.parameters,
+ weight_params=ansatz.parameters,
+ )
+
# construct classifier
- qnn = NeuralNetworkClassifier(neural_network=qnn, optimizer=optimizer)
-
- # fit classifier to data
+ qnn = NeuralNetworkClassifier(neural_network=neural_network, optimizer=optimizer)
+
+ # fit classifier to data
model_fit = qnn.fit(X_train, y_train)
hyperparameters = {
- 'feature_map': feature_map.__class__.__name__,
- 'ansatz': ansatz.__class__.__name__,
- 'optimizer': optimizer.__class__.__name__,
- 'optimizer_params': optimizer.settings,
- # Add other hyperparameters as needed
- }
+ "feature_map": feature_map.__class__.__name__,
+ "ansatz": ansatz.__class__.__name__,
+ "optimizer": optimizer.__class__.__name__,
+ "optimizer_params": optimizer.settings,
+ # Add other hyperparameters as needed
+ }
model_params = hyperparameters
y_predicted = qnn.predict(X_test)
-
+
if not isinstance(session, type(None)):
session.close()
-
- return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose))
\ No newline at end of file
+
+ return modeleval(
+ y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose
+ )
diff --git a/qbiocode/learning/compute_qsvc.py b/qbiocode/learning/compute_qsvc.py
index a3d05f7..7421afe 100644
--- a/qbiocode/learning/compute_qsvc.py
+++ b/qbiocode/learning/compute_qsvc.py
@@ -1,37 +1,55 @@
-import time
-import numpy as np
+import time
from typing import Literal
+import numpy as np
+from qiskit.circuit.library import PauliFeatureMap, ZFeatureMap, ZZFeatureMap
+from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
+from qiskit_aer import AerSimulator
+from qiskit_ibm_runtime import QiskitRuntimeService
+from qiskit_ibm_runtime import SamplerV2 as Sampler
+from qiskit_machine_learning.algorithms import QSVC, PegasosQSVC
+from qiskit_machine_learning.kernels import FidelityQuantumKernel
+
+# from qiskit.primitives import Sampler
+from qiskit_machine_learning.state_fidelities import ComputeUncompute
+from sklearn.model_selection import GridSearchCV
+
+import qbiocode.utils.qutils as qutils
+
# ====== Additional local imports ======
from qbiocode.evaluation.model_evaluation import modeleval
-import qbiocode.utils.qutils as qutils
# ====== Scikit-learn imports ======
-from sklearn.model_selection import GridSearchCV
# ====== Qiskit imports ======
-from qiskit.circuit.library import ZZFeatureMap
-from qiskit.circuit.library import ZZFeatureMap, ZFeatureMap, PauliFeatureMap
-from qiskit_aer import AerSimulator
-#from qiskit.primitives import Sampler
-from qiskit_machine_learning.state_fidelities import ComputeUncompute
-from qiskit_machine_learning.kernels import FidelityQuantumKernel
-from qiskit_machine_learning.algorithms import QSVC, PegasosQSVC
-from qiskit_ibm_runtime import QiskitRuntimeService, SamplerV2 as Sampler
-from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
-def compute_qsvc(X_train, X_test, y_train, y_test, args, model='QSVC', data_key = '',
- C=1, gamma='scale', pegasos=False, encoding: Literal['ZZ', 'Z', 'P']="ZZ",
- entanglement='linear', primitive = 'sampler', reps = 2, verbose=False, local_optimizer = ''):
+def compute_qsvc(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ model="QSVC",
+ data_key="",
+ C=1,
+ gamma="scale",
+ pegasos=False,
+ encoding: Literal["ZZ", "Z", "P"] = "ZZ",
+ entanglement="linear",
+ primitive="sampler",
+ reps=2,
+ verbose=False,
+ local_optimizer="",
+):
"""
This function computes a quantum support vector classifier (QSVC) using the Qiskit Machine Learning library.
It takes training and testing datasets, along with various parameters to configure the QSVC model.
It initializes the quantum feature map, sets up the backend and session, and fits the QSVC model to the training data.
It then predicts the labels for the test data and evaluates the model's performance.
The function returns the performance results, including accuracy, F1-score, AUC, runtime, as well as model parameters, and other relevant metrics.
-
+
Args:
X_train (np.ndarray): Training feature set.
X_test (np.ndarray): Testing feature set.
@@ -53,48 +71,50 @@ def compute_qsvc(X_train, X_test, y_train, y_test, args, model='QSVC', data_key
modeleval (dict): A dictionary containing the evaluation results, including accuracy, runtime, model parameters, and other relevant metrics.
"""
beg_time = time.time()
-
-
- # choose a method for mapping your features onto the circuit
- feature_map, _ = qutils.get_feature_map(feature_map=encoding,
- feat_dimension=X_train.shape[1],
- reps = reps,
- entanglement=entanglement)
+ # choose a method for mapping your features onto the circuit
+ feature_map, _ = qutils.get_feature_map(
+ feature_map=encoding, feat_dimension=X_train.shape[1], reps=reps, entanglement=entanglement
+ )
# Generate the backend, session and primitive
- backend, session, prim = qutils.get_backend_session(args,
- primitive,
- num_qubits=feature_map.num_qubits)
-
+ backend, session, prim = qutils.get_backend_session(
+ args, primitive, num_qubits=feature_map.num_qubits
+ )
+
print(f"Currently running a quantum support vector classifier (QSVC) on this dataset.")
print(f"The number of qubits in your circuit is: {feature_map.num_qubits}")
print(f"The number of parameters in your circuit is: {feature_map.num_parameters}")
-
- if 'simulator' == args['backend']:
+
+ if "simulator" == args["backend"]:
fidelity = ComputeUncompute(sampler=prim)
- else:
+ else:
# Need to instatiate a basic pass manager to store the chosen hardware backend
- pm = generate_preset_pass_manager(backend=backend, optimization_level=3)
- fidelity = ComputeUncompute(sampler=prim, pass_manager=pm) #, num_virtual_qubits = feature_map.num_qubits )
-
+ pm = generate_preset_pass_manager(backend=backend, optimization_level=3)
+ fidelity = ComputeUncompute(
+ sampler=prim, pass_manager=pm
+ ) # , num_virtual_qubits = feature_map.num_qubits )
+
Qkernel = FidelityQuantumKernel(fidelity=fidelity, feature_map=feature_map)
if pegasos == True:
- qsvc = PegasosQSVC(C=C, gamma=gamma, quantum_kernel=Qkernel)
+ qsvc = PegasosQSVC(C=C, quantum_kernel=Qkernel)
else:
qsvc = QSVC(C=C, gamma=gamma, quantum_kernel=Qkernel)
-
+
model_fit = qsvc.fit(X_train, y_train)
# model_params = model_fit.get_params()
- hyperparameters = {'feature_map': feature_map.__class__.__name__,
- 'quantum_kernel': Qkernel.__class__.__name__,
- 'C': C,
- 'gamma': gamma,
- }
+ hyperparameters = {
+ "feature_map": feature_map.__class__.__name__,
+ "quantum_kernel": Qkernel.__class__.__name__,
+ "C": C,
+ "gamma": gamma,
+ }
model_params = hyperparameters
- y_predicted = qsvc.predict(X_test)
+ y_predicted = qsvc.predict(X_test)
if not isinstance(session, type(None)):
session.close()
- return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose))
+ return modeleval(
+ y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose
+ )
diff --git a/qbiocode/learning/compute_rf.py b/qbiocode/learning/compute_rf.py
index 8abc0e1..24e1cbd 100644
--- a/qbiocode/learning/compute_rf.py
+++ b/qbiocode/learning/compute_rf.py
@@ -1,10 +1,8 @@
# ====== Base class imports ======
import time
-import numpy as np
-
-# ====== Scikit-learn imports ======
+import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
@@ -12,19 +10,46 @@
# ====== Additional local imports ======
from qbiocode.evaluation.model_evaluation import modeleval
+# ====== Scikit-learn imports ======
+
+
# ====== Begin functions ======
-def compute_rf(X_train, X_test, y_train, y_test, args, verbose=False, model='Random Forest', data_key = '',
- n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,
- min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0,
- bootstrap=True, oob_score=False, n_jobs=None, random_state=None, warm_start=False,
- class_weight=None, ccp_alpha=0.0, max_samples=None, monotonic_cst=None):
-
- """
+
+def compute_rf(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ verbose=False,
+ model="Random Forest",
+ data_key="",
+ n_estimators=100,
+ *,
+ criterion="gini",
+ max_depth=None,
+ min_samples_split=2,
+ min_samples_leaf=1,
+ min_weight_fraction_leaf=0.0,
+ max_features="sqrt",
+ max_leaf_nodes=None,
+ min_impurity_decrease=0.0,
+ bootstrap=True,
+ oob_score=False,
+ n_jobs=None,
+ random_state=None,
+ warm_start=False,
+ class_weight=None,
+ ccp_alpha=0.0,
+ max_samples=None,
+ monotonic_cst=None,
+):
+ """
This function generates a model using a Random Forest (RF) Classifier method as implemented in
- `scikit-learn `_.
+ `scikit-learn `__.
It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed.
- The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
+ The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model.
This function is designed to be used in a supervised learning context, where the goal is to classify data points.
@@ -56,34 +81,64 @@ def compute_rf(X_train, X_test, y_train, y_test, args, verbose=False, model='Ran
Returns:
modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken for training and validation.
- """
-
+ """
+
beg_time = time.time()
- rf = OneVsOneClassifier(RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
- min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
- min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features,
- max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease,
- bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state,
- warm_start=warm_start, class_weight=class_weight,
- ccp_alpha=ccp_alpha, max_samples=max_samples, monotonic_cst=monotonic_cst))
+ rf = OneVsOneClassifier(
+ RandomForestClassifier(
+ n_estimators=n_estimators,
+ criterion=criterion,
+ max_depth=max_depth,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ min_weight_fraction_leaf=min_weight_fraction_leaf,
+ max_features=max_features,
+ max_leaf_nodes=max_leaf_nodes,
+ min_impurity_decrease=min_impurity_decrease,
+ bootstrap=bootstrap,
+ oob_score=oob_score,
+ n_jobs=n_jobs,
+ random_state=random_state,
+ warm_start=warm_start,
+ class_weight=class_weight,
+ ccp_alpha=ccp_alpha,
+ max_samples=max_samples,
+ monotonic_cst=monotonic_cst,
+ )
+ )
# Fit the training datset
model_fit = rf.fit(X_train, y_train)
model_params = model_fit.get_params()
# Validate the model in test dataset and calculate accuracy
- y_predicted = rf.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose))
-
-def compute_rf_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, model='Random Forest',
- bootstrap= [], max_depth= [], max_features= [],
- min_samples_leaf= [], min_samples_split= [], n_estimators= []):
-
- """
+ y_predicted = rf.predict(X_test)
+ return modeleval(
+ y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose
+ )
+
+
+def compute_rf_opt(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ verbose=False,
+ cv=5,
+ model="Random Forest",
+ bootstrap=[],
+ max_depth=[],
+ max_features=[],
+ min_samples_leaf=[],
+ min_samples_split=[],
+ n_estimators=[],
+):
+ """
This function also generates a model using a Random Forest (RF) Classifier method as implemented in
- `scikit-learn `_.
+ `scikit-learn `__.
The difference here is that this function runs a grid search. The range of the grid search for each parameter is specified in the config.yaml file. The
combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar
datasets, without having to run the grid search.
- The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
+ The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search.
This function is designed to be used in a supervised learning context, where the goal is to classify data points.
@@ -104,19 +159,20 @@ def compute_rf_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5,
n_estimators (list): List of number of estimators options for grid search.
Returns:
- modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken for training and validation.
+ modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken for training and validation.
+
+ """
- """
-
beg_time = time.time()
- params={'n_estimators': n_estimators,
- 'max_features': max_features,
- 'max_depth': max_depth,
- 'min_samples_split': min_samples_split,
- 'min_samples_leaf': min_samples_leaf,
- 'bootstrap': bootstrap
- }
-
+ params = {
+ "n_estimators": n_estimators,
+ "max_features": max_features,
+ "max_depth": max_depth,
+ "min_samples_split": min_samples_split,
+ "min_samples_leaf": min_samples_leaf,
+ "bootstrap": bootstrap,
+ }
+
# Perform Grid Search to find the best parameters
grid_search = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=cv)
grid_search.fit(X_train, y_train)
@@ -128,4 +184,4 @@ def compute_rf_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5,
# Make predictions and calculate accuracy
y_predicted = best_rf.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose))
\ No newline at end of file
+ return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)
diff --git a/qbiocode/learning/compute_svc.py b/qbiocode/learning/compute_svc.py
index 4dcc8ed..fd47c7b 100644
--- a/qbiocode/learning/compute_svc.py
+++ b/qbiocode/learning/compute_svc.py
@@ -2,23 +2,44 @@
import time
-# ====== Scikit-learn imports ======
-
-from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
+from sklearn.svm import SVC
# ====== Additional local imports ======
from qbiocode.evaluation.model_evaluation import modeleval
-
-def compute_svc(X_train, X_test, y_train, y_test, args, model='SVC', data_key = '', C=1.0, kernel='rbf',
- degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200,
- class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None):
-
- """ This function generates a model using a Support Vector Classifier (SVC) method as implemented in
- `scikit-learn `_.
+
+# ====== Scikit-learn imports ======
+
+
+def compute_svc(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ model="SVC",
+ data_key="",
+ C=1.0,
+ kernel="rbf",
+ degree=3,
+ gamma="scale",
+ coef0=0.0,
+ shrinking=True,
+ probability=False,
+ tol=0.001,
+ cache_size=200,
+ class_weight=None,
+ verbose=False,
+ max_iter=-1,
+ decision_function_shape="ovr",
+ break_ties=False,
+ random_state=None,
+):
+ """This function generates a model using a Support Vector Classifier (SVC) method as implemented in
+ `scikit-learn `__.
It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed.
- The model is trained on the training dataset and validated on the test dataset. The model is trained on the training dataset and validated on the test dataset.
+ The model is trained on the training dataset and validated on the test dataset.
The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model.
This function is designed to be used in a supervised learning context, where the goal is to classify data points.
@@ -47,29 +68,56 @@ def compute_svc(X_train, X_test, y_train, y_test, args, model='SVC', data_key =
random_state (int or None): Controls the randomness of the estimator, default is None.
Returns:
modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken to train and validate the model.
- """
-
+ """
+
beg_time = time.time()
- svc = OneVsOneClassifier(SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
- probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight,
- max_iter=max_iter, decision_function_shape=decision_function_shape,
- break_ties=break_ties, random_state=random_state))
+ svc = OneVsOneClassifier(
+ SVC(
+ C=C,
+ kernel=kernel,
+ degree=degree,
+ gamma=gamma,
+ coef0=coef0,
+ shrinking=shrinking,
+ probability=probability,
+ tol=tol,
+ cache_size=cache_size,
+ class_weight=class_weight,
+ max_iter=max_iter,
+ decision_function_shape=decision_function_shape,
+ break_ties=break_ties,
+ random_state=random_state,
+ )
+ )
# Fit the training datset
model_fit = svc.fit(X_train, y_train)
model_params = model_fit.get_params()
# Validate the model in test dataset and calculate accuracy
- y_predicted = svc.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose))
+ y_predicted = svc.predict(X_test)
+ return modeleval(
+ y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose
+ )
+
-def compute_svc_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, model='SVC',
- C=[], gamma=[], kernel=[]):
-
- """ This function generates a model using a Support Vector Classifier (SVC) method as implemented in
- `scikit-learn `_.
+def compute_svc_opt(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ verbose=False,
+ cv=5,
+ model="SVC",
+ C=[],
+ gamma=[],
+ kernel=[],
+):
+ """This function generates a model using a Support Vector Classifier (SVC) method as implemented in
+ `scikit-learn `__.
It takes in parameter arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed. The
combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar
datasets, without having to run the grid search.
- The model is trained on the training dataset and validated on the test dataset. The model is trained on the training dataset and validated on the test dataset.
+ The model is trained on the training dataset and validated on the test dataset.
The function returns the evaluation of the model on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search.
This function is designed to be used in a supervised learning context, where the goal is to classify data points.
@@ -87,13 +135,10 @@ def compute_svc_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5,
kernel (list or str): Specifies the kernel type(s) to be used in the algorithm, default is an empty list.
Returns:
modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search.
- """
+ """
beg_time = time.time()
- params={'C': C,
- 'gamma': gamma,
- 'kernel': kernel
- }
+ params = {"C": C, "gamma": gamma, "kernel": kernel}
# Perform Grid Search to find the best parameters
grid_search = GridSearchCV(SVC(), param_grid=params, cv=cv)
grid_search.fit(X_train, y_train)
@@ -105,4 +150,4 @@ def compute_svc_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5,
# Make predictions and calculate accuracy
y_predicted = best_svc.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose))
\ No newline at end of file
+ return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)
diff --git a/qbiocode/learning/compute_vqc.py b/qbiocode/learning/compute_vqc.py
index 2719515..da21f82 100644
--- a/qbiocode/learning/compute_vqc.py
+++ b/qbiocode/learning/compute_vqc.py
@@ -2,18 +2,35 @@
import time
from typing import Literal
-# ====== Additional local imports ======
-from qbiocode.evaluation.model_evaluation import modeleval
-import qbiocode.utils.qutils as qutils
+# from qiskit.primitives import Sampler
+from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
# ====== Qiskit imports ======
from qiskit_machine_learning.algorithms.classifiers import VQC
-#from qiskit.primitives import Sampler
-from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
-def compute_vqc(X_train, X_test, y_train, y_test, args, verbose=False, model='VQC', data_key = '',
- local_optimizer: Literal['COBYLA', 'L_BFGS_B', 'GradientDescent']="COBYLA", maxiter=100,
- encoding = 'Z', entanglement = 'linear', reps= 2,primitive = 'sampler', ansatz_type='amp'):
+import qbiocode.utils.qutils as qutils
+
+# ====== Additional local imports ======
+from qbiocode.evaluation.model_evaluation import modeleval
+
+
+def compute_vqc(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ verbose=False,
+ model="VQC",
+ data_key="",
+ local_optimizer: Literal["COBYLA", "L_BFGS_B", "GradientDescent"] = "COBYLA",
+ maxiter=100,
+ encoding="Z",
+ entanglement="linear",
+ reps=2,
+ primitive="sampler",
+ ansatz_type="amp",
+):
"""
This function computes a Variational Quantum Classifier (VQC) using the Qiskit Machine Learning library.
It takes training and testing datasets, along with various parameters to configure the VQC model.
@@ -41,48 +58,59 @@ def compute_vqc(X_train, X_test, y_train, y_test, args, verbose=False, model='VQ
dict: Evaluation results including accuracy, time taken, and model parameters.
"""
beg_time = time.time()
- # choose a method for mapping your features onto the circuit
- feature_map, _ = qutils.get_feature_map(feature_map=encoding,
- feat_dimension=X_train.shape[1],
- reps = reps,
- entanglement=entanglement)
+ # choose a method for mapping your features onto the circuit
+ feature_map, _ = qutils.get_feature_map(
+ feature_map=encoding, feat_dimension=X_train.shape[1], reps=reps, entanglement=entanglement
+ )
# get ansatz
- ansatz= qutils.get_ansatz( ansatz_type=ansatz_type, feat_dimension = feature_map.num_qubits, reps=reps, entanglement=entanglement)
-
+ ansatz = qutils.get_ansatz(
+ ansatz_type=ansatz_type,
+ feat_dimension=feature_map.num_qubits,
+ reps=reps,
+ entanglement=entanglement,
+ )
# Generate the backend, session and primitive
- backend, session, prim = qutils.get_backend_session(args,
- primitive,
- num_qubits=feature_map.num_qubits)
-
- # Get Optimizer
- optimizer = qutils.get_optimizer( local_optimizer, max_iter=maxiter)
-
- # instantiate the primitive
- if 'simulator' == args['backend']:
- vqc= VQC(sampler=prim, feature_map=feature_map, ansatz=ansatz, optimizer=optimizer)
+ backend, session, prim = qutils.get_backend_session(
+ args, primitive, num_qubits=feature_map.num_qubits
+ )
+
+ # Get Optimizer
+ optimizer = qutils.get_optimizer(local_optimizer, max_iter=maxiter)
+
+ # instantiate the primitive
+ if "simulator" == args["backend"]:
+ vqc = VQC(sampler=prim, feature_map=feature_map, ansatz=ansatz, optimizer=optimizer)
else:
pm = generate_preset_pass_manager(backend=backend, optimization_level=3)
- vqc= VQC(sampler=prim, feature_map=feature_map, ansatz=ansatz, optimizer=optimizer, pass_manager=pm)
+ vqc = VQC(
+ sampler=prim,
+ feature_map=feature_map,
+ ansatz=ansatz,
+ optimizer=optimizer,
+ pass_manager=pm,
+ )
print(f"Currently running a variational quantum classifer (VQC) on this dataset.")
print(f"The number of qubits in your circuit is: {feature_map.num_qubits}")
print(f"The number of parameters in your circuit is: {feature_map.num_parameters}")
-
+
# fit classifier to data
model_fit = vqc.fit(X_train, y_train)
hyperparameters = {
- 'feature_map': feature_map.__class__.__name__,
- 'ansatz': ansatz.__class__.__name__,
- 'optimizer': optimizer.__class__.__name__,
- 'optimizer_params': optimizer.settings,
- # Add other hyperparameters as needed
- }
+ "feature_map": feature_map.__class__.__name__,
+ "ansatz": ansatz.__class__.__name__,
+ "optimizer": optimizer.__class__.__name__,
+ "optimizer_params": optimizer.settings,
+ # Add other hyperparameters as needed
+ }
model_params = hyperparameters
y_predicted = vqc.predict(X_test)
if not isinstance(session, type(None)):
session.close()
- return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose))
\ No newline at end of file
+ return modeleval(
+ y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose
+ )
diff --git a/qbiocode/learning/compute_xgb.py b/qbiocode/learning/compute_xgb.py
index 2ae0227..ef446eb 100644
--- a/qbiocode/learning/compute_xgb.py
+++ b/qbiocode/learning/compute_xgb.py
@@ -1,12 +1,14 @@
# ====== Base class imports ======
import time
+
import numpy as np
# ====== Scikit-learn imports ======
try:
from xgboost import XGBClassifier
+
XGBOOST_AVAILABLE = True
_XGBOOST_ERROR = None
except Exception as e:
@@ -23,10 +25,25 @@
# ====== Begin functions ======
-def compute_xgb(X_train, X_test, y_train, y_test, args, verbose=False, model='xgb', data_key = '',
- n_estimators=100, *, criterion='gini', max_depth=None, subsample=0.5, learning_rate=0.5,
- colsample_bytree=1, min_child_weight=1):
-
+
+def compute_xgb(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ verbose=False,
+ model="xgb",
+ data_key="",
+ n_estimators=100,
+ *,
+ criterion="gini",
+ max_depth=None,
+ subsample=0.5,
+ learning_rate=0.5,
+ colsample_bytree=1,
+ min_child_weight=1,
+):
"""
This function generates a model using an Extreme Gradient Boositing (xgb) Classifier method as implemented in xgboost. It takes in parameter
arguments specified in the config.yaml file, but will use the default parameters specified above if none are passed.
@@ -51,12 +68,12 @@ def compute_xgb(X_train, X_test, y_train, y_test, args, verbose=False, model='xg
min_child_weight (int) : Minimum sum of instance weight (hessian) needed in a child. Default is 1
Returns:
modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken for training and validation.
-
+
Raises:
ImportError: If XGBoost is not properly installed or configured.
"""
-
+
if not XGBOOST_AVAILABLE:
error_msg = (
"XGBoost is not properly installed or configured.\n"
@@ -68,28 +85,53 @@ def compute_xgb(X_train, X_test, y_train, y_test, args, verbose=False, model='xg
"See installation documentation for more details."
)
raise ImportError(error_msg)
-
+
beg_time = time.time()
- xgb = OneVsOneClassifier(XGBClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, # type: ignore
- subsample=subsample, learning_rate=learning_rate, colsample_bytree=colsample_bytree,
- min_child_weight=min_child_weight))
+ xgb = OneVsOneClassifier(
+ XGBClassifier(
+ n_estimators=n_estimators,
+ criterion=criterion,
+ max_depth=max_depth, # type: ignore
+ subsample=subsample,
+ learning_rate=learning_rate,
+ colsample_bytree=colsample_bytree,
+ min_child_weight=min_child_weight,
+ )
+ )
# Fit the training datset
model_fit = xgb.fit(X_train, y_train)
model_params = model_fit.get_params()
# Validate the model in test dataset and calculate accuracy
- y_predicted = xgb.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose))
-
-def compute_xgb_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5, model='xgb',
- bootstrap= [], max_depth= [], max_features= [],learning_rate=[],subsample = [], colsample_bytree = []
- , n_estimators= [], min_child_weight = []):
-
- """
+ y_predicted = xgb.predict(X_test)
+ return modeleval(
+ y_test, y_predicted, beg_time, model_params, args, model=model, verbose=verbose
+ )
+
+
+def compute_xgb_opt(
+ X_train,
+ X_test,
+ y_train,
+ y_test,
+ args,
+ verbose=False,
+ cv=5,
+ model="xgb",
+ bootstrap=[],
+ max_depth=[],
+ max_features=[],
+ learning_rate=[],
+ subsample=[],
+ colsample_bytree=[],
+ n_estimators=[],
+ min_child_weight=[],
+):
+ """
This function generates a model using an Extreme Gradient Boositing (xgb) Classifier method as implemented in xgboost.
The difference here is that this function runs a grid search. The range of the grid search for each parameter is specified in the config.yaml file. The
combination of parameters that led to the best performance is saved and returned as best_params, which can then be used on similar
datasets, without having to run the grid search.
- The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
+ The model is trained on the training dataset and validated on the test dataset. The function returns the evaluation of the model
on the test dataset, including accuracy, AUC, F1 score, and the time taken to train and validate the model across the grid search.
This function is designed to be used in a supervised learning context, where the goal is to classify data points.
@@ -111,12 +153,12 @@ def compute_xgb_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5,
min_child_weight (list): List of minimum sum of instance weight (hessian) needed in a childoptions for grid search.
Returns:
- modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken for training and validation.
+ modeleval (dict): A dictionary containing the evaluation metrics of the model, including accuracy, AUC, F1 score, and the time taken for training and validation.
Raises:
ImportError: If XGBoost is not properly installed or configured.
"""
-
+
if not XGBOOST_AVAILABLE:
error_msg = (
"XGBoost is not properly installed or configured.\n"
@@ -128,17 +170,18 @@ def compute_xgb_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5,
"See installation documentation for more details."
)
raise ImportError(error_msg)
-
+
beg_time = time.time()
- params={'n_estimators': n_estimators,
- 'max_depth': max_depth,
- 'learning_rate' : learning_rate,
- 'subsample' : subsample,
- 'colsample_bytree' : colsample_bytree,
- 'min_child_weight' : min_child_weight,
- 'bootstrap': bootstrap
- }
-
+ params = {
+ "n_estimators": n_estimators,
+ "max_depth": max_depth,
+ "learning_rate": learning_rate,
+ "subsample": subsample,
+ "colsample_bytree": colsample_bytree,
+ "min_child_weight": min_child_weight,
+ "bootstrap": bootstrap,
+ }
+
# Perform Grid Search to find the best parameters
grid_search = GridSearchCV(XGBClassifier(), param_grid=params, cv=cv) # type: ignore
grid_search.fit(X_train, y_train)
@@ -150,4 +193,4 @@ def compute_xgb_opt(X_train, X_test, y_train, y_test, args, verbose=False, cv=5,
# Make predictions and calculate accuracy
y_predicted = best_xgb.predict(X_test)
- return(modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose))
\ No newline at end of file
+ return modeleval(y_test, y_predicted, beg_time, best_params, args, model=model, verbose=verbose)
diff --git a/qbiocode/utils/__init__.py b/qbiocode/utils/__init__.py
index 40c0fe5..cdd23d9 100644
--- a/qbiocode/utils/__init__.py
+++ b/qbiocode/utils/__init__.py
@@ -6,7 +6,7 @@
model management, IBM Quantum account handling, and result analysis.
Available Functions
-------------------
+-------------------
- scaler_fn: Data scaling and normalization
- feature_encoding: Encode features for quantum circuits
- qml_winner: Identify best performing quantum model
@@ -34,52 +34,46 @@
>>> X_encoded = feature_encoding(X, feature_encoding='OneHotEncoder')
"""
-from .helper_fn import scaler_fn, feature_encoding
-from .qc_winner_finder import qml_winner
+from .combine_evals_results import combine_results, track_progress
from .dataset_checkpoint import checkpoint_restart
-from .combine_evals_results import track_progress, combine_results
from .find_duplicates import find_duplicate_files
from .find_string import find_string_in_files
from .generate_qml_configs import generate_qml_experiment_configs
+from .helper_fn import feature_encoding, scaler_fn
from .ibm_account import get_creds, instantiate_runtime_service
+from .qc_winner_finder import qml_winner
from .qutils import (
+ get_ansatz,
get_backend_session,
- get_sampler,
get_estimator,
- get_ansatz,
get_feature_map,
get_optimizer,
+ get_sampler,
)
__all__ = [
# Data preprocessing
- 'scaler_fn',
- 'feature_encoding',
-
+ "scaler_fn",
+ "feature_encoding",
# Model management
- 'qml_winner',
- 'checkpoint_restart',
-
+ "qml_winner",
+ "checkpoint_restart",
# Results management
- 'track_progress',
- 'combine_results',
-
+ "track_progress",
+ "combine_results",
# Configuration generation
- 'generate_qml_experiment_configs',
-
+ "generate_qml_experiment_configs",
# File utilities
- 'find_duplicate_files',
- 'find_string_in_files',
-
+ "find_duplicate_files",
+ "find_string_in_files",
# IBM Quantum utilities
- 'get_creds',
- 'instantiate_runtime_service',
-
+ "get_creds",
+ "instantiate_runtime_service",
# Quantum utilities
- 'get_backend_session',
- 'get_sampler',
- 'get_estimator',
- 'get_ansatz',
- 'get_feature_map',
- 'get_optimizer',
+ "get_backend_session",
+ "get_sampler",
+ "get_estimator",
+ "get_ansatz",
+ "get_feature_map",
+ "get_optimizer",
]
diff --git a/qbiocode/utils/combine_evals_results.py b/qbiocode/utils/combine_evals_results.py
index 16ed1ac..7e09e9b 100644
--- a/qbiocode/utils/combine_evals_results.py
+++ b/qbiocode/utils/combine_evals_results.py
@@ -8,25 +8,26 @@
"""
import os
+from typing import List, Optional, Tuple
+
import pandas as pd
-from typing import List, Tuple, Optional
def track_progress(
input_dataset_dir: str,
current_results_dir: str,
- completion_marker: str = 'RawDataEvaluation.csv',
+ completion_marker: str = "RawDataEvaluation.csv",
prefix_length: int = 8,
- input_extension: str = 'csv',
- verbose: bool = True
+ input_extension: str = "csv",
+ verbose: bool = True,
) -> Tuple[List[str], int, int]:
"""
Track progress of a computational job by checking for completed datasets.
-
+
This function scans the results directory for completed datasets (identified
by the presence of a specific marker file) and compares against the total
number of input datasets to determine how many remain to be processed.
-
+
Parameters
----------
input_dataset_dir : str
@@ -38,13 +39,13 @@ def track_progress(
Default is 'RawDataEvaluation.csv'.
prefix_length : int, optional
Number of characters to skip from the beginning of directory names
- when extracting dataset identifiers. Default is 8 (e.g., skips 'dataset_'
+ when extracting dataset identifiers. Default is 8 (e.g., skips ``dataset_``
prefix).
input_extension : str, optional
File extension of input datasets (without dot). Default is 'csv'.
verbose : bool, optional
If True, prints progress information. Default is True.
-
+
Returns
-------
completed_datasets : List[str]
@@ -53,7 +54,7 @@ def track_progress(
Number of completed datasets.
num_remaining : int
Number of datasets remaining to be processed.
-
+
Examples
--------
>>> from qbiocode.utils import track_progress
@@ -64,7 +65,7 @@ def track_progress(
The completed datasets are: ['dataset1', 'dataset2']
You have finished running program on 2 out of a total of 10 input datasets.
You have 8 input datasets left before program finishes.
-
+
>>> # Custom completion marker
>>> completed, done, remaining = track_progress(
... input_dataset_dir='data/inputs',
@@ -74,7 +75,7 @@ def track_progress(
... )
"""
completed_files = []
-
+
# Scan results directory for completed datasets
for entry in os.scandir(current_results_dir):
if entry.is_dir():
@@ -83,42 +84,44 @@ def track_progress(
# Extract dataset identifier by skipping prefix
dataset_id = entry.name[prefix_length:] if prefix_length > 0 else entry.name
completed_files.append(dataset_id)
-
+
# Count total input datasets
num_input_datasets = []
for file in os.listdir(input_dataset_dir):
if file.endswith(input_extension):
num_input_datasets.append(file)
-
+
num_completed = len(completed_files)
num_total = len(num_input_datasets)
num_remaining = num_total - num_completed
-
+
if verbose:
- print(f'The completed datasets are: {completed_files}')
- print(f'You have finished running program on {num_completed} out of a total of {num_total} input datasets.')
- print(f'You have {num_remaining} input datasets left before program finishes.')
-
+ print(f"The completed datasets are: {completed_files}")
+ print(
+ f"You have finished running program on {num_completed} out of a total of {num_total} input datasets."
+ )
+ print(f"You have {num_remaining} input datasets left before program finishes.")
+
return completed_files, num_completed, num_remaining
def combine_results(
prev_results_dir: str,
recent_results_dir: str,
- eval_file_prefix: str = 'Raw',
- results_file_prefix: str = 'Model',
- output_eval_file: str = 'RawDataEvaluation_Combined.csv',
- output_results_file: str = 'ModelResults_Combined.csv',
+ eval_file_prefix: str = "Raw",
+ results_file_prefix: str = "Model",
+ output_eval_file: str = "RawDataEvaluation_Combined.csv",
+ output_results_file: str = "ModelResults_Combined.csv",
save_intermediate: bool = True,
- verbose: bool = True
+ verbose: bool = True,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Combine results from interrupted and resumed computational jobs.
-
+
This function merges CSV files from a previous (interrupted) job run with
files from a recent (resumed) job run. It's useful when a long-running
computational job needs to be restarted and you want to combine all results.
-
+
Parameters
----------
prev_results_dir : str
@@ -132,7 +135,7 @@ def combine_results(
results_file_prefix : str, optional
Prefix of model results files to combine. Default is 'Model'.
output_eval_file : str, optional
- Name of the combined evaluation output file.
+ Name of the combined evaluation output file.
Default is 'RawDataEvaluation_Combined.csv'.
output_results_file : str, optional
Name of the combined results output file.
@@ -142,14 +145,14 @@ def combine_results(
Default is True.
verbose : bool, optional
If True, prints shape information during processing. Default is True.
-
+
Returns
-------
combined_eval_df : pd.DataFrame
Combined dataframe of all evaluation/assessment data.
combined_results_df : pd.DataFrame
Combined dataframe of all model results.
-
+
Examples
--------
>>> from qbiocode.utils import combine_results
@@ -159,7 +162,7 @@ def combine_results(
... )
>>> print(f"Combined {len(eval_df)} evaluation records")
>>> print(f"Combined {len(results_df)} result records")
-
+
>>> # Custom file prefixes and output names
>>> eval_df, results_df = combine_results(
... prev_results_dir='results/old',
@@ -169,7 +172,7 @@ def combine_results(
... output_eval_file='AllEvaluations.csv',
... output_results_file='AllResults.csv'
... )
-
+
Notes
-----
The function expects:
@@ -182,7 +185,7 @@ def combine_results(
previous_combined_eval_df = []
results_dfs = []
previous_combined_result_df = []
-
+
# Collect all individual CSV files from previous run subdirectories
for entry in os.scandir(prev_results_dir):
if entry.is_dir():
@@ -193,30 +196,32 @@ def combine_results(
if file.startswith(results_file_prefix):
results_csv_files = os.path.join(entry, file)
results_dfs.append(results_csv_files)
-
+
# Read and collect all previous evaluation dataframes
for evalfile in eval_dfs:
df1 = pd.read_csv(evalfile)
previous_combined_eval_df.append(df1)
-
+
# Read and collect all previous results dataframes
for resultsfile in results_dfs:
df2 = pd.read_csv(resultsfile)
previous_combined_result_df.append(df2)
-
+
# Concatenate all previous dataframes
concat_previous_eval_df = pd.concat(previous_combined_eval_df, ignore_index=True)
concat_previous_result_df = pd.concat(previous_combined_result_df, ignore_index=True)
-
+
# Optionally save intermediate combined files
if save_intermediate:
- concat_previous_eval_df.to_csv(f'{eval_file_prefix}DataEvaluation_previous.csv', index=False)
- concat_previous_result_df.to_csv(f'{results_file_prefix}Results_previous.csv', index=False)
-
+ concat_previous_eval_df.to_csv(
+ f"{eval_file_prefix}DataEvaluation_previous.csv", index=False
+ )
+ concat_previous_result_df.to_csv(f"{results_file_prefix}Results_previous.csv", index=False)
+
# Read recent (resumed run) dataframes
recent_eval_df = None
recent_results_df = None
-
+
for file in os.listdir(recent_results_dir):
if file.startswith(eval_file_prefix):
recent_eval_csv_file = os.path.join(recent_results_dir, file)
@@ -226,7 +231,7 @@ def combine_results(
recent_results_csv_file = os.path.join(recent_results_dir, file)
recent_results_df = pd.read_csv(recent_results_csv_file, index_col=0)
recent_results_df.reset_index(drop=True, inplace=True)
-
+
# Verify that recent dataframes were found
if recent_eval_df is None:
raise FileNotFoundError(
@@ -236,39 +241,41 @@ def combine_results(
raise FileNotFoundError(
f"No results file starting with '{results_file_prefix}' found in {recent_results_dir}"
)
-
+
if verbose:
print(f"Recent evaluation dataframe shape: {recent_eval_df.shape}")
print(f"Previous evaluation dataframe shape: {concat_previous_eval_df.shape}")
print(f"Recent results dataframe shape: {recent_results_df.shape}")
print(f"Previous results dataframe shape: {concat_previous_result_df.shape}")
-
+
# Combine previous and recent dataframes
new_combined_eval_df = pd.concat([concat_previous_eval_df, recent_eval_df], ignore_index=True)
- new_combined_result_df = pd.concat([concat_previous_result_df, recent_results_df], ignore_index=True)
-
+ new_combined_result_df = pd.concat(
+ [concat_previous_result_df, recent_results_df], ignore_index=True
+ )
+
# Save final combined dataframes
new_combined_eval_df.to_csv(output_eval_file, index=False)
new_combined_result_df.to_csv(output_results_file, index=False)
-
+
if verbose:
print(f"\nCombined evaluation dataframe shape: {new_combined_eval_df.shape}")
print(f"Combined results dataframe shape: {new_combined_result_df.shape}")
print(f"\nSaved combined files:")
print(f" - {output_eval_file}")
print(f" - {output_results_file}")
-
+
return new_combined_eval_df, new_combined_result_df
# Example usage (commented out to prevent execution at import time):
-#
+#
# # Track progress of current job
# completed, done, remaining = track_progress(
# input_dataset_dir='data/inputs',
# current_results_dir='results/current_run'
# )
-#
+#
# # Combine results from interrupted and resumed runs
# eval_df, results_df = combine_results(
# prev_results_dir='results/run1_interrupted',
diff --git a/qbiocode/utils/dataset_checkpoint.py b/qbiocode/utils/dataset_checkpoint.py
index c034a49..14a0fde 100644
--- a/qbiocode/utils/dataset_checkpoint.py
+++ b/qbiocode/utils/dataset_checkpoint.py
@@ -11,22 +11,22 @@
def checkpoint_restart(
previous_results_dir: str,
- completion_marker: str = 'RawDataEvaluation.csv',
+ completion_marker: str = "RawDataEvaluation.csv",
prefix_length: int = 8,
- verbose: bool = False
+ verbose: bool = False,
) -> List[str]:
"""
Identify completed datasets from a previous run to enable checkpoint restart.
-
+
This function scans a results directory to find which datasets were fully processed
in a previous run by checking for the presence of a completion marker file. This
allows you to resume interrupted batch processing jobs without reprocessing
completed datasets.
-
+
The function assumes that each dataset has its own subdirectory in the results
directory, and that a specific file (completion marker) is created when processing
completes successfully.
-
+
Parameters
----------
previous_results_dir : str
@@ -37,43 +37,43 @@ def checkpoint_restart(
Default is 'RawDataEvaluation.csv' (used by QProfiler).
prefix_length : int, optional
Number of characters to strip from the beginning of directory names to get
- the dataset name. Default is 8 (strips 'dataset_' prefix used by QProfiler).
+ the dataset name. Default is 8 (strips ``dataset_`` prefix used by QProfiler).
Set to 0 to use the full directory name.
verbose : bool, optional
If True, print the list of completed datasets and count. Default is False.
-
+
Returns
-------
List[str]
List of dataset names that were fully processed in the previous run.
These can be excluded when restarting the batch job.
-
+
Examples
--------
Basic usage with QProfiler default settings:
-
+
>>> completed = checkpoint_restart('/path/to/previous_results')
>>> print(f"Found {len(completed)} completed datasets")
-
+
Resume processing only incomplete datasets:
-
+
>>> import os
>>> all_datasets = [f for f in os.listdir('/path/to/data') if f.endswith('.csv')]
>>> completed = checkpoint_restart('/path/to/previous_results')
>>> remaining = [d for d in all_datasets if d not in completed]
>>> print(f"Need to process {len(remaining)} more datasets")
-
+
Custom completion marker and no prefix stripping:
-
+
>>> completed = checkpoint_restart(
... '/path/to/results',
... completion_marker='ModelResults.csv',
... prefix_length=0,
... verbose=True
... )
-
+
Integration with QProfiler batch processing:
-
+
>>> from qbiocode.utils.dataset_checkpoint import checkpoint_restart
>>>
>>> # Get list of completed datasets from previous run
@@ -91,34 +91,30 @@ def checkpoint_restart(
>>>
>>> # Run QProfiler only on remaining datasets
>>> # (use datasets_to_process in your batch processing loop)
-
+
Notes
-----
- The function only checks for the presence of the completion marker file,
not its contents or validity
- When restarting, you may need to manually combine results from the previous
and current runs
- - Directory names are expected to have a consistent prefix (e.g., 'dataset_')
+ - Directory names are expected to have a consistent prefix (e.g., ``dataset_``)
that can be stripped using the prefix_length parameter
- Non-directory entries in previous_results_dir are ignored
-
+
See Also
--------
qbiocode.evaluation.model_run : Main QProfiler batch processing function
"""
completed_files = []
-
+
# Validate input directory
if not os.path.exists(previous_results_dir):
- raise FileNotFoundError(
- f"Previous results directory not found: {previous_results_dir}"
- )
-
+ raise FileNotFoundError(f"Previous results directory not found: {previous_results_dir}")
+
if not os.path.isdir(previous_results_dir):
- raise NotADirectoryError(
- f"Path is not a directory: {previous_results_dir}"
- )
-
+ raise NotADirectoryError(f"Path is not a directory: {previous_results_dir}")
+
# Scan for completed datasets
for entry in os.scandir(previous_results_dir):
if entry.is_dir():
@@ -131,10 +127,10 @@ def checkpoint_restart(
else:
dataset_name = entry.name
completed_files.append(dataset_name)
-
+
if verbose:
print(f"Found {len(completed_files)} completed datasets:")
for dataset in sorted(completed_files):
print(f" - {dataset}")
-
+
return completed_files
diff --git a/qbiocode/utils/find_duplicates.py b/qbiocode/utils/find_duplicates.py
index 31efe30..9e695d1 100644
--- a/qbiocode/utils/find_duplicates.py
+++ b/qbiocode/utils/find_duplicates.py
@@ -5,9 +5,9 @@
useful for cleaning up redundant configuration files or identifying duplicate datasets.
"""
-import os
import itertools
-from typing import List, Tuple, Optional
+import os
+from typing import List, Optional, Tuple
def find_duplicate_files(
@@ -15,22 +15,22 @@ def find_duplicate_files(
file_pattern: Optional[str] = None,
ignore_empty_lines: bool = True,
case_sensitive: bool = True,
- verbose: bool = False
+ verbose: bool = False,
) -> List[Tuple[str, str]]:
"""
Find files with identical content in a directory.
-
+
Scans the specified directory for files and compares their content line by line.
Identifies files that have identical content, even if they have different names.
Optionally filters files by pattern and provides various comparison options.
-
+
This is particularly useful for:
-
+
- Finding duplicate configuration files (e.g., YAML, JSON)
- Identifying redundant experiment configurations
- Cleaning up duplicate datasets before batch processing
- Validating file uniqueness in automated workflows
-
+
Parameters
----------
directory : str
@@ -44,13 +44,13 @@ def find_duplicate_files(
If True, comparison is case-sensitive. Default is True.
verbose : bool, optional
If True, print progress information during comparison. Default is False.
-
+
Returns
-------
List[Tuple[str, str]]
List of tuples, where each tuple contains paths of two duplicate files.
Returns empty list if no duplicates are found.
-
+
Raises
------
FileNotFoundError
@@ -59,17 +59,17 @@ def find_duplicate_files(
If the specified path is not a directory.
PermissionError
If files cannot be read due to permission issues.
-
+
Examples
--------
Find all duplicate files in a directory:
-
+
>>> duplicates = find_duplicate_files("configs/")
>>> if duplicates:
... print(f"Found {len(duplicates)} duplicate pairs")
-
+
Find duplicate YAML configuration files:
-
+
>>> duplicates = find_duplicate_files(
... "configs/qml_gridsearch/",
... file_pattern='.yaml',
@@ -77,17 +77,17 @@ def find_duplicate_files(
... )
>>> for file1, file2 in duplicates:
... print(f"Duplicate: {file1} == {file2}")
-
+
Case-insensitive comparison:
-
+
>>> duplicates = find_duplicate_files(
... "data/",
... file_pattern='.txt',
... case_sensitive=False
... )
-
+
Integration with QProfiler workflow:
-
+
>>> # Check for duplicate configs before batch processing
>>> config_dir = "configs/experiments/"
>>> duplicates = find_duplicate_files(config_dir, file_pattern='.yaml')
@@ -97,7 +97,7 @@ def find_duplicate_files(
... for f1, f2 in duplicates:
... print(f" {os.path.basename(f1)} == {os.path.basename(f2)}")
... # Optionally remove duplicates or warn user
-
+
Notes
-----
- Files are compared line by line after sorting (order-independent)
@@ -105,7 +105,7 @@ def find_duplicate_files(
- Large files may consume significant memory during comparison
- Symbolic links are followed and treated as regular files
- Hidden files (starting with '.') are included in comparison
-
+
See Also
--------
find_string_in_files : Search for specific strings across multiple files
@@ -114,10 +114,10 @@ def find_duplicate_files(
# Validate input directory
if not os.path.exists(directory):
raise FileNotFoundError(f"Directory not found: {directory}")
-
+
if not os.path.isdir(directory):
raise NotADirectoryError(f"Path is not a directory: {directory}")
-
+
# Collect files to compare
files = []
for entry in os.scandir(directory):
@@ -125,53 +125,55 @@ def find_duplicate_files(
# Apply file pattern filter if specified
if file_pattern is None or entry.name.endswith(file_pattern):
files.append(entry.path)
-
+
if verbose:
print(f"Comparing {len(files)} files in {directory}")
if file_pattern:
print(f"Filtering by pattern: {file_pattern}")
-
+
# Find duplicates by comparing all pairs
duplicates = []
total_comparisons = len(list(itertools.combinations(files, 2)))
-
+
for idx, (file1, file2) in enumerate(itertools.combinations(files, 2)):
if verbose and idx % 100 == 0:
print(f"Progress: {idx}/{total_comparisons} comparisons")
-
+
try:
# Read and process file contents
- with open(file1, 'r', encoding='utf-8') as f1:
+ with open(file1, "r", encoding="utf-8") as f1:
content1 = f1.readlines()
- with open(file2, 'r', encoding='utf-8') as f2:
+ with open(file2, "r", encoding="utf-8") as f2:
content2 = f2.readlines()
-
+
# Filter empty lines if requested
if ignore_empty_lines:
content1 = [line for line in content1 if line.strip()]
content2 = [line for line in content2 if line.strip()]
-
+
# Apply case sensitivity
if not case_sensitive:
content1 = [line.lower() for line in content1]
content2 = [line.lower() for line in content2]
-
+
# Sort for order-independent comparison
content1_sorted = sorted(content1)
content2_sorted = sorted(content2)
-
+
# Compare contents
if content1_sorted == content2_sorted:
duplicates.append((file1, file2))
if verbose:
- print(f" Duplicate found: {os.path.basename(file1)} == {os.path.basename(file2)}")
-
+ print(
+ f" Duplicate found: {os.path.basename(file1)} == {os.path.basename(file2)}"
+ )
+
except (UnicodeDecodeError, PermissionError) as e:
if verbose:
print(f" Warning: Could not read {file1} or {file2}: {e}")
continue
-
+
if verbose:
print(f"\nFound {len(duplicates)} duplicate file pairs")
-
- return duplicates
\ No newline at end of file
+
+ return duplicates
diff --git a/qbiocode/utils/find_string.py b/qbiocode/utils/find_string.py
index 2ef4ad3..08568fb 100644
--- a/qbiocode/utils/find_string.py
+++ b/qbiocode/utils/find_string.py
@@ -7,7 +7,7 @@
"""
import os
-from typing import List, Dict, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
def find_string_in_files(
@@ -16,16 +16,16 @@ def find_string_in_files(
file_pattern: Optional[str] = None,
case_sensitive: bool = True,
return_lines: bool = False,
- verbose: bool = True
+ verbose: bool = True,
) -> Dict[str, List[Tuple[int, str]]]:
"""
Search for a specific string in all files within a directory.
-
+
Scans files in the specified directory and identifies which files contain
the search string. Optionally returns the matching lines with line numbers.
Useful for auditing configurations, finding specific parameters, or
validating settings across multiple files.
-
+
Parameters
----------
directory : str
@@ -41,33 +41,33 @@ def find_string_in_files(
If True, return matching lines with line numbers. Default is False.
verbose : bool, optional
If True, print progress and results. Default is True.
-
+
Returns
-------
Dict[str, List[Tuple[int, str]]]
Dictionary mapping file paths to list of (line_number, line_content) tuples
for files containing the search string. If return_lines is False, the list
contains empty tuples.
-
+
Raises
------
FileNotFoundError
If the specified directory does not exist.
NotADirectoryError
If the specified path is not a directory.
-
+
Examples
--------
Basic search for a string:
-
+
>>> results = find_string_in_files(
... 'configs/',
... 'embeddings: none'
... )
>>> print(f"Found in {len(results)} files")
-
+
Search with line numbers returned:
-
+
>>> results = find_string_in_files(
... 'configs/qml_gridsearch/',
... 'n_qubits: 4',
@@ -78,18 +78,18 @@ def find_string_in_files(
... print(f"{filepath}:")
... for line_num, line_content in matches:
... print(f" Line {line_num}: {line_content.strip()}")
-
+
Case-insensitive search:
-
+
>>> results = find_string_in_files(
... 'logs/',
... 'error',
... file_pattern='.log',
... case_sensitive=False
... )
-
+
Integration with QProfiler workflow:
-
+
>>> # Find all configs using a specific embedding
>>> config_dir = "configs/experiments/"
>>> results = find_string_in_files(
@@ -103,14 +103,14 @@ def find_string_in_files(
... print(f"Found {len(results)} configs using PCA embedding")
... for config_file in results.keys():
... print(f" - {os.path.basename(config_file)}")
-
+
Notes
-----
- Only text files are supported; binary files will be skipped
- Large files may consume significant memory if return_lines=True
- Symbolic links are followed and treated as regular files
- Hidden files (starting with '.') are included in search
-
+
See Also
--------
find_duplicate_files : Find files with identical content
@@ -119,44 +119,44 @@ def find_string_in_files(
# Validate input directory
if not os.path.exists(directory):
raise FileNotFoundError(f"Directory not found: {directory}")
-
+
if not os.path.isdir(directory):
raise NotADirectoryError(f"Path is not a directory: {directory}")
-
+
# Prepare search string for case-insensitive search
search_str = search_string if case_sensitive else search_string.lower()
-
+
# Results dictionary
results = {}
total_files = 0
files_with_match = 0
-
+
# Scan directory
for entry in os.scandir(directory):
if entry.is_file():
# Apply file pattern filter if specified
if file_pattern is not None and not entry.name.endswith(file_pattern):
continue
-
+
total_files += 1
-
+
try:
- with open(entry.path, 'r', encoding='utf-8') as f:
+ with open(entry.path, "r", encoding="utf-8") as f:
matches = []
for line_num, line in enumerate(f, start=1):
# Apply case sensitivity
line_to_search = line if case_sensitive else line.lower()
-
+
if search_str in line_to_search:
if return_lines:
matches.append((line_num, line))
else:
- matches.append((0, '')) # Placeholder
-
+ matches.append((0, "")) # Placeholder
+
if matches:
results[entry.path] = matches
files_with_match += 1
-
+
if verbose:
if return_lines:
print(f"\n{entry.path} contains '{search_string}':")
@@ -164,12 +164,12 @@ def find_string_in_files(
print(f" Line {line_num}: {line_content.rstrip()}")
else:
print(f"{entry.path} contains '{search_string}'")
-
+
except (UnicodeDecodeError, PermissionError) as e:
if verbose:
print(f"Warning: Could not read {entry.path}: {e}")
continue
-
+
# Print summary
if verbose:
print(f"\n{'='*60}")
@@ -179,5 +179,5 @@ def find_string_in_files(
if file_pattern:
print(f" File pattern filter: {file_pattern}")
print(f"{'='*60}")
-
+
return results
diff --git a/qbiocode/utils/generate_qml_configs.py b/qbiocode/utils/generate_qml_configs.py
index 203d253..b901ce6 100644
--- a/qbiocode/utils/generate_qml_configs.py
+++ b/qbiocode/utils/generate_qml_configs.py
@@ -5,13 +5,14 @@
for systematic hyperparameter tuning of quantum machine learning models.
"""
+import itertools
import os
import re
-import itertools
-from typing import List, Dict, Any, Optional, Tuple
-import yaml
-import pandas as pd
+from typing import Any, Dict, List, Optional, Tuple, cast
+
import numpy as np
+import pandas as pd
+import yaml
def generate_qml_experiment_configs(
@@ -30,15 +31,15 @@ def generate_qml_experiment_configs(
embeddings: Optional[List[str]] = None,
data_sample_fraction: float = 1.0,
used_files_path: Optional[str] = None,
- random_seed: Optional[int] = None
+ random_seed: Optional[int] = None,
) -> Tuple[int, str]:
"""
Generate YAML configuration files for quantum ML hyperparameter grid search.
-
+
This function creates multiple configuration files by combining different
hyperparameter values for quantum machine learning models (QNN, VQC, QSVC).
Each configuration file can be used with QProfiler to run systematic experiments.
-
+
Parameters
----------
template_config_path : str
@@ -73,16 +74,16 @@ def generate_qml_experiment_configs(
Path to CSV file tracking previously used data files.
random_seed : int, optional
Random seed for reproducible file sampling.
-
+
Returns
-------
Tuple[int, str]
Number of configuration files generated and path to used files CSV.
-
+
Examples
--------
>>> from qbiocode.utils import generate_qml_experiment_configs
- >>>
+ >>>
>>> # Generate configs for quantum model grid search
>>> num_configs, used_files = generate_qml_experiment_configs(
... template_config_path='configs/config.yaml',
@@ -94,7 +95,7 @@ def generate_qml_experiment_configs(
... data_sample_fraction=0.1 # Use 10% of files for testing
... )
>>> print(f"Generated {num_configs} configuration files")
-
+
Notes
-----
- Quantum models (QNN, VQC, QSVC) don't support automated grid search
@@ -104,24 +105,24 @@ def generate_qml_experiment_configs(
* QSVC uses only 'amp' ansatz and 'COBYLA' optimizer
* QNN/VQC don't use the C parameter
- Embedding is set to 'none' when n_components >= original feature count
-
+
See Also
--------
qbiocode.apps.qprofiler : Main profiling application
"""
# Set default hyperparameter values
if qmethods is None:
- qmethods = ['qnn', 'vqc', 'qsvc']
+ qmethods = ["qnn", "vqc", "qsvc"]
if reps is None:
reps = [1, 2]
if optimizers is None:
- optimizers = ['COBYLA', 'SPSA']
+ optimizers = ["COBYLA", "SPSA"]
if entanglements is None:
- entanglements = ['linear', 'full']
+ entanglements = ["linear", "full"]
if feature_maps is None:
- feature_maps = ['Z', 'ZZ']
+ feature_maps = ["Z", "ZZ"]
if ansatz_types is None:
- ansatz_types = ['amp', 'esu2']
+ ansatz_types = ["amp", "esu2"]
if n_components is None:
n_components = [5, 10]
if Cs is None:
@@ -129,132 +130,153 @@ def generate_qml_experiment_configs(
if max_iters is None:
max_iters = [100, 500]
if embeddings is None:
- embeddings = ['none', 'pca', 'lle', 'isomap', 'spectral', 'umap', 'nmf']
-
+ embeddings = ["none", "pca", "lle", "isomap", "spectral", "umap", "nmf"]
+
# Set random seed if provided
if random_seed is not None:
np.random.seed(random_seed)
-
+
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
-
+
# Set up used files tracking
if used_files_path is None:
- used_files_path = os.path.join(output_dir, 'used_data_files.csv')
-
+ used_files_path = os.path.join(output_dir, "used_data_files.csv")
+
# Generate all hyperparameter combinations
- param_grid = [qmethods, reps, optimizers, entanglements, feature_maps,
- ansatz_types, n_components, Cs, max_iters, embeddings]
-
+ param_grid = [
+ qmethods,
+ reps,
+ optimizers,
+ entanglements,
+ feature_maps,
+ ansatz_types,
+ n_components,
+ Cs,
+ max_iters,
+ embeddings,
+ ]
+
param_combinations = pd.DataFrame(
- list(itertools.product(*param_grid)),
- columns=['method', 'reps', 'local_optimizer', 'entanglement',
- 'feature_map', 'ansatz_type', 'n_components', 'C',
- 'max_iter', 'embedding']
+ list(itertools.product(*(cast(List[Any], values) for values in param_grid))),
+ columns=[
+ "method",
+ "reps",
+ "local_optimizer",
+ "entanglement",
+ "feature_map",
+ "ansatz_type",
+ "n_components",
+ "C",
+ "max_iter",
+ "embedding",
+ ],
)
-
+
# Apply model-specific constraints
- param_combinations.loc[param_combinations['method'].isin(['qnn', 'vqc']), 'C'] = 1
- param_combinations.loc[param_combinations['method'].isin(['qsvc']), 'ansatz_type'] = 'amp'
- param_combinations.loc[param_combinations['method'].isin(['qsvc']), 'max_iter'] = 100
- param_combinations.loc[param_combinations['method'].isin(['qsvc']), 'local_optimizer'] = 'COBYLA'
-
+ param_combinations.loc[param_combinations["method"].isin(["qnn", "vqc"]), "C"] = 1
+ param_combinations.loc[param_combinations["method"].isin(["qsvc"]), "ansatz_type"] = "amp"
+ param_combinations.loc[param_combinations["method"].isin(["qsvc"]), "max_iter"] = 100
+ param_combinations.loc[param_combinations["method"].isin(["qsvc"]), "local_optimizer"] = (
+ "COBYLA"
+ )
+
# Remove duplicates and apply filtering rules
param_combinations = param_combinations.drop_duplicates()
param_combinations = param_combinations[
- ~((param_combinations['n_components'] >= 10) & (param_combinations['max_iter'] < 500))
+ ~((param_combinations["n_components"] >= 10) & (param_combinations["max_iter"] < 500))
]
param_combinations = param_combinations[
- ~((param_combinations['reps'] > 1) & (param_combinations['n_components'] <= 10))
+ ~((param_combinations["reps"] > 1) & (param_combinations["n_components"] <= 10))
]
-
+
# Load template configuration
- with open(template_config_path, 'r') as f:
+ with open(template_config_path, "r") as f:
cfg_template = yaml.safe_load(f)
-
+
# Load or initialize used files list
if os.path.exists(used_files_path):
used_files = pd.read_csv(used_files_path).iloc[:, 0].tolist()
else:
used_files = []
-
+
# Generate configuration files
config_idx = 1
-
+
for data_dir in data_dirs:
# Get all CSV files in directory
- csv_files = [f for f in os.listdir(data_dir) if f.endswith('.csv')]
+ csv_files = [f for f in os.listdir(data_dir) if f.endswith(".csv")]
csv_files.sort()
-
+
# Remove previously used files
csv_files = list(set(csv_files) - set(used_files))
-
+
# Sample files if requested
if data_sample_fraction < 1.0:
n_files = max(1, int(len(csv_files) * data_sample_fraction))
csv_files = list(np.random.choice(csv_files, n_files, replace=False))
-
+
# Update used files list
used_files.extend(csv_files)
-
+
# Filter parameter combinations based on data type
param_subset = param_combinations.copy()
- if ('moons' in data_dir) or ('circles' in data_dir):
- param_subset = param_subset[param_subset['embedding'] == 'none']
+ if ("moons" in data_dir) or ("circles" in data_dir):
+ param_subset = param_subset[param_subset["embedding"] == "none"]
else:
- param_subset = param_subset[param_subset['embedding'] != 'none']
-
+ param_subset = param_subset[param_subset["embedding"] != "none"]
+
# Generate config for each combination and file
for _, params in param_subset.iterrows():
for csv_file in csv_files:
- config_path = os.path.join(output_dir, f'exp_{config_idx}.yaml')
+ config_path = os.path.join(output_dir, f"exp_{config_idx}.yaml")
key = f"{params['method']}_{csv_file.replace('.csv', '')}"
-
+
# Create config from template
config = cfg_template.copy()
- config['yaml'] = config_path
- config['model'] = [params['method']]
- config['file_dataset'] = csv_file
- config['folder_path'] = data_dir.replace('data/', '')
- config['hydra'] = config.get('hydra', {})
- config['hydra']['run'] = config['hydra'].get('run', {})
- config['hydra']['run']['dir'] = os.path.join('results', f'qmlgridsearch_{key}')
-
+ config["yaml"] = config_path
+ config["model"] = [params["method"]]
+ config["file_dataset"] = csv_file
+ config["folder_path"] = data_dir.replace("data/", "")
+ config["hydra"] = config.get("hydra", {})
+ config["hydra"]["run"] = config["hydra"].get("run", {})
+ config["hydra"]["run"]["dir"] = os.path.join("results", f"qmlgridsearch_{key}")
+
# Check if embedding should be 'none' based on feature count
df = pd.read_csv(os.path.join(data_dir, csv_file))
orig_features = df.shape[1] - 1 # Subtract label column
-
- if params['n_components'] >= orig_features:
- config['embeddings'] = ['none']
+
+ if params["n_components"] >= orig_features:
+ config["embeddings"] = ["none"]
else:
- config['embeddings'] = [params['embedding']]
-
- config['n_components'] = params['n_components']
-
+ config["embeddings"] = [params["embedding"]]
+
+ config["n_components"] = params["n_components"]
+
# Set method-specific parameters
method_args_key = f"{params['method']}_args"
if method_args_key not in config:
config[method_args_key] = {}
-
- config[method_args_key]['reps'] = int(params['reps'])
- config[method_args_key]['entanglement'] = params['entanglement']
- config[method_args_key]['encoding'] = params['feature_map']
-
- if params['method'] != 'qsvc':
- config[method_args_key]['ansatz_type'] = params['ansatz_type']
- config[method_args_key]['maxiter'] = int(params['max_iter'])
+
+ config[method_args_key]["reps"] = int(params["reps"])
+ config[method_args_key]["entanglement"] = params["entanglement"]
+ config[method_args_key]["encoding"] = params["feature_map"]
+
+ if params["method"] != "qsvc":
+ config[method_args_key]["ansatz_type"] = params["ansatz_type"]
+ config[method_args_key]["maxiter"] = int(params["max_iter"])
else:
- config[method_args_key]['C'] = float(params['C'])
- config[method_args_key]['local_optimizer'] = params['local_optimizer']
-
+ config[method_args_key]["C"] = float(params["C"])
+ config[method_args_key]["local_optimizer"] = params["local_optimizer"]
+
# Write configuration file
- with open(config_path, 'w') as f:
+ with open(config_path, "w") as f:
yaml.dump(config, f, default_flow_style=False)
-
+
config_idx += 1
-
+
# Save used files list
- pd.Series(used_files).to_csv(used_files_path, index=False, header=['filename'])
-
+ pd.Series(used_files).to_csv(used_files_path, index=False, header=["filename"])
+
num_configs = config_idx - 1
return num_configs, used_files_path
diff --git a/qbiocode/utils/helper_fn.py b/qbiocode/utils/helper_fn.py
index 8d25dc7..7eeb932 100644
--- a/qbiocode/utils/helper_fn.py
+++ b/qbiocode/utils/helper_fn.py
@@ -11,47 +11,46 @@
import time
from typing import Literal
-# ====== Scikit-learn imports ======
+from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
-from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
+# ====== Scikit-learn imports ======
-def scaler_fn(X, scaling: Literal['None', 'StandardScaler', 'MinMaxScaler'] = "None"):
+def scaler_fn(X, scaling: Literal["None", "StandardScaler", "MinMaxScaler"] = "None"):
"""
Apply scaling transformation to input data.
-
+
Scales the input data using one of three methods: no scaling, standard scaling
(z-score normalization), or min-max scaling to [0, 1] range.
-
+
Parameters
----------
X : array-like of shape (n_samples, n_features)
Input data to be scaled.
scaling : {'None', 'StandardScaler', 'MinMaxScaler'}, default='None'
Scaling method to apply:
-
+
- 'None': No scaling, returns original data
- 'StandardScaler': Standardize features by removing mean and scaling to unit variance
- 'MinMaxScaler': Scale features to [0, 1] range
-
+
Returns
-------
X_scaled : array-like of shape (n_samples, n_features)
Scaled data. If scaling='None', returns original data unchanged.
-
+
Notes
-----
StandardScaler transforms data to have mean=0 and variance=1:
-
+
.. math::
z = \\frac{x - \\mu}{\\sigma}
-
+
MinMaxScaler transforms data to [0, 1] range:
-
+
.. math::
x_{scaled} = \\frac{x - x_{min}}{x_{max} - x_{min}}
-
+
Examples
--------
>>> import numpy as np
@@ -59,16 +58,16 @@ def scaler_fn(X, scaling: Literal['None', 'StandardScaler', 'MinMaxScaler'] = "N
>>> X = np.array([[1, 2], [3, 4], [5, 6]])
>>> X_scaled = scaler_fn(X, scaling='StandardScaler')
>>> X_minmax = scaler_fn(X, scaling='MinMaxScaler')
-
+
See Also
--------
sklearn.preprocessing.StandardScaler : Standardize features
sklearn.preprocessing.MinMaxScaler : Scale features to a range
"""
- if scaling == 'MinMaxScaler':
+ if scaling == "MinMaxScaler":
scaler = MinMaxScaler()
return scaler.fit_transform(X)
- elif scaling == 'StandardScaler':
+ elif scaling == "StandardScaler":
scaler = StandardScaler()
return scaler.fit_transform(X)
else: # scaling == 'None'
@@ -78,15 +77,15 @@ def scaler_fn(X, scaling: Literal['None', 'StandardScaler', 'MinMaxScaler'] = "N
def feature_encoding(
feature1,
sparse_output=False,
- feature_encoding: Literal['None', 'OneHotEncoder', 'OrdinalEncoder'] = "None"
+ feature_encoding: Literal["None", "OneHotEncoder", "OrdinalEncoder"] = "None",
):
"""
Encode categorical features using various encoding strategies.
-
+
Transforms categorical features into numerical representations suitable for
machine learning algorithms. Supports one-hot encoding, ordinal encoding,
or no encoding.
-
+
Parameters
----------
feature1 : array-like of shape (n_samples,)
@@ -96,29 +95,29 @@ def feature_encoding(
If False, returns a dense array. Ignored for other encoding methods.
feature_encoding : {'None', 'OneHotEncoder', 'OrdinalEncoder'}, default='None'
Encoding method to apply:
-
+
- 'None': No encoding, returns original feature
- 'OneHotEncoder': Create binary columns for each category
- 'OrdinalEncoder': Map categories to integer values
-
+
Returns
-------
feature1_encoded : array-like
Encoded feature. Shape depends on encoding method:
-
+
- 'None': shape (n_samples, 1)
- 'OrdinalEncoder': shape (n_samples, 1)
- 'OneHotEncoder': shape (n_samples, n_categories)
-
+
Notes
-----
One-hot encoding creates a binary column for each unique category, useful
when categories have no ordinal relationship. Ordinal encoding assigns
integer values, suitable when categories have a natural order.
-
+
The function automatically reshapes the input to (-1, 1) format required
by scikit-learn encoders.
-
+
Examples
--------
>>> import numpy as np
@@ -128,16 +127,16 @@ def feature_encoding(
>>> encoded_onehot = feature_encoding(categories, feature_encoding='OneHotEncoder')
>>> # Ordinal encoding
>>> encoded_ordinal = feature_encoding(categories, feature_encoding='OrdinalEncoder')
-
+
See Also
--------
sklearn.preprocessing.OneHotEncoder : Encode categorical features as one-hot
sklearn.preprocessing.OrdinalEncoder : Encode categorical features as integers
"""
- if feature_encoding == 'OrdinalEncoder':
+ if feature_encoding == "OrdinalEncoder":
encoder = OrdinalEncoder()
return encoder.fit_transform(feature1.reshape(-1, 1))
- elif feature_encoding == 'OneHotEncoder':
+ elif feature_encoding == "OneHotEncoder":
encoder = OneHotEncoder(sparse_output=sparse_output)
return encoder.fit_transform(feature1.reshape(-1, 1))
else: # feature_encoding == 'None'
@@ -147,11 +146,11 @@ def feature_encoding(
def print_results(model, accuracy, f1, compile_time, params):
"""
Print formatted machine learning model evaluation results.
-
+
Displays model performance metrics and parameters in a consistent,
readable format. Useful for comparing multiple models during
experimentation and benchmarking.
-
+
Parameters
----------
model : str
@@ -164,17 +163,17 @@ def print_results(model, accuracy, f1, compile_time, params):
Time taken to train/compile the model, in seconds.
params : dict
Dictionary of model hyperparameters and configuration settings.
-
+
Returns
-------
None
Prints results to stdout.
-
+
Notes
-----
The function formats floating-point numbers to 4 decimal places for
consistency. All metrics are printed with descriptive labels.
-
+
Examples
--------
>>> from qbiocode.utils import print_results
@@ -184,7 +183,7 @@ def print_results(model, accuracy, f1, compile_time, params):
RandomForest Model F1 score: 0.9156
Time taken for RandomForest Model (secs): 2.3450
RandomForest Model Params: {'n_estimators': 100, 'max_depth': 10}
-
+
See Also
--------
sklearn.metrics.accuracy_score : Compute accuracy
@@ -195,4 +194,5 @@ def print_results(model, accuracy, f1, compile_time, params):
print(f"Time taken for {model} Model (secs): {compile_time:.4f}")
print(f"{model} Model Params: ", params)
+
# Made with Bob
diff --git a/qbiocode/utils/ibm_account.py b/qbiocode/utils/ibm_account.py
index a1bd109..9464c79 100644
--- a/qbiocode/utils/ibm_account.py
+++ b/qbiocode/utils/ibm_account.py
@@ -1,8 +1,11 @@
# This will be a simple function to extract information from a user's qiskit-json file
-import json, os
+import json
+import os
+
from qiskit_ibm_runtime import QiskitRuntimeService
+
def get_creds(args):
"""This function determines the user's IBM Quantum channel, instance, and token, using values provided
within the config.yaml file or as defined within the user's qiskit configuration from provided qiskit_json_path
@@ -19,34 +22,42 @@ def get_creds(args):
Returns:
rval (dict): A dictionary containing the IBM Quantum credentials, including 'channel', 'instance', 'token', and 'url'.
"""
- cred_source_dict = {'channel':'ibm_channel', 'instance':'ibm_instance', 'token':'ibm_token', 'url':'ibm_url'}
+ cred_source_dict = {
+ "channel": "ibm_channel",
+ "instance": "ibm_instance",
+ "token": "ibm_token",
+ "url": "ibm_url",
+ }
rval = {}
for ibm_name, yaml_name in cred_source_dict.items():
value = args.get(yaml_name, None)
if value:
rval[ibm_name] = value
- qiskit_json_path = args.get('qiskit_json_path', None)
+ qiskit_json_path = args.get("qiskit_json_path", None)
if qiskit_json_path:
qiskit_json_path = os.path.expanduser(qiskit_json_path)
if os.path.exists(qiskit_json_path):
# load the qiskit json file
- with open(qiskit_json_path, 'r') as jfile:
+ with open(qiskit_json_path, "r") as jfile:
creds = json.load(jfile)
# Access keys and values
- # The items we want are actually in a nested dictionary, so we have to loop through the outer dictionary first, then the
- # nested one. This nested dictionary (outer_value) is actually the value for the key in the parent dictionary.
+ # The items we want are actually in a nested dictionary, so we have to loop through the outer dictionary first, then the
+ # nested one. This nested dictionary (outer_value) is actually the value for the key in the parent dictionary.
for outer_key, outer_value in creds.items():
- if 'name' in rval.keys() and outer_key == rval['name']:
+ if "name" in rval.keys() and outer_key == rval["name"]:
for ibm_name in cred_source_dict.keys():
if ibm_name not in rval:
value = outer_value.get(ibm_name, None)
if value:
rval[ibm_name] = value
else:
- print('IBM credentials not found! Please verify that the path to your qiskit-ibm.json file is correct.')
+ print(
+ "IBM credentials not found! Please verify that the path to your qiskit-ibm.json file is correct."
+ )
return rval
+
def instantiate_runtime_service(args):
"""This function provides a quick way to instantiate QiskitRuntimeService in one place. A basic call to this function can then be done in anywhere else.
It uses the get_creds function to retrieve the necessary credentials from the qiskit-ibm.json file, with the file path specified in the config.yaml file.
@@ -55,7 +66,7 @@ def instantiate_runtime_service(args):
Args:
args (dict): This passes the arguments from the config.yaml file. In this particular case, it is importing the path to the qiskit-ibm.json file (qiskit_json_path) and the credentials
defined in this json file (ibm_channel, ibm_instance, ibm_token, ibm_url).
-
+
Returns:
QiskitRuntimeService: An instance of the QiskitRuntimeService class, initialized with the credentials from the qiskit-ibm.json file or the provided arguments.
"""
diff --git a/qbiocode/utils/qc_winner_finder.py b/qbiocode/utils/qc_winner_finder.py
index 12c15df..ae3d040 100644
--- a/qbiocode/utils/qc_winner_finder.py
+++ b/qbiocode/utils/qc_winner_finder.py
@@ -1,94 +1,114 @@
## function to find datasets where QML methods did better than classical
-import pandas as pd
-import numpy as np
+import os
+
import matplotlib.pyplot as plt
+import numpy as np
import pandas as pd
-import os
+
def qml_winner(results_df, rawevals_df, output_dir, tag):
"""This function finds data sets where QML was beneficial (higher F1 scores than CML) and create new .csv files
- with the relevant evaluation and performance for these specific datasets, for further analysis.
+ with the relevant evaluation and performance for these specific datasets, for further analysis.
It also computes the best results per method across all splits and the best results per dataset.
It returns two DataFrames: one with the datasets where QML methods outperformed CML methods, and another with the
evaluation scores for the best QML method for each of these datasets.
It also saves these DataFrames as .csv files in the specified output directory.
-
+
Args:
results_df (pandas.DataFrame): Dataset in pandas corresponding to 'ModelResults.csv'
rawevals_df (pandas.DataFrame): Dataset in pandas corresponding to 'RawDataEvaluation.csv'
- Returns:
+ Returns:
qml_winners (pandas.DataFrame): contais the input datasets for which at least one QML method
performed better than CML. DataFrame contains the scores of all
- the methods.
- winner_eval_score (pandas.DataFrame): contains the input datasets, their evaluation, and scores for the
+ the methods.
+ winner_eval_score (pandas.DataFrame): contains the input datasets, their evaluation, and scores for the
specific qml method that yielded the best score.
"""
-
+
# pass in the ML results
df = results_df.copy()
# pull in the raw evaluations
rawevals = rawevals_df.copy()
- #first, compute mean across all splits
- if 'Model_Parameters' in df.columns:
- df_across_split= df.groupby(['Dataset', 'embeddings', 'model', 'Model_Parameters'])['f1_score'].mean().reset_index()
- else:
+ # first, compute mean across all splits
+ if "Model_Parameters" in df.columns:
+ df_across_split = (
+ df.groupby(["Dataset", "embeddings", "model", "Model_Parameters"])["f1_score"]
+ .mean()
+ .reset_index()
+ )
+ else:
# if 'Model_Parameters' is not present, this means you ran a grid search and this column will be named 'BestParams_GridSearch' instead
- df_across_split= df.groupby(['Dataset', 'embeddings', 'model', 'BestParams_GridSearch'])['f1_score'].mean().reset_index()
- #now, extract the best results per method across embedding and iteration
- df_best = df_across_split.groupby(['Dataset', 'model'])['f1_score'].max().reset_index()
- #df_best = df_across_split.groupby(['Dataset', 'model', 'Model_Parameters'])['f1_score'].max().reset_index()
- df_best.to_csv(( os.path.join( output_dir, tag +'_best_across_split.csv')), index=False)
+ df_across_split = (
+ df.groupby(["Dataset", "embeddings", "model", "BestParams_GridSearch"])["f1_score"]
+ .mean()
+ .reset_index()
+ )
+ # now, extract the best results per method across embedding and iteration
+ df_best = df_across_split.groupby(["Dataset", "model"])["f1_score"].max().reset_index()
+ # df_best = df_across_split.groupby(['Dataset', 'model', 'Model_Parameters'])['f1_score'].max().reset_index()
+ df_best.to_csv((os.path.join(output_dir, tag + "_best_across_split.csv")), index=False)
# get summary accross all datasets
- df_best_model_mean = df_best.groupby('model')['f1_score'].mean()
- df_best_model_median = df_best.groupby('model')['f1_score'].median()
- df_best_model_max = df_best.groupby('model')['f1_score'].max()
- df_best_model_std = df_best.groupby('model')['f1_score'].std()
- df_best_permodel_summary = pd.concat([df_best_model_mean, df_best_model_median, df_best_model_max, df_best_model_std], axis=1)
- df_best_permodel_summary.columns = ['Mean_F1_Score', 'Median_F1_Score', 'Max_F1_Score', 'StandardDev_F1_Score']
- df_best_permodel_summary.to_csv(( os.path.join( output_dir, tag +'_best_permodel_summary.csv')))
+ df_best_model_mean = df_best.groupby("model")["f1_score"].mean()
+ df_best_model_median = df_best.groupby("model")["f1_score"].median()
+ df_best_model_max = df_best.groupby("model")["f1_score"].max()
+ df_best_model_std = df_best.groupby("model")["f1_score"].std()
+ df_best_permodel_summary = pd.concat(
+ [df_best_model_mean, df_best_model_median, df_best_model_max, df_best_model_std], axis=1
+ )
+ df_best_permodel_summary.columns = [
+ "Mean_F1_Score",
+ "Median_F1_Score",
+ "Max_F1_Score",
+ "StandardDev_F1_Score",
+ ]
+ df_best_permodel_summary.to_csv((os.path.join(output_dir, tag + "_best_permodel_summary.csv")))
# print(df_best_permodel_summary)
-
+
# extract the best results per dataset
- best_per_dataset = df_best.loc[df_best.groupby('Dataset')['f1_score'].idxmax()]
+ best_per_dataset = df_best.loc[df_best.groupby("Dataset")["f1_score"].idxmax()]
# best_per_dataset = df_across_split.loc[df_across_split.groupby('Dataset')['f1_score'].idxmax()]
# create list of qml methods
- qml_list = ['QSVC', 'QNN', 'VQC', 'PQK']
+ qml_list = ["QSVC", "QNN", "VQC", "PQK"]
# qml_winner = df_best[df_best['Dataset'].isin(best_per_dataset[best_per_dataset['model'].isin(qml_list)]['Dataset'])]
- qml_winner = df_across_split[df_across_split['Dataset'].isin(best_per_dataset[best_per_dataset['model'].isin(qml_list)]['Dataset'])]
+ qml_winner = df_across_split[
+ df_across_split["Dataset"].isin(
+ best_per_dataset[best_per_dataset["model"].isin(qml_list)]["Dataset"]
+ )
+ ]
if not qml_winner.empty:
- bestmethod = qml_winner.groupby('Dataset')['f1_score'].idxmax()
+ bestmethod = qml_winner.groupby("Dataset")["f1_score"].idxmax()
qc_method_and_score = qml_winner.loc[bestmethod]
- qml_winner.to_csv(( os.path.join( output_dir, tag +'_qml_winners.csv')), index=False)
- dataset = list(qml_winner['Dataset'].unique())
-
+ qml_winner.to_csv((os.path.join(output_dir, tag + "_qml_winners.csv")), index=False)
+ dataset = list(qml_winner["Dataset"].unique())
+
#######
# now let's find the raw data evaluations for the qml winner data sets
# this wil produce another csv file that contains scores, evaluation, and qml method
# for these "qml winners".
winner_evals = []
for file in dataset:
- eval = rawevals.loc[rawevals['Dataset'] == file]
+ eval = rawevals.loc[rawevals["Dataset"] == file]
# print(eval)
winner_evals.append(eval)
winner_evals_df = pd.concat(winner_evals)
- winner_evals_df.to_csv(( os.path.join( output_dir, tag +'_winner_evals.csv')), index=False)
+ winner_evals_df.to_csv((os.path.join(output_dir, tag + "_winner_evals.csv")), index=False)
winner_scores_df = qc_method_and_score.iloc[:, -3:]
- winner_scores_df.to_csv(( os.path.join( output_dir, tag +'_winner_score.csv')), index=False)
+ winner_scores_df.to_csv((os.path.join(output_dir, tag + "_winner_score.csv")), index=False)
print(winner_scores_df)
winner_eval_score = pd.concat([winner_evals_df, winner_scores_df], axis=1)
- winner_eval_score.to_csv(( os.path.join( output_dir, tag +'_winner_eval_score.csv')), index=False) # contains dataset, evaluation, qml method, and average f1 score
+ winner_eval_score.to_csv(
+ (os.path.join(output_dir, tag + "_winner_eval_score.csv")), index=False
+ ) # contains dataset, evaluation, qml method, and average f1 score
#######
-
+
# optional print statements
- print('*** The number of qml winners is', len(dataset))
- print('*** The qml winners are:', dataset)
-
+ print("*** The number of qml winners is", len(dataset))
+ print("*** The qml winners are:", dataset)
+
return qml_winner, winner_eval_score, df_best
-
- else:
- print('*** QML methods were outperformed by CML methods in all datasets ***')
-
- return
+ else:
+ print("*** QML methods were outperformed by CML methods in all datasets ***")
+ return
diff --git a/qbiocode/utils/qutils.py b/qbiocode/utils/qutils.py
index cdc333f..b36f760 100644
--- a/qbiocode/utils/qutils.py
+++ b/qbiocode/utils/qutils.py
@@ -6,15 +6,20 @@
import numpy as np
import pandas as pd
from qiskit.circuit.equivalence_library import SessionEquivalenceLibrary as sel
-from qiskit.circuit.library import (EfficientSU2, PauliFeatureMap,
- RealAmplitudes, TwoLocal, XGate, YGate,
- ZFeatureMap, ZZFeatureMap)
-from qiskit.primitives import StatevectorEstimator
-from qiskit.primitives import StatevectorSampler
+from qiskit.circuit.library import (
+ EfficientSU2,
+ PauliFeatureMap,
+ RealAmplitudes,
+ TwoLocal,
+ XGate,
+ YGate,
+ ZFeatureMap,
+ ZZFeatureMap,
+)
+from qiskit.primitives import StatevectorEstimator, StatevectorSampler
from qiskit.quantum_info import SparsePauliOp
from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
-from qiskit_algorithms.optimizers import (COBYLA, L_BFGS_B, NFT, SPSA,
- GradientDescent, spsa)
+from qiskit_algorithms.optimizers import COBYLA, L_BFGS_B, NFT, SPSA, GradientDescent, spsa
from qiskit_ibm_runtime import EstimatorOptions
from qiskit_ibm_runtime import EstimatorV2 as Estimator
from qiskit_ibm_runtime import SamplerOptions
@@ -25,7 +30,7 @@
from qbiocode.utils.ibm_account import instantiate_runtime_service
-def get_backend_session( args: dict, primitive : str, num_qubits : int ):
+def get_backend_session(args: dict, primitive: str, num_qubits: int):
"""
This function to get the backend and session for the specified primitive.
@@ -42,35 +47,33 @@ def get_backend_session( args: dict, primitive : str, num_qubits : int ):
backend = None
session = None
prim = None
-
- if args['backend'] == 'simulator':
- if primitive == 'estimator':
+ if args["backend"] == "simulator":
+
+ if primitive == "estimator":
# Estimator primitive
- prim = StatevectorEstimator(seed=args['seed'])
+ prim = StatevectorEstimator(seed=args["seed"])
else:
- prim = StatevectorSampler(seed = args['seed'], default_shots=args['shots'])
- elif 'ibm' in args['backend']:
+ prim = StatevectorSampler(seed=args["seed"], default_shots=args["shots"])
+ elif "ibm" in args["backend"]:
service = instantiate_runtime_service(args)
- if args['backend'] == 'ibm_least':
- backend = service.least_busy(simulator=False, operational=True, min_num_qubits=num_qubits)
+ if args["backend"] == "ibm_least":
+ backend = service.least_busy(
+ simulator=False, operational=True, min_num_qubits=num_qubits
+ )
else:
- backend = service.backend(name=args['backend'])
+ backend = service.backend(name=args["backend"])
session = Session(backend=backend)
-
- if primitive == 'sampler':
- prim = get_sampler(mode = session,
- shots = args['shots'])
+
+ if primitive == "sampler":
+ prim = get_sampler(mode=session, shots=args["shots"])
else:
- prim = get_estimator(mode = session,
- shots = args['shots'],
- resil_level=args['resil_level'])
+ prim = get_estimator(mode=session, shots=args["shots"], resil_level=args["resil_level"])
return backend, session, prim
-def transpile_circuit( circuit, opt_level, backend, initial_layout, PT = False,
- dd_sequence = 'XpXm'):
+def transpile_circuit(circuit, opt_level, backend, initial_layout, PT=False, dd_sequence="XpXm"):
"""
This function transpiles the given quantum circuit based on the optimization level and backend.
@@ -85,7 +88,7 @@ def transpile_circuit( circuit, opt_level, backend, initial_layout, PT = False,
Returns:
t_qc (QuantumCircuit): The transpiled quantum circuit.
"""
- if str(opt_level) == 'AI':
+ if str(opt_level) == "AI":
pm = TranspilerService(
backend_name=backend,
ai="true",
@@ -100,23 +103,24 @@ def transpile_circuit( circuit, opt_level, backend, initial_layout, PT = False,
)
t_qc = pm.run(circuit)
- return( t_qc)
+ return t_qc
+
+
def get_observable(circuit, backend):
observable = SparsePauliOp.from_list([("Z" * circuit.num_qubits, 1)])
# observable = SparsePauliOp.from_list([("Z" + "I" * (int(circuit.num_qubits) - 1), 0.5)])
- if 'ibm' in backend.name:
- observable = observable.apply_layout(circuit.layout)#, num_qubits=backend.num_qubits)
+ if "ibm" in backend.name:
+ observable = observable.apply_layout(circuit.layout) # , num_qubits=backend.num_qubits)
return observable
def get_sampler(
- mode = None,
- shots = 1024,
- dd = True,
- dd_seq = 'XpXm',
- PT = True,
- ):
-
+ mode=None,
+ shots=1024,
+ dd=True,
+ dd_seq="XpXm",
+ PT=True,
+):
"""
This function creates a Sampler instance with specified options.
@@ -130,7 +134,7 @@ def get_sampler(
Returns:
Sampler: An instance of the Sampler with the specified options.
"""
-
+
sampler_options = SamplerOptions()
## ERROR SUPPRESSION TESTING ###
@@ -138,30 +142,28 @@ def get_sampler(
if dd:
sampler_options.dynamical_decoupling.enable = dd
sampler_options.dynamical_decoupling.sequence_type = dd_seq
- sampler_options.dynamical_decoupling.extra_slack_distribution = 'middle'
- sampler_options.dynamical_decoupling.scheduling_method = 'alap'
+ sampler_options.dynamical_decoupling.extra_slack_distribution = "middle"
+ sampler_options.dynamical_decoupling.scheduling_method = "alap"
if PT:
sampler_options.twirling.enable_gates = True
sampler_options.twirling.enable_measure = False
- sampler_options.twirling.num_randomizations = 'auto'
- sampler_options.twirling.shots_per_randomization = 'auto'
- sampler_options.twirling.strategy = (
- "active-accum" ### TRY VARYING THIS ###
- )
-
+ sampler_options.twirling.num_randomizations = "auto"
+ sampler_options.twirling.shots_per_randomization = "auto"
+ sampler_options.twirling.strategy = "active-accum" ### TRY VARYING THIS ###
sampler = Sampler(mode=mode, options=sampler_options)
-
+
return sampler
+
def get_estimator(
- mode = None,
- shots = 1024,
- resil_level = 2,
- dd = True,
- dd_seq = 'XpXm',
- PT = True,
- ):
+ mode=None,
+ shots=1024,
+ resil_level=2,
+ dd=True,
+ dd_seq="XpXm",
+ PT=True,
+):
"""
This function creates an Estimator instance with specified options.
@@ -175,11 +177,11 @@ def get_estimator(
Returns:
Estimator: An instance of the Estimator with the specified options.
"""
-
+
experimental_opts = {}
# experimental_opts["execution_path"] = "gen3-turbo"
- estimator_options = EstimatorOptions(experimental = experimental_opts)
+ estimator_options = EstimatorOptions(experimental=experimental_opts)
## ERROR SUPPRESSION TESTING ###
estimator_options.default_shots = shots
@@ -187,24 +189,23 @@ def get_estimator(
if dd:
estimator_options.dynamical_decoupling.enable = dd
estimator_options.dynamical_decoupling.sequence_type = dd_seq
- estimator_options.dynamical_decoupling.extra_slack_distribution = 'middle'
- estimator_options.dynamical_decoupling.scheduling_method = 'alap'
+ estimator_options.dynamical_decoupling.extra_slack_distribution = "middle"
+ estimator_options.dynamical_decoupling.scheduling_method = "alap"
if PT:
estimator_options.twirling.enable_gates = True
estimator_options.twirling.enable_measure = False
- estimator_options.twirling.num_randomizations = 'auto'
- estimator_options.twirling.shots_per_randomization = 'auto'
- estimator_options.twirling.strategy = (
- "active-accum" ### TRY VARYING THIS ###
- )
-
+ estimator_options.twirling.num_randomizations = "auto"
+ estimator_options.twirling.shots_per_randomization = "auto"
+ estimator_options.twirling.strategy = "active-accum" ### TRY VARYING THIS ###
+
estimator = Estimator(mode=mode, options=estimator_options)
return estimator
-def get_ansatz( ansatz_type, feat_dimension, reps = 1, entanglement = 'linear'):
+
+def get_ansatz(ansatz_type, feat_dimension, reps=1, entanglement="linear"):
"""
This function returns an ansatz based on the specified type and parameters.
- It supports 'esu2', 'amp', and 'twolocal' ansatz types, constructing it using the specified feature dimension,
+ It supports 'esu2', 'amp', and 'twolocal' ansatz types, constructing it using the specified feature dimension,
number of repetitions, and entanglement type.
Args:
@@ -215,16 +216,16 @@ def get_ansatz( ansatz_type, feat_dimension, reps = 1, entanglement = 'linear'):
Returns:
ansatz: An instance of the specified ansatz type.
"""
- if(ansatz_type=='esu2'):
- ansatz = EfficientSU2(feat_dimension, ['ry', 'rz'], entanglement, reps=reps)
- elif ansatz_type == 'amp':
- ansatz = RealAmplitudes(num_qubits=feat_dimension, reps=reps)
- elif ansatz_type == 'twolocal':
- ansatz = TwoLocal(feat_dimension, ['ry', 'rz'], 'cz', entanglement, reps=reps)
+ if ansatz_type == "esu2":
+ ansatz = EfficientSU2(feat_dimension, ["ry", "rz"], entanglement, reps=reps)
+ elif ansatz_type == "amp":
+ ansatz = RealAmplitudes(num_qubits=feat_dimension, reps=reps)
+ elif ansatz_type == "twolocal":
+ ansatz = TwoLocal(feat_dimension, ["ry", "rz"], "cz", entanglement, reps=reps)
return ansatz
-def get_feature_map( feature_map, feat_dimension, reps = 1, entanglement = 'linear', data_map_func = None ):
+def get_feature_map(feature_map, feat_dimension, reps=1, entanglement="linear", data_map_func=None):
"""
This function returns a feature map based on the specified type and parameters.
It supports 'Z', 'ZZ', and 'P' feature maps, constructing it using the specified feature dimension,
@@ -240,29 +241,35 @@ def get_feature_map( feature_map, feat_dimension, reps = 1, entanglement = 'line
feat_dimension (int): The number of qubits in the feature map.
"""
# Get Feature Map
- if feature_map == 'Z':
- feature_map = ZFeatureMap(feat_dimension,reps=reps, parameter_prefix='a', data_map_func = data_map_func)
- elif feature_map == 'ZZ':
- feature_map = ZZFeatureMap(feature_dimension=feat_dimension,
- reps=reps,
- entanglement=entanglement,
- parameter_prefix='a',
- data_map_func = data_map_func)
- elif feature_map == 'P':
- feature_map = PauliFeatureMap(feature_dimension=feat_dimension,
- reps=reps,
- entanglement=entanglement,
- data_map_func = data_map_func)
+ if feature_map == "Z":
+ feature_map = ZFeatureMap(
+ feat_dimension, reps=reps, parameter_prefix="a", data_map_func=data_map_func
+ )
+ elif feature_map == "ZZ":
+ feature_map = ZZFeatureMap(
+ feature_dimension=feat_dimension,
+ reps=reps,
+ entanglement=entanglement,
+ parameter_prefix="a",
+ data_map_func=data_map_func,
+ )
+ elif feature_map == "P":
+ feature_map = PauliFeatureMap(
+ feature_dimension=feat_dimension,
+ reps=reps,
+ entanglement=entanglement,
+ data_map_func=data_map_func,
+ )
# print("The number of qubits is:", feature_map.num_qubits)
# print("The number of parameters is:", feature_map.num_parameters)
-
- return feature_map, feat_dimension
+ return feature_map, feat_dimension
-def get_optimizer( type = 'COBYLA', max_iter = 100, learning_rate_a = None,
- perturbation_gamma = None, prior_iter = 0 ):
+def get_optimizer(
+ type="COBYLA", max_iter=100, learning_rate_a=None, perturbation_gamma=None, prior_iter=0
+):
"""
This function returns an optimizer based on the specified type and parameters.
It supports 'SPSA', 'COBYLA', 'GradientDescent', and 'L_BFGS_B' optimizer types,
@@ -278,31 +285,33 @@ def get_optimizer( type = 'COBYLA', max_iter = 100, learning_rate_a = None,
Returns:
optimizer: An instance of the specified optimizer type.
"""
- if type == 'SPSA':
+ if type == "SPSA":
if (learning_rate_a != None) & (perturbation_gamma != None):
# set up the power series
def learning_rate():
return spsa.powerseries(learning_rate_a, 0.602, 0)
+
gen = learning_rate()
learning_rates = np.array([next(gen) for _ in range(max_iter + prior_iter)])
- learning_rates = learning_rates[prior_iter:(max_iter + prior_iter)]
+ learning_rates = learning_rates[prior_iter : (max_iter + prior_iter)]
def perturbation():
return spsa.powerseries(0.2, perturbation_gamma)
+
gen = perturbation()
perturbations = np.array([next(gen) for _ in range(max_iter + prior_iter)])
- perturbations = perturbations[prior_iter:(max_iter + prior_iter)]
+ perturbations = perturbations[prior_iter : (max_iter + prior_iter)]
- optimizer=SPSA(maxiter=max_iter,
- learning_rate= learning_rates,
- perturbation= perturbations)
+ optimizer = SPSA(
+ maxiter=max_iter, learning_rate=learning_rates, perturbation=perturbations
+ )
else:
- optimizer=SPSA(maxiter=max_iter)
- elif type == 'COBYLA':
- optimizer=COBYLA(maxiter=max_iter)
- elif type == 'GradientDescent':
- optimizer=GradientDescent(maxiter=max_iter)
- elif type == 'L_BFGS_B':
+ optimizer = SPSA(maxiter=max_iter)
+ elif type == "COBYLA":
+ optimizer = COBYLA(maxiter=max_iter)
+ elif type == "GradientDescent":
+ optimizer = GradientDescent(maxiter=max_iter)
+ elif type == "L_BFGS_B":
-        optimizer == L_BFGS_B(maxiter=max_iter)
+        optimizer = L_BFGS_B(maxiter=max_iter)
-
+
return optimizer
diff --git a/qbiocode/visualization/__init__.py b/qbiocode/visualization/__init__.py
index a833ded..741526f 100644
--- a/qbiocode/visualization/__init__.py
+++ b/qbiocode/visualization/__init__.py
@@ -7,7 +7,7 @@
comparisons between classical and quantum models.
Available Functions
-------------------
+-------------------
- compute_results_correlation: Compute Spearman correlation between metrics
- plot_results_correlation: Create correlation plots and visualizations
@@ -21,6 +21,6 @@
from .visualize_correlation import compute_results_correlation, plot_results_correlation
__all__ = [
- 'compute_results_correlation',
- 'plot_results_correlation',
+ "compute_results_correlation",
+ "plot_results_correlation",
]
diff --git a/qbiocode/visualization/visualize_correlation.py b/qbiocode/visualization/visualize_correlation.py
index cd1bd4e..581865c 100644
--- a/qbiocode/visualization/visualize_correlation.py
+++ b/qbiocode/visualization/visualize_correlation.py
@@ -1,46 +1,45 @@
+import re
+import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
+import numpy as np
import pandas as pd
+import seaborn as sns
from scipy.stats import spearmanr
from sklearn.metrics import r2_score
-import re
-import seaborn as sns
-import matplotlib.pyplot as plt
-import matplotlib.colors as mcolors
-import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Set publication-quality defaults for scientific journals
-plt.rcParams['font.family'] = 'sans-serif'
-plt.rcParams['font.sans-serif'] = ['Arial', 'DejaVu Sans', 'Helvetica', 'Liberation Sans']
-plt.rcParams['font.size'] = 11
-plt.rcParams['axes.labelsize'] = 12
-plt.rcParams['axes.titlesize'] = 13
-plt.rcParams['xtick.labelsize'] = 10
-plt.rcParams['ytick.labelsize'] = 10
-plt.rcParams['legend.fontsize'] = 10
-plt.rcParams['figure.titlesize'] = 13
-plt.rcParams['axes.linewidth'] = 1.2
-plt.rcParams['xtick.major.width'] = 1.2
-plt.rcParams['ytick.major.width'] = 1.2
-plt.rcParams['xtick.minor.width'] = 0.8
-plt.rcParams['ytick.minor.width'] = 0.8
-plt.rcParams['xtick.major.size'] = 5
-plt.rcParams['ytick.major.size'] = 5
-plt.rcParams['xtick.minor.size'] = 3
-plt.rcParams['ytick.minor.size'] = 3
-plt.rcParams['savefig.dpi'] = 600
-plt.rcParams['savefig.bbox'] = 'tight'
-plt.rcParams['savefig.pad_inches'] = 0.05
-plt.rcParams['axes.spines.top'] = False
-plt.rcParams['axes.spines.right'] = False
-plt.rcParams['axes.grid'] = False
-plt.rcParams['grid.alpha'] = 0.3
-plt.rcParams['grid.linestyle'] = '--'
-plt.rcParams['grid.linewidth'] = 0.5
-
-
-def compute_results_correlation( results_df, correlation = 'spearman', thresh = 0.7 ):
-
+plt.rcParams["font.family"] = "sans-serif"
+plt.rcParams["font.sans-serif"] = ["Arial", "DejaVu Sans", "Helvetica", "Liberation Sans"]
+plt.rcParams["font.size"] = 11
+plt.rcParams["axes.labelsize"] = 12
+plt.rcParams["axes.titlesize"] = 13
+plt.rcParams["xtick.labelsize"] = 10
+plt.rcParams["ytick.labelsize"] = 10
+plt.rcParams["legend.fontsize"] = 10
+plt.rcParams["figure.titlesize"] = 13
+plt.rcParams["axes.linewidth"] = 1.2
+plt.rcParams["xtick.major.width"] = 1.2
+plt.rcParams["ytick.major.width"] = 1.2
+plt.rcParams["xtick.minor.width"] = 0.8
+plt.rcParams["ytick.minor.width"] = 0.8
+plt.rcParams["xtick.major.size"] = 5
+plt.rcParams["ytick.major.size"] = 5
+plt.rcParams["xtick.minor.size"] = 3
+plt.rcParams["ytick.minor.size"] = 3
+plt.rcParams["savefig.dpi"] = 600
+plt.rcParams["savefig.bbox"] = "tight"
+plt.rcParams["savefig.pad_inches"] = 0.05
+plt.rcParams["axes.spines.top"] = False
+plt.rcParams["axes.spines.right"] = False
+plt.rcParams["axes.grid"] = False
+plt.rcParams["grid.alpha"] = 0.3
+plt.rcParams["grid.linestyle"] = "--"
+plt.rcParams["grid.linewidth"] = 0.5
+
+
+def compute_results_correlation(results_df, correlation="spearman", thresh=0.7):
"""This function takes in as input a Pandas Dataframe containing the results and data evaluations for
a given dataset. It then produces a spearman correlation between the data evaluation characteristics (features)
and instances where an F1 score was observed above a certain threshold (thresh).
@@ -57,7 +56,7 @@ def compute_results_correlation( results_df, correlation = 'spearman', thresh =
results_df (pd.DataFrame): A DataFrame containing the results and data evaluations.
correlation (str): The type of correlation to compute, default is 'spearman'.
thresh (float): The threshold for F1 score to consider, default is 0.7.
-
+
Returns:
results_df (pd.DataFrame): The input DataFrame with additional columns for datatype and model_embed_datatype.
correlations_df (pd.DataFrame): A DataFrame containing the computed correlations between metrics and features.
@@ -65,37 +64,89 @@ def compute_results_correlation( results_df, correlation = 'spearman', thresh =
"""
- # Refining datasrame
+ # Refining dataframe
- results_df['datatype'] = [ re.sub( '\.csv', '', re.sub( '-.*', '', x ) ) for x in results_df['Dataset'] ]
- results_df[ 'model_embed_datatype'] = [ '_'.join( [str(row.model), str(row.embeddings), str(row.datatype)] ) for idx, row in results_df.iterrows() ]
+ results_df["datatype"] = [
+ re.sub(r"\.csv", "", re.sub(r"-.*", "", x)) for x in results_df["Dataset"]
+ ]
+ results_df["model_embed_datatype"] = [
+ "_".join([str(row.model), str(row.embeddings), str(row.datatype)])
+ for idx, row in results_df.iterrows()
+ ]
correlations = []
- features = ['Feature_Samples_ratio', 'Intrinsic_Dimension', 'Condition number',
- 'Fisher Discriminant Ratio', 'Total Correlations', 'Mutual information',
- '# Non-zero entries', '# Low variance features', 'Variation', 'std_var',
- 'Coefficient of Variation %', 'std_co_of_v', 'Skewness', 'std_skew',
- 'Kurtosis', 'std_kurt', 'Mean Log Kernel Density',
- 'Isomap Reconstruction Error', 'Fractal dimension', 'Entropy',
- 'std_entropy']
- metrics = ['accuracy', 'f1_score', 'time', 'auc']
-
- keys = list(set(results_df['model_embed_datatype']))
+ features = [
+ "Feature_Samples_ratio",
+ "Intrinsic_Dimension",
+ "Condition number",
+ "Fisher Discriminant Ratio",
+ "Total Correlations",
+ "Mutual information",
+ "# Non-zero entries",
+ "# Low variance features",
+ "Variation",
+ "std_var",
+ "Coefficient of Variation %",
+ "std_co_of_v",
+ "Skewness",
+ "std_skew",
+ "Kurtosis",
+ "std_kurt",
+ "Mean Log Kernel Density",
+ "Isomap Reconstruction Error",
+ "Fractal dimension",
+ "Entropy",
+ "std_entropy",
+ ]
+ metrics = ["accuracy", "f1_score", "time", "auc"]
+
+ keys = list(set(results_df["model_embed_datatype"]))
for m in keys:
- dat_temp_m = results_df[results_df['model_embed_datatype'] == m]
+ dat_temp_m = results_df[results_df["model_embed_datatype"] == m]
if len(dat_temp_m) > 0:
for s in metrics:
for f in features:
if f in dat_temp_m.columns:
- if correlation == 'spearman':
- correlations.append( [m, s, f, np.median(dat_temp_m[s]), sum(dat_temp_m[s]>thresh)/len(dat_temp_m[s]), spearmanr( dat_temp_m[s], dat_temp_m[f] )[0] ] )
-
- correlations_df = pd.DataFrame(correlations, columns = ['model_embed_datatype', 'metric', 'feature', 'median_metric', 'frac_gt_thresh', 'correlation'] )
+ if correlation == "spearman":
+ correlations.append(
+ [
+ m,
+ s,
+ f,
+ np.median(dat_temp_m[s]),
+ sum(dat_temp_m[s] > thresh) / len(dat_temp_m[s]),
+ spearmanr(dat_temp_m[s], dat_temp_m[f])[0],
+ ]
+ )
+
+ correlations_df = pd.DataFrame(
+ correlations,
+ columns=[
+ "model_embed_datatype",
+ "metric",
+ "feature",
+ "median_metric",
+ "frac_gt_thresh",
+ "correlation",
+ ],
+ )
return results_df, correlations_df
-def plot_results_correlation( correlations_df, metric = 'f1_score', title = '', correlation_type = 'Spearman ρ', figsize=(6.5,10), save_file_path = '', size = 'median_metric',
- xticks = True, key = 'model_embed_datatype', legend_offset = 1.0, show_plots = True,
- colorbar_label = 'Correlation coefficient', size_label = 'Median metric value'):
-
+
+def plot_results_correlation(
+ correlations_df,
+ metric="f1_score",
+ title="",
+ correlation_type="Spearman ρ",
+ figsize=(6.5, 10),
+ save_file_path="",
+ size="median_metric",
+ xticks=True,
+ key="model_embed_datatype",
+ legend_offset=1.0,
+ show_plots=True,
+ colorbar_label="Correlation coefficient",
+ size_label="Median metric value",
+):
"""This function plots publication-quality correlation dot plots using the previously generated correlations_df dataframe.
The larger the circle, the higher the metric value for that particular data set. The circle colors correspond to the
correlations between the data characteristics (evaluations) and the metric. Red corresponds to a positive
@@ -114,41 +165,71 @@ def plot_results_correlation( correlations_df, metric = 'f1_score', title = '',
show_plots (bool): Whether to display plots, default is True.
colorbar_label (str): Label for the colorbar, default is 'Correlation coefficient'.
size_label (str): Label for the size legend, default is 'Median metric value'.
-
+
Returns:
None: Displays the plot and saves it to the specified file path if provided.
"""
# Use enhanced professional diverging colormap
from matplotlib.colors import LinearSegmentedColormap
- colors_custom = ['#053061', '#2166ac', '#4393c3', '#92c5de', '#d1e5f0',
- '#f7f7f7', '#fddbc7', '#f4a582', '#d6604d', '#b2182b', '#67001f']
- cmap_custom = LinearSegmentedColormap.from_list('custom_diverging', colors_custom, N=256)
+
+ colors_custom = [
+ "#053061",
+ "#2166ac",
+ "#4393c3",
+ "#92c5de",
+ "#d1e5f0",
+ "#f7f7f7",
+ "#fddbc7",
+ "#f4a582",
+ "#d6604d",
+ "#b2182b",
+ "#67001f",
+ ]
+ cmap_custom = LinearSegmentedColormap.from_list("custom_diverging", colors_custom, N=256)
norm = mcolors.TwoSlopeNorm(vmin=-1.0, vcenter=0.0, vmax=1.0)
# Sample data
- data = correlations_df[correlations_df['metric'] == metric].copy()
- data['feature'] = [ re.sub( 'std', 'Std. dev. of',
- re.sub( 'co of v', 'coefficient of variation',
- re.sub( 'kurt$' ,'kurtosis',
- re.sub( 'skew$', 'skewness',
- re.sub( 'var$', 'variation',
- re.sub( '%', '',
- re.sub( '_', ' ', x ) ) ) ) ) ) ) for x in data['feature']]
-
- if key == 'model_datatype':
- data['datatype'] = [ '_'.join( x.split('_')[1:] ) for x in data[key]]
- key_column = 'Model / Dataset'
+ data = correlations_df[correlations_df["metric"] == metric].copy()
+ data["feature"] = [
+ re.sub(
+ "std",
+ "Std. dev. of",
+ re.sub(
+ "co of v",
+ "coefficient of variation",
+ re.sub(
+ "kurt$",
+ "kurtosis",
+ re.sub(
+ "skew$",
+ "skewness",
+ re.sub("var$", "variation", re.sub("%", "", re.sub("_", " ", x))),
+ ),
+ ),
+ ),
+ )
+ for x in data["feature"]
+ ]
+
+ if key == "model_datatype":
+ data["datatype"] = ["_".join(x.split("_")[1:]) for x in data[key]]
+ key_column = "Model / Dataset"
else:
- data['datatype'] = [ '_'.join( x.split('_')[2:] ) for x in data[key]]
- key_column = 'Model / Embedding / Dataset'
-
- data = data.sort_values( ['feature','datatype'], ascending = False )
- data['model'] = [ re.sub( '_.*', '', x ) for x in data[key]]
- data['model'] = [x.upper() for x in data['model']]
- data = pd.concat( [data[ ~data['model'].isin( ['QSVC', 'QNN', 'VQC', 'PQK']) ], data[ data['model'].isin( ['QSVC', 'QNN', 'VQC', 'PQK']) ] ] )
- fm = dict(zip( list(set(data['feature'])), range(len(set(data['feature']))) ) )
- data['feature_map'] = [ fm[x] for x in data['feature']]
+ data["datatype"] = ["_".join(x.split("_")[2:]) for x in data[key]]
+ key_column = "Model / Embedding / Dataset"
+
+ data = data.sort_values(["feature", "datatype"], ascending=False)
+ data["model"] = [re.sub("_.*", "", x) for x in data[key]]
+ data["model"] = [x.upper() for x in data["model"]]
+ data = pd.concat(
+ [
+ data[~data["model"].isin(["QSVC", "QNN", "VQC", "PQK"])],
+ data[data["model"].isin(["QSVC", "QNN", "VQC", "PQK"])],
+ ]
+ )
+ fm = dict(zip(list(set(data["feature"])), range(len(set(data["feature"])))))
+ data["feature_map"] = [fm[x] for x in data["feature"]]
# Fill NaN values before scaling to avoid errors
data = data.fillna(0)
@@ -156,225 +237,310 @@ def plot_results_correlation( correlations_df, metric = 'f1_score', title = '',
# Scale dot size based on actual data range for meaningful representation
# Reduced sizes to minimize overlap
epsilon = 25
-
+
# Get actual min/max from the data to scale appropriately
min_val = data[size].min()
max_val = data[size].max()
-
+
# Normalize to 0-1 based on actual data range, then scale to pixel sizes
if max_val > min_val:
normalized_values = (data[size] - min_val) / (max_val - min_val)
else:
normalized_values = np.ones_like(data[size]) * 0.5
-
+
# Size formula: normalized value in [0,1] → size in [epsilon, 150+epsilon] (reduced from 200)
- data['norm_size'] = (normalized_values * 150 + epsilon).astype(float)
+ data["norm_size"] = (normalized_values * 150 + epsilon).astype(float)
+
+ data[key] = [re.sub("_", " / ", x) for x in data[key]]
- data[key] = [ re.sub( '_', ' / ', x ) for x in data[key]]
-
# Create figure with very compact design
- fig, ax = plt.subplots(figsize=figsize, facecolor='white', dpi=100)
- ax.set_facecolor('white')
-
+ fig, ax = plt.subplots(figsize=figsize, facecolor="white", dpi=100)
+ ax.set_facecolor("white")
+
# Create scatter plot with enhanced professional styling
- scatter = ax.scatter(data[key], data['feature'], s=data['norm_size'],
- c=data['correlation'], cmap=cmap_custom, norm=norm,
- alpha=0.92, edgecolors='#34495E', linewidths=1.2,
- zorder=3)
-
+ scatter = ax.scatter(
+ data[key],
+ data["feature"],
+ s=data["norm_size"],
+ c=data["correlation"],
+ cmap=cmap_custom,
+ norm=norm,
+ alpha=0.92,
+ edgecolors="#34495E",
+ linewidths=1.2,
+ zorder=3,
+ )
+
# Add colorbar with enhanced professional styling
cbar = plt.colorbar(scatter, ax=ax, pad=0.018, aspect=28, shrink=0.88)
- cbar.set_label(colorbar_label, rotation=270, labelpad=22, fontsize=11, fontweight='bold')
+ cbar.set_label(colorbar_label, rotation=270, labelpad=22, fontsize=11, fontweight="bold")
cbar.ax.tick_params(labelsize=10, width=1.3, length=5, pad=4)
for spine in cbar.ax.spines.values():
spine.set_linewidth(1.3)
- spine.set_edgecolor('#34495E')
-
+ spine.set_edgecolor("#34495E")
+
# Set labels with clean formatting
- ax.set_xlabel(key_column, fontweight='bold', fontsize=13, labelpad=10)
- ax.set_ylabel('Data Feature', fontweight='bold', fontsize=13, labelpad=10)
-
+ ax.set_xlabel(key_column, fontweight="bold", fontsize=13, labelpad=10)
+ ax.set_ylabel("Data Feature", fontweight="bold", fontsize=13, labelpad=10)
+
# Add title if provided
if title:
- ax.set_title(title, fontweight='bold', pad=20, fontsize=14)
-
+ ax.set_title(title, fontweight="bold", pad=20, fontsize=14)
+
# Rotate x-axis labels for better readability
- plt.setp(ax.xaxis.get_majorticklabels(), rotation=90, ha='right', va='top', fontsize=10)
+ plt.setp(ax.xaxis.get_majorticklabels(), rotation=90, ha="right", va="top", fontsize=10)
plt.setp(ax.yaxis.get_majorticklabels(), fontsize=10)
-
+
# Add professional grid for better readability
- ax.grid(True, alpha=0.18, linestyle='--', linewidth=0.8, color='#95A5A6', zorder=0)
+ ax.grid(True, alpha=0.18, linestyle="--", linewidth=0.8, color="#95A5A6", zorder=0)
ax.set_axisbelow(True)
-
+
# Proper margins to prevent cropping while keeping columns close
ax.margins(x=0.025, y=0.035)
-
+
# Clean tick parameters
- ax.tick_params(axis='both', which='major', labelsize=11, width=1.2, length=5)
-
+ ax.tick_params(axis="both", which="major", labelsize=11, width=1.2, length=5)
+
# Remove top and right spines for cleaner look
sns.despine(ax=ax)
-
+
# Create size legend with 4 dots showing ACTUAL median metric values from data
- handles_size, labels_size = scatter.legend_elements(prop="sizes", alpha=0.75, num=4,
- markeredgecolor='#34495E', markeredgewidth=1.2)
-
+ handles_size, labels_size = scatter.legend_elements(
+ prop="sizes", alpha=0.75, num=4, markeredgecolor="#34495E", markeredgewidth=1.2
+ )
+
# Use REAL median metric values from the data
smin = np.min(data[size])
smax = np.max(data[size])
- labels_size = [f'{x:.2f}' for x in np.linspace(smin, smax, 4)]
-
+ labels_size = [f"{x:.2f}" for x in np.linspace(smin, smax, 4)]
+
# Position legend on the right side, well below the colorbar with proper spacing
- legend = ax.legend(handles_size, labels_size, title=size_label,
- loc='upper left', bbox_to_anchor=(1.15, -0.05),
- ncol=1, frameon=True, fancybox=False,
- title_fontsize=9, fontsize=8,
- edgecolor='#34495E', framealpha=0.98,
- labelspacing=0.8, handletextpad=0.5)
+ legend = ax.legend(
+ handles_size,
+ labels_size,
+ title=size_label,
+ loc="upper left",
+ bbox_to_anchor=(1.15, -0.05),
+ ncol=1,
+ frameon=True,
+ fancybox=False,
+ title_fontsize=9,
+ fontsize=8,
+ edgecolor="#34495E",
+ framealpha=0.98,
+ labelspacing=0.8,
+ handletextpad=0.5,
+ )
legend.get_frame().set_linewidth(1.2)
- legend.get_frame().set_facecolor('white')
- legend.get_title().set_fontweight('bold')
-
+ legend.get_frame().set_facecolor("white")
+ legend.get_title().set_fontweight("bold")
+
# Adjust layout with reduced horizontal spacing between subplots
plt.tight_layout(pad=0.8, w_pad=1.8)
-
- if save_file_path != '':
- plt.savefig(save_file_path, dpi=600, bbox_inches='tight', facecolor='white',
- edgecolor='none', format='pdf' if save_file_path.endswith('.pdf') else None)
+
+ if save_file_path != "":
+ plt.savefig(
+ save_file_path,
+ dpi=600,
+ bbox_inches="tight",
+ facecolor="white",
+ edgecolor="none",
+ format="pdf" if save_file_path.endswith(".pdf") else None,
+ )
print(f"Scatter plot saved to: {save_file_path}")
-
+
if show_plots:
plt.show()
plt.close()
+ model_qml = ["QNN", "PQK", "VQC", "QSVC"]
- model_qml = ['QNN', 'PQK', 'VQC' ,'QSVC']
-
data[key_column] = data[key]
- data['Data feature'] = data['feature']
- to_plot = data.pivot_table(columns = key_column, index = 'Data feature', values = 'correlation')
-
+ data["Data feature"] = data["feature"]
+ to_plot = data.pivot_table(columns=key_column, index="Data feature", values="correlation")
+
# Define professional color scheme for model types
- ccolors = ['#7B68EE' if re.sub(' .*', '', x) in model_qml else '#FF8C00' for x in to_plot.columns]
+ ccolors = [
+ "#7B68EE" if re.sub(" .*", "", x) in model_qml else "#FF8C00" for x in to_plot.columns
+ ]
# Create custom diverging colormap
from matplotlib.colors import LinearSegmentedColormap
- colors_heatmap = ['#2166ac', '#4393c3', '#92c5de', '#d1e5f0', '#f7f7f7',
- '#fddbc7', '#f4a582', '#d6604d', '#b2182b']
- cmap_heatmap = LinearSegmentedColormap.from_list('custom_heatmap', colors_heatmap, N=256)
+
+ colors_heatmap = [
+ "#2166ac",
+ "#4393c3",
+ "#92c5de",
+ "#d1e5f0",
+ "#f7f7f7",
+ "#fddbc7",
+ "#f4a582",
+ "#d6604d",
+ "#b2182b",
+ ]
+ cmap_heatmap = LinearSegmentedColormap.from_list("custom_heatmap", colors_heatmap, N=256)
# Create professional heatmap with better proportions
heatmap_height = figsize[1] * 0.95 # Much taller to reduce space above colorbar
heatmap_width = min(figsize[0] * 0.9, 10) # Narrower columns
-
- g = sns.clustermap(to_plot.fillna(0),
- figsize=(heatmap_width, heatmap_height),
- col_colors=ccolors,
- cmap=cmap_heatmap,
- method='average',
- metric='euclidean',
- center=0,
- xticklabels=xticks,
- yticklabels=True,
- cbar_kws={'label': colorbar_label, 'orientation': 'horizontal'},
- linewidths=1.0,
- linecolor='white',
- vmin=-1, vmax=1,
- dendrogram_ratio=0.05,
- cbar_pos=(0.55, 0.01, 0.4, 0.015))
-
+
+ g = sns.clustermap(
+ to_plot.fillna(0),
+ figsize=(heatmap_width, heatmap_height),
+ col_colors=ccolors,
+ cmap=cmap_heatmap,
+ method="average",
+ metric="euclidean",
+ center=0,
+ xticklabels=xticks,
+ yticklabels=True,
+ cbar_kws={"label": colorbar_label, "orientation": "horizontal"},
+ linewidths=1.0,
+ linecolor="white",
+ vmin=-1,
+ vmax=1,
+ dendrogram_ratio=0.05,
+ cbar_pos=(0.55, 0.01, 0.4, 0.015),
+ )
+
# Hide dendrograms for cleaner appearance
g.ax_row_dendrogram.set_visible(False)
g.ax_col_dendrogram.set_visible(False)
-
+
# Improve axis labels with better styling
- g.ax_heatmap.set_xlabel(key_column, fontweight='bold', fontsize=11, labelpad=12, color='#2C3E50')
- g.ax_heatmap.set_ylabel('Data Feature', fontweight='bold', fontsize=11, labelpad=12, color='#2C3E50')
-
+ g.ax_heatmap.set_xlabel(
+ key_column, fontweight="bold", fontsize=11, labelpad=12, color="#2C3E50"
+ )
+ g.ax_heatmap.set_ylabel(
+ "Data Feature", fontweight="bold", fontsize=11, labelpad=12, color="#2C3E50"
+ )
+
# Rotate x-labels 45 degrees for readability
- plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, ha='right', fontsize=9, color='#2C3E50')
- plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0, fontsize=9, color='#2C3E50')
-
+ plt.setp(
+ g.ax_heatmap.xaxis.get_majorticklabels(),
+ rotation=45,
+ ha="right",
+ fontsize=9,
+ color="#2C3E50",
+ )
+ plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0, fontsize=9, color="#2C3E50")
+
# Improve tick parameters with better styling
- g.ax_heatmap.tick_params(axis='both', which='major', width=1.2, length=5, pad=4, colors='#2C3E50')
-
+ g.ax_heatmap.tick_params(
+ axis="both", which="major", width=1.2, length=5, pad=4, colors="#2C3E50"
+ )
+
# Style heatmap spines
for spine in g.ax_heatmap.spines.values():
spine.set_linewidth(1.5)
- spine.set_edgecolor('#34495E')
-
+ spine.set_edgecolor("#34495E")
+
# Enhance horizontal colorbar styling at bottom
if g.cax is not None:
- g.cax.set_xlabel(colorbar_label, fontsize=10, fontweight='bold', labelpad=10, color='#2C3E50')
- g.cax.tick_params(labelsize=9, width=1.2, length=4, colors='#2C3E50')
+ g.cax.set_xlabel(
+ colorbar_label, fontsize=10, fontweight="bold", labelpad=10, color="#2C3E50"
+ )
+ g.cax.tick_params(labelsize=9, width=1.2, length=4, colors="#2C3E50")
for spine in g.cax.spines.values():
spine.set_linewidth(1.2)
- spine.set_edgecolor('#34495E')
+ spine.set_edgecolor("#34495E")
- if save_file_path != '':
- heatmap_path = re.sub('.pdf', '_heatmap.pdf', save_file_path)
- plt.savefig(heatmap_path, dpi=600, bbox_inches='tight', facecolor='white',
- edgecolor='none', format='pdf' if heatmap_path.endswith('.pdf') else None)
+ if save_file_path != "":
+ heatmap_path = re.sub(r"\.pdf$", "_heatmap.pdf", save_file_path)
+ plt.savefig(
+ heatmap_path,
+ dpi=600,
+ bbox_inches="tight",
+ facecolor="white",
+ edgecolor="none",
+ format="pdf" if heatmap_path.endswith(".pdf") else None,
+ )
print(f"Clustered heatmap saved to: {heatmap_path}")
-
+
if show_plots:
plt.show()
plt.close()
# Create non-clustered heatmap with quantum models first
- qml_col = [x for x in to_plot.columns if re.sub(' .*', '', x) in model_qml]
- cml_col = [x for x in to_plot.columns if re.sub(' .*', '', x) not in model_qml]
+ qml_col = [x for x in to_plot.columns if re.sub(" .*", "", x) in model_qml]
+ cml_col = [x for x in to_plot.columns if re.sub(" .*", "", x) not in model_qml]
to_plot_ordered = to_plot.loc[:, qml_col + cml_col]
- ccolors_ordered = ['#7B68EE' if re.sub(' .*', '', x) in model_qml else '#FF8C00' for x in to_plot_ordered.columns]
-
- g2 = sns.clustermap(to_plot_ordered.fillna(0),
- figsize=(heatmap_width, heatmap_height),
- col_colors=ccolors_ordered,
- col_cluster=False,
- row_cluster=True,
- cmap=cmap_heatmap,
- center=0,
- xticklabels=xticks,
- yticklabels=True,
- cbar_kws={'label': colorbar_label, 'orientation': 'horizontal'},
- linewidths=1.0,
- linecolor='white',
- vmin=-1, vmax=1,
- dendrogram_ratio=0.05,
- cbar_pos=(0.55, 0.01, 0.4, 0.015),
- method='average',
- metric='euclidean')
-
+ ccolors_ordered = [
+ "#7B68EE" if re.sub(" .*", "", x) in model_qml else "#FF8C00"
+ for x in to_plot_ordered.columns
+ ]
+
+ g2 = sns.clustermap(
+ to_plot_ordered.fillna(0),
+ figsize=(heatmap_width, heatmap_height),
+ col_colors=ccolors_ordered,
+ col_cluster=False,
+ row_cluster=True,
+ cmap=cmap_heatmap,
+ center=0,
+ xticklabels=xticks,
+ yticklabels=True,
+ cbar_kws={"label": colorbar_label, "orientation": "horizontal"},
+ linewidths=1.0,
+ linecolor="white",
+ vmin=-1,
+ vmax=1,
+ dendrogram_ratio=0.05,
+ cbar_pos=(0.55, 0.01, 0.4, 0.015),
+ method="average",
+ metric="euclidean",
+ )
+
# Improve axis labels with better styling
- g2.ax_heatmap.set_xlabel(key_column, fontweight='bold', fontsize=11, labelpad=12, color='#2C3E50')
- g2.ax_heatmap.set_ylabel('Data Feature', fontweight='bold', fontsize=11, labelpad=12, color='#2C3E50')
-
+ g2.ax_heatmap.set_xlabel(
+ key_column, fontweight="bold", fontsize=11, labelpad=12, color="#2C3E50"
+ )
+ g2.ax_heatmap.set_ylabel(
+ "Data Feature", fontweight="bold", fontsize=11, labelpad=12, color="#2C3E50"
+ )
+
# Rotate x-labels 45 degrees for readability
- plt.setp(g2.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, ha='right', fontsize=9, color='#2C3E50')
- plt.setp(g2.ax_heatmap.yaxis.get_majorticklabels(), rotation=0, fontsize=9, color='#2C3E50')
-
+ plt.setp(
+ g2.ax_heatmap.xaxis.get_majorticklabels(),
+ rotation=45,
+ ha="right",
+ fontsize=9,
+ color="#2C3E50",
+ )
+ plt.setp(g2.ax_heatmap.yaxis.get_majorticklabels(), rotation=0, fontsize=9, color="#2C3E50")
+
# Improve tick parameters with better styling
- g2.ax_heatmap.tick_params(axis='both', which='major', width=1.2, length=5, pad=4, colors='#2C3E50')
-
+ g2.ax_heatmap.tick_params(
+ axis="both", which="major", width=1.2, length=5, pad=4, colors="#2C3E50"
+ )
+
# Style heatmap spines
for spine in g2.ax_heatmap.spines.values():
spine.set_linewidth(1.5)
- spine.set_edgecolor('#34495E')
-
+ spine.set_edgecolor("#34495E")
+
# Enhance horizontal colorbar styling at bottom
if g2.cax is not None:
- g2.cax.set_xlabel(colorbar_label, fontsize=10, fontweight='bold', labelpad=10, color='#2C3E50')
- g2.cax.tick_params(labelsize=9, width=1.2, length=4, colors='#2C3E50')
+ g2.cax.set_xlabel(
+ colorbar_label, fontsize=10, fontweight="bold", labelpad=10, color="#2C3E50"
+ )
+ g2.cax.tick_params(labelsize=9, width=1.2, length=4, colors="#2C3E50")
for spine in g2.cax.spines.values():
spine.set_linewidth(1.2)
- spine.set_edgecolor('#34495E')
+ spine.set_edgecolor("#34495E")
- if save_file_path != '':
- noncluster_path = re.sub('.pdf', '_noncluster_heatmap.pdf', save_file_path)
- plt.savefig(noncluster_path, dpi=600, bbox_inches='tight', facecolor='white',
- edgecolor='none', format='pdf' if noncluster_path.endswith('.pdf') else None)
+ if save_file_path != "":
+ noncluster_path = re.sub(r"\.pdf$", "_noncluster_heatmap.pdf", save_file_path)
+ plt.savefig(
+ noncluster_path,
+ dpi=600,
+ bbox_inches="tight",
+ facecolor="white",
+ edgecolor="none",
+ format="pdf" if noncluster_path.endswith(".pdf") else None,
+ )
print(f"Non-clustered heatmap saved to: {noncluster_path}")
-
+
if show_plots:
plt.show()
- plt.close()
\ No newline at end of file
+ plt.close()
diff --git a/setup.py b/setup.py
index d1588f0..d6d7a54 100644
--- a/setup.py
+++ b/setup.py
@@ -107,6 +107,7 @@ def read_requirements():
'black>=23.0',
'flake8>=6.0',
'mypy>=1.0',
+ 'types-PyYAML',
],
'all': docs_require + [
'hydra-core',
@@ -116,6 +117,7 @@ def read_requirements():
'black>=23.0',
'flake8>=6.0',
'mypy>=1.0',
+ 'types-PyYAML',
],
},
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..e1ee87a
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,64 @@
+"""Helpers for loading repository source files as modules by path.
+
+``load_module`` executes a single file located relative to the repository root and
+``ensure_package`` registers a placeholder package in ``sys.modules``.
+"""
+
+from importlib.util import module_from_spec, spec_from_file_location
+from pathlib import Path
+import sys
+import types
+
+
+# The directory one level above the directory that contains this file.
+REPO_ROOT = Path(__file__).resolve().parents[1]
+
+
+def load_module(module_name: str, relative_path: str):
+    """Execute the file at ``REPO_ROOT / relative_path`` as module ``module_name``.
+
+    Args:
+        module_name: Key under which the module is stored in ``sys.modules``.
+        relative_path: Path of the source file, relative to ``REPO_ROOT``.
+
+    Returns:
+        The executed module object.
+
+    Raises:
+        ImportError: If no import spec or loader can be created for the file.
+        Exception: Whatever the file raises while executing; the module is removed
+            from ``sys.modules`` again before the exception propagates.
+    """
+    spec = spec_from_file_location(module_name, REPO_ROOT / relative_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Could not load module {module_name} from {relative_path}")
+
+    module = module_from_spec(spec)
+    # Registered before execution, as the regular import system does.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a partially executed module registered for later lookups.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+def ensure_package(package_name: str, relative_path: str):
+    """Return ``sys.modules[package_name]``, creating a placeholder package if absent.
+
+    Args:
+        package_name: Dotted name of the package to look up or register.
+        relative_path: Package directory relative to ``REPO_ROOT``; used as the only
+            entry of ``__path__`` when a placeholder is created.
+
+    Returns:
+        The existing or newly created module object.
+    """
+    package = sys.modules.get(package_name)
+    if package is None:
+        package = types.ModuleType(package_name)
+        package.__path__ = [str(REPO_ROOT / relative_path)]
+        sys.modules[package_name] = package
+    return package
diff --git a/tests/test_data_generation.py b/tests/test_data_generation.py
new file mode 100644
index 0000000..18d9b3c
--- /dev/null
+++ b/tests/test_data_generation.py
@@ -0,0 +1,87 @@
+"""Tests for the dataset generators under ``qbiocode/data_generation``."""
+
+import json
+
+import pandas as pd
+
+from conftest import load_module
+
+
+# Each generator is loaded straight from its source file under a ``tests._*`` name.
+make_circles = load_module("tests._make_circles", "qbiocode/data_generation/make_circles.py")
+make_class = load_module("tests._make_class", "qbiocode/data_generation/make_class.py")
+make_spheres = load_module("tests._make_spheres", "qbiocode/data_generation/make_spheres.py")
+
+
+def _read_json(path):
+    """Decode the UTF-8 JSON file at ``path``."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def test_generate_circles_datasets_writes_expected_files(tmp_path):
+    """A single (n_samples, noise) pair yields one CSV file and one config entry."""
+    make_circles.generate_circles_datasets(
+        n_samples=[12], noise=[0.15], save_path=str(tmp_path), random_state=7
+    )
+
+    csv_file = tmp_path / "circles_data-1.csv"
+    json_file = tmp_path / "dataset_config.json"
+    assert csv_file.exists()
+    assert json_file.exists()
+
+    frame = pd.read_csv(csv_file)
+    recorded = _read_json(json_file)
+
+    assert list(frame.columns) == ["0", "1", "class"]
+    assert len(frame) == 12
+    assert frame["class"].isin([0, 1]).all()
+    assert list(recorded.values()) == [{"n_samples": 12, "noise": 0.15}]
+
+
+def test_generate_classification_datasets_only_writes_valid_configurations(tmp_path):
+    """Two ``n_features`` values are requested but only one dataset is expected.
+
+    NOTE(review): presumably ``n_features=3`` is rejected because it is smaller than
+    ``n_informative + n_redundant`` (4) -- confirm against ``make_class``.
+    """
+    requested = {
+        "n_samples": [10],
+        "n_features": [4, 3],
+        "n_informative": [2],
+        "n_redundant": [2],
+        "n_classes": [2],
+        "n_clusters_per_class": [1],
+        "weights": [[0.5, 0.5]],
+    }
+    make_class.generate_classification_datasets(
+        **requested, save_path=str(tmp_path), random_state=11
+    )
+
+    written = [path.name for path in sorted(tmp_path.glob("class_data-*.csv"))]
+    assert written == ["class_data-1.csv"]
+
+    recorded = _read_json(tmp_path / "dataset_config.json")
+    assert list(recorded.values()) == [
+        {
+            "n_samples": 10,
+            "n_features": 4,
+            "n_informative": 2,
+            "n_redundant": 2,
+            "n_classes": 2,
+            "n_clusters_per_class": 1,
+            "weights": [0.5, 0.5],
+        }
+    ]
+
+
+def test_generate_points_in_nd_sphere_respects_radius_threshold():
+    """Every generated point has a Euclidean norm between 1.8 and 3 inclusive.
+
+    NOTE(review): 1.8 is presumably ``radius * thresh`` (3 * 0.6) -- confirm.
+    """
+    points = make_spheres.generate_points_in_nd_sphere(n_s=25, dim=4, radius=3, thresh=0.6)
+    distances = (points ** 2).sum(axis=1) ** 0.5
+
+    assert points.shape == (25, 4)
+    assert (distances <= 3).all()
+    assert (distances >= 1.8).all()
diff --git a/tests/test_file_utilities.py b/tests/test_file_utilities.py
new file mode 100644
index 0000000..d3c34a1
--- /dev/null
+++ b/tests/test_file_utilities.py
@@ -0,0 +1,80 @@
+"""Tests for ``find_duplicates`` and ``find_string`` from ``qbiocode/utils``."""
+
+from pathlib import Path
+
+import pytest
+
+from conftest import load_module
+
+
+# Both helpers are loaded straight from their source files under ``tests._*`` names.
+find_duplicates = load_module("tests._find_duplicates", "qbiocode/utils/find_duplicates.py")
+find_string = load_module("tests._find_string", "qbiocode/utils/find_string.py")
+
+
+def write_text(path: Path, content: str) -> None:
+    """Write ``content`` to ``path`` encoded as UTF-8."""
+    path.write_text(content, encoding="utf-8")
+
+
+def normalize_pairs(pairs):
+    """Return ``pairs`` as a set of sorted tuples, making comparisons order-independent."""
+    return set(tuple(sorted(pair)) for pair in pairs)
+
+
+def test_find_duplicate_files_detects_matches_ignoring_empty_lines(tmp_path):
+    """Two files with the same lines in a different order, one with a blank line, pair up."""
+    first = tmp_path / "one.txt"
+    second = tmp_path / "two.txt"
+    write_text(first, "alpha\n\nbeta\n")
+    write_text(second, "beta\nalpha\n")
+    write_text(tmp_path / "three.txt", "alpha\ngamma\n")
+
+    found = find_duplicates.find_duplicate_files(str(tmp_path))
+
+    assert normalize_pairs(found) == normalize_pairs([(str(first), str(second))])
+
+
+def test_find_duplicate_files_honors_case_sensitivity_setting(tmp_path):
+    """With ``case_sensitive=False`` files differing only in letter case pair up."""
+    upper = tmp_path / "upper.txt"
+    lower = tmp_path / "lower.txt"
+    write_text(upper, "Alpha\n")
+    write_text(lower, "alpha\n")
+
+    found = find_duplicates.find_duplicate_files(str(tmp_path), case_sensitive=False)
+
+    assert normalize_pairs(found) == normalize_pairs([(str(upper), str(lower))])
+
+
+def test_find_duplicate_files_raises_for_missing_directory(tmp_path):
+    """A directory that does not exist raises ``FileNotFoundError``."""
+    with pytest.raises(FileNotFoundError):
+        find_duplicates.find_duplicate_files(str(tmp_path / "missing"))
+
+
+def test_find_string_in_files_returns_matching_lines_and_filters_by_pattern(tmp_path):
+    """Only the ``.yaml`` file is reported, with the number and text of the match."""
+    yaml_file = tmp_path / "config.yaml"
+    write_text(yaml_file, "mode: fast\nEmbedding: PCA\n")
+    write_text(tmp_path / "notes.txt", "embedding: pca\n")
+
+    matches = find_string.find_string_in_files(
+        str(tmp_path),
+        "embedding: pca",
+        file_pattern=".yaml",
+        case_sensitive=False,
+        return_lines=True,
+        verbose=False,
+    )
+
+    assert matches == {str(yaml_file): [(2, "Embedding: PCA\n")]}
+
+
+def test_find_string_in_files_raises_for_non_directory(tmp_path):
+    """Passing a regular file instead of a directory raises ``NotADirectoryError``."""
+    regular_file = tmp_path / "data.txt"
+    write_text(regular_file, "content\n")
+
+    with pytest.raises(NotADirectoryError):
+        find_string.find_string_in_files(str(regular_file), "content")
diff --git a/tests/test_generator_dispatch.py b/tests/test_generator_dispatch.py
new file mode 100644
index 0000000..8f187ed
--- /dev/null
+++ b/tests/test_generator_dispatch.py
@@ -0,0 +1,114 @@
+import pytest
+
+from conftest import ensure_package, load_module
+
+
+def load_generator_module():
+    """Register the qbiocode packages, load each backend module, then return the generator."""
+    package = "qbiocode.data_generation"
+    directory = "qbiocode/data_generation"
+
+    ensure_package("qbiocode", "qbiocode")
+    ensure_package(package, directory)
+
+    # Loaded ahead of generator.py; presumably it imports these siblings (confirm in generator).
+    backends = (
+        "make_circles",
+        "make_moons",
+        "make_class",
+        "make_s_curve",
+        "make_spheres",
+        "make_spirals",
+        "make_swiss_roll",
+    )
+    for stem in backends:
+        load_module(f"{package}.{stem}", f"{directory}/{stem}.py")
+
+    return load_module(f"{package}.generator", f"{directory}/generator.py")
+
+
+@pytest.mark.parametrize(
+    "dataset_type, module_attr, function_name, expected_kwargs",
+    [
+        (
+            "circles",
+            "circles",
+            "generate_circles_datasets",
+            dict(n_samples=[9], noise=[0.2], save_path="out", random_state=5),
+        ),
+        (
+            "classes",
+            "make_class",
+            "generate_classification_datasets",
+            dict(
+                n_samples=[9],
+                n_features=[6],
+                n_informative=[2],
+                n_redundant=[1],
+                n_classes=[2],
+                n_clusters_per_class=[1],
+                weights=[[0.5, 0.5]],
+                save_path="out",
+                random_state=5,
+            ),
+        ),
+        (
+            "spheres",
+            "spheres",
+            "generate_spheres_datasets",
+            dict(n_s=[9], dim=[6], radius=[4], save_path="out", random_state=5),
+        ),
+        (
+            "swiss_roll",
+            "swiss_roll",
+            "generate_swiss_roll_datasets",
+            dict(
+                n_samples=[9],
+                noise=[0.2],
+                hole=[True],
+                save_path="out",
+                random_state=5,
+            ),
+        ),
+    ],
+)
+def test_generate_data_dispatches_to_expected_backend(
+    monkeypatch, dataset_type, module_attr, function_name, expected_kwargs
+):
+    """generate_data must call the backend for dataset_type with exactly expected_kwargs.
+
+    The backend function is replaced with a keyword recorder via monkeypatch.
+    """
+    generator = load_generator_module()
+    received = {}
+    backend_module = getattr(generator, module_attr)
+    # The stand-in accepts keywords only and merges them into ``received``.
+    monkeypatch.setattr(backend_module, function_name, lambda **kwargs: received.update(kwargs))
+
+    # Every option is supplied at once; expected_kwargs lists what should reach the backend.
+    call_options = dict(
+        type_of_data=dataset_type,
+        save_path="out",
+        n_samples=[9],
+        noise=[0.2],
+        hole=[True],
+        n_classes=[2],
+        dim=[6],
+        rad=[4],
+        n_features=[6],
+        n_informative=[2],
+        n_redundant=[1],
+        n_clusters_per_class=[1],
+        weights=[[0.5, 0.5]],
+        random_state=5,
+    )
+    generator.generate_data(**call_options)
+
+    assert received == expected_kwargs
+
+
+def test_generate_data_rejects_unknown_dataset_type():
+    """An unrecognised type_of_data must raise ValueError matching "Invalid type_of_data"."""
+    generate = load_generator_module().generate_data
+    with pytest.raises(ValueError, match="Invalid type_of_data"):
+        generate(type_of_data="unknown", save_path="out")