# Utilities

## Libraries

In [45]:
# data manipulation
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Sequence, Union, Optional, Any
from math import comb, gamma, log
from scipy.special import digamma
import matplotlib.pyplot as plt
import itertools
from itertools import combinations
import random

# data preprocessing
from sklearn.preprocessing import StandardScaler

# mutual information
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.neighbors import NearestNeighbors
from joblib import Parallel, delayed

# classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate, RepeatedStratifiedKFold
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, balanced_accuracy_score

# qiskit
from qiskit import QuantumCircuit
from qiskit.quantum_info import Statevector, Pauli, SparsePauliOp
from qiskit.circuit.library import PauliEvolutionGate
from qiskit.synthesis import LieTrotter, SuzukiTrotter
from qiskit_ibm_runtime import QiskitRuntimeService, EstimatorV2 as Estimator
from qiskit_aer import AerSimulator
from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager

# saving and loading
import json
import os

## Hyperparameters

In [16]:
n_trotter_steps = 1
delta_t = 0.005
T = n_trotter_steps * delta_t

## Load and preprocess dataset

In [6]:
def data_preprocessing(X_train: np.ndarray, X_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    scaler = StandardScaler() # Rescale the data using z-score: (x - μ (mean))/μ (standard deviation)
    X_tr = scaler.fit_transform(X_train)
    Xte = scaler.transform(X_test)
    return X_tr, Xte

In [59]:
dataset_dir = "C:/Users/jiriv/Documents/škola/Diplom_thesis/Quantum-feature-maps/Dataset/data.csv"
df = pd.read_csv(dataset_dir)

# Map the class labels to binary values
df['Class'] = df['Class'].map({'NonToxic': 1,'Toxic': 0})

# Show the first few rows of the dataframe
print(df.head(), "\n")

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

   MATS3v  nHBint10  MATS3s  MATS3p  nHBDon_Lipinski  minHBint8  MATS3e  \
0  0.0908         0  0.0075  0.0173                0        0.0 -0.0436   
1  0.0213         0  0.1144 -0.0410                0        0.0  0.1231   
2  0.0018         0 -0.0156 -0.0765                2        0.0 -0.1138   
3 -0.0251         0 -0.0064 -0.0894                3        0.0 -0.0747   
4  0.0135         0  0.0424 -0.0353                0        0.0 -0.0638   

   MATS3c  minHBint2  MATS3m  ...   WTPT-4   WTPT-5  ETA_EtaP_L  ETA_EtaP_F  \
0  0.0409        0.0  0.1368  ...   0.0000   0.0000      0.1780      1.5488   
1 -0.0316        0.0  0.1318  ...   8.8660  19.3525      0.1739      1.3718   
2 -0.1791        0.0  0.0615  ...   5.2267  27.8796      0.1688      1.4395   
3 -0.1151        0.0  0.0361  ...   7.7896  24.7336      0.1702      1.4654   
4  0.0307        0.0  0.0306  ...  12.3240  19.7486      0.1789      1.4495   

   ETA_EtaP_B  nT5Ring  SHdNH  ETA_dEpsilon_C  MDEO-22  Class  
0      0.0

# Quantum algorithm

## C_s via mutual information

In [30]:
def compute_mutual_information(a: np.ndarray, b: np.ndarray, random_state: int = 0, n_neighbors: int = 3) -> float:
    a = np.asarray(a)
    b = np.asarray(b)

    # X must be 2D: (n_samples, n_features). For single feature make it (n_samples, 1)
    if a.ndim == 1:
        X = a.reshape(-1, 1)
    elif a.ndim == 2 and a.shape[1] == 1:
        X = a
    else:
        # If user passed a multi-column array, keep as-is (mutual_info_regression will return array)
        X = a

    # y must be 1D
    if b.ndim > 1:
        y = b.ravel()
    else:
        y = b

    mi = mutual_info_regression(X, y, random_state=random_state, n_neighbors=n_neighbors)
    return float(mi[0]) 

def mi_matrix(X: np.ndarray, plot_option: bool) -> pd.DataFrame:
    n_features = X.shape[1]
    #feature_names = [f"Feature_{i}" for i in range(n_features)]
    feature_names = df.columns[:-1]

    MI = np.zeros((n_features, n_features))

    # Compute pairwise mutual information
    for i in range(n_features):
        for j in range(i + 1, n_features):
            mi = compute_mutual_information(X[:, i], X[:, j])
            MI[i, j] = mi
            MI[j, i] = mi

    # Put into a pandas DataFrame
    MI_df = pd.DataFrame(MI, columns=feature_names, index=feature_names)

    if plot_option == True:
        plt.figure(figsize=(8, 6))
        plt.imshow(MI_df, interpolation='nearest')
        plt.colorbar(label='Mutual Information')
        plt.xticks(ticks=np.arange(n_features), labels=feature_names, rotation=45, ha='right')
        plt.yticks(ticks=np.arange(n_features), labels=feature_names)
        plt.tight_layout()
        plt.show()

    return MI_df

PairList = List[Tuple[Tuple[int, int], float]]

def compute_all_pairwise_mi(
    X: np.ndarray,
    feature_names: Optional[List[str]] = None,
    random_state: int = 0,
    n_neighbors: int = 3,
    n_jobs: int = 1,
    verbose: bool = False
) -> List[Tuple[Tuple[int, int], float]]:

    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("X must be 2D (n_samples, n_features)")

    n_samples, n_features = X.shape
    if feature_names is None:
        feature_names = [f"Feature_{i}" for i in range(n_features)]
    elif len(feature_names) != n_features:
        raise ValueError("feature_names length mismatch")

    pairs = list(combinations(range(n_features), 2))
    if verbose:
        print(f"Computing {len(pairs)} pairs (n_features={n_features}) using n_jobs={n_jobs} ...")

    def _compute_pair(pair):
        i, j = pair
        mi_val = compute_mutual_information(
            X[:, i], X[:, j],
            random_state=random_state,
            n_neighbors=n_neighbors
        )
        return (i, j, float(mi_val))

    # parallel computation
    results = Parallel(n_jobs=n_jobs)(
        delayed(_compute_pair)(pair) for pair in pairs
    )

    # sort by absolute MI
    results_sorted = sorted(results, key=lambda x: abs(x[2]), reverse=True)

    # return list of ((i,j), MI)
    compact_list = [((i, j), mi) for (i, j, mi) in results_sorted]

    return compact_list


def select_top_pairs(pairs: Union[pd.DataFrame, PairList],
                     top_n: int = 10,
                     mode: str = "absolute",
                     feature_names: Optional[List[str]] = None,
                     return_type: str = "list"
                     ) -> Union[PairList, pd.DataFrame]:
    
    if mode not in {"absolute", "largest", "most_negative"}:
        raise ValueError("mode must be 'absolute', 'largest', or 'most_negative'")

    pair_list = list(pairs)

    # build enriched list (i,j,mi,abs_mi)
    enriched = [ (a_b[0][0], a_b[0][1], float(a_b[1]), abs(float(a_b[1]))) for a_b in pair_list ]

    # sort according to mode
    if mode == "absolute":
        enriched_sorted = sorted(enriched, key=lambda t: t[3], reverse=True)
    elif mode == "largest":
        enriched_sorted = sorted(enriched, key=lambda t: t[2], reverse=True)
    else:  # most_negative
        enriched_sorted = sorted(enriched, key=lambda t: t[2])  # ascending (most negative first)

    # selection
    chosen = enriched_sorted[:top_n]

    # format output
    if return_type == "list":
        out_list: PairList = [(((int(i), int(j))), float(mi)) for (i, j, mi, _) in chosen]
        return out_list

    # build DataFrame
    rows = []
    for i, j, mi, abs_mi in chosen:
        if feature_names is not None:
            name_i = feature_names[int(i)]
            name_j = feature_names[int(j)]
            names = (name_i, name_j)
        else:
            names = (f"Feature_{int(i)}", f"Feature_{int(j)}")
        rows.append({
            'S_idx': (int(i), int(j)),
            'S_names': names,
            'mi': float(mi),
            'abs_mi': float(abs_mi)
        })
    df_out = pd.DataFrame(rows)
    return df_out

# I(x,y,z) = H(x) + H(y) + H(z) - H(x,y) - H(x,z) - H(y,z) + H(x,y,z)

def _entropy_knn(X: np.ndarray, k: int = 3) -> float:
    """
    Kozachenko-Leonenko k-NN entropy estimator for continuous variables.
    X: shape (n_samples, d)  (if 1D pass reshape(-1,1))
    returns entropy (natural units, nats)
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    n, d = X.shape
    if n <= k:
        raise ValueError(f"n_samples ({n}) must be > k ({k})")

    nbrs = NearestNeighbors(n_neighbors=k+1).fit(X)
    distances, _ = nbrs.kneighbors(X, return_distance=True)
    # distance to k-th neighbor (exclude self at distance 0)
    eps = distances[:, -1] + 1e-15  # tiny offset for numerical stability

    # volume of unit ball in d dimensions (Euclidean)
    c_d = (np.pi ** (d / 2.0)) / gamma(d / 2.0 + 1.0)
    avg_log_eps = np.mean(np.log(eps))

    H = digamma(n) - digamma(k) + d * avg_log_eps + log(c_d)
    return float(H)

def mi3_knn(x: Union[np.ndarray, list], y: Union[np.ndarray, list], z: Union[np.ndarray, list], k: int = 3, to_bits: bool = False) -> float:

    x = np.asarray(x).reshape(-1)
    y = np.asarray(y).reshape(-1)
    z = np.asarray(z).reshape(-1)

    if not (len(x) == len(y) == len(z)):
        raise ValueError("x, y, z must have the same number of samples")

    Hx  = _entropy_knn(x.reshape(-1, 1), k=k)
    Hy  = _entropy_knn(y.reshape(-1, 1), k=k)
    Hz  = _entropy_knn(z.reshape(-1, 1), k=k)

    Hxy = _entropy_knn(np.column_stack([x, y]), k=k)
    Hxz = _entropy_knn(np.column_stack([x, z]), k=k)
    Hyz = _entropy_knn(np.column_stack([y, z]), k=k)

    Hxyz = _entropy_knn(np.column_stack([x, y, z]), k=k)

    I_xyz = (Hx + Hy + Hz) - (Hxy + Hxz + Hyz) + Hxyz

    if to_bits:
        I_xyz = I_xyz / log(2.0)

    return float(I_xyz)

TripletList = List[Tuple[Tuple[int,int,int], float]]

def compute_all_triplet_mi(
    X: np.ndarray,
    feature_names: Optional[List[str]] = None,
    k: int = 3,
    to_bits: bool = False,
    n_jobs: int = 1,
    verbose: bool = False
) -> TripletList:

    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("X must be 2D (n_samples, n_features)")
    n_samples, n_features = X.shape

    if feature_names is None:
        feature_names = [f"Feature_{i}" for i in range(n_features)]
    elif len(feature_names) != n_features:
        raise ValueError("feature_names length mismatch")

    combos = list(combinations(range(n_features), 3))
    if verbose:
        print(f"Computing {len(combos)} triplets (n_features={n_features}) with k={k} using n_jobs={n_jobs}...")

    def _compute_for_triplet(trip):
        i, j, l = trip
        mi_val = mi3_knn(X[:, i], X[:, j], X[:, l], k=k, to_bits=to_bits)
        return ((int(i), int(j), int(l)), float(mi_val))

    results = Parallel(n_jobs=n_jobs)(
        delayed(_compute_for_triplet)(trip) for trip in combos
    )

    # sort by absolute MI descending
    results_sorted = sorted(results, key=lambda t: abs(t[1]), reverse=True)
    return results_sorted


def select_top_triplets(
    triplets: Union[pd.DataFrame, TripletList],
    top_n: int = 10,
    mode: str = "absolute",
    feature_names: Optional[List[str]] = None,
    return_type: str = "list"
) -> Union[TripletList, pd.DataFrame]:

    if mode not in {"absolute", "largest", "most_negative"}:
        raise ValueError("mode must be 'absolute', 'largest', or 'most_negative'")


    trip_list = list(triplets)

    # enrich: (i,j,k,mi,abs_mi)
    enriched = [ (int(t[0][0]), int(t[0][1]), int(t[0][2]), float(t[1]), abs(float(t[1]))) for t in trip_list ]

    # sort according to mode
    if mode == "absolute":
        enriched_sorted = sorted(enriched, key=lambda x: x[4], reverse=True)
    elif mode == "largest":
        enriched_sorted = sorted(enriched, key=lambda x: x[3], reverse=True)
    else:  # most_negative
        enriched_sorted = sorted(enriched, key=lambda x: x[3])  # ascending (most negative first)

    # selection
    chosen = enriched_sorted[:top_n]
    
    # format output
    if return_type == "list":
        out_list: TripletList = [(((int(i), int(j), int(k))), float(mi)) for (i,j,k,mi,_) in chosen]
        return out_list

    # build DataFrame
    rows = []
    for i,j,k,mi,abs_mi in chosen:
        if feature_names is not None:
            names = (feature_names[int(i)], feature_names[int(j)], feature_names[int(k)])
        else:
            names = (f"Feature_{int(i)}", f"Feature_{int(j)}", f"Feature_{int(k)}")
        rows.append({
            'S_idx': (int(i), int(j), int(k)),
            'S_names': names,
            'mi': float(mi),
            'abs_mi': float(abs_mi)
        })
    df_out = pd.DataFrame(rows)
    return df_out

def create_set(
    k: str,
    top_pairs: Optional[List[Tuple[Tuple[int, int], float]]] = None,
    top_triplets: Optional[List[Tuple[Tuple[int, int, int], float]]] = None
) -> List[int]:
    set = []

    # Use only pairs
    if k == '2':
        set = top_pairs

    # Use only triplets
    elif k == '3':
        set = top_triplets

    # Combine both
    elif k == '2+3':
        set = []
        if top_pairs is not None:
            set.extend(top_pairs)
        if top_triplets is not None:
            set.extend(top_triplets)
    return set

def divide_into_subsets(
    items: List[Any],
    num_subsets: Optional[int] = None,
    subset_size: Optional[int] = None,
    shuffle: bool = True
) -> List[List[Any]]:
    
    if num_subsets is None and subset_size is None:
        raise ValueError("Provide either num_subsets or subset_size.")

    items_copy = items.copy()
    if shuffle:
        random.shuffle(items_copy)

    if subset_size is not None:
        num_subsets = max(1, (len(items_copy) + subset_size - 1) // subset_size)

    subsets = []
    avg_size = len(items_copy) / num_subsets

    start = 0
    for i in range(num_subsets):
        end = round((i + 1) * avg_size)
        subsets.append(items_copy[start:end])
        start = end

    return subsets

def compute_c_S(subset: List[Tuple[Tuple[int, ...], float]]) -> float:
    m = len(subset)
    if m < 2:
        return 0.0
    mi = sum([t[1] for t in subset])
    normalization = comb(m, 2)
    c_S = mi / normalization
    return c_S

In [44]:
if __name__ == "__main__":
    PairList = List[Tuple[Tuple[int, int], float]]
    pairs = compute_all_pairwise_mi(X, feature_names=list(df.columns[:-1]), random_state=42, n_neighbors=3, n_jobs=2)
    print(pairs)

    top_2_list = select_top_pairs(pairs, top_n=10, mode='absolute', return_type='list')
    #print(top_list)

    # If you have compact list produced earlier:
    top_2_df = select_top_pairs(pairs, top_n=10, mode='largest', feature_names=list(df.columns[:-1]), return_type='df')
    #print(top_2_df)

[((8, 9), 0.931028709319897), ((4, 10), 0.6410958759850565), ((5, 10), 0.5907776717062916), ((4, 5), 0.5799626247303435), ((0, 5), 0.5209933730247807), ((0, 10), 0.502056844971249), ((3, 11), 0.43487316378660523), ((0, 4), 0.41728668564205496), ((0, 12), 0.3866240867225108), ((0, 6), 0.28891003344387434), ((6, 10), 0.2852418262445884), ((0, 11), 0.269898283210396), ((3, 10), 0.2322709226991213), ((5, 7), 0.2164685261740522), ((1, 10), 0.21123926348835864), ((3, 5), 0.20729063848633933), ((1, 12), 0.2062452750753483), ((3, 7), 0.20204475267127053), ((4, 12), 0.19663827163017755), ((0, 1), 0.19643967402573503), ((4, 11), 0.19378406966204098), ((10, 11), 0.19246606381606357), ((5, 6), 0.18403579628875466), ((1, 11), 0.18135070540175446), ((2, 9), 0.1703450471091017), ((5, 11), 0.16785090011487114), ((0, 7), 0.15975926900547366), ((5, 12), 0.15759103878656688), ((2, 8), 0.15033728642724276), ((0, 3), 0.1488748160057054), ((6, 12), 0.1479880858538376), ((9, 11), 0.1367366496073399), ((10, 1

In [55]:
if __name__ == "__main__":
    TripletList = List[Tuple[Tuple[int,int,int], float]]
    triplets = compute_all_triplet_mi(X, feature_names=list(df.columns[:-1]), k=4, to_bits=False, n_jobs=4, verbose=True)
    #print(triplets)

    top_3_list = select_top_triplets(triplets, top_n=50, mode='absolute', return_type='list')
    #print(top_list)

    # If you have compact list produced earlier:
    top_3_df = select_top_triplets(triplets, top_n=50, mode='largest', feature_names=list(df.columns[:-1]), return_type='df')
    print(top_3_df)

Computing 286 triplets (n_features=13) with k=4 using n_jobs=4...
          S_idx                           S_names        mi    abs_mi
0    (0, 1, 11)        (MDEC-23, MATS2v, VE3_Dzi)  0.666439  0.666439
1    (1, 3, 10)      (MATS2v, VE3_Dt, SpMax5_Bhv)  0.664592  0.664592
2    (3, 8, 10)      (VE3_Dt, GATS8e, SpMax5_Bhv)  0.603316  0.603316
3     (0, 1, 3)         (MDEC-23, MATS2v, VE3_Dt)  0.581498  0.581498
4    (0, 2, 11)        (MDEC-23, ATSC8s, VE3_Dzi)  0.569993  0.569993
5    (1, 9, 11)         (MATS2v, GATS8s, VE3_Dzi)  0.559539  0.559539
6    (0, 9, 11)        (MDEC-23, GATS8s, VE3_Dzi)  0.547544  0.547544
7    (0, 8, 11)        (MDEC-23, GATS8e, VE3_Dzi)  0.535994  0.535994
8     (1, 3, 4)       (MATS2v, VE3_Dt, CrippenMR)  0.532519  0.532519
9   (1, 10, 11)     (MATS2v, SpMax5_Bhv, VE3_Dzi)  0.524398  0.524398
10    (1, 3, 9)          (MATS2v, VE3_Dt, GATS8s)  0.523967  0.523967
11   (4, 9, 11)      (CrippenMR, GATS8s, VE3_Dzi)  0.523726  0.523726
12   (3, 9, 10)      (VE

In [58]:
if __name__ == "__main__":
    set = create_set(
        k='2',
        top_pairs=top_2_list,
        top_triplets=top_3_list
    )
    subsets = divide_into_subsets(set, num_subsets=4)

    for i, s in enumerate(subsets):
        print(f"Subset {i+1}: {s}")
        c_s = compute_c_S(s)
        print(f"Computed c_S for subset: {c_s}")

Subset 1: [((8, 9), 0.931028709319897), ((0, 5), 0.5209933730247807)]
Computed c_S for subset: 1.4520220823446777
Subset 2: [((4, 5), 0.5799626247303435), ((0, 6), 0.28891003344387434), ((5, 10), 0.5907776717062916)]
Computed c_S for subset: 0.48655010996016984
Subset 3: [((4, 10), 0.6410958759850565), ((3, 11), 0.43487316378660523), ((0, 12), 0.3866240867225108)]
Computed c_S for subset: 0.48753104216472415
Subset 4: [((0, 10), 0.502056844971249), ((0, 4), 0.41728668564205496)]
Computed c_S for subset: 0.919343530613304


## Alpha calculation

In [32]:
def make_Rt(x: Sequence[float], c_s: Dict[Tuple[int, ...], float], _lambda: float) -> float:
    # --- quadratic sums ---
    sum_x2 = sum(xi**2 for xi in x)
    sum_c2 = sum(ci**2 for ci in c_s.values())

    # --- quartic sums ---
    sum_x4 = sum(xi**4 for xi in x)
    sum_c4 = sum(ci**4 for ci in c_s.values())

    # doublets contributions
    doublet_contribution = 0

    # triplets contributions
    triplet_contribution = 0

    # --- Rt ---
    Rt = ((1 - _lambda)**2) * (sum_x2 + 2 * sum_c2) + (_lambda**2) * (sum_x4 + 2 * sum_c4 + doublet_contribution + triplet_contribution)

    return Rt

def make_alpha1(x: Sequence[float], c_s: Dict[Tuple[int, ...], float], _lambda: float) -> float:
    sum_x2, sum_c2 = 0, 0
    sum_x2 = sum(xi**2 for xi in x)
    sum_c2 = sum(ci**2 for ci in c_s.values())

    # prefactor from commutator norm (paper-corrected)
    alpha1 = -(1/4) * (sum_x2 + 2 * sum_c2)

    Rt = make_Rt(x, c_s, _lambda)

    return alpha1 / Rt


# Gauge potential A(x,t)

In [46]:
def build_A_circuit(n_qubits: int, x: Sequence[float], subsets: Sequence[Tuple[int, ...]], c_s: Dict[Tuple[int, ...], float], T: float) -> QuantumCircuit:
    qc = QuantumCircuit(n_qubits)

    # Initialize to |+> on all qubits (ground state of X-field Hamiltonian)
    qc.h(range(n_qubits))

    pauli_labels = []
    coeffs = []

    # single-body term
    for i, x_i in enumerate(x):
        label = ['I'] * n_qubits
        label[i] = 'Y'
        pauli_labels.append(''.join(label))
        coeffs.append(x_i)

    
    # multi-body terms
    for S in subsets:
        subset = tuple(S)
        if subset == () or subset is None:
            continue
        #if len(subset) == 1:
            #coef = float(c_s.get(subset, 0.0))
            #if coef != 0.0:
                #single_body_term(qc, subset[0], coef, dt)
        else:
            coef = float(c_s.get(subset, 0.0))
            if coef != 0.0:
                if any((idx < 0 or idx >= n_qubits) for idx in subset):
                    raise IndexError("subset contains invalid qubit index")
                for i in S:
                    label = ['I']*n_qubits
                    label[i] = 'Y'
                    for j in S:
                        if j != i:
                            label[j] = 'Z'
                pauli_labels.append(''.join(label))
                coeffs.append(coef)
    
    def alpha(t: float, T: float) -> float:
        _lambda = np.sin((np.pi*np.sin((np.pi*t)/2*T)**2)/2)**2
        return make_alpha1(x, c_s, _lambda)
    
    _alpha = alpha(delta_t, T)
    # prefactor 2*alpha*(A*B' - B*A')
    def prefactor(alpha: float, t: float, T: float)-> float:
        A = np.sin((np.pi*np.sin((np.pi*t)/2*T)**2)/2)**2
        B = np.cos((np.pi*np.sin((np.pi*t)/2*T)**2)/2)**2
        A_prime = (np.pi**2/T) * np.sin((np.pi*t)/T) * np.sin((np.pi*t)**2/(2*T**2)) * np.cos((np.pi*t)**2/(2*T**2))
        B_prime = -(np.pi**2/T) * np.sin((np.pi*t)/T) * np.sin((np.pi*t)**2/(2*T**2)) * np.cos((np.pi*t)**2/(2*T**2))
        return 2*_alpha*(A*B_prime - B*A_prime)

    # Build SparsePauliOp
    ham_0 = SparsePauliOp(pauli_labels, np.array(coeffs))
    ham = prefactor(_alpha, delta_t, T) * ham_0

    # Synthesis method
    synthesis = SuzukiTrotter(order=2, reps=1, insert_barriers=False, preserve_order=True) 

    # Build the evolution gate
    evo = PauliEvolutionGate(ham, time=T, synthesis=synthesis)

    # Append the evolution gate to the circuit
    qc.append(evo, range(n_qubits))

    return qc    

### test: gauge potential

In [47]:
if __name__ == "__main__":
    # 1) toy example: 4 qubits
    n_q = 4
    # example feature vector x (mapped to single Z fields)
    x = [0.5, -0.7, 0.0, 0.2]
    # example subsets: pair (0,1), triplet (1,2,3)
    subsets = [(0,1), (1,2,3)]
    # example c_S coefficients (these would come from your MI computations)
    c_s = {
        (0,1): 0.9,         # two-body coupling strength
        (1,2,3): 0.15       # three-body coupling strength
    }

    circ = build_A_circuit(n_q, x, subsets, c_s, T)
    print(circ.draw(output="text"))

     ┌───┐┌────────────────────────────────────────────────────────────┐
q_0: ┤ H ├┤0                                                           ├
     ├───┤│                                                            │
q_1: ┤ H ├┤1                                                           ├
     ├───┤│  exp(-it (YIII + IYII + IIYI + IIIY + ZYII + IZZY))(0.005) │
q_2: ┤ H ├┤2                                                           ├
     ├───┤│                                                            │
q_3: ┤ H ├┤3                                                           ├
     └───┘└────────────────────────────────────────────────────────────┘


# Measurement (Observables)

In [53]:
def build_feature_observables(n_qubits: int, subsets: Optional[Sequence[Tuple[int, ...]]] = None, K: Optional[int] = None):
    """
    Build a list of SparsePauliOp observables in the order:
      [Z_0, Z_1, ..., Z_{n-1}, then all 2-body in subsets with |S|=2, ..., up to K]
    """

    subsets = [tuple(s) for s in subsets]
    observables = []
    names = []

    # singles
    for i in range(n_qubits):
        observables.append(SparsePauliOp.from_list([( "I"*i + "Z" + "I"*(n_qubits-i-1), 1.0 )]))
        names.append(f"Z_{i}")

    # k-body grouped by increasing k
    for k in range(2, K + 1):
        for S in subsets:
            if len(S) == k:
                p = ["I"]*n_qubits
                for j in S: p[j]="Z"
                observables.append(SparsePauliOp.from_list([("".join(p), 1.0)]))
                names.append("Z_" + "_".join(map(str, S)))

    return observables, names

def transpilation_setup(backend: Optional[str] = None, optimization_level: int = 3):
    backend = backend or AerSimulator(shots=4096)
    backend.options.seed_simulator = 42
    backend.options.seed_transpiler = 42
    estimator = Estimator(mode=backend)

    pm = generate_preset_pass_manager(backend=backend, optimization_level=optimization_level)
    return estimator, pm


def quantum_features_via_estimator(qc: QuantumCircuit, n_qubits: int, subsets: Optional[Sequence[Tuple[int, ...]]] = None,
                                   K: Optional[int] = None,
                                   estimator=None,
                                   run_options: Optional[Dict] = None):
    
    estimator, pm = transpilation_setup()

    observables, names = build_feature_observables(n_qubits, subsets=subsets, K=K)

    # Estimator supports batching: circuits list must align with observables list.
    circuits = [qc] * len(observables)
    circuits_transpiled = [pm.run(circ) for circ in circuits]

    pubs = [(circ, [obs]) for circ, obs in zip(circuits_transpiled, observables)]
    job = estimator.run(pubs)
    res = job.result()
    ev_list = []
    for pub_res in res:
        # pub_res.data.evs is typically array-like
        try:
            evs_pub = np.asarray(pub_res.data.evs).reshape(-1)
        except Exception:
            # Some runtimes may return nested structures; attempt alternative accesses
            evs_pub = np.asarray(getattr(pub_res, "values", getattr(pub_res, "data", {}).get("evs", []))).reshape(-1)
        ev_list.extend(evs_pub.tolist())
    evs = np.array(ev_list)

    x_tilde = evs.copy()
    meta = {'names': names, 'values': x_tilde}
    return x_tilde, meta


### test: measurement

In [54]:
if __name__ == "__main__":
    # toy example: 4 qubits
    n_q = 4
    # example feature vector x (mapped to single Z fields)
    x = [0.5, -0.7, 0.0, 0.3]
    # example subsets: pair (0,1), triplet (1,2,3)
    subsets = [(0,1), (0,2), (1,2,3)]
    # example c_S coefficients (these would come from your MI computations)
    c_s = {
        (0,1): 0.9,         # two-body coupling strength
        (1,2,3): 0.15,       # three-body coupling strength
        (0,2): 0.4        # another two-body coupling
    }

    circ = build_A_circuit(n_q, x, subsets, c_s, T)

    # If you want to measure singles + all 2-body + all 3-body:
    x_tilde, meta = quantum_features_via_estimator(circ, n_q, subsets, K=3)
    print("feature names:", meta['names'])
    print("x_tilde:", x_tilde)

feature names: ['Z_0', 'Z_1', 'Z_2', 'Z_3', 'Z_0_1', 'Z_0_2', 'Z_1_2_3']
x_tilde: [-0.01269531  0.01611328 -0.02099609 -0.01367188 -0.00976562 -0.01757812
  0.00439453]


# Classification

In [4]:
def classification(X, y):
    rscv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)

    metrics = {
        'AUC': [],
        'F1 Macro': [],
        'Precision Macro': [],
        'Recall Macro': [],
        'Accuracy': []
    }
    for fold, (train_id, test_id) in enumerate(rscv.split(X,y)):

        x_train = X[train_id]
        x_test = X[test_id]
        y_train = y[train_id]
        y_test = y[test_id]

    x_train, x_test = data_preprocessing(x_train, x_test)
    classifier = GradientBoostingClassifier(n_estimators=1000, random_state=42)
    classifier.fit(x_train, y_train)

    y_pred = classifier.predict(x_test)
    y_proba = classifier.predict_proba(x_test)[:, 1]

    metrics['AUC'].append(roc_auc_score(y_test, y_proba))
    metrics['F1 Macro'].append(f1_score(y_test, y_pred, average='macro'))
    metrics['Precision Macro'].append(precision_score(y_test, y_pred, average='macro'))
    metrics['Recall Macro'].append(recall_score(y_test, y_pred, average='macro'))
    metrics['Accuracy'].append(accuracy_score(y_test, y_pred))    

    median_metrics = {k: np.median(v) for k, v in metrics.items()}

    return median_metrics

# Results

### Classical result

In [10]:
metrics_original = classification(X, y)
for k, v in metrics_original.items():
    print(f"{k}: {v:.4f}")

AUC: 0.6443
F1 Macro: 0.6222
Precision Macro: 0.6561
Recall Macro: 0.6166
Accuracy: 0.7059


### Quantum result

In [60]:
if __name__ == "__main__":
    n_samples, n_features = X.shape

    # Prepare subsets and c_s mapping (from your earlier top_2_list)
    subsets = [tuple(t[0]) for t in top_2_list]   # e.g. [(0,1), (2,3), ...]
    c_s_dict = { tuple(t[0]) : float(t[1]) for t in top_2_list }

    X_tilde_list = []

    for idx, x in enumerate(X):
        # build circuit using dict-form c_s
        circ = build_A_circuit(n_features, x, subsets, c_s_dict, T)

        # call your estimator - must return (x_tilde, meta)
        x_tilde, meta = quantum_features_via_estimator(circ, n_features, subsets, K=3)

        # Ensure x_tilde is a 1D array
        x_tilde = np.asarray(x_tilde).reshape(-1)

        X_tilde_list.append(x_tilde)
    
    X_tilde = np.array(X_tilde_list)

TranspilerError: '[PhysicalQubit(1202)] not in Target'

In [58]:
metrics_quantum = classification(X_tilde, y)
for k, v in metrics_quantum.items():
    print(f"{k}: {v:.4f}")

AUC: 0.5000
F1 Macro: 0.4035
Precision Macro: 0.3382
Recall Macro: 0.5000
Accuracy: 0.6765


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
