# Projected Quantum Kernel tutorial on OV cancer data

#### Download data from: https://ibm.biz/DataTutorialISMB2025

## Import necessary libraries

In [55]:
import sys, os, re
dir_home = 'qbiocode/'
sys.path.append( dir_home )

In [56]:

import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from qbiocode import pqk

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC


### Dataset preparation

First, we download the...

In [None]:
def declare_hyperparams(param_list): 

    param_list.extend([x*0.001 for x in list(range(1,10, 2))])
    param_list.extend([x*0.01 for x in list(range(1,10,2))])
    param_list.extend([x*0.1 for x in list(range(1,10,2))])
    param_list.extend(list(range(1,10,2)))
    param_list.extend([x*0.1 for x in list(range(1,10,2))])
    param_list.extend([x*10 for x in list(range(1,10,2))])
    param_list.extend([x*100 for x in list(range(1,10,2))])
    param_list.extend([x*1000 for x in list(range(1,10,2))])

    return param_list


def run_model(model, x_train, x_test, y_train, y_test, 
                param_grid, cv, verbose=1, n_jobs=-1, scoring='f1_weighted'):
    grid_search =  GridSearchCV(model, 
                                param_grid, 
                                cv=cv, 
                                verbose=verbose,
                                n_jobs=n_jobs, 
                                scoring='f1_weighted')
    grid_search.fit(x_train, y_train)
    best_svc = grid_search.best_estimator_
    print(f"The best parameters are {grid_search.best_params_} with a score of {grid_search.best_score_:.4f}")

    f1_score = best_svc.score(x_test, y_test)
    print(f"Test F1 with best model: {f1_score:.4f}")

    return f1_score

n_components = 10
pca = PCA(n_components=n_components)
scaler = StandardScaler()
pqk_args = {
           'seed': 1234, 
           'backend': 'simulator',
           }
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
svc = SVC()
cv = StratifiedKFold(n_splits=5)
res_dict = {}


for data in ['mirna', 'methy', 'exp', 'integrated']:
    print(data)
    df = pd.read_csv('OV/OV_'+data+'.csv')
    print(df.shape)
    samples = df['sampleID'].tolist()
    outcome = np.asarray(df['3y_survival'])
    df.drop(columns=['sampleID','3y_survival'], inplace=True)

    X = df.values

    x_train, x_test, y_train, y_test = train_test_split(X, outcome, test_size=0.2, stratify=outcome, random_state=42)

    num_train = x_train.shape[0]
    num_test = x_test.shape[0]

    # Flatten images to 1D arrays
    x_train = np.reshape(x_train, (num_train, -1))
    x_test = np.reshape(x_test, (num_test, -1))

    # Run PCA
    
    x_train = pca.fit_transform(x_train)
    x_test = pca.fit_transform(x_test)

    # Normalize each feature in dataset
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.fit_transform(x_test)

    print(f'New number of features per datapoint:', len(x_train[0]))

    x_train_prj, x_test_prj = pqk(x_train, 
                                  x_test, 
                                  args=pqk_args, 
                                  store=False, 
                                  encoding='ZZ', 
                                  entanglement='pairwise',  
                                  reps= 8)
    
    
    new_C_range = [] 
    new_C_range = declare_hyperparams(new_C_range)
    
    new_gamma_range = ['auto', 'scale']
    new_gamma_range = declare_hyperparams(new_gamma_range)

    param_grid = dict(C=new_C_range, gamma=new_gamma_range, kernel=kernel)

    f1_score_q = run_model(model=svc, 
                           x_train=x_train_prj, 
                           y_train=y_train,
                           x_test=x_test_prj, 
                           y_test=y_test, 
                           param_grid=param_grid, 
                           cv=cv
                           )
    
    f1_score_c = run_model(model=svc, 
                           x_train=x_train, 
                           y_train=y_train,
                           x_test=x_test, 
                           y_test=y_test, 
                           param_grid=param_grid, 
                           cv=cv
                           )
    
    res_dict[data] = [f1_score_q, f1_score_c]
    

res_df = pd.DataFrame.from_dict(res_dict).T
res_df.columns = ['F1_Q', 'F1_C']
res_df.to_csv('PQK_OV_results.csv')


(287, 5004)


### Read data

In [58]:
df = pd.read_csv('OV/OV_methy.csv')
print(df.shape)
samples = df['sampleID'].tolist()
outcome = np.asarray(df['3y_survival'])
df.drop(columns=['sampleID','3y_survival'], inplace=True)

X = df.values

x_train, x_test, y_train, y_test = train_test_split(X, outcome, test_size=0.2, stratify=outcome, random_state=42)


(287, 5004)


### Definitions

In [59]:
def declare_hyperparams(param_list): 

    param_list.extend([x*0.001 for x in list(range(1,10, 2))])
    param_list.extend([x*0.01 for x in list(range(1,10,2))])
    param_list.extend([x*0.1 for x in list(range(1,10,2))])
    param_list.extend(list(range(1,10,2)))
    param_list.extend([x*0.1 for x in list(range(1,10,2))])
    param_list.extend([x*10 for x in list(range(1,10,2))])
    param_list.extend([x*100 for x in list(range(1,10,2))])
    param_list.extend([x*1000 for x in list(range(1,10,2))])

    return param_list


def run_model(model, x_train, x_test, y_train, y_test, 
                param_grid, cv, verbose=1, n_jobs=-1, scoring='f1_weighted'):
    grid_search =  GridSearchCV(model, 
                                param_grid, 
                                cv=cv, 
                                verbose=verbose,
                                n_jobs=n_jobs, 
                                scoring='f1_weighted')
    grid_search.fit(x_train, y_train)
    best_svc = grid_search.best_estimator_
    print(f"The best parameters are {grid_search.best_params_} with a score of {grid_search.best_score_:.4f}")

    f1_score = best_svc.score(x_test, y_test)
    print(f"Test F1 with best model: {f1_score:.4f}")

    return f1_score

n_components = 10
pca = PCA(n_components=n_components)
scaler = StandardScaler()
pqk_args = {
           'seed': 1234, 
           'backend': 'simulator',
           }
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
svc = SVC()
cv = StratifiedKFold(n_splits=5)
res_dict = {}

We use PCA to reduce the dimensionality of the dataset. Here, we reduce the dataset dimension to 10, which will correspond to the number of qubits that we have in our quantum circuit.

In [60]:

# Use PCA to transform each image to an n-dimensional vector
n_components = 10

# # Convert tensors to numpy arrays
# x_train = x_train.numpy()
# x_test = x_test.numpy()

# Number of train and test samples
num_train = x_train.shape[0]
num_test = x_test.shape[0]

# Flatten images to 1D arrays
x_train = np.reshape(x_train, (num_train, -1))
x_test = np.reshape(x_test, (num_test, -1))

# Run PCA
pca = PCA(n_components=n_components)
x_train = pca.fit_transform(x_train)
x_test = pca.fit_transform(x_test)

# Normalize each feature in dataset
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)


print(f'New number of features per datapoint:', len(x_train[0]))

New number of features per datapoint: 10


### Project the data 

In [61]:
pqk_args = {
           'seed': 1234, 
           'backend': 'simulator',
           }

x_train_prj, x_test_prj = pqk(x_train, x_test, args=pqk_args, store=False, encoding='ZZ', entanglement='pairwise',  reps= 8)

### Define the projected quantum kernel

In [62]:
new_C_range = [] 
new_C_range = declare_hyperparams(new_C_range)
print(len(new_C_range))
new_gamma_range = ['auto', 'scale']
new_gamma_range = declare_hyperparams(new_gamma_range)
print(len(new_gamma_range))

40
42


In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC


# Range of 'C' and 'gamma' values as SVC hyperparameters
# Define the hyperparameter ranges #[7.4][1.3]#
#C_range = [range(0.01, 1, 5)]
#C_range.extend([x * 0.01 for x in range(1,11)])
#C_range.extend([x * 0.25 for x in range(1, 60)])
#C_range.extend()
#C_range.extend([20, 50, 100, 200, 500, 700, 1000, 1100, 1200, 1300, 1400, 1500, 1700, 2000])

gamma_range = ['auto', 'scale', 0.001, 0.005, 0.007]
gamma_range.extend([x * 0.01 for x in range(1,11)])
gamma_range.extend([x * 0.25 for x in range(1, 60)])
gamma_range.extend([20, 50, 100])

kernel = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = dict(C=new_C_range, gamma=new_gamma_range, kernel=kernel)

# Support vector classifier
svc = SVC()

# Define the cross validation
cv = StratifiedKFold(n_splits=5)

# Grid search for hyperparameter tuning (q: quantum)
grid_search_q = GridSearchCV(svc, param_grid, cv=cv, verbose=1, n_jobs=-1, scoring='f1_weighted')
grid_search_q.fit(x_train_prj, y_train)

# Best model with best parameters
best_svc_q = grid_search_q.best_estimator_
print(f"The best parameters are {grid_search_q.best_params_} with a score of {grid_search_q.best_score_:.4f}")

# Test accuracy
F1_q = best_svc_q.score(x_test_prj, y_test)
print(f"Test F1 with best model: {F1_q:.4f}")

### Classical benchmarking


We can run a classical SVM without doing a quantum projection. This result is our classical benchmark.

In [None]:
# Support vector classifier
svc = SVC()

# Grid search for hyperparameter tuning (c: classical)
grid_search_c = GridSearchCV(svc, param_grid, cv=cv, verbose=1, n_jobs=-1, scoring='f1_weighted')
grid_search_c.fit(x_train, y_train)

# Best model with best parameters
best_svc_c = grid_search_c.best_estimator_
print(f"The best parameters are {grid_search_c.best_params_} with a score of {grid_search_c.best_score_:.4f}")

# Test accuracy
F1_c = best_svc_c.score(x_test, y_test)
print(f"Test F1 with best model: {F1_c:.4f}")

Fitting 5 folds for each of 6720 candidates, totalling 33600 fits
The best parameters are {'C': 300, 'gamma': 5, 'kernel': 'sigmoid'} with a score of 0.6068
Test F1 with best model: 0.5862
