<a href="https://colab.research.google.com/github/Mohadesehzarei/skqulacs_QSVM/blob/main/skqulacs_QSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Potential of quantum machine learning for solving the real-world problem of cancer classification

In [None]:
import itertools
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from skfeature.function.similarity_based import fisher_score
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
# Used by the classical SVM (CSVM) cells
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

%matplotlib inline

#QSVM
from skqulacs.circuit.pre_defined import create_yzcx_ansatz
from qulacs import QuantumCircuit
from qulacsvis import circuit_drawer
from skqulacs.qsvm import QSVC

# Expression_DEGs

In [None]:
# Import data.
# X must be a pandas DataFrame: the cells below call .to_numpy() and .iloc on its
# train/test splits. y holds the class labels and is split together with X.
X = None  # TODO: expression data
y = None  # TODO: class
if X is None or y is None:
    # Fail with an explicit message instead of a syntax error on the empty placeholders.
    raise NotImplementedError(
        "Load the expression data into X and the class labels into y before running the notebook."
    )

In [None]:
# Hold out 30% of the samples as the test set (fixed seed for reproducibility).
# x_train_ keeps the DataFrame form of the training features (needed later for
# .iloc column selection); x_train is the same data as a NumPy array.
x_train_, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
x_train = x_train_.to_numpy()
y_train = np.array(y_train)
y_test = np.array(y_test)

# Ordering features by the Fisher score algorithm

In [None]:
# Rank the features with the Fisher score algorithm, using the training split only.
y_train_new = y_train.reshape(-1)  # flat 1-D label vector
scores = np.asarray(fisher_score.fisher_score(x_train, y_train_new))
# .iloc needs integer column positions. Depending on the installed skfeature
# version, fisher_score may return raw (float) scores rather than a ranking;
# in that case turn them into column indices ordered from the highest score to
# the lowest. Integer output is used unchanged.
if not np.issubdtype(scores.dtype, np.integer):
    scores = np.argsort(scores)[::-1]
# Reorder the columns of both splits with the same index array.
x_train_sorted_ = x_train_.iloc[:, scores]
x_test_sorted_ = x_test.iloc[:, scores]

In [None]:
# SMOTE: oversampling method to solve the class-imbalance problem.
# Only the (feature-sorted) training split is resampled here; the test split is
# not passed to SMOTE.
# NOTE(review): k_neighbors=2 is presumably chosen because the minority class has
# very few samples — confirm against the data.
smt=SMOTE(k_neighbors=2,random_state=42)
x_train_SMOTE_,y_train_SMOTE_=smt.fit_resample(x_train_sorted_, y_train)

# Quantum Machine Learning (skqulacs-QSVM)

In [None]:
# Compute the QSVM test accuracy for the first 1..MAX_FEATURES features of the
# Fisher ranking (one qubit per feature).
MAX_FEATURES = 15
c_depth = 1
time_step = 1
acc = []
num_features = []

# The resampled labels do not depend on the number of features: convert them once.
y_train_SMOTE = np.array(y_train_SMOTE_)

for i in range(1, MAX_FEATURES + 1):
    # Keep the i leading columns of both splits.
    x_train_SMOTE = x_train_SMOTE_.iloc[:, 0:i].to_numpy()
    x_test_sorted = x_test_sorted_.iloc[:, 0:i].to_numpy()

    n_qubit = i  # Requires at least the number of dimensions of x_train. If it is too small, the result will be bad.
    circuit_yzcx = create_yzcx_ansatz(n_qubit, c_depth)
    qsvm = QSVC(circuit_yzcx)
    model = qsvm.fit(x_train_SMOTE, y_train_SMOTE)
    y_pred = qsvm.predict(x_test_sorted)
    accuracy = accuracy_score(y_test, y_pred)
    acc.append(accuracy)
    num_features.append(i)

# Plot accuracy against the number of features.
plt.figure(figsize=(8,7))

# Marker and line style are passed as keywords only: combining the 'o-' format
# string with linestyle='--' makes matplotlib warn about a redundant definition.
plt.plot(num_features, acc, marker='o', linestyle='--', linewidth=2.5, color='blue')
# NOTE(review): x=7 presumably marks the feature count selected as optimum —
# update it if the sweep results change.
plt.axvline(x=7, color = 'r', linestyle = '-')

plt.xlabel('Number of Features', fontsize=16, labelpad=20)
plt.ylabel('Accuracy', fontsize=16, labelpad=20)

plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.rcParams.update({'font.family':'cambria'})
plt.xlim(1, MAX_FEATURES + 1)
plt.locator_params(axis='x', nbins=10)


plt.show()


In [None]:
# Optimum number of features: the smallest feature count that reached the highest
# accuracy in the sweep above (np.argmax returns the first maximum).
# Assign another integer to `i` here to override the automatic choice.
i = num_features[int(np.argmax(acc))]

# Training data restricted to the i best-ranked features.
X_train_SMOTE = x_train_SMOTE_.iloc[:, 0:i].to_numpy()
y_train_SMOTE = np.array(y_train_SMOTE_)

# Test data restricted to the same columns.
x_test_sorted = x_test_sorted_.iloc[:, 0:i].to_numpy()

In [None]:
# Build, fit and evaluate the QSVM with the selected number of features.
start = time.time()

n_qubit = i  # Requires at least the number of dimensions of x_train. If it is too small, the result will be bad.
circuit_yzcx = create_yzcx_ansatz(n_qubit, c_depth)
qsvm = QSVC(circuit_yzcx)
model = qsvm.fit(X_train_SMOTE, y_train_SMOTE)
y_pred = qsvm.predict(x_test_sorted)
accuracy = accuracy_score(y_test, y_pred)

# Text report of precision / recall / f1-score, followed by the overall accuracy.
df = classification_report(y_test, y_pred)
print(df)
print(accuracy)

# The clock is stopped only here, so the value printed below spans circuit
# construction, fitting, prediction and the report output above.
elapsed = time.time() - start

print(f"Training time: {(elapsed)} seconds")

In [None]:
# Plot the circuit of the most recently built ansatz (circuit_yzcx).
# The drawer is given the object stored in the ansatz's private `_circuit`
# attribute and the "mpl" output mode.
circuit_drawer(circuit_yzcx._circuit, "mpl")
plt.show()


In [None]:
def _parse_classification_report(report_text):
    """Extract the metric rows from a text classification report.

    A line is kept when it ends with three floats (precision, recall, f1-score)
    followed by an integer support. Everything before those four tokens is the
    row label, so multi-word labels such as "macro avg" are supported. Lines
    that do not match (header, blank lines, the "accuracy" summary row) are
    skipped.

    Returns:
        tuple: (labels, rows, supports) where labels is a list of str, rows is
        a list of [precision, recall, f1] float lists and supports is a list of int.
    """
    labels, rows, supports = [], [], []
    # "avg / total" (older report layout) is collapsed to a single token.
    for line in report_text.replace(' / ', '/').splitlines():
        tokens = line.split()
        if len(tokens) < 5:  # label + 3 metrics + support
            continue
        try:
            values = [float(tok) for tok in tokens[-4:-1]]
            count = int(tokens[-1])
        except ValueError:
            continue
        labels.append(' '.join(tokens[:-4]))
        rows.append(values)
        supports.append(count)
    return labels, rows, supports


def plot_classification_report(classificationReport,
                               title='Classification report',
                               cmap='RdBu'):
    """Draw a text classification report as an annotated heat map.

    The report is drawn on the current matplotlib figure: one row per report
    line (label and support on the y axis), one column per metric
    (precision, recall, f1-score), each cell annotated with its value.

    Args:
        classificationReport: report text, e.g. the string returned by
            sklearn.metrics.classification_report or a hand-written table in
            the same layout.
        title: title of the plot.
        cmap: matplotlib colormap name used for the heat map.

    Raises:
        ValueError: if the text contains no row with precision, recall,
            f1-score and support values.
    """
    class_names, rows, support = _parse_classification_report(classificationReport)
    if not rows:
        raise ValueError(
            "No rows with precision, recall, f1-score and support were found "
            "in the classification report."
        )

    plotMat = np.array(rows)
    xticklabels = ['Precision', 'Recall', 'F1-score']
    yticklabels = ['{0} ({1})'.format(name, sup)
                   for name, sup in zip(class_names, support)]

    plt.imshow(plotMat, interpolation='nearest', cmap=cmap, aspect='auto')
    plt.title(title)
    plt.colorbar()
    plt.xticks(np.arange(3), xticklabels, rotation=45)
    plt.yticks(np.arange(len(class_names)), yticklabels)

    # Cells in the top or bottom 20% of the value range get white text so the
    # annotation stays readable on the dark ends of the colormap.
    lowest, highest = plotMat.min(), plotMat.max()
    upper_thresh = lowest + (highest - lowest) / 10 * 8
    lower_thresh = lowest + (highest - lowest) / 10 * 2
    for row, col in itertools.product(range(plotMat.shape[0]), range(plotMat.shape[1])):
        value = plotMat[row, col]
        is_extreme = value > upper_thresh or value < lower_thresh
        plt.text(col, row, format(value, '.2f'),
                 horizontalalignment="center",
                 color="white" if is_extreme else "black")

    plt.tight_layout()

In [None]:
# Hand-written report for the QSVM results. This is a template: complete each
# row below with its precision, recall, f1-score and support values (for example
# those printed by the classification report above) before running the cell.
# Rows left without numbers are ignored by plot_classification_report.
sampleClassificationReport = """       precision    recall  f1-score   support
    normal
    tumor
weighted_avg       """

# Plot
plt.figure(figsize=(8,7))
plt.rcParams.update({'font.size': 18})

# Draws the heat map on the figure created above.
plot_classification_report(sampleClassificationReport)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.rcParams.update({'font.family':'cambria'})

plt.show()

In [None]:
# Confusion matrix of the most recent predictions (y_pred) on the test split.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)

confusion_matrix_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])

# Plot
plt.figure(figsize=(8,7))
plt.rcParams.update({'font.size': 18})
# Draw on the axes of the figure created above. Without ax=..., plot() opens a
# figure of its own, which leaves the 8x7 figure blank and ignores its size.
confusion_matrix_display.plot(ax=plt.gca())


plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.rcParams.update({'font.family':'cambria'})

plt.show()

# Classical Machine Learning (CSVM)

In [None]:
# Compute the CSVM (classical linear SVM) test accuracy for the first
# 1..MAX_FEATURES features of the Fisher ranking.
# StandardScaler and SVC are provided by the notebook's import cell.
MAX_FEATURES = 15

acc = []
num_features = []

for i in range(1, MAX_FEATURES + 1):
    # Keep the i leading columns of both splits.
    x_train_SMOTE = x_train_SMOTE_.iloc[:, 0:i]
    x_test_sorted = x_test_sorted_.iloc[:, 0:i]

    # Standardise with statistics learned from the training data only.
    sc = StandardScaler()
    sc.fit(x_train_SMOTE)
    X_train_std = sc.transform(x_train_SMOTE)
    X_test_std = sc.transform(x_test_sorted)

    # Training a SVM classifier
    svc = SVC(kernel= 'linear', random_state=1, C=0.1)
    svc.fit(X_train_std, y_train_SMOTE)

    # Performance on the test split
    y_pred = svc.predict(X_test_std)
    accuracy = accuracy_score(y_test, y_pred)

    acc.append(accuracy)
    num_features.append(i)

# Plot accuracy against the number of features.
plt.figure(figsize=(8,7))

# Marker and line style are passed as keywords only: combining the 'o-' format
# string with linestyle='--' makes matplotlib warn about a redundant definition.
plt.plot(num_features, acc, marker='o', linestyle='--', linewidth=2.5, color='blue')
# NOTE(review): x=3 presumably marks the feature count selected as optimum —
# update it if the sweep results change.
plt.axvline(x=3, color = 'r', linestyle = '-')

plt.xlabel('Number of Features', fontsize=16, labelpad=20)
plt.ylabel('Accuracy', fontsize=16, labelpad=20)

plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.rcParams.update({'font.family':'cambria'})
plt.xlim(1, MAX_FEATURES + 1)
plt.locator_params(axis='x', nbins=10)

plt.show()


In [None]:
# Optimum number of features: the smallest feature count that reached the highest
# accuracy in the sweep above (np.argmax returns the first maximum).
# Assign another integer to `i` here to override the automatic choice.
i = num_features[int(np.argmax(acc))]

# Training data restricted to the i best-ranked features.
X_train_SMOTE = x_train_SMOTE_.iloc[:, 0:i].to_numpy()
y_train_SMOTE = np.array(y_train_SMOTE_)

# Test data restricted to the same columns.
x_test_sorted = x_test_sorted_.iloc[:, 0:i].to_numpy()

In [None]:
# Build, fit and evaluate the classical SVM (CSVM) with the selected number of features.
start = time.time()

# Standardise both splits with statistics learned from the training data.
sc = StandardScaler().fit(X_train_SMOTE)
X_train_std = sc.transform(X_train_SMOTE)
X_test_std = sc.transform(x_test_sorted)

# Train the linear SVM classifier.
svc = SVC(kernel='linear', random_state=1, C=0.1)
svc.fit(X_train_std, y_train_SMOTE)

# Performance on the test split.
y_pred = svc.predict(X_test_std)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

# The clock is stopped only here, so the value printed below spans scaling,
# fitting, prediction and the accuracy output above.
elapsed = time.time() - start
print(f"Training time: {(elapsed)} seconds")

In [None]:
# Classification report for the most recent predictions (y_pred) on the test split.
# Note that `df` holds the report that is printed below; it is reused from the QSVM section.
df=classification_report(y_test,y_pred)
print(df)

In [None]:
# Hand-written report for the CSVM results. This is a template: complete each
# row below with its precision, recall, f1-score and support values (for example
# those printed by the classification report above) before running the cell.
# Rows left without numbers are ignored by plot_classification_report.
sampleClassificationReport = """       precision    recall  f1-score   support

      normal
       tumor
weighted_avg       """

# Plot
plt.figure(figsize=(8,7))
plt.rcParams.update({'font.size': 18})

# Draws the heat map on the figure created above.
plot_classification_report(sampleClassificationReport)


plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.rcParams.update({'font.family':'cambria'})

plt.show()

In [None]:
# Confusion matrix of the most recent predictions (y_pred) on the test split.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)

confusion_matrix_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])

# Plot
plt.figure(figsize=(8,7))
plt.rcParams.update({'font.size': 18})
# Draw on the axes of the figure created above. Without ax=..., plot() opens a
# figure of its own, which leaves the 8x7 figure blank and ignores its size.
confusion_matrix_display.plot(ax=plt.gca())


plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.rcParams.update({'font.family':'cambria'})

plt.show()