In [298]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from qiskit import QuantumCircuit, QuantumRegister, ClassicalRegister
from qiskit.circuit import ParameterVector
from qiskit.opflow import Z
from qiskit.providers.aer import StatevectorSimulator
from qiskit.algorithms.optimizers import COBYLA
from qiskit_machine_learning.algorithms import VQC
from qiskit_machine_learning.circuit.library import RawFeatureVector

In [299]:
import pandas as pd

# Load the AG News train and test splits from their CSV files
# (paths are relative to the notebook's working directory).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('Dataset/AGNews/train.csv', 'Dataset/AGNews/test.csv')
)

# Sanity check: show (rows, columns) of each frame to confirm the load worked.
print("Train data shape:", train_df.shape)
print("Test data shape:", test_df.shape)


Train data shape: (120000, 3)
Test data shape: (7600, 3)


# Clean and preprocess the data

In [300]:
import nltk
import re

# Download the NLTK resources this notebook depends on.
# Stop-word lists:
nltk.download('stopwords')
# Punkt tokenizer models: nltk.tokenize.word_tokenize(), which the
# preprocessing function calls, raises LookupError on a fresh environment
# when these are missing.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

# Define a list of stop words
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MAXFRAME\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [301]:
# Define function for preprocessing text data
def preprocess_text(text):
    """Normalise one raw text field into a space-joined string of cleaned tokens.

    Steps, in order: lower-case the text, tokenize it with NLTK, drop tokens
    found in the module-level ``stop_words``, strip every non-letter character
    from the remaining tokens, discard tokens shorter than 3 characters, and
    join what is left with single spaces.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or a
            float ``NaN`` standing in for a missing cell) is treated as empty
            text.

    Returns:
        The cleaned text as one string; ``''`` when no token survives or the
        input was not a string.
    """
    # Missing values have no .lower(); return empty text instead of raising
    # AttributeError part-way through a column-wide apply().
    if not isinstance(text, str):
        return ''

    # Build a set once per call so each membership test is O(1) rather than a
    # linear scan of the stop-word list for every token.
    stop_set = set(stop_words)

    cleaned = []
    for token in nltk.tokenize.word_tokenize(text.lower()):
        # Remove stop words (checked on the raw token, before symbol stripping)
        if token in stop_set:
            continue
        # Remove symbols and digits
        token = re.sub(r'[^a-zA-Z]+', '', token)
        # Remove words shorter than 3 characters (also drops tokens that the
        # substitution above emptied out)
        if len(token) > 2:
            cleaned.append(token)

    # Join tokens back into a string
    return ' '.join(cleaned)

In [302]:
# Clean both free-text columns of the training frame, element by element,
# overwriting the raw text in place.
train_df['Description'] = train_df['Description'].map(preprocess_text)
train_df['Title'] = train_df['Title'].map(preprocess_text)

In [303]:
# Display the cleaned training titles to eyeball the preprocessing result.
train_df["Title"]

0                        wall bears claw back black reuters
1         carlyle looks toward commercial aerospace reuters
2                  oil economy cloud stocks outlook reuters
3         iraq halts oil exports main southern pipeline ...
4         oil prices soar alltime record posing new mena...
                                ...                        
119995              pakistan musharraf says quit army chief
119996                       renteria signing topshelf deal
119997                             saban going dolphins yet
119998                                      today nfl games
119999                              nets get carter raptors
Name: Title, Length: 120000, dtype: object

In [304]:
# Clean both free-text columns of the test frame, element by element,
# overwriting the raw text in place.
test_df['Description'] = test_df['Description'].map(preprocess_text)
test_df['Title'] = test_df['Title'].map(preprocess_text)

In [305]:
# Work on a small random subsample of each split; random_state=42 pins the draw.
#   frac=0.0085 of the 120000 training rows -> 1020 rows
#   frac=0.0079 of the 7600 test rows       -> 60 rows
# NOTE: despite the X_ prefix, these frames still carry the 'Class Index'
# label column at this point; it is split off in later cells.
X_train = train_df.sample(frac=0.0085, random_state=42)
X_test = test_df.sample(frac=0.0079, random_state=42)

In [306]:
# Confirm the size of the training subsample (rows, columns).
X_train.shape

(1020, 3)

In [307]:
# Confirm the size of the test subsample (rows, columns).
X_test.shape

(60, 3)

In [308]:
# Build the classifier input: cleaned title and description joined by one space.
X_train['Text_classif'] = X_train['Title'] + ' ' + X_train['Description']

In [309]:
# Build the classifier input: cleaned title and description joined by one space.
X_test['Text_classif'] = X_test['Title'] + ' ' + X_test['Description']

In [310]:
# Pull the label column out of the training subsample.
y_train = X_train['Class Index']

In [311]:
# Pull the label column out of the test subsample.
y_test= X_test['Class Index']

In [312]:
# Drop the raw text columns and the label column from the feature frames;
# the labels were already copied into y_train / y_test.
X_train = X_train.drop(columns=['Title', 'Description', 'Class Index'])
X_test = X_test.drop(columns=['Title', 'Description', 'Class Index'])

In [313]:
def filter_data(x, y, positive_label=1, negative_label=2):
  """
  Reduce a multi-class dataset to a two-class problem.

  Keeps only the samples whose label equals `positive_label` or
  `negative_label`, then converts the surviving labels to booleans.

  Args:
    x: Samples. Must support boolean-mask indexing (``x[mask]``), as NumPy
      arrays and pandas objects do.
    y: Labels aligned with `x`. Must support elementwise ``==`` and
      boolean-mask indexing.
    positive_label: Label value mapped to True. Defaults to 1.
    negative_label: Label value mapped to False. Defaults to 2.

  Returns:
    Tuple ``(x, y)``: the retained samples and a boolean label container of
    the same length, True where the original label was `positive_label`.
  """
  # Filter the data using labels
  keep = (y == positive_label) | (y == negative_label)
  x, y = x[keep], y[keep]

  # Convert labels to boolean:
  #   y = True  if y == positive_label
  #   y = False if y == negative_label
  y = y == positive_label
  return x, y

In [314]:
# Filter the train set: keep only rows labelled 1 or 2; the returned labels
# are booleans (True where the label was 1).
X_train_, y_train_ = filter_data(X_train, y_train)

# Filter the test set the same way.
X_test_, y_test_ = filter_data(X_test, y_test)

In [315]:
# filter_data returns boolean labels; cast them to integers (True -> 1, False -> 0).
y_train_ = y_train_.astype(int)
y_test_ = y_test_.astype(int)

In [316]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Convert the preprocessed news text to a dense matrix of TF-IDF features,
# restricted to a 10-term vocabulary (max_features=10).
vectorizer = TfidfVectorizer(max_features=10)
# Learn the vocabulary and IDF weights from the training text only ...
X_train_ = vectorizer.fit_transform(X_train_['Text_classif']).toarray()
# ... and reuse that fitted vectorizer for the test text. Calling
# fit_transform here would rebuild the vocabulary from the test set, so
# column i could stand for a different term in train and test, and test
# statistics would leak into the features.
X_test_ = vectorizer.transform(X_test_['Text_classif']).toarray()

In [317]:
import numpy as np
from sklearn.decomposition import PCA
# Project the vectorised text features onto 8 principal components.
# The projection is fitted on the training matrix only and then applied to
# both splits.
n_components = 8
pca = PCA(n_components=n_components).fit(X_train_)
X_train_, X_test_ = pca.transform(X_train_), pca.transform(X_test_)

In [318]:
# Convert the label containers to plain NumPy arrays.
y_train_ = np.array(y_train_)
y_test_ = np.array(y_test_)

In [319]:
# Reshape the labels into a single column: one row per sample, shape (n, 1).
# The np.array() wrapper is redundant when the previous cell has already run.
y_train_ = np.array(y_train_).reshape(-1, 1)
y_test_ = np.array(y_test_).reshape(-1, 1)

# Define Circuit, simulator, optimizer and cost operator

In [320]:
from qiskit.circuit.library import TwoLocal
from qiskit_machine_learning.circuit.library import RawFeatureVector

# Feature map: a RawFeatureVector sized to the number of feature columns
# in the training matrix.
feature_dim = X_train_.shape[1]
feature_map = RawFeatureVector(feature_dimension=feature_dim)

# Trainable ansatz on as many qubits as the feature map uses: RY and RZ
# rotation layers, CX entanglers in a linear pattern, 3 repetitions, with
# barriers inserted between layers.
var_circuit = TwoLocal(
    num_qubits=feature_map.num_qubits,
    rotation_blocks=['ry', 'rz'],
    entanglement_blocks='cx',
    entanglement='linear',
    reps=3,
    insert_barriers=True,
)

In [321]:
# Step 3: Training the hybrid quantum-classical model
# Define the quantum instance to run the VQC algorithm.
# A StatevectorSimulator backend object is created here and later handed to
# VQC through its `quantum_instance` argument.
quantum_instance = StatevectorSimulator()

In [322]:
# Define the optimizer: COBYLA with its budget capped via maxiter=100.
optimizer = COBYLA(maxiter=100)

In [323]:
# Define an operator from two opflow Z terms combined with `^`
# (presumably the two-qubit tensor product Z (x) Z — TODO confirm).
# NOTE(review): `cost_operator` is not referenced by any later cell visible
# here, and the VQC constructor call does not receive it.
cost_operator = Z ^ Z

# Train model

## VQC model

In [324]:
# Instantiate the VQC algorithm from the feature map, ansatz, optimizer and
# simulator defined in the preceding cells.
# callback=None and initial_point=None are passed explicitly.
# NOTE(review): no random seed is set in the visible cells; if VQC draws a
# random starting point when initial_point is None, results will differ
# between runs — confirm and seed if reproducibility matters.
vqc = VQC(feature_map=feature_map,
          ansatz=var_circuit,
          optimizer=optimizer,
          quantum_instance=quantum_instance,
          callback=None,
          initial_point=None)

In [325]:
# Train the model on the PCA-reduced training features and the (n, 1) label array.
vqc.fit(X_train_, y_train_)

<qiskit_machine_learning.algorithms.classifiers.vqc.VQC at 0x25babf06f10>

In [326]:
# Step 4: Evaluating the classifier
# Score the fitted model on the held-out test features/labels and print the
# result, formatted to two decimals, as the test accuracy.
accuracy = vqc.score(X_test_, y_test_)
print(f"Test accuracy: {accuracy:.2f}")

Test accuracy: 0.50
