In [None]:
!pip install qiskit



In [None]:
!pip install qiskit-aer



In [None]:
!pip install qiskit-machine-learning



In [None]:
# Cell 1: Imports and user-tunable parameters
import os, time
import numpy as np
import pandas as pd
from collections import Counter

# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Qiskit
from qiskit_aer import Aer
from qiskit.circuit.library import ZZFeatureMap
from qiskit_machine_learning.kernels import FidelityStatevectorKernel

# Reproducibility: seed NumPy's global RNG once, up front.
SEED = 42
np.random.seed(SEED)

# ===== Knobs a reader is expected to tune =====
# PCA_COMPONENTS doubles as the qubit count of the feature map.
PCA_COMPONENTS = 12
# Number of repetitions passed to ZZFeatureMap.
QC_REPS = 2
# Total number of classes, counting 'normal' (e.g. normal + the 3 most frequent attacks).
NUM_CLASSES = 4
# Training samples drawn per class; lower it to shorten the run.
PER_CLASS = 400
# How many test samples are evaluated.
TEST_SIZE = 1000
# ==============================================

print(
    "Parameters: PCA_COMPONENTS =", PCA_COMPONENTS,
    "NUM_CLASSES =", NUM_CLASSES,
    "PER_CLASS =", PER_CLASS,
    "TEST_SIZE =", TEST_SIZE,
)

Parameters: PCA_COMPONENTS = 12 NUM_CLASSES = 4 PER_CLASS = 400 TEST_SIZE = 1000


In [None]:
# Cell 2: Load preprocessed PCA arrays if present AND still valid, else preprocess the raw KDD files
# It produces: df_train, df_test, X_train_pca, X_test_pca (and label arrays)
#
# The cache is only trusted when the saved arrays have exactly PCA_COMPONENTS columns and
# as many rows as the saved dataframes; otherwise a changed PCA_COMPONENTS would silently
# keep using arrays computed for the old setting.

# Filenames
TRAIN_RAW = "/bin/KDD_Dataset/KDDTrain+.txt"
TEST_RAW  = "/bin/KDD_Dataset/KDDTest+.txt"
SAVE_XTRAIN = "/bin/KDD_Dataset/X_train_pca.npy"
SAVE_XTEST  = "/bin/KDD_Dataset/X_test_pca.npy"
SAVE_DFTRAIN = "/bin/KDD_Dataset/df_train.pkl"
SAVE_DFTEST  = "/bin/KDD_Dataset/df_test.pkl"

cache_present = all(
    os.path.exists(path) for path in (SAVE_XTRAIN, SAVE_XTEST, SAVE_DFTRAIN, SAVE_DFTEST)
)
cache_valid = False

if cache_present:
    print("Found saved PCA arrays and dataframes — loading them.")
    X_train_pca = np.load(SAVE_XTRAIN)
    X_test_pca  = np.load(SAVE_XTEST)
    # NOTE: unpickling can execute arbitrary code — only load pickles this notebook wrote itself.
    df_train = pd.read_pickle(SAVE_DFTRAIN)
    df_test  = pd.read_pickle(SAVE_DFTEST)
    # Validate the cache against the current settings (ndim is checked first so that
    # shape[1] is never read from a non-2D array).
    cache_valid = (
        X_train_pca.ndim == 2 and X_test_pca.ndim == 2
        and X_train_pca.shape[1] == PCA_COMPONENTS
        and X_test_pca.shape[1] == PCA_COMPONENTS
        and X_train_pca.shape[0] == len(df_train)
        and X_test_pca.shape[0] == len(df_test)
    )
    if cache_valid:
        print("Loaded:", X_train_pca.shape, X_test_pca.shape, "df_train:", df_train.shape)
    else:
        print("Saved arrays do not match PCA_COMPONENTS =", PCA_COMPONENTS,
              "or the saved dataframe row counts — re-running preprocessing from raw NSL-KDD files.")

if not cache_valid:
    # Preprocess from raw files
    if not cache_present:
        print("Saved arrays not found — preprocessing from raw NSL-KDD files. This may take a few minutes.")
    columns = [
        'duration','protocol_type','service','flag','src_bytes','dst_bytes',
        'land','wrong_fragment','urgent','hot','num_failed_logins','logged_in',
        'num_compromised','root_shell','su_attempted','num_root','num_file_creations',
        'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login',
        'count','srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate',
        'same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count',
        'dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate',
        'dst_host_rerror_rate','dst_host_srv_rerror_rate','label','difficulty'
    ]
    # Load raw text files
    df_train = pd.read_csv(TRAIN_RAW, names=columns)
    df_test  = pd.read_csv(TEST_RAW,  names=columns)
    # Clean labels: string type, strip, lowercase, rstrip('.')
    df_train['label'] = df_train['label'].astype(str).str.strip().str.lower().str.rstrip('.')
    df_test['label']  = df_test['label'].astype(str).str.strip().str.lower().str.rstrip('.')
    # Drop difficulty (we won't use it)
    df_train_proc = df_train.drop(columns=['difficulty']).copy()
    df_test_proc  = df_test.drop(columns=['difficulty']).copy()
    # One-hot encode categorical columns (detect objects except label)
    cat_cols = df_train_proc.select_dtypes(include=['object']).columns.tolist()
    cat_cols = [c for c in cat_cols if c != 'label']
    print("Categorical columns detected:", cat_cols)
    df_train_proc = pd.get_dummies(df_train_proc, columns=cat_cols)
    df_test_proc  = pd.get_dummies(df_test_proc,  columns=cat_cols)
    # Align test to train columns: categories unseen in train are dropped,
    # categories missing from test become all-zero columns.
    df_test_proc = df_test_proc.reindex(columns=df_train_proc.columns, fill_value=0)
    # Separate labels and features
    y_train_all = df_train_proc['label'].astype(str).values
    y_test_all  = df_test_proc['label'].astype(str).values
    X_train_df = df_train_proc.drop(columns=['label']).copy()
    X_test_df  = df_test_proc.drop(columns=['label']).copy()
    # Scale (fit on train only, then apply the same transform to test)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_df)
    X_test_scaled  = scaler.transform(X_test_df)
    # PCA (fit on train only)
    pca = PCA(n_components=PCA_COMPONENTS, random_state=SEED)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca  = pca.transform(X_test_scaled)
    # Save for reuse; the pickled dataframes are the originals (cleaned labels, 'difficulty' kept)
    np.save(SAVE_XTRAIN, X_train_pca)
    np.save(SAVE_XTEST, X_test_pca)
    df_train.to_pickle(SAVE_DFTRAIN)
    df_test.to_pickle(SAVE_DFTEST)
    print("Preprocessing complete. PCA shapes:", X_train_pca.shape, X_test_pca.shape)

Found saved PCA arrays and dataframes — loading them.
Loaded: (125973, 12) (22544, 12) df_train: (125973, 43)


In [None]:
# Cell 3: Build the quantum feature map and the FidelityStatevectorKernel
# One qubit per PCA component: the circuit width follows the column count of X_train_pca.
num_qubits = X_train_pca.shape[1]
print("Using", num_qubits, "qubits (PCA components).")

# ZZFeatureMap with linear entanglement, repeated QC_REPS times.
feature_map = ZZFeatureMap(num_qubits, reps=QC_REPS, entanglement='linear')

# Only the feature map is supplied (this version takes no quantum_instance argument).
quantum_kernel = FidelityStatevectorKernel(feature_map=feature_map)

print("Feature map and fidelity kernel initialized.")

Using 12 qubits (PCA components).
Feature map and fidelity kernel initialized.


In [None]:
# Cell 4: Choose the classes ('normal' first when available) and draw a balanced training subset
# Normalised label array taken from df_train
labels_all = df_train['label'].astype(str).str.strip().str.lower().to_numpy()
unique, counts = np.unique(labels_all, return_counts=True)
label_counts = sorted(zip(unique, counts), key=lambda pair: pair[1], reverse=True)
print("Top label counts (train):", label_counts[:10])

# 'normal' leads the list when it occurs; the remaining slots go to the other labels
# in descending frequency. A non-positive NUM_CLASSES selects nothing.
normal_first = ['normal'] if 'normal' in unique else []
others_by_freq = [name for name, _ in label_counts if name != 'normal']
classes = (normal_first + others_by_freq)[:max(NUM_CLASSES, 0)]
print("Selected classes for experiment:", classes)

# Draw at most PER_CLASS row positions per class, without replacement.
# The global RNG is re-seeded so that re-running this cell picks the same rows.
np.random.seed(SEED)
indices_per_class = []
for cls_name in classes:
    members = np.flatnonzero(labels_all == cls_name)
    if members.size == 0:
        raise RuntimeError(f"No samples found for class '{cls_name}'")
    n_take = min(PER_CLASS, members.size)
    indices_per_class.append(np.random.choice(members, n_take, replace=False))

# Rows are grouped class by class, in the order of `classes`.
train_idx = np.concatenate(indices_per_class)
X_train_multi = X_train_pca[train_idx]
y_train_multi = labels_all[train_idx]

print("Training multiclass subset shape:", X_train_multi.shape)
print("Per-class counts:", Counter(y_train_multi))

Top label counts (train): [('normal', np.int64(67343)), ('neptune', np.int64(41214)), ('satan', np.int64(3633)), ('ipsweep', np.int64(3599)), ('portsweep', np.int64(2931)), ('smurf', np.int64(2646)), ('nmap', np.int64(1493)), ('back', np.int64(956)), ('teardrop', np.int64(892)), ('warezclient', np.int64(890))]
Selected classes for experiment: ['normal', 'neptune', 'satan', 'ipsweep']
Training multiclass subset shape: (1600, 12)
Per-class counts: Counter({'normal': 400, 'neptune': 400, 'satan': 400, 'ipsweep': 400})


In [None]:
# Cell 5: Build the training Gram matrix with the quantum kernel, then fit a One-vs-Rest SVM on it

# --- Kernel matrix over the training subset (timed with t0/t1) ---
t0 = time.time()
print("Computing quantum kernel matrix K_train for training subset (may take time)...")
# With a single argument the kernel is evaluated between the training rows themselves.
K_train = quantum_kernel.evaluate(X_train_multi)
t1 = time.time()
print(f"K_train shape: {K_train.shape}  computed in {t1-t0:.1f} s")

# --- SVM fit (timed with t2/t3) ---
# kernel='precomputed' means fit() receives the Gram matrix instead of raw features.
print("Training One-vs-Rest SVM with precomputed kernel...")
t2 = time.time()
base = SVC(kernel='precomputed')
ovr = OneVsRestClassifier(base)
ovr.fit(K_train, y_train_multi)
t3 = time.time()
print(f"Trained One-vs-Rest in {t3-t2:.1f} s")

Computing quantum kernel matrix K_train for training subset (may take time)...
K_train shape: (1600, 1600)  computed in 95.2 s
Training One-vs-Rest SVM with precomputed kernel...
Trained One-vs-Rest in 0.2 s


In [None]:
# Cell 6: Evaluate on the first TEST_SIZE test samples
# The slice is NOT stratified and is NOT filtered to the selected classes, so it can contain
# samples whose true label the classifier was never trained on. Those samples can never be
# predicted correctly; they are counted separately below so the headline accuracy is not
# misread as the classifier's accuracy on the classes it actually knows.
n_test_available = X_test_pca.shape[0]
test_n = min(TEST_SIZE, n_test_available)
X_test_eval = X_test_pca[:test_n]
y_test_labels = df_test['label'].astype(str).str.strip().str.lower().to_numpy()[:test_n]

print("Evaluating on test_n =", test_n, "samples. Classes considered:", classes)

t4 = time.time()
print("Computing K_test (between test samples and training subset)...")
# Rows are test samples, columns are the training-subset samples used to fit the SVM.
K_test = quantum_kernel.evaluate(x_vec=X_test_eval, y_vec=X_train_multi)   # (n_test, n_train)
t5 = time.time()
print(f"K_test shape: {K_test.shape}  computed in {t5-t4:.1f} s")

# Predict and evaluate
y_pred = ovr.predict(K_test)
print("\nConfusion Matrix (rows=actual, cols=predicted) for selected classes:")
print(confusion_matrix(y_test_labels, y_pred, labels=classes))

print("\nClassification Report (for selected classes):")
print(classification_report(y_test_labels, y_pred, labels=classes, zero_division=0))

# Overall accuracy on these test_n samples (labels outside 'classes' will be counted as "other/mismatch")
acc = accuracy_score(y_test_labels, y_pred)
print(f"\nOverall accuracy (on {test_n} test samples): {acc:.4f}")

# Accuracy restricted to samples whose true label is one of the trained classes.
in_scope = np.isin(y_test_labels, classes)
n_in_scope = int(in_scope.sum())
print(f"Test samples with a true label outside the selected classes: {len(y_test_labels) - n_in_scope}")
if n_in_scope > 0:
    acc_in_scope = accuracy_score(y_test_labels[in_scope], y_pred[in_scope])
    print(f"Accuracy on the {n_in_scope} test samples belonging to the selected classes: {acc_in_scope:.4f}")
else:
    # Nothing to score: avoid calling accuracy_score on empty arrays.
    acc_in_scope = float("nan")
    print("No test samples belong to the selected classes; in-scope accuracy is undefined.")

# Print timings summary (t0..t3 come from the training cell)
print("\nTimings (s): K_train =", t1-t0, "train_svm =", t3-t2, "K_test =", t5-t4)


Evaluating on test_n = 1000 samples. Classes considered: ['normal', 'neptune', 'satan', 'ipsweep']
Computing K_test (between test samples and training subset)...
K_test shape: (1000, 1600)  computed in 80.7 s

Confusion Matrix (rows=actual, cols=predicted) for selected classes:
[[449   0   3   0]
 [ 18 196   1   0]
 [ 12   0  21   0]
 [  0   0   0   6]]

Classification Report (for selected classes):
              precision    recall  f1-score   support

      normal       0.59      0.99      0.74       452
     neptune       1.00      0.91      0.95       215
       satan       0.51      0.64      0.57        33
     ipsweep       1.00      1.00      1.00         6

   micro avg       0.67      0.95      0.79       706
   macro avg       0.78      0.89      0.82       706
weighted avg       0.72      0.95      0.80       706


Overall accuracy (on 1000 test samples): 0.6720

Timings (s): K_train = 95.22372436523438 train_svm = 0.21852850914001465 K_test = 80.69538807868958


In [None]:
# Cell 7: Persist the training indices, class list, kernel matrices and predictions
# (no size check is performed here — be cautious if the matrices are large)
save_dir = "quantum_multiclass_results"
os.makedirs(save_dir, exist_ok=True)

# Row positions of the training subset, plus the class names one per line
np.save(os.path.join(save_dir, "train_idx.npy"), train_idx)
classes_path = os.path.join(save_dir, "classes.txt")
with open(classes_path, "w") as fh:
    fh.write("\n".join(classes))

# Kernel matrices, written in the same order as before: train first, then test
for file_name, matrix in (("K_train.npy", K_train), ("K_test.npy", K_test)):
    np.save(os.path.join(save_dir, file_name), matrix)

# True vs. predicted label for every evaluated test sample
predictions_df = pd.DataFrame({"y_test": y_test_labels, "y_pred": y_pred})
predictions_df.to_csv(os.path.join(save_dir, "predictions.csv"), index=False)

print("Saved results to", save_dir)


Saved results to quantum_multiclass_results
