# 1. Data Preprocessing

In [1]:
# Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Read the water-quality CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="ghana_water_quality_data.csv")
df.head(n=5)

Unnamed: 0,Community,Region,Latitude,Longitude,Water Quality,Distance to Nearest River (km),Is Mining Zone,Contamination Level,Contamination Type,Water Source,...,Average Daily Water Needs (liters),Prevalence of Water Borne Diseases,Accessibility,Urban/Rural,Sanitation Facilities Available,Average Household Income (GHS),Education Level (Avg Years),Government Intervention Present,NGO Presence,Year Data Collected
0,Wa port,North East,10.246253,-0.121952,0,10.8,1,7.02,Sediment,Sachet water,...,742000,0.3,Boat access only,Urban,0,1168,4.9,0,1,2018
1,Bawku view,Western,5.172394,-2.512235,1,2.4,0,0.62,,Pipe-borne,...,92870,0.261,Dirt road,Rural,0,4185,3.6,1,0,2015
2,Hohoe stad,Upper West,10.199718,-2.293461,1,1.1,0,1.72,,Dam,...,527119,0.104,Boat access only,Rural,1,4870,7.6,0,0,2016
3,Madina ville,North East,10.662465,-0.774931,1,19.3,0,0.4,,Stream,...,168816,0.276,Footpath,Urban,1,3841,9.2,0,0,2023
4,Obuasi stad,Savannah,9.022542,-1.752403,1,17.9,0,1.61,,Rainwater,...,243015,0.276,Asphalt road,Rural,1,4245,6.4,0,0,2021


In [3]:
# Per-column overview: column name, dtype, and the value stored in the first row
summary = pd.DataFrame(
    data={
        "Field": df.columns,
        "Type": df.dtypes.to_numpy(),
        "Example Datapoint": df.iloc[0].to_numpy(),
    }
)

# Render the overview as plain text without the row index
print(summary.to_string(index=False))

                             Field    Type Example Datapoint
                         Community  object           Wa port
                            Region  object        North East
                          Latitude float64         10.246253
                         Longitude float64         -0.121952
                     Water Quality   int64                 0
    Distance to Nearest River (km) float64              10.8
                    Is Mining Zone   int64                 1
               Contamination Level float64              7.02
                Contamination Type  object          Sediment
                      Water Source  object      Sachet water
                Water Access Score float64               1.6
                Number of Children   int64             12119
                        Population   int64             37100
Average Daily Water Needs (liters)   int64            742000
Prevalence of Water Borne Diseases float64               0.3
                     Acc

In [4]:
# Remove the columns that are not used as model features, then preview the result
columns_to_drop = ["Community", "Latitude", "Longitude", "Year Data Collected"]

df = df.drop(labels=columns_to_drop, axis="columns")
df.head(n=5)

Unnamed: 0,Region,Water Quality,Distance to Nearest River (km),Is Mining Zone,Contamination Level,Contamination Type,Water Source,Water Access Score,Number of Children,Population,Average Daily Water Needs (liters),Prevalence of Water Borne Diseases,Accessibility,Urban/Rural,Sanitation Facilities Available,Average Household Income (GHS),Education Level (Avg Years),Government Intervention Present,NGO Presence
0,North East,0,10.8,1,7.02,Sediment,Sachet water,1.6,12119,37100,742000,0.3,Boat access only,Urban,0,1168,4.9,0,1
1,Western,1,2.4,0,0.62,,Pipe-borne,4.9,5059,18574,92870,0.261,Dirt road,Rural,0,4185,3.6,1,0
2,Upper West,1,1.1,0,1.72,,Dam,6.2,11396,31007,527119,0.104,Boat access only,Rural,1,4870,7.6,0,0
3,North East,1,19.3,0,0.4,,Stream,7.0,4423,14068,168816,0.276,Footpath,Urban,1,3841,9.2,0,0
4,Savannah,1,17.9,0,1.61,,Rainwater,5.8,4332,14295,243015,0.276,Asphalt road,Rural,1,4245,6.4,0,0


In [5]:
# Integer-encode the text-valued categorical columns.
# Most of these columns hold more than two categories, so they are not binary flags.
# One encoder is fitted per column and kept in `label_encoders`, so the integer codes
# can be mapped back to the original labels later with
# `label_encoders[column].inverse_transform(...)`.
# NOTE(review): "Contamination Type" appears to contain missing values; they end up
# encoded as a class of their own here — confirm that this is intended.
categorical_text_columns = [
    "Region",
    "Contamination Type",
    "Water Source",
    "Accessibility",
    "Urban/Rural",
]

label_encoders = {}
for column in categorical_text_columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Keep `le` bound to the most recently fitted encoder ("Urban/Rural"), as before
le = label_encoders["Urban/Rural"]

df.head()

Unnamed: 0,Region,Water Quality,Distance to Nearest River (km),Is Mining Zone,Contamination Level,Contamination Type,Water Source,Water Access Score,Number of Children,Population,Average Daily Water Needs (liters),Prevalence of Water Borne Diseases,Accessibility,Urban/Rural,Sanitation Facilities Available,Average Household Income (GHS),Education Level (Avg Years),Government Intervention Present,NGO Presence
0,7,0,10.8,1,7.02,7,6,1.6,12119,37100,742000,0.3,1,1,0,1168,4.9,0,1
1,14,1,2.4,0,0.62,9,3,4.9,5059,18574,92870,0.261,2,0,0,4185,3.6,1,0
2,12,1,1.1,0,1.72,9,1,6.2,11396,31007,527119,0.104,1,0,1,4870,7.6,0,0
3,7,1,19.3,0,0.4,9,7,7.0,4423,14068,168816,0.276,3,1,1,3841,9.2,0,0
4,10,1,17.9,0,1.61,9,4,5.8,4332,14295,243015,0.276,0,0,1,4245,6.4,0,0


In [6]:
# Checking the datatypes after encoding: the label-encoded columns should now be integer-typed
df.dtypes

Unnamed: 0,0
Region,int64
Water Quality,int64
Distance to Nearest River (km),float64
Is Mining Zone,int64
Contamination Level,float64
Contamination Type,int64
Water Source,int64
Water Access Score,float64
Number of Children,int64
Population,int64


In [7]:
# Cast the integer-coded columns (the "Water Quality" target included) to pandas "category" dtype
categorical_columns = [
    "Region",
    "Water Quality",
    "Is Mining Zone",
    "Contamination Type",
    "Water Source",
    "Urban/Rural",
    "Accessibility",
    "Government Intervention Present",
    "NGO Presence",
    "Sanitation Facilities Available",
]
for column_name in categorical_columns:
    df[column_name] = df[column_name].astype("category")

# Sanity check: the columns listed above should now report `category`
df.dtypes

Unnamed: 0,0
Region,category
Water Quality,category
Distance to Nearest River (km),float64
Is Mining Zone,category
Contamination Level,float64
Contamination Type,category
Water Source,category
Water Access Score,float64
Number of Children,int64
Population,int64


In [8]:
# Pull out the target column, then keep every other column as a feature
y = df["Water Quality"]
X = df.drop(columns=["Water Quality"])

In [9]:
# Standardise the features; the scaler's mean and variance are computed from every row of X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [10]:
# Splitting the data into training and testing data.
# The split is made on the unscaled feature frame and the scaler is then fitted on the
# training rows only, so no statistics from the held-out rows leak into the training
# features. The same fitted scaler is applied to the test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# 2. Classical SVM

In [11]:
from sklearn.svm import SVC

# Baseline: RBF-kernel SVM fitted on the training split, then scored on the test split
svm = SVC(C=1.0, kernel="rbf", gamma="scale").fit(X_train, y_train)
svm.score(X_test, y_test)

0.77

# 3. Quantum SVC (QSVC)

In [12]:
# Installing PennyLane

!pip install PennyLane

Collecting PennyLane
  Downloading PennyLane-0.41.1-py3-none-any.whl.metadata (10 kB)
Collecting rustworkx>=0.14.0 (from PennyLane)
  Downloading rustworkx-0.16.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting appdirs (from PennyLane)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting autoray>=0.6.11 (from PennyLane)
  Downloading autoray-0.7.1-py3-none-any.whl.metadata (5.8 kB)
Collecting pennylane-lightning>=0.41 (from PennyLane)
  Downloading pennylane_lightning-0.41.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting diastatic-malt (from PennyLane)
  Downloading diastatic_malt-2.15.2-py3-none-any.whl.metadata (2.6 kB)
Collecting scipy-openblas32>=0.3.26 (from pennylane-lightning>=0.41->PennyLane)
  Downloading scipy_openblas32-0.3.30.0.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 kB[0m [31m2.4 MB/s

In [15]:
import pennylane as qml
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, average_precision_score
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Configuration: number of PCA components kept, and a simulator with the same number of wires
N_COMPONENTS = 4
N_QUBITS = N_COMPONENTS
dev = qml.device("default.qubit", wires=N_QUBITS)

# Quantum feature map: angle-embed one sample and expose the resulting state
@qml.qnode(dev)
def feature_map(x):
    """Embed ``x`` with ``AngleEmbedding`` on wires ``0..N_QUBITS-1`` and return the state.

    Parameters
    ----------
    x : array-like
        Feature values handed to the embedding template.
        NOTE(review): presumably one value per wire, already scaled to rotation
        angles — confirm against the callers.

    Returns
    -------
    The value of ``qml.state()`` after the embedding has been applied.
    """
    embedding_wires = range(N_QUBITS)
    qml.templates.AngleEmbedding(features=x, wires=embedding_wires)
    return qml.state()

# Fidelity kernel computed from the embedded state vectors
def quantum_kernel_matrix(X1, X2, state_fn=None):
    """Return the fidelity kernel matrix between the samples of ``X1`` and ``X2``.

    Entry ``(i, j)`` is ``|<psi(X1[i]) | psi(X2[j])>|**2``, where ``psi(x)`` is the
    state vector returned by ``state_fn(x)``. Every sample is embedded exactly once
    and all pairwise overlaps are then obtained with a single matrix product.

    Parameters
    ----------
    X1 : sequence of samples
        Samples indexing the rows of the result.
    X2 : sequence of samples
        Samples indexing the columns of the result.
    state_fn : callable, optional
        Maps one sample to its state vector. Defaults to the module-level
        ``feature_map`` QNode.

    Returns
    -------
    numpy.ndarray
        Real-valued array of shape ``(len(X1), len(X2))``.
    """
    if state_fn is None:
        state_fn = feature_map

    def _embed_all(samples):
        # One circuit evaluation per sample; rows are state vectors
        return np.array([np.asarray(state_fn(sample)) for sample in samples], dtype=complex)

    states_1 = _embed_all(X1)
    # Reuse the embeddings when the same object is passed twice (training kernel)
    states_2 = states_1 if X2 is X1 else _embed_all(X2)

    # <psi_i | psi_j> for every pair, then squared magnitude
    overlaps = states_1.conj() @ states_2.T
    return np.abs(overlaps) ** 2

# PCA projection followed by min-max rescaling to rotation angles
def apply_pca(X_train, X_test, n_components=N_COMPONENTS):
    """Project both splits onto principal components and rescale them towards ``[0, pi]``.

    The PCA and the min-max scaler are each fitted on the training split and then
    applied unchanged to the test split.

    Parameters
    ----------
    X_train : array-like
        Training features; used to fit both transformers.
    X_test : array-like
        Test features; only transformed.
    n_components : int, optional
        Number of principal components to keep. Defaults to ``N_COMPONENTS``.

    Returns
    -------
    tuple
        ``(train, test)`` arrays after PCA and rescaling.
    """
    reducer = PCA(n_components=n_components)
    train_components = reducer.fit_transform(X_train)
    test_components = reducer.transform(X_test)

    # The scaler's range is learned from the training components only
    angle_scaler = MinMaxScaler(feature_range=(0, np.pi)).fit(train_components)
    return angle_scaler.transform(train_components), angle_scaler.transform(test_components)

# Evaluation function
def evaluate_model(y_true, y_pred, y_score=None):
    """Print classification metrics and plot the ROC and precision-recall curves.

    Accuracy, F1, precision and recall are computed from the hard predictions.
    AUROC, AUPRC and both curves are computed from ``y_score`` when it is given,
    because they rank samples by a continuous score; with hard labels only a single
    threshold is available. When ``y_score`` is omitted the hard predictions are
    used instead, which matches the earlier two-argument behaviour.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_score : array-like, optional
        Continuous scores for the positive class (for example the output of a
        classifier's ``decision_function``).

    Returns
    -------
    None
        Metrics are printed and the figure is shown.
    """
    ranking_input = y_pred if y_score is None else y_score

    metrics = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'AUROC': roc_auc_score(y_true, ranking_input),
        'AUPRC': average_precision_score(y_true, ranking_input)
    }
    for name, value in metrics.items():
        print(f'{name}: {value:.4f}')

    # Plot curves side by side
    fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # ROC
    fpr, tpr, _ = roc_curve(y_true, ranking_input)
    roc_ax.plot(fpr, tpr, label=f'AUROC = {metrics["AUROC"]:.4f}')
    roc_ax.plot([0, 1], [0, 1], '--', label='Random')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('ROC Curve')
    roc_ax.legend()

    # PR
    precision_vals, recall_vals, _ = precision_recall_curve(y_true, ranking_input)
    pr_ax.plot(recall_vals, precision_vals, label=f'AUPRC = {metrics["AUPRC"]:.4f}')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title('Precision-Recall Curve')
    pr_ax.legend()

    fig.tight_layout()
    plt.show()

# Main QSVC workflow
def run_qsvc(X_train, X_test, y_train, y_test):
    """Train and evaluate an SVM on a precomputed quantum kernel.

    Steps: reduce and rescale the features with ``apply_pca``, build the training
    kernel with ``quantum_kernel_matrix``, fit ``SVC(kernel='precomputed')``, build
    the test-versus-train kernel, predict, and report metrics with
    ``evaluate_model``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the two splits.
    y_train, y_test : array-like
        Labels for the two splits.

    Returns
    -------
    None
        Progress messages and metrics are printed; the curves are plotted.
    """
    print("Reducing dimensionality with PCA...")
    X_train_scaled, X_test_scaled = apply_pca(X_train, X_test)

    print("Computing quantum kernel matrix (training)...")
    K_train = quantum_kernel_matrix(X_train_scaled, X_train_scaled)

    print("Training SVM...")
    clf = SVC(kernel='precomputed')
    clf.fit(K_train, y_train)

    print("Computing quantum kernel matrix (testing)...")
    # Rows are test samples, columns are training samples
    K_test = quantum_kernel_matrix(X_test_scaled, X_train_scaled)

    print("Predicting...")
    y_pred = clf.predict(K_test)
    # Continuous margins, so the ranking metrics and curves use more than one threshold
    y_score = clf.decision_function(K_test)

    print("Evaluating model...")
    evaluate_model(y_test, y_pred, y_score=y_score)