# Spectral Clustering Experiment Notebook

This notebook runs spectral clustering experiments using both SimKit (via Neo4j) and scikit-learn.

We use a simple dataset stored in `datasets/points.csv` with these columns:

| id  | x_coordinate | y_coordinate | class |
|-----|--------------|--------------|-------|
| p1  | 1            | 7            | 1     |
| p2  | 1            | 6            | 1     |
| p3  | 6            | 2            | 2     |
| p4  | 8            | 1            | 2     |
| p5  | 10           | 2            | 2     |

Make sure the CSV file is present in the `datasets/` folder.

In [1]:
import os
import subprocess
import sys
import threading
import time
import psutil
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
from scipy.spatial.distance import pdist, squareform
from neo4j import GraphDatabase

# Scikit-learn imports
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.cluster import KMeans
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics.pairwise import euclidean_distances
from scipy.sparse.csgraph import laplacian as csgraph_laplacian
from scipy.sparse.linalg import eigsh

# Visualization imports
import glob
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure required packages are installed
required_packages = ["neo4j", "pandas", "psutil", "tqdm", "scikit-learn", "scipy", "matplotlib", "seaborn"]
for package in required_packages:
    try:
        __import__(package)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Verify imports
print("All required packages are imported successfully.")

neo4j is already installed.
pandas is already installed.
psutil is already installed.
tqdm is already installed.
Installing scikit-learn...
scipy is already installed.
matplotlib is already installed.
seaborn is already installed.
All required packages are imported successfully.



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Neo4j connection details
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "123412345"

# Initialize Neo4j driver
#driver.close()
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Test connection
try:
    with driver.session() as session:
        result = session.run("RETURN 1")
        print("Neo4j connection successful:", result.single()[0])
except Exception as e:
    print(f"Neo4j connection failed: {e}")

Neo4j connection successful: 1


In [3]:
def check_symmetric(matrix, tol=1e-8):
    return np.allclose(matrix, matrix.T, atol=tol)

def spectral_clustering(dataframe, similarity_graph, laplacian, number_of_clusters, eps=None, k=None):
    dimension = dataframe.shape[0]
    dist_mat = squareform(pdist(dataframe))
    sample_size = len(dist_mat)
    n = min(sample_size // 10, int(math.log(sample_size)))
    epsilon = eps if eps else np.percentile(dist_mat, 90)
    k = k if k else int(np.sqrt(sample_size))
    
    if similarity_graph == "full":
        sigmas = np.zeros(dimension)
        for i in tqdm(range(len(dist_mat)), desc="Calculating sigmas"):
            sigmas[i] = sorted(dist_mat[i])[n]
        adjacency_matrix = np.zeros([dimension, dimension])
        for i in tqdm(range(dimension), desc="Building full affinity"):
            for j in range(i+1, dimension):
                d = np.exp(-1 * dist_mat[i, j]**2 / (sigmas[i] * sigmas[j]))
                adjacency_matrix[i, j] = d
                adjacency_matrix[j, i] = d
    elif similarity_graph == "eps":
        adjacency_matrix = np.zeros([dimension, dimension])
        for i in tqdm(range(dimension), desc="Building eps affinity"):
            for j in range(i+1, dimension):
                d = 1 if dist_mat[i, j] < epsilon else 0
                adjacency_matrix[i, j] = d
                adjacency_matrix[j, i] = d
    elif similarity_graph == "knn":
        adjacency_matrix = np.zeros([dimension, dimension])
        for i in tqdm(range(dimension), desc="Building knn affinity"):
            sorted_indices = np.argsort(dist_mat[i])
            k_nearest_indices = sorted_indices[1:k+1]
            adjacency_matrix[i, k_nearest_indices] = 1
    else:  # mknn
        adjacency_matrix = np.zeros([dimension, dimension])
        for i in tqdm(range(dimension), desc="Building mknn affinity"):
            sorted_indices = np.argsort(dist_mat[i])
            k_nearest_indices = sorted_indices[1:k+1]
            for neighbor in k_nearest_indices:
                neighbor_sorted_indices = np.argsort(dist_mat[neighbor])
                if i in neighbor_sorted_indices[1:k+1]:
                    adjacency_matrix[i, neighbor] = 1
                    adjacency_matrix[neighbor, i] = 1

    degrees = np.sum(adjacency_matrix, axis=1)
    degree_matrix = np.diag(degrees)

    if laplacian == "sym":
        d_inv_sqrt = np.zeros_like(degrees)
        nonzero = degrees > 0
        d_inv_sqrt[nonzero] = 1.0 / np.sqrt(degrees[nonzero])
        d_half = np.diag(d_inv_sqrt)
        laplacian_matrix_normalized = d_half @ adjacency_matrix @ d_half
    elif laplacian == "rw":
        d_inv = np.zeros_like(degrees)
        nonzero = degrees > 0
        d_inv[nonzero] = 1.0 / degrees[nonzero]
        d_inverse = np.diag(d_inv)
        laplacian_matrix_normalized = d_inverse @ adjacency_matrix
    else:
        raise ValueError("Unsupported laplacian type. Only 'sym' and 'rw' are allowed.")

    if check_symmetric(laplacian_matrix_normalized):
        e, v = np.linalg.eigh(laplacian_matrix_normalized)
    else:
        e, v = np.linalg.eig(laplacian_matrix_normalized)
        idx = np.argsort(np.real(e))
        e = np.real(e[idx])
        v = np.real(v[:, idx])
    
    eigengap = np.diff(e)
    optimal_number_of_clusters = np.argmax(eigengap[:10]) + 1

    if number_of_clusters == "fixed2":
        current_k = 2
    elif number_of_clusters == "fixed3":
        current_k = 3
    else:
        current_k = max(optimal_number_of_clusters, 2)

    X = v[:, -current_k:]
    clustering = KMeans(n_clusters=current_k, random_state=42, n_init=100)
    cluster_labels = clustering.fit_predict(X)

    sil_score = silhouette_score(dataframe, cluster_labels)
    return [(current_k, cluster_labels, sil_score)]

# Test the function with dummy data
dummy_data = pd.DataFrame(np.random.rand(10, 2), columns=['x', 'y'])
result = spectral_clustering(dummy_data, "full", "sym", 2)
print("Test result:", result)

Calculating sigmas: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 113359.57it/s]
Building full affinity: 100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 93414.34it/s]


Test result: [(np.int64(7), array([3, 4, 0, 0, 5, 1, 1, 2, 2, 6], dtype=int32), np.float64(0.19888725879400854))]


In [4]:
def ensure_indexes(driver, datasets):
    try:
        with driver.session() as session:
            existing_indexes = session.run("SHOW INDEXES")
            existing_index_names = {record["name"] for record in existing_indexes}
            for dataset, params in datasets.items():
                node_label = params.get("label") or params.get("node_label")
                index_id_name = f"{node_label}_id_index"
                index_label_name = f"{node_label}_label_index"
                if index_id_name not in existing_index_names:
                    session.run(f"CREATE INDEX {index_id_name} FOR (n:{node_label}) ON (n.id);")
                if index_label_name not in existing_index_names:
                    session.run(f"CREATE INDEX {index_label_name} FOR (n:{node_label}) ON (n.label);")
        print("✅ Indexes ensured for all datasets.")
    except Exception as e:
        print(f"⚠️ Error creating indexes: {e}")

def delete_all_nodes(driver, batch_size=1000):
    try:
        with driver.session() as session:
            while True:
                result = session.run(f"MATCH (n) WITH n LIMIT {batch_size} DETACH DELETE n RETURN count(n) AS deleted_count")
                deleted_count = result.single()["deleted_count"]
                print(f"Deleted {deleted_count} nodes.")
                if deleted_count == 0:
                    break
    except Exception as e:
        print("Error during node deletion:", e)

def delete_all_indexes(driver, batch_size=5):
    try:
        with driver.session() as session:
            while True:
                indexes = session.run("CALL db.indexes()")
                index_names = [index["name"] for index in indexes]
                if not index_names:
                    break
                for index_name in index_names[:batch_size]:
                    session.run(f"DROP INDEX {index_name}")
                    print(f"Dropped index: {index_name}")
                if len(index_names) <= batch_size:
                    break
    except Exception as e:
        print("Error during index deletion:", e)

# Test deletion
delete_all_nodes(driver)
delete_all_indexes(driver)

Deleted 451 nodes.
Deleted 0 nodes.
Dropped index: IrisNode_id_index
Dropped index: IrisNode_label_index
Dropped index: affinity_full_11_IrisNode_Index_idx
Dropped index: eigen_sym_3_affinity_full_11_IrisNode_Index_idx


In [5]:
def create_feature_nodes(data, driver, label):
    try:
        with driver.session() as session:
            for _, row in data.iterrows():
                properties = { (f"feature_{key}" if str(key).isdigit() else key): value
                               for key, value in row.items() }
                query = f"CREATE (n:{label} {{" + ', '.join([f"{key}: ${key}" for key in properties.keys()]) + "})"
                session.run(query, **properties)
        print(f"Created feature nodes with label {label}")
    except Exception as e:
        print("Error during node creation:", e)

def create_graph_nodes(data, driver, label):
    try:
        node_data = pd.read_csv(data)
        with driver.session() as session:
            for _, row in node_data.iterrows():
                properties = row.to_dict()
                properties['features'] = eval(properties['features'])
                query = f"CREATE (n:{label} {{id: $id, features: $features, label: $label}})"
                session.run(query, **properties)
        print(f"Created graph nodes with label {label}")
    except Exception as e:
        print("Error during node creation:", e)

def create_edges(data, driver, node_label, edge_label):
    try:
        edge_data = pd.read_csv(data)
        with driver.session() as session:
            for _, row in edge_data.iterrows():
                source_id = min(row['source_id'], row['target_id'])
                target_id = max(row['source_id'], row['target_id'])
                query = f"""
                MATCH (source:{node_label} {{id: $source_id}})
                MATCH (target:{node_label} {{id: $target_id}})
                MERGE (source)-[:{edge_label} {{value: 1}}]->(target)
                """
                session.run(query, {"source_id": source_id, "target_id": target_id})
        print(f"Created edges with label {edge_label}")
    except Exception as e:
        print("Error during edge creation:", e)

In [6]:
def run_query(driver, query, parameters):
    process = psutil.Process(os.getpid())
    start_time = time.time()
    start_cpu_times = process.cpu_times()
    start_mem = process.memory_info().rss
    with driver.session() as session:
        print("Parameters:")
        print(parameters)
        result = session.run(query, parameters)
        record = result.single()
        data = record.data() if record else None
    end_time = time.time()
    end_cpu_times = process.cpu_times()
    end_mem = process.memory_info().rss
    duration = end_time - start_time
    cpu_used = (end_cpu_times.user + end_cpu_times.system) - (start_cpu_times.user + start_cpu_times.system)
    memory_used = (end_mem - start_mem) / (1024 ** 2)
    return data, duration, memory_used, cpu_used
#local_driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
def monitor_progress():
    #local_driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    while True:
        with driver.session() as session:
            query = "MATCH (p:Progress {id: 'current'}) RETURN p.step ORDER BY p.timestamp DESC LIMIT 1"
            result = session.run(query)
            record = result.single()
            data = record.data() if record else None
        if data:
            try:
                print(f"🔄 Current Step: {data['p.step']}", flush=True)
            except Exception as e:
                print(f"Error: {e}, Result: {data}")
        time.sleep(2)

# Start monitoring thread
#monitor_thread = threading.Thread(target=monitor_progress, daemon=True)
#monitor_thread.start()
#print("Monitoring thread started.")

In [7]:
def run_sklearn_experiment_feature(config, file_path):
    df = pd.read_csv(file_path)
    cols_to_remove = [col.strip() for col in config["remove_columns"].split(',')]
    features = df.drop(columns=cols_to_remove, errors='ignore')
    true_labels = df[config["target_column"]].values

    process = psutil.Process(os.getpid())
    start_time = time.time()
    start_cpu = process.cpu_times()
    start_mem = process.memory_info().rss

    eps_val = float(config["parameter"]) if config["graph_type"] == "eps" else None
    k_val = int(config["parameter"]) if config["graph_type"] in ["knn", "mknn"] else None

    clustering_result = spectral_clustering(features, config["graph_type"], config["laplacian_type"],
                                            config["number_of_eigenvectors"], eps=eps_val, k=k_val)
    current_k, cluster_labels, sil_score = clustering_result[0]

    clustering_time = time.time() - start_time
    skl_silhouette = sil_score
    skl_rand_index = adjusted_rand_score(true_labels, cluster_labels)
    end_cpu = process.cpu_times()
    cpu_used = (end_cpu.user + end_cpu.system) - (start_cpu.user + start_cpu.system)
    end_mem = process.memory_info().rss
    memory_used = (end_mem - start_mem) / (1024 ** 2)
    return {
        "sklearn_silhouette_score": skl_silhouette,
        "sklearn_rand_index": skl_rand_index,
        "sklearn_total_time": clustering_time,
        "sklearn_memory_used": memory_used,
        "sklearn_cpu_used": cpu_used
    }

def run_sklearn_experiment_graph(config, node_file_path, edge_file_path):
    nodes_df = pd.read_csv(node_file_path)
    true_labels = nodes_df[config["target_column"]].values
    features = nodes_df.drop(columns=[col.strip() for col in config["remove_columns"].split(',')], errors='ignore')
    if "features" in features.columns:
        features = np.array(features["features"].apply(lambda x: eval(x) if isinstance(x, str) else x).tolist())
    else:
        features = features.values.astype(float)

    process = psutil.Process(os.getpid())
    start_time = time.time()
    start_cpu = process.cpu_times()
    start_mem = process.memory_info().rss

    clustering_result = spectral_clustering(features, config["graph_type"], config["laplacian_type"],
                                            config["number_of_eigenvectors"])
    current_k, cluster_labels, sil_score = clustering_result[0]

    clustering_time = time.time() - start_time
    skl_silhouette = sil_score
    skl_rand_index = adjusted_rand_score(true_labels, cluster_labels)
    end_cpu = process.cpu_times()
    cpu_used = (end_cpu.user + end_cpu.system) - (start_cpu.user + start_cpu.system)
    end_mem = process.memory_info().rss
    memory_used = (end_mem - start_mem) / (1024 ** 2)
    return {
        "sklearn_silhouette_score": skl_silhouette,
        "sklearn_rand_index": skl_rand_index,
        "sklearn_total_time": clustering_time,
        "sklearn_memory_used": memory_used,
        "sklearn_cpu_used": cpu_used
    }

In [12]:
def run_experiments(driver, experiments):
    print("Initializing SimKit...")
    try:
        with driver.session() as session:
            session.run("RETURN simkit.initSimKit('bolt://localhost:7687', 'neo4j', '123412345')")
        print("SimKit initialized.")
    except Exception as e:
        print(f"Error initializing SimKit: {e}")
        return []
    
    results = []
    for idx, config in enumerate(tqdm(experiments, desc="Running experiments")):
        print("Config:", config)
        query = """
        WITH simkit.experimental_spectralClustering({
            node_label: $node_label,
            is_feature_based: $is_feature_based,
            distance_measure: "euclidean",
            graph_type: $graph_type,
            parameter: $parameter,
            remove_columns: $remove_columns,
            laplacian_type: $laplacian_type,
            number_of_eigenvectors: $number_of_eigenvectors,
            number_of_iterations: 100,
            distance_measure_kmean: "euclidean",
            target_column: $target_column,
            use_kmean_for_silhouette: $use_kmean_for_silhouette,
            seed: 42 
        }) AS result
        RETURN result.silhouette_score AS silhouette_score, 
               result.rand_index AS rand_index,
               result.total_time AS total_time
        """
        print(query)
        data, duration, memory_used, cpu_used = run_query(driver, query, config)
        simkit_result = {
            "silhouette_score": data['silhouette_score'] if data else None,
            "rand_index": data['rand_index'] if data else None,
            "total_time": data['total_time'] if data else duration,
            "memory_used": memory_used,
            "cpu_used": cpu_used
        }
        
        if config.get("is_feature_based"):
            file_path = os.path.join("datasets", f"points.csv")
            sklearn_result = run_sklearn_experiment_feature(config, file_path)
        else:
            node_file_path = os.path.join("datasets", f"{config['node_label'].replace('Node','').lower()}_nodes.csv")
            edge_file_path = os.path.join("datasets", f"{config['node_label'].replace('Node','').lower()}_edges.csv")
            sklearn_result = run_sklearn_experiment_graph(config, node_file_path, edge_file_path)
        
        results.append({**config, **simkit_result, **sklearn_result})
        print(f"Completed experiment {idx+1}/{len(experiments)}")
    return results

def save_results(results, dataset):
    df = pd.DataFrame(results)
    results_dir = "results"
    os.makedirs(results_dir, exist_ok=True)
    df.to_csv(os.path.join(results_dir, f"{dataset}_results.csv"), index=False)
    print(f"Results saved to {dataset}_results.csv")

In [13]:
def run_feature_experiment(dataset, label, remove_columns, number_of_eigenvectors, target_column):
    delete_all_nodes(driver)
    delete_all_indexes(driver)
    ensure_indexes(driver, {dataset: {"label": label}})
    file_path = os.path.join("datasets", f"{dataset}.csv")
    data = pd.read_csv(file_path)
    create_feature_nodes(data, driver, label)
    
    experiments = []
    laplacian_types = ["sym", "rw"]
    graph_types = ["full", "eps", "knn", "mknn"]
    parameters = {"iris": {"full": "11", "eps": "1.111", "knn": "10", "mknn": "30"},
                  "madelon": {"full": "45", "eps": "4.669", "knn": "419", "mknn": "117"},
                  "20newsgroups": {"full": "35", "eps": "1946.74", "knn": "512", "mknn": "26"}}
    for graph_type in graph_types:
        for laplacian_type in laplacian_types:
            experiments.append({
                "node_label": label,
                "is_feature_based": True,
                "graph_type": graph_type,
                "parameter": parameters[dataset][graph_type],
                "remove_columns": remove_columns,
                "laplacian_type": laplacian_type,
                "number_of_eigenvectors": number_of_eigenvectors,
                "target_column": target_column,
                "use_kmean_for_silhouette": False
            })
    results = run_experiments(driver, experiments)
    save_results(results, dataset)

def run_graph_experiment(dataset, node_label, edge_label, remove_columns, number_of_eigenvectors, target_column):
    delete_all_nodes(driver)
    delete_all_indexes(driver)
    ensure_indexes(driver, {dataset: {"node_label": node_label}})
    node_file_path = os.path.join("datasets", f"{dataset}_nodes.csv")
    edge_file_path = os.path.join("datasets", f"{dataset}_edges.csv")
    create_graph_nodes(node_file_path, driver, node_label)
    create_edges(edge_file_path, driver, node_label, edge_label)
    
    experiments = []
    laplacian_types = ["sym", "rw"]
    for laplacian_type in laplacian_types:
        experiments.append({
            "node_label": node_label,
            "is_feature_based": False,
            "graph_type": "full",
            "parameter": "3",
            "remove_columns": remove_columns,
            "laplacian_type": laplacian_type,
            "number_of_eigenvectors": number_of_eigenvectors,
            "target_column": target_column,
            "use_kmean_for_silhouette": True
        })
        print({
            "node_label": node_label,
            "is_feature_based": False,
            "graph_type": "full",
            "parameter": "3",
            "remove_columns": remove_columns,
            "laplacian_type": laplacian_type,
            "number_of_eigenvectors": number_of_eigenvectors,
            "target_column": target_column,
            "use_kmean_for_silhouette": True
        })
    results = run_experiments(driver, experiments)
    save_results(results, dataset)

In [14]:
# Define the dataset based on points.csv
feature_datasets = {
    "points": {
        "label": "PointNode",
        "remove_columns": "id,class",
        "number_of_eigenvectors": 2,  # Integer
        "target_column": "class"
    }
}

def run_feature_experiment(dataset, label, remove_columns, number_of_eigenvectors, target_column):
    delete_all_nodes(driver)
    delete_all_indexes(driver)
    ensure_indexes(driver, {dataset: {"label": label}})
    file_path = os.path.join("datasets", f"{dataset}.csv")
    try:
        data = pd.read_csv(file_path)
        print("Dataset loaded successfully:")
        print(data.head())
    except FileNotFoundError:
        print(f"Error: {file_path} not found. Please ensure it exists in the datasets/ folder.")
        return
    
    create_feature_nodes(data, driver, label)
    
    experiments = []
    laplacian_types = ["sym", "rw"]
    graph_types = ["full", "eps", "knn"]
    default_parameters = {
        "full": "5",    # String
        "eps": "2.0",   # String
        "knn": "2"      # String
    }
    
    for graph_type in graph_types:
        for laplacian_type in laplacian_types:
            experiments.append({
                "node_label": label,
                "is_feature_based": True,
                "graph_type": graph_type,
                "parameter": default_parameters[graph_type],
                "remove_columns": remove_columns,
                "laplacian_type": laplacian_type,
                "number_of_eigenvectors": number_of_eigenvectors,  # Integer
                "target_column": target_column,
                "use_kmean_for_silhouette": False
            })
    
    results = run_experiments(driver, experiments)
    save_results(results, dataset)
    print(f"Experiment completed for {dataset}")

run_feature_experiment("points", **feature_datasets["points"])

Deleted 27 nodes.
Deleted 0 nodes.
Dropped index: PointNode_id_index
Dropped index: PointNode_label_index
Dropped index: affinity_full_5_PointNode_id_idx
Dropped index: eigen_sym_2_affinity_full_5_PointNode_id_idx
✅ Indexes ensured for all datasets.
Dataset loaded successfully:
   id  x_coordinate  y_coordinate  class
0   1             1             7      1
1   2             1             6      1
2   3             6             2      2
3   4             8             1      2
4   5            10             2      2
Created feature nodes with label PointNode
Initializing SimKit...
SimKit initialized.


Running experiments:   0%|                                                                                                       | 0/6 [00:00<?, ?it/s]

Config: {'node_label': 'PointNode', 'is_feature_based': True, 'graph_type': 'full', 'parameter': '5', 'remove_columns': 'id,class', 'laplacian_type': 'sym', 'number_of_eigenvectors': 2, 'target_column': 'class', 'use_kmean_for_silhouette': False}

        WITH simkit.experimental_spectralClustering({
            node_label: $node_label,
            is_feature_based: $is_feature_based,
            distance_measure: "euclidean",
            graph_type: $graph_type,
            parameter: $parameter,
            remove_columns: $remove_columns,
            laplacian_type: $laplacian_type,
            number_of_eigenvectors: $number_of_eigenvectors,
            number_of_iterations: 100,
            distance_measure_kmean: "euclidean",
            target_column: $target_column,
            use_kmean_for_silhouette: $use_kmean_for_silhouette,
            seed: 42 
        }) AS result
        RETURN result.silhouette_score AS silhouette_score, 
               result.rand_index AS rand_index


Calculating sigmas: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 61832.49it/s][A

  d = np.exp(-1 * dist_mat[i, j]**2 / (sigmas[i] * sigmas[j]))
Building full affinity: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 29468.18it/s]
Running experiments:  17%|███████████████▊                                                                               | 1/6 [00:01<00:08,  1.78s/it]

Completed experiment 1/6
Config: {'node_label': 'PointNode', 'is_feature_based': True, 'graph_type': 'full', 'parameter': '5', 'remove_columns': 'id,class', 'laplacian_type': 'rw', 'number_of_eigenvectors': 2, 'target_column': 'class', 'use_kmean_for_silhouette': False}

        WITH simkit.experimental_spectralClustering({
            node_label: $node_label,
            is_feature_based: $is_feature_based,
            distance_measure: "euclidean",
            graph_type: $graph_type,
            parameter: $parameter,
            remove_columns: $remove_columns,
            laplacian_type: $laplacian_type,
            number_of_eigenvectors: $number_of_eigenvectors,
            number_of_iterations: 100,
            distance_measure_kmean: "euclidean",
            target_column: $target_column,
            use_kmean_for_silhouette: $use_kmean_for_silhouette,
            seed: 42 
        }) AS result
        RETURN result.silhouette_score AS silhouette_score, 
               result.


Calculating sigmas: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 66576.25it/s][A

  d = np.exp(-1 * dist_mat[i, j]**2 / (sigmas[i] * sigmas[j]))
Building full affinity: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 35746.91it/s]
Running experiments:  33%|███████████████████████████████▋                                                               | 2/6 [00:03<00:06,  1.66s/it]

Completed experiment 2/6
Config: {'node_label': 'PointNode', 'is_feature_based': True, 'graph_type': 'eps', 'parameter': '2.0', 'remove_columns': 'id,class', 'laplacian_type': 'sym', 'number_of_eigenvectors': 2, 'target_column': 'class', 'use_kmean_for_silhouette': False}

        WITH simkit.experimental_spectralClustering({
            node_label: $node_label,
            is_feature_based: $is_feature_based,
            distance_measure: "euclidean",
            graph_type: $graph_type,
            parameter: $parameter,
            remove_columns: $remove_columns,
            laplacian_type: $laplacian_type,
            number_of_eigenvectors: $number_of_eigenvectors,
            number_of_iterations: 100,
            distance_measure_kmean: "euclidean",
            target_column: $target_column,
            use_kmean_for_silhouette: $use_kmean_for_silhouette,
            seed: 42 
        }) AS result
        RETURN result.silhouette_score AS silhouette_score, 
               resul


Building eps affinity: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 12294.00it/s][A
Running experiments:  50%|███████████████████████████████████████████████▌                                               | 3/6 [00:04<00:04,  1.57s/it]

Completed experiment 3/6
Config: {'node_label': 'PointNode', 'is_feature_based': True, 'graph_type': 'eps', 'parameter': '2.0', 'remove_columns': 'id,class', 'laplacian_type': 'rw', 'number_of_eigenvectors': 2, 'target_column': 'class', 'use_kmean_for_silhouette': False}

        WITH simkit.experimental_spectralClustering({
            node_label: $node_label,
            is_feature_based: $is_feature_based,
            distance_measure: "euclidean",
            graph_type: $graph_type,
            parameter: $parameter,
            remove_columns: $remove_columns,
            laplacian_type: $laplacian_type,
            number_of_eigenvectors: $number_of_eigenvectors,
            number_of_iterations: 100,
            distance_measure_kmean: "euclidean",
            target_column: $target_column,
            use_kmean_for_silhouette: $use_kmean_for_silhouette,
            seed: 42 
        }) AS result
        RETURN result.silhouette_score AS silhouette_score, 
               result


Building eps affinity: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 75121.86it/s][A
Running experiments:  67%|███████████████████████████████████████████████████████████████▎                               | 4/6 [00:06<00:03,  1.57s/it]

Completed experiment 4/6
Config: {'node_label': 'PointNode', 'is_feature_based': True, 'graph_type': 'knn', 'parameter': '2', 'remove_columns': 'id,class', 'laplacian_type': 'sym', 'number_of_eigenvectors': 2, 'target_column': 'class', 'use_kmean_for_silhouette': False}

        WITH simkit.experimental_spectralClustering({
            node_label: $node_label,
            is_feature_based: $is_feature_based,
            distance_measure: "euclidean",
            graph_type: $graph_type,
            parameter: $parameter,
            remove_columns: $remove_columns,
            laplacian_type: $laplacian_type,
            number_of_eigenvectors: $number_of_eigenvectors,
            number_of_iterations: 100,
            distance_measure_kmean: "euclidean",
            target_column: $target_column,
            use_kmean_for_silhouette: $use_kmean_for_silhouette,
            seed: 42 
        }) AS result
        RETURN result.silhouette_score AS silhouette_score, 
               result.


Building knn affinity: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 7832.50it/s][A
Running experiments:  83%|███████████████████████████████████████████████████████████████████████████████▏               | 5/6 [00:07<00:01,  1.55s/it]

Completed experiment 5/6
Config: {'node_label': 'PointNode', 'is_feature_based': True, 'graph_type': 'knn', 'parameter': '2', 'remove_columns': 'id,class', 'laplacian_type': 'rw', 'number_of_eigenvectors': 2, 'target_column': 'class', 'use_kmean_for_silhouette': False}

        WITH simkit.experimental_spectralClustering({
            node_label: $node_label,
            is_feature_based: $is_feature_based,
            distance_measure: "euclidean",
            graph_type: $graph_type,
            parameter: $parameter,
            remove_columns: $remove_columns,
            laplacian_type: $laplacian_type,
            number_of_eigenvectors: $number_of_eigenvectors,
            number_of_iterations: 100,
            distance_measure_kmean: "euclidean",
            target_column: $target_column,
            use_kmean_for_silhouette: $use_kmean_for_silhouette,
            seed: 42 
        }) AS result
        RETURN result.silhouette_score AS silhouette_score, 
               result.r


Building knn affinity: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 9108.15it/s][A
Running experiments: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:09<00:00,  1.55s/it]

Completed experiment 6/6
Results saved to points_results.csv
Experiment completed for points





In [15]:
driver.close()

In [None]:
# Read and analyze results
result_files = glob.glob(os.path.join("results", "points_results.csv"))
if not result_files:
    print("No results found. Ensure the experiment ran successfully.")
else:
    combined_df = pd.read_csv(result_files[0])
    print("Results loaded:")
    print(combined_df.head())

    # Average metrics
    simkit_avg = combined_df[['total_time', 'cpu_used', 'silhouette_score', 'rand_index']].mean()
    sklearn_avg = combined_df[['sklearn_total_time', 'sklearn_cpu_used', 'sklearn_silhouette_score', 'sklearn_rand_index']].mean()
    print("SimKit Averages:\n", simkit_avg)
    print("scikit-learn Averages:\n", sklearn_avg)

    # Scatter plot of points with true and predicted labels
    data = pd.read_csv("datasets/points.csv")
    plt.figure(figsize=(10, 5))
    
    # True labels
    plt.subplot(1, 2, 1)
    plt.scatter(data['x_coordinate'], data['y_coordinate'], c=data['class'], cmap='viridis')
    plt.title("True Labels")
    plt.xlabel("X Coordinate")
    plt.ylabel("Y Coordinate")
    
    # Predicted labels (SimKit)
    plt.subplot(1, 2, 2)
    # Assuming cluster labels are not directly returned, we'll simulate them for visualization
    # For actual labels, you'd need to modify run_experiments to return them
    plt.scatter(data['x_coordinate'], data['y_coordinate'], c=combined_df['silhouette_score'].iloc[0], cmap='viridis')
    plt.title("SimKit Predicted Clusters (Silhouette Score as Proxy)")
    plt.xlabel("X Coordinate")
    plt.ylabel("Y Coordinate")
    
    plt.tight_layout()
    plt.show()

    # Boxplot for total time comparison
    plt.figure(figsize=(6, 6))
    sns.boxplot(data=pd.DataFrame({
        'SimKit Total Time': combined_df['total_time'],
        'scikit-learn Total Time': combined_df['sklearn_total_time']
    }))
    plt.title("Total Time Comparison")
    plt.ylabel("Time (s)")
    plt.show()