In [2]:
import os
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
import networkx as nx 

# Set a random seed for reproducibility
def set_random_seed(seed=42):
    np.random.seed(seed)
    random.seed(seed)

# ---- Step 1: Feature Extraction from Graphs ----
def extract_graph_features(adj_matrix):
    """
    Extracts features from a given adjacency matrix of a graph.
    Features:
    - Number of nodes
    - Number of edges
    - Graph density
    - Average node degree
    - Average clustering coefficient
    """
    G = nx.from_numpy_array(adj_matrix)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()// 2
    density = nx.density(G)
    avg_degree = sum(dict(G.degree()).values()) / num_nodes

    
    return np.array([num_nodes,num_edges,density,avg_degree])

# ---- Step 2: Load Dataset and Split into Training/Validation/Testing ----
def load_and_extract_features(hamiltonian_dir, non_hamiltonian_dir):
    # Hamiltonian graph files
    hamiltonian_files = [os.path.join(hamiltonian_dir, f) for f in os.listdir(hamiltonian_dir) if f.endswith('.npy')]
    # Non-Hamiltonian graph files
    non_hamiltonian_files = [os.path.join(non_hamiltonian_dir, f) for f in os.listdir(non_hamiltonian_dir) if f.endswith('.npy')]
    
    # Extract features and labels
    features = []
    labels = []
    
    for file in hamiltonian_files:
        adj_matrix = np.load(file)
        features.append(extract_graph_features(adj_matrix))
        labels.append(1)  # Hamiltonian = 1
    
    for file in non_hamiltonian_files:
        adj_matrix = np.load(file)
        features.append(extract_graph_features(adj_matrix))
        labels.append(0)  # Non-Hamiltonian = 0
    
    return np.array(features), np.array(labels)

# ---- Step 3: Sample Training, Validation, and Testing Datasets ----
def sample_train_val_test(features, labels, test_size=500, val_ratio=0.2, seed=42, sample_size=None):
    """
    Splits the dataset into training, validation, and testing sets.
    
    Args:
    - features: The extracted graph features.
    - labels: The corresponding graph labels.
    - test_size: The number of samples reserved for testing.
    - val_ratio: The proportion of validation samples in the training/validation split.
    - seed: Random seed for reproducibility.
    - sample_size: The total number of samples for training and validation (optional).
    
    Returns:
    - X_train: Training set features.
    - X_val: Validation set features.
    - X_test: Test set features.
    - y_train: Training set labels.
    - y_val: Validation set labels.
    - y_test: Test set labels.
    """
    set_random_seed(seed)

    # Split off the test set (fixed size of 500)
    X_train_val, X_test, y_train_val, y_test = train_test_split(features, labels, test_size=test_size, random_state=seed, stratify=labels)
    
    # If sample_size is specified, sample that many from the remaining training+validation set
    if sample_size is not None:
        X_train_val, _, y_train_val, _ = train_test_split(X_train_val, y_train_val, train_size=sample_size, random_state=seed, stratify=y_train_val)
    
    # Split training+validation into 80% training and 20% validation
    train_size = int((1 - val_ratio) * len(X_train_val))
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, train_size=train_size, random_state=seed, stratify=y_train_val)
    
    # ---- Print the class distribution of each dataset ----
    print(f"Training set class distribution: {Counter(y_train)}")
    print(f"Validation set class distribution: {Counter(y_val)}")
    print(f"Test set class distribution: {Counter(y_test)}")

    return X_train, X_val, X_test, y_train, y_val, y_test

# ---- Step 4: Apply Naive Bayes Model ----
def apply_naive_bayes(X_train, X_val, X_test, y_train, y_val, y_test):
    # Standardize features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Train Naive Bayes Model
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)

    # Predict on validation set
    y_val_pred = nb_model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)

    # Predict on test set
    y_test_pred = nb_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
    print(f"Validation F1 Score: {val_f1:.4f}")
    print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Test F1 Score: {test_f1:.4f}")

# ---- Main Workflow ----
# hamiltonian_dir = './hamiltonian_small_mat'
# non_hamiltonian_dir = './non_hamiltonian_small_mat'
hamiltonian_dir = './small_Hamiltonian_hard'
non_hamiltonian_dir = './small_non_Hamiltonian_hard'

# Load and extract features from the dataset
features, labels = load_and_extract_features(hamiltonian_dir, non_hamiltonian_dir)

# Sample training, validation, and test sets (with sample size control)
sample_size = 1000  # Experiment with different sample sizes

# Define the seeds you want to use
seeds = [3, 7, 11, 13, 23]

for seed in seeds:
    print(f"\nRunning model with seed: {seed}")
    X_train, X_val, X_test, y_train, y_val, y_test = sample_train_val_test(features, labels, test_size=500, val_ratio=0.2, seed=seed, sample_size=sample_size)

    # Apply Naive Bayes and evaluate
    apply_naive_bayes(X_train, X_val, X_test, y_train, y_val, y_test)



Running model with seed: 3
Training set class distribution: Counter({0: 400, 1: 400})
Validation set class distribution: Counter({0: 100, 1: 100})
Test set class distribution: Counter({1: 250, 0: 250})
Validation Accuracy: 55.00%
Validation F1 Score: 0.5946
Test Accuracy: 57.40%
Test F1 Score: 0.6283

Running model with seed: 7
Training set class distribution: Counter({1: 400, 0: 400})
Validation set class distribution: Counter({1: 100, 0: 100})
Test set class distribution: Counter({0: 250, 1: 250})
Validation Accuracy: 59.50%
Validation F1 Score: 0.6124
Test Accuracy: 54.60%
Test F1 Score: 0.5626

Running model with seed: 11
Training set class distribution: Counter({0: 400, 1: 400})
Validation set class distribution: Counter({0: 100, 1: 100})
Test set class distribution: Counter({0: 250, 1: 250})
Validation Accuracy: 49.00%
Validation F1 Score: 0.5565
Test Accuracy: 59.40%
Test F1 Score: 0.6530

Running model with seed: 13
Training set class distribution: Counter({1: 400, 0: 400})
Va