# 02807 - Computational Tools for Data Science
This notebook is intended to showcase the methods used in the project, for the scripts used to produce the results in the report plaese refer to run_clustering_methods_and_classifiers.zip

In [1]:
from sklearn.metrics import classification_report
import pandas as pd
from utils.scoring import purity_score, score_clustering, clustering_classification_report
import numpy as np
from scipy.sparse.linalg import eigsh
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix, diags
from sklearn.metrics import pairwise_distances_argmin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
from utils.plotting import plot_2d_clusters
from xgboost import XGBClassifier
from spectralnet import SpectralNet
import numpy as np
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import NearestNeighbors
from scipy.stats import mode
from sklearn.metrics import silhouette_score, davies_bouldin_score
from utils.scoring import clustering_classification_report, score_clustering
import torch
import networkx as nx
from community import community_louvain
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

warnings.filterwarnings('ignore')

In [2]:
####################################################################
# Genral Utils for Clustering                                      #
####################################################################

def format_clustering_metrics(train_scores, test_scores, **kwargs):
    results = kwargs
    results.update({
        'Train_DB': train_scores[0],
        'Train_Sil': train_scores[1],
        'Train_Pur': train_scores[2],
        'Test_DB': test_scores[0],
        'Test_Sil': test_scores[1],
        'Test_Pur': test_scores[2],
    })
    return results

def format_classification_metrics(train_metrics, test_metrics, **kwargs):
    results = kwargs
    results.update({
        'Train_Acc': train_metrics['accuracy'],
        'Train_F1': train_metrics['f1'],
        'Train_Recall': train_metrics['recall'],
        'Train_Precision': train_metrics['precision'],
        'Test_Acc': test_metrics['accuracy'],
        'Test_F1': test_metrics['f1'],
        'Test_Recall': test_metrics['recall'],
        'Test_Precision': test_metrics['precision'],
    })
    return results

# Load and preprocess data

In [3]:
data = pd.read_csv('creditcard.csv')
X = data.drop(['Class', 'Amount', 'Time'], axis=1)
y = data['Class']

X_scale = MinMaxScaler().fit_transform(X)

# sample data to reduce class imbalance
non_fraud_df = X_scale[y == 0][:2000]
fraud_df = X_scale[y == 1]

X_sample = np.vstack([non_fraud_df, fraud_df])
fraud_idx = np.zeros(len(X_sample))
fraud_idx[-len(fraud_df):] = 1

indices = np.arange(len(X_sample))
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(X_sample, fraud_idx, indices, test_size=0.2, random_state=42, stratify=fraud_idx)

## Logistic Classification Model

In [4]:
def logistic_regression_classifier(X_train, X_test, y_train, y_test):
    param_dist = {
        'C': np.logspace(-2, 2, 20),
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    }

    model = LogisticRegression(random_state=42, max_iter=10000)

    # Use RandomizedSearchCV for faster hyperparameter tuning
    rand_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=10,
        scoring='neg_log_loss',
        cv=3,
        verbose=1,
        n_jobs=-1,
        random_state=42
    )
    rand_search.fit(X_train, y_train)

    # Get the best model
    best_model = rand_search.best_estimator_

    # Predict on the test data
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    # Calculate evaluation metrics
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_prob)
    }

    # Get classification report as a dictionary
    classification_report_dict = classification_report(y_test, y_pred, output_dict=True)

    # Add classification report and best hyperparameters to metrics
    metrics['classification_report'] = classification_report_dict
    metrics['best_hyperparameters'] = rand_search.best_params_

    # Convert metrics dictionary to a DataFrame
    metrics_df = pd.DataFrame([metrics])

    return metrics_df

# Clustering on raw data

## Kmeans

In [5]:
def fit_kmeans(X_train, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X_train)
    return kmeans

def extract_kmeans_cluster_labels(kmeans, X):
    return kmeans.predict(X)

def evaluate_kmeans(X_train, X_test, y_train, y_test, n_clusters):
    kmeans = fit_kmeans(X_train, n_clusters)
    train_cluster_labels = extract_kmeans_cluster_labels(kmeans, X_train)
    test_cluster_labels = extract_kmeans_cluster_labels(kmeans, X_test)

    train_clustering_scores = score_clustering(X_train, y_train, train_cluster_labels)
    test_clustering_scores = score_clustering(X_test, y_test, test_cluster_labels)

    train_classification_report = clustering_classification_report(train_cluster_labels, y_train)
    test_classification_report = clustering_classification_report(test_cluster_labels, y_test)
    clustering_metrics = format_clustering_metrics(train_clustering_scores, test_clustering_scores, n_clusters=n_clusters)
    classification_metrics = format_classification_metrics(train_classification_report, test_classification_report, n_clusters=n_clusters)
    return clustering_metrics, classification_metrics

def score_kmeans(X_train, X_test, y_train, y_test, cluster_counts=None):
    if cluster_counts is None:
        cluster_counts = [2, 3, 5, 10]
    clustering_metrics_list = []
    classification_metrics_list = []
    for n_clusters in cluster_counts:
        clustering_scores, classification_scores = evaluate_kmeans(X_train, X_test, y_train, y_test, n_clusters)
        clustering_metrics_list.append(clustering_scores)
        classification_metrics_list.append(classification_scores)
    return clustering_metrics_list, classification_metrics_list

In [6]:
n_clusters = [2, 3, 5, 10]
cluster_metrics, classification_metrics = score_kmeans(X_train, X_test, y_train, y_test)
results_df = pd.DataFrame(cluster_metrics)
reports_df = pd.DataFrame(classification_metrics)

results_df.round(2)

Unnamed: 0,n_clusters,Train_DB,Train_Sil,Train_Pur,Test_DB,Test_Sil,Test_Pur
0,2,0.84,0.63,0.56,0.79,0.64,0.6
1,3,1.09,0.48,0.79,1.05,0.47,0.81
2,5,1.97,0.12,0.77,1.93,0.13,0.79
3,10,1.96,0.11,0.82,1.74,0.12,0.83


## DBSCAN

In [7]:
def fit_dbscan(X_train, eps):
    dbscan = DBSCAN(eps=eps, min_samples=5)
    dbscan.fit(X_train)
    return dbscan

def test_dbscan(dbscan, X_train, X_test):
    core_samples_mask = dbscan.core_sample_indices_
    core_points = X_train[core_samples_mask]

    nn = NearestNeighbors(n_neighbors=1).fit(core_points)
    distances, indices = nn.kneighbors(X_test)

    test_clusters = np.array([dbscan.labels_[core_samples_mask[i]] if distances[j] < dbscan.eps else -1
                            for j, i in enumerate(indices.flatten())])
    return test_clusters

def evaluate_dbscan(X_train, X_test, y_train, y_test, eps, min_samples):
    dbscan = fit_dbscan(X_train, eps, min_samples)
    train_clusters = dbscan.labels_
    test_clusters = test_dbscan(dbscan, X_train, X_test)

    train_clustering_scores = score_clustering(X_train, y_train, train_clusters)
    test_clustering_scores = score_clustering(X_test, y_test, test_clusters)

    train_classification_report = clustering_classification_report(train_clusters, y_train)
    test_classification_report = clustering_classification_report(test_clusters, y_test)

    clustering_metrics = format_clustering_metrics(train_clustering_scores, test_clustering_scores, eps=eps, min_samples=min_samples)
    classification_metrics = format_classification_metrics(train_classification_report, test_classification_report, eps=eps, min_samples=min_samples)
    return clustering_metrics, classification_metrics

def score_dbscan(X_train, X_test, y_train, y_test, eps=None, min_samples=None):
    if eps is None:
        eps = [0.2, 0.3, 0.4]
    if min_samples is None:
        min_samples = [5, 10]
    clustering_metrics_list = []
    classification_metrics_list = []
    for e in eps:
        for ms in min_samples:
            try:
                clustering_scores, classification_scores = evaluate_dbscan(X_train, X_test, y_train, y_test, e, ms)
                clustering_metrics_list.append(clustering_scores)
                classification_metrics_list.append(classification_scores)
            except ValueError:
                print(f'Fitting DBSCAN with eps={e} and min_samples={ms} leads to a single cluster. Skipping...')
    return clustering_metrics_list, classification_metrics_list

def fit_dbscan(X_train, eps, min_samples):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(X_train)
    return dbscan

In [8]:
eps = [0.1, 0.15, 0.2, 0.25]
min_samples = [5, 10, 15]
dbscan_cluster_metrics, dbscan_classification_metrics = score_dbscan(X_train, X_test, y_train, y_test, eps, min_samples)
dbscan_results_df = pd.DataFrame(dbscan_cluster_metrics)
dbscan_reports_df = pd.DataFrame(dbscan_classification_metrics)

dbscan_results_df.round(2)

Unnamed: 0,eps,min_samples,Train_DB,Train_Sil,Train_Pur,Test_DB,Test_Sil,Test_Pur
0,0.1,5,1.6,-0.22,0.49,1.57,-0.27,0.42
1,0.1,10,2.19,-0.19,0.38,2.11,-0.22,0.35
2,0.1,15,2.28,-0.23,0.31,2.31,-0.26,0.28
3,0.15,5,1.54,0.22,0.83,1.84,0.15,0.79
4,0.15,10,1.26,0.1,0.82,1.46,0.08,0.78
5,0.15,15,1.78,0.35,0.8,1.78,0.34,0.75
6,0.2,5,1.87,0.31,0.83,1.93,0.25,0.82
7,0.2,10,1.76,0.28,0.84,1.97,0.25,0.82
8,0.2,15,1.65,0.28,0.84,1.6,0.24,0.82
9,0.25,5,1.26,0.54,0.31,1.56,0.56,0.37


# Spectral Clustering

In [9]:
def rbf_kernel_manual_sparse(X, gamma=1.0, n_neighbors=1000):
    """
    Compute a sparse RBF kernel using nearest neighbors.
    """
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto').fit(X)
    distances, indices = nbrs.kneighbors(X)

    # Compute sparse RBF kernel
    row_indices = np.repeat(np.arange(X.shape[0]), n_neighbors)
    col_indices = indices.flatten()
    exp_values = np.exp(-gamma * distances.flatten()**2)

    affinity_matrix = csr_matrix((exp_values, (row_indices, col_indices)), shape=(X.shape[0], X.shape[0]))
    return affinity_matrix


def construct_laplacian(affinity_matrix):
    """
    Construct the normalized graph Laplacian.
    Supports both dense and sparse affinity matrices.
    """
    # Ensure affinity_matrix is sparse
    if not isinstance(affinity_matrix, csr_matrix):
        affinity_matrix = csr_matrix(affinity_matrix)

    # Compute degree values
    degree_values = np.array(affinity_matrix.sum(axis=1)).flatten()
    degree_values[degree_values == 0] = 1e-10  # Avoid division by zero

    # Create D^(-1/2) as a sparse diagonal matrix
    d_inv_sqrt = 1.0 / np.sqrt(degree_values)
    d_inv_sqrt_sparse = diags(d_inv_sqrt)

    # Compute the normalized Laplacian
    laplacian = diags([1.0], [0], shape=affinity_matrix.shape) - d_inv_sqrt_sparse @ affinity_matrix @ d_inv_sqrt_sparse

    return laplacian


def kmeans_pp_init(X, n_clusters, seed=42):
    np.random.seed(seed)
    n_samples, n_features = X.shape
    centroids = []

    # Step 1: Randomly select the first centroid
    first_centroid_idx = np.random.randint(0, n_samples)
    centroids.append(X[first_centroid_idx])

    # Step 2: Select remaining centroids
    for _ in range(1, n_clusters):
        # Compute distances from each point to the nearest centroid
        sq_norms_X = np.sum(X ** 2, axis=1, keepdims=True)  # Shape: (n_samples, 1)
        sq_norms_centroids = np.sum(np.array(centroids) ** 2, axis=1, keepdims=True).T  # Shape: (1, len(centroids))
        distances = sq_norms_X + sq_norms_centroids - 2 * np.dot(X, np.array(centroids).T)  # Shape: (n_samples, len(centroids))
        distances = np.sqrt(np.maximum(distances, 0))  # Ensure non-negativity

        # For each point, find the minimum distance to any centroid
        min_distances = np.min(distances, axis=1)

        # Compute the probability distribution for the next centroid
        probabilities = min_distances ** 2 / np.sum(min_distances ** 2)

        # Randomly select the next centroid based on the probabilities
        next_centroid_idx = np.random.choice(n_samples, p=probabilities)
        centroids.append(X[next_centroid_idx])

    return np.array(centroids)


def kmeans_manual(X, n_clusters, max_iter=500, tol=1e-5, seed=42):
    np.random.seed(seed)
    centroids = kmeans_pp_init(X, n_clusters, seed)
    for _ in range(max_iter):
        sq_norms_X = np.sum(X ** 2, axis=1, keepdims=True)
        sq_norms_centroids = np.sum(centroids ** 2, axis=1, keepdims=True).T
        distances = sq_norms_X + sq_norms_centroids - 2 * np.dot(X, centroids.T)
        distances = np.sqrt(np.maximum(distances, 0))
        labels = np.argmin(distances, axis=1)
        new_centroids = np.array([
            X[labels == i].mean(axis=0) if np.any(labels == i) else X[np.random.randint(0, X.shape[0])]
            for i in range(n_clusters)
        ])
        if np.allclose(centroids, new_centroids, atol=tol):
            break
        centroids = new_centroids
    return labels, centroids

def derive_cluster_mapping(y_train, train_cluster_labels):
    """
    Derives a mapping of cluster labels to actual class labels using the training set.
    """
    mapping = {}
    for cluster in np.unique(train_cluster_labels):
        mask = (train_cluster_labels == cluster)
        if np.any(mask):  # Ensure the mask selects at least one element
            mode_result = mode(y_train[mask], nan_policy='omit')  # Handle potential NaNs
            cluster_mode = mode_result.mode
            if isinstance(cluster_mode, np.ndarray) and cluster_mode.size > 0:
                mapping[cluster] = cluster_mode[0]  # Access the mode value if it's an array
            else:
                mapping[cluster] = cluster_mode  # Directly assign the scalar value
        else:
            raise ValueError(f"Cluster {cluster} is empty. Check clustering assignments.")
    return mapping


def map_clusters(cluster_labels, mapping):
    """
    Maps cluster labels to class labels based on the derived mapping.
    """
    return np.array([mapping[label] for label in cluster_labels])


def spectral_clustering(X, n_clusters, gamma=1.0, seed=42):
    np.random.seed(seed)
    affinity_matrix = rbf_kernel_manual_sparse(X, gamma=gamma)
    laplacian = construct_laplacian(affinity_matrix)
    eigvals, eigvecs = eigsh(laplacian, k=n_clusters, which='SM')  # 'SM' = Smallest Magnitude
    # Select eigenvectors corresponding to the smallest n_clusters eigenvalues
    eigvecs_subset = eigvecs[:, np.argsort(eigvals)[:n_clusters]]
    normalized_eigvecs = eigvecs_subset / np.linalg.norm(eigvecs_subset, axis=1, keepdims=True)
    labels, _ = kmeans_manual(normalized_eigvecs, n_clusters=n_clusters, seed=seed)
    return labels


def evaluate_spectral(X_train, X_test, y_train, y_test, n_clusters, gamma=1.0):
    # Step 1: Perform spectral clustering
    train_cluster_labels = spectral_clustering(X_train, n_clusters, gamma)

    # Step 2: Derive cluster-to-class mapping from training data
    cluster_to_class_mapping = derive_cluster_mapping(y_train, train_cluster_labels)

    # Step 3: Assign test data to nearest training clusters
    train_centroids = np.array([X_train[train_cluster_labels == i].mean(axis=0) for i in range(n_clusters)])

    # Compute pairwise distances between test data and training centroids
    sq_norms_X_test = np.sum(X_test**2, axis=1, keepdims=True)  # Shape: (n_test_samples, 1)
    sq_norms_train_centroids = np.sum(train_centroids**2, axis=1, keepdims=True).T  # Shape: (1, n_clusters)
    distances = sq_norms_X_test + sq_norms_train_centroids - 2 * np.dot(X_test, train_centroids.T)  # Shape: (n_test_samples, n_clusters)
    distances = np.sqrt(np.maximum(distances, 0))  # Ensure non-negativity

    test_cluster_labels = np.argmin(distances, axis=1)

    # Step 4: Map cluster labels to class labels
    train_aligned_labels = map_clusters(train_cluster_labels, cluster_to_class_mapping)
    test_aligned_labels = map_clusters(test_cluster_labels, cluster_to_class_mapping)

    # Step 5: Compute metrics (you can plug in your scoring and classification report functions here)
    train_clustering_scores = score_clustering(X_train, y_train, train_aligned_labels)
    test_clustering_scores = score_clustering(X_test, y_test, test_aligned_labels)

    train_classification_report = clustering_classification_report(train_aligned_labels, y_train)
    test_classification_report = clustering_classification_report(test_aligned_labels, y_test)

    clustering_metrics = format_clustering_metrics(train_clustering_scores, test_clustering_scores, n_clusters=n_clusters)
    classification_metrics = format_classification_metrics(train_classification_report, test_classification_report, n_clusters=n_clusters)

    return clustering_metrics, classification_metrics

def score_spectral(X_train, X_test, y_train, y_test, cluster_counts=None):
    if cluster_counts is None:
        cluster_counts = [2, 3, 5, 10]
    clustering_metrics_list = []
    classification_metrics_list = []
    for n_clusters in cluster_counts:
        clustering_scores, classification_scores = evaluate_spectral(X_train, X_test, y_train, y_test, n_clusters)
        clustering_metrics_list.append(clustering_scores)
        classification_metrics_list.append(classification_scores)

    return clustering_metrics_list, classification_metrics_list

In [10]:
spectral_cluster_metrics, spectral_classification_metrics = score_spectral(X_train, X_test, y_train, y_test)
spectral_cluster_results_df = pd.DataFrame(spectral_cluster_metrics)
spectral_cluster_reports_df = pd.DataFrame(spectral_classification_metrics)

spectral_cluster_results_df.round(2)

Unnamed: 0,n_clusters,Train_DB,Train_Sil,Train_Pur,Test_DB,Test_Sil,Test_Pur
0,2,0.93,0.59,0.72,0.83,0.63,0.66
1,3,0.92,0.59,0.7,0.82,0.63,0.65
2,5,0.8,0.64,0.49,0.74,0.65,0.54
3,10,0.95,0.58,0.76,0.92,0.58,0.8


## SpectralNet

In [11]:
def evaluate_spectral_net(X_train, X_test, y_train, y_test, n_clusters=None):
    # Define the SpectralNet model
    spectralnet = SpectralNet(n_clusters=n_clusters, spectral_hiddens=[128, 64, 16, n_clusters])

    # Convert training data to torch tensors
    X_train_tensor = torch.from_numpy(X_train).float()
    X_test_tensor = torch.from_numpy(X_test).float()

    # Train the SpectralNet model on training data
    spectralnet.fit(X_train_tensor)

    # Predict cluster labels for training data
    with torch.no_grad():
        train_cluster_labels = spectralnet.predict(X_train_tensor)

    # Map clusters to class labels using training data
    def derive_cluster_mapping(y_true, cluster_labels):
        mapping = {}
        for cluster in np.unique(cluster_labels):
            labels_in_cluster = y_true[cluster_labels == cluster]
            if len(labels_in_cluster) == 0:
                continue
            mode_result = mode(labels_in_cluster)
            most_common = mode_result.mode.item()
            mapping[cluster] = most_common
        return mapping

    cluster_to_class_mapping = derive_cluster_mapping(y_train, train_cluster_labels)

    # Align training labels
    train_aligned_labels = np.array([cluster_to_class_mapping[label] for label in train_cluster_labels])

    # Predict cluster labels for test data
    with torch.no_grad():
        test_cluster_labels = spectralnet.predict(X_test_tensor)

    # Align test labels
    test_aligned_labels = np.array([cluster_to_class_mapping.get(label, -1) for label in test_cluster_labels])

    # Evaluate performance
    train_classification_report = clustering_classification_report(train_aligned_labels, y_train)
    test_classification_report = clustering_classification_report(test_aligned_labels, y_test)

    train_clustering_scores = score_clustering(X_train, y_train, train_aligned_labels)
    test_clustering_scores = score_clustering(X_test, y_test, test_aligned_labels)

    clustering_metrics = format_clustering_metrics(train_clustering_scores, test_clustering_scores, n_clusters=n_clusters)
    classification_metrics = format_classification_metrics(train_classification_report, test_classification_report, n_clusters=n_clusters)

    return clustering_metrics, classification_metrics

def score_spectral_net(X_train, X_test, y_train, y_test, cluster_counts=None):
    if cluster_counts is None:
        cluster_counts = [2, 3, 5, 10]
    clustering_metrics_list = []
    classification_metrics_list = []
    for n_clusters in cluster_counts:
        clustering_scores, classification_scores = evaluate_spectral_net(X_train, X_test, y_train, y_test, n_clusters)
        clustering_metrics_list.append(clustering_scores)
        classification_metrics_list.append(classification_scores)

    return clustering_metrics_list, classification_metrics_list

In [12]:
spectralnet_cluster_metrics, spectralnet_classification_metrics = score_spectral_net(X_train, X_test, y_train, y_test)
spectralnet_cluster_results_df = pd.DataFrame(spectralnet_cluster_metrics)
spectralnet_cluster_reports_df = pd.DataFrame(spectralnet_classification_metrics)

spectralnet_cluster_results_df.round(2)

Training SpectralNet:


Train Loss: 0.1212750, Valid Loss: 0.1491229, LR: 0.000100: 100%|██████████| 30/30 [00:08<00:00,  3.63it/s]


Training SpectralNet:


Train Loss: 0.3893485, Valid Loss: 0.4729806, LR: 0.001000: 100%|██████████| 30/30 [00:08<00:00,  3.46it/s]


Training SpectralNet:


Train Loss: 3.2403634, Valid Loss: 9.7300339, LR: 0.001000: 100%|██████████| 30/30 [00:09<00:00,  3.25it/s] 


Training SpectralNet:


Train Loss: 13.9248409, Valid Loss: 41.8620605, LR: 0.001000: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Unnamed: 0,n_clusters,Train_DB,Train_Sil,Train_Pur,Test_DB,Test_Sil,Test_Pur
0,2,0.86,0.62,0.59,0.79,0.64,0.6
1,3,0.78,0.65,0.44,0.62,0.68,0.39
2,5,0.97,0.57,0.8,4.04,0.17,0.02
3,10,1.1,0.53,0.85,4.28,0.14,0.1


## Louvain Algorithm

In [13]:
def louvain_algorithm(X_train, X_test):
    modularity_scores = []
    k_values = range(5, 30, 5)  # Example range for k

    for k in k_values:
        # Construct KNN graph
        nn = NearestNeighbors(n_neighbors=k, metric='cosine')
        nn.fit(X_train)
        distances, indices = nn.kneighbors(X_train)

        # Build graph
        G = nx.Graph()
        for i, neighbors in enumerate(indices):
            for j, dist in zip(neighbors, distances[i]):
                if i != j:
                    G.add_edge(i, j, weight=1 - dist)

        # Apply Louvain clustering
        partition = community_louvain.best_partition(G)
        modularity = community_louvain.modularity(partition, G)
        modularity_scores.append((k, modularity))

    # Find k with the highest modularity
    optimal_k = max(modularity_scores, key=lambda x: x[1])[0]
    print(f"Optimal k based on modularity: {optimal_k}")

    # Map partition dictionary to a list aligned with training data indices
    train_clusters = np.array([partition[i] for i in range(len(X_train))])

    # Compute cluster prototypes (mean vectors of clusters)
    cluster_ids = np.unique(train_clusters)
    cluster_prototypes = {}

    for cluster_id in cluster_ids:
        cluster_members = X_train[train_clusters == cluster_id]
        cluster_prototype = cluster_members.mean(axis=0)
        cluster_prototypes[cluster_id] = cluster_prototype

    def compute_cluster_distances(X, cluster_prototypes):
        distances = []
        for x in X:
            dists = [np.linalg.norm(x - cluster_prototypes[cluster_id]) for cluster_id in cluster_prototypes]
            distances.append(dists)
        return np.array(distances)

    # Compute distances for training data
    train_cluster_distances = compute_cluster_distances(X_train, cluster_prototypes)

    # Compute distances for test data
    test_cluster_distances = compute_cluster_distances(X_test, cluster_prototypes)


    # Combine original features with cluster distance features
    X_train_with_features = np.hstack((X_train, train_cluster_distances))
    X_test_with_features = np.hstack((X_test, test_cluster_distances))

    # Convert to DF
    X_train_with_features = pd.DataFrame(X_train_with_features)
    X_test_with_features = pd.DataFrame(X_test_with_features)

    return X_train_with_features, X_test_with_features

In [14]:
train_cluster_distances, test_cluster_distances = louvain_algorithm(X_train, X_test)

# Print number of features before and after adding cluster distances
print(f"Number of features before adding cluster distances: {X_train.shape[1]}")
print(f"Number of features after adding cluster distances: {train_cluster_distances.shape[1]}")
 

Optimal k based on modularity: 5
Number of features before adding cluster distances: 28
Number of features after adding cluster distances: 42


## Baseline Model

In [15]:
results = logistic_regression_classifier(X_train, X_test, y_train, y_test)
print(results)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
   accuracy  precision   recall  f1_score   roc_auc  \
0  0.973948   0.967391  0.89899  0.931937  0.989369   

                               classification_report  \
0  {'0.0': {'precision': 0.9754299754299754, 'rec...   

                                best_hyperparameters  
0  {'solver': 'liblinear', 'penalty': 'l2', 'C': ...  
