In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.cluster import KMeans
from graph import Graph
from helpers import get_degree_matrix, get_laplacian_matrix

In [None]:
# Default font settings applied to every figure drawn in this notebook.
font = dict(size=20)
plt.rc('font', **font)

In [None]:
def compute_bounds_RST(eigenvalues, rayleigh_quotients, q):
    """Compute the recursive lower bounds A_1, ..., A_p (p = len(q)).

    eigenvalues is an indexable sequence of eigenvalues; it is read at index 0
    and at every index listed in q.
    rayleigh_quotients is a sequence of Rayleigh quotients; it is sliced into
    consecutive groups [0:q[0]], [q[0]:q[1]], ...
    q is a list of cumulative end indices. For example, if q = [2,5] then the
    lower bounds for A_1 and A_2 are returned, where A_1 corresponds to the
    first two rows of alpha and A_2 corresponds to the next three rows of alpha.

    Returns a list with one lower bound per entry of q. Each bound divides by a
    difference of two eigenvalues, so those eigenvalues must be distinct.
    """
    first_end = q[0]
    lam_first = eigenvalues[first_end]
    numerator = first_end * lam_first - np.sum(rayleigh_quotients[0:first_end])
    lower_bounds = [numerator / (lam_first - eigenvalues[0])]

    # Every later group re-uses the sum of all bounds computed so far.
    for prev_end, end in zip(q[:-1], q[1:]):
        lam_end = eigenvalues[end]
        group_size = end - prev_end
        numerator = (group_size * lam_end
                     - np.sum(rayleigh_quotients[prev_end:end])
                     - lam_end * (prev_end - np.sum(lower_bounds)))
        lower_bounds.append(numerator / (lam_end - eigenvalues[prev_end]))
    return lower_bounds

In [None]:

def k_means_indicator_vectors(eigenvectors, K):
    """Cluster the rows of ``eigenvectors`` with k-means and one-hot encode the labels.

    eigenvectors is a 2-D array with at least K columns; each row is one point.
    K is the number of clusters.

    Returns a float array of shape (number of rows, K) whose entry (r, c) is 1
    when row r was assigned to cluster c and 0 otherwise. KMeans is run with
    random_state=0, so the clustering is deterministic for a given input.
    """
    assert eigenvectors.shape[1] >= K, 'Number of eigenvectors should be greater than or equal to K'
    labels = KMeans(n_clusters=K, random_state=0).fit(eigenvectors).labels_
    # Compare every label against the cluster ids 0..K-1 in one broadcast.
    return (labels[:, np.newaxis] == np.arange(K)).astype(float)

def degree_correction(vectors, D_sqrt):
    """Multiply every column of ``vectors`` by ``D_sqrt`` and rescale it to unit norm.

    vectors is a 2-D array whose columns are processed independently; it is
    not modified.
    D_sqrt is a matrix applied to each column with ``@``.

    Returns a new array of the same shape. Integer or boolean input is
    promoted to float first; otherwise the normalised entries (all of
    magnitude <= 1) would be truncated when written back into an integer
    array. A column that ``D_sqrt`` maps to the zero vector produces NaNs,
    because it is divided by a zero norm.
    """
    if not np.issubdtype(vectors.dtype, np.inexact):
        vectors = vectors.astype(float)
    vectors_corrected = vectors.copy()
    for i in range(vectors.shape[1]):
        vectors_corrected[:, i] = D_sqrt @ vectors[:, i]
        vectors_corrected[:, i] = vectors_corrected[:, i] / np.linalg.norm(vectors_corrected[:, i])
    return vectors_corrected

In [None]:
def compute_k_way_estimate(normalised_L, indicator_vectors, K):
    """Return the largest quadratic form x^T L x over the K columns x of ``indicator_vectors``.

    normalised_L is a square matrix compatible with the column length.
    indicator_vectors is a 2-D array that must have exactly K columns.
    """
    assert indicator_vectors.shape[1] == K, 'Indicator vectors should have K columns'
    quadratic_forms = [
        indicator_vectors[:, col].T @ normalised_L @ indicator_vectors[:, col]
        for col in range(K)
    ]
    return max(quadratic_forms)


def compute_rayleigh_quotients(normalised_L, indicator_vectors, K):
    """Return the Rayleigh quotient (x^T L x) / (x^T x) of each of the K columns.

    normalised_L is a square matrix compatible with the column length.
    indicator_vectors is a 2-D array that must have exactly K columns.

    Returns a list of K values, in column order.
    """
    assert indicator_vectors.shape[1] == K, 'Indicator vectors should have K columns'

    def quotient(x):
        return (x.T @ normalised_L @ x) / (x.T @ x)

    return [quotient(indicator_vectors[:, col]) for col in range(K)]


def compute_all_bounds(G: Graph, K: int, q: list, true_clusters: list):
    """Compute three bounds and the observed value for a K-way spectral clustering of ``G``.

    G is the graph; its degree matrix and normalised Laplacian are obtained
    from ``get_degree_matrix`` and ``get_laplacian_matrix``, and
    ``len(G.vertices)`` gives the number of rows of the indicator matrices.
    K is the number of clusters.
    q is the list of cumulative group end indices forwarded to
    ``compute_bounds_RST`` for the recursive bound.
    true_clusters is a list of K index collections; ``true_clusters[i]`` holds
    the row indices belonging to cluster i.

    Returns a dict with the keys
      'Recursive ST': K minus the sum of ``compute_bounds_RST(..., q)``,
      'General ST':   K minus the sum of ``compute_bounds_RST(..., [K])``,
      'ST Standard':  K * (largest Rayleigh quotient) / eigenvalue at index K,
      'True Value':   K minus the sum of squared entries of alpha, where alpha
                      is the projection of the degree-corrected true indicator
                      vectors onto the first K eigenvectors.
    """
    D = get_degree_matrix(G)
    # NOTE(review): assumes D is a dense array so that np.sqrt acts elementwise - confirm.
    D_sqrt = np.sqrt(D)
    normalized_L = get_laplacian_matrix(G, normalized=True)

    # Eigen-decomposition of the normalised Laplacian, ordered by ascending eigenvalue.
    normalized_L_eigenvalues, normalized_L_eigenvectors = np.linalg.eigh(normalized_L)
    idx = normalized_L_eigenvalues.argsort()
    normalized_L_eigenvalues = normalized_L_eigenvalues[idx]
    normalized_L_eigenvectors = normalized_L_eigenvectors[:, idx]

    # k-means on the first K eigenvectors, then project the (degree-corrected)
    # cluster indicators back onto those eigenvectors.
    indicator_vectors = k_means_indicator_vectors(normalized_L_eigenvectors[:, 0:K], K)
    indicator_vectors = degree_correction(indicator_vectors, D_sqrt)
    beta_K_by_K = indicator_vectors.T @ normalized_L_eigenvectors[:, 0:K]
    combined_indicator_vectors = indicator_vectors @ beta_K_by_K

    # Gram-Schmidt. Each column is first made orthogonal to the earlier columns
    # and only then normalised, so every column used as a projector below is
    # already a unit vector. Normalising before orthogonalising leaves the
    # earlier columns shorter than unit length, and the projections onto them
    # are then wrong for K >= 3.
    for i in range(K):
        for j in range(i):
            combined_indicator_vectors[:, i] = combined_indicator_vectors[:, i] - (
                    combined_indicator_vectors[:, j].T @ combined_indicator_vectors[:, i]
            ) * combined_indicator_vectors[:, j]
        combined_indicator_vectors[:, i] = combined_indicator_vectors[:, i] / np.linalg.norm(
            combined_indicator_vectors[:, i])

    rayleigh_quotients = compute_rayleigh_quotients(normalized_L, combined_indicator_vectors, K)
    rayleigh_quotients = np.sort(rayleigh_quotients)
    ST_standard = K * max(rayleigh_quotients) / normalized_L_eigenvalues[K]

    # Indicator vectors of the ground-truth clusters.
    true_indicator_vectors = np.zeros((len(G.vertices), K))
    for i in range(K):
        true_indicator_vectors[true_clusters[i], i] = 1
    true_indicator_vectors = degree_correction(true_indicator_vectors, D_sqrt)
    alpha = true_indicator_vectors.T @ normalized_L_eigenvectors[:, 0:K]
    true_value = K - np.sum(np.sum(alpha ** 2, axis=1), axis=0)

    recursive_ST = K - np.sum(compute_bounds_RST(normalized_L_eigenvalues, rayleigh_quotients, q))
    general_ST = K - np.sum(compute_bounds_RST(normalized_L_eigenvalues, rayleigh_quotients, [K]))

    return {'Recursive ST': recursive_ST,
            'General ST': general_ST,
            'ST Standard': ST_standard,
            'True Value': true_value}

In [None]:
# construct a graph from the data points using a threshold
def construct_graph(X, threshold):
    """Return the 0/1 adjacency matrix linking points closer than ``threshold``.

    X is an array whose rows are the data points.
    threshold is the distance below which (strictly) two distinct points are
    connected; the distance is ``np.linalg.norm`` of the row difference.

    Returns a float array of shape (N, N), N = X.shape[0], with a zero diagonal.
    """
    point_count = X.shape[0]
    adjacency = np.zeros((point_count, point_count))
    for row, col in np.ndindex(point_count, point_count):
        if row != col and np.linalg.norm(X[row] - X[col]) < threshold:
            adjacency[row, col] = 1
    return adjacency

## Varying n at the Threshold
We take an SBM with two clusters and connection-probability matrix $P$ defined as
$$ P = \begin{pmatrix}
p & q \\
q & p \\
\end{pmatrix}$$
where $p = \frac{\alpha\log(N)}{N}$, $q = \frac{\beta \log(N)}{N}$, $N$ is the total number of vertices,
and $\sqrt{\alpha} - \sqrt{\beta} \geq \sqrt{2}$.

In this experiment we will fix the values of $\alpha, \beta$ at the threshold with $\beta = 20$.

In [None]:
# Experiment: two-cluster SBM at the threshold with beta = 20, varying the cluster size n.
K = 2
b = 20
a = (np.sqrt(b) + np.sqrt(K))**2  # alpha with sqrt(alpha) - sqrt(beta) = sqrt(K)

def get_P(a,b,N):
    """Return [[p, q], [q, p]] with p = a*log(N)/N and q = b*log(N)/N."""
    p = a * np.log(N) / N
    q = b * np.log(N) / N
    P = np.array([[p,q],[q,p]])
    return P

# Seed the global NumPy RNG so that the sampled graphs, and therefore the saved
# CSV and figures, are reproducible (k-means is already seeded with random_state=0).
np.random.seed(0)

bounds = {}
sample_size = 10
for n in [200,300,400,500,600,700,800,900,1000]:
    N = K*n
    bounds[n] = 0
    P = get_P(a,b,N)
    for _ in range(sample_size):
        edges = []
        for i in range(K):
            for j in range(i,K):
                # One uniform draw per candidate pair (u, v) of the block (i, j).
                present = np.random.rand(n, n) <= P[i,j]
                if i == j:
                    # Inside a cluster keep only u < v: no self-loops, each pair once.
                    present = np.triu(present, k=1)
                us, vs = np.nonzero(present)
                edges.extend(zip((i * n + us).tolist(), (j * n + vs).tolist()))

        true_clusters = [list(range(i*n, (i+1)*n)) for i in range(K)]
        G = Graph(vertices = list(range(n * K)), edges = edges)
        bounds[n] = bounds[n] + pd.Series(compute_all_bounds(G, K, [1,2], true_clusters = true_clusters))
    # Average over the sampled graphs.
    bounds[n] = bounds[n] / sample_size

In [None]:
# Figure: bounds on (1/2) * sum_i ||f_i - g_hat_i||^2 for the 2-cluster SBM at
# the threshold with beta = 20, against the cluster size n (linear y-axis).
df = pd.DataFrame(bounds).T  # one row per n, one column per bound
columns = ['Corollary 5', 'General ST', r'$\frac{\rho(2)}{\lambda_3}$', 'True Value']
df.columns = columns
# Save the averaged results, then draw the figure from the file that was written.
df.to_csv("Data/ThresholdVaryingNBeta20.csv")
df = pd.read_csv("Data/ThresholdVaryingNBeta20.csv").set_index(["Unnamed: 0"])
plotted = df.loc[200:,:] / 2
plotted.plot(marker = 'o', xlabel = 'n', ylabel = 'Error', figsize = (10,10))
plt.xlabel(r'Cluster size $n$')
plt.ylabel(r'Error')
plt.grid(True)
# One tick per plotted cluster size; np.arange(200,1000,100) stopped at 900 and
# left the last data point (n = 1000) without a tick.
plt.xticks(plotted.index)
plt.legend(fontsize='large', bbox_to_anchor = (1.0,1.05))
plt.savefig("Data/ThresholdVaryingNBeta20.png", bbox_inches="tight")

In [None]:
# Figure: same beta = 20 results as a function of the cluster size n, drawn
# directly from `bounds` with a log-scaled y-axis.
df = pd.DataFrame(bounds).T  # one row per n, one column per bound
columns = ['Recursive ST', 'General ST', r'$\frac{\rho(2)}{\lambda_3}$', 'True Value']
df.columns = columns
plotted = df.loc[200:,:] / 2
plotted.plot(marker = 'o', xlabel = 'n', ylabel = 'Bound Value', figsize = (10,10), logy = True)
plt.xlabel(r'Cluster size $n$')
plt.ylabel(r'Error')
plt.grid(True)
# One tick per plotted cluster size; np.arange(200,1000,100) stopped at 900 and
# left the last data point (n = 1000) without a tick.
plt.xticks(plotted.index)
plt.legend(fontsize='large', bbox_to_anchor = (1.0,1.05))
plt.savefig("Data/ThresholdVaryingNBeta20LogScale.png", bbox_inches="tight")

In [None]:
# Redraw the beta = 20 results from the saved CSV with a log-scaled y-axis,
# larger fonts and alternative legend labels.
df_copy = pd.read_csv("Data/ThresholdVaryingNBeta20.csv").set_index(["Unnamed: 0"])
df_copy.columns = ["Theorem 4", "Theorem 1", "Macgregor & Sun", "True Value"]
ax = (df_copy.loc[200:] / 2).plot(
    marker='o', xlabel='n', ylabel='Bound Value', figsize=(12, 10), logy=True)
ax.set_xlabel(r'Cluster size $n$', fontsize=30)
ax.set_ylabel(r'Error', fontsize=30)
ax.grid(True)
# Enlarge the tick labels on both axes.
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
ax.legend(fontsize=25, bbox_to_anchor=(1.0, 1.05))
plt.savefig('Data/ThresholdVaryingNBeta20LogScale.png', bbox_inches="tight")

In [None]:
# Experiment: two-cluster SBM at the threshold with beta = 1, varying the cluster size n.
K = 2
b = 1
a = (np.sqrt(b) + np.sqrt(K)) ** 2  # alpha with sqrt(alpha) - sqrt(beta) = sqrt(K)


def get_P(a,b,N):
    """Return [[p, q], [q, p]] with p = a*log(N)/N and q = b*log(N)/N."""
    p = a * np.log(N) / N
    q = b * np.log(N) / N
    P = np.array([[p,q],[q,p]])
    return P


# Seed the global NumPy RNG so that the sampled graphs, and therefore the saved
# CSV and figures, are reproducible (k-means is already seeded with random_state=0).
np.random.seed(0)

bounds = {}
sample_size = 10
for n in [200, 300, 400, 500, 600, 700, 800, 900, 1000]:
    N = K*n
    bounds[n] = 0
    P = get_P(a, b, N)
    for _ in range(sample_size):
        edges = []
        for i in range(K):
            for j in range(i, K):
                # One uniform draw per candidate pair (u, v) of the block (i, j).
                present = np.random.rand(n, n) <= P[i, j]
                if i == j:
                    # Inside a cluster keep only u < v: no self-loops, each pair once.
                    present = np.triu(present, k=1)
                us, vs = np.nonzero(present)
                edges.extend(zip((i * n + us).tolist(), (j * n + vs).tolist()))

        true_clusters = [list(range(i * n, (i + 1) * n)) for i in range(K)]
        G = Graph(vertices=list(range(n * K)), edges=edges)
        bounds[n] = bounds[n] + pd.Series(compute_all_bounds(G, K, [1, 2], true_clusters=true_clusters))
    # Average over the sampled graphs.
    bounds[n] = bounds[n] / sample_size

In [None]:
# Figure: bounds for the 2-cluster SBM at the threshold with beta = 1, against
# the cluster size n (linear y-axis).
df = pd.DataFrame(bounds).T  # one row per n, one column per bound
columns = ['Corollary 5', 'General ST', r'$\frac{\rho(2)}{\lambda_3}$', 'True Value']
df.columns = columns
df.to_csv("Data/ThresholdVaryingNBeta1.csv")
plotted = df.loc[200:, :] / 2
plotted.plot(marker='o', xlabel='n', ylabel='Bound Value', figsize=(10, 10))
plt.xlabel(r'Cluster size $n$')
plt.ylabel(r'Error')
plt.grid(True)
# One tick per plotted cluster size; np.arange(200, 1000, 100) stopped at 900
# and left the last data point (n = 1000) without a tick.
plt.xticks(plotted.index)
plt.legend(fontsize='large', bbox_to_anchor=(1.0, 1.05))
# Written next to the CSV under Data/, like every other output of this notebook.
plt.savefig("Data/ThresholdVaryingNBeta1.png", bbox_inches="tight")

In [None]:
# Rebuild the beta = 1 results table from `bounds` (one row per n) with display names.
columns = ['Corollary 5', 'General ST', r'$\frac{\rho(2)}{\lambda_3}$', 'True Value']
df = pd.DataFrame(bounds).T
df.columns = columns

In [None]:

# Figure: beta = 1 results redrawn from the saved CSV with a log-scaled y-axis
# and larger fonts.
df = pd.read_csv("Data/ThresholdVaryingNBeta1.csv")
df = df.set_index(["Unnamed: 0"])
plotted = df.loc[200:, :] / 2
plotted.plot(marker='o', xlabel='n', ylabel='Bound Value', figsize=(10, 10), logy=True)
plt.xlabel(r'Cluster size $n$',fontsize=30)
plt.ylabel(r'Error',fontsize=30)
plt.grid(True)
# One tick per plotted cluster size; np.arange(200, 1000, 100) stopped at 900
# and left the last data point (n = 1000) without a tick.
plt.xticks(plotted.index, fontsize=25)
plt.yticks(fontsize=25)
# Single legend call: a second plt.legend(fontsize='large', ...) used to follow
# and silently replaced this one, undoing the larger font.
plt.legend(fontsize=25, bbox_to_anchor = (1.0,1.05))
# bbox_inches="tight" keeps the legend, which is anchored outside the axes, inside the saved image.
plt.savefig("Data/ThresholdVaryingNBeta1LogScale.png", bbox_inches="tight")

We now repeat the same experiment with $n$ fixed and $\beta$ varying. We fix $n = 500$ and let $\beta$ run over the integers from 5 to 19 (the range used in the code below), setting $\alpha = (\sqrt{\beta} + \sqrt{2})^2$.

In [None]:
# Experiment: two-cluster SBM at the threshold with cluster size n = 500, varying beta.
K = 2

def get_P(b,N):
    """Return [[p, q], [q, p]] at the threshold for the given beta.

    alpha is set to (sqrt(b) + sqrt(K))**2 using the notebook-level K, and
    p = alpha*log(N)/N, q = b*log(N)/N.
    """
    a = (np.sqrt(b) + np.sqrt(K))**2
    p = a * np.log(N) / N
    q = b * np.log(N) / N
    P = np.array([[p,q],[q,p]])
    return P

# Seed the global NumPy RNG so that the sampled graphs, and therefore the saved
# CSV and figures, are reproducible (k-means is already seeded with random_state=0).
np.random.seed(0)

bounds = {}
sample_size = 1  # a single sampled graph per value of beta
n = 500
for b in range(5,20):
    bounds[b] = 0
    N = K*n
    P = get_P(b,N)
    for _ in range(sample_size):
        edges = []
        for i in range(K):
            for j in range(i,K):
                # One uniform draw per candidate pair (u, v) of the block (i, j).
                present = np.random.rand(n, n) <= P[i,j]
                if i == j:
                    # Inside a cluster keep only u < v: no self-loops, each pair once.
                    present = np.triu(present, k=1)
                us, vs = np.nonzero(present)
                edges.extend(zip((i * n + us).tolist(), (j * n + vs).tolist()))

        true_clusters = [list(range(i*n, (i+1)*n)) for i in range(K)]
        G = Graph(vertices = list(range(n * K)), edges = edges)
        bounds[b] = bounds[b] + pd.Series(compute_all_bounds(G, K, [1,2], true_clusters = true_clusters))
    # Average over the sampled graphs.
    bounds[b] = bounds[b] / sample_size

In [None]:
# Figure: bounds on (1/2) * sum_i ||f_i - g_hat_i||^2 for the 2-cluster SBM at
# the threshold, varying beta with cluster size n = 500 (linear y-axis).
columns = ['Corollary 5', 'General ST', r'$\frac{\rho(2)}{\lambda_3}$', 'True Value']
df = pd.DataFrame(bounds).T  # one row per beta, one column per bound
df.columns = columns
df.to_csv("Data/ThresholdVaryingBetaAtThreshold.csv")
ax = (df.loc[1:] / 2).plot(marker='o', xlabel='n', ylabel='Bound Value', figsize=(10, 10))
# The axis labels passed to plot() above are replaced by the final ones here.
ax.set_xlabel(r'$\beta$')
ax.set_ylabel(r'Error')
ax.grid(True)
ax.legend(fontsize='large', bbox_to_anchor=(1.0, 1.05))
plt.savefig("Data/ThresholdVaryingBetaAtThreshold.png", bbox_inches="tight")

In [None]:
# Figure: same varying-beta results (n = 500), drawn directly from `bounds`
# with a log-scaled y-axis.
columns = ['Corollary 5', 'General ST', r'$\frac{\rho(2)}{\lambda_3}$', 'True Value']
df = pd.DataFrame(bounds).T  # one row per beta, one column per bound
df.columns = columns
ax = (df.loc[1:] / 2).plot(
    marker='o', xlabel='n', ylabel='Bound Value', figsize=(10, 10), logy=True)
# The axis labels passed to plot() above are replaced by the final ones here.
ax.set_xlabel(r'$\beta$')
ax.set_ylabel(r'Error')
ax.grid(True)
ax.legend(fontsize='large', bbox_to_anchor=(1.0, 1.05))
plt.savefig("Data/ThresholdVaryingBetaAtThresholdLogScale.png", bbox_inches="tight")

In [None]:
# Redraw the varying-beta results from the saved CSV with a log-scaled y-axis
# and alternative legend labels.
df_copy = pd.read_csv("Data/ThresholdVaryingBetaAtThreshold.csv").set_index(["Unnamed: 0"])
df_copy.columns = ["Theorem 4", "Theorem 1", "Macgregor & Sun", "True Value"]
ax = (df_copy.loc[1:] / 2).plot(
    marker='o', xlabel='n', ylabel='Bound Value', figsize=(12, 10), logy=True)
# The axis labels passed to plot() above are replaced by the final ones here.
ax.set_xlabel(r'$\beta$', fontsize=20)
ax.set_ylabel(r'Error', fontsize=20)
ax.grid(True)
ax.legend(fontsize='large', bbox_to_anchor=(1.0, 1.05))
plt.savefig('Data/ThresholdVaryingBetaAtThresholdLogScale.png', bbox_inches="tight")

Now we fix a larger value of $\alpha$ ($\alpha = 35$ in the code below) and increase $\beta$.

In [None]:
# Experiment: two-cluster SBM with alpha fixed at 35 and cluster size n = 500, varying beta.
K = 2
max_b = 20
a = 35

def get_P(b, n):
    """Return [[p, q], [q, p]] with p = a*log(n)/n and q = b*log(n)/n (a is the notebook-level alpha)."""
    p = a * np.log(n) / n
    q = b * np.log(n) / n
    P = np.array([[p, q], [q, p]])
    return P


# Seed the global NumPy RNG so that the sampled graphs, and therefore the saved
# CSV and figures, are reproducible (k-means is already seeded with random_state=0).
np.random.seed(0)

bounds = {}
sample_size = 10
n = 500

for b in range(10, max_b + 10):
    bounds[b] = 0
    # NOTE(review): the other experiments pass the total vertex count K*n to
    # get_P; here the cluster size n is used instead - confirm this is intended.
    P = get_P(b, n)
    for _ in range(sample_size):
        edges = []
        for i in range(K):
            for j in range(i, K):
                # One uniform draw per candidate pair (u, v) of the block (i, j).
                present = np.random.rand(n, n) <= P[i, j]
                if i == j:
                    # Inside a cluster keep only u < v: no self-loops, each pair once.
                    present = np.triu(present, k=1)
                us, vs = np.nonzero(present)
                edges.extend(zip((i * n + us).tolist(), (j * n + vs).tolist()))

        true_clusters = [list(range(i * n, (i + 1) * n)) for i in range(K)]
        G = Graph(vertices=list(range(n * K)), edges=edges)
        bounds[b] = bounds[b] + pd.Series(compute_all_bounds(G, K, [1, 2], true_clusters=true_clusters))
    # Average over the sampled graphs.
    bounds[b] = bounds[b] / sample_size
    


In [None]:
    
# Figure: bounds for the 2-cluster SBM with fixed alpha, varying beta (linear y-axis).
columns = ['Corollary 5', 'General ST', r'$\frac{\rho(2)}{\lambda_3}$', 'True Value']
df = pd.DataFrame(bounds).T  # one row per beta, one column per bound
df.columns = columns
df.to_csv("Data/ThresholdVaryingBetaFixedAlpha.csv")
ax = (df.loc[1:] / 2).plot(marker='o', xlabel='n', ylabel='Bound Value', figsize=(10, 10))
# The axis labels passed to plot() above are replaced by the final ones here.
ax.set_xlabel(r'$\beta$')
ax.set_ylabel(r'Error')
ax.grid(True)
ax.legend(fontsize='large', bbox_to_anchor=(1.0, 1.05))
plt.savefig("Data/ThresholdVaryingBetaFixedAlpha.png", bbox_inches="tight")

In [None]:
# Figure: same fixed-alpha results with a log-scaled y-axis; the CSV is written again.
columns = ['Corollary 5', 'General ST', r'$\frac{\rho(2)}{\lambda_3}$', 'True Value']
df = pd.DataFrame(bounds).T  # one row per beta, one column per bound
df.columns = columns
df.to_csv("Data/ThresholdVaryingBetaFixedAlpha.csv")
ax = (df.loc[1:] / 2).plot(
    marker='o', xlabel='n', ylabel='Bound Value', figsize=(10, 10), logy=True)
# The axis labels passed to plot() above are replaced by the final ones here.
ax.set_xlabel(r'$\beta$')
ax.set_ylabel(r'Error')
ax.grid(True)
ax.legend(fontsize='large', bbox_to_anchor=(1.0, 1.05))
plt.savefig("Data/ThresholdVaryingBetaFixedAlphaLogScale.png", bbox_inches="tight")

In [None]:
# Redraw the fixed-alpha results from the saved CSV with a log-scaled y-axis
# and alternative legend labels.
df_copy = pd.read_csv("Data/ThresholdVaryingBetaFixedAlpha.csv").set_index(["Unnamed: 0"])
df_copy.columns = ["Theorem 4", "Theorem 1", "Macgregor & Sun", "True Value"]
ax = (df_copy.loc[1:] / 2).plot(
    marker='o', xlabel='n', ylabel='Bound Value', figsize=(12, 10), logy=True)
# The axis labels passed to plot() above are replaced by the final ones here.
ax.set_xlabel(r'$\beta$', fontsize=20)
ax.set_ylabel(r'Error', fontsize=20)
ax.grid(True)
ax.legend(fontsize='large', bbox_to_anchor=(1.0, 1.05))
plt.savefig('Data/ThresholdVaryingBetaFixedAlphaLogScale.png', bbox_inches="tight")