In [1]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.metrics import pairwise_distances
from sklearn import metrics
import os
import networkx as nx
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

%matplotlib inline

In [2]:
data_dir = os.getcwd()
edge_location = os.path.expanduser(os.path.join(data_dir, "fb-CMU-Carnegie49/fb-CMU-Carnegie49.edges"))
g_nx = nx.read_edgelist(path=edge_location)


class_data_location = os.path.expanduser(os.path.join(data_dir, "fb-CMU-Carnegie49/fb-CMU-Carnegie49.node_labels"))
node_attr = pd.read_csv(class_data_location, sep=',', header=None)
values = { str(row.tolist()[0]): row.tolist()[-1] for _, row in node_attr.iterrows()}
nx.set_node_attributes(g_nx, values, 'class')

column_names =  ["node_id" ,"class"]
node_data = pd.read_csv(os.path.join(data_dir, "fb-CMU-Carnegie49/fb-CMU-Carnegie49.node_labels"), header=None, names=column_names)



In [3]:
g_nx_ccs = (g_nx.subgraph(c).copy() for c in nx.connected_components(g_nx))
g_nx = max(g_nx_ccs, key=len)
node_ids = list(g_nx.nodes())
print("Largest subgraph statistics: {} nodes, {} edges".format(
    g_nx.number_of_nodes(), g_nx.number_of_edges()))

node_targets = [ g_nx.nodes[node_id]['class'] for node_id in node_ids]

print(f"There are {len(np.unique(node_targets))} unique labels on the nodes.")

print(f"There are {len(g_nx.nodes())} nodes in the network.")


Largest subgraph statistics: 6621 nodes, 249959 edges
There are 3 unique labels on the nodes.
There are 6621 nodes in the network.


In [4]:
s = set(node_data["class"])
print(s)
convert_table = {e:idx for idx, e in enumerate(s)}

def word2idx(word):
    return convert_table[word]

ground_truth =  [word2idx(i) for i in node_targets]

print(ground_truth)

{1, 2, 3}
[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1, 0, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 0, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 0, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 0, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 0, 0, 2, 1, 2, 1, 1, 1, 1, 2, 0, 1, 1, 2, 0, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 0, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1, 0, 1, 2, 1, 2, 2, 0, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2, 0, 2, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2,

In [5]:
A = nx.to_numpy_array(g_nx) 

In [6]:
D = np.diag(A.sum(axis=1))
print(D)

[[17.  0.  0. ...  0.  0.  0.]
 [ 0. 76.  0. ...  0.  0.  0.]
 [ 0.  0. 47. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ...  1.  0.  0.]
 [ 0.  0.  0. ...  0.  1.  0.]
 [ 0.  0.  0. ...  0.  0.  1.]]


In [7]:
L = D-A
print(L)

[[17. -1. -1. ...  0.  0.  0.]
 [-1. 76.  0. ...  0.  0.  0.]
 [-1.  0. 47. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ...  1.  0.  0.]
 [ 0.  0.  0. ...  0.  1.  0.]
 [ 0.  0.  0. ...  0.  0.  1.]]


In [8]:
# eigenvalues, eigenvectors = np.linalg.eig(L) 
# eigenvalues = np.real(eigenvalues)
# eigenvectors = np.real(eigenvectors)

# order = np.argsort(eigenvalues)  
# eigenvalues = eigenvalues[order]

In [9]:
# embedding_size = 32
# v_0 = eigenvectors[:, order[0]]
# v = eigenvectors[:, order[1:(embedding_size+1)]] 

In [14]:
n = 3 #number of clusters
#Spectral Clustering method

model = SpectralClustering(n_clusters = n, n_init=100,assign_labels='discretize', affinity='precomputed')
labels = model.fit_predict(A)

In [11]:
# n = 3 #number of clusters
# #Spectral Clustering method
# model = SpectralClustering(n_clusters = n, n_init=100,assign_labels='kmeans', affinity='precomputed', n_components=9)
# #model.fit(vecs[:,1])
# model.fit(A)



In [12]:
print(metrics.adjusted_rand_score(ground_truth, labels))
print(metrics.adjusted_mutual_info_score(ground_truth, labels))
print(metrics.accuracy_score(ground_truth, labels))
print(ground_truth)
print(labels)

AttributeError: 'SpectralClustering' object has no attribute 'labels_'

In [13]:
for i in range(0,6500,500):
    print(model.labels_[i:i+500])

AttributeError: 'SpectralClustering' object has no attribute 'labels_'