In [34]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.metrics import pairwise_distances
from sklearn import metrics
import os
import networkx as nx
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

%matplotlib inline

In [35]:
data_dir = os.getcwd()
cora_location = os.path.expanduser(os.path.join(data_dir, "cora/cora.cites"))
g_nx = nx.read_edgelist(path=cora_location)


cora_data_location = os.path.expanduser(os.path.join(data_dir, "cora/cora.content"))
node_attr = pd.read_csv(cora_data_location, sep='\t', header=None)
values = { str(row.tolist()[0]): row.tolist()[-1] for _, row in node_attr.iterrows()}
nx.set_node_attributes(g_nx, values, 'subject')

feature_names = ["w_{}".format(ii) for ii in range(1433)]
column_names =  feature_names + ["subject"]
node_data = pd.read_table(os.path.join(data_dir, "cora/cora.content"), header=None, names=column_names)



In [36]:
g_nx_ccs = (g_nx.subgraph(c).copy() for c in nx.connected_components(g_nx))
g_nx = max(g_nx_ccs, key=len)
node_ids = list(g_nx.nodes())
print("Largest subgraph statistics: {} nodes, {} edges".format(
    g_nx.number_of_nodes(), g_nx.number_of_edges()))

node_targets = [ g_nx.nodes[node_id]['subject'] for node_id in node_ids]

print(f"There are {len(np.unique(node_targets))} unique labels on the nodes.")

print(f"There are {len(g_nx.nodes())} nodes in the network.")


Largest subgraph statistics: 2485 nodes, 5069 edges
There are 7 unique labels on the nodes.
There are 2485 nodes in the network.


In [37]:
s = set(node_data["subject"])
#build a dictionary to convert string to numbers
convert_table = {e:idx for idx, e in enumerate(s)}

def word2idx(word):
    return convert_table[word]

ground_truth =  [word2idx(i) for i in node_targets]

In [38]:
A = nx.to_numpy_array(g_nx) 

In [39]:
D = np.diag(A.sum(axis=1))
print(D)

[[168.   0.   0. ...   0.   0.   0.]
 [  0.   5.   0. ...   0.   0.   0.]
 [  0.   0.   6. ...   0.   0.   0.]
 ...
 [  0.   0.   0. ...   4.   0.   0.]
 [  0.   0.   0. ...   0.   4.   0.]
 [  0.   0.   0. ...   0.   0.   2.]]


In [40]:
L = D-A
print(L)

[[168.  -1.  -1. ...   0.   0.   0.]
 [ -1.   5.   0. ...   0.   0.   0.]
 [ -1.   0.   6. ...   0.   0.   0.]
 ...
 [  0.   0.   0. ...   4.  -1.  -1.]
 [  0.   0.   0. ...  -1.   4.   0.]
 [  0.   0.   0. ...  -1.   0.   2.]]


In [41]:
eigenvalues, eigenvectors = np.linalg.eig(L) 
eigenvalues = np.real(eigenvalues)
eigenvectors = np.real(eigenvectors)

order = np.argsort(eigenvalues)  
eigenvalues = eigenvalues[order]

In [42]:
embedding_size = 32
v_0 = eigenvectors[:, order[0]]
v = eigenvectors[:, order[1:(embedding_size+1)]] 

In [43]:
convert_table

{'Theory': 0,
 'Reinforcement_Learning': 1,
 'Neural_Networks': 2,
 'Probabilistic_Methods': 3,
 'Case_Based': 4,
 'Rule_Learning': 5,
 'Genetic_Algorithms': 6}

In [48]:
n = 7 #number of clusters
#Spectral Clustering method
model = SpectralClustering(n_clusters = n, n_init=100,assign_labels='discretize')
#model.fit(vecs[:,1])
labels = model.fit_predict(v[:,1:8])

#labels = [abs(i-1) for i in labels]
print(metrics.adjusted_rand_score(ground_truth, labels))
print(metrics.adjusted_mutual_info_score(ground_truth, labels))
print(metrics.accuracy_score(ground_truth, labels))
print(ground_truth)
print(labels)

0.003947727630004761
0.16392265963104333
0.16257545271629778
[6, 6, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 6, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 6, 6, 6, 6, 6, 2, 6, 6, 6, 6, 4, 1, 6, 6, 6, 6, 6, 6, 6, 2, 6, 6, 6, 6, 6, 6, 6, 0, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 1, 6, 6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 2, 4, 4, 2, 4, 4, 3, 3, 3, 3, 3, 3, 4, 1, 1, 1, 1, 1, 2, 4, 6, 1, 0, 2, 2, 0, 4, 6, 1, 1, 1, 1, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4,

array([[ 0.00211898, -0.00599277,  0.00367151, ..., -0.00094008,
        -0.00019546, -0.00015991],
       [ 0.002144  , -0.00611701,  0.00381872, ...,  0.00051456,
         0.00056262, -0.00069991],
       [ 0.00183635, -0.00473534,  0.00298016, ...,  0.00390489,
         0.0020503 , -0.00024456],
       ...,
       [ 0.00133535, -0.00346527,  0.00254527, ..., -0.0030409 ,
        -0.00621298,  0.00129151],
       [ 0.00138206, -0.00344062,  0.00257468, ..., -0.00381997,
        -0.00629437,  0.00179128],
       [ 0.00117545, -0.00345737,  0.0023799 , ..., -0.00019461,
        -0.00491115, -0.00043305]])