In [47]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.metrics import pairwise_distances
from sklearn import metrics
import os
import networkx as nx
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

%matplotlib inline

In [48]:
data_dir = os.getcwd()
cora_location = os.path.expanduser(os.path.join(data_dir, "cora/cora.cites"))
g_nx = nx.read_edgelist(path=cora_location)


cora_data_location = os.path.expanduser(os.path.join(data_dir, "cora/cora.content"))
node_attr = pd.read_csv(cora_data_location, sep='\t', header=None)
values = { str(row.tolist()[0]): row.tolist()[-1] for _, row in node_attr.iterrows()}
nx.set_node_attributes(g_nx, values, 'subject')

feature_names = ["w_{}".format(ii) for ii in range(1433)]
column_names =  feature_names + ["subject"]
node_data = pd.read_table(os.path.join(data_dir, "cora/cora.content"), header=None, names=column_names)



In [49]:
g_nx_ccs = (g_nx.subgraph(c).copy() for c in nx.connected_components(g_nx))
g_nx = max(g_nx_ccs, key=len)
node_ids = list(g_nx.nodes())
print("Largest subgraph statistics: {} nodes, {} edges".format(
    g_nx.number_of_nodes(), g_nx.number_of_edges()))

node_targets = [ g_nx.nodes[node_id]['subject'] for node_id in node_ids]

print(f"There are {len(np.unique(node_targets))} unique labels on the nodes.")

print(f"There are {len(g_nx.nodes())} nodes in the network.")


Largest subgraph statistics: 2485 nodes, 5069 edges
There are 7 unique labels on the nodes.
There are 2485 nodes in the network.


In [50]:

node_ids = [int(i) for i in node_ids] 

In [62]:
#node_data.loc[node_ids]

KMean_matrix = node_data.loc[node_ids]
KMean_matrix = KMean_matrix.iloc[:, 0:1433]
print(f"There are {KMean_matrix.shape} dimensions in the network.")

There are (2485, 1433) dimensions in the network.


In [63]:
s = set(node_data["subject"])
#build a dictionary to convert string to numbers
convert_table = {e:idx for idx, e in enumerate(s)}

def word2idx(word):
    return convert_table[word]

ground_truth =  [word2idx(i) for i in node_targets]

In [83]:
n = 7 #number of clusters
#Spectral Clustering method
model = KMeans(n_clusters = n, n_init=100)
#model.fit(vecs[:,1])
labels = model.fit_predict(KMean_matrix.iloc[:, 0:500])

#print(ground_truth)
#print(labels)
print(metrics.adjusted_rand_score(ground_truth, labels))
print(metrics.adjusted_mutual_info_score(ground_truth, labels))
print(metrics.accuracy_score(ground_truth, labels))


0.05729178335580698
0.16614571477465204
0.22937625754527163
