In [40]:
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE
import random
import warnings
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, f1_score
import community as comm
from community import community_louvain
warnings.filterwarnings('ignore')
%run ./helper.ipynb
#%run ./prediction.ipynb


In [7]:
# Input files: the edge list of the graph and the per-node class labels.
graphfile = 'cora.txt'
labelfile = 'cora.nodes.labels'
# Read the edge list through `graphfile` so the path is defined in one place only.
# nodetype=None leaves the node ids as read from the file (no type conversion).
G = nx.read_edgelist(graphfile, nodetype=None)
# Work on the directed view of the graph for the random walks below.
G = G.to_directed()
print("Number of nodes: ", G.number_of_nodes())
print("Number of edges: ", G.number_of_edges())

Number of nodes:  2708
Number of edges:  10556


<h4>Question 1</h4>



<h6>Question 1a</h6>

In [9]:
# Walk-length sweep, first setting: walk_length=1 with num_walks=80.
# deepwalk_walks / get_embedding are not defined in this notebook; presumably they
# come from helper.ipynb, which is executed with %run in the import cell.
walks_deepwalk1 = deepwalk_walks(G, walk_length=1, num_walks=80)
embeddings_deepwalk1 = get_embedding(G,walks_deepwalk1)

Learning embedding vectors...
Learning embedding vectors done!


In [10]:
walks_deepwalk5 = deepwalk_walks(G, walk_length=5, num_walks=80)
embeddings_deepwalk5 = get_embedding(G,walks_deepwalk5)

Learning embedding vectors...
Learning embedding vectors done!


In [11]:
walks_deepwalk10 = deepwalk_walks(G, walk_length=10, num_walks=80)
embeddings_deepwalk10 = get_embedding(G,walks_deepwalk10)

Learning embedding vectors...
Learning embedding vectors done!


In [12]:
walks_deepwalk20 = deepwalk_walks(G, walk_length=20, num_walks=80)
embeddings_deepwalk20 = get_embedding(G,walks_deepwalk20)

Learning embedding vectors...
Learning embedding vectors done!


In [13]:
walks_deepwalk50 = deepwalk_walks(G, walk_length=50, num_walks=80)
embeddings_deepwalk50 = get_embedding(G,walks_deepwalk50)

Learning embedding vectors...
Learning embedding vectors done!


In [15]:
def node_classification(embeddings, label, train_fracs=(.7,)):
    """Evaluate node classification on `embeddings` for each training fraction.

    Parameters
    ----------
    embeddings :
        Node embeddings; passed unchanged to ``split_train_evaluate``.
    label :
        Path of the node-label file, read with
        ``read_node_label(label, skip_head=True)``.
    train_fracs : iterable of float, optional
        Fractions of the labelled nodes used for training. Defaults to
        ``(0.7,)``, which reproduces the original single 70% run.

    Returns
    -------
    None
        The return value of ``split_train_evaluate`` is ignored; the scores
        are presumably printed by that helper (defined outside this notebook).
    """
    X, Y = read_node_label(label, skip_head=True)

    for tf in train_fracs:
        print("Training classifier using {:.2f}% nodes...".format(tf * 100))
        split_train_evaluate(X, Y, embeddings, tf)

In [16]:
node_classification(embeddings_deepwalk1, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.29484029484029484}
-------------------


In [19]:
node_classification(embeddings_deepwalk5, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.769041769041769}
-------------------


In [20]:
node_classification(embeddings_deepwalk10, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.7641277641277642}
-------------------


In [21]:
node_classification(embeddings_deepwalk20, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.7985257985257985}
-------------------


In [22]:
node_classification(embeddings_deepwalk50, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.8083538083538083}
-------------------


<h6>Question 1b</h6>

In [66]:
def makeLinkPredictionData(graph, embeddings, train_fracs=(.7,), seed=None):
    """Build an edge / non-edge data set from node embeddings and score a
    logistic-regression link predictor on it.

    For every node ``u`` each neighbour ``n > u`` yields one positive sample
    (so each undirected edge is used once), and the same number of randomly
    drawn non-neighbours yields the negative samples. The feature vector of a
    pair is the element-wise difference of the two embeddings.

    Parameters
    ----------
    graph :
        Graph whose nodes are the integers ``0 .. N-1``; only
        ``number_of_nodes()`` and ``neighbors(u)`` are used.
    embeddings :
        Mapping from node id *string* to embedding vector. Node ``i`` of
        ``graph`` is looked up as ``embeddings[str(i + 1)]``, i.e. the
        embeddings are assumed to be keyed by the 1-based ids of the original
        edge list -- TODO confirm against the caller's relabelling.
    train_fracs : iterable of float, optional
        Fractions of the shuffled samples used for training; the rest is the
        test set. Defaults to ``(0.7,)``.
    seed : int or None, optional
        When given, negative sampling and shuffling use private generators
        seeded with this value so the run is reproducible. ``None`` (default)
        uses the global ``random`` / ``np.random`` state as before.

    Returns
    -------
    None
        One ``Link predictions: ...`` line is printed per training fraction.

    Raises
    ------
    ValueError
        If the graph yields no samples at all (no edge ``(u, n)`` with
        ``n > u``).
    """
    num_nodes = graph.number_of_nodes()

    # Embedding matrix, row i = node i. Sized from the `graph` argument rather
    # than the notebook-global G the original code accidentally relied on.
    X = np.array([embeddings[str(i + 1)] for i in range(num_nodes)])

    # Either the shared module-level generators or private seeded ones.
    py_rng = random if seed is None else random.Random(seed)
    np_rng = np.random if seed is None else np.random.RandomState(seed)

    Xd = []
    Yd = []
    for u in range(num_nodes):
        Nu = list(graph.neighbors(u))
        # Nodes that must never be drawn as a negative partner of u: its
        # neighbours and u itself (a self-pair would give an all-zero vector).
        excluded = set(Nu)
        excluded.add(u)

        # Positive samples: each edge once, taken from its lower-numbered end.
        # NOTE(review): the signed difference depends on pair orientation and
        # positives always have u < n; an orientation-free feature such as the
        # absolute difference or Hadamard product may be worth evaluating.
        totalns = 0
        for n in Nu:
            if n > u:
                Xd.append(X[u] - X[n])
                Yd.append(1)  # positive sample (edge present)
                totalns += 1

        # Very high-degree nodes have few non-neighbours: request fewer negatives.
        if len(Nu) > num_nodes // 2:
            totalns = (num_nodes - len(Nu)) // 2
        # Never ask for more negatives than exist, otherwise the rejection
        # loop below could not terminate.
        totalns = min(totalns, num_nodes - len(excluded))

        # Negative samples: distinct random non-neighbours of u.
        sampled = set()
        while len(sampled) < totalns:
            nn = py_rng.randint(0, num_nodes - 1)
            if nn in excluded or nn in sampled:
                continue
            sampled.add(nn)
            Xd.append(X[u] - X[nn])
            Yd.append(0)  # negative sample (edge absent)

    if not Yd:
        raise ValueError("graph produced no link-prediction samples")

    Xd, Yd = np.array(Xd), np.array(Yd)
    indices = np.arange(len(Yd))
    np_rng.shuffle(indices)
    Xt = Xd[indices]
    Yt = Yd[indices]

    for tf in train_fracs:
        CV = int(len(Yt) * tf)
        trainX, testX = Xt[:CV], Xt[CV:]
        trainY, testY = Yt[:CV], Yt[CV:]
        modelLR = LogisticRegression().fit(trainX, trainY)
        predictedY = modelLR.predict(testX)
        # accuracy_score expects (y_true, y_pred).
        acc = accuracy_score(testY, predictedY)
        print("Link predictions:", tf, ":Accuracy:",acc)

In [26]:
# Undirected copy of G whose node labels are converted to integers and shifted
# down by one (label -> int(label) - 1), as the evaluation functions index
# nodes by integer position.
G1 = nx.relabel_nodes(G.to_undirected(), lambda label: int(label) - 1)

In [27]:
makeLinkPredictionData(G1, embeddings_deepwalk1) 

Link predictions: 0.7 :Accuracy: 0.5108935901484054


In [28]:
makeLinkPredictionData(G1, embeddings_deepwalk5) 

Link predictions: 0.7 :Accuracy: 0.5614145879381118


In [29]:
makeLinkPredictionData(G1, embeddings_deepwalk10) 

Link predictions: 0.7 :Accuracy: 0.588569624250079


In [30]:
makeLinkPredictionData(G1, embeddings_deepwalk20) 

Link predictions: 0.7 :Accuracy: 0.5879381117777076


In [31]:
makeLinkPredictionData(G1, embeddings_deepwalk50) 

Link predictions: 0.7 :Accuracy: 0.587622355541522


<h6>Question 1c</h6>

In [49]:
def cluster_eval(G, embeddings, cluster_counts=(10,)):
    """Cluster the node embeddings with k-means and print the modularity of
    each clustering on graph ``G``.

    Parameters
    ----------
    G :
        Graph whose nodes are the integers ``0 .. N-1``; it is passed to the
        modularity function together with the k-means partition.
    embeddings :
        Mapping from node id *string* to embedding vector. Node ``i`` of ``G``
        is looked up as ``embeddings[str(i + 1)]`` -- assumes the embeddings
        are keyed by the 1-based ids of the original edge list; TODO confirm.
    cluster_counts : iterable of int, optional
        Values of ``n_clusters`` to evaluate. Defaults to ``(10,)``, the single
        setting the original code ran.

    Returns
    -------
    None
        One ``Number of clusters: ...`` line is printed per cluster count.
    """
    # Embedding matrix, row i = node i.
    X = np.array([embeddings[str(i + 1)] for i in range(G.number_of_nodes())])

    for cls in cluster_counts:
        # k-means on the embeddings; random_state=0 is passed for a fixed seed.
        clusters = KMeans(n_clusters=cls, random_state=0).fit(X)
        # Partition in the {node: community} form taken by the modularity call.
        predG = dict(enumerate(clusters.labels_))

        # Modularity of the k-means partition on the graph.
        modularity = comm.community_louvain.modularity(predG, G)
        print("Number of clusters: ", cls, "  Modularity: ", modularity)

In [50]:
cluster_eval(G1, embeddings_deepwalk1)

Number of clusters:  10   Modularity:  0.0002124578979056206


In [51]:
cluster_eval(G1, embeddings_deepwalk5)

Number of clusters:  10   Modularity:  0.7583287372882439


In [52]:
cluster_eval(G1, embeddings_deepwalk10)

Number of clusters:  10   Modularity:  0.7575264875068223


In [53]:
cluster_eval(G1, embeddings_deepwalk20)

Number of clusters:  10   Modularity:  0.7380323760205768


In [54]:
cluster_eval(G1, embeddings_deepwalk50)

Number of clusters:  10   Modularity:  0.7325338500336214


In [55]:
# Number-of-walks sweep (walk_length fixed at 10), first setting: num_walks=1.
walks_deepwalk_n1 = deepwalk_walks(G, walk_length=10, num_walks=1)
embeddings_deepwalkn1 = get_embedding(G,walks_deepwalk_n1)

Learning embedding vectors...
Learning embedding vectors done!


In [56]:
walks_deepwalk_n10 = deepwalk_walks(G, walk_length=10, num_walks=10)
embeddings_deepwalkn10 = get_embedding(G,walks_deepwalk_n10)

Learning embedding vectors...
Learning embedding vectors done!


In [57]:
walks_deepwalk_n40 = deepwalk_walks(G, walk_length=10, num_walks=40)
embeddings_deepwalkn40 = get_embedding(G,walks_deepwalk_n40)

Learning embedding vectors...
Learning embedding vectors done!


In [58]:
walks_deepwalk_n80 = deepwalk_walks(G, walk_length=10, num_walks=80)
embeddings_deepwalkn80 = get_embedding(G,walks_deepwalk_n80)

Learning embedding vectors...
Learning embedding vectors done!


In [59]:
walks_deepwalk_n200 = deepwalk_walks(G, walk_length=10, num_walks=200)
embeddings_deepwalkn200 = get_embedding(G,walks_deepwalk_n200)

Learning embedding vectors...
Learning embedding vectors done!


<h6>Node classification</h6>

In [102]:
node_classification(embeddings_deepwalkn1, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.601965601965602}
-------------------


In [61]:
node_classification(embeddings_deepwalkn10, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.7592137592137592}
-------------------


In [62]:
node_classification(embeddings_deepwalkn40, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.7862407862407862}
-------------------


In [63]:
node_classification(embeddings_deepwalkn80, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.7936117936117936}
-------------------


In [64]:
node_classification(embeddings_deepwalkn200, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.800982800982801}
-------------------


<h6>Link prediction</h6>

In [67]:
makeLinkPredictionData(G1, embeddings_deepwalkn1) 

Link predictions: 0.7 :Accuracy: 0.5787811809283233


In [68]:
makeLinkPredictionData(G1, embeddings_deepwalkn10) 

Link predictions: 0.7 :Accuracy: 0.5345753078623303


In [69]:
makeLinkPredictionData(G1, embeddings_deepwalkn40) 

Link predictions: 0.7 :Accuracy: 0.5718345437322387


In [70]:
makeLinkPredictionData(G1, embeddings_deepwalkn80) 

Link predictions: 0.7 :Accuracy: 0.5797284496368803


In [71]:
makeLinkPredictionData(G1, embeddings_deepwalkn200) 

Link predictions: 0.7 :Accuracy: 0.6024628986422482


<h6>Cluster evaluation</h6>

In [72]:
cluster_eval(G1, embeddings_deepwalkn1)

Number of clusters:  10   Modularity:  0.5615763726284299


In [73]:
cluster_eval(G1, embeddings_deepwalkn10)

Number of clusters:  10   Modularity:  0.7660417469269437


In [75]:
cluster_eval(G1, embeddings_deepwalkn40)

Number of clusters:  10   Modularity:  0.7628934321091747


In [76]:
cluster_eval(G1, embeddings_deepwalkn80)

Number of clusters:  10   Modularity:  0.7557577041609657


In [77]:
cluster_eval(G1, embeddings_deepwalkn200)

Number of clusters:  10   Modularity:  0.7496526043242407


<h4>Question 3</h4>

In [80]:
# Embedding-size sweep (walk_length=10, num_walks=80), first setting: embed_size=2.
# NOTE(review): the variable name ends in "e1" but embed_size is 2 here -- confirm
# which value was intended before quoting these results.
walks_deepwalk = deepwalk_walks(G, walk_length=10, num_walks=80)
embeddings_deepwalke1 = get_embedding(G,walks_deepwalk, embed_size=2)

Learning embedding vectors...
Learning embedding vectors done!


In [81]:
walks_deepwalk = deepwalk_walks(G, walk_length=10, num_walks=80)
embeddings_deepwalke10 = get_embedding(G,walks_deepwalk, embed_size=10)

Learning embedding vectors...
Learning embedding vectors done!


In [83]:
walks_deepwalk = deepwalk_walks(G, walk_length=10, num_walks=80)
embeddings_deepwalke50 = get_embedding(G,walks_deepwalk, embed_size=50)

Learning embedding vectors...
Learning embedding vectors done!


In [84]:
walks_deepwalk = deepwalk_walks(G, walk_length=10, num_walks=80)
embeddings_deepwalke120 = get_embedding(G,walks_deepwalk, embed_size=120)

Learning embedding vectors...
Learning embedding vectors done!


In [85]:
walks_deepwalk = deepwalk_walks(G, walk_length=10, num_walks=80)
embeddings_deepwalke500 = get_embedding(G,walks_deepwalk, embed_size=500)

Learning embedding vectors...
Learning embedding vectors done!


<h6>Node classification</h6>

In [86]:
node_classification(embeddings_deepwalke1, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.3046683046683047}
-------------------


In [87]:
node_classification(embeddings_deepwalke10, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.7592137592137592}
-------------------


In [88]:
node_classification(embeddings_deepwalke50, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.7592137592137592}
-------------------


In [89]:
node_classification(embeddings_deepwalke120, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.800982800982801}
-------------------


In [90]:
node_classification(embeddings_deepwalke500, labelfile)

Training classifier using 70.00% nodes...
-------------------
{'acc': 0.7985257985257985}
-------------------


<h6>Link prediction</h6>

In [91]:
makeLinkPredictionData(G1, embeddings_deepwalke1) 

Link predictions: 0.7 :Accuracy: 0.6381433533312283


In [92]:
makeLinkPredictionData(G1, embeddings_deepwalke10) 

Link predictions: 0.7 :Accuracy: 0.5923586990843069


In [93]:
makeLinkPredictionData(G1, embeddings_deepwalke50) 

Link predictions: 0.7 :Accuracy: 0.5854120618882223


In [94]:
makeLinkPredictionData(G1, embeddings_deepwalke120) 

Link predictions: 0.7 :Accuracy: 0.5945689927376065


In [95]:
makeLinkPredictionData(G1, embeddings_deepwalke500) 

Link predictions: 0.7 :Accuracy: 0.57246605620461


<h6>Cluster evaluation</h6>

In [96]:
cluster_eval(G1, embeddings_deepwalke1)

Number of clusters:  10   Modularity:  0.6016346747945708


In [98]:
cluster_eval(G1, embeddings_deepwalke10)

Number of clusters:  10   Modularity:  0.7588636063731123


In [99]:
cluster_eval(G1, embeddings_deepwalke50)

Number of clusters:  10   Modularity:  0.7653932450844814


In [100]:
cluster_eval(G1, embeddings_deepwalke120)

Number of clusters:  10   Modularity:  0.747204339805704


In [101]:
cluster_eval(G1, embeddings_deepwalke500)

Number of clusters:  10   Modularity:  0.7280326933523025


<h6>Observation: Walk length should be balanced, neither too low nor too high: very short 
walks prevent us from exploring the graph, while very long walks can result in repetition of 
information.
The number of walks should be in the upper-mid range for the best results in node classification and 
link prediction, but if the target is cluster evaluation it should be balanced.
Embedding size should be low if the target is link prediction; otherwise it should be balanced for the best result.</h6>