In [2]:
# import all libraries 
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE
import random
import warnings
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, f1_score
import community as comm
import pandas as pd
warnings.filterwarnings('ignore')
%run ./helper.ipynb

In [18]:
def node_classification(embeddings, label):
    X, Y = read_node_label(label,skip_head=True)
    
    ltrainfrac = [0.05, 0.1, 0.2, 0.3, .4, .5, .6, .7, .8]
    for tf in ltrainfrac:
        print("Training classifier using {:.2f}% nodes...".format(tf * 100))
        split_train_evaluate(X, Y, embeddings, tf)


        
        
def makeLinkPredictionData(graph, embeddings):
    # converting embedding to a numpy array
    X = [[0] for i in range(G.number_of_nodes())]
    for i in range(0,G.number_of_nodes()):
        X[i] = embeddings[i]
    X = np.array(X)
    
    Xd = []
    Yd = []
    count = 0
    # for all vertices
    nodes=np.array(list(graph.nodes()),dtype=int)
    nodes.sort()
    print(nodes.shape)
    print(nodes[:20])
    for u in range(graph.number_of_nodes()):
        # print(u)
        Nu = list(graph.neighbors(u))
        count += len(Nu)
        cn = 0
        totalns = 0
        # for all neighbors of u
        for n in Nu:
            x = []
            if n > u:
                for d in range(len(X[0])):
                    x.append(X[u][d] - X[n][d]) # distance between the embeddings of u and its neighbor n
                Xd.append(x)
                Yd.append(1) # positive sample (edge present)
                totalns += 1
        tmpnn = []
        if len(Nu) > graph.number_of_nodes() // 2:
            totalns = (graph.number_of_nodes() - len(Nu)) // 2
            #print("Testing neighbors!")
        while cn < totalns:
            nn = random.randint(0, graph.number_of_nodes() - 1)
            # non-neighbors of u
            if nn not in Nu and nn not in tmpnn:
                cn += 1
                x = []
                for d in range(len(X[0])):
                    x.append(X[u][d] - X[nn][d])
                Xd.append(x)
                Yd.append(0) # negative sample (edge absent)
                tmpnn.append(nn)
    Xd, Yd = np.array(Xd), np.array(Yd)
    indices = np.array(range(len(Yd)))
    np.random.shuffle(indices)
    Xt = Xd[indices]
    Yt = Yd[indices]
    #print(len(Xd), len(Yd), count)
    
    
    ltrainfrac = .75
    # for tf in ltrainfrac:
    CV = int(len(Yt) * ltrainfrac)
    trainX = Xt[0:CV]
    testX = Xt[CV:]
    trainY = Yt[0:CV]
    testY = Yt[CV:]
    modelLR = LogisticRegression().fit(trainX, trainY)
    predictedY = modelLR.predict(testX)
    acc = accuracy_score(predictedY, testY)
    #f1macro = f1_score(predictedY, testY, average='macro', labels=np.unique(predictedY))
    #f1micro = f1_score(predictedY, testY, average='micro', labels=np.unique(predictedY))
    #print("Link predictions:", tf, ":Accuracy:",acc, "F1-macro:", f1macro, "F1-micro:",f1micro)
    print("Link predictions:", ltrainfrac, ":Accuracy:",acc)



def cluster_eval(G, embeddings):
    # converting embedding to a numpy array
    X = [[0] for i in range(G.number_of_nodes())]
    for i in range(0, G.number_of_nodes()):
        X[i] = embeddings[str(i+1)]
    X = np.array(X)

    bestModularity = 0
    bestC = 2
    NOC = 30
    allmodularity = []
    for cls in range(2, NOC):
        
        # find clusters using a kmeans clustering algorithm on the embedding
        # Number of clusters is set to cls
        clusters = KMeans(n_clusters=cls, random_state=0).fit(X)
        predG = dict()
        for node in range(len(clusters.labels_)):
            predG[node] = clusters.labels_[node]
        
        # compute the modularity score of the Kmeans clustering
        modularity = comm.community_louvain.modularity(predG, G)
        allmodularity.append(modularity)
        print("Number of clusters: ", cls, "  Modularity: ", modularity)
        if modularity > bestModularity:
            bestModularity = modularity
            bestC = cls
    plt.scatter(range(2, NOC), allmodularity)
    plt.xlabel("Number of clusters")
    plt.ylabel("Modularity score")
    plt.show()
    #print("Best Modularity:",bestModularity, "Clusters:", bestC)

    
    
def plot_embeddings(embeddings, label):

    X, Y = read_node_label(label,skip_head=True)
    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)
    color_idx = {}

    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)  # c=node_colors)
    plt.legend()
    plt.show()

In [3]:
graphfile = 'twitch_edges.txt'
# G = pd.read_csv('large_twitch_edges.txt')
# labelfile = 'cora.nodes.labels'
G = nx.read_edgelist('edges_twich_gamers.txt', nodetype=int)
# G = G.to_directed()
print("Number of nodes: ", G.number_of_nodes())
print("Number of edges: ", G.number_of_edges())


Number of nodes:  168114
Number of edges:  6797557


In [4]:
# Get embedding 128 dimension
walks_deepwalk = deepwalk_walks(G, walk_length=6,num_walks=10)
embeddings_deepwalk = get_embedding(G,walks_deepwalk)



Learning embedding vectors...
Learning embedding vectors done!


In [14]:
len(embeddings_deepwalk)

168114

In [13]:
embeddings_deepwalk[0]

array([ 0.3236165 , -0.08275992,  0.43821412,  0.43181616, -0.11131468,
       -0.1063156 , -0.45930204, -0.16285433, -0.03571598,  0.34196168,
       -0.39276353, -0.25330806,  0.3939766 ,  0.4795174 , -0.5403613 ,
        0.55807835, -0.5091514 , -0.39936104,  0.00733084, -0.00713692,
        0.24127099, -0.509476  ,  0.10707782, -0.72262317,  0.5735699 ,
       -0.02766008,  0.39398977,  0.6084681 , -0.00463788,  0.36586136,
        0.06890488,  0.22692527,  0.10163262,  0.1313394 , -0.16566713,
       -0.14603743,  0.25727773, -0.39185178,  0.32752234,  0.5367702 ,
        0.84412456,  0.49011832, -0.62687933, -0.35515675,  0.35783276,
        0.37517372, -0.17012076, -0.291559  ,  0.49715376,  0.48664126,
       -0.34207138,  0.2844901 ,  0.39653644, -0.06646778,  0.06239418,
       -0.35014305,  0.30183172,  0.236775  ,  0.00134017,  0.18113793,
        0.19234318,  0.54058075, -0.07578667,  0.13742556, -0.07931186,
       -0.02793577, -0.16832095, -0.03654853,  0.16242751, -0.06

In [10]:
nodes_G = list(G.nodes())
nodes_G = np.array(nodes_G,dtype=int)
nodes_G.sort()
nodes_G[:50]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])

In [None]:
makeLinkPredictionData(G, embeddings_deepwalk) 

(168114,)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [None]:
import pandas as pd

df_embedding = pd.read_csv('embeddings.csv',header=None)


In [None]:
df_embedding

In [None]:
embeddings_csv = nx.read_edgelist('embeddings.csv', nodetype=None)

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Extract the node embeddings
embeddings = np.array([embeddings_deepwalk[str(node)] for node in G.nodes()])

# Reduce the dimensionality of the embeddings to 2D using t-SNE
tsne = TSNE(n_components=2)
embeddings_2d = tsne.fit_transform(embeddings)

# Plot the embeddings in 2D
plt.scatter(embeddings_2d[:,0], embeddings_2d[:,1], s=10)
plt.show()


In [None]:

# # Extract the node embeddings
# embeddings = np.array([embeddings_deepwalk[str(node)] for node in G.nodes()])

# Save the embeddings as a CSV file
np.savetxt("embeddings.csv", embeddings, delimiter=",")


In [7]:
np.savetxt("embeddings.txt", embeddings_deepwalk, delimiter=" ", fmt="%s")


ValueError: Expected 1D or 2D array, got 0D array instead

In [None]:
makeLinkPredictionData(G, df_embedding) 

In [None]:
def deepwalk_walks_2(G, num_walks, walk_length):
    nodes = [n for n in G.nodes() if n in G]
    walks = []
    for _ in range(num_walks):
        for v in nodes:
            walk = [v]
            while len(walk) < walk_length:
                cur = walk[-1]
                cur_nbrs = list(G.neighbors(cur))
                if len(cur_nbrs) > 0:
                    walk.append(random.choice(cur_nbrs))
                else:
                    break
            walks.append(walk)
    return walks


In [None]:
def makeLinkPredictionData_2(graph, embeddings):
    # converting embedding to a numpy array
    X = [[0] for i in range(G.number_of_nodes())]
    for i in range(0, G.number_of_nodes()):
        if str(i+1) in embeddings:
            X[i] = embeddings[str(i+1)]
    X = np.array(X)

    Xd = []
    Yd = []
    count = 0
    # for all vertices
    for u in range(graph.number_of_nodes()):
        Nu = list(graph.neighbors(u))
        count += len(Nu)
        cn = 0
        totalns = 0
        # for all neighbors of u
        for n in Nu:
            if n > u and n in graph:
                x = []
                for d in range(len(X[0])):
                    x.append(X[u][d] - X[n][d]) # distance between the embeddings of u and its neighbor n
                Xd.append(x)
                Yd.append(1) # positive sample (edge present)
                totalns += 1
        tmpnn = []
        if len(Nu) > graph.number_of_nodes() // 2:
            totalns = (graph.number_of_nodes() - len(Nu)) // 2
            #print("Testing neighbors!")
        while cn < totalns:
            nn = random.randint(0, graph.number_of_nodes() - 1)
            # non-neighbors of u
            if nn not in Nu and nn not in tmpnn and nn in graph:
                cn += 1
                x = []
                for d in range(len(X[0])):
                    x.append(X[u][d] - X[nn][d])
                Xd.append(x)
                Yd.append(0) # negative sample (edge absent)
                tmpnn.append(nn)
    Xd, Yd = np.array(Xd), np.array(Yd)
    indices = np.array(range(len(Yd)))
    np.random.shuffle(indices)
    Xt = Xd[indices]
    Yt = Yd[indices]
    #print(len(Xd), len(Yd), count)

    ltrainfrac = [0.05, 0.1, 0.2, 0.3, .4, .5, .6, .7, .8]
    for tf in ltrainfrac:
        CV = int(len(Yt) * tf)
        trainX = Xt[0:CV]
        testX = Xt[CV:]
        trainY = Yt[0:CV]
        testY = Yt[CV:]
        modelLR = LogisticRegression().fit(trainX, trainY)
        predictedY = model


In [None]:
# Get embedding 128 dimension
walks_deepwalk_2 = deepwalk_walks_2(G, walk_length=6,num_walks=10)
embeddings_deepwalk_2 = get_embedding(G,walks_deepwalk_2)



In [None]:

# Extract the node embeddings
embeddings_2 = np.array([embeddings_deepwalk_2[str(node)] for node in G.nodes()])

# Reduce the dimensionality of the embeddings to 2D using t-SNE
tsne = TSNE(n_components=2)
embeddings_2d_2 = tsne.fit_transform(embeddings_2)

# Plot the embeddings in 2D
plt.scatter(embeddings_2d_2[:,0], embeddings_2d_2[:,1], s=10)
plt.show()


In [None]:

# # Extract the node embeddings
# embeddings = np.array([embeddings_deepwalk[str(node)] for node in G.nodes()])

# Save the embeddings as a CSV file
np.savetxt("embeddings_2.csv", embeddings_2, delimiter=",")
np.savetxt("embeddings_2.txt", embeddings_2, delimiter=" ", fmt="%s")


In [None]:
makeLinkPredictionData(G, embeddings_deepwalk) 

In [None]:
def makeLinkPredictionData_3(graph, embeddings):
    n_nodes = len(graph.nodes())
    nodes = list(graph.nodes())
    edges = list(graph.edges())
    
    Xd = np.zeros((len(edges), embeddings.shape[1]*2))
    Yd = np.zeros(len(edges))
    
    for i in range(len(edges)):
        edge = edges[i]
        node1 = nodes.index(edge[0])
        node2 = nodes.index(edge[1])
        Xd[i] = np.hstack((embeddings[node1], embeddings[node2]))
        if graph.has_edge(*edge):
            Yd[i] = 1
            
    indices = np.array(range(len(Yd)))
    np.random.shuffle(indices)
    Xt = Xd[indices.astype(int)]
    Yt = Yd[indices.astype(int)]
    
    return Xt, Yt


In [None]:
makeLinkPredictionData_3(G, embeddings_deepwalk_2) 