In [7]:
# DeepWalk implemented with word2vec: random walks over the network produce node "sentences", and word2vec learns a vector for each node from them.
import networkx as nx
import numpy as np
import random
from gensim.models import Word2Vec

class DeepWalk:
    """DeepWalk node embeddings built from random walks and word2vec.

    Random walks over the graph are treated as sentences whose tokens are
    node ids (as strings); a skip-gram Word2Vec model trained on them yields
    one vector per node.
    """

    # Files used by train() to persist the trained model and the generated walks.
    MODEL_PATH = '../models/DeepWalk.model'
    WALKS_PATH = '../data/tencent_random_walk.txt'

    def __init__(self, G, emb_size=128, length_walk=40, num_walks=10, window_size=10, num_iters=1):
        """Store the graph and the hyper-parameters.

        Args:
            G: graph object exposing ``nodes()`` and ``neighbors(v)``.
            emb_size: dimensionality of the learned node vectors.
            length_walk: maximum number of steps taken in one random walk.
            num_walks: number of walks started from every node.
            window_size: context window passed to Word2Vec.
            num_iters: number of training passes Word2Vec makes over the walks.
        """
        self.G = G
        self.emb_size = emb_size
        self.length_walk = length_walk
        self.num_walks = num_walks
        self.window_size = window_size
        self.num_iters = num_iters

    def random_walk(self):
        """Run one random walk from every node of the graph.

        Each walk begins at its start node and takes up to ``length_walk``
        uniformly random steps to a neighbor, stopping early at a node that
        has no neighbors.

        Returns:
            list[list[str]]: one walk per node, node ids converted to ``str``.
        """
        walks = []
        for node in self.G.nodes():
            walk = [str(node)]
            v = node
            for _ in range(self.length_walk):
                nbs = list(self.G.neighbors(v))
                if not nbs:
                    # Dead end: keep the (possibly single-token) walk as is.
                    break
                v = random.choice(nbs)  # pick one neighbor uniformly at random
                walk.append(str(v))
            walks.append(walk)
        return walks

    def sentenses(self):
        """Build the training corpus: ``num_walks`` walks starting at every node.

        Returns:
            list[list[str]]: a flat list of walks, where every walk is one
            "sentence" for Word2Vec.
        """
        sts = []
        for _ in range(self.num_walks):
            # extend, not append: Word2Vec needs a flat list of token lists,
            # not one nested list per pass over the nodes.
            sts.extend(self.random_walk())
        return sts

    def train(self, workers=4, is_loadmodel=False, is_loaddata=False):
        """Train (or load) the Word2Vec model over the random-walk corpus.

        Args:
            workers: number of worker threads used by Word2Vec.
            is_loadmodel: if true, load and return the model saved at
                ``MODEL_PATH`` without training.
            is_loaddata: if true, read previously generated walks from
                ``WALKS_PATH`` instead of walking the graph again.

        Returns:
            The trained (or loaded) gensim ``Word2Vec`` model.
        """
        import ast  # local import: only needed to parse the cached walks

        if is_loadmodel:
            print('load model from file')
            return Word2Vec.load(self.MODEL_PATH)

        if is_loaddata:
            print('load data from file')
            with open(self.WALKS_PATH, 'r') as f:
                # The file holds str(list_of_walks); literal_eval parses that
                # literal back without executing arbitrary code as eval would.
                sentences = ast.literal_eval(f.read())
        else:
            print('Random walk to get training data...')
            sentences = self.sentenses()
            print('number of sentence to train: ', len(sentences))
            with open(self.WALKS_PATH, 'w') as f:
                f.write(str(sentences))

        print('start trainoing...')
        # NOTE(review): this reseeds Python's global RNG only; the walks above
        # are already generated, and Word2Vec takes its own `seed` argument.
        random.seed(616)

        # sg=1: skip-gram (predict the nodes inside the window from the current
        # node); hs=1: hierarchical softmax. min_count=0 keeps every token, and
        # since each node starts its own walks, every node receives a vector.
        # Parameter overview: https://blog.csdn.net/szlcw1/article/details/52751314
        common = dict(
            sentences=sentences,
            window=self.window_size,
            sg=1,
            hs=1,
            min_count=0,
            workers=workers,
        )
        try:
            # gensim >= 4.0 parameter names.
            w2v = Word2Vec(vector_size=self.emb_size, epochs=self.num_iters, **common)
        except TypeError:
            # gensim < 4.0 used `size` / `iter` for the same settings.
            w2v = Word2Vec(size=self.emb_size, iter=self.num_iters, **common)

        w2v.save(self.MODEL_PATH)
        print('train done.')

        return w2v

In [None]:
# OpenNE
# Scratch check: stack two nested 2x2 lists row-wise with np.vstack.
a = [[1, 2], [2, 3]]
b = [[3, 4], [4, 5]]
np.vstack((a, b))

In [None]:
from sklearn.metrics import roc_auc_score

# training
random.seed(616)
edges = np.load('../tencent/train_edges.npy')
G = nx.Graph()
# Add every node id up front so ids that appear in no training edge are still in the graph.
# NOTE(review): 169209 is presumably the total node count of the dataset — confirm.
G.add_nodes_from(range(169209))
G.add_edges_from(edges)

deepwalk = DeepWalk(G, emb_size=128, length_walk=50, num_walks=10, num_iters=2)
w2v = deepwalk.train(workers=4, is_loadmodel=False, is_loaddata=False)

# test
pos_test = np.load('../tencent/test_edges.npy')
# NOTE(review): the original path had no '.npy' suffix, unlike the sibling files;
# np.save appends it to string filenames, so it is added here — confirm the name on disk.
neg_test = np.load('../tencent/test_edges_false.npy')

# shape[0] is the length of the first dimension, i.e. the number of rows (edges).
y_true = [True] * pos_test.shape[0] + [False] * neg_test.shape[0]
X = np.vstack([pos_test, neg_test])  # stack positive edges above negative edges

print('testing ...')
# Score each candidate edge by the similarity of its two endpoint vectors.
y_score = [w2v.wv.similarity(str(u), str(v)) for u, v in X]

auc_test = roc_auc_score(y_true, y_score)
print('Tencent, test AUC:', auc_test)