# LINE: Large-scale Information Network Embedding

### 目录
1. 设置模型参数
2. 读图，存点和边并做归一化
3. 计算点和边的alias table
4. Line模型实现
5. 模型按边训练以及负采样
6. 结果展示和可视化

In [2]:
import argparse
from utils.utils import *
from utils.line import Line
from tqdm import trange
import torch
import torch.optim as optim
import sys
import pickle

### 1. 设置模型参数

1) 设置模型参数
设置模型超参数，如1st order, 2nd order，负样本数量(K), embedding维度, batch、epoch、learning rate等

2）输入输出

输入文件 './data/weighted.karate.edgelist'

输出文件 './model.pt'

In [3]:
# Build the run configuration with argparse.
parser = argparse.ArgumentParser()

# One (short flag, long flag, type, default) entry per option; options are
# registered in the order listed here.
_OPTIONS = [
    # Input edgelist file (alternative dataset: './data/erdosrenyi.edgelist').
    ("-g", "--graph_path", str, './data/weighted.karate.edgelist'),
    # File the trained model is written to.
    ("-save", "--save_path", str, './model.pt'),
    # File the recorded loss values are written to.
    ("-lossdata", "--lossdata_path", str, './loss.pkl'),

    # --- Hyperparameters ---
    # Proximity order from the paper (1st or 2nd order).
    ("-order", "--order", int, 2),
    # Number of negative samples.
    ("-neg", "--negsamplesize", int, 5),
    # Embedding dimension.
    ("-dim", "--dimension", int, 128),
    # Batch size.
    ("-batchsize", "--batchsize", int, 5),
    # Number of epochs.
    ("-epochs", "--epochs", int, 1),
    # Learning rate; 0.025 is the starting value used in the paper.
    ("-lr", "--learning_rate", float, 0.025),
    # Exponent applied for negative sampling (3/4 is the usual choice).
    ("-negpow", "--negativepower", float, 0.75),
]
for _short, _long, _kind, _default in _OPTIONS:
    parser.add_argument(_short, _long, type=_kind, default=_default)

# Parse an empty argument list so that only the defaults above apply.
args = parser.parse_args(args=[])

### 2. 读图，存点和边并做归一化

1）读图
自己实现的makeDist函数，在utils.py中


In [4]:
# Read the edgelist with makeDist (defined in utils.py), passing the graph
# path and the negative-sampling exponent, then unpack the five results.
# Going by the names: edge/node distributions, edge weights, node degrees
# and the largest node index.
graph_data = makeDist(args.graph_path, args.negativepower)
edgedistdict, nodedistdict, weights, nodedegrees, maxindex = graph_data

Reading edgelist file...


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [00:00<00:00, 25778.56it/s]


In [13]:
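# Inspection cell: uncomment a line to display the corresponding value.
# Normalised edge and node distributions
#edgedistdict
#nodedistdict

# Edge weights and node out-degrees
# weights
# nodedegrees

# Largest node index
#maxindex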

### 3. 计算点和边的alias table

In [14]:
# Build one alias table per distribution (edges first, then nodes) so that
# sampling is O(1) per draw.
edgesaliassampler, nodesaliassampler = map(
    VoseAlias, (edgedistdict, nodedistdict)
)

1/2. Building and sorting scaled probabilities for alias table...


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:00<00:00, 77171.18it/s]


2/2. Building alias table...
1/2. Building and sorting scaled probabilities for alias table...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<?, ?it/s]

2/2. Building alias table...





### 4. Line模型实现

In [19]:
# Show the largest node index; the model below is sized with maxindex + 1.
print(maxindex)
num_nodes = maxindex + 1

# Number of mini-batches per epoch: edge count divided by the batch size,
# truncated, so a trailing partial batch is not counted.
num_edges = len(edgedistdict)
batchrange = int(num_edges / args.batchsize)

# Line is the nn.Module class from line.py.
line = Line(num_nodes, embed_dim=args.dimension, order=args.order)

# Optimise the model parameters with SGD using Nesterov momentum.
sgd_options = dict(lr=args.learning_rate, momentum=0.9, nesterov=True)
opt = optim.SGD(line.parameters(), **sgd_options)

34


### 5. 模型按边训练以及负采样

In [7]:
# Train on the first GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Per-iteration loss history, pickled after training.
lossdata = {"it": [], "loss": []}
it = 0
# One-shot flag: set to 1 after the first batch has been printed.
helper = 0

print(f"\nTraining on {device}...\n")
# Run args.epochs passes; each pass processes `batchrange` mini-batches.
for epoch in range(args.epochs):
    print(f"Epoch {epoch}")
    for b in trange(batchrange):
        # Draw `batchsize` edges from the edge alias sampler (a VoseAlias).
        samplededges = edgesaliassampler.sample_n(args.batchsize)
        # makeData (utils.py) produces one row per sampled edge, laid out as
        # (node i, node j, negative nodes...).
        rows = makeData(samplededges, args.negsamplesize, weights,
                        nodedegrees, nodesaliassampler)
        batch = torch.LongTensor(list(rows))
        if not helper:
            # Print only the very first batch as a sanity check.
            print(batch)
            helper = 1

        # Column 0: source node, column 1: target node, columns 2+: negatives.
        v_i, v_j, negsamples = batch[:, 0], batch[:, 1], batch[:, 2:]

        # Gradients accumulate across backward passes, so clear them first.
        line.zero_grad()
        # Forward pass through the Line model returns the loss.
        loss = line(v_i, v_j, negsamples, device)
        # Back-propagate, then apply the optimiser update.
        loss.backward()
        opt.step()

        lossdata["loss"].append(loss.item())
        lossdata["it"].append(it)
        it += 1

print(f"\nDone training, saving model to {args.save_path}")
# NOTE(review): this serialises the whole module object, not a state_dict;
# loading it later presumably needs the Line class to be importable.
torch.save(line, args.save_path)

print(f"Saving loss data at {args.lossdata_path}")
with open(args.lossdata_path, "wb") as ldata:
    pickle.dump(lossdata, ldata)

100%|██████████| 15/15 [00:00<00:00, 664.15it/s]


Training on cpu...

Epoch 0
tensor([[28, 34, 32,  6,  1, 10,  3],
        [31, 33, 27, 30,  5, 27, 23],
        [ 1,  6, 19, 29, 20,  7, 21],
        [ 6,  7, 19, 25, 27, 32, 20],
        [26, 32, 23, 25,  6, 27,  4]])

Done training, saving model to ./model.pt
Saving loss data at ./loss.pkl



  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


### 6. 结果展示和可视化

In [8]:
# Show the embedding layer itself, then the embedding vector stored at
# index 0.
print(line.nodes_embeddings)
# Named node_index rather than `input` so the built-in input() is not
# shadowed for the rest of the session.
node_index = torch.LongTensor([0])
print(line.nodes_embeddings(node_index))

Embedding(35, 128)
tensor([[-2.1563e-03, -3.5730e-03,  7.7897e-04, -2.0625e-03,  8.7192e-04,
         -1.4623e-03, -9.5805e-04,  2.0489e-03,  2.3217e-03,  1.4508e-03,
          1.4889e-03,  2.2237e-03, -1.3446e-03, -2.1977e-03, -1.9399e-03,
          4.0687e-04,  1.8620e-03,  2.4981e-03, -3.4427e-03,  8.3303e-04,
          2.3330e-03, -3.0304e-03, -2.6531e-03,  3.6443e-03, -7.7189e-04,
         -3.7786e-03, -3.4815e-03, -1.4232e-03,  1.3029e-03,  1.4854e-03,
         -2.3697e-03,  1.8344e-03,  3.5209e-03, -1.8567e-03,  3.6229e-04,
         -3.2596e-03, -2.6341e-03,  4.8524e-04, -6.5288e-04, -5.8510e-04,
          3.5332e-03, -3.3168e-03, -3.4773e-03, -9.4892e-05,  3.5563e-03,
         -9.3188e-04,  2.3504e-03, -3.0754e-03, -2.9352e-03, -1.8730e-03,
          1.0574e-03,  1.4161e-03, -3.4506e-03,  3.8624e-03, -2.6382e-03,
          2.3032e-04,  3.0217e-03, -3.3979e-04, -3.2616e-03, -3.0722e-03,
         -3.5899e-03,  3.8665e-03, -3.0809e-03,  1.4839e-03, -1.5289e-03,
          1.7523e-0

In [9]:
# K-means clustering of the learned node embeddings.
from sklearn import cluster
# NOTE(review): the next three imports are not used in this cell; presumably
# they serve later evaluation cells -- confirm before removing.
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Take every embedding row except row 0 in a single slice instead of looking
# nodes up one by one. Row 0 is skipped because node ids in this edgelist
# appear to start at 1 (TODO confirm for other graphs).
# detach() drops autograd tracking, double() gives float64 values, and the
# result is a plain 2-D ndarray of shape (num_nodes, dim): scikit-learn
# estimators do not accept np.matrix input.
embedding_node = (
    line.nodes_embeddings.weight[1:].detach().cpu().double().numpy()
)

# Cluster the nodes into 3 groups; random_state is fixed for reproducibility.
y_pred = cluster.KMeans(n_clusters=3, random_state=9).fit_predict(embedding_node)
y_pred

array([2, 1, 2, 0, 1, 1, 0, 1, 1, 0, 2, 2, 2, 0, 0, 1, 2, 2, 1, 1, 0, 2,
       2, 2, 2, 1, 0, 0, 1, 0, 1, 1, 0, 0], dtype=int32)