In [5]:
import pandas as pd
import numpy as np

In [3]:
HIDDEN_DIM = 100
ENCODE_DIM = 128
LABELS = 1
EPOCHS = 1
BATCH_SIZE = 3
USE_GPU = False
root = 'data/'
language = 'c'
EMBEDDING_DIM = 128



In [6]:
train_data = pd.read_pickle(root+language+'/train/blocks.pkl').sample(frac=1)
test_data = pd.read_pickle(root+language+'/test/blocks.pkl').sample(frac=1)

In [8]:
from gensim.models.word2vec import Word2Vec

word2vec = Word2Vec.load(root+language+"/train/embedding_astnode_w2v_"+str(EMBEDDING_DIM)).wv
MAX_TOKENS = word2vec.syn0.shape[0]
EMBEDDING_DIM = word2vec.syn0.shape[1]

embeddings = np.zeros((MAX_TOKENS + 1, EMBEDDING_DIM), dtype="float32")
embeddings[:word2vec.syn0.shape[0]] = word2vec.syn0

In [14]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable
import random


class BatchTreeEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, encode_dim, batch_size, use_gpu, pretrained_weight=None):
        super(BatchTreeEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
        self.encode_dim = encode_dim
        self.W_c = nn.Linear(embedding_dim, encode_dim)
        self.activation = F.relu
        self.stop = -1
        self.batch_size = batch_size
        self.use_gpu = use_gpu
        self.node_list = []
        self.th = torch.cuda if use_gpu else torch
        self.batch_node = None
        self.max_index = vocab_size
        # pretrained  embedding
        if pretrained_weight is not None:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_weight))
            # self.embedding.weight.requires_grad = False

    def create_tensor(self, tensor):
        if self.use_gpu:
            return tensor.cuda()
        return tensor

    def traverse_mul(self, batch_btrees, batch_index):
        len_btrees = len(batch_btrees)
        if not len_btrees:
            return None
        batch_current = torch.zeros(len_btrees, self.embedding_dim)

        index, children_index = [], []
        current_node, children = [], []

        # 遍历每一个语句树
        for i in range(len_btrees):
            # if node[i][0] is not -1:
                index.append(i)
                
                # 每个语句树的第一个节点（父节点）
                current_node.append(batch_btrees[i][0])
                # 每个语句树除父节点以外所有节点（根节点的所有孩子节点） 
                temp = batch_btrees[i][1:]
                
                # 求出根节点的孩子结点的个数
                c_num = len(temp)

                # 遍历根节点的孩子节点
                for j in range(c_num):
                    if temp[j][0] is not -1:
                        if len(children_index) <= j:
                            children_index.append([i])
                            children.append([temp[j]])
                        else:
                            children_index[j].append(i)
                            children[j].append(temp[j])
            # else:
            #     batch_index[i] = -1

        batch_current = self.W_c(batch_current.index_copy(0, self.th.LongTensor(index),
                                                          self.embedding(self.th.LongTensor(current_node))))

        for c in range(len(children)):
            zeros = torch.zeros(len_btrees, self.encode_dim)
            batch_children_index = [batch_index[i] for i in children_index[c]]
            tree = self.traverse_mul(children[c], batch_children_index)
            if tree is not None:
                batch_current += zeros.index_copy(0, Variable(self.th.LongTensor(children_index[c])), tree)
        # batch_index = [i for i in batch_index if i is not -1]
        b_in = self.th.LongTensor(batch_index)
        self.node_list.append(self.batch_node.index_copy(0, b_in, batch_current))
        return batch_current

    def forward(self, batch_btrees, bs):
        self.batch_size = bs
        #self.batch_node = self.create_tensor(Variable(torch.zeros(self.batch_size, self.encode_dim)))
        # 将每个语句树编码为一个固定维度
        self.batch_node = torch.zeros(self.batch_size, self.encode_dim)
        # print(self.batch_node)
        self.node_list = []
        self.traverse_mul(batch_btrees, list(range(self.batch_size)))
        self.node_list = torch.stack(self.node_list)
        return torch.max(self.node_list, 0)[0]


class BatchProgramCC(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, encode_dim, label_size, batch_size, use_gpu=True, pretrained_weight=None):
        super(BatchProgramCC, self).__init__()
        self.stop = [vocab_size-1]
        self.hidden_dim = hidden_dim
        self.num_layers = 1
        self.gpu = use_gpu
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.encode_dim = encode_dim
        self.label_size = label_size
        self.encoder = BatchTreeEncoder(self.vocab_size, self.embedding_dim, self.encode_dim,
                                        self.batch_size, self.gpu, pretrained_weight)
        self.root2label = nn.Linear(self.encode_dim, self.label_size)
        # gru
        self.bigru = nn.GRU(self.encode_dim, self.hidden_dim, num_layers=self.num_layers, bidirectional=True,
                            batch_first=True)
        # linear
        self.hidden2label = nn.Linear(self.hidden_dim * 2, self.label_size)
        # hidden
        self.hidden = self.init_hidden()
        self.dropout = nn.Dropout(0.2)

    def init_hidden(self):
        if self.gpu is True:
            if isinstance(self.bigru, nn.LSTM):
                h0 = Variable(torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim).cuda())
                c0 = Variable(torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim).cuda())
                return h0, c0
            return Variable(torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim)).cuda()
        else:
            return Variable(torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim))

    def get_zeros(self, num):
        zeros = torch.zeros(num, self.encode_dim)
        if self.gpu:
            return zeros.cuda()
        return zeros

    def encode(self, x):
        # 一个batch中有多少个样本
        
        print(len(x))

        # 遍历一个batch中的所有ast
        for item in x:
            # 得到每个ast中有多少个语句树
            print(len(item))

            # 打印每个语句树的内容
            """ for i in item:
                print(i) """
        
        # 得到每个ast中有多少个语句树
        lens = [len(item) for item in x]
        # 取一个batch中的所有ast的 最大语句树个数
        max_len = max(lens)
        print(max_len)

        encodes = []
        # 对每一个样本i
        for i in range(self.batch_size):
            # 取样本i的每一个语句树j
            for j in range(lens[i]):
                # print(x[i][j])
                encodes.append(x[i][j])
        # encodes 得到了每个批次的所有语句树

        encodes = self.encoder(encodes, sum(lens))
        seq, start, end = [], 0, 0
        for i in range(self.batch_size):
            end += lens[i]
            if max_len-lens[i]:
                seq.append(self.get_zeros(max_len-lens[i]))
                print(seq)
            #print('*********')
            seq.append(encodes[start:end])
            #print(seq)
            start = end
        encodes = torch.cat(seq)
        """ print('-----------------------')
        print(seq) """

        # 最后输入到GRU层的数据格式：batch_size, 最大语句树个数, 每个语句树的嵌入维度（128）
        encodes = encodes.view(self.batch_size, max_len, -1)
        # return encodes

        gru_out, hidden = self.bigru(encodes, self.hidden)
        gru_out = torch.transpose(gru_out, 1, 2)
        # pooling
        gru_out = F.max_pool1d(gru_out, gru_out.size(2)).squeeze(2)
        # gru_out = gru_out[:,-1]

        return gru_out

    # x1和x2都是输入的一个batch，每个batch包含：32个样本，每个样本是由ast拆分得到的所有语句树序列组成的
    # 即：每个样本是一个完整的ast树拆分得到的语句树组成，这些语句树的每个结点都被word2vec嵌入表示
    # ast的某个语句树的表示形式: [1, [21, [34, [50]], [138]]]
    # 同一个list中的结点是兄弟结点, 临近的在不同list的结点是父母-孩子结点关系, 如21是1的孩子结点, 34是21的孩子结点

    def forward(self, x1, x2):
        lvec, rvec = self.encode(x1), self.encode(x2)

        # 一维范数计算两个编码的距离
        abs_dist = torch.abs(torch.add(lvec, -rvec))

        y = torch.sigmoid(self.hidden2label(abs_dist))
        return y



In [16]:
#from model import BatchProgramCC

model = BatchProgramCC(EMBEDDING_DIM,
                        HIDDEN_DIM,
                        MAX_TOKENS+1,
                        ENCODE_DIM,
                        LABELS,
                        BATCH_SIZE,
                        USE_GPU, embeddings)

In [11]:
import torch

parameters = model.parameters()
optimizer = torch.optim.Adamax(parameters)
loss_function = torch.nn.BCELoss()

In [12]:
def get_batch(dataset, idx, bs):
    tmp = dataset.iloc[idx: idx+bs]
    x1, x2, labels = [], [], []
    for _, item in tmp.iterrows():
        x1.append(item['code_x'])
        x2.append(item['code_y'])
        labels.append([item['label']])
    return x1, x2, torch.FloatTensor(labels)

In [17]:
import time

#from sklearn.metrics import precision_recall_fscore_support
#from torch.utils.data import TensorDataset

train_data_t, test_data_t = train_data.iloc[:3], test_data.iloc[:3]

for epoch in range(EPOCHS):
    start_time = time.time()
    # training epoch
    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    i = 0
    while i < len(train_data_t):
        batch = get_batch(train_data_t, i, BATCH_SIZE)
        i += BATCH_SIZE
        train1_inputs, train2_inputs, train_labels = batch
        if USE_GPU:
            train1_inputs, train2_inputs, train_labels = train1_inputs, train2_inputs, train_labels.cuda()

        #model.zero_grad()
        optimizer.zero_grad()
        model.batch_size = len(train_labels)
        model.hidden = model.init_hidden()
        output = model(train1_inputs, train2_inputs)

        loss = loss_function(output, Variable(train_labels))
        loss.backward()
        optimizer.step()
#print("Testing-%d..."%t)

3
71
45
22
71
*********
[tensor([[0.0000, 0.1709, 0.0000,  ..., 0.2148, 0.0000, 0.0687],
        [0.0000, 0.0467, 0.0000,  ..., 0.0000, 0.0000, 0.0832],
        [0.4827, 0.3540, 0.3587,  ..., 0.5540, 0.0000, 0.4836],
        ...,
        [0.0000, 0.1980, 0.1301,  ..., 0.2296, 0.0000, 0.1207],
        [0.0000, 0.1381, 0.0080,  ..., 0.2438, 0.0000, 0.1302],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0898]],
       grad_fn=<SliceBackward>)]
[tensor([[0.0000, 0.1709, 0.0000,  ..., 0.2148, 0.0000, 0.0687],
        [0.0000, 0.0467, 0.0000,  ..., 0.0000, 0.0000, 0.0832],
        [0.4827, 0.3540, 0.3587,  ..., 0.5540, 0.0000, 0.4836],
        ...,
        [0.0000, 0.1980, 0.1301,  ..., 0.2296, 0.0000, 0.1207],
        [0.0000, 0.1381, 0.0080,  ..., 0.2438, 0.0000, 0.1302],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0898]],
       grad_fn=<SliceBackward>), tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]