In [21]:
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class BatchTreeEncoder(nn.Module):
    """Encode a batch of trees into one ``encode_dim`` vector per tree.

    A tree is a nested list ``[token, child_tree, child_tree, ...]`` where
    ``token`` is an embedding index.  A token of ``-1`` marks an entry that is
    skipped.  Every node is embedded, projected with ``W_c`` and summed with
    the encodings of its children; the final vector of a tree is the
    element-wise maximum over all node encodings collected for it.

    ``W_l``, ``W_r``, ``activation`` and ``stop`` are created for
    compatibility but are not used by the methods of this class.
    """

    def __init__(self, vocab_size, embedding_dim, encode_dim, batch_size, use_gpu, pretrained_weight=None):
        """Build the embedding table and projection layers.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_dim: width of a token embedding.
            encode_dim: width of a node / tree encoding.
            batch_size: initial batch size (overwritten by ``forward``).
            use_gpu: when truthy, work tensors are created on CUDA.
            pretrained_weight: optional numpy array copied into the
                embedding table.
        """
        super(BatchTreeEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encode_dim = encode_dim
        self.W_c = nn.Linear(embedding_dim, encode_dim)
        self.W_l = nn.Linear(encode_dim, encode_dim)
        self.W_r = nn.Linear(encode_dim, encode_dim)
        self.activation = F.relu
        self.stop = -1
        self.batch_size = batch_size
        self.use_gpu = use_gpu
        self.node_list = []
        # Namespace providing the LongTensor constructor for the chosen device.
        self.th = torch.cuda if use_gpu else torch
        self.batch_node = None
        # pretrained embedding
        if pretrained_weight is not None:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_weight))

    def create_tensor(self, tensor):
        """Return ``tensor`` moved to CUDA when ``use_gpu`` is set."""
        if self.use_gpu:
            return tensor.cuda()
        return tensor

    def traverse_mul(self, node, batch_index):
        """Encode one tree level for the whole batch and recurse into children.

        Args:
            node: list of trees handled at this level.
            batch_index: for each entry of ``node``, the row of
                ``self.batch_node`` it belongs to.

        Returns:
            A ``(len(node), encode_dim)`` tensor of encodings, or ``None`` if
            ``node`` is empty.  As a side effect one ``batch_node``-shaped
            tensor is appended to ``self.node_list``.
        """
        size = len(node)
        if not size:
            return None

        index, children_index = [], []
        current_node, children = [], []
        for i in range(size):
            if node[i][0] != -1:
                index.append(i)
                current_node.append(node[i][0])
                temp = node[i][1:]
                c_num = len(temp)
                # Group the j-th children of all nodes so each group can be
                # encoded by a single recursive call.
                for j in range(c_num):
                    if temp[j][0] != -1:
                        if len(children_index) <= j:
                            children_index.append([i])
                            children.append([temp[j]])
                        else:
                            children_index[j].append(i)
                            children[j].append(temp[j])
            else:
                batch_index[i] = -1

        # This buffer is the *input* of W_c, so it must be as wide as the
        # embeddings (it was previously encode_dim wide, which only worked
        # when embedding_dim == encode_dim).
        embedded = self.create_tensor(torch.zeros(size, self.embedding.embedding_dim))
        embedded = embedded.index_copy(0, self.th.LongTensor(index),
                                       self.embedding(self.th.LongTensor(current_node)))
        batch_current = self.W_c(embedded)

        for c in range(len(children)):
            zeros = self.create_tensor(torch.zeros(size, self.encode_dim))
            batch_children_index = [batch_index[i] for i in children_index[c]]
            tree = self.traverse_mul(children[c], batch_children_index)
            if tree is not None:
                # Scatter child encodings back onto the rows of their parents.
                batch_current += zeros.index_copy(0, self.th.LongTensor(children_index[c]), tree)

        # NOTE(review): if any entry was skipped (token -1) the filtered index
        # has fewer rows than batch_current - confirm callers never pass such
        # entries.
        batch_index = [i for i in batch_index if i != -1]
        b_in = self.th.LongTensor(batch_index)
        self.node_list.append(self.batch_node.index_copy(0, b_in, batch_current))
        return batch_current

    def forward(self, x, bs):
        """Encode the ``bs`` trees in ``x``; returns a ``(bs, encode_dim)`` tensor."""
        self.batch_size = bs
        self.batch_node = self.create_tensor(torch.zeros(self.batch_size, self.encode_dim))
        self.node_list = []
        self.traverse_mul(x, list(range(self.batch_size)))
        self.node_list = torch.stack(self.node_list)
        # Element-wise max over every node encoding collected during traversal.
        return torch.max(self.node_list, 0)[0]


class BatchProgramClassifier(nn.Module):
    """Classify programs given as sequences of statement trees.

    Every statement tree is encoded by ``BatchTreeEncoder``; the resulting
    sequence is fed to a bidirectional GRU, max-pooled over time and mapped to
    ``label_size`` logits.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, encode_dim, label_size, batch_size, use_gpu=True, pretrained_weight=None):
        """Create the tree encoder, the recurrent layer and the output layer.

        Args:
            embedding_dim: width of the token embeddings.
            hidden_dim: hidden size of each GRU direction.
            vocab_size: number of rows in the embedding table.
            encode_dim: width of a statement encoding.
            label_size: number of output classes.
            batch_size: initial batch size (callers update ``self.batch_size``
                before each forward pass).
            use_gpu: when truthy, helper tensors are created on CUDA.
            pretrained_weight: optional numpy embedding matrix.
        """
        super(BatchProgramClassifier, self).__init__()
        self.stop = [vocab_size-1]
        self.hidden_dim = hidden_dim
        self.num_layers = 1
        self.gpu = use_gpu
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.encode_dim = encode_dim
        self.label_size = label_size
        # statement-tree encoder
        self.encoder = BatchTreeEncoder(self.vocab_size, self.embedding_dim, self.encode_dim,
                                        self.batch_size, self.gpu, pretrained_weight)
        self.root2label = nn.Linear(self.encode_dim, self.label_size)
        # gru
        self.bigru = nn.GRU(self.encode_dim, self.hidden_dim, num_layers=self.num_layers, bidirectional=True,
                            batch_first=True)
        # linear
        self.hidden2label = nn.Linear(self.hidden_dim * 2, self.label_size)
        # hidden
        self.hidden = self.init_hidden()
        self.dropout = nn.Dropout(0.2)

    def init_hidden(self):
        """Return a zero initial state sized for the current ``batch_size``.

        Returns a single tensor for a GRU and an ``(h0, c0)`` pair when
        ``self.bigru`` is an LSTM, on CUDA when ``self.gpu`` is truthy.
        """
        def zero_state():
            state = torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim)
            # Same truthiness test as get_zeros so both agree on the device.
            return state.cuda() if self.gpu else state

        if isinstance(self.bigru, nn.LSTM):
            return zero_state(), zero_state()
        return zero_state()

    def get_zeros(self, num):
        """Return a ``(num, encode_dim)`` zero tensor used as padding."""
        zeros = torch.zeros(num, self.encode_dim)
        if self.gpu:
            return zeros.cuda()
        return zeros

    def forward(self, x):
        """Compute class logits for a batch.

        Args:
            x: list with one entry per program; each entry is a list of
                statement trees.

        Returns:
            Tensor of shape ``(batch_size, label_size)``.
        """
        lens = [len(item) for item in x]
        max_len = max(lens)

        # Flatten all statements so the encoder processes them in one batch.
        encodes = [x[i][j] for i in range(self.batch_size) for j in range(lens[i])]
        encodes = self.encoder(encodes, sum(lens))

        # Re-assemble per-program sequences; shorter programs get zero rows
        # placed *before* their statement encodings.
        seq, start, end = [], 0, 0
        for i in range(self.batch_size):
            end += lens[i]
            if max_len-lens[i]:
                seq.append(self.get_zeros(max_len-lens[i]))
            seq.append(encodes[start:end])
            start = end
        encodes = torch.cat(seq)
        encodes = encodes.view(self.batch_size, max_len, -1)

        # gru
        gru_out, hidden = self.bigru(encodes, self.hidden)

        gru_out = torch.transpose(gru_out, 1, 2)
        # max-pool over the time dimension
        gru_out = F.max_pool1d(gru_out, gru_out.size(2)).squeeze(2)

        # linear
        y = self.hidden2label(gru_out)
        return y
    
def get_batch(dataset, idx, bs):
    """Return rows ``idx`` .. ``idx + bs - 1`` of ``dataset`` as a batch.

    Args:
        dataset: DataFrame whose second column holds the model input and
            whose third column holds the 1-based integer label.
        idx: position of the first row of the batch.
        bs: maximum number of rows in the batch.

    Returns:
        ``(data, labels)`` where ``data`` is a list of second-column values
        and ``labels`` is a ``torch.LongTensor`` of zero-based labels.
    """
    batch = dataset.iloc[idx: idx + bs]
    # Select columns by position with .iloc: integer keys on a row Series
    # with string labels are deprecated as positional lookups in pandas.
    data = batch.iloc[:, 1].tolist()
    labels = [label - 1 for label in batch.iloc[:, 2].tolist()]  # from [1, 104] to [0, 103]
    return data, torch.LongTensor(labels)

# Training

In [18]:
import torch

def get_device():
    """Pick the torch device to run on.

    CUDA is selected only when it is available and device 0 is not the
    'GeForce GT 730'; otherwise the CPU is used.
    """
    cuda_usable = (torch.cuda.is_available()
                   and torch.cuda.get_device_name(0) != 'GeForce GT 730')
    return torch.device('cuda' if cuda_usable else 'cpu')

device = get_device()

    Found GPU0 GeForce GT 730 which is of cuda capability 3.0.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability that we support is 3.5.
    
GeForce GT 730 with CUDA capability sm_30 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 compute_37.
If you want to use the GeForce GT 730 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [None]:
import random
import time
import numpy as np
from torch.utils.data import DataLoader
import os
import sys
import pandas as pd
from gensim.models.word2vec import Word2Vec

# Load the pre-computed block sequences for the three splits.
root = 'data/'
train_data = pd.read_pickle(root+'train/blocks.pkl')
val_data = pd.read_pickle(root + 'dev/blocks.pkl')
test_data = pd.read_pickle(root+'test/blocks.pkl')

# Keyed vectors of the trained word2vec model (128-dimensional file).
word2vec = Word2Vec.load(root+"train/embedding/node_w2v_128").wv

# Embedding matrix with one extra all-zero row appended after the vocabulary.
embeddings = np.zeros((word2vec.vectors.shape[0] + 1, word2vec.vectors.shape[1]), dtype="float32")
embeddings[:word2vec.vectors.shape[0]] = word2vec.vectors



In [50]:
# Hyper-parameters for this run.
USE_GPU = False
HIDDEN_DIM = 100
ENCODE_DIM = 128
LABELS = 104
EPOCHS = 15
BATCH_SIZE = 128
# Vocabulary size and vector width are taken from the loaded word2vec vectors.
MAX_TOKENS = word2vec.vectors.shape[0]
EMBEDDING_DIM = word2vec.vectors.shape[1]

# MAX_TOKENS+1 matches the extra row appended to `embeddings`.
model = BatchProgramClassifier(EMBEDDING_DIM,HIDDEN_DIM,MAX_TOKENS+1,ENCODE_DIM,LABELS,BATCH_SIZE,
                               USE_GPU, embeddings)
if USE_GPU:
    model.cuda()

# Optimise all model parameters with Adamax and a cross-entropy objective.
parameters = model.parameters()
optimizer = torch.optim.Adamax(parameters)
loss_function = torch.nn.CrossEntropyLoss()

In [None]:
import copy

# Per-epoch history of loss / accuracy on the train and validation splits.
train_loss_ = []
val_loss_ = []
train_acc_ = []
val_acc_ = []
best_acc = 0.0
print('Start training...')
# training procedure
# `best_model` is the same object as `model`; the weights of the best epoch
# are snapshotted in `best_state` and loaded back once training has finished.
best_model = model
best_state = copy.deepcopy(model.state_dict())
for epoch in range(EPOCHS):
    start_time = time.time()

    total_acc = 0
    total_loss = 0.0
    total = 0.0
    i = 0
    model.train()
    while i < len(train_data):
        train_inputs, train_labels = get_batch(train_data, i, BATCH_SIZE)
        i += BATCH_SIZE
        if USE_GPU:
            # Inputs are nested Python lists; only the labels are a tensor.
            train_labels = train_labels.cuda()

        model.zero_grad()
        # The last batch may be smaller, so resize the hidden state each step.
        model.batch_size = len(train_labels)
        model.hidden = model.init_hidden()
        output = model(train_inputs)

        loss = loss_function(output, train_labels)
        loss.backward()
        optimizer.step()

        print('[Epoch: %3d/%3d] [data: %d/%d] Training Loss: %.4f'
          % (epoch + 1, EPOCHS, min(i, len(train_data)), len(train_data), loss.item()))

        # calc training acc
        _, predicted = torch.max(output.data, 1)
        total_acc += (predicted == train_labels).sum().item()
        total += len(train_labels)
        total_loss += loss.item()*len(train_inputs)

    train_loss_.append(total_loss / total)
    train_acc_.append(total_acc / total)

    # validation epoch
    total_acc = 0
    total_loss = 0.0
    total = 0.0
    i = 0
    model.eval()
    with torch.no_grad():
        while i < len(val_data):
            val_inputs, val_labels = get_batch(val_data, i, BATCH_SIZE)
            i += BATCH_SIZE
            if USE_GPU:
                val_labels = val_labels.cuda()

            model.batch_size = len(val_labels)
            model.hidden = model.init_hidden()
            output = model(val_inputs)

            loss = loss_function(output, val_labels)

            # calc validation acc
            _, predicted = torch.max(output.data, 1)
            total_acc += (predicted == val_labels).sum().item()
            total += len(val_labels)
            total_loss += loss.item()*len(val_inputs)
    val_acc = total_acc / total
    val_loss_.append(total_loss / total)
    val_acc_.append(val_acc)
    end_time = time.time()
    if val_acc > best_acc:
        # Remember the weights of the best epoch so far.
        best_acc = val_acc
        best_state = copy.deepcopy(model.state_dict())
    print('[Epoch: %3d/%3d] Training Loss: %.4f, Validation Loss: %.4f,'
          ' Training Acc: %.3f, Validation Acc: %.3f, Time Cost: %.3f s'
          % (epoch + 1, EPOCHS, train_loss_[epoch], val_loss_[epoch],
             train_acc_[epoch], val_acc_[epoch], end_time - start_time))

# Restore the weights that achieved the best validation accuracy.
best_model.load_state_dict(best_state)

Start training...
[Epoch:   1/ 15] [data: 128/31200] Training Loss: 4.6677
[Epoch:   1/ 15] [data: 256/31200] Training Loss: 4.6522
[Epoch:   1/ 15] [data: 384/31200] Training Loss: 4.6492
[Epoch:   1/ 15] [data: 512/31200] Training Loss: 4.6613
[Epoch:   1/ 15] [data: 640/31200] Training Loss: 4.6094
[Epoch:   1/ 15] [data: 768/31200] Training Loss: 4.6266
[Epoch:   1/ 15] [data: 896/31200] Training Loss: 4.6145
[Epoch:   1/ 15] [data: 1024/31200] Training Loss: 4.5881
[Epoch:   1/ 15] [data: 1152/31200] Training Loss: 4.6344
[Epoch:   1/ 15] [data: 1280/31200] Training Loss: 4.6222
[Epoch:   1/ 15] [data: 1408/31200] Training Loss: 4.5870
[Epoch:   1/ 15] [data: 1536/31200] Training Loss: 4.5669
[Epoch:   1/ 15] [data: 1664/31200] Training Loss: 4.5778
[Epoch:   1/ 15] [data: 1792/31200] Training Loss: 4.5783
[Epoch:   1/ 15] [data: 1920/31200] Training Loss: 4.5096
[Epoch:   1/ 15] [data: 2048/31200] Training Loss: 4.5499
[Epoch:   1/ 15] [data: 2176/31200] Training Loss: 4.5186
[Ep

[Epoch:   1/ 15] [data: 18048/31200] Training Loss: 2.4059
[Epoch:   1/ 15] [data: 18176/31200] Training Loss: 2.2714
[Epoch:   1/ 15] [data: 18304/31200] Training Loss: 2.2058
[Epoch:   1/ 15] [data: 18432/31200] Training Loss: 2.3676
[Epoch:   1/ 15] [data: 18560/31200] Training Loss: 2.2991
[Epoch:   1/ 15] [data: 18688/31200] Training Loss: 2.4477
[Epoch:   1/ 15] [data: 18816/31200] Training Loss: 2.2919
[Epoch:   1/ 15] [data: 18944/31200] Training Loss: 2.3330
[Epoch:   1/ 15] [data: 19072/31200] Training Loss: 2.3934
[Epoch:   1/ 15] [data: 19200/31200] Training Loss: 2.3940
[Epoch:   1/ 15] [data: 19328/31200] Training Loss: 2.2056
[Epoch:   1/ 15] [data: 19456/31200] Training Loss: 2.2569
[Epoch:   1/ 15] [data: 19584/31200] Training Loss: 2.2208
[Epoch:   1/ 15] [data: 19712/31200] Training Loss: 2.1420
[Epoch:   1/ 15] [data: 19840/31200] Training Loss: 2.2765
[Epoch:   1/ 15] [data: 19968/31200] Training Loss: 2.2346
[Epoch:   1/ 15] [data: 20096/31200] Training Loss: 2.16

[Epoch:   2/ 15] [data: 4480/31200] Training Loss: 0.9127
[Epoch:   2/ 15] [data: 4608/31200] Training Loss: 1.0367
[Epoch:   2/ 15] [data: 4736/31200] Training Loss: 1.0634
[Epoch:   2/ 15] [data: 4864/31200] Training Loss: 1.0955
[Epoch:   2/ 15] [data: 4992/31200] Training Loss: 0.9984
[Epoch:   2/ 15] [data: 5120/31200] Training Loss: 0.9458
[Epoch:   2/ 15] [data: 5248/31200] Training Loss: 1.0814
[Epoch:   2/ 15] [data: 5376/31200] Training Loss: 1.0295
[Epoch:   2/ 15] [data: 5504/31200] Training Loss: 1.0562
[Epoch:   2/ 15] [data: 5632/31200] Training Loss: 1.0950
[Epoch:   2/ 15] [data: 5760/31200] Training Loss: 1.0805
[Epoch:   2/ 15] [data: 5888/31200] Training Loss: 1.0396
[Epoch:   2/ 15] [data: 6016/31200] Training Loss: 0.9551
[Epoch:   2/ 15] [data: 6144/31200] Training Loss: 0.9497
[Epoch:   2/ 15] [data: 6272/31200] Training Loss: 0.9406
[Epoch:   2/ 15] [data: 6400/31200] Training Loss: 0.8852
[Epoch:   2/ 15] [data: 6528/31200] Training Loss: 0.9841
[Epoch:   2/ 1

[Epoch:   2/ 15] [data: 22400/31200] Training Loss: 0.6487
[Epoch:   2/ 15] [data: 22528/31200] Training Loss: 0.7700
[Epoch:   2/ 15] [data: 22656/31200] Training Loss: 0.6195
[Epoch:   2/ 15] [data: 22784/31200] Training Loss: 0.5738
[Epoch:   2/ 15] [data: 22912/31200] Training Loss: 0.5502
[Epoch:   2/ 15] [data: 23040/31200] Training Loss: 0.5309
[Epoch:   2/ 15] [data: 23168/31200] Training Loss: 0.5823
[Epoch:   2/ 15] [data: 23296/31200] Training Loss: 0.5826
[Epoch:   2/ 15] [data: 23424/31200] Training Loss: 0.5734
[Epoch:   2/ 15] [data: 23552/31200] Training Loss: 0.6143
[Epoch:   2/ 15] [data: 23680/31200] Training Loss: 0.5254
[Epoch:   2/ 15] [data: 23808/31200] Training Loss: 0.5859
[Epoch:   2/ 15] [data: 23936/31200] Training Loss: 0.5281
[Epoch:   2/ 15] [data: 24064/31200] Training Loss: 0.6110
[Epoch:   2/ 15] [data: 24192/31200] Training Loss: 0.5832
[Epoch:   2/ 15] [data: 24320/31200] Training Loss: 0.5434
[Epoch:   2/ 15] [data: 24448/31200] Training Loss: 0.42

[Epoch:   3/ 15] [data: 8832/31200] Training Loss: 0.3199
[Epoch:   3/ 15] [data: 8960/31200] Training Loss: 0.3428
[Epoch:   3/ 15] [data: 9088/31200] Training Loss: 0.3658
[Epoch:   3/ 15] [data: 9216/31200] Training Loss: 0.4865
[Epoch:   3/ 15] [data: 9344/31200] Training Loss: 0.4016
[Epoch:   3/ 15] [data: 9472/31200] Training Loss: 0.3827
[Epoch:   3/ 15] [data: 9600/31200] Training Loss: 0.3368
[Epoch:   3/ 15] [data: 9728/31200] Training Loss: 0.2818
[Epoch:   3/ 15] [data: 9856/31200] Training Loss: 0.3494
[Epoch:   3/ 15] [data: 9984/31200] Training Loss: 0.3793
[Epoch:   3/ 15] [data: 10112/31200] Training Loss: 0.3356
[Epoch:   3/ 15] [data: 10240/31200] Training Loss: 0.2709
[Epoch:   3/ 15] [data: 10368/31200] Training Loss: 0.3280
[Epoch:   3/ 15] [data: 10496/31200] Training Loss: 0.2913
[Epoch:   3/ 15] [data: 10624/31200] Training Loss: 0.3203
[Epoch:   3/ 15] [data: 10752/31200] Training Loss: 0.4100
[Epoch:   3/ 15] [data: 10880/31200] Training Loss: 0.3740
[Epoch:

[Epoch:   3/ 15] [data: 26752/31200] Training Loss: 0.1789
[Epoch:   3/ 15] [data: 26880/31200] Training Loss: 0.2173
[Epoch:   3/ 15] [data: 27008/31200] Training Loss: 0.2644
[Epoch:   3/ 15] [data: 27136/31200] Training Loss: 0.2293
[Epoch:   3/ 15] [data: 27264/31200] Training Loss: 0.2592
[Epoch:   3/ 15] [data: 27392/31200] Training Loss: 0.2840
[Epoch:   3/ 15] [data: 27520/31200] Training Loss: 0.2001
[Epoch:   3/ 15] [data: 27648/31200] Training Loss: 0.1976
[Epoch:   3/ 15] [data: 27776/31200] Training Loss: 0.2326
[Epoch:   3/ 15] [data: 27904/31200] Training Loss: 0.1932
[Epoch:   3/ 15] [data: 28032/31200] Training Loss: 0.2632
[Epoch:   3/ 15] [data: 28160/31200] Training Loss: 0.2671
[Epoch:   3/ 15] [data: 28288/31200] Training Loss: 0.2032
[Epoch:   3/ 15] [data: 28416/31200] Training Loss: 0.2040
[Epoch:   3/ 15] [data: 28544/31200] Training Loss: 0.2207
[Epoch:   3/ 15] [data: 28672/31200] Training Loss: 0.1975
[Epoch:   3/ 15] [data: 28800/31200] Training Loss: 0.24

[Epoch:   4/ 15] [data: 13312/31200] Training Loss: 0.1990
[Epoch:   4/ 15] [data: 13440/31200] Training Loss: 0.2996
[Epoch:   4/ 15] [data: 13568/31200] Training Loss: 0.2080
[Epoch:   4/ 15] [data: 13696/31200] Training Loss: 0.1948
[Epoch:   4/ 15] [data: 13824/31200] Training Loss: 0.1994
[Epoch:   4/ 15] [data: 13952/31200] Training Loss: 0.1878
[Epoch:   4/ 15] [data: 14080/31200] Training Loss: 0.1294
[Epoch:   4/ 15] [data: 14208/31200] Training Loss: 0.1691
[Epoch:   4/ 15] [data: 14336/31200] Training Loss: 0.2232
[Epoch:   4/ 15] [data: 14464/31200] Training Loss: 0.1697
[Epoch:   4/ 15] [data: 14592/31200] Training Loss: 0.1356
[Epoch:   4/ 15] [data: 14720/31200] Training Loss: 0.2104
[Epoch:   4/ 15] [data: 14848/31200] Training Loss: 0.2284
[Epoch:   4/ 15] [data: 14976/31200] Training Loss: 0.1412
[Epoch:   4/ 15] [data: 15104/31200] Training Loss: 0.2184
[Epoch:   4/ 15] [data: 15232/31200] Training Loss: 0.2345
[Epoch:   4/ 15] [data: 15360/31200] Training Loss: 0.18

In [None]:
# Evaluate `best_model` on the test split and save its weights.
total_acc = 0
total_loss = 0.0
total = 0.0
i = 0
model = best_model
model.eval()
with torch.no_grad():
    while i < len(test_data):
        test_inputs, test_labels = get_batch(test_data, i, BATCH_SIZE)
        i += BATCH_SIZE
        if USE_GPU:
            # Inputs are nested Python lists; only the labels are a tensor.
            test_labels = test_labels.cuda()

        # The last batch may be smaller, so resize the hidden state each step.
        model.batch_size = len(test_labels)
        model.hidden = model.init_hidden()
        output = model(test_inputs)

        loss = loss_function(output, test_labels)

        _, predicted = torch.max(output.data, 1)
        total_acc += (predicted == test_labels).sum().item()
        total += len(test_labels)
        total_loss += loss.item() * len(test_inputs)
    print("Testing results(Acc):", total_acc / total)

torch.save(model.state_dict(), 'astnn_classify_best.pt')

In [None]:
# Persist the current model parameters (state dict only) to 'astnn_classify.pt'.
torch.save(model.state_dict(), 'astnn_classify.pt')

In [None]:
# train.py

import pandas as pd
import random
import torch
import time
import numpy as np
from gensim.models.word2vec import Word2Vec
from model import BatchProgramClassifier
from torch.autograd import Variable
from torch.utils.data import DataLoader
import os
import sys


def get_batch(dataset, idx, bs):
    """Return rows ``idx`` .. ``idx + bs - 1`` of ``dataset`` as a batch.

    Args:
        dataset: DataFrame whose second column holds the model input and
            whose third column holds the 1-based integer label.
        idx: position of the first row of the batch.
        bs: maximum number of rows in the batch.

    Returns:
        ``(data, labels)`` where ``data`` is a list of second-column values
        and ``labels`` is a ``torch.LongTensor`` of zero-based labels.
    """
    batch = dataset.iloc[idx: idx + bs]
    # Select columns by position with .iloc: integer keys on a row Series
    # with string labels are deprecated as positional lookups in pandas.
    data = batch.iloc[:, 1].tolist()
    labels = [label - 1 for label in batch.iloc[:, 2].tolist()]  # shift labels to start at 0
    return data, torch.LongTensor(labels)


# Load the pre-computed block sequences for the three splits.
root = 'data/'
train_data = pd.read_pickle(root+'train/blocks.pkl')
val_data = pd.read_pickle(root + 'dev/blocks.pkl')
test_data = pd.read_pickle(root+'test/blocks.pkl')

word2vec = Word2Vec.load(root+"train/embedding/node_w2v_128").wv
# Use `vectors`: the `syn0` attribute is deprecated and slated for removal.
# One extra all-zero row is appended after the vocabulary.
embeddings = np.zeros((word2vec.vectors.shape[0] + 1, word2vec.vectors.shape[1]), dtype="float32")
embeddings[:word2vec.vectors.shape[0]] = word2vec.vectors

# Hyper-parameters for this run.
HIDDEN_DIM = 100
ENCODE_DIM = 128
LABELS = 104
EPOCHS = 15
BATCH_SIZE = 64
USE_GPU = True
MAX_TOKENS = word2vec.vectors.shape[0]
EMBEDDING_DIM = word2vec.vectors.shape[1]

# MAX_TOKENS+1 matches the extra row appended to `embeddings`.
model = BatchProgramClassifier(EMBEDDING_DIM,HIDDEN_DIM,MAX_TOKENS+1,ENCODE_DIM,LABELS,BATCH_SIZE,
                               USE_GPU, embeddings)
if USE_GPU:
    model.cuda()

parameters = model.parameters()
optimizer = torch.optim.Adamax(parameters)
loss_function = torch.nn.CrossEntropyLoss()

import copy

# Per-epoch history of loss / accuracy on the train and validation splits.
train_loss_ = []
val_loss_ = []
train_acc_ = []
val_acc_ = []
best_acc = 0.0
print('Start training...')
# training procedure
# `best_model` is the same object as `model`; the weights of the best epoch
# are snapshotted in `best_state` and loaded back once training has finished.
best_model = model
best_state = copy.deepcopy(model.state_dict())
for epoch in range(EPOCHS):
    start_time = time.time()

    total_acc = 0
    total_loss = 0.0
    total = 0.0
    i = 0
    model.train()
    while i < len(train_data):
        batch = get_batch(train_data, i, BATCH_SIZE)
        i += BATCH_SIZE
        train_inputs, train_labels = batch
        if USE_GPU:
            # Inputs are nested Python lists; only the labels are a tensor.
            train_labels = train_labels.cuda()

        model.zero_grad()
        # The last batch may be smaller, so resize the hidden state each step.
        model.batch_size = len(train_labels)
        model.hidden = model.init_hidden()
        output = model(train_inputs)

        loss = loss_function(output, train_labels)
        loss.backward()
        optimizer.step()

        # calc training acc
        _, predicted = torch.max(output.data, 1)
        total_acc += (predicted == train_labels).sum().item()
        total += len(train_labels)
        total_loss += loss.item()*len(train_inputs)

    train_loss_.append(total_loss / total)
    train_acc_.append(total_acc / total)
    # validation epoch
    total_acc = 0
    total_loss = 0.0
    total = 0.0
    i = 0
    model.eval()
    # No gradients are needed for validation.
    with torch.no_grad():
        while i < len(val_data):
            batch = get_batch(val_data, i, BATCH_SIZE)
            i += BATCH_SIZE
            val_inputs, val_labels = batch
            if USE_GPU:
                val_labels = val_labels.cuda()

            model.batch_size = len(val_labels)
            model.hidden = model.init_hidden()
            output = model(val_inputs)

            loss = loss_function(output, val_labels)

            # calc validation acc
            _, predicted = torch.max(output.data, 1)
            total_acc += (predicted == val_labels).sum().item()
            total += len(val_labels)
            total_loss += loss.item()*len(val_inputs)
    val_acc = total_acc / total
    val_loss_.append(total_loss / total)
    val_acc_.append(val_acc)
    end_time = time.time()
    if val_acc > best_acc:
        # Remember the weights of the best epoch so far.
        best_acc = val_acc
        best_state = copy.deepcopy(model.state_dict())
    print('[Epoch: %3d/%3d] Training Loss: %.4f, Validation Loss: %.4f,'
          ' Training Acc: %.3f, Validation Acc: %.3f, Time Cost: %.3f s'
          % (epoch + 1, EPOCHS, train_loss_[epoch], val_loss_[epoch],
             train_acc_[epoch], val_acc_[epoch], end_time - start_time))

# Restore the weights that achieved the best validation accuracy.
best_model.load_state_dict(best_state)

# Evaluate `best_model` on the test split.
total_acc = 0
total_loss = 0.0
total = 0.0
i = 0
model = best_model
model.eval()
# No gradients are needed for evaluation.
with torch.no_grad():
    while i < len(test_data):
        batch = get_batch(test_data, i, BATCH_SIZE)
        i += BATCH_SIZE
        test_inputs, test_labels = batch
        if USE_GPU:
            # Inputs are nested Python lists; only the labels are a tensor.
            test_labels = test_labels.cuda()

        # The last batch may be smaller, so resize the hidden state each step.
        model.batch_size = len(test_labels)
        model.hidden = model.init_hidden()
        output = model(test_inputs)

        loss = loss_function(output, test_labels)

        _, predicted = torch.max(output.data, 1)
        total_acc += (predicted == test_labels).sum().item()
        total += len(test_labels)
        total_loss += loss.item() * len(test_inputs)
print("Testing results(Acc):", total_acc / total)


# generate preprocessed data

In [19]:
import pandas as pd
import os
def parse_source(root, output_file, option):
    """Return the parsed-AST table, reusing a cached pickle when allowed.

    Args:
        root: data directory prefix (concatenated directly with file names).
        output_file: name of the cached AST pickle inside ``root``.
        option: pass ``'existing'`` to reuse ``root + output_file`` when it
            exists; any other value forces re-parsing.

    Returns:
        DataFrame read from the cache, or built from ``root + 'programs.pkl'``
        with columns ``id``, ``code`` (parsed with pycparser) and ``label``.
    """
    path = root+output_file
    # Compare by value: `is` on strings tests identity, not equality.
    if os.path.exists(path) and option == 'existing':
        source = pd.read_pickle(path)
    else:
        from pycparser import c_parser
        parser = c_parser.CParser()
        source = pd.read_pickle(root+'programs.pkl')

        source.columns = ['id', 'code', 'label']
        source['code'] = source['code'].apply(parser.parse)

        source.to_pickle(path)
    return source

#s = parse_source(root='data/', output_file='ast.pkl',option='existing')

In [128]:
def split_data(root,ratio):
    """Shuffle ``root + 'ast.pkl'`` and write train/dev/test pickles.

    Args:
        root: data directory prefix (concatenated directly with file names).
        ratio: colon-separated integer weights such as ``'3:1:1'`` for the
            train, dev and test partitions.

    The partitions are written to ``train/train_.pkl``, ``dev/dev_.pkl`` and
    ``test/test_.pkl`` below ``root``; missing directories are created.
    """
    frame = pd.read_pickle(root+'ast.pkl')
    n_rows = len(frame)
    weights = [int(part) for part in ratio.split(':')]
    train_end = int(weights[0]/sum(weights)*n_rows)
    dev_end = train_end + int(weights[1]/sum(weights)*n_rows)
    # Fixed seed keeps the shuffle reproducible.
    shuffled = frame.sample(frac=1, random_state=666)

    partitions = (
        ('train/', 'train_.pkl', shuffled.iloc[:train_end]),
        ('dev/', 'dev_.pkl', shuffled.iloc[train_end:dev_end]),
        ('test/', 'test_.pkl', shuffled.iloc[dev_end:]),
    )
    for folder_name, file_name, subset in partitions:
        folder = root+folder_name
        if not os.path.exists(folder):
            os.mkdir(folder)
        subset.to_pickle(folder+file_name)

def dictionary_and_embedding(root, input_file, size):
    """Train a word2vec model on the token sequences of the training ASTs.

    Args:
        root: data directory prefix (concatenated directly with file names).
        input_file: pickle of ASTs to read; when falsy, defaults to
            ``root + 'train/train_.pkl'``.
        size: dimensionality of the word vectors; also used as the suffix of
            the saved model file.

    Writes ``train/programs_ns.tsv`` (token sequences joined by spaces) and
    ``train/embedding/node_w2v_<size>`` below ``root``.
    """
    if not input_file:
        # Derive the default from `root` instead of a hard-coded 'data/' path.
        input_file = root+'train/train_.pkl'
    trees = pd.read_pickle(input_file)
    if not os.path.exists(root+'train/embedding'):
        os.mkdir(root+'train/embedding')
    from prepare_data import get_sequences

    def trans_to_sequences(ast):
        sequence = []
        get_sequences(ast, sequence)
        return sequence
    corpus = trees['code'].apply(trans_to_sequences)  # every row of corpus is a list of str
    # Assign a plain list so values are matched to rows by position; a fresh
    # pd.Series would be aligned on its 0..n-1 index and scramble the rows.
    trees['code'] = [' '.join(c) for c in corpus]
    # NOTE(review): the file has a .tsv name but is written with to_csv's
    # default separator - confirm which format is intended.
    trees.to_csv(root+'train/programs_ns.tsv')

    from gensim.models.word2vec import Word2Vec
    w2v = Word2Vec(corpus, size=size, workers=16, sg=1, min_count=3)
    w2v.save(root+'train/embedding/node_w2v_' + str(size))

def generate_block_seqs(root,data_path,part,size=128):
    """Convert every AST in ``data_path`` into nested lists of token indices.

    Args:
        root: data directory prefix (concatenated directly with file names).
        data_path: pickle of ASTs to convert.
        part: sub-directory of ``root`` that receives ``blocks.pkl``.
        size: suffix of the word2vec model file to load.
    """
    from prepare_data import get_blocks as func
    from gensim.models.word2vec import Word2Vec

    keyed_vectors = Word2Vec.load(root+'train/embedding/node_w2v_' + str(size)).wv
    vocab = keyed_vectors.vocab
    # Index assigned to tokens missing from the vocabulary.
    max_token = keyed_vectors.vectors.shape[0]

    def tree_to_index(node):
        # [own index, subtree of child 1, subtree of child 2, ...]
        token = node.token
        head = vocab[token].index if token in vocab else max_token
        return [head] + [tree_to_index(child) for child in node.children]

    def trans2seq(r):
        blocks = []
        func(r, blocks)
        return [tree_to_index(b) for b in blocks]

    frame = pd.read_pickle(data_path)
    frame['code'] = frame['code'].apply(trans2seq)
    frame.to_pickle(root+part+'/blocks.pkl')

# Build blocks.pkl for each split from its AST pickle (default size=128 model).
generate_block_seqs(root='data/', data_path='data/train/train_.pkl', part='train')
generate_block_seqs(root='data/', data_path='data/dev/dev_.pkl', part='dev')
generate_block_seqs(root='data/', data_path='data/test/test_.pkl', part='test')

FileNotFoundError: [Errno 2] No such file or directory: 'data/train/dev_.pkl'

In [None]:
# source code

import pandas as pd
import os

class Pipeline:
    """Preprocessing pipeline: parse sources, split, embed, index blocks.

    ``root`` is used as a string prefix, so it should end with a path
    separator (e.g. ``'data/'``).
    """

    def __init__(self,  ratio, root):
        """Store the split ratio (e.g. ``'3:1:1'``) and the data directory."""
        self.ratio = ratio
        self.root = root
        self.sources = None
        # Filled in by split_data().
        self.train_file_path = None
        self.dev_file_path = None
        self.test_file_path = None
        # Filled in by dictionary_and_embedding().
        self.size = None

    # parse source code
    def parse_source(self, output_file, option):
        """Load the AST table, reusing ``root + output_file`` when allowed.

        Pass ``option='existing'`` to reuse a cached pickle if it exists;
        otherwise ``programs.pkl`` is parsed with pycparser and the result is
        cached.  The table is stored in ``self.sources`` and returned.
        """
        path = self.root+output_file
        # Compare by value: `is` on strings tests identity, not equality.
        if os.path.exists(path) and option == 'existing':
            source = pd.read_pickle(path)
        else:
            from pycparser import c_parser
            parser = c_parser.CParser()
            source = pd.read_pickle(self.root+'programs.pkl')

            source.columns = ['id', 'code', 'label']
            source['code'] = source['code'].apply(parser.parse)

            source.to_pickle(path)
        self.sources = source
        return source

    # split data for training, developing and testing
    def split_data(self):
        """Shuffle ``self.sources`` and write the train/dev/test pickles.

        The resulting file paths are recorded in ``train_file_path``,
        ``dev_file_path`` and ``test_file_path``.
        """
        data = self.sources
        data_num = len(data)
        ratios = [int(r) for r in self.ratio.split(':')]
        train_split = int(ratios[0]/sum(ratios)*data_num)
        val_split = train_split + int(ratios[1]/sum(ratios)*data_num)
        # Fixed seed keeps the shuffle reproducible.
        data = data.sample(frac=1, random_state=666)
        train = data.iloc[:train_split]
        dev = data.iloc[train_split:val_split]
        test = data.iloc[val_split:]

        def check_or_create(path):
            if not os.path.exists(path):
                os.mkdir(path)
        train_path = self.root+'train/'
        check_or_create(train_path)
        self.train_file_path = train_path+'train_.pkl'
        train.to_pickle(self.train_file_path)

        dev_path = self.root+'dev/'
        check_or_create(dev_path)
        self.dev_file_path = dev_path+'dev_.pkl'
        dev.to_pickle(self.dev_file_path)

        test_path = self.root+'test/'
        check_or_create(test_path)
        self.test_file_path = test_path+'test_.pkl'
        test.to_pickle(self.test_file_path)

    # construct dictionary and train word embedding
    def dictionary_and_embedding(self, input_file, size):
        """Train word2vec on the token sequences of the training ASTs.

        Args:
            input_file: pickle of ASTs; defaults to ``self.train_file_path``
                when falsy.
            size: dimensionality of the word vectors (remembered in
                ``self.size`` for ``generate_block_seqs``).
        """
        self.size = size
        if not input_file:
            input_file = self.train_file_path
        trees = pd.read_pickle(input_file)
        if not os.path.exists(self.root+'train/embedding'):
            os.mkdir(self.root+'train/embedding')
        from prepare_data import get_sequences

        def trans_to_sequences(ast):
            sequence = []
            get_sequences(ast, sequence)
            return sequence
        corpus = trees['code'].apply(trans_to_sequences)  # every row of corpus is a list of str
        # Assign a plain list so values are matched to rows by position; a
        # fresh pd.Series would be aligned on its 0..n-1 index and scramble
        # the shuffled rows.
        trees['code'] = [' '.join(c) for c in corpus]
        trees.to_csv(self.root+'train/programs_ns.tsv')

        from gensim.models.word2vec import Word2Vec
        w2v = Word2Vec(corpus, size=size, workers=16, sg=1, min_count=3)
        w2v.save(self.root+'train/embedding/node_w2v_' + str(size))

    # generate block sequences with index representations
    def generate_block_seqs(self,data_path,part):
        """Convert the ASTs in ``data_path`` to nested index lists.

        The result is written to ``root + part + '/blocks.pkl'``.  Requires
        ``self.size`` to have been set by ``dictionary_and_embedding``.
        """
        from prepare_data import get_blocks as func
        from gensim.models.word2vec import Word2Vec

        word2vec = Word2Vec.load(self.root+'train/embedding/node_w2v_' + str(self.size)).wv
        vocab = word2vec.vocab
        # Index assigned to tokens missing from the vocabulary.
        max_token = word2vec.vectors.shape[0]

        def tree_to_index(node):
            # [own index, subtree of child 1, subtree of child 2, ...]
            token = node.token
            result = [vocab[token].index if token in vocab else max_token]
            children = node.children
            for child in children:
                result.append(tree_to_index(child))
            return result

        def trans2seq(r):
            blocks = []
            func(r, blocks)
            return [tree_to_index(b) for b in blocks]
        trees = pd.read_pickle(data_path)
        trees['code'] = trees['code'].apply(trans2seq)
        trees.to_pickle(self.root+part+'/blocks.pkl')

    # run for processing data to train
    def run(self):
        """Execute all preprocessing steps in order."""
        print('parse source code...')
        self.parse_source(output_file='ast.pkl',option='existing')
        print('split data...')
        self.split_data()
        print('train word embedding...')
        self.dictionary_and_embedding(None,128)
        print('generate block sequences...')
        self.generate_block_seqs(self.train_file_path, 'train')
        self.generate_block_seqs(self.dev_file_path, 'dev')
        self.generate_block_seqs(self.test_file_path, 'test')


# Run the whole preprocessing pipeline on 'data/' with a 3:1:1 split ratio.
ppl = Pipeline('3:1:1', 'data/')
ppl.run()