In [1]:
import sys
sys.path.append("../src")

In [33]:
import glob
import os
import pickle
import random

import networkx as nx
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Dataset
from torch_geometric.data import DataLoader
from torch_geometric.nn import GraphConv, TopKPooling
from torch_geometric.nn import global_max_pool as gmp, global_mean_pool as gap

from src.code_parser import  get_data_from_graph
# train_dataset = FastCloneDataset(root="data/", functions_path="data/networkx_data_idx/", pairs_path="data/train.npz", return_pair_data=False)
class FastCloneDataset(Dataset):
    '''
    Dataset of labelled function pairs, each function stored as a pickled graph.

    Per the original notes, ``networkx_data_idx`` holds the syntax tree of each
    source-code function, with every tree node already embedded by fastText.

    Args:
        root: Root directory passed through to the ``Dataset`` base class.
        functions_path: Directory containing one pickled graph per function;
            the file name is the function id.
        pairs_path: ``.npz`` archive whose ``arr_0`` entry is read as rows of
            ``(id1, id2, label)``.
        return_pair_data: If True, a sample is a ``PairData`` that keeps the
            two graphs separate; if False, the two graphs are merged into one
            graph with no edges between the two functions.
        transform: Passed through to the base class.
        pre_transform: Passed through to the base class.

    Note:
        ``PairData`` is neither defined nor imported in the visible notebook
        cells; it must be available before ``return_pair_data=True`` is used.
        Both caches (``functions`` and ``processed_pairs``) grow without bound.
    '''
    def __init__(self, root, functions_path, pairs_path, return_pair_data=False, transform=None, pre_transform=None):
        self.functions_path = functions_path
        self.return_pair_data = return_pair_data
        # Pairs stored in this split (train / validation / test).
        # The object returned by np.load for an .npz keeps the archive open, so
        # read the array inside a context manager instead of leaking the handle.
        with np.load(pairs_path) as archive:
            self.pairs = archive['arr_0']
        self.functions = {}        # function id -> loaded graph (or converted data)
        self.processed_pairs = {}  # pair index -> finished sample
        # Attributes are assigned before the base initializer runs, presumably
        # because it may already query the dataset -- confirm before reordering.
        super().__init__(root, transform, pre_transform)

    def len(self):
        '''Return the number of pairs in this split.'''
        return len(self.pairs)

    def load_id(self, idx):
        '''
        Return the representation of function ``idx``, loading it on first use.

        The result is the raw pickled graph, or the output of
        ``get_data_from_graph`` when ``return_pair_data`` is set, and is cached
        in ``self.functions``.
        '''
        if idx in self.functions:
            return self.functions[idx]

        # nx.read_gpickle was removed in NetworkX 3.0; loading the file with
        # pickle directly works on both old and new NetworkX versions.
        # NOTE: unpickling can execute arbitrary code -- only load trusted files.
        with open(os.path.join(self.functions_path, str(idx)), "rb") as handle:
            g = pickle.load(handle)

        if self.return_pair_data:
            self.functions[idx] = get_data_from_graph(g)
        else:
            self.functions[idx] = g

        return self.functions[idx]

    def get_pair(self, idx):
        '''
        Build the sample for pair row ``idx``, or fetch it from the cache.

        Returns a ``PairData`` when ``return_pair_data`` is set, otherwise the
        result of ``get_data_from_graph`` applied to the merged graph and label.
        '''
        if idx in self.processed_pairs:
            return self.processed_pairs[idx]

        id1, id2, label = self.pairs[idx]
        data1, data2 = self.load_id(id1), self.load_id(id2)
        if self.return_pair_data:
            sample = PairData(edge_index_s=data1.edge_index, x_s=data1.x, edge_index_t=data2.edge_index, x_t=data2.x,
                              y=torch.tensor([label], dtype=torch.int64))
        else:
            # Put both functions into one graph; no edges are added between them.
            merged = nx.disjoint_union(data1, data2)
            merged = nx.convert_node_labels_to_integers(merged.to_undirected())
            sample = get_data_from_graph(merged, label)

        self.processed_pairs[idx] = sample
        return sample

    def get(self, idx):
        '''Return the sample at ``idx``; delegates to ``get_pair``.'''
        return self.get_pair(idx)



### 使用Dataset基类构建自己的数据集

In [28]:
""" import os
for file in os.listdir('data/funcs'):
    print(file) 

 """

" import os\nfor file in os.listdir('data/funcs'):\n    print(file) \n\n "

In [46]:
# One dataset per split; only the pairs archive differs between them.
train_dataset, val_dataset, test_dataset = (
    FastCloneDataset(
        root="data/",
        functions_path="data/networkx_data_idx/",
        pairs_path=pairs_file,
        return_pair_data=False,
    )
    for pairs_file in ("data/train.npz", "data/valid.npz", "data/test.npz")
)

In [47]:
# Size of the node feature vectors; Net reads the same attribute to size its first layer.
print(train_dataset.num_features)

256


In [48]:
# Number of samples per mini-batch, shared by all three loaders.
batch_size = 64
# Passed as num_workers to the training and validation loaders.
workers = 32

A data loader merges data objects from a `torch_geometric.data.Dataset` into mini-batches.

In [49]:
# Training and validation batches are shuffled and loaded by worker processes;
# the test loader keeps the DataLoader defaults (no shuffling, no workers).
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=workers,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=workers,
)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

### 建立图网络结构

In [50]:
import torch

from torch_geometric.nn import GraphConv, TopKPooling
#from torch.nn import Linear

class Net(torch.nn.Module):
    '''
    Graph classifier: three GraphConv + TopKPooling stages and an MLP head.

    After every pooling stage the remaining node features are reduced to one
    vector per graph by concatenating the ``gmp`` and ``gap`` readouts
    (128 + 128 = 256 values). The three readouts are summed and passed through
    the linear layers.

    Args:
        num_features: Size of the input node features. ``None`` (default) keeps
            the earlier behaviour of reading ``train_dataset.num_features``
            from the notebook namespace.
        num_classes: Width of the output layer; defaults to 6 as before.
    '''
    def __init__(self, num_features=None, num_classes=6):
        super().__init__()

        if num_features is None:
            # Backward-compatible fallback: Net() still sizes itself from the
            # training dataset defined earlier in the notebook.
            num_features = train_dataset.num_features

        self.conv1 = GraphConv(num_features, 128)
        self.pool1 = TopKPooling(128, ratio=0.8)
        self.conv2 = GraphConv(128, 128)
        self.pool2 = TopKPooling(128, ratio=0.8)
        self.conv3 = GraphConv(128, 128)
        self.pool3 = TopKPooling(128, ratio=0.8)

        # 256 = max-pooled (128) + mean-pooled (128) readout.
        self.lin1 = torch.nn.Linear(256, 128)
        self.lin2 = torch.nn.Linear(128, 64)
        self.lin3 = torch.nn.Linear(64, num_classes)

    def forward(self, data):
        '''
        Compute per-graph log-probabilities for a batch.

        Args:
            data: Batch object providing ``x``, ``edge_index`` and ``batch``.

        Returns:
            Tensor of log-softmax scores with one row per graph.
        '''
        x, edge_index, batch = data.x, data.edge_index, data.batch

        stages = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
        )
        readout = 0
        for conv, pool in stages:
            x = F.relu(conv(x, edge_index))
            # The pooling layer returns a 6-tuple; only the pooled features,
            # edges and batch assignment are needed here.
            x, edge_index, _, batch, _, _ = pool(x, edge_index, None, batch)
            readout = readout + torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.lin1(readout))
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.lin2(x))
        x = F.log_softmax(self.lin3(x), dim=-1)

        return x

In [51]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = Net()
model = model.to(device)

# Adam with a small learning rate, multiplied by 0.1 after every 30 scheduler steps.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=30, gamma=0.1)

In [52]:
import torch.nn.functional as F

def train(epoch):
    """Run one pass over ``train_loader`` and return the mean loss per sample.

    Args:
        epoch: Epoch number; only used in the progress message.

    Returns:
        Sum of per-batch losses weighted by batch size, divided by
        ``len(train_dataset)``.
    """
    print(f'the {epoch} th training! ')
    # Training mode: dropout (and batch normalization, if present) behave as
    # they should during training, which helps against overfitting.
    model.train()

    weighted_loss_sum = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        batch_loss = F.nll_loss(model(batch), batch.y)
        batch_loss.backward()
        # Overwrite the same console line with the latest batch loss.
        print(f"loss = {batch_loss.item()}", end="\r")
        weighted_loss_sum += batch.num_graphs * batch_loss.item()
        optimizer.step()
    return weighted_loss_sum / len(train_dataset)


@torch.no_grad()
def test(loader):
    """Return the accuracy of ``model`` over every sample served by ``loader``.

    Args:
        loader: Data loader whose batches expose ``y`` labels.

    Returns:
        Number of correct predictions divided by ``len(loader.dataset)``.
    """
    model.eval()

    hits = 0
    for batch in loader:
        batch = batch.to(device)
        # max over the class dimension yields (values, indices); keep the indices.
        _, predicted = model(batch).max(dim=1)
        hits += predicted.eq(batch.y).sum().item()
    return hits / len(loader.dataset)


In [42]:
import logging

# The log file and the checkpoint both live in saved_models/. Create it up
# front so neither logging.basicConfig nor torch.save fails when it is missing.
os.makedirs('saved_models', exist_ok=True)

logging.basicConfig(
    filename='saved_models/train-test.logs',
    filemode='a',
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%H:%M:%S',
    level=logging.DEBUG,
)
logger = logging.getLogger()

best_val_acc = 0
# NOTE(review): epoch numbering starts at 3, presumably left over from resuming
# an earlier run -- confirm whether range(1, 201) was intended.
for epoch in range(3, 201):
    loss = train(epoch)
    train_acc = test(train_loader)
    val_acc = test(val_loader)

    # Keep only the weights that achieve the best validation accuracy so far.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "saved_models/new_main.pt")

    test_acc = test(test_loader)
    print(f'the {epoch} th acc on test is: {test_acc}')
    scheduler.step()

    log = 'Epoch: {:03d}, Loss: {:.5f}, Train Acc: {:.5f}, Val Acc: {:.5f}, Test Acc: {:.5f}'.format(epoch, loss, train_acc, val_acc, test_acc)
    logger.info(log)

the 3 th training! 


PicklingError: Can't pickle <class '__main__.FastCloneDataset'>: it's not the same object as __main__.FastCloneDataset

In [None]:
# Accuracy on the test split using the weights currently held by `model`.
# NOTE(review): the best-validation checkpoint saved to saved_models/new_main.pt
# is not reloaded here -- confirm whether it should be before reporting.
test_acc = test(test_loader)
test_acc