In [1]:
import os
import pandas as pd
import numpy as np
import torch
import dgl

Using backend: pytorch


In [2]:
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch import GraphConv

In [3]:
from sklearn.utils import shuffle
# Number of graphs per mini-batch; passed as batch_size to both GraphDataLoader instances further down.
my_batch_size = 30

In [4]:
from dgl.data import DGLDataset

class MyDataset(DGLDataset):
    """Binary graph-classification dataset built from two CSV file listings.

    Graph file paths are read from ``./positive_dataset.csv`` and
    ``./negative_dataset.csv`` (column ``file_name``). Every negative graph is
    paired with a positive one; positive paths are reused cyclically, so the
    resulting dataset holds ``2 * len(negatives)`` rows with a 1:1 class ratio.
    Graphs themselves are loaded lazily from disk in ``__getitem__``.

    Parameters
    -------------------------
    url, raw_dir, save_dir, force_reload, verbose:
        Forwarded unchanged to ``DGLDataset.__init__``.
        NOTE(review): ``process`` reads the CSV files from the current working
        directory and does not consult ``raw_dir``.

    """
    _pos_directory= '../positive_graph_save/'
    _neg_directory= '../negative_graph_save/'
    def __init__(self, 
                 url=None,
                 raw_dir=None,
                 save_dir=None,
                 force_reload=False,
                 verbose=False):
        super(MyDataset, self).__init__(name='docking_classify',
                                        url=url,
                                        raw_dir=raw_dir,
                                        save_dir=save_dir,
                                        force_reload=force_reload,
                                        verbose=verbose)
    def download(self):
        """No-op: nothing is downloaded by this dataset."""
        pass

    #must be implemented
    def process(self):
        """Build ``self.df_dataset``: a shuffled table of (file_name, label) rows.

        Labels are stored as two-element float tensors: ``[1, 0]`` for a
        positive graph and ``[0, 1]`` for a negative one.
        """
        df_pos = pd.read_csv('./positive_dataset.csv')
        df_neg = pd.read_csv('./negative_dataset.csv')
        pos_graphs = df_pos['file_name']
        neg_graphs = df_neg['file_name']
        num_pos = len(pos_graphs)

        self.graph_dataset = []
        self.graph_labels = []
        #negative graphs are more, so positives are recycled with a modulo index
        for i in range(len(neg_graphs)):
            self.graph_dataset.append(pos_graphs.iloc[i % num_pos])
            self.graph_dataset.append(neg_graphs.iloc[i])
            self.graph_labels.append(torch.Tensor([1,0])) #positive
            self.graph_labels.append(torch.Tensor([0,1])) #negative
            
        self.df_dataset = pd.DataFrame({'file_name':self.graph_dataset, 'label':self.graph_labels})
        # shuffle() permutes the rows but keeps their original index labels.
        # Resetting the index makes row position i really be the i-th shuffled
        # row; otherwise integer lookups would silently undo the shuffle.
        self.df_dataset = shuffle(self.df_dataset).reset_index(drop=True)

    
    #must be implemented
    def __getitem__(self, idx):
        """get one item by index
        
        Parameters
        ---------------
        idx: int or 0-d integer Tensor
            Item index (position in the shuffled table)

        Returns
        ---------------
        (dgl.DGLGraph, Tensor)
            The first graph stored in the row's file, and a scalar long label:
            1 for a positive graph, 0 for a negative graph.
        """
        # int() accepts plain ints as well as the 0-d tensors a sampler may yield
        row = int(idx)
        graphs = dgl.load_graphs(self.df_dataset['file_name'].iloc[row])[0]
        label = self.df_dataset['label'].iloc[row]
        # element 0 of the stored [pos, neg] pair is 1 exactly for positive graphs
        return graphs[0], label[0].long()

    #must be implemented
    def __len__(self):
        """Number of examples (rows) in the dataset."""
        return self.df_dataset.shape[0]
        

    def save(self):
        """No-op: the processed table is not written to disk."""
        pass

    def load(self):
        """No-op: nothing is restored from disk."""
        pass

    def has_cache(self):
        """Report that no cached copy exists (nothing is ever saved)."""
        # Explicit False instead of the previous implicit None; both are falsy.
        return False



In [16]:
# Build the dataset with default constructor arguments (no url/raw_dir/save_dir overrides).
my_dataset = MyDataset()

In [17]:
from dgl.dataloading.pytorch import GraphDataLoader
from torch.utils.data.sampler import SubsetRandomSampler

# Positional 80/20 split: the leading 80% of indices feed training, the rest testing.
num_examples = len(my_dataset)
print("dataset length:", num_examples)
num_train = int(num_examples*0.8)

all_indices = torch.arange(num_examples)
train_sampler = SubsetRandomSampler(all_indices[:num_train])
test_sampler = SubsetRandomSampler(all_indices[num_train:])

# Both loaders share the same batching options and differ only in their sampler.
loader_kwargs = dict(batch_size=my_batch_size, drop_last=False)
train_dataloader = GraphDataLoader(my_dataset, sampler=train_sampler, **loader_kwargs)
test_dataloader = GraphDataLoader(my_dataset, sampler=test_sampler, **loader_kwargs)

dataset length: 14996


## Use `train_dataloader` and `test_dataloader` to draw batched graphs, each batch containing `my_batch_size` graphs

In [18]:
# Pull a single batch from the training loader to inspect its structure;
# the printed value is a [batched graph, label tensor] pair.
it = iter(train_dataloader)
batch = next(it)
print(batch)

[Graph(num_nodes=68123, num_edges=64322414,
      ndata_schemes={'h': Scheme(shape=(10,), dtype=torch.int64)}
      edata_schemes={'h': Scheme(shape=(1,), dtype=torch.float32)}), tensor([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 1, 0, 1, 1])]


In [19]:
class GCN(nn.Module):
    """Two-layer graph convolutional network with a mean-over-nodes readout.

    Parameters
    ----------
    in_feats : int
        Size of each input node feature vector.
    hidden_size : int
        Output size of the first GraphConv layer.
    num_classes : int
        Output size of the second GraphConv layer (one score per class).
    """
    def __init__(self, in_feats, hidden_size, num_classes):
        super(GCN, self).__init__()
        # allow_zero_in_degree=True: accept graphs that contain nodes without incoming edges
        self.conv1 = GraphConv(in_feats, hidden_size, allow_zero_in_degree=True)
        self.conv2 = GraphConv(hidden_size, num_classes, allow_zero_in_degree=True)

    def forward(self, g, inputs):
        """Return the per-graph mean of the node outputs of the second layer.

        Parameters
        ----------
        g : dgl.DGLGraph
            A (possibly batched) graph.
        inputs : Tensor
            Node features fed to the first layer.
        """
        h = self.conv1(g, inputs)
        h = F.relu(h)
        h = self.conv2(g, h)
        # Write the node outputs inside a local scope so the caller's
        # g.ndata['h'] (the input features) is not overwritten with logits;
        # without it, a second forward pass on the same graph would read
        # the wrong tensor.
        with g.local_scope():
            g.ndata['h'] = h
            return dgl.mean_nodes(g, 'h')
# 10 input features per node (the 'h' node field has shape (10,)), 16 hidden units, 2 classes.
gnn = GCN(10, 16, 2)

In [21]:
import itertools
from tqdm import tqdm

# Train the GCN with Adam for 30 passes over the training loader.
optimizer = torch.optim.Adam(gnn.parameters(), lr=0.01)
all_logits = []
for epoch in range(30):
    for batch_idx, (batched_graph, labels) in enumerate(tqdm(train_dataloader)):
        # node features are stored as integers; GraphConv needs floating-point input
        pred = gnn(batched_graph, batched_graph.ndata['h'].float())
        loss = F.cross_entropy(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Log the running batch counter and the scalar loss value. The previous
        # message printed a stale global `batch` (a whole batched graph plus its
        # labels) and the full tensor repr of the loss on every step.
        print("epoch:"+str(epoch)+" batches:"+str(batch_idx)+"------------------------loss:"+str(loss.item()))
    