# Setup

In [None]:
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.1+cu113.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

In [None]:
# Fetch the project repository and make it the working directory for the
# remaining cells.
# NOTE(review): this cell is not idempotent — it changes the working directory,
# so run it once per fresh kernel.
!git clone https://github.com/Miyamura80/GNN_Graph_Classification_GRL.git
%cd GNN_Graph_Classification_GRL

# Data Loader

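The cell below reads the JSON graph files, converts every graph into a PyG `Data` object, and caches the converted graphs as a pickle file.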

In [None]:
import json
from torch_geometric.data import Data
import torch
import pickle
import os
import os.path as osp
import numpy as np


def read_sc_graph(direc, file, processed_root):
    """Load the graphs of one dataset split, converting and caching on first use.

    Args:
        direc: Directory that contains the raw ``<file>.json`` file.
        file: Split name without extension (e.g. ``"train"``).
        processed_root: Directory holding the pickled cache ``<file>.pre``. It is
            created (including missing parent directories) when the cache is
            first written.

    Returns:
        The list of converted graphs, either freshly produced by
        ``map_sc_graph_to_pyg`` or unpickled from the cache.
    """
    path = osp.join(direc, file + ".json")
    presaved_path = osp.join(processed_root, file + ".pre")

    if osp.exists(presaved_path):  # Load the pre-existing pickle
        print("load preexisting")
        # NOTE(review): pickle.load can execute arbitrary code; only point this
        # at cache files that were written by this function.
        with open(presaved_path, "rb") as g:
            return pickle.load(g)

    # The cache file doesn't exist yet: build it from the raw JSON file.
    print("making directory")
    with open(path, "r") as f:
        data = f.readlines()

    # Each line is parsed as its own JSON document after dropping the first
    # character of the first line and the last character of every stripped line
    # (presumably the enclosing "[", "]" and the "," separators of a
    # one-object-per-line JSON array — TODO confirm against the data files).
    data[0] = data[0][1:]
    data = [jline.strip()[:-1] for jline in data]
    graphs = [json.loads(jline) for jline in data]

    # Load Json into PyG `Data` type
    pyg_graphs = [
        map_sc_graph_to_pyg(graph, make_undirected=True, remove_dup=False)
        for graph in graphs
    ]

    # makedirs also creates missing parents and tolerates an existing directory.
    os.makedirs(processed_root, exist_ok=True)
    with open(presaved_path, "wb") as g:  # Save for future reference
        pickle.dump(pyg_graphs, g)
    return pyg_graphs


def map_sc_graph_to_pyg(json_file, make_undirected=True, remove_dup=False):
    """Convert one parsed JSON graph record into a PyG ``Data`` object.

    Args:
        json_file: Mapping with the keys ``"graph"`` (a list of
            ``[source, edge_type, target]`` triples), ``"node_features"`` (one
            feature row per node) and ``"targets"`` (the graph label).
        make_undirected: When true, every edge is also added in the reverse
            direction and carries the same edge type as the forward edge.
        remove_dup: Only used together with ``make_undirected``. When true,
            duplicate ``(source, target)`` pairs are collapsed; each surviving
            edge keeps the type of its first occurrence, so further types of a
            duplicated pair are dropped.

    Returns:
        ``Data`` with ``x`` (float node features restricted to columns 0-10 and
        22-35), ``edge_index`` (long, shape ``(2, E)``), ``edge_attr`` (long,
        edge type minus one, shape ``(E,)``) and ``y`` (float, shape ``(1, 1)``).
    """
    triples = json_file["graph"]
    # reshape(-1, 2) keeps the (2, E) layout even for a graph without edges.
    edge_index = np.array(
        [[g[0], g[2]] for g in triples], dtype=np.int64
    ).reshape(-1, 2).T  # Edge Index
    edge_attributes = np.array(
        [g[1] - 1 for g in triples], dtype=np.int64
    )  # Edge type, shifted by -1 so the smallest type becomes 0

    if make_undirected:
        # Append the reversed edges; the attributes are duplicated so that
        # edge_attr[i] always describes edge_index[:, i].
        edge_index = np.concatenate([edge_index, edge_index[[1, 0], :]], axis=1)
        edge_attributes = np.concatenate([edge_attributes, edge_attributes], axis=0)
        if remove_dup:
            # return_index gives the first occurrence of every unique column,
            # which keeps edge_attr aligned with the de-duplicated edge_index.
            edge_index, first_seen = np.unique(edge_index, axis=1, return_index=True)
            edge_attributes = edge_attributes[first_seen]

    # Always hand tensors to `Data`, whichever branch was taken above.
    edge_index = torch.LongTensor(edge_index)
    edge_attributes = torch.LongTensor(edge_attributes)

    features = np.array(json_file["node_features"])
    # Keep feature columns 0-10 and 22-35 (25 columns in total).
    features = np.concatenate((features[:, 0:11], features[:, 22:36]), axis=1)
    x = torch.FloatTensor(features)
    y = torch.FloatTensor(np.array([[int(json_file["targets"])]]).T)
    return Data(x=x, edge_index=edge_index, edge_attr=edge_attributes, y=y)


def get_dataset(args, root_dir):
    """Load the train and validation graphs of the dataset named by ``args.dataset``.

    The raw files are looked up in ``<root_dir>/data/<dataset>`` and the cached,
    converted graphs in its ``<dataset>_proc`` sub-directory.

    Returns:
        ``(train_graphs, valid_graphs, num_feat, num_pred)`` where ``num_feat``
        is fixed to 25 and ``num_pred`` to 1.
    """
    data_dir = osp.join(root_dir, "data", args.dataset)
    cache_dir = osp.join(data_dir, f"{args.dataset}_proc")

    # Same loading routine for both splits, train first.
    train_graphs, valid_graphs = [
        read_sc_graph(data_dir, split, cache_dir) for split in ("train", "valid")
    ]

    num_feat, num_pred = 25, 1
    return train_graphs, valid_graphs, num_feat, num_pred

def fyi(x):
    """Return ``2 * x``."""
    doubled = 2 * x
    return doubled

# Model Loader

## Models

### GAT

In [None]:
import torch
import torch.nn.functional as F
from torch.nn import ModuleList, BatchNorm1d
from torch.nn import Sequential, Linear, ReLU
from torch_geometric.nn import GATConv
from torch_scatter import scatter_max, scatter_mean


class NetGAT(torch.nn.Module):
    """Graph-level predictor built from an initial MLP and a stack of GATConv layers.

    The prediction is the sum of per-layer read-outs: the node embeddings after
    the initial MLP and after every GAT layer are pooled per graph, mapped to
    ``num_classes`` values by a linear layer, passed through dropout and added up.

    Args:
        num_features: Number of input features per node.
        num_classes: Number of output values per graph.
        emb_sizes: Embedding widths. Entry 0 is the width produced by the
            initial MLP; entry ``i + 1`` is the output width of GAT layer ``i``.
            Defaults to ``[32, 64, 64]``.
        drpt_prob: Dropout probability applied to every read-out.
        scatter: Graph pooling mode, ``"max"`` or ``"mean"``.
        device: Device on which the sub-modules are created and inputs are moved.
    """

    def __init__(
        self,
        num_features,
        num_classes,
        emb_sizes=None,
        drpt_prob=0.5,
        scatter="max",
        device="cpu",
    ):
        super(NetGAT, self).__init__()
        if emb_sizes is None:  # Python default handling for mutable input
            emb_sizes = [32, 64, 64]
        self.num_features = num_features
        self.emb_sizes = emb_sizes
        self.num_layers = len(self.emb_sizes) - 1
        self.drpt_prob = drpt_prob
        self.scatter = scatter
        self.device = device

        # The same layer objects are registered in the ModuleList (used by
        # reset_parameters) and in the Sequential (used by forward).
        self.initial_mlp_modules = ModuleList(
            [
                Linear(num_features, emb_sizes[0]).to(device),
                BatchNorm1d(emb_sizes[0]).to(device),
                ReLU().to(device),
                Linear(emb_sizes[0], emb_sizes[0]).to(device),
                BatchNorm1d(emb_sizes[0]).to(device),
                ReLU().to(device),
            ]
        )
        self.initial_mlp = Sequential(*self.initial_mlp_modules).to(device)
        self.initial_linear = Linear(emb_sizes[0], num_classes).to(device)

        gat_layers = []
        linears = []
        for i in range(self.num_layers):
            in_channel = emb_sizes[i]
            out_channel = emb_sizes[i + 1]
            gat_layer = GATConv(in_channels=in_channel, out_channels=out_channel).to(
                device
            )
            gat_layers.append(gat_layer)
            linears.append(Linear(emb_sizes[i + 1], num_classes).to(device))

        self.gat_modules = ModuleList(gat_layers)
        self.linear_modules = ModuleList(linears)

    def reset_parameters(self):
        """Re-initialise every sub-module that exposes ``reset_parameters``."""
        # The order matches the original implementation: direct children first,
        # then the GAT layers, the read-out linears and the initial MLP layers.
        module_groups = (
            self._modules.values(),
            self.gat_modules,
            self.linear_modules,
            self.initial_mlp_modules,
        )
        for group in module_groups:
            for module in group:
                if hasattr(module, "reset_parameters"):
                    module.reset_parameters()

    def pooling(self, x_feat, batch):
        """Pool node rows of ``x_feat`` into one row per graph index in ``batch``.

        Raises:
            ValueError: If ``self.scatter`` is neither ``"max"`` nor ``"mean"``.
        """
        if self.scatter == "max":
            return scatter_max(x_feat, batch, dim=0)[0].to(self.device)
        if self.scatter == "mean":
            return scatter_mean(x_feat, batch, dim=0).to(self.device)
        # Returning None here used to surface later as an obscure error.
        raise ValueError(
            "Unsupported scatter mode {!r}; expected 'max' or 'mean'".format(
                self.scatter
            )
        )

    def forward(self, data):
        """Compute one row of ``num_classes`` outputs per graph in ``data``.

        ``data`` must provide ``x``, ``edge_index``, ``edge_attr`` and ``batch``.
        """
        x_feat = data.x.to(self.device)
        edge_index = data.edge_index.to(self.device)
        edge_attributes = data.edge_attr.to(self.device)

        x_feat = self.initial_mlp(x_feat)

        # F.dropout defaults to training=True, so the flag must be passed
        # explicitly or dropout would stay active in eval mode.
        out = F.dropout(
            self.pooling(self.initial_linear(x_feat), data.batch),
            p=self.drpt_prob,
            training=self.training,
        )

        # Only edges whose attribute equals 1 are used for message passing; the
        # selection does not depend on the layer, so it is computed once.
        edges = edge_index[:, edge_attributes == 1]

        for gat_layer, linear_layer in zip(self.gat_modules, self.linear_modules):
            x_feat = gat_layer(x_feat, edges).to(self.device)

            out = out + F.dropout(
                linear_layer(self.pooling(x_feat, data.batch)),
                p=self.drpt_prob,
                training=self.training,
            )

        return out



### GCN

In [None]:
import torch
import torch.nn.functional as F
from torch_scatter import scatter_max, scatter_mean
from torch.nn import ModuleList, BatchNorm1d
from torch.nn import Sequential, Linear, ReLU
from torch_geometric.nn import GCNConv


class NetGCN(torch.nn.Module):
    """Graph-level predictor built from an initial MLP and a stack of GCNConv layers.

    The prediction is the sum of per-layer read-outs: the node embeddings after
    the initial MLP and after every GCN layer are pooled per graph, mapped to
    ``num_classes`` values by a linear layer, passed through dropout and added up.

    Args:
        num_features: Number of input features per node.
        num_classes: Number of output values per graph.
        emb_sizes: Embedding widths. Entry 0 is the width produced by the
            initial MLP; entry ``i + 1`` is the output width of GCN layer ``i``.
            Defaults to ``[32, 64, 64]``.
        drpt_prob: Dropout probability applied to every read-out.
        scatter: Graph pooling mode, ``"max"`` or ``"mean"``.
        device: Device on which the sub-modules are created and inputs are moved.
    """

    def __init__(
        self,
        num_features,
        num_classes,
        emb_sizes=None,
        drpt_prob=0.5,
        scatter="max",
        device="cpu",
    ):
        super(NetGCN, self).__init__()
        if emb_sizes is None:  # Python default handling for mutable input
            emb_sizes = [32, 64, 64]
        self.num_features = num_features
        self.emb_sizes = emb_sizes
        self.num_layers = len(self.emb_sizes) - 1
        self.drpt_prob = drpt_prob
        self.scatter = scatter
        self.device = device

        # The same layer objects are registered in the ModuleList (used by
        # reset_parameters) and in the Sequential (used by forward).
        self.initial_mlp_modules = ModuleList(
            [
                Linear(num_features, emb_sizes[0]).to(device),
                BatchNorm1d(emb_sizes[0]).to(device),
                ReLU().to(device),
                Linear(emb_sizes[0], emb_sizes[0]).to(device),
                BatchNorm1d(emb_sizes[0]).to(device),
                ReLU().to(device),
            ]
        )
        self.initial_mlp = Sequential(*self.initial_mlp_modules).to(device)
        self.initial_linear = Linear(emb_sizes[0], num_classes).to(device)

        gcn_layers = []
        linears = []
        for i in range(self.num_layers):
            in_channel = emb_sizes[i]
            out_channel = emb_sizes[i + 1]
            gcn_layer = GCNConv(in_channels=in_channel, out_channels=out_channel).to(
                device
            )
            gcn_layers.append(gcn_layer)
            linears.append(Linear(emb_sizes[i + 1], num_classes).to(device))

        self.gcn_modules = ModuleList(gcn_layers)
        self.linear_modules = ModuleList(linears)

    def reset_parameters(self):
        """Re-initialise every sub-module that exposes ``reset_parameters``."""
        # The order matches the original implementation: direct children first,
        # then the GCN layers, the read-out linears and the initial MLP layers.
        module_groups = (
            self._modules.values(),
            self.gcn_modules,
            self.linear_modules,
            self.initial_mlp_modules,
        )
        for group in module_groups:
            for module in group:
                if hasattr(module, "reset_parameters"):
                    module.reset_parameters()

    def pooling(self, x_feat, batch):
        """Pool node rows of ``x_feat`` into one row per graph index in ``batch``.

        Raises:
            ValueError: If ``self.scatter`` is neither ``"max"`` nor ``"mean"``.
        """
        if self.scatter == "max":
            return scatter_max(x_feat, batch, dim=0)[0].to(self.device)
        if self.scatter == "mean":
            return scatter_mean(x_feat, batch, dim=0).to(self.device)
        # Returning None here used to surface later as an obscure error.
        raise ValueError(
            "Unsupported scatter mode {!r}; expected 'max' or 'mean'".format(
                self.scatter
            )
        )

    def forward(self, data):
        """Compute one row of ``num_classes`` outputs per graph in ``data``.

        ``data`` must provide ``x``, ``edge_index``, ``edge_attr`` and ``batch``.
        """
        x_feat = data.x.to(self.device)
        edge_index = data.edge_index.to(self.device)
        edge_attributes = data.edge_attr.to(self.device)

        x_feat = self.initial_mlp(x_feat)

        # F.dropout defaults to training=True, so the flag must be passed
        # explicitly or dropout would stay active in eval mode.
        out = F.dropout(
            self.pooling(self.initial_linear(x_feat), data.batch),
            p=self.drpt_prob,
            training=self.training,
        )

        # Only edges whose attribute equals 1 are used for message passing; the
        # selection does not depend on the layer, so it is computed once.
        edges = edge_index[:, edge_attributes == 1]

        for gcn_layer, linear_layer in zip(self.gcn_modules, self.linear_modules):
            x_feat = gcn_layer(x_feat, edges).to(self.device)

            out = out + F.dropout(
                linear_layer(self.pooling(x_feat, data.batch)),
                p=self.drpt_prob,
                training=self.training,
            )

        return out



## Loader

In [None]:

def get_model(args, device="cpu", num_features=None, num_classes=None):
    """Build the model selected by ``args.model``.

    Args:
        args: Configuration object providing ``model`` (``"GAT"`` or ``"GCN"``),
            ``emb_dim``, ``num_layers``, ``scatter`` and ``dropout``.
        device: Device passed on to the model constructor.
        num_features: Number of input features per node.
        num_classes: Number of outputs per graph.

    Returns:
        A ``NetGAT`` or ``NetGCN`` instance with ``num_layers`` message-passing
        layers of width ``emb_dim``.

    Raises:
        ValueError: If ``args.model`` names an unsupported model (previously the
            function silently returned ``None``).
    """
    if args.model == "GAT":
        model_cls = NetGAT
    elif args.model == "GCN":
        model_cls = NetGCN
    else:
        raise ValueError(
            "Unknown model {!r}; expected 'GAT' or 'GCN'".format(args.model)
        )

    # One width for the initial MLP plus one per message-passing layer.
    emb_sizes = [args.emb_dim] * (args.num_layers + 1)
    return model_cls(
        num_features,
        num_classes,
        emb_sizes=emb_sizes,
        device=device,
        scatter=args.scatter,
        drpt_prob=args.dropout,
    )




# Experiments

In [None]:
import torch
from torch_geometric.loader import DataLoader
import numpy as np
import matplotlib.pyplot as plt

avail_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def train(model, loader, optimizer, loss_fun):
    model.train()
    loss_all = 0

    for data in loader:
        data = data.to(avail_device)
        optimizer.zero_grad()
        loss = loss_fun(model(data), data.y).to(avail_device)
        loss.backward()
        loss_all += loss.item()
        optimizer.step()

    return loss_all / len(loader.dataset)


def val(model, loader, loss_fun, y_idx=0):
    model.eval()
    loss_all = 0

    for data in loader:
        data = data.to(avail_device)
        loss_all += loss_fun(model(data), data.y).item()

    return loss_all / len(loader.dataset)


def test(model, loader):
    model.eval()
    total_err = 0

    for data in loader:
        data = data.to(avail_device)
        pred = model(data).max(1)[1]
        correct += pred.eq(data.y).sum().item()

    return correct / len(loader.dataset)

def run_sc_model_gc(
    model,
    dataset_tr,
    dataset_val,
    batch_size=32,
    lr=0.0001,
    epochs=300,
    nb_reruns=5,
    save_path="../model_eito.pt",
):
    """Train ``model`` ``nb_reruns`` times from scratch and report validation loss.

    For every re-run the parameters are reset, the model is trained for
    ``epochs`` epochs with Adam at a fixed learning rate, and the lowest
    validation loss is recorded. Afterwards the model (in the state reached at
    the end of the last re-run) is saved, the loss curves of every re-run are
    plotted, and the mean / standard deviation of the best validation losses
    are printed.

    Args:
        model: Model exposing ``reset_parameters()``.
        dataset_tr: Training graphs.
        dataset_val: Validation graphs.
        batch_size: Batch size of both loaders.
        lr: Learning rate for Adam.
        epochs: Number of epochs per re-run.
        nb_reruns: Number of independent re-runs.
        save_path: Destination passed to ``torch.save``.
    """
    loss_fun = torch.nn.BCEWithLogitsLoss()

    print("----------------- Predicting bug presence -----------------")
    all_val_loss = np.zeros(nb_reruns,)

    # The loaders do not depend on the re-run, so they are built once.
    val_loader = DataLoader(
        dataset_val, batch_size=batch_size, shuffle=False
    )  # no shuffling for validation
    train_loader = DataLoader(  # Shuffle for training
        dataset_tr, batch_size=batch_size, shuffle=True
    )

    # One (train curve, validation curve) pair per re-run, so the plot does not
    # join the last epoch of one re-run to the first epoch of the next.
    loss_curves = []

    for rerun in range(nb_reruns):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # static LR

        print(
            "---------------- "
            + ": Re-run {} ----------------".format(rerun)
        )

        # inf (rather than a large finite sentinel) so any finite loss counts.
        best_val_loss = float("inf")
        train_curve = []
        val_curve = []

        for epoch in range(1, epochs + 1):
            train_loss = train(
                model, train_loader, optimizer, loss_fun
            )
            val_loss = val(model, val_loader, loss_fun)
            if best_val_loss >= val_loss:  # Improvement in validation loss
                best_val_loss = val_loss

            train_curve.append(train_loss)
            val_curve.append(val_loss)

            print(
                "Epoch: {:03d}, LR: {:7f}, Train Loss: {:.7f}, "
                "Val Loss: {:.7f}".format(
                    epoch, lr, train_loss, val_loss
                )
            )

        all_val_loss[rerun] = best_val_loss
        loss_curves.append((train_curve, val_curve))

    # Calculate mean and standard deviation of validation results
    avg_val_loss = all_val_loss.mean()
    std_val_loss = np.std(all_val_loss)

    torch.save(model, save_path)

    # ======================================
    # Plotting
    # ======================================
    epoch_axis = list(range(1, epochs + 1))
    for rerun, (train_curve, val_curve) in enumerate(loss_curves):
        plt.plot(
            epoch_axis, train_curve, label="training loss (re-run {})".format(rerun)
        )
        plt.plot(
            epoch_axis, val_curve, label="validation loss (re-run {})".format(rerun)
        )
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.legend()
    plt.show()

    print("---------------- Final Result ----------------")
    print("Validation -- Mean: " + str(avg_val_loss) + ", Std: " + str(std_val_loss))



# Main

In [None]:
import time
import torch
import argparse
import os.path as osp

# `collections` has no `DotMap` (the original import raised ImportError), so the
# settings are exposed through argparse.Namespace, which offers the same
# attribute-style access for the keys used below.
args_dict = {
    "emb_dim": 64,
    "dataset": "reentrancy",
    "batch_size": 32,
    "model": "GAT",
    "lr": 0.001,
    "max_distance": 5,
    "num_layers": 2,
    "scatter": "max",
    "dropout": 0.5,
    "eps": 0.0,
    "epochs": 300
}
args = argparse.Namespace(**args_dict)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
try:
    root_dir = osp.join(osp.dirname(osp.realpath(__file__)), "..")
except NameError:
    # `__file__` is not defined inside a notebook kernel; fall back to the
    # current working directory.
    # NOTE(review): assumes the working directory is the cloned repository root
    # and that it contains the `data/` folder — TODO confirm.
    root_dir = osp.realpath(".")



train_graphs, valid_graphs, num_feat, num_pred = get_dataset(args, root_dir)

model = get_model(
    args,
    device,
    num_features=num_feat,
    num_classes=num_pred,
)

run_sc_model_gc(
    model,
    train_graphs,
    valid_graphs,
    lr=args.lr,
    batch_size=args.batch_size,
    epochs=args.epochs,
)
