## Prediction Verification GCN

Load libraries

In [1]:
import networkx as nx
import torch
from torch_geometric.utils.convert import from_networkx
import numpy as np
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.linear_model import LinearRegression

  from .autonotebook import tqdm as notebook_tqdm


Load data

In [3]:
train_ratio = 0.6
val_ratio = 0.2

# Node attributes that from_networkx concatenates (in this order) into the
# feature matrix `x` of every graph.
NODE_FEATURE_ATTRS = ['follower_count', 'friends_count', 'listed_count', 'favourites_count', 'statuses_count']


def load_verified_graph(gml_path):
    """Read a GML file and build the matching PyTorch Geometric data object.

    The steps are the same for every network:
      1. read the graph with networkx,
      2. relabel the nodes to consecutive integers (the previous label is
         kept in the node attribute "id"),
      3. move each node's 'verified' attribute to 'y' so that it becomes the
         target after conversion,
      4. convert with from_networkx, grouping NODE_FEATURE_ATTRS into `x`,
      5. cast `x` to a float tensor.

    Args:
        gml_path: path of the GML file to read.

    Returns:
        (graph, data): the relabelled networkx graph and the converted data object.

    Raises:
        KeyError: if a node has no 'verified' attribute.
    """
    graph = nx.read_gml(gml_path)
    graph = nx.convert_node_labels_to_integers(graph, label_attribute="id")

    # Rename the 'verified' node attribute to 'y' before conversion.
    for _, node_attrs in graph.nodes(data=True):
        node_attrs["y"] = node_attrs.pop('verified')

    pyg_data = from_networkx(graph, group_node_attrs=NODE_FEATURE_ATTRS)
    pyg_data.x = pyg_data.x.type(torch.FloatTensor)
    return graph, pyg_data


# DEJAN NETWORK
DejanGraph, DejanData = load_verified_graph('Datasets/Dejan-Full-Node-Info.gml')

# HEDDEN NETWORK
HeddenGraph, HeddenData = load_verified_graph('Datasets/Hedden.gml')

# MCCORMICK NETWORK
McCormickGraph, McCormickData = load_verified_graph('Datasets/McCormick.gml')

# The GCN defined later reads these two module-level values, so all three
# networks must expose the same number of features per node.
feature_widths = {len(graph_data.x[0]) for graph_data in (DejanData, HeddenData, McCormickData)}
assert len(feature_widths) == 1, f"networks disagree on feature width: {feature_widths}"
num_node_features = len(McCormickData.x[0])
num_classes = 2

Linear regression models as a baseline (R² is computed on the same data each model was fitted on)

In [4]:
# Fit one ordinary-least-squares model per network, in the order
# Dejan -> Hedden -> McCormick, using the node features as inputs and the
# 'y' labels as the regression target.
DejanModel, HeddenModel, McCormickModel = (
    LinearRegression().fit(network.x, network.y)
    for network in (DejanData, HeddenData, McCormickData)
)

# Report R^2 for each model, scored on the data it was fitted on.
r_sq = DejanModel.score(DejanData.x, DejanData.y)
print(f"coefficient of determination dejan: {r_sq}")

r_sq = HeddenModel.score(HeddenData.x, HeddenData.y)
print(f"coefficient of determination hedden: {r_sq}")

r_sq = McCormickModel.score(McCormickData.x, McCormickData.y)
print(f"coefficient of determination mccormick: {r_sq}")

coefficient of determination dejan: 0.4021911191578005
coefficient of determination hedden: 0.16435991757673385
coefficient of determination mccormick: 0.16573663521979654


Define data masking function - 60/20/20 split

In [None]:
def mask_data(data, train_ratio, val_ratio):
    """Attach random, disjoint train/val/test boolean node masks to `data`.

    The node indices 0..num_nodes-1 are shuffled with numpy's global RNG; the
    first int(num_nodes * train_ratio) go to training, the next
    int(num_nodes * val_ratio) to validation, and every remaining node to the
    test set. The masks are stored as data['train_mask'], data['val_mask']
    and data['test_mask'].

    Args:
        data: object exposing `x` (the first dimension is used as the node
            count), `y` (used for the device of the masks) and item assignment.
        train_ratio: fraction of nodes used for training, in [0, 1].
        val_ratio: fraction of nodes used for validation, in [0, 1].

    Returns:
        None. `data` is modified in place.

    Raises:
        ValueError: if a ratio is outside [0, 1] or the two sum to more than 1.
    """
    if not (0 <= train_ratio <= 1 and 0 <= val_ratio <= 1):
        raise ValueError("train_ratio and val_ratio must each be within [0, 1]")
    # Small tolerance so that splits such as 0.7 / 0.3 are not rejected
    # because of floating point rounding.
    if train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("train_ratio + val_ratio must not exceed 1")

    num_nodes = data.x.shape[0]
    num_train = int(num_nodes * train_ratio)
    num_val = int(num_nodes * val_ratio)

    idx = list(range(num_nodes))
    np.random.shuffle(idx)

    mask_device = data.y.device

    def _mask_for(node_ids):
        # One boolean entry per node; an explicit long tensor keeps the
        # indexing well defined even when `node_ids` is empty.
        mask = torch.zeros(num_nodes, dtype=torch.bool, device=mask_device)
        mask[torch.tensor(node_ids, dtype=torch.long, device=mask_device)] = True
        return mask

    data['train_mask'] = _mask_for(idx[:num_train])
    data['val_mask'] = _mask_for(idx[num_train:num_train + num_val])
    # Whatever is left after the train and validation slices is the test set.
    data['test_mask'] = _mask_for(idx[num_train + num_val:])

Mask data

In [None]:
# Split every network with the ratios declared next to the data loading code
# (train_ratio / val_ratio) instead of repeating the literals here.
# NOTE(review): mask_data shuffles with numpy's global RNG and no seed is set
# in this notebook, so the split differs between runs.
for network_data in (DejanData, HeddenData, McCormickData):
    mask_data(network_data, train_ratio, val_ratio)

Define GCN, hyperparameter grid search

In [None]:
import statistics

class GCN(torch.nn.Module):
    """Stack of GCNConv layers ending in a log-softmax over the classes.

    The network always has one input convolution and one output convolution;
    `num_layers` is the number of *additional* hidden-to-hidden convolutions
    between them, so the model holds num_layers + 2 GCNConv layers in total.

    Args:
        num_layers: number of hidden-to-hidden GCNConv layers.
        in_channels: size of the input node features. Defaults to the
            module-level `num_node_features`.
        hidden_channels: width of every hidden layer (default 16).
        out_channels: number of output classes. Defaults to the module-level
            `num_classes`.
    """

    def __init__(self, num_layers, in_channels=None, hidden_channels=16, out_channels=None):
        super().__init__()
        # Fall back to the notebook-level globals so that GCN(n) keeps working.
        if in_channels is None:
            in_channels = num_node_features
        if out_channels is None:
            out_channels = num_classes

        self.layers = torch.nn.ModuleList()
        self.layers.append(GCNConv(in_channels, hidden_channels))
        for _ in range(num_layers):
            self.layers.append(GCNConv(hidden_channels, hidden_channels))
        self.layers.append(GCNConv(hidden_channels, out_channels))

    def forward(self, data):
        """Return per-node log-probabilities for the graph held in `data`.

        Reads `data.x` and `data.edge_index`. Every layer except the last is
        followed by ReLU and dropout (dropout is only active in training mode).
        """
        x, edge_index = data.x, data.edge_index

        for layer in self.layers[:-1]:
            x = layer(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, training=self.training)
        # The final layer produces the raw class scores.
        x = self.layers[-1](x, edge_index)

        return F.log_softmax(x, dim=1)

# Hyperparameter grid: every combination of these three lists is trained.
num_layers = [2, 3]
l2_regs = [0.0001, 0.001]
learning_rates = [0.05, 0.01, 0.001]

# Number of independent training runs per combination.
NUM_ITERS = 1
# Number of full-batch optimisation steps per training run.
NUM_EPOCHS = 200

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The grid search is run on the Dejan network only.
data = DejanData.to(device)

# One entry per combination: ((layers, l2, lr), mean val accuracy, population stdev).
all_trials = []
for n in num_layers:
    for l2_reg in l2_regs:
        for learning_rate in learning_rates:

            results = []

            for i in range(NUM_ITERS):
                model = GCN(n).to(device)

                optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=l2_reg)

                model.train()
                for epoch in range(NUM_EPOCHS):
                    optimizer.zero_grad()
                    out = model(data)
                    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
                    loss.backward()
                    optimizer.step()

                # Score on the validation nodes; no gradients are needed here.
                model.eval()
                with torch.no_grad():
                    pred = model(data).argmax(dim=1)
                correct = (pred[data.val_mask] == data.y[data.val_mask]).sum()
                acc = int(correct) / int(data.val_mask.sum())
                print(n, l2_reg, learning_rate)
                print(f'Accuracy: {acc:.4f}')

                results.append(acc)

            avg = sum(results) / len(results)
            stdev = statistics.pstdev(results)
            all_trials.append(((n, l2_reg, learning_rate), avg, stdev))

print(("Number of layers", "L2 regularization", "Learning rate"), "Average", "Standard Dev.")
for t in all_trials:
    print(t)

# Do not assign the result to the name `max`: that shadows the builtin and
# makes this cell fail with a TypeError the second time it is run.
best_trial = max(all_trials, key=lambda trial: trial[1])
print("max: ", best_trial)

Train a model on each network

In [None]:
def Run_GCN(data, num_layers=2, epochs=200, lr=0.01, weight_decay=5e-4):
    """Train a fresh GCN on `data` and print its accuracy on the test nodes.

    The model is trained full-batch with Adam on the nodes selected by
    `data.train_mask` and scored on the nodes selected by `data.test_mask`.

    Args:
        data: graph data object carrying x, edge_index, y, train_mask and
            test_mask.
        num_layers: value passed to GCN(...) (default 2).
        epochs: number of optimisation steps (default 200).
        lr: Adam learning rate (default 0.01).
        weight_decay: Adam weight decay (default 5e-4).

    NOTE(review): the defaults are fixed values; they are not taken from the
    result of the hyperparameter grid search.
    """
    data = data.to(device)

    data.validate(raise_on_error=True)

    model = GCN(num_layers).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()

    # Evaluate on the held-out test nodes of the same graph; gradients are
    # not needed for scoring.
    model.eval()
    with torch.no_grad():
        pred = model(data).argmax(dim=1)
    correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
    acc = int(correct) / int(data.test_mask.sum())
    print(f'Accuracy: {acc:.4f}')
    
    
# Train and score one model per network, printing the network name first.
for network_label, network_data in (
    ('dejan', DejanData),
    ('mccormick', McCormickData),
    ('hedden', HeddenData),
):
    print(network_label)
    Run_GCN(network_data)

Test each model on the other networks

In [None]:
def _test_mask_accuracy(model, graph_data):
    """Return the accuracy of `model` on the nodes in graph_data.test_mask."""
    model.eval()
    # Scoring only: no gradients are needed.
    with torch.no_grad():
        pred = model(graph_data).argmax(dim=1)
    correct = (pred[graph_data.test_mask] == graph_data.y[graph_data.test_mask]).sum()
    return int(correct) / int(graph_data.test_mask.sum())


# define method to train on one graph, test on the other two
def test_on_other_two(train_data, test_data1, test_data2):
    """Train a GCN on one graph and print its accuracy on all three graphs.

    The model is trained on the nodes in `train_data.train_mask`. Accuracy is
    then printed for the test-mask nodes of `train_data`, `test_data1` and
    `test_data2`, in that order.

    Args:
        train_data: graph the model is trained on.
        test_data1: first graph the trained model is applied to.
        test_data2: second graph the trained model is applied to.
    """
    train_data = train_data.to(device)
    test_data1 = test_data1.to(device)
    test_data2 = test_data2.to(device)

    train_data.validate(raise_on_error=True)
    test_data1.validate(raise_on_error=True)
    test_data2.validate(raise_on_error=True)

    model = GCN(2).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    # train on one graph
    model.train()
    for epoch in range(200):
        optimizer.zero_grad()
        out = model(train_data)
        loss = F.nll_loss(out[train_data.train_mask], train_data.y[train_data.train_mask])
        loss.backward()
        optimizer.step()

    # test on the graph the model was trained on
    acc = _test_mask_accuracy(model, train_data)
    print(f'Accuracy on trained on model: {acc:.4f}')

    # test on the other two graphs
    acc = _test_mask_accuracy(model, test_data1)
    print(f'Accuracy on other model: {acc:.4f}')

    acc = _test_mask_accuracy(model, test_data2)
    print(f'Accuracy on other other model: {acc:.4f}')

 
# Each entry: banner line, order of the printed accuracies, and the
# (training graph, first other graph, second other graph) triple.
cross_network_runs = (
    ("trained on dejan_____________", "Dejan, McCormick, Hedden",
     (DejanData, McCormickData, HeddenData)),
    ("trained on mccormick_________", "McCormick, Dejan, Hedden",
     (McCormickData, DejanData, HeddenData)),
    ("trained on hedden____________", "Hedden, Dejan, McCormick",
     (HeddenData, DejanData, McCormickData)),
)

for banner, accuracy_order, (source_graph, other_graph1, other_graph2) in cross_network_runs:
    print(banner)
    print(accuracy_order)
    test_on_other_two(source_graph, other_graph1, other_graph2)


Train and test on all three networks

In [None]:
from torch_geometric.data import Data

# Relabel node indexes so the three graphs do not overlap once concatenated.
# The offsets are derived from the node counts of the loaded graphs rather
# than hard-coded, so they stay correct if a dataset changes.
num_dejan_nodes = DejanData.x.shape[0]
num_hedden_nodes = HeddenData.x.shape[0]

# Hedden nodes come right after the Dejan nodes.
HeddenDataNewEdgeIndex = torch.add(HeddenData.edge_index, num_dejan_nodes)
# McCormick nodes come after the Dejan and Hedden nodes.
McCormickDataNewEdgeIndex = torch.add(McCormickData.edge_index, num_dejan_nodes + num_hedden_nodes)

# combine edge lists (edges are stored column-wise, so concatenate along dim 1)
CombinedDataEdgeInd = torch.cat((DejanData.edge_index, HeddenDataNewEdgeIndex, McCormickDataNewEdgeIndex), 1)

# combine x, in the same Dejan -> Hedden -> McCormick order as the offsets
CombinedX = torch.cat((DejanData.x, HeddenData.x, McCormickData.x), 0)

# combine y, in the same order
CombinedY = torch.cat((DejanData.y, HeddenData.y, McCormickData.y), 0)

# make new PyTorch data object and draw a fresh train/val/test split for it
CombinedData = Data(x=CombinedX, edge_index=CombinedDataEdgeInd, y=CombinedY)
mask_data(CombinedData, 0.6, 0.2)
print(CombinedData)

# run GCN with combined data
Run_GCN(CombinedData)