In [None]:
# Mount Google Drive at /content/drive; the data paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import Libraries

In [None]:
import pickle
import random as rd
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.io import loadmat
import copy as cp
from sklearn.metrics import f1_score, accuracy_score, recall_score, roc_auc_score, average_precision_score
from collections import defaultdict
import warnings
import torch
from torch.utils.data import DataLoader, TensorDataset
# Silence DeprecationWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 1. Load Data

In [None]:
def load_data(data):
	"""
	Load the graph, features and labels of a dataset from Google Drive.

	:param data: dataset name; only 'YelpChi' is supported
	:returns: ([homo, relation1, relation2, relation3], feat_data, labels) where the
		first item holds the adjacency lists unpickled from the homo/rur/rtr/rsr files,
		feat_data is the dense feature array and labels is the flattened label array
	:raises ValueError: if ``data`` is not a supported dataset name
	"""
	prefix = '/content/drive/MyDrive/DOAN/'
	if data != 'YelpChi':
		# previously an unknown name fell through and failed later on an unbound variable
		raise ValueError(f"Unsupported dataset {data!r}; only 'YelpChi' is available")

	data_file = loadmat(prefix + 'YelpChi.mat')

	adj_lists = []
	for relation in ('homo', 'rur', 'rtr', 'rsr'):
		# NOTE: pickle.load can execute arbitrary code; only load trusted files.
		# The with-block closes the file, no explicit close() is needed.
		with open(prefix + f'yelp_{relation}_adjlists.pickle', 'rb') as file:
			adj_lists.append(pickle.load(file))

	labels = data_file['label'].flatten()
	feat_data = data_file['features'].toarray()
	return adj_lists, feat_data, labels


## 2. Processing Function

In [None]:
def normalize(mx):
	"""
	Row-normalize a matrix: every row is multiplied by 1 / (row sum + 0.01).

	:param mx: matrix exposing ``sum(1)`` (scipy sparse matrix or numpy array)
	:returns: the row-scaled matrix
	"""
	# the 0.01 offset is added to every row sum before taking the reciprocal
	row_totals = np.array(mx.sum(1)) + 0.01
	inv_totals = np.power(row_totals, -1).flatten()
	# any infinite reciprocal is replaced by zero
	inv_totals[np.isinf(inv_totals)] = 0.
	return sp.diags(inv_totals).dot(mx)

In [None]:
def sparse_to_adjlist(sp_matrix, filename):
	"""
	Convert a sparse adjacency matrix into an adjacency list and pickle it.

	A self loop is added for every node and every edge is stored in both directions.

	:param sp_matrix: square scipy sparse adjacency matrix
	:param filename: path of the pickle file to write
	"""
	with_self_loops = sp_matrix + sp.eye(sp_matrix.shape[0])
	sources, targets = with_self_loops.nonzero()

	neighbours = defaultdict(set)
	for src, dst in zip(sources, targets):
		neighbours[src].add(dst)
		neighbours[dst].add(src)

	with open(filename, 'wb') as handle:
		pickle.dump(neighbours, handle)

In [None]:
def pos_neg_split(nodes, labels):
	"""
	Split nodes into positive (label == 1) and negative (everything else) lists.

	Runs in a single pass instead of calling ``list.remove`` once per positive
	node, which was quadratic in the number of nodes. The input list is not modified.
	Nodes are split by position, so node ids are expected to be unique.

	:param nodes: list of node ids
	:param labels: labels aligned with ``nodes`` (labels[i] belongs to nodes[i])
	:returns: (pos_nodes, neg_nodes), both in the original node order
	"""
	pos_positions = {idx for idx, label in enumerate(labels) if label == 1}
	pos_nodes = [nodes[idx] for idx in sorted(pos_positions)]
	neg_nodes = [node for idx, node in enumerate(nodes) if idx not in pos_positions]

	return pos_nodes, neg_nodes

In [None]:
def undersample(pos_nodes, neg_nodes, scale=1):
	"""
	Build a training set from all positive nodes plus a random sample of negatives.

	:param pos_nodes: list of positive node ids (all are kept)
	:param neg_nodes: list of negative node ids to sample from (left untouched)
	:param scale: number of sampled negatives per positive node
	:returns: pos_nodes followed by int(len(pos_nodes) * scale) sampled negatives
	"""
	num_negatives = int(len(pos_nodes) * scale)
	neg_pool = cp.deepcopy(neg_nodes)
	return pos_nodes + rd.sample(neg_pool, k=num_negatives)

## 3. CARE GNN model

In [None]:
import torch
import torch.nn as nn
from torch.nn import init


class OneLayerCARE(nn.Module):
	"""
	One-layer CARE-GNN classifier: a linear layer on top of the embeddings
	produced by the inter-relation aggregator ``inter1``.
	"""

	def __init__(self, num_classes, inter1, lambda_1):
		"""
		:param num_classes: number of output classes
		:param inter1: inter-relation aggregator; must expose ``embed_dim``
		:param lambda_1: weight of the label-predictor loss in the total loss
		"""
		super().__init__()
		self.inter1 = inter1
		self.lambda_1 = lambda_1
		self.xent = nn.CrossEntropyLoss()

		# classification weights, shape (embed_dim, num_classes)
		self.weight = nn.Parameter(torch.FloatTensor(inter1.embed_dim, num_classes))
		init.xavier_uniform_(self.weight)

	def forward(self, nodes, labels, train_flag=True):
		"""Return (class scores of the GNN, scores of the label predictor)."""
		embeddings, label_scores = self.inter1(nodes, labels, train_flag)
		return torch.mm(embeddings, self.weight), label_scores

	def to_prob(self, nodes, labels, train_flag=True):
		"""Return the softmax probabilities of both score matrices."""
		gnn_scores, label_scores = self.forward(nodes, labels, train_flag)
		return torch.softmax(gnn_scores, dim=1), torch.softmax(label_scores, dim=1)

	def loss(self, nodes, labels, train_flag=True):
		"""Return GNN cross-entropy plus lambda_1 times label-predictor cross-entropy."""
		gnn_scores, label_scores = self.forward(nodes, labels, train_flag)
		target = labels.squeeze()
		gnn_loss = self.xent(gnn_scores, target)
		label_loss = self.xent(label_scores, target)
		return gnn_loss + self.lambda_1 * label_loss

In [None]:
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
from torch.autograd import Variable


from operator import itemgetter
import math

class InterAgg(nn.Module):
	"""
	Inter-relation aggregator of CARE-GNN.

	For a batch of nodes it scores all involved nodes with a linear label
	predictor, lets the three intra-relation aggregators filter and aggregate
	the neighbors of each relation, and combines the per-relation embeddings
	with the selected inter-relation aggregator. During training the filtering
	thresholds are updated by ``RLModule``.
	"""

	def __init__(self, features, feature_dim,
				 embed_dim, adj_lists, intraggs,
				 inter='GNN', step_size=0.02, cuda=True):
		"""
		:param features: callable mapping a LongTensor of node ids to feature rows
		:param feature_dim: dimension of the input features
		:param embed_dim: dimension of the output embeddings
		:param adj_lists: list with one adjacency list per relation (three are used)
		:param intraggs: list with the three intra-relation aggregators
		:param inter: inter-relation aggregator type: 'Att', 'Weight', 'Mean' or 'GNN'
		:param step_size: step used by the RL module to move the thresholds
		:param cuda: whether index tensors are created on the GPU
		"""
		super(InterAgg, self).__init__()

		self.features = features
		self.dropout = 0.6
		self.adj_lists = adj_lists
		self.intra_agg1 = intraggs[0]
		self.intra_agg2 = intraggs[1]
		self.intra_agg3 = intraggs[2]
		self.embed_dim = embed_dim
		self.feat_dim = feature_dim
		self.inter = inter
		self.step_size = step_size
		# NOTE: this attribute shadows nn.Module.cuda(); kept because the rest of the
		# notebook reads and writes ``.cuda`` as a boolean flag
		self.cuda = cuda
		self.intra_agg1.cuda = cuda
		self.intra_agg2.cuda = cuda
		self.intra_agg3.cuda = cuda

		# RL condition flag
		self.RL = True

		# number of batches for current epoch, assigned during training
		self.batch_num = 0

		# initial filtering thresholds
		self.thresholds = [0.5, 0.5, 0.5]

		# the activation function used by attention mechanism
		self.leakyrelu = nn.LeakyReLU(0.2)

		# parameter used to transform node embeddings before inter-relation aggregation
		self.weight = nn.Parameter(torch.FloatTensor(self.feat_dim, self.embed_dim))
		init.xavier_uniform_(self.weight)

		# weight parameter for each relation used by CARE-Weight
		self.alpha = nn.Parameter(torch.FloatTensor(self.embed_dim, 3))
		init.xavier_uniform_(self.alpha)

		# parameters used by attention layer
		self.a = nn.Parameter(torch.FloatTensor(2 * self.embed_dim, 1))
		init.xavier_uniform_(self.a)

		# label predictor for similarity measure
		self.label_clf = nn.Linear(self.feat_dim, 2)

		# initialize the parameter logs
		self.weights_log = []
		self.thresholds_log = [self.thresholds]
		self.relation_score_log = []

	def forward(self, nodes, labels, train_flag=True):
		"""
		:param nodes: list of node ids in the batch
		:param labels: labels of the batch nodes (only used by the RL module)
		:param train_flag: True during training; enables the RL threshold update
		:returns: (combined embeddings of the batch nodes, their label-aware scores)
		:raises ValueError: if ``self.inter`` is not a known aggregator type
		"""

		# extract 1-hop neighbor ids from adj lists of each single-relation graph
		to_neighs = []
		for adj_list in self.adj_lists:
			to_neighs.append([set(adj_list[int(node)]) for node in nodes])

		# find unique nodes and their neighbors used in current batch
		unique_nodes = set.union(set.union(*to_neighs[0]), set.union(*to_neighs[1]),
								 set.union(*to_neighs[2], set(nodes)))
		unique_list = list(unique_nodes)

		# calculate label-aware scores
		if self.cuda:
			batch_features = self.features(torch.cuda.LongTensor(unique_list))
		else:
			batch_features = self.features(torch.LongTensor(unique_list))
		batch_scores = self.label_clf(batch_features)
		id_mapping = {node_id: index for index, node_id in enumerate(unique_list)}

		def rows_of(node_ids):
			# Index with an explicit list: itemgetter(*ids) returns a bare scalar for a
			# single id, which silently dropped the batch dimension for one-node batches.
			return batch_scores[[id_mapping[node_id] for node_id in node_ids], :]

		# the label-aware scores for current batch of nodes
		center_scores = rows_of(nodes)

		# get neighbor node id list for each batch node and relation
		r1_list = [list(to_neigh) for to_neigh in to_neighs[0]]
		r2_list = [list(to_neigh) for to_neigh in to_neighs[1]]
		r3_list = [list(to_neigh) for to_neigh in to_neighs[2]]

		# assign label-aware scores to neighbor nodes for each batch node and relation
		r1_scores = [rows_of(to_neigh).view(-1, 2) for to_neigh in r1_list]
		r2_scores = [rows_of(to_neigh).view(-1, 2) for to_neigh in r2_list]
		r3_scores = [rows_of(to_neigh).view(-1, 2) for to_neigh in r3_list]

		# count the number of neighbors kept for aggregation for each batch node and relation
		r1_sample_num_list = [math.ceil(len(neighs) * self.thresholds[0]) for neighs in r1_list]
		r2_sample_num_list = [math.ceil(len(neighs) * self.thresholds[1]) for neighs in r2_list]
		r3_sample_num_list = [math.ceil(len(neighs) * self.thresholds[2]) for neighs in r3_list]

		# intra-aggregation steps for each relation
		# Eq. (8) in the paper
		r1_feats, r1_scores = self.intra_agg1.forward(nodes, r1_list, center_scores, r1_scores, r1_sample_num_list)
		r2_feats, r2_scores = self.intra_agg2.forward(nodes, r2_list, center_scores, r2_scores, r2_sample_num_list)
		r3_feats, r3_scores = self.intra_agg3.forward(nodes, r3_list, center_scores, r3_scores, r3_sample_num_list)

		# concat the intra-aggregated embeddings from each relation
		neigh_feats = torch.cat((r1_feats, r2_feats, r3_feats), dim=0)

		# get features or embeddings for batch nodes
		if self.cuda and isinstance(nodes, list):
			index = torch.LongTensor(nodes).cuda()
		else:
			index = torch.LongTensor(nodes)
		self_feats = self.features(index)

		# number of nodes in a batch
		n = len(nodes)

		# inter-relation aggregation steps
		# Eq. (9) in the paper
		if self.inter == 'Att':
			# 1) CARE-Att Inter-relation Aggregator (the attention weights are not used here)
			combined, _attention = att_inter_agg(len(self.adj_lists), self.leakyrelu, self_feats, neigh_feats, self.embed_dim,
												 self.weight, self.a, n, self.dropout, self.training, self.cuda)
		elif self.inter == 'Weight':
			# 2) CARE-Weight Inter-relation Aggregator
			combined = weight_inter_agg(len(self.adj_lists), self_feats, neigh_feats, self.embed_dim, self.weight, self.alpha, n, self.cuda)
			gem_weights = F.softmax(torch.sum(self.alpha, dim=0), dim=0).tolist()
			if train_flag:
				print(f'Weights: {gem_weights}')
		elif self.inter == 'Mean':
			# 3) CARE-Mean Inter-relation Aggregator
			combined = mean_inter_agg(len(self.adj_lists), self_feats, neigh_feats, self.embed_dim, self.weight, n, self.cuda)
		elif self.inter == 'GNN':
			# 4) CARE-GNN Inter-relation Aggregator
			combined = threshold_inter_agg(len(self.adj_lists), self_feats, neigh_feats, self.embed_dim, self.weight, self.thresholds, n, self.cuda)
		else:
			# previously an unknown type failed later with an unbound ``combined``
			raise ValueError(f"Unknown inter-relation aggregator {self.inter!r}; "
							 "expected 'Att', 'Weight', 'Mean' or 'GNN'")

		# the reinforcement learning module
		if self.RL and train_flag:
			relation_scores, rewards, thresholds, stop_flag = RLModule([r1_scores, r2_scores, r3_scores],
																	   self.relation_score_log, labels, self.thresholds,
																	   self.batch_num, self.step_size)
			self.thresholds = thresholds
			self.RL = stop_flag
			self.relation_score_log.append(relation_scores)
			self.thresholds_log.append(self.thresholds)

		return combined, center_scores


class IntraAgg(nn.Module):
	"""
	Intra-relation aggregator: filters the neighbors of each batch node under one
	relation and mean-aggregates the features of the neighbors that are kept.
	"""

	def __init__(self, features, feat_dim, cuda=False):
		"""
		:param features: callable mapping a LongTensor of node ids to feature rows
		:param feat_dim: dimension of the input features
		:param cuda: whether tensors are moved to the GPU
		"""
		super().__init__()

		self.features = features
		self.feat_dim = feat_dim
		self.cuda = cuda

	def forward(self, nodes, to_neighs_list, batch_scores, neigh_scores, sample_list):
		"""
		:param nodes: list of batch node ids
		:param to_neighs_list: neighbor id list of every batch node
		:param batch_scores: label-aware scores of the batch nodes
		:param neigh_scores: label-aware scores of the neighbors of every batch node
		:param sample_list: number of neighbors to keep for every batch node
		:returns: (aggregated neighbor features, score distances of the kept neighbors)
		"""

		# filter neighbors under given relation
		kept_neighs, kept_scores = filter_neighs_ada_threshold(batch_scores, neigh_scores, to_neighs_list, sample_list)

		# map every node that survived the filtering to a column of the mask
		node_ids = list(set.union(*kept_neighs))
		column_of = {node: col for col, node in enumerate(node_ids)}

		# (row, column) pairs of the batch-node / kept-neighbor incidence matrix
		rows, cols = [], []
		for row, neighs in enumerate(kept_neighs):
			for node in neighs:
				rows.append(row)
				cols.append(column_of[node])

		mask = Variable(torch.zeros(len(kept_neighs), len(column_of)))
		mask[rows, cols] = 1
		ids = torch.LongTensor(node_ids)
		if self.cuda:
			mask = mask.cuda()
			ids = ids.cuda()

		# divide every row by its neighbor count so that mask.mm() takes the mean
		mask = mask.div(mask.sum(1, keepdim=True))
		to_feats = F.relu(mask.mm(self.features(ids)))
		return to_feats, kept_scores


def RLModule(scores, scores_log, labels, thresholds, batch_num, step_size):
	"""
	Reinforcement-learning module that adapts the neighbor filtering thresholds.

	:param scores: per relation, the list of neighbor distances of every batch node
	:param scores_log: relation scores of all previous batches (one list per batch)
	:param labels: tensor with the labels of the batch nodes
	:param thresholds: current filtering threshold of every relation
	:param batch_num: number of batches per epoch
	:param step_size: amount added to / subtracted from a threshold per update
	:returns: (relation_scores, rewards, new_thresholds, stop_flag)

	Changes compared to the previous version:
	* a batch without positive nodes no longer crashes; its relation scores are 0.0
	  (note that such a batch lowers the epoch average it contributes to)
	* ``batch_num <= 0`` skips the threshold update instead of dividing by zero
	* a threshold that reaches exactly 0 is clamped to 0.001 as well, because a zero
	  threshold keeps no neighbor at all
	"""

	relation_scores = []
	stop_flag = True

	# only compute the average neighbor distances for positive nodes
	pos_index = [i[0] for i in (labels == 1).nonzero().tolist()]

	# compute average neighbor distances for each relation
	for score in scores:
		distance_sum = 0.0
		neigh_count = 0
		for idx in pos_index:
			node_scores = score[idx]
			if isinstance(node_scores, float):
				# a node whose distances were given as a single float
				distance_sum += node_scores
				neigh_count += 1
			else:
				distance_sum += sum(node_scores)
				neigh_count += len(node_scores)
		relation_scores.append(distance_sum / neigh_count if neigh_count else 0.0)

	if batch_num <= 0 or len(scores_log) % batch_num != 0 or len(scores_log) < 2 * batch_num:
		# do not call RL module within the epoch or within the first two epochs
		rewards = [0, 0, 0]
		new_thresholds = thresholds
	else:
		# update thresholds according to average scores in last epoch
		# Eq.(5) in the paper
		previous_epoch_scores = [sum(s) / batch_num for s in zip(*scores_log[-2 * batch_num:-batch_num])]
		current_epoch_scores = [sum(s) / batch_num for s in zip(*scores_log[-batch_num:])]

		# compute reward for each relation and update the thresholds according to reward
		# Eq. (6) in the paper
		rewards = [1 if previous_epoch_scores[i] - s >= 0 else -1 for i, s in enumerate(current_epoch_scores)]
		new_thresholds = [thresholds[i] + step_size if r == 1 else thresholds[i] - step_size for i, r in enumerate(rewards)]

		# keep the thresholds inside (0, 1]
		new_thresholds = [0.999 if i > 1 else i for i in new_thresholds]
		new_thresholds = [0.001 if i <= 0 else i for i in new_thresholds]

		print(f'epoch scores: {current_epoch_scores}')
		print(f'rewards: {rewards}')
		print(f'thresholds: {new_thresholds}')

	# TODO: add terminal condition

	return relation_scores, rewards, new_thresholds, stop_flag


def filter_neighs_ada_threshold(center_scores, neigh_scores, neighs_list, sample_list):
	"""
	Keep, for every batch node, the neighbors whose label-aware score is closest
	to the score of the node itself.

	:param center_scores: label-aware scores of the batch nodes (first column is used)
	:param neigh_scores: per batch node, the label-aware scores of its neighbors
	:param neighs_list: per batch node, the list of its neighbor ids
	:param sample_list: per batch node, the number of neighbors to keep
	:returns: (list of kept neighbor id sets, list of the matching distance lists)
	"""

	samp_neighs, samp_scores = [], []
	for idx in range(len(center_scores)):
		candidates = neighs_list[idx]
		keep = sample_list[idx]

		# first score column of the neighbors, and the center score tiled to match it
		neigh_col = neigh_scores[idx][:, 0].view(-1, 1)
		center_col = center_scores[idx][0].repeat(neigh_col.size()[0], 1)

		# L1-distance between the batch node and each of its neighbors
		# Eq. (2) in paper
		distances = torch.abs(center_col - neigh_col).squeeze()

		# top-p sampling according to distance ranking and thresholds
		# Section 3.3.1 in paper
		if len(neigh_scores[idx]) > keep + 1:
			ranked, order = torch.sort(distances, dim=0, descending=False)
			chosen = [candidates[pos] for pos in order.tolist()[:keep]]
			chosen_scores = ranked.tolist()[:keep]
		else:
			# too few neighbors to filter: keep all of them in their original order
			chosen = candidates
			chosen_scores = distances.tolist()
			if isinstance(chosen_scores, float):
				# a single neighbor yields a bare float after squeeze()
				chosen_scores = [chosen_scores]

		samp_neighs.append(set(chosen))
		samp_scores.append(chosen_scores)

	return samp_neighs, samp_scores


def mean_inter_agg(num_relations, self_feats, neigh_feats, embed_dim, weight, n, cuda):
	"""
	Mean inter-relation aggregator.

	:param num_relations: number of relations
	:param self_feats: features of the batch nodes, one row per node
	:param neigh_feats: intra-aggregated neighbor features, the blocks of ``n`` rows
		of the relations stacked on top of each other
	:param embed_dim: output embedding dimension
	:param weight: transformation matrix applied to both feature matrices
	:param n: number of nodes in the batch
	:param cuda: whether the accumulator is created on the GPU
	:returns: ReLU of the mean of the center embedding and the relation embeddings
	"""

	# transform batch node embedding and neighbor embedding in each relation with weight parameter
	center_h = torch.mm(self_feats, weight)
	neigh_h = torch.mm(neigh_feats, weight)

	# initialize the final neighbor embedding
	aggregated = torch.zeros(size=(n, embed_dim))
	if cuda:
		aggregated = aggregated.cuda()

	# sum neighbor embeddings together
	for r in range(num_relations):
		aggregated += neigh_h[r * n:(r + 1) * n, :]

	# Average over the center embedding plus one embedding per relation. This was a
	# hard-coded 4.0, which is only a mean for exactly three relations.
	combined = F.relu((center_h + aggregated) / (num_relations + 1.0))

	return combined


def weight_inter_agg(num_relations, self_feats, neigh_feats, embed_dim, weight, alpha, n, cuda):
	"""
	Weighted inter-relation aggregator.

	:param num_relations: number of relations
	:param self_feats: features of the batch nodes, one row per node
	:param neigh_feats: intra-aggregated neighbor features, the blocks of ``n`` rows
		of the relations stacked on top of each other
	:param embed_dim: output embedding dimension
	:param weight: transformation matrix applied to both feature matrices
	:param alpha: relation weight parameter with one column per relation
	:param n: number of nodes in the batch
	:param cuda: whether the accumulator is created on the GPU
	:returns: ReLU of the center embedding plus the weighted relation embeddings
	"""

	# transform batch node embedding and neighbor embedding in each relation with weight parameter
	center_h = torch.mm(self_feats, weight)
	neigh_h = torch.mm(neigh_feats, weight)

	# compute relation weights using softmax
	w = F.softmax(alpha, dim=1)

	# initialize the final neighbor embedding
	aggregated = torch.zeros(size=(n, embed_dim))
	if cuda:
		aggregated = aggregated.cuda()

	# add weighted neighbor embeddings in each relation together
	for r in range(num_relations):
		aggregated += neigh_h[r * n:(r + 1) * n, :] * w[:, r]

	# sum aggregated neighbor embedding and batch node embedding, then apply the
	# activation (torch.nn has no ``relu`` function; the functional API is used)
	combined = F.relu(center_h + aggregated)

	return combined


def att_inter_agg(num_relations, att_layer, self_feats, neigh_feats, embed_dim, weight, a, n, dropout, training, cuda):
	"""
	Attention inter-relation aggregator.

	:param num_relations: number of relations
	:param att_layer: activation applied to the raw attention scores
	:param self_feats: features of the batch nodes, one row per node
	:param neigh_feats: intra-aggregated neighbor features, the blocks of ``n`` rows
		of the relations stacked on top of each other
	:param embed_dim: output embedding dimension
	:param weight: transformation matrix applied to both feature matrices
	:param a: attention parameter with 2 * embed_dim rows and one column
	:param n: number of nodes in the batch
	:param dropout: dropout probability applied to the attention weights
	:param training: whether dropout is active
	:param cuda: whether the accumulator is created on the GPU
	:returns: (combined embeddings, attention weight of every relation)
	"""

	# transform batch node embedding and neighbor embedding in each relation with weight parameter
	center_h = torch.mm(self_feats, weight)
	neigh_h = torch.mm(neigh_feats, weight)

	# compute attention weights; the center embedding is tiled once per relation
	# (the leftover pdb.set_trace() breakpoint was removed, it blocked every call)
	combined = torch.cat((center_h.repeat(num_relations, 1), neigh_h), dim=1)
	e = att_layer(combined.mm(a))
	attention = torch.cat([e[r * n:(r + 1) * n, :] for r in range(num_relations)], dim=1)
	ori_attention = F.softmax(attention, dim=1)
	attention = F.dropout(ori_attention, dropout, training=training)

	# initialize the final neighbor embedding
	aggregated = torch.zeros(size=(n, embed_dim))
	if cuda:
		aggregated = aggregated.cuda()

	# add neighbor embeddings in each relation together with attention weights
	for r in range(num_relations):
		aggregated += torch.mul(attention[:, r].unsqueeze(1).repeat(1, embed_dim), neigh_h[r * n:(r + 1) * n, :])

	# sum aggregated neighbor embedding and batch node embedding, then apply the
	# activation (torch.nn has no ``relu`` function; the functional API is used)
	combined = F.relu(center_h + aggregated)

	# extract the attention weights
	att = F.softmax(torch.sum(ori_attention, dim=0), dim=0)

	return combined, att


def threshold_inter_agg(num_relations, self_feats, neigh_feats, embed_dim, weight, threshold, n, cuda):
	"""
	CARE-GNN inter-relation aggregator: the relation embeddings are weighted by
	the filtering thresholds of their relations.

	:param num_relations: number of relations
	:param self_feats: features of the batch nodes, one row per node
	:param neigh_feats: intra-aggregated neighbor features, the blocks of ``n`` rows
		of the relations stacked on top of each other
	:param embed_dim: output embedding dimension
	:param weight: transformation matrix applied to both feature matrices
	:param threshold: filtering threshold of every relation
	:param n: number of nodes in the batch
	:param cuda: whether the accumulator is created on the GPU
	:returns: ReLU of the center embedding plus the threshold-weighted embeddings
	"""

	# project the batch nodes and the per-relation neighbor embeddings
	projected_self = self_feats.mm(weight)
	projected_neigh = neigh_feats.mm(weight)

	# accumulator for the weighted neighbor embeddings
	aggregated = torch.zeros(size=(n, embed_dim)).cuda() if cuda else torch.zeros(size=(n, embed_dim))

	# walk over the stacked relation blocks, n rows at a time
	start = 0
	for r in range(num_relations):
		aggregated += projected_neigh[start:start + n, :] * threshold[r]
		start += n

	return F.relu(projected_self + aggregated)

In [None]:
from scipy.io import loadmat
import numpy as np
import scipy.sparse as sp

# Simi_comp

def normalize(mx):
    """Row-normalize a matrix: scale each row by 1 / (row sum + 0.01)."""
    # reciprocal of the offset row sums, flattened to one value per row
    inv_rowsum = np.power(np.array(mx.sum(1)) + 0.01, -1).flatten()
    inv_rowsum[np.isinf(inv_rowsum)] = 0.
    scaling = sp.diags(inv_rowsum)
    return scaling.dot(mx)

# load data
data_name = '/content/drive/MyDrive/DOAN/YelpChi.mat'  # 'Amazon.mat' or 'YelpChi.mat'
mode = 'pos'  # if set to pos, it only compute two metrics for positive nodes

data = loadmat(data_name)

# edge lists (row indices, column indices) of the three relations and the homogeneous graph
net_list = [data['net_rur'].nonzero(), data['net_rtr'].nonzero(),
            data['net_rsr'].nonzero(), data['homo'].nonzero()]

feature = normalize(data['features']).toarray()
label = data['label'][0]

# extract the edges of positive nodes in each relation graph
pos_nodes = set(label.nonzero()[0].tolist())
node_list = [set(net[0].tolist()) for net in net_list]
pos_node_list = [list(net_nodes.intersection(pos_nodes)) for net_nodes in node_list]
pos_idx_list = []
for net, pos_node in zip(net_list, pos_node_list):
    # np.isin replaces the deprecated np.in1d; the edge arrays are 1-D so the result is the same
    pos_idx_list.append(np.isin(net[0], np.array(pos_node)).nonzero()[0])


feature_simi_list = []
label_simi_list = []
print('compute two metrics')
for net, pos_idx in zip(net_list, pos_idx_list):
    if mode == 'pos':  # only the edges that start at a positive node
        sources, targets = net[0][pos_idx], net[1][pos_idx]
    else:  # all edges
        sources, targets = net[0], net[1]

    feature_simi = 0
    label_simi = 0
    for u, v in zip(sources.tolist(), targets.tolist()):
        # feature similarity: exp(-||x_u - x_v||^2); label similarity: 1 if the labels agree
        feature_simi += np.exp(-1 * np.square(np.linalg.norm(feature[u] - feature[v])))
        label_simi += label[u] == label[v]

    # average over the edges; a relation without edges yields NaN instead of a ZeroDivisionError
    num_edges = sources.size
    feature_simi_list.append(feature_simi / num_edges if num_edges else float('nan'))
    label_simi_list.append(label_simi / num_edges if num_edges else float('nan'))

print(f'feature_simi: {feature_simi_list}')
print(f'label_simi: {label_simi_list}')

compute two metrics
feature_simi: [0.9905982527769122, 0.9879541091332517, 0.9878335774439747, 0.9878477617186091]
label_simi: [0.9089026915113871, 0.17636951567291514, 0.18574074326971865, 0.18376222591604388]


## 4. GraphSage model

In [None]:
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
from torch.autograd import Variable
import random

class GraphSage(nn.Module):
	"""
	Vanilla GraphSAGE Model
	Code partially from https://github.com/williamleif/graphsage-simple/
	"""

	def __init__(self, num_classes, enc):
		"""
		:param num_classes: number of output classes
		:param enc: encoder producing the node embeddings; must expose ``embed_dim``
		"""
		super().__init__()
		self.enc = enc
		self.xent = nn.CrossEntropyLoss()
		# classification weights, shape (num_classes, embed_dim)
		self.weight = nn.Parameter(torch.FloatTensor(num_classes, enc.embed_dim))
		init.xavier_uniform_(self.weight)

	def forward(self, nodes):
		"""Return the class scores of ``nodes``, one row per node."""
		# the encoder output is multiplied from the left, then transposed to node-major
		return torch.mm(self.weight, self.enc(nodes)).t()

	def to_prob(self, nodes):
		"""Return the element-wise sigmoid of the class scores."""
		return torch.sigmoid(self.forward(nodes))

	def loss(self, nodes, labels):
		"""Return the cross-entropy between the class scores and ``labels``."""
		return self.xent(self.forward(nodes), labels.squeeze())

class MeanAggregator(nn.Module):
	"""
	Aggregates the features of (a random sample of) the neighbors of every node
	by taking their mean.
	"""

	def __init__(self, features, gcn=False):
		"""
		:param features: callable mapping a LongTensor of node ids to feature rows
		:param gcn: if True, every node is added to its own neighbor set
		"""

		super(MeanAggregator, self).__init__()

		self.features = features
		self.gcn = gcn

	def forward(self, nodes, to_neighs, num_sample=10):
		"""
		:param nodes: list of batch node ids
		:param to_neighs: neighbor collection of every batch node
		:param num_sample: number of neighbors to sample per node; None keeps all
		:returns: one row of mean neighbor features per batch node
		"""
		if num_sample is not None:
			# random.sample() no longer accepts sets (TypeError since Python 3.11),
			# so the neighbors are turned into a list before sampling
			samp_neighs = [set(random.sample(list(to_neigh), num_sample))
						   if len(to_neigh) >= num_sample else set(to_neigh)
						   for to_neigh in to_neighs]
		else:
			samp_neighs = [set(to_neigh) for to_neigh in to_neighs]

		if self.gcn:
			samp_neighs = [samp_neigh.union(set([int(nodes[i])])) for i, samp_neigh in enumerate(samp_neighs)]

		# map every sampled node to a column of the mask
		unique_nodes_list = list(set.union(*samp_neighs))
		unique_nodes = {n: i for i, n in enumerate(unique_nodes_list)}

		mask = torch.zeros(len(samp_neighs), len(unique_nodes))
		column_indices = [unique_nodes[n] for samp_neigh in samp_neighs for n in samp_neigh]
		row_indices = [i for i in range(len(samp_neighs)) for _ in range(len(samp_neighs[i]))]
		mask[row_indices, column_indices] = 1
		# ``device`` is the notebook-level torch device
		mask = mask.to(device)

		# divide every row by its neighbor count so that mask.mm() takes the mean
		num_neigh = mask.sum(1, keepdim=True)
		mask = mask.div(num_neigh)
		embed_matrix = self.features(torch.LongTensor(unique_nodes_list).to(device))
		to_feats = mask.mm(embed_matrix)
		return to_feats


class Encoder(nn.Module):
	"""
	GraphSAGE encoder: combines the features of a node with the aggregated
	features of its neighbors and applies a linear layer followed by ReLU.
	"""

	def __init__(self, features, feature_dim, embed_dim, adj_lists, aggregator, num_sample=10,
				 base_model=None, gcn=False, cuda=False,
				 feature_transform=False):
		"""
		:param features: callable mapping a LongTensor of node ids to feature rows
		:param feature_dim: dimension of the input features
		:param embed_dim: dimension of the output embeddings
		:param adj_lists: adjacency list, indexed by node id
		:param aggregator: neighbor aggregator
		:param num_sample: number of neighbors sampled per node
		:param base_model: optional model that is only stored on the encoder
		:param gcn: if True only the aggregated features are used (no concatenation)
		:param cuda: whether index tensors are moved to the GPU
		:param feature_transform: unused
		"""
		super(Encoder, self).__init__()

		self.features = features
		self.feat_dim = feature_dim
		self.adj_lists = adj_lists
		self.aggregator = aggregator
		self.num_sample = num_sample
		if base_model is not None:
			self.base_model = base_model

		self.gcn = gcn
		self.embed_dim = embed_dim
		# NOTE: this attribute shadows nn.Module.cuda(); kept because the rest of the
		# notebook reads and writes ``.cuda`` as a boolean flag
		self.cuda = cuda
		self.aggregator.cuda = cuda
		self.weight = nn.Parameter(
			torch.FloatTensor(embed_dim, self.feat_dim if self.gcn else 2 * self.feat_dim))
		init.xavier_uniform_(self.weight)

	def forward(self, nodes):
		"""
		Generates embeddings for a batch of nodes.

		nodes     -- list of nodes
		returns   -- embeddings with one column per node
		"""
		neigh_feats = self.aggregator.forward(nodes, [self.adj_lists[int(node)] for node in nodes],
											  self.num_sample)

		if self.gcn:
			combined = neigh_feats
		else:
			# The index is only moved to the GPU when the encoder was configured for it;
			# the former unconditional .cuda() call crashed on CPU-only runtimes.
			index = torch.as_tensor(nodes, dtype=torch.long)
			if self.cuda:
				index = index.cuda()
			self_feats = self.features(index)
			combined = torch.cat((self_feats, neigh_feats), dim=1)

		combined = F.relu(self.weight.mm(combined.t()))
		return combined

## 5. Test Function

In [None]:
def test_care(test_cases, labels, model, batch_size):
	"""
	Evaluate a CARE-GNN model batch by batch and print its metrics.

	F1, accuracy and recall are computed per batch and averaged over the batches;
	AUC and average precision are computed once over all predictions.

	:param test_cases: list of test node ids
	:param labels: labels aligned with ``test_cases``
	:param model: model exposing ``to_prob(nodes, labels, train_flag)``
	:param batch_size: number of nodes per evaluation batch
	:returns: (GNN AUC, label-predictor AUC, mean GNN recall, mean label-predictor recall)
	"""

	# Ceiling division: int(len / batch_size) + 1 produced an empty trailing batch
	# whenever the number of test cases was a multiple of batch_size.
	test_batch_num = -(-len(test_cases) // batch_size)
	f1_gnn = 0.0
	acc_gnn = 0.0
	recall_gnn = 0.0
	f1_label1 = 0.0
	acc_label1 = 0.00
	recall_label1 = 0.0
	gnn_list = []
	label_list1 = []

	for iteration in range(test_batch_num):
		i_start = iteration * batch_size
		i_end = min((iteration + 1) * batch_size, len(test_cases))
		batch_nodes = test_cases[i_start:i_end]
		batch_label = labels[i_start:i_end]
		gnn_prob, label_prob1 = model.to_prob(batch_nodes, batch_label, train_flag=False)

		# convert once per batch instead of once per metric
		gnn_np = gnn_prob.data.cpu().numpy()
		label_np = label_prob1.data.cpu().numpy()
		gnn_pred = gnn_np.argmax(axis=1)
		label_pred = label_np.argmax(axis=1)

		f1_gnn += f1_score(batch_label, gnn_pred, average="macro")
		acc_gnn += accuracy_score(batch_label, gnn_pred)
		recall_gnn += recall_score(batch_label, gnn_pred, average="macro")

		f1_label1 += f1_score(batch_label, label_pred, average="macro")
		acc_label1 += accuracy_score(batch_label, label_pred)
		recall_label1 += recall_score(batch_label, label_pred, average="macro")

		# probability of the positive class, used for the ranking metrics
		gnn_list.extend(gnn_np[:, 1].tolist())
		label_list1.extend(label_np[:, 1].tolist())

	auc_gnn = roc_auc_score(labels, np.array(gnn_list))
	ap_gnn = average_precision_score(labels, np.array(gnn_list))
	auc_label1 = roc_auc_score(labels, np.array(label_list1))
	ap_label1 = average_precision_score(labels, np.array(label_list1))
	print(f"GNN F1: {f1_gnn / test_batch_num:.4f}")
	print(f"GNN Accuracy: {acc_gnn / test_batch_num:.4f}")
	print(f"GNN Recall: {recall_gnn / test_batch_num:.4f}")
	print(f"GNN auc: {auc_gnn:.4f}")
	print(f"GNN ap: {ap_gnn:.4f}")

	# The recalls are returned as per-batch means, matching the printed value; the
	# previous version returned the sum over all batches.
	return auc_gnn, auc_label1, recall_gnn / test_batch_num, recall_label1 / test_batch_num

In [None]:
def test_sage(test_cases, labels, model, batch_size):
	"""
	Evaluate a GraphSAGE model batch by batch and print its metrics.

	F1, accuracy and recall are computed per batch and averaged over the batches;
	AUC and average precision are computed once over all predictions.

	:param test_cases: list of test node ids
	:param labels: labels aligned with ``test_cases``
	:param model: model exposing ``to_prob(nodes)``
	:param batch_size: number of nodes per evaluation batch
	"""
	# Ceiling division: int(len / batch_size) + 1 produced an empty trailing batch
	# whenever the number of test cases was a multiple of batch_size.
	test_batch_num = -(-len(test_cases) // batch_size)
	f1_gnn = 0.0
	acc_gnn = 0.0
	recall_gnn = 0.0
	gnn_list = []
	for iteration in range(test_batch_num):
		i_start = iteration * batch_size
		i_end = min((iteration + 1) * batch_size, len(test_cases))
		batch_nodes = test_cases[i_start:i_end]
		batch_label = labels[i_start:i_end]

		# convert once per batch instead of once per metric
		gnn_np = model.to_prob(batch_nodes).data.cpu().numpy()
		gnn_pred = gnn_np.argmax(axis=1)

		f1_gnn += f1_score(batch_label, gnn_pred, average="macro")
		acc_gnn += accuracy_score(batch_label, gnn_pred)
		recall_gnn += recall_score(batch_label, gnn_pred, average="macro")
		# score of the positive class, used for the ranking metrics
		gnn_list.extend(gnn_np[:, 1].tolist())

	auc_gnn = roc_auc_score(labels, np.array(gnn_list))
	ap_gnn = average_precision_score(labels, np.array(gnn_list))
	print(f"GNN F1: {f1_gnn / test_batch_num:.4f}")
	print(f"GNN Accuracy: {acc_gnn / test_batch_num:.4f}")
	print(f"GNN Recall: {recall_gnn / test_batch_num:.4f}")
	print(f"GNN auc: {auc_gnn:.4f}")
	print(f"GNN ap: {ap_gnn:.4f}")

## 6. Train

## 6.1 Splitting Dataset

In [None]:
import time
import os
import random
from sklearn.model_selection import train_test_split


# Load the relation graphs, the node features and the labels of YelpChi
[homo, relation1, relation2, relation3], feat_data, labels = load_data("YelpChi")

# Stratified 80/20 train/test split over the node indices (fixed random_state)
index = list(range(len(labels)))
idx_train, idx_test, y_train, y_test = train_test_split(
	index, labels, stratify=labels, test_size=0.20, random_state=2, shuffle=True)

In [None]:
# Show the entry of node 0 in every adjacency list, plus its features and label.
print(f"homo: {homo[0]}")
print(f"relation 1: {relation1[0]}")
print(f"relation2: {relation2[0]}")
print(f"relation3: {relation3[0]}")
print(f"feat_data: {feat_data[0]}")
print(f"label : {labels[0]}")

homo: {0, 2, 6702}
relation 1: {0}
relation2: {0}
relation3: {0, 2, 6702}
feat_data: [0.02237555 0.07049484 0.42868165 0.99998516 0.99998516 0.39845686
 0.82359225 0.497025   0.96545738 0.15026337 0.99998516 0.58321834
 0.58391572 0.38148231 0.38164552 0.99997373 0.64309172 0.99997373
 0.80251163 0.78335917 0.75516906 0.77051205 0.9480598  0.86777185
 0.99502488 0.91044776 0.07960199 0.00995025 0.01492537 0.5920398
 0.13930348 0.49751244]
label : 0


## 6.2 GraphSAGE training



In [None]:
# initialize model input
# The normalized features go into their own variable: rebinding ``feat_data`` made
# every re-run of this cell normalize the already normalized features again.
feat_norm = normalize(feat_data)
features = nn.Embedding(feat_norm.shape[0], feat_norm.shape[1])
features.weight = nn.Parameter(torch.FloatTensor(feat_norm), requires_grad=False)
features.to(device)
adj_lists = homo

agg1 = MeanAggregator(features)

# use the GPU only when one is available instead of hard-coding cuda=True
enc1 = Encoder(features, feat_norm.shape[1], 64, adj_lists, agg1, gcn=True, cuda=torch.cuda.is_available())

# the vanilla GraphSAGE model as baseline
# (the Encoder reads ``num_sample``; the former ``num_samples`` assignment had no effect)
enc1.num_sample = 5
gnn_model = GraphSage(2, enc1)

In [None]:
# Split pos neg sets for under-sampling
train_pos, train_neg = pos_neg_split(idx_train, y_train)

# Initialize model input
# The normalized features go into their own variable so that re-running this cell
# does not normalize ``feat_data`` one more time.
feat_tensor = torch.FloatTensor(normalize(feat_data))
features = nn.Embedding(feat_tensor.shape[0], feat_tensor.shape[1])
features.weight = nn.Parameter(feat_tensor, requires_grad=False)
features.to(device)
# Set input graph
## Run Sage First
model = 'SAGE'
adj_lists = homo

cuda = torch.cuda.is_available()
agg1 = MeanAggregator(features)

enc1 = Encoder(features, feat_tensor.shape[1], 64, adj_lists, agg1, gcn=True, cuda=cuda)

# The vanilla GraphSAGE model as baseline
# (the Encoder reads ``num_sample``; the former ``num_samples`` assignment had no effect)
enc1.num_sample = 5
gnn_model = GraphSage(2, enc1)


gnn_model.to(device)

# only the trainable parameters are optimized (the feature embedding is frozen)
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, gnn_model.parameters()), lr=0.01, weight_decay=1e-3)

In [None]:
times = []
performance_log = []
epochs = 30
batch_size = 1024
for epoch in range(epochs):
    # Randomly under-sampling negative nodes for each epoch
    sampled_idx_train = undersample(train_pos, train_neg, scale=1)
    random.shuffle(sampled_idx_train)

    # Ceiling division, so that no empty trailing batch is produced
    num_batches = -(-len(sampled_idx_train) // batch_size)

    # Sum of the batch losses of this epoch. It is kept separate from ``loss``: the
    # old code reused one name, so the printed value was 2 * last loss / num_batches.
    epoch_loss = 0.0
    epoch_time = 0

    # Mini-batch training
    for batch in range(num_batches):
        start_time = time.time()
        i_start = batch * batch_size
        i_end = min((batch + 1) * batch_size, len(sampled_idx_train))

        batch_nodes = sampled_idx_train[i_start:i_end]
        batch_label = labels[np.array(batch_nodes)]

        optimizer.zero_grad()
        # build the label tensor on the active device (works with and without a GPU)
        loss = gnn_model.loss(batch_nodes, torch.as_tensor(batch_label, dtype=torch.long, device=device))
        loss.backward()
        optimizer.step()
        end_time = time.time()
        epoch_time += end_time - start_time
        epoch_loss += loss.item()

    print(f'Epoch: {epoch}, loss: {epoch_loss / num_batches}, time: {epoch_time}s')

  loss = gnn_model.loss(batch_nodes, torch.cuda.LongTensor(batch_label))


Epoch: 0, loss: 0.23152734835942587, time: 0.76222825050354s
Epoch: 1, loss: 0.23148196935653687, time: 0.21338462829589844s
Epoch: 2, loss: 0.23104302088419595, time: 0.2107713222503662s
Epoch: 3, loss: 0.23096140225728354, time: 0.22952032089233398s
Epoch: 4, loss: 0.23092949390411377, time: 0.22460293769836426s
Epoch: 5, loss: 0.23098824421564737, time: 0.3942244052886963s
Epoch: 6, loss: 0.2310264309247335, time: 0.35793375968933105s
Epoch: 7, loss: 0.23103690147399902, time: 0.3151240348815918s
Epoch: 8, loss: 0.23103843132654825, time: 0.32343196868896484s
Epoch: 9, loss: 0.23103390137354532, time: 0.39734387397766113s
Epoch: 10, loss: 0.23105684916178384, time: 0.34145450592041016s
Epoch: 11, loss: 0.23104788859685263, time: 0.3567466735839844s
Epoch: 12, loss: 0.23103861014048258, time: 0.4466519355773926s
Epoch: 13, loss: 0.2310813864072164, time: 0.33035945892333984s
Epoch: 14, loss: 0.23104757070541382, time: 0.39752960205078125s
Epoch: 15, loss: 0.23105047146479288, time: 0

In [None]:
# Persist the trained GraphSAGE weights (state dict only) to Google Drive.
torch.save(gnn_model.state_dict(), '/content/drive/MyDrive/DOAN/SAGE_model.pth')

In [None]:
# Restore the GraphSAGE weights; map_location lets a checkpoint that was saved on
# the GPU be loaded on a CPU-only runtime as well.
gnn_model.load_state_dict(torch.load('/content/drive/MyDrive/DOAN/SAGE_model.pth', map_location=device))

<All keys matched successfully>

In [None]:
# Switch the model to evaluation mode; the returned module is shown as the cell output.
gnn_model.eval()

GraphSage(
  (enc): Encoder(
    (features): Embedding(45954, 32)
    (aggregator): MeanAggregator(
      (features): Embedding(45954, 32)
    )
  )
  (xent): CrossEntropyLoss()
)

In [None]:
# Evaluate GraphSAGE on the held-out test split; the metrics are printed by test_sage.
batch_size = 1024
test_sage(idx_test, y_test, gnn_model, batch_size)

GNN F1: 0.4608
GNN Accuracy: 0.8547
GNN Recall: 0.5000
GNN auc: 0.5000
GNN ap: 0.1453


## 6.3 CARE-GNN training

In [None]:
# one adjacency list per relation
adj_lists = [relation1, relation2, relation3]
emb_size = 4        # embedding dimension
inter = 'GNN'       # inter-relation aggregator type
step_size = 2e-2    # RL threshold step
lambda_1 = 2        # weight of the label-predictor loss
lambda_2 = 1e-3     # weight decay
lr = 0.01           # learning rate

intra1 = IntraAgg(features, feat_data.shape[1], cuda=cuda)
intra2 = IntraAgg(features, feat_data.shape[1], cuda=cuda)
intra3 = IntraAgg(features, feat_data.shape[1], cuda=cuda)
inter1 = InterAgg(features, feat_data.shape[1], emb_size, adj_lists, [intra1, intra2, intra3], inter=inter, step_size = step_size, cuda=cuda)

gnn_care_model = OneLayerCARE(2, inter1, lambda_1)

gnn_care_model.to(device)
# The optimizer must own the parameters of the CARE model. It was built from
# gnn_model (the GraphSAGE baseline), so the CARE model was never updated.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, gnn_care_model.parameters()), lr=lr, weight_decay=lambda_2)
times = []
performance_log = []

In [None]:
num_epochs = 30
batch_size = 1024
for epoch in range(num_epochs):
	# randomly under-sampling negative nodes for each epoch
	sampled_idx_train = undersample(train_pos, train_neg, scale=1)
	rd.shuffle(sampled_idx_train)

	# send number of batches to model to let the RLModule know the training progress
	# (ceiling division, so that no empty trailing batch is produced)
	num_batches = -(-len(sampled_idx_train) // batch_size)
	inter1.batch_num = num_batches

	# Sum of the batch losses of this epoch. It is kept separate from ``loss``: the
	# old code reused one name, so the printed value was 2 * last loss / num_batches.
	epoch_loss = 0.0
	epoch_time = 0

	# mini-batch training
	for batch in range(num_batches):
		start_time = time.time()
		i_start = batch * batch_size
		i_end = min((batch + 1) * batch_size, len(sampled_idx_train))
		batch_nodes = sampled_idx_train[i_start:i_end]
		batch_label = labels[np.array(batch_nodes)]
		optimizer.zero_grad()
		# build the label tensor on the active device (works with and without a GPU)
		loss = gnn_care_model.loss(batch_nodes, torch.as_tensor(batch_label, dtype=torch.long, device=device))
		loss.backward()
		optimizer.step()
		end_time = time.time()
		epoch_time += end_time - start_time
		epoch_loss += loss.item()

	print(f'Epoch: {epoch}, loss: {epoch_loss / num_batches}, time: {epoch_time}s')


  loss = gnn_care_model.loss(batch_nodes, Variable(torch.cuda.LongTensor(batch_label)))


Epoch: 0, loss: 0.6957997481028239, time: 12.98915982246399s
Epoch: 1, loss: 0.696735143661499, time: 7.274843692779541s
epoch scores: [0.0012478944959243992, 0.0038967103581130625, 0.004135368966118057]
rewards: [-1, 1, 1]
thresholds: [0.48, 0.52, 0.52]
Epoch: 2, loss: 0.6932341257731119, time: 6.68493914604187s
epoch scores: [0.0013454673554313358, 0.004144397237324971, 0.004321600308726712]
rewards: [-1, -1, -1]
thresholds: [0.45999999999999996, 0.5, 0.5]
Epoch: 3, loss: 0.6954858303070068, time: 7.167391777038574s
epoch scores: [0.0012790461918918798, 0.0039909658546776715, 0.0041675485349907905]
rewards: [1, 1, 1]
thresholds: [0.48, 0.52, 0.52]
Epoch: 4, loss: 0.6953803698221842, time: 6.701725959777832s
epoch scores: [0.001286624546123115, 0.004069985749555022, 0.0041813805296561585]
rewards: [-1, -1, -1]
thresholds: [0.45999999999999996, 0.5, 0.5]
Epoch: 5, loss: 0.6942814191182455, time: 7.268238306045532s
epoch scores: [0.0013151007384623813, 0.004045392464784802, 0.0042347798

In [None]:
# Persist the CARE-GNN weights (state dict only) to Google Drive.
torch.save(gnn_care_model.state_dict(), '/content/drive/MyDrive/DOAN/CARE_model2.pth')

In [None]:
# Restore the CARE-GNN weights; map_location lets a checkpoint that was saved on
# the GPU be loaded on a CPU-only runtime as well.
gnn_care_model.load_state_dict(torch.load('/content/drive/MyDrive/DOAN/CARE_model2.pth', map_location=device))

<All keys matched successfully>

In [None]:
# Evaluate CARE-GNN on the held-out test split and keep the returned metrics.
performance_log = []
batch_size = 1024
gnn_auc, label_auc, gnn_recall, label_recall = test_care(idx_test, y_test, gnn_care_model, batch_size)
performance_log.append([gnn_auc, label_auc, gnn_recall, label_recall])

GNN F1: 0.5980
GNN Accuracy: 0.6942
GNN Recall: 0.7000
GNN auc: 0.7665
GNN ap: 0.3810


## 7 SPARK

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=a8a5e209f8eda1ac1f2e4b7e8c431099dafd66612d7f1992987f12ad8d5f9595
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Start (or reuse) a local Spark session; "local[*]" runs Spark in-process
# with one worker thread per logical core.
spark = (
    SparkSession.builder
    .appName("spark-dl-inference")
    .master("local[*]")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.python.worker.reuse", True)
    .getOrCreate()
)

# Read the settings back from the running session's own configuration.
# A freshly constructed SparkConf() is a separate object that is not tied to
# `spark`, so it cannot be relied on to report what the session actually uses
# (for example when getOrCreate() returned an already-running session).
conf = spark.sparkContext.getConf()

print("spark.executor.memory = ", conf.get("spark.executor.memory"))
print("spark.driver.memory = ", conf.get("spark.driver.memory"))

spark.executor.memory =  8g
spark.driver.memory =  8g


In [None]:
import numpy as np
import os
import pandas as pd

from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import udf
from pyspark.sql.functions import array, col, struct, pandas_udf
from pyspark.sql.types import ArrayType, FloatType, Union, Dict, IntegerType
from pyspark.sql.functions import col, floor, collect_list, monotonically_increasing_id
from pyspark.sql.functions import explode, posexplode, udf
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import torch.nn.functional as F

In [None]:
# Load the nodes to score from a CSV on the mounted Google Drive folder.
val_input = pd.read_csv('/content/drive/MyDrive/DOAN/test_a.csv')

In [None]:
# Preview the first rows of the loaded frame.
val_input.head()

Unnamed: 0,nodes,label
0,35042,0
1,27485,0
2,16164,0
3,21204,0
4,12201,0


In [None]:
%%time
# Build the Spark DataFrame from the pandas frame with the `label` column
# removed, so only the remaining columns become inference input.
df = spark.createDataFrame(val_input.drop(columns=['label']))

CPU times: user 330 ms, sys: 1.8 ms, total: 332 ms
Wall time: 361 ms


In [None]:
# Print the first rows of the Spark DataFrame (show() prints 20 by default).
df.show()

+-----+
|nodes|
+-----+
|35042|
|27485|
|16164|
|21204|
|12201|
|38716|
|18519|
| 8865|
| 3804|
| 7383|
|41976|
| 1413|
|16995|
|20697|
|42494|
|41349|
| 8917|
|29393|
|12982|
| 2639|
+-----+
only showing top 20 rows



In [None]:
from pyspark.sql.functions import row_number, sort_array

GROUP_SIZE = 5  # number of node ids packed into one row of `grouped_df`

# monotonically_increasing_id() is unique and increasing but NOT consecutive:
# the partition index is encoded in its upper bits, so dividing it by the group
# size only yields clean groups inside the first partition. Convert it to a
# dense 0-based position first and derive the group from that position.
# (A window without partitionBy pulls all rows into one partition, which is
# acceptable for the small id list handled here.)
row_order = Window.orderBy(monotonically_increasing_id())
df = (
    df.withColumn("_row_idx", row_number().over(row_order) - 1)
    .withColumn("group_id", floor(col("_row_idx") / GROUP_SIZE))
)

# collect_list() gives no ordering guarantee after the shuffle, so collect
# (position, node) pairs, sort them by position, and keep only the node ids.
grouped_df = (
    df.groupBy("group_id")
    .agg(sort_array(collect_list(struct("_row_idx", "nodes"))).alias("_ordered"))
    .select("group_id", col("_ordered.nodes").alias("data_grouped"))
    .orderBy("group_id")
)

# The helper column is internal to this cell; `df` keeps only nodes + group_id.
df = df.drop("_row_idx")
grouped_df.show(truncate=False)

+--------+-----------------------------------+
|group_id|data_grouped                       |
+--------+-----------------------------------+
|0       |[35042, 27485, 16164, 21204, 12201]|
|1       |[38716, 18519, 8865, 3804, 7383]   |
|2       |[41976, 1413, 16995, 20697, 42494] |
|3       |[41349, 8917, 29393, 12982, 2639]  |
|4       |[110, 24197, 23445, 9455, 45367]   |
|5       |[44530, 42078, 38127, 3992, 28694] |
|6       |[9735, 29484, 17637, 28373, 22435] |
|7       |[23158, 8108, 5255, 18665, 2760]   |
|8       |[35284, 760, 42136, 32438, 19168]  |
|9       |[45690, 24381, 2646, 16977, 19387] |
|10      |[34245, 26156, 39503, 22873, 26840]|
|11      |[16111, 15357, 35272, 36345, 44553]|
|12      |[15606, 29320, 29551, 34946, 41177]|
|13      |[16681, 10134, 12877, 22613, 42802]|
|14      |[6886, 25433, 24541, 36788, 1891]  |
|15      |[23262, 41729, 27314, 34693, 8374] |
|16      |[9162, 23462, 14412, 34839, 21668] |
|17      |[34966, 37425, 37571, 4551, 45439] |
|18      |[35

In [None]:
# Check the per-row group assignment on the first five rows.
df.show(5)

+-----+--------+
|nodes|group_id|
+-----+--------+
|35042|       0|
|27485|       0|
|16164|       0|
|21204|       0|
|12201|       0|
+-----+--------+
only showing top 5 rows



In [None]:
%%time
# Materialize the grouped node ids as Parquet, replacing any previous output.
# The path is relative, so where it lands depends on the working directory.
grouped_df.write.mode("overwrite").parquet("YelpChi_cut5")

CPU times: user 8.71 ms, sys: 1.98 ms, total: 10.7 ms
Wall time: 1.19 s


### CARE model

In [None]:
from pyspark.ml.functions import predict_batch_udf

def predict(inputs) -> list:
  """Classify one group of node ids with the CARE model.

  Args:
    inputs: sequence of node ids (one `data_grouped` array when called as a
      Spark UDF).

  Returns:
    A flat Python list with one predicted class index per row of the score
    matrix returned by `gnn_care_model.to_prob` (argmax over axis 1).
  """
  # to_prob requires a labels argument; zeros are passed as a placeholder.
  # Size it to the input so a group that is not exactly five nodes long (for
  # example the last group) gets a matching list.
  # NOTE(review): assumes the labels do not influence scoring when
  # train_flag=False — confirm against the model definition.
  placeholder_labels = [0] * len(inputs)
  output, _ = gnn_care_model.to_prob(inputs, placeholder_labels, train_flag=False)
  return output.data.cpu().numpy().argmax(axis=1).flatten().tolist()

In [None]:
# Wrap `predict` as a row-at-a-time Spark UDF whose result is declared as an
# array of integers (one predicted class per node id in the input array).
predict_udf = udf(predict, ArrayType(IntegerType()))

In [None]:
# Read the grouped node ids back from the same relative path they were written
# to ("YelpChi_cut5"); a hard-coded "/content/..." prefix only matches that
# write when the working directory happens to be /content.
df2 = spark.read.parquet("YelpChi_cut5")
# Attach the model output for every group. withColumn is lazy: the UDF runs
# when an action (show, write, ...) is executed on `predicted_df`.
predicted_df = df2.withColumn("predictions", predict_udf(df2["data_grouped"]))

In [None]:
# Inspect the first five groups as read back from Parquet.
df2.show(5)

+--------+--------------------+
|group_id|        data_grouped|
+--------+--------------------+
|       0|[35042, 27485, 16...|
|       1|[38716, 18519, 88...|
|       2|[41976, 1413, 169...|
|       3|[41349, 8917, 293...|
|       4|[110, 24197, 2344...|
+--------+--------------------+
only showing top 5 rows



In [None]:
# Inspect the first five groups together with their predictions.
predicted_df.show(5)

+--------+--------------------+---------------+
|group_id|        data_grouped|    predictions|
+--------+--------------------+---------------+
|       0|[35042, 27485, 16...|[1, 0, 1, 0, 1]|
|       1|[38716, 18519, 88...|[0, 1, 0, 1, 1]|
|       2|[41976, 1413, 169...|[1, 1, 0, 0, 1]|
|       3|[41349, 8917, 293...|[0, 0, 1, 1, 0]|
|       4|[110, 24197, 2344...|[0, 0, 0, 0, 0]|
+--------+--------------------+---------------+
only showing top 5 rows



In [None]:
%%time
# Write the groups with their predictions to Google Drive as Parquet,
# replacing any previous output at that path.
predicted_df.write.mode("overwrite").parquet("/content/drive/MyDrive/DOAN/predictions")

CPU times: user 203 ms, sys: 32.7 ms, total: 235 ms
Wall time: 47.9 s


In [None]:
# One output row per node id: explode the `data_grouped` array into `idx_node`
# (the original array columns are kept alongside it).
df_exploded = predicted_df.withColumn('idx_node', explode(predicted_df['data_grouped']))

In [None]:
# One output row per prediction: explode the `predictions` array into
# `prediction` (the original array columns are kept alongside it).
df_exploded1 = predicted_df.withColumn('prediction', explode(predicted_df['predictions']))

In [None]:
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

In [None]:
# Number the exploded node rows 1..N in the order Spark produces them.
# The window has no partitionBy, so all rows are ordered in a single partition.
windowSpec = Window.orderBy(monotonically_increasing_id())
df_exploded_idx = df_exploded.withColumn("row_number", row_number().over(windowSpec))
df_exploded_idx.show(5)

+--------+--------------------+---------------+--------+----------+
|group_id|        data_grouped|    predictions|idx_node|row_number|
+--------+--------------------+---------------+--------+----------+
|       0|[35042, 27485, 16...|[1, 0, 1, 0, 1]|   35042|         1|
|       0|[35042, 27485, 16...|[1, 0, 1, 0, 1]|   27485|         2|
|       0|[35042, 27485, 16...|[1, 0, 1, 0, 1]|   16164|         3|
|       0|[35042, 27485, 16...|[1, 0, 1, 0, 1]|   21204|         4|
|       0|[35042, 27485, 16...|[1, 0, 1, 0, 1]|   12201|         5|
+--------+--------------------+---------------+--------+----------+
only showing top 5 rows



In [None]:
# Number the exploded prediction rows with the same window specification
# (`windowSpec` is defined in the preceding cell).
df_exploded1_idx = df_exploded1.withColumn("row_number", row_number().over(windowSpec))
df_exploded1_idx.show(5)

+--------+--------------------+---------------+----------+----------+
|group_id|        data_grouped|    predictions|prediction|row_number|
+--------+--------------------+---------------+----------+----------+
|       0|[35042, 27485, 16...|[1, 0, 1, 0, 1]|         1|         1|
|       0|[35042, 27485, 16...|[1, 0, 1, 0, 1]|         0|         2|
|       0|[35042, 27485, 16...|[1, 0, 1, 0, 1]|         1|         3|
|       0|[35042, 27485, 16...|[1, 0, 1, 0, 1]|         0|         4|
|       0|[35042, 27485, 16...|[1, 0, 1, 0, 1]|         1|         5|
+--------+--------------------+---------------+----------+----------+
only showing top 5 rows



In [None]:
# Align each prediction with a node id by matching the two row numbers.
# NOTE(review): this is only correct if both exploded frames were evaluated in
# exactly the same row order; nothing in this join enforces that — verify.
joined_df = df_exploded1_idx.join(df_exploded_idx, on="row_number", how="inner")

In [None]:
from pyspark.sql.functions import arrays_zip

# Pair every node id with its own prediction inside the row both arrays came
# from: arrays_zip builds an array of (data_grouped, predictions) structs and
# explode emits one row per pair. Unlike joining two separately exploded
# frames on a synthetic row number, this does not depend on Spark evaluating
# both frames in the same order.
df_select = (
    predicted_df
    .select(explode(arrays_zip("data_grouped", "predictions")).alias("pair"))
    .select(
        col("pair.data_grouped").alias("idx_node"),
        col("pair.predictions").alias("prediction"),
    )
)
df_select.show(5)

+--------+----------+
|idx_node|prediction|
+--------+----------+
|   35042|         1|
|   27485|         0|
|   16164|         1|
|   21204|         0|
|   12201|         1|
+--------+----------+
only showing top 5 rows

