In [12]:
from torch_geometric.data import HeteroData
import torch
from TwibotSmallEdgeHetero import TwibotSmallEdgeHetero


# Hyperparameters, one per line so each can be tuned independently.
embedding_size = 128
dropout = 0.3
lr = 1e-3
weight_decay = 5e-3
# Forwarded to the TwibotSmallEdgeHetero constructor as `svdComponents`.
svdComponents = 100

In [13]:

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Construct the project dataset wrapper on that device.
homogeneous = TwibotSmallEdgeHetero(
    device=device,
    process=True,
    save=True,
    dev=False,
    svdComponents=svdComponents,
)

# A single dataloader() call returns every tensor used by the cells below.
(
    des_tensor,
    tweets_tensor,
    num_prop,
    category_prop,
    edge_index,
    edge_type,
    labels,
    train_idx,
    val_idx,
    test_idx,
) = homogeneous.dataloader()

Loading train.json
Loading test.json
Small dataset version, not loading support.json
Loading dev.json
Finished
Loading labels...   Finished
Loading user description embeddings
Finished
Running tweet embedding
Finished
Processing feature3...   Finished
Processing feature4...   Finished
Building graph   Finished


In [14]:
# Column positions of the edges whose relation code is 4 (the code used for the
# ('user', 'writes', 'tweet') relation in the HeteroData cell of this notebook).
# squeeze(-1) removes only the trailing size-1 axis that nonzero() adds, so the
# result stays 1-D even when exactly one edge matches; a bare squeeze() would
# collapse that case to a 0-d tensor and the column selection below would lose
# its edge dimension.
tweeterEdgeIndices = (edge_type == 4).nonzero().squeeze(-1)
print(tweeterEdgeIndices.shape)

# Select the matching columns of the 2 x E edge list.
tweeterEdges = edge_index[:, tweeterEdgeIndices]
print(tweeterEdges)
print(tweeterEdges.shape)

torch.Size([1999788])
tensor([[      0,       0,       0,  ...,   11825,   11825,   11825],
        [  11826,   11827,   11828,  ..., 2011611, 2011612, 2011613]])
torch.Size([2, 1999788])


In [15]:
## Initialise the HeteroData object for the heterogeneous user/tweet graph.

data = HeteroData()

# User nodes: the three feature groups are stored individually and also
# concatenated column-wise into the combined feature matrix `x`.
data['user'].des = des_tensor
data['user'].cat = category_prop
data['user'].num = num_prop
data['user'].x = torch.cat((des_tensor, category_prop, num_prop), dim=1)
data['user'].y = labels
data['user'].train_idx = train_idx
data['user'].val_idx = val_idx
data['user'].test_idx = test_idx

# Tweet nodes only carry their embedding matrix.
data['tweet'].x = tweets_tensor

# (source type, relation, destination type) paired with its code in `edge_type`.
relation_codes = (
    (('user', 'following', 'user'), 0),
    (('user', 'followedBy', 'user'), 1),
    (('tweet', 'mentions', 'user'), 2),
    (('tweet', 'retweets', 'user'), 3),
    (('user', 'writes', 'tweet'), 4),
)

# A boolean mask always yields a (2, n) slice, including n == 0 and n == 1.
# The previous `.nonzero().squeeze()` form collapsed to a 0-d index when exactly
# one edge matched, which dropped the edge dimension from edge_index.
# NOTE(review): the tweeterEdges printout in this notebook shows the 'writes'
# targets spanning 11826..2011613, i.e. ids that continue after the 11826 users,
# while data['tweet'].x has 1999788 rows. HeteroData edge indices are normally
# local to each node type -- confirm whether the tweet side needs re-basing.
for relation, code in relation_codes:
    data[relation].edge_index = edge_index[:, edge_type == code]

In [16]:
# Structural checks, printed in order: isolated nodes, self loops, undirected.
for graph_check in (data.has_isolated_nodes, data.has_self_loops, data.is_undirected):
    print(graph_check())

# Feature matrix shapes: tweets first, then users.
for node_type in ('tweet', 'user'):
    print(data[node_type].x.shape)



True
False
False
torch.Size([1999788, 100])
torch.Size([11826, 117])


In [17]:


class HeteroTwibot():
    """Package the tensors returned by a TwibotSmallEdgeHetero loader as HeteroData.

    After construction, ``self.data`` holds:

    * ``'user'`` nodes with ``des``, ``cat``, ``num``, the concatenated ``x``,
      the labels ``y`` and the ``train_idx`` / ``val_idx`` / ``test_idx`` splits;
    * ``'tweet'`` nodes with ``x``;
    * one ``edge_index`` per relation listed in ``_RELATION_CODES``.
    """

    # (source type, relation, destination type) paired with its code in edge_type.
    _RELATION_CODES = (
        (('user', 'following', 'user'), 0),
        (('user', 'followedBy', 'user'), 1),
        (('tweet', 'mentions', 'user'), 2),
        (('tweet', 'retweets', 'user'), 3),
        (('user', 'writes', 'tweet'), 4),
    )

    def __init__(self, edgeHetero: TwibotSmallEdgeHetero):
        """Call ``edgeHetero.dataloader()`` once and populate ``self.data``.

        Args:
            edgeHetero: object whose ``dataloader()`` returns, in order,
                des_tensor, tweets_tensor, num_prop, category_prop, edge_index,
                edge_type, labels, train_idx, val_idx, test_idx.
        """
        (
            des_tensor,
            tweets_tensor,
            num_prop,
            category_prop,
            edge_index,
            edge_type,
            labels,
            train_idx,
            val_idx,
            test_idx,
        ) = edgeHetero.dataloader()
        self.data = HeteroData()

        # User nodes: individual feature groups plus their column-wise concatenation.
        self.data['user'].des = des_tensor
        self.data['user'].cat = category_prop
        self.data['user'].num = num_prop
        self.data['user'].x = torch.cat((des_tensor, category_prop, num_prop), dim=1)
        self.data['user'].y = labels
        self.data['user'].train_idx = train_idx
        self.data['user'].val_idx = val_idx
        self.data['user'].test_idx = test_idx
        self.data['tweet'].x = tweets_tensor

        # A boolean mask always yields a (2, n) slice, including n == 0 and n == 1.
        # The previous `.nonzero().squeeze()` form collapsed to a 0-d index when
        # exactly one edge matched, dropping the edge dimension from edge_index.
        # NOTE(review): edge_index is used as returned by dataloader(); confirm the
        # tweet ids are local to the 'tweet' node type, as HeteroData normally expects.
        for relation, code in self._RELATION_CODES:
            self.data[relation].edge_index = edge_index[:, edge_type == code]
        

In [18]:
# Build the wrapper from the existing loader object, then repeat the same three
# structural checks on the graph it produced.
heteroTwibot = HeteroTwibot(homogeneous)

hetero_graph = heteroTwibot.data
for graph_check in (
    hetero_graph.has_isolated_nodes,
    hetero_graph.has_self_loops,
    hetero_graph.is_undirected,
):
    print(graph_check())

Loading labels...   Finished
Loading user description embeddings
Finished
Running tweet embedding
Finished
Processing feature3...   Finished
Processing feature4...   Finished
Building graph   Finished
True
False
False


In [20]:
from HeteroTwibot import initializeHeteroAugTwibot

# Build the dataset through the project helper, reusing the same loader object.
dataset = initializeHeteroAugTwibot(homogeneous)

# List the node types the dataset exposes.
node_type_names = dataset.node_types
print(node_type_names)


Loading labels...   Finished
Loading user description embeddings
Finished
Running tweet embedding
Finished
Processing feature3...   Finished
Processing feature4...   Finished
Building graph   Finished
['user', 'tweet']


In [22]:
# Edge types first, then the total node count.
for summary_value in (dataset.edge_types, dataset.num_nodes):
    print(summary_value)

[('user', 'following', 'user'), ('user', 'followedBy', 'user'), ('tweet', 'mentions', 'user'), ('tweet', 'retweets', 'user'), ('user', 'writes', 'tweet')]
2011614


In [None]:
from torch_geometric.loader import NeighborLoader

batch_size = 128

# Worker count: up to four (bounded by the number of visible GPUs) when running
# on CUDA, otherwise one.
# The check reads the `device` instance chosen in the setup cell. The previous
# `torch.device.type == 'cuda'` looked the attribute up on the class itself,
# which never compares equal to 'cuda', so the worker count was pinned to 1.
gpu_count = torch.cuda.device_count()
num_workers = min(gpu_count, 4) if gpu_count > 0 and device.type == 'cuda' else 1

kwargs = {
    'num_workers': num_workers,
    # persistent_workers requires at least one worker; num_workers is >= 1 here.
    'persistent_workers': True,
    'batch_size': batch_size,
}

# Sample up to 100 neighbours per hop over three hops, seeded from the user
# nodes selected by dataset['user'].train_mask.
train_loader = NeighborLoader(
    dataset,
    num_neighbors=[100] * 3,
    shuffle=False,
    input_nodes=('user', dataset['user'].train_mask),
    **kwargs,
)

In [None]:
# First pass over the loader: print per-batch tensor shapes. The printed output
# shows that the extra user attributes (des / cat / num) are sliced per batch
# together with x.
for batch in train_loader:
    user_nodes = batch['user']
    for field in ('x', 'des', 'cat', 'num'):
        print(getattr(user_nodes, field).shape)
    print(batch['tweet'].x.shape)

# Second pass: materialise every sampled batch into a list.
train_datalist = list(train_loader)

torch.Size([5200, 117])
torch.Size([5200, 100])
torch.Size([5200, 11])
torch.Size([5200, 6])
torch.Size([54433, 100])
torch.Size([5247, 117])
torch.Size([5247, 100])
torch.Size([5247, 11])
torch.Size([5247, 6])
torch.Size([55345, 100])
torch.Size([5063, 117])
torch.Size([5063, 100])
torch.Size([5063, 11])
torch.Size([5063, 6])
torch.Size([49620, 100])
torch.Size([4047, 117])
torch.Size([4047, 100])
torch.Size([4047, 11])
torch.Size([4047, 6])
torch.Size([37474, 100])
torch.Size([3414, 117])
torch.Size([3414, 100])
torch.Size([3414, 11])
torch.Size([3414, 6])
torch.Size([33646, 100])
torch.Size([2837, 117])
torch.Size([2837, 100])
torch.Size([2837, 11])
torch.Size([2837, 6])
torch.Size([29027, 100])
torch.Size([2821, 117])
torch.Size([2821, 100])
torch.Size([2821, 11])
torch.Size([2821, 6])
torch.Size([27711, 100])
torch.Size([2447, 117])
torch.Size([2447, 100])
torch.Size([2447, 11])
torch.Size([2447, 6])
torch.Size([24327, 100])
torch.Size([2981, 117])
torch.Size([2981, 100])
torch.Si

In [None]:
# Element type and number of stored batches, then the seed-node count of each.
first_batch = train_datalist[0]
print(type(first_batch))
print(len(train_datalist))
for sampled in train_datalist:
    print(sampled['user'].batch_size)

<class 'torch_geometric.data.hetero_data.HeteroData'>
65
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
86


In [None]:
from torch_geometric.loader import DataListLoader

# Serve the pre-sampled batches one at a time, in their stored order.
train_datalistloader = DataListLoader(
    train_datalist,
    batch_size=1,
    shuffle=False,
)

# Each iteration yields a Python list; inspect its length and its first element.
for batch in train_datalistloader:
    print(len(batch))
    head = batch[0]
    print(type(head))
    print(head['user'].batch_size)

1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_data.HeteroData'>
128
1
<class 'torch_geometric.data.hetero_da