# Dataset

In [1]:
#import os
#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = "max_split_size_mb:512"

In [2]:
import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data, Dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#torch.cuda.empty_cache()

In [4]:
# Disabled scratch cell: everything below sits inside a string literal, so none
# of it is executed. The trailing ';' only suppresses the notebook's echo of the
# string value.
'''
from torch_geometric_temporal.signal import temporal_signal_split
embedding_size = 128
dropout = 0.3
lr = 1e-3
weight_decay = 5e-3
batch_size=1024
training_epochs = 10
dataset = Twibot22StaticTemporal(26, device='cpu', batch_size=batch_size)
train_dataset, test_dataset = temporal_signal_split(dataset.data, train_ratio=0.7)

for b in train_dataset:
    print(b)
    break
print(len(train_dataset))
''';

In [5]:
from custom_static_graph_temporal_signal_batch import CustomStaticGraphTemporalSignalBatch

class Twibot22StaticTemporal(object):
    """Load the preprocessed TwiBot-22 tensors and wrap them in a temporal signal.

    Every tensor is read with ``torch.load`` from ``root`` and mapped onto
    ``device``. The static graph (``edge_index.pt`` / ``edge_type.pt``), the
    per-timeframe tweet embeddings (``user_tweets_tensor_<i>.pt``) and the
    static node features are handed to ``CustomStaticGraphTemporalSignalBatch``;
    the resulting object is stored in ``self.data``.

    Args:
        number_of_timeframes: Number of ``user_tweets_tensor_<i>.pt`` files to
            load (``i`` runs from 0 to ``number_of_timeframes - 1``). The label
            tensor is repeated once per timeframe.
        root: Directory that contains the preprocessed ``.pt`` files.
        device: ``map_location`` passed to every ``torch.load`` call.
        batch_size: Forwarded to ``CustomStaticGraphTemporalSignalBatch``.

    Attributes:
        data: The constructed ``CustomStaticGraphTemporalSignalBatch``.
        num_nodes: Length of the numerical feature tensor.
        train_mask, test_mask, val_mask: The loaded split masks. They are not
            passed to the temporal signal; they are only kept on the instance.
    """

    def __init__(self, number_of_timeframes, root=r'src/Data_test/preprocessed', device='cpu', batch_size=128):
        self.root = root
        self.device = device
        self.number_of_timeframes = number_of_timeframes
        self.batch_size = batch_size

        # load labels; the same label tensor object is reused for every timeframe
        labels = self._load("labels.pt")
        targets = [labels for _ in range(self.number_of_timeframes)]

        # load static node features
        numerical_features = self._load("num_properties_tensor.pt")
        categorical_features = self._load("categorical_properties_tensor.pt")
        description_embeddings = self._load("user_description_embedding_tensor.pt")
        feature_dict = {'numerical_features': numerical_features,
                        'categorical_features': categorical_features,
                        'description_embeddings': description_embeddings}
        assert len(numerical_features) == len(categorical_features) == len(description_embeddings)
        self.num_nodes = len(numerical_features)

        # load one tweet embedding tensor per timeframe
        tweet_embeddings = [self._load(f"user_tweets_tensor_{i}.pt")
                            for i in range(self.number_of_timeframes)]

        # load edge index and types (the edge types are passed on as float32 edge weights)
        edge_index = self._load("edge_index.pt")
        edge_type = self._load("edge_type.pt").to(torch.float32)

        # load dataset masks; previously these were loaded and silently dropped
        self.train_mask = self._load("train_mask.pt")
        self.test_mask = self._load("test_mask.pt")
        self.val_mask = self._load("validation_mask.pt")

        self.data = CustomStaticGraphTemporalSignalBatch(
            edge_index=edge_index, edge_weight=edge_type, features=tweet_embeddings,
            targets=targets, batch_size=self.batch_size, num_nodes=self.num_nodes, **feature_dict)

    def _load(self, name):
        """Load ``<root>/<name>`` with ``torch.load`` onto ``self.device``."""
        return torch.load(f"{self.root}/{name}", map_location=self.device)

    def get_dataset(self):
        """Return the temporal signal built in ``__init__``."""
        # the original definition lacked ``self`` and raised TypeError on every call
        return self.data

    def __getitem__(self, idx):
        """Return the temporal signal for index 0.

        The original code defined ``__get__`` (the descriptor hook) with an
        index argument; index access is what was intended.

        Raises:
            IndexError: For any index other than 0, so iteration terminates.
        """
        if idx == 0:
            return self.data
        raise IndexError(f"{type(self).__name__} holds a single signal; got index {idx!r}")

In [6]:
import torch
from torch import nn
from torch_geometric import nn as gnn
from torch_geometric_temporal import nn as tnn

class UnpackTuple(torch.nn.Module):
    """Module that passes on a single element of an indexable input.

    Args:
        unpack_dimension: Index applied to the input in ``forward``.
    """

    def __init__(self, unpack_dimension=0):
        super(UnpackTuple, self).__init__()
        self.unpack_dimension = unpack_dimension

    def forward(self, x):
        """Return ``x[self.unpack_dimension]``."""
        position = self.unpack_dimension
        selected = x[position]
        return selected


class TemporalBotRGCN(torch.nn.Module):
    """Bot classifier that feeds four projected feature groups into two LRGCN layers.

    Each feature group (description embedding, tweet embedding, numerical and
    categorical properties) is projected by its own ``Linear`` + ``LeakyReLU``
    branch to ``int(embedding_dimension / 4)`` features. The four projections
    are concatenated and passed through ``self.inner``, which ends in a
    ``Linear(embedding_dimension, 1)`` followed by a sigmoid.

    Args:
        desc_embedding_size: Input width of the description branch.
        tweet_embedding_size: Input width of the tweet branch.
        num_feature_size: Input width of the numerical-feature branch.
        cat_feature_size: Input width of the categorical-feature branch.
        embedding_dimension: Width used inside ``self.inner``.
        num_relations: Number of relations given to both ``LRGCN`` layers.
        dropout: Dropout probability applied between the two ``LRGCN`` layers.
    """

    def __init__(self, desc_embedding_size=768, tweet_embedding_size=768, num_feature_size=5,
                 cat_feature_size=3, embedding_dimension=128, num_relations=2, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # every input branch projects onto a quarter of the embedding width
        branch_width = int(embedding_dimension / 4)

        def make_branch(in_features):
            return nn.Sequential(nn.Linear(in_features, branch_width), nn.LeakyReLU())

        self.desc_layer = make_branch(desc_embedding_size)          # user description
        self.tweet_layer = make_branch(tweet_embedding_size)        # user tweets
        self.num_feature_layer = make_branch(num_feature_size)      # numeric features
        self.cat_feature_layer = make_branch(cat_feature_size)      # categorical features

        node_to_node = 'x -> x'
        graph_step = 'x, edge_index, edge_type -> x'
        stages = [
            (nn.Linear(embedding_dimension, embedding_dimension), node_to_node),
            (nn.LeakyReLU(), node_to_node),
            (tnn.recurrent.LRGCN(embedding_dimension, embedding_dimension,
                                 num_relations=num_relations, num_bases=2), graph_step),
            # the recurrent layer's output is indexed with 0 before it is passed on
            (UnpackTuple(0), node_to_node),
            (nn.Dropout(self.dropout), node_to_node),
            (tnn.recurrent.LRGCN(embedding_dimension, embedding_dimension,
                                 num_relations=num_relations, num_bases=2), graph_step),
            (UnpackTuple(0), node_to_node),
            (nn.Linear(embedding_dimension, embedding_dimension), node_to_node),
            (nn.LeakyReLU(), node_to_node),
            (nn.Linear(embedding_dimension, 1), node_to_node),
            (nn.Sigmoid(), node_to_node),
        ]
        self.inner = gnn.Sequential('x, edge_index, edge_type', stages)

    def forward(self, desc_embedding, tweet_embedding, num_feature, cat_feature, edge_index, edge_type):
        """Project the four feature groups, concatenate along dim 1 and run ``self.inner``.

        Returns:
            The output of ``self.inner`` flattened to one dimension.
        """
        projected = [
            self.desc_layer(desc_embedding),
            self.tweet_layer(tweet_embedding),
            self.num_feature_layer(num_feature),
            self.cat_feature_layer(cat_feature),
        ]
        node_embedding = torch.cat(projected, dim=1)
        scores = self.inner(node_embedding, edge_index, edge_type)
        return scores.flatten()

    def init_weights(self, m):
        """Kaiming-uniform initialise the weight of ``m`` when it is exactly ``nn.Linear``.

        Intended to be passed to ``Module.apply``; other module types are left alone.
        """
        if type(m) != nn.Linear:
            return
        nn.init.kaiming_uniform_(m.weight)

In [7]:
from tqdm import tqdm
from torch_geometric_temporal.nn.recurrent import GConvGRU
from torch_geometric_temporal.signal import temporal_signal_split
from torchmetrics import MetricCollection
from torchmetrics.classification import BinaryAccuracy, BinaryPrecision, BinaryRecall, BinaryF1Score, BinaryMatthewsCorrCoef

# Metrics computed on the held-out timeframes once training has finished.
model_metrics = MetricCollection([
    BinaryAccuracy(),
    BinaryPrecision(),
    BinaryRecall(),
    BinaryF1Score(),
    BinaryMatthewsCorrCoef()])

# Hyperparameters of the temporal experiment.
embedding_size = 128
dropout = 0.3
lr = 1e-3
weight_decay = 5e-3
batch_size = 1024
training_epochs = 1

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# The tensors are loaded on the CPU; batches are moved to `device` one at a time
# inside the training / evaluation loops.
dataset = Twibot22StaticTemporal(26, device='cpu', batch_size=batch_size)
# The first 70% of the snapshots are used for training, the rest for testing.
train_dataset, test_dataset = temporal_signal_split(dataset.data, train_ratio=0.7)

# `embedding_size` is passed through instead of repeating the literal 128, so
# that changing the hyperparameter above actually changes the model.
model = TemporalBotRGCN(desc_embedding_size=768, tweet_embedding_size=768, num_feature_size=5,
                        cat_feature_size=3, embedding_dimension=embedding_size, num_relations=2,
                        dropout=dropout)
model.apply(model.init_weights)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
# The model ends in a sigmoid, so plain BCE (not BCE-with-logits) is used.
criterion = nn.BCELoss()


def train_epoch():
    """Run one optimisation pass over ``train_dataset``.

    Uses the module-level ``model``, ``optimizer``, ``criterion``, ``device``
    and ``train_dataset``.

    Returns:
        The mean of the per-batch losses as a float, or ``nan`` when
        ``train_dataset`` yields no batches (the original code raised
        ``ZeroDivisionError`` in that case).
    """
    model.train()
    total_loss = 0.0
    number_of_batches = 0

    for batch in train_dataset:
        # move once and keep the returned object (the original moved the batch twice)
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(
            batch.description_embeddings,
            batch.x,
            batch.numerical_features,
            batch.categorical_features.to(torch.float32),
            batch.edge_index,
            batch.edge_attr)

        loss = criterion(out, batch.y.to(torch.float32))
        loss.backward()
        optimizer.step()

        # Each batch counts equally, exactly as before: the old code multiplied
        # every loss by `batch_size` and divided the sum by it again.
        total_loss += loss.item()
        number_of_batches += 1

    if number_of_batches == 0:
        return float('nan')
    return total_loss / number_of_batches

# Mean training loss of every epoch.
losses = []

# Train for exactly `training_epochs` epochs. The original `training_epochs+1`
# ran one epoch more than configured; nothing in this cell evaluates
# periodically, so the extra iteration served no purpose.
for e in range(training_epochs):
    loss = train_epoch()
    losses.append(loss)
    print(loss)

model.eval()

# One tensor per test batch; concatenated once after the loop instead of
# building Python lists of 0-d tensors element by element.
labels = []
predictions = []

with torch.no_grad():
    for batch in test_dataset:
        # keep the returned object rather than relying on an in-place move
        batch = batch.to(device)
        out = model(
            batch.description_embeddings,
            batch.x,
            batch.numerical_features,
            batch.categorical_features.to(torch.float32),
            batch.edge_index,
            batch.edge_attr)

        labels.append(batch.y.to('cpu'))
        predictions.append(out.to('cpu'))

# torch.cat rejects an empty list, so fall back to empty tensors in that case
all_predictions = torch.cat(predictions) if predictions else torch.tensor([])
all_labels = torch.cat(labels) if labels else torch.tensor([])
metrics = model_metrics(all_predictions, all_labels)
print(metrics)

MemoryError: Unable to allocate 269. MiB for an array with shape (70521856,) and data type int32

In [None]:
class Twibot22(Dataset):
    """Single-graph dataset built from the preprocessed TwiBot-22 tensors.

    All tensors are read from ``root`` with ``torch.load`` (mapped onto
    ``device``) and combined into one ``Data`` object stored in ``self.data``.

    Args:
        root: Directory that contains the preprocessed ``.pt`` files.
        device: ``map_location`` passed to every ``torch.load`` call.
    """

    def __init__(self, root=r'src/Data_test/preprocessed', device='cpu'):
        self.root = root
        super().__init__(self.root, None, None, None)
        self.device = device

        def load(filename):
            return torch.load(f"{self.root}/{filename}", map_location=self.device)

        # labels
        labels = load("labels.pt")

        # node features (kept as separate attributes, not merged into one matrix)
        numerical_features = load("num_properties_tensor.pt")
        categorical_features = load("categorical_properties_tensor.pt")
        description_embeddings = load("user_description_embedding_tensor.pt")
        tweet_embeddings = load("user_tweets_tensor.pt")

        # graph structure
        edge_index = load("edge_index.pt")
        edge_type = load("edge_type.pt")

        # split masks
        train_mask = load("train_mask.pt")
        test_mask = load("test_mask.pt")
        val_mask = load("validation_mask.pt")

        graph_fields = dict(
            edge_index=edge_index,
            edge_attr=edge_type,
            y=labels,
            description_embeddings=description_embeddings,
            tweet_embeddings=tweet_embeddings,
            numerical_features=numerical_features,
            categorical_features=categorical_features,
            train_mask=train_mask,
            test_mask=test_mask,
            val_mask=val_mask,
            num_nodes=labels.shape[0],
        )
        self.data = Data(**graph_fields)

        assert self.data.validate()

    def len(self):
        """The dataset always reports exactly one graph."""
        return 1

    def get(self, idx):
        """Return the graph for index 0; any other index yields ``None``."""
        return self.data if idx == 0 else None

    @property
    def num_node_features(self):
        """``num_node_features`` of the wrapped graph."""
        return self.data.num_node_features

    @property
    def num_edge_features(self):
        """``num_edge_features`` of the wrapped graph."""
        return self.data.num_edge_features

    @property
    def num_nodes(self):
        """``num_nodes`` of the wrapped graph."""
        return self.data.num_nodes

    @property
    def num_edges(self):
        """``num_edges`` of the wrapped graph."""
        return self.data.num_edges

# Model

In [None]:
import torch
from torch import nn
from torch_geometric import nn as gnn

In [None]:
class BotRGCN(nn.Module):
    """Bot classifier that feeds four projected feature groups into two RGCN layers.

    Each feature group (description embedding, tweet embedding, numerical and
    categorical properties) is projected by its own ``Linear`` + ``LeakyReLU``
    branch to ``int(embedding_dimension / 4)`` features. The concatenation of
    the four projections is passed through ``self.inner``, which ends in a
    ``Linear(embedding_dimension, 1)`` followed by a sigmoid.

    Args:
        desc_embedding_size: Input width of the description branch.
        tweet_embedding_size: Input width of the tweet branch.
        num_feature_size: Input width of the numerical-feature branch.
        cat_feature_size: Input width of the categorical-feature branch.
        embedding_dimension: Width used inside ``self.inner``.
        num_relations: Number of relations given to both ``RGCNConv`` layers.
        dropout: Dropout probability applied between the two ``RGCNConv`` layers.
    """

    def __init__(self, desc_embedding_size=768, tweet_embedding_size=768, num_feature_size=5,
                 cat_feature_size=3, embedding_dimension=128, num_relations=2, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # all four input branches share the same output width
        quarter = int(embedding_dimension / 4)

        # user description layer
        self.desc_layer = nn.Sequential(nn.Linear(desc_embedding_size, quarter), nn.LeakyReLU())
        # user tweet layer
        self.tweet_layer = nn.Sequential(nn.Linear(tweet_embedding_size, quarter), nn.LeakyReLU())
        # numeric feature layer
        self.num_feature_layer = nn.Sequential(nn.Linear(num_feature_size, quarter), nn.LeakyReLU())
        # categorical feature layer
        self.cat_feature_layer = nn.Sequential(nn.Linear(cat_feature_size, quarter), nn.LeakyReLU())

        pointwise = 'x -> x'
        relational = 'x, edge_index, edge_type -> x'
        pipeline = []
        # input embedding
        pipeline.append((nn.Linear(embedding_dimension, embedding_dimension), pointwise))
        pipeline.append((nn.LeakyReLU(), pointwise))
        # two relational graph convolutions with dropout in between
        pipeline.append((gnn.RGCNConv(embedding_dimension, embedding_dimension,
                                      num_relations=num_relations), relational))
        pipeline.append((nn.Dropout(self.dropout), pointwise))
        pipeline.append((gnn.RGCNConv(embedding_dimension, embedding_dimension,
                                      num_relations=num_relations), relational))
        # output head: one sigmoid score per row
        pipeline.append((nn.Linear(embedding_dimension, embedding_dimension), pointwise))
        pipeline.append((nn.LeakyReLU(), pointwise))
        pipeline.append((nn.Linear(embedding_dimension, 1), pointwise))
        pipeline.append((nn.Sigmoid(), pointwise))
        self.inner = gnn.Sequential('x, edge_index, edge_type', pipeline)

    def forward(self, desc_embedding, tweet_embedding, num_feature, cat_feature, edge_index, edge_type):
        """Project the four feature groups, concatenate along dim 1 and run ``self.inner``.

        Returns:
            The output of ``self.inner`` flattened to one dimension.
        """
        desc_part = self.desc_layer(desc_embedding)
        tweet_part = self.tweet_layer(tweet_embedding)
        numeric_part = self.num_feature_layer(num_feature)
        categorical_part = self.cat_feature_layer(cat_feature)
        combined = torch.cat((desc_part, tweet_part, numeric_part, categorical_part), dim=1)

        result = self.inner(combined, edge_index, edge_type)
        return result.flatten()

    def init_weights(self, m):
        """Kaiming-uniform initialise the weight of ``m`` when it is exactly ``nn.Linear``.

        Intended to be passed to ``Module.apply``; other module types are left alone.
        """
        is_plain_linear = type(m) == nn.Linear
        if is_plain_linear:
            nn.init.kaiming_uniform_(m.weight)

# Training

In [None]:
torch.set_printoptions(threshold=100)

In [None]:
# Disabled full-batch training experiment: the code below is wrapped in a string
# literal and is therefore never executed. The trailing ';' only suppresses the
# notebook's echo of the string value.
'''
from torchmetrics import MetricCollection
from torchmetrics.classification import BinaryAccuracy, BinaryPrecision, BinaryRecall, BinaryF1Score
from torch_geometric.loader import NeighborLoader

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
embedding_size = 128
dropout = 0.3
lr = 1e-3
weight_decay = 5e-3

dataset = Twibot22(device=device)
model = BotRGCN(desc_embedding_size=768, tweet_embedding_size=768, num_feature_size=5, 
                 cat_feature_size=3, embedding_dimension=128, num_relations=2, dropout=0.3)
model.apply(model.init_weights)

model_metrics = MetricCollection([
    BinaryAccuracy(), 
    BinaryPrecision(), 
    BinaryRecall(), 
    BinaryF1Score()])

model.to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

def train():
    model.train()
    optimizer.zero_grad()
    out = model(
        dataset.description_embeddings,
        dataset.tweet_embeddings, 
        dataset.numerical_features,
        dataset.categorical_features.to(torch.float32),
        dataset.edge_index,
        dataset.edge_type)
    #print(out[dataset.train_mask][0:50])
    #print(dataset.labels[dataset.train_mask][0:50])
    loss = criterion(out[dataset.train_mask], dataset.labels[dataset.train_mask].to(torch.float32))
    loss.backward()
    optimizer.step()
    return loss

def test():
    model.eval()
    out = model(
        dataset.description_embeddings,
        dataset.tweet_embeddings, 
        dataset.numerical_features,
        dataset.categorical_features.to(torch.float32),
        dataset.edge_index,
        dataset.edge_type)
    #preds = torch.argmax(out[dataset.test_mask], dim=1).to('cpu').detach()
    preds = out[dataset.test_mask].to('cpu').detach()
    labels = dataset.labels[dataset.test_mask].to('cpu').detach()
    metrics = model_metrics(preds, labels)
    return metrics
    
losses = []
for idx, e in tqdm(enumerate(range(0, 20000))):
    loss = train()
    losses.append(loss.item())
metrics = test()
print(metrics)
''';

In [None]:
import os
from collections import defaultdict
from datetime import datetime

import pandas as pd

from torchmetrics import MetricCollection
from torchmetrics.classification import BinaryAccuracy, BinaryPrecision, BinaryRecall, BinaryF1Score, BinaryMatthewsCorrCoef
from torch_geometric.loader import NeighborLoader

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Hyperparameters of the mini-batch (NeighborLoader) experiment.
embedding_size = 128
dropout = 0.3
lr = 1e-3
weight_decay = 5e-3
batch_size = 1024
training_epochs = 10
# up to 256 sampled neighbours per node for each of 4 hops
# NOTE(review): BotRGCN contains two RGCNConv layers; confirm that 4 hops are intended.
num_neighbors = [256] * 4

# The full graph stays on the CPU; sampled batches are moved to `device` one at a time.
dataset = Twibot22(device='cpu')
# The hyperparameters above are passed through instead of repeating their
# literal values, so changing them in one place is enough.
model = BotRGCN(desc_embedding_size=768, tweet_embedding_size=768, num_feature_size=5,
                cat_feature_size=3, embedding_dimension=embedding_size, num_relations=2,
                dropout=dropout)
model.apply(model.init_weights)
# moved once (the original cell called model.to(device) twice)
model.to(device)

model_metrics = MetricCollection([
    BinaryAccuracy(),
    BinaryPrecision(),
    BinaryRecall(),
    BinaryF1Score(),
    BinaryMatthewsCorrCoef()])

# The model ends in a sigmoid, so plain BCE (not BCE-with-logits) is used.
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

# One loader per split; the split's mask selects the seed nodes. Only the
# training loader shuffles.
data = dataset[0]
train_loader = NeighborLoader(data, num_neighbors=num_neighbors, batch_size=batch_size,
                              input_nodes=data.train_mask, shuffle=True)
test_loader = NeighborLoader(data, num_neighbors=num_neighbors, batch_size=batch_size,
                             input_nodes=data.test_mask)
validation_loader = NeighborLoader(data, num_neighbors=num_neighbors, batch_size=batch_size,
                                   input_nodes=data.val_mask)


def train_epoch():
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``criterion``, ``device``
    and ``train_loader``.

    Returns:
        The loss averaged over all seed nodes of the epoch as a float, or
        ``nan`` when the loader yields no batches.
    """
    model.train()
    total_loss = 0.0
    total_seed_nodes = 0

    for batch in train_loader:
        # move once and keep the returned object (the original moved the batch twice)
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(
            batch.description_embeddings,
            batch.tweet_embeddings,
            batch.numerical_features,
            batch.categorical_features.to(torch.float32),
            batch.edge_index,
            batch.edge_attr)

        # NeighborLoader places the seed nodes first and reports their count as
        # `batch.batch_size`. Slicing with the configured global `batch_size`
        # pulled sampled neighbour nodes into the loss whenever the last batch
        # held fewer seed nodes.
        num_seed_nodes = batch.batch_size
        loss = criterion(out[:num_seed_nodes], batch.y[:num_seed_nodes].to(torch.float32))
        loss.backward()
        optimizer.step()

        # weight each batch by its real number of seed nodes
        total_loss += loss.item() * num_seed_nodes
        total_seed_nodes += num_seed_nodes

    if total_seed_nodes == 0:
        return float('nan')
    return total_loss / total_seed_nodes

def evaluate(loader=None):
    """Compute ``model_metrics`` for the seed nodes of a loader.

    Uses the module-level ``model``, ``model_metrics`` and ``device``.

    Args:
        loader: Loader to evaluate on. Defaults to the module-level
            ``test_loader``, which is what the original ``evaluate()`` used.

    Returns:
        The result of calling ``model_metrics`` on all collected predictions
        and labels.
    """
    if loader is None:
        loader = test_loader

    model.eval()

    # one tensor per batch, concatenated once after the loop
    labels = []
    predictions = []

    with torch.no_grad():
        for batch in loader:
            # keep the returned object rather than relying on an in-place move
            batch = batch.to(device)
            out = model(
                batch.description_embeddings,
                batch.tweet_embeddings,
                batch.numerical_features,
                batch.categorical_features.to(torch.float32),
                batch.edge_index,
                batch.edge_attr)

            # Only the first `batch.batch_size` nodes are seed nodes of this
            # split; the configured global `batch_size` over-selects on the
            # last, smaller batch and would score sampled neighbours too.
            num_seed_nodes = batch.batch_size
            labels.append(batch.y[:num_seed_nodes].to('cpu'))
            predictions.append(out[:num_seed_nodes].to('cpu'))

    # torch.cat rejects an empty list, so fall back to empty tensors in that case
    all_predictions = torch.cat(predictions) if predictions else torch.tensor([])
    all_labels = torch.cat(labels) if labels else torch.tensor([])
    metrics = model_metrics(all_predictions, all_labels)
    return metrics

# Per-epoch training loss, and the metrics collected at every evaluation point
# (one list per column, keyed by 'epoch' and by metric name).
losses = []
test_metrics = defaultdict(list)

# The range is training_epochs + 1 long, so e runs from 0 up to and including
# training_epochs; with the modulo check below this evaluates at e == 0 and at
# every further multiple of 10 that is reached.
for e in range(training_epochs+1):
    loss = train_epoch()
    losses.append(loss)
    
    # evaluate only on every 10th epoch index
    if e % 10 == 0:
        metrics = evaluate()
        test_metrics['epoch'].append(e)
        # .item() turns each metric value into a plain Python number for the DataFrame
        for k, v in metrics.items():
            test_metrics[k].append(v.item())
        
        print(f"Epoch: {e}, Loss: {loss:.2f}")
        print(f"Accuracy: {metrics['BinaryAccuracy']:.2f}")
        print(f"Precision: {metrics['BinaryPrecision']:.2f}")
        print(f"Recall: {metrics['BinaryRecall']:.2f}")
        print(f"F1-Score: {metrics['BinaryF1Score']:.2f}")
        print(f"MCC: {metrics['BinaryMatthewsCorrCoef']:.2f}")
        print()

        
# save loss, metrics and state dict
test_metrics_df = pd.DataFrame(test_metrics)     
losses_df = pd.DataFrame(losses, columns=['BCE loss'])

# Timestamp (day_month_year__hour_minute_second) shared by the three output files.
# NOTE(review): the name `time` would shadow the stdlib module if it were imported.
time = datetime.now().strftime("%d_%m_%Y__%H_%M_%S")
os.makedirs('results', exist_ok=True)

test_metrics_df.to_csv(f'results/test_metrics_{time}.csv')
losses_df.to_csv(f'results/losses_{time}.csv')
torch.save(model.state_dict(), f'results/state_dict_{time}.pt')

In [None]:
torch.cuda.empty_cache()

In [None]:
# Scratch cell: creates a directory named "bla/" in the working directory
# (no error if it already exists). Nothing else in the visible notebook uses it.
os.makedirs("bla/", exist_ok=True)

In [None]:
# Reload the metrics of an earlier run from CSV.
# NOTE(review): the training cell writes its CSVs to results/test_metrics_<timestamp>.csv,
# while this path has no results/ prefix — confirm where the file actually lives.
metrics_df = pd.read_csv('test_metrics_11_01_2023__18_32_00.csv')

In [None]:
# Load the object stored by an earlier run's torch.save(model.state_dict(), ...).
# `m2` is not applied to any model in the visible part of the notebook.
m2 = torch.load("results/state_dict_11_01_2023__18_37_43.pt")

In [None]:
losses_df = pd.DataFrame(losses, columns=['BCE loss'])

In [None]:
losses_df

In [None]:
import matplotlib.pyplot as plt

# Plot the collected training losses against their list index.
plt.plot(losses)

In [None]:
data.train_mask.device

In [None]:
data.validate()

In [None]:
#next(iter(train_loader))

In [None]:
len(train_loader)* batch_size

In [None]:
torch.cuda.empty_cache()