# Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch_geometric.data import Data
from torch_geometric.nn import Node2Vec
from torch_geometric.utils import to_networkx, from_networkx
from gensim.models import Word2Vec
import numpy as np
import networkx as nx
import pandas as pd
import pickle
import os
import sys
import logging
import random
import igraph as ig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir('/sise/home/tommarz/hate_speech_detection/')
main_dir = os.getcwd()
main_dir

'/sise/home/tommarz/hate_speech_detection'

In [3]:
from config.detection_config import user_level_execution_config, user_level_conf, post_level_execution_config
from scipy.optimize import minimize
from utils.my_timeit import timeit
from utils.general import init_log

# import optuna
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# sampler = optuna.samplers.TPESampler(seed=0, multivariate=True, group=True, warn_independent_sampling=False ,**optuna.samplers.TPESampler.hyperopt_parameters())

logger = init_log("user_level_simple_models")

# Choose Dataset

In [261]:
dataset_name = 'gab'

In [262]:
network_output_dir = "/sise/home/tommarz/hate_speech_detection/data/networks_data"
raw_graphs_dict_path = os.path.join(network_output_dir, "raw_graphs_dict.p")
network_dataset_output_dir = os.path.join(network_output_dir, dataset_name)
raw_network_path  = os.path.join(network_dataset_output_dir, "raw_network.p")
largest_cc_path  = os.path.join(network_dataset_output_dir, "largest_cc.p")
gnn_path = os.path.join(main_dir, 'detection', 'gnn')
gnn_path

'/sise/home/tommarz/hate_speech_detection/detection/gnn'

# Load Dataset (Graph)

In [263]:
with open(largest_cc_path, 'rb') as f:
    largest_cc = pickle.load(f)
largest_cc.summary()

'IGRAPH DNW- 51195 2472710 -- gab\n+ attr: name (g), doc2vec (v), label (v), name (v), predictions (v), weight (e)'

In [264]:
g = largest_cc.copy()
g.reverse_edges()

In [265]:
labeled_nodes = g.vs.select(lambda v: v['label'] != -1)
len(labeled_nodes)

982

In [266]:
y = np.array(labeled_nodes['label'])
y.shape

(982,)

# Build Torch Geometric Dataset

In [267]:
import torch
from torch_geometric.data import Data

# Extract the edge list
edge_list = torch.tensor([edge.tuple for edge in g.es], dtype=torch.long).t().contiguous()
edge_weight = torch.tensor([edge['weight'] for edge in g.es]).t().contiguous()

# Optional: Create a feature matrix
features = torch.tensor(np.array(g.vs['doc2vec']), dtype=torch.float)  # Adjust shape if necessary

target = torch.tensor(np.array(g.vs['label']), dtype=torch.long)  # Adjust shape if necessary

# Create a PyTorch Geometric data object
dataset = Data(x=features, edge_index=edge_list, edge_weight=edge_weight, num_nodes=g.vcount(), y=target, name=dataset_name)
dataset.num_classes = len(dataset.y.unique())-1 # Cause of unlabled nodes
dataset.num_nodes = len(dataset.x) # Cause of unlabled nodes
dataset

Data(x=[51195, 100], edge_index=[2, 2472710], y=[51195], edge_weight=[2472710], num_nodes=51195, name='gab', num_classes=2)

# Create Train, Validation and Test Splits

In [268]:
from sklearn.model_selection import StratifiedKFold, train_test_split

In [269]:
seed = 0
np.random.seed(seed)
seeds = np.random.randint(0, 2**32, 5)
n_splits = 5
train_size = 1- 1/n_splits

In [270]:
# Ensures that CUDA operations are reproducible
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # For multi-GPU setups
np.random.seed(seed)
random.seed(seed)

X = dataset.x[target!=-1]
num_nodes = dataset.num_nodes

labeled_indices = (dataset.y != -1).nonzero(as_tuple=True)[0]

y = dataset.y[labeled_indices]

kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

folds = []

for train_indices, test_indices in kfold.split(labeled_indices, y):
    
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using more than one GPU
    np.random.seed(seed)
    random.seed(seed)

#     train_indices = labeled_indices[randperm[:num_train]]
#     test_indices = labeled_indices[randperm[num_train:]]

    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    train_mask[labeled_indices[train_indices]] = True
    test_mask[labeled_indices[test_indices]] = True

    # Ensure no overlap (for sanity check)
    assert torch.logical_and(train_mask[labeled_indices], test_mask[labeled_indices]).sum() == 0

    folds.append([train_mask, test_mask])

# GCN

In [271]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score

In [272]:
scorers = [precision_score, recall_score, f1_score, roc_auc_score]
scorer_names = ['_'.join(s.__name__.split('_')[:-1]) for s in scorers]
scorer_names

['precision', 'recall', 'f1', 'roc_auc']

In [273]:
from torch_geometric.nn import GCNConv, AGNNConv, GINConv, GCN, GIN, GINConv, GAT, GATConv, GraphSAGE, ARGA
from torch_geometric.nn.models import LightGCN, GCN, ARGA, ARGVA, GIN, GAE, GAT, GraphSAGE, GNNFF, VGAE, PNA
from torch_geometric.nn.models.basic_gnn import BasicGNN
from torch_geometric.nn.models import LabelPropagation

from torch_geometric.nn.conv import (
    EdgeConv,
    GATConv,
    GATv2Conv,
    GCNConv,
    GINConv,
    MessagePassing,
    PNAConv,
    SAGEConv,
)

import torch.nn.functional as F

In [274]:
from typing import Any, Callable, Dict, Final, List, Optional, Tuple, Union

class AGNN(BasicGNN):
    
    supports_edge_weight: Final[bool] = False
    supports_edge_attr: Final[bool] = False
    supports_norm_batch: Final[bool]
    
    def init_conv(self, in_channels: int, out_channels: int, **kwargs) -> MessagePassing:
        return AGNNConv(requires_grad=True)
    
#     def forward(self, x, edge_index):
        
#         self.lin1 = torch.nn.Linear(self.in_channels, self.hidden_channels)
#         self.prop1 = AGNNConv(requires_grad=True)
#         self.lin2 = torch.nn.Linear(self.hidden_channels, self.out_channels)
#         x = F.dropout(x, training=self.training)
#         x = F.relu(self.lin1(x))
#         x = self.prop1(x, edge_index)
#         x = F.dropout(x, training=self.training)
#         x = self.lin2(x)
#         return F.log_softmax(x, dim=1)

In [275]:
print("AGNN")
class AGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels=32, dropout=0.5, out_channels=2, num_layers=1):
        super(AGNN, self).__init__()
        self.num_layers = num_layers
        self.lin1 = torch.nn.Linear(in_channels, hidden_channels)
        self.prop1 = AGNNConv(requires_grad=True)
        self.lin2 = torch.nn.Linear(hidden_channels, out_channels)
        self.dropout = dropout
        self.num_layers

    def forward(self, x, edge_index, edge_weight=None):
        for i in range(self.num_layers):
            x = F.dropout(x, training=self.training, p=self.dropout)
            x = F.relu(self.lin1(x))
            x = self.prop1(x, edge_index)
        x = F.dropout(x, training=self.training, p=self.dropout)
        x = self.lin2(x)
        return F.log_softmax(x, dim=1)

AGNN


In [276]:
print("Cheby")
class Cheby(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels=32, dropout=0.2, out_channels=2, num_layers=1):
        super(Cheby, self).__init__()
        self.conv1 = ChebConv(in_channels, hidden_channels, K=1)
        self.conv2 = ChebConv(hidden_channels, out_channels, K=1)
        self.dropout = dropout
        
    def forward(self, x, edge_index, edge_weight=None):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, self.dropout)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

Cheby


In [277]:
print("GCN")
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels=32, out_channels=2, num_layers=1, dropout=0.2):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.dropout = dropout
        
    def forward(self, x, edge_index, edge_weight=None):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, self.dropout)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

GCN


In [278]:
print("GAT conv")
class GAT(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels=32, out_channels=2, num_layers=1, dropout=0.2, heads=2):
        super(GAT, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, concat=True)
        self.conv2 = GATConv(heads * hidden_channels, out_channels, heads=heads, concat=False)
        self.dropout = dropout

    def forward(self, x, edge_index, edge_weight=None):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, self.dropout)
        x = self.conv2(x, edge_index)
        return x.log_softmax(dim=1)

GAT conv


In [279]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
data = dataset.to(device)
data

cuda


Data(x=[51195, 100], edge_index=[2, 2472710], y=[51195], edge_weight=[2472710], num_nodes=51195, name='gab', num_classes=2)

## Train

In [280]:
gnn_params = {
    'in_channels': data.num_features,
    'hidden_channels': 16,
    'num_layers': 1, 
    'out_channels': data.num_classes,
    # 'dropout': 0.5
}

n_epochs = 5

In [281]:
for fold in folds:
    train_mask, test_mask = fold
    print(np.mean(y[train_mask[labeled_indices]].numpy()), np.mean(y[test_mask[labeled_indices]].numpy()))

0.24968152866242038 0.24873096446700507
0.24968152866242038 0.24873096446700507
0.24936386768447838 0.25
0.24936386768447838 0.25
0.24936386768447838 0.25


In [282]:
from sklearn.utils.class_weight import compute_class_weight
class_weight = torch.Tensor(compute_class_weight(class_weight='balanced', classes=np.unique(data.y[train_mask].cpu()), y=data.y[train_mask].cpu().numpy()))
class_weight

tensor([0.6661, 2.0051])

In [283]:
criterion = F.nll_loss
# criterion = F.cross_entropy

In [284]:
gnn_dataset_dir = os.path.join(gnn_path, dataset_name)
if not os.path.exists(gnn_dataset_dir):
    os.mkdir(gnn_dataset_dir)

gnn_models_dir = os.path.join(gnn_dataset_dir, 'models')
if not os.path.exists(gnn_models_dir):
    os.mkdir(gnn_models_dir)

gnn_results_dir = os.path.join(gnn_dataset_dir, 'results')
if not os.path.exists(gnn_results_dir):
    os.mkdir(gnn_results_dir)

In [285]:
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # For multi-GPU setups
np.random.seed(seed)
random.seed(seed)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

model_results_dict = {}
    
for gnn_model in [GCN, GraphSAGE, GAT, AGNN]:
    train_scores = []
    test_scores = []
    model_name = gnn_model.__name__
    print(model_name)

    for idx, fold in enumerate(folds):
        # model = GCN(num_features=data.num_features, num_classes=data.num_classes, dropout_p=0.25).to(device)
        # model = GCN(data.num_features, data.num_classes, num_layers=2)
        # model = GAT(data.num_features, hidden_channels=16, num_layers=2)
        model = gnn_model(**gnn_params).to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
        # optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
        model.train()

        best_loss = float('inf')
        model_path = os.path.join(gnn_models_dir, f'best_{gnn_model.__name__}_model_fold_{idx}.pt')

        print(f'Fold {idx}')
        train_mask, test_mask = fold

        class_weight = torch.Tensor(compute_class_weight(class_weight='balanced', classes=np.unique(data.y[train_mask].cpu()), y=data.y[train_mask].cpu().numpy())).to(device)

        for epoch in range(n_epochs):  # Number of epochs
            optimizer.zero_grad()
            
            data.edge_index = data.edge_index.to(device)
            data.edge_weight = data.edge_weight.to(device)
            
            out = model(data.x, data.edge_index, edge_weight=data.edge_weight)
            loss = criterion(out[train_mask], data.y[train_mask], weight=class_weight)
            loss.backward()
            optimizer.step()
            # if (epoch+1) % 50 == 0:
            #     print(f'Epoch {epoch+1}, Loss: {loss.item()}')

            if loss.item() < best_loss:
                best_loss = loss.item()
                # Save the model if current loss is lower than the best known loss
                torch.save(model, model_path)
                # print(f"Epoch {epoch+1}: New best loss {best_loss:.4f}, model saved.")

        model = torch.load(model_path)
        out = model(data.x, data.edge_index, edge_weight=data.edge_weight)
        train_probas, y_train = out[train_mask][:, 1].detach().cpu().numpy(), data.y[train_mask].detach().cpu().numpy()    
        test_probas, y_test = out[test_mask][:, 1].detach().cpu().numpy(), data.y[test_mask].detach().cpu().numpy()

        train_preds = out[train_mask].max(1)[1].cpu()
        test_preds = out[test_mask].max(1)[1].cpu()

        train_scores.append([scorer(y_train, train_preds) for scorer in scorers[:-1]] + [scorers[-1](y_train, train_probas)])
        test_scores.append([scorer(y_test, test_preds) for scorer in scorers[:-1]] + [scorers[-1](y_test, test_probas)])
    
    model_results_dict[model_name] = {'train': train_scores, 'test': test_scores}

GCN
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
GraphSAGE
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
GAT
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
AGNN
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4


## Evaluate

In [288]:
for model_name, results in model_results_dict.items():
    
    print(f'-------- {model_name} --------')
    
    print(f'---- train ----')
    train_scores, test_scores = results['train'], results['test']
    
    train_scores_df = pd.DataFrame(np.array(train_scores), columns=scorer_names)
    display(train_scores_df)

    train_scores_agg_df = train_scores_df.agg(['mean', 'std'])
    display(train_scores_agg_df)
        
    overleaf = f'& {model_name}'
    for mean, std in train_scores_agg_df.T.values:
        overleaf += (f' & ${mean:.3f} \pm {std:.3f}$')
    print(overleaf + '\\\\')

    print(f'---- test ----')
    test_scores_df = pd.DataFrame(np.array(test_scores), columns=scorer_names)
    display(test_scores_df)

    test_scores_agg_df = test_scores_df.agg(['mean', 'std'])
    display(test_scores_agg_df)

    overleaf = f'& {model_name}'
    for mean, std in test_scores_agg_df.T.values:
        overleaf += (f' & ${mean:.3f} \pm {std:.3f}$')
    print(overleaf + '\\\\')
    print('\n\n')

-------- GCN --------
---- train ----


Unnamed: 0,precision,recall,f1,roc_auc
0,0.251979,0.97449,0.400419,0.599468
1,0.262069,0.969388,0.412595,0.662165
2,0.216828,0.341837,0.265347,0.473452
3,0.300469,0.653061,0.411576,0.631425
4,0.292208,0.229592,0.257143,0.550951


Unnamed: 0,precision,recall,f1,roc_auc
mean,0.264711,0.633673,0.349416,0.583492
std,0.033528,0.345574,0.080683,0.07401


& GCN & $0.265 \pm 0.034$ & $0.634 \pm 0.346$ & $0.349 \pm 0.081$ & $0.583 \pm 0.074$\\
---- test ----


Unnamed: 0,precision,recall,f1,roc_auc
0,0.255319,0.979592,0.405063,0.578599
1,0.248649,0.938776,0.393162,0.671677
2,0.3125,0.510204,0.387597,0.572817
3,0.307692,0.653061,0.418301,0.585173
4,0.357143,0.306122,0.32967,0.544357


Unnamed: 0,precision,recall,f1,roc_auc
mean,0.296261,0.677551,0.386759,0.590524
std,0.044845,0.285496,0.034025,0.047962


& GCN & $0.296 \pm 0.045$ & $0.678 \pm 0.285$ & $0.387 \pm 0.034$ & $0.591 \pm 0.048$\\



-------- GraphSAGE --------
---- train ----


Unnamed: 0,precision,recall,f1,roc_auc
0,0.408163,0.204082,0.272109,0.59952
1,0.415493,0.30102,0.349112,0.564352
2,0.393502,0.556122,0.460888,0.628485
3,0.473684,0.321429,0.382979,0.628545
4,0.257184,0.913265,0.401345,0.527473


Unnamed: 0,precision,recall,f1,roc_auc
mean,0.389605,0.459184,0.373287,0.589675
std,0.080054,0.284893,0.0696,0.043659


& GraphSAGE & $0.390 \pm 0.080$ & $0.459 \pm 0.285$ & $0.373 \pm 0.070$ & $0.590 \pm 0.044$\\
---- test ----


Unnamed: 0,precision,recall,f1,roc_auc
0,0.444444,0.244898,0.315789,0.574738
1,0.351351,0.265306,0.302326,0.547987
2,0.428571,0.489796,0.457143,0.641538
3,0.468085,0.44898,0.458333,0.645703
4,0.25731,0.897959,0.4,0.528669


Unnamed: 0,precision,recall,f1,roc_auc
mean,0.389952,0.469388,0.386718,0.587727
std,0.086095,0.262941,0.074863,0.053602


& GraphSAGE & $0.390 \pm 0.086$ & $0.469 \pm 0.263$ & $0.387 \pm 0.075$ & $0.588 \pm 0.054$\\



-------- GAT --------
---- train ----


Unnamed: 0,precision,recall,f1,roc_auc
0,0.245333,0.469388,0.322242,0.510637
1,0.359116,0.663265,0.46595,0.703241
2,0.288889,0.066327,0.107884,0.530396
3,0.274596,0.954082,0.426454,0.641076
4,0.268293,0.954082,0.418813,0.552058


Unnamed: 0,precision,recall,f1,roc_auc
mean,0.287245,0.621429,0.348268,0.587482
std,0.043139,0.372264,0.144396,0.081687


& GAT & $0.287 \pm 0.043$ & $0.621 \pm 0.372$ & $0.348 \pm 0.144$ & $0.587 \pm 0.082$\\
---- test ----


Unnamed: 0,precision,recall,f1,roc_auc
0,0.237624,0.489796,0.32,0.475179
1,0.320388,0.673469,0.434211,0.645201
2,0.0,0.0,0.0,0.534777
3,0.269006,0.938776,0.418182,0.588505
4,0.261364,0.938776,0.408889,0.553658


Unnamed: 0,precision,recall,f1,roc_auc
mean,0.217676,0.608163,0.316256,0.559464
std,0.125373,0.38947,0.182287,0.063129


& GAT & $0.218 \pm 0.125$ & $0.608 \pm 0.389$ & $0.316 \pm 0.182$ & $0.559 \pm 0.063$\\



-------- AGNN --------
---- train ----


Unnamed: 0,precision,recall,f1,roc_auc
0,0.325641,0.647959,0.433447,0.653373
1,0.302966,0.729592,0.428144,0.635745
2,0.345946,0.326531,0.335958,0.660165
3,0.419355,0.464286,0.440678,0.665548
4,0.39604,0.408163,0.40201,0.665911


Unnamed: 0,precision,recall,f1,roc_auc
mean,0.35799,0.515306,0.408047,0.656149
std,0.048561,0.168251,0.042858,0.012488


& AGNN & $0.358 \pm 0.049$ & $0.515 \pm 0.168$ & $0.408 \pm 0.043$ & $0.656 \pm 0.012$\\
---- test ----


Unnamed: 0,precision,recall,f1,roc_auc
0,0.381443,0.755102,0.506849,0.716216
1,0.263158,0.714286,0.384615,0.609073
2,0.288462,0.306122,0.29703,0.630987
3,0.4,0.530612,0.45614,0.639178
4,0.351852,0.387755,0.368932,0.628905


Unnamed: 0,precision,recall,f1,roc_auc
mean,0.336983,0.538776,0.402713,0.644872
std,0.059105,0.196597,0.081143,0.041388


& AGNN & $0.337 \pm 0.059$ & $0.539 \pm 0.197$ & $0.403 \pm 0.081$ & $0.645 \pm 0.041$\\





In [286]:
# Lists to store aggregated DataFrames
all_train_agg_dfs = []
all_test_agg_dfs = []

for model_name, results in model_results_dict.items():
    
    train_scores, test_scores = results['train'], results['test']
    
    train_scores_df = pd.DataFrame(np.array(train_scores), columns=scorer_names)
    train_scores_agg_df = train_scores_df.agg(['mean', 'std']).T
    train_scores_agg_df['Method'] = model_name
    all_train_agg_dfs.append(train_scores_agg_df)
    
    test_scores_df = pd.DataFrame(np.array(test_scores), columns=scorer_names)
    test_scores_agg_df = test_scores_df.agg(['mean', 'std']).T
    test_scores_agg_df['Method'] = model_name
    all_test_agg_dfs.append(test_scores_agg_df)

# Concatenate all aggregated train DataFrames
merged_train_agg_df = pd.concat(all_train_agg_dfs).reset_index().set_index(['Method', 'index'])
# merged_train_agg_df.columns.names = ['Aggregation']
merged_train_agg_df = merged_train_agg_df.rename_axis(['Method', 'Metric'])

# Concatenate all aggregated test DataFrames
merged_test_agg_df = pd.concat(all_test_agg_dfs).reset_index().set_index(['Method', 'index'])
# merged_test_agg_df.columns.names = ['Aggregation']
merged_test_agg_df = merged_test_agg_df.rename_axis(['Method', 'Metric'])

# Display the merged DataFrames
print("Merged Aggregated Train DataFrame")
merged_train_agg_df.to_csv(os.path.join(gnn_results_dir, 'train_result_gnn.csv'))
display(merged_train_agg_df)

print("Merged Aggregated Test DataFrame")
merged_test_agg_df.to_csv(os.path.join(gnn_results_dir, 'test_result_gnn.csv'))
display(merged_test_agg_df)

Merged Aggregated Train DataFrame


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Method,Metric,Unnamed: 2_level_1,Unnamed: 3_level_1
GCN,precision,0.264711,0.033528
GCN,recall,0.633673,0.345574
GCN,f1,0.349416,0.080683
GCN,roc_auc,0.583492,0.07401
GraphSAGE,precision,0.389605,0.080054
GraphSAGE,recall,0.459184,0.284893
GraphSAGE,f1,0.373287,0.0696
GraphSAGE,roc_auc,0.589675,0.043659
GAT,precision,0.287245,0.043139
GAT,recall,0.621429,0.372264


Merged Aggregated Test DataFrame


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Method,Metric,Unnamed: 2_level_1,Unnamed: 3_level_1
GCN,precision,0.296261,0.044845
GCN,recall,0.677551,0.285496
GCN,f1,0.386759,0.034025
GCN,roc_auc,0.590524,0.047962
GraphSAGE,precision,0.389952,0.086095
GraphSAGE,recall,0.469388,0.262941
GraphSAGE,f1,0.386718,0.074863
GraphSAGE,roc_auc,0.587727,0.053602
GAT,precision,0.217676,0.125373
GAT,recall,0.608163,0.38947


In [158]:
len(labeled_nodes)

532

# Load Model

In [159]:
model_path = os.path.join(gnn_models_dir, "best_AGNN_model_fold_0.pt")

In [160]:
model = torch.load(model_path)
out = model(data.x, data.edge_index, edge_weight=data.edge_weight)
probas, y = out[:, 1].detach().cpu().numpy(), data.y.detach().cpu().numpy()    
preds = out.max(1)[1].cpu()

In [161]:
data.y[4]

tensor(1, device='cuda:0')

In [162]:
labeled_indices = (data.y!=-1).nonzero().flatten().detach().cpu().numpy()
labeled_indices

array([   4,   10,   17,   18,   32,   39,   40,   45,   53,   60,   61,
         62,   81,   84,   95,  111,  114,  128,  172,  199,  201,  207,
        208,  210,  215,  229,  235,  242,  246,  250,  251,  259,  265,
        273,  274,  275,  277,  278,  292,  296,  317,  318,  320,  322,
        330,  336,  343,  351,  357,  358,  368,  372,  404,  410,  437,
        438,  439,  451,  468,  482,  492,  495,  500,  512,  514,  524,
        534,  537,  566,  571,  573,  579,  581,  587,  597,  602,  625,
        631,  640,  643,  654,  667,  670,  673,  674,  678,  685,  699,
        706,  712,  714,  719,  721,  728,  730,  733,  740,  744,  747,
        749,  754,  758,  760,  763,  766,  767,  768,  777,  781,  784,
        788,  792,  795,  802,  804,  813,  824,  829,  832,  847,  850,
        853,  882,  890,  895,  898,  904,  910,  913,  915,  919,  922,
        935,  938,  943,  946,  950,  957,  983,  993, 1003, 1005, 1008,
       1009, 1015, 1021, 1035, 1036, 1050, 1065, 10

In [163]:
preds_arr = preds[labeled_indices].detach().cpu().numpy()

In [164]:
labeled_nodes = g.vs[labeled_indices]

In [165]:
labels_arr = labeled_nodes['label']
len(labels_arr)

532

In [166]:
false_preds = np.where(preds_arr!=labels_arr)[0]
# false_preds = test_vs[false_preds]

In [167]:
len(false_preds)

66

In [168]:
hate_nodes = labeled_nodes[false_preds].select(label_eq=1)

In [169]:
hate_nodes['name'][0]

'2486965772'

In [170]:
len(hate_nodes[0].neighbors())

29

In [171]:
len(hate_nodes[0].neighbors(mode='in'))

29

In [172]:
len(hate_nodes[0].neighbors(mode='out'))

29