In [3]:
import torch
from torch_geometric.data import Data
import pandas as pd
import numpy as np
from tqdm import tqdm
import dask.dataframe as dd

# Load label (node) data
print("Loading label data...")
label_file = 'D:/Twitter-Bot-Detection-Model/ml_model/dataset/label.csv'
labels_df = pd.read_csv(label_file, dtype={'id': 'object', 'label': 'category'})

# Convert user IDs to integers more efficiently
labels_df['id'] = pd.to_numeric(labels_df['id'].str[1:], errors='coerce')

# Create a mapping from old ID to new index using a dictionary comprehension
id_to_index = {old_id: new_index for new_index, old_id in enumerate(labels_df['id'])}

# Map bot/human labels to binary (bot = 0, human = 1)
label_map = {'bot': 0, 'human': 1}
labels_df['label'] = labels_df['label'].map(label_map)

# Create node labels tensor
num_nodes = len(id_to_index)
y = torch.zeros(num_nodes, dtype=torch.long)
y[labels_df['id'].map(id_to_index)] = torch.tensor(labels_df['label'].values, dtype=torch.long)

# Load edge data using pandas for initial processing
print("Loading edge data...")
edge_file = 'D:/Twitter-Bot-Detection-Model/ml_model/dataset/edge.csv'
chunks = pd.read_csv(edge_file, dtype={'source_id': 'object', 'target_id': 'object'}, chunksize=1000000)

# Process edges
print("Processing edges...")
edge_list = []
for chunk in tqdm(chunks, desc="Processing edge chunks"):
    chunk['source_id'] = pd.to_numeric(chunk['source_id'].str[1:], errors='coerce')
    chunk['target_id'] = pd.to_numeric(chunk['target_id'].str[1:], errors='coerce')
    
    # Filter out rows with NaN values
    chunk = chunk.dropna()
    
    source_indices = chunk['source_id'].map(id_to_index)
    target_indices = chunk['target_id'].map(id_to_index)
    
    # Filter out edges that don't have corresponding nodes
    mask = source_indices.notna() & target_indices.notna()
    source_indices = source_indices[mask].astype(int)
    target_indices = target_indices[mask].astype(int)
    
    edge_list.append(np.array([source_indices, target_indices]))

# Combine edge parts
edge_index = np.concatenate(edge_list, axis=1)

# Convert to torch tensor
edge_index = torch.tensor(edge_index, dtype=torch.long)

# Create the PyTorch-Geometric data object
data = Data(edge_index=edge_index, y=y)

print(f"Graph Data Object: {data}")
print(f"Number of nodes: {data.num_nodes}")
print(f"Number of edges: {data.num_edges}")

Loading label data...
Loading edge data...
Processing edges...


Processing edge chunks: 171it [16:16,  5.71s/it]


Graph Data Object: Data(edge_index=[2, 3745300], y=[1000000])
Number of nodes: 996093
Number of edges: 3745300




In [None]:
import csv
from collections import Counter
import time
import multiprocessing as mp

def efficient_node_edge_inspection(edge_file, label_file, total_edges, chunk_size=1_000_000, num_processes=mp.cpu_count()):
    print("\nInspecting edges and nodes...")
    start_time = time.time()

    # Load all labeled nodes
    labeled_nodes = set()
    with open(label_file, 'r', newline='') as labelfile:
        label_reader = csv.reader(labelfile)
        next(label_reader)  # Skip header
        for row in label_reader:
            labeled_nodes.add(row[0])  # Assuming the node ID is in the first column

    with mp.Manager() as manager:
        shared_degree_count = manager.dict()
        shared_nodes = manager.dict()
        
        chunks = list(chunk_generator(edge_file, chunk_size, total_edges))
        
        with mp.Pool(processes=num_processes) as pool:
            results = []
            for i, chunk in enumerate(chunks):
                result = pool.apply_async(process_chunk, (chunk, i))
                results.append(result)
            
            processed_edges = 0
            for result in results:
                local_degree_count, local_nodes, chunk_size = result.get()
                
                # Update shared data structures
                for node, count in local_degree_count.items():
                    if node in shared_degree_count:
                        shared_degree_count[node] += count
                    else:
                        shared_degree_count[node] = count
                
                for node in local_nodes:
                    shared_nodes[node] = True
                
                processed_edges += chunk_size
                print(f"Processed approximately {processed_edges} edges. Time elapsed: {time.time() - start_time:.2f} seconds")

        degree_count = Counter(shared_degree_count)
        all_nodes = set(shared_nodes.keys())

    unique_nodes = len(all_nodes)
    
    # Identify missing nodes
    missing_nodes = all_nodes - labeled_nodes
    extra_labeled_nodes = labeled_nodes - all_nodes
    
    print(f"\nTotal number of edges processed: {processed_edges}")
    print(f"Number of unique nodes in edge file: {unique_nodes}")
    print(f"Number of labeled nodes: {len(labeled_nodes)}")
    print(f"Number of nodes in edge file but not in label file: {len(missing_nodes)}")
    print(f"Number of nodes in label file but not in edge file: {len(extra_labeled_nodes)}")

    degrees = list(degree_count.values())
    print(f"\nDegree statistics:")
    print(f"Average degree: {sum(degrees) / len(degrees):.2f}")
    print(f"Maximum degree: {max(degrees)}")
    print(f"Minimum degree: {min(degrees)}")

    print(f"\nTotal time taken: {time.time() - start_time:.2f} seconds")

    return degree_count, missing_nodes, extra_labeled_nodes

def chunk_generator(file_path, chunk_size, total_edges):
    with open(file_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip header
        chunk = []
        for i, row in enumerate(reader):
            if i >= total_edges:
                break
            chunk.append(row)
            if len(chunk) == chunk_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk

def process_chunk(chunk, chunk_id):
    local_degree_count = Counter()
    local_nodes = set()

    for source, target in chunk:
        local_degree_count[source] += 1
        local_degree_count[target] += 1
        local_nodes.add(source)
        local_nodes.add(target)

    return local_degree_count, local_nodes, len(chunk)

# Usage
edge_file = 'D:/Twitter-Bot-Detection-Model/ml_model/dataset/edge.csv'
label_file = 'D:/Twitter-Bot-Detection-Model/ml_model/dataset/label.csv'
total_edges = 3745300  # The actual number of edges in your file

# Run the node-aware edge inspection
degree_count, missing_nodes, extra_labeled_nodes = efficient_node_edge_inspection(edge_file, label_file, total_edges)

# You can now use degree_count, unique_edges, missing_nodes, and extra_labeled_nodes for further analysis


Inspecting edges and nodes...


In [5]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import coalesce, to_scipy_sparse_matrix
import pandas as pd
import numpy as np
from tqdm import tqdm

# Assuming you have already run the previous loading code

# Coalesce the edge_index to remove duplicate edges and ensure undirected graph
edge_index, _ = coalesce(edge_index, None, num_nodes, num_nodes)

# Convert edge_index to a PyTorch sparse matrix
# Create the adjacency matrix as a sparse COO tensor
adj_sparse = torch.sparse_coo_tensor(edge_index, torch.ones(edge_index.shape[1]), (num_nodes, num_nodes))

# Convert the adjacency matrix to a scipy sparse matrix (optional, if needed for further processing)
adj_scipy = to_scipy_sparse_matrix(edge_index)

# Print the sparsity of the matrix
print(f"Sparse matrix shape: {adj_sparse.shape}")
print(f"Number of non-zero entries: {adj_sparse._nnz()}")

Sparse matrix shape: torch.Size([1000000, 1000000])
Number of non-zero entries: 3334408


In [9]:
from torch_geometric.utils import degree

# Calculate degree centrality
degree_centrality = degree(data.edge_index[0], num_nodes=data.num_nodes).float()  # Out-degree

# Initialize a tensor for features with zeros
num_features = 10
x = torch.zeros((num_nodes, num_features))  # Use zeros for all nodes

# Create a full degree centrality tensor
full_degree_centrality = torch.zeros(num_nodes)
full_degree_centrality[:degree_centrality.size(0)] = degree_centrality  # Fill in degree centrality values

# Assign degree centrality to the first feature for all nodes
x[:, 0] = full_degree_centrality  # Assign full degree centrality values

# Generate random features for the other dimensions (1 to num_features-1)
x[:, 1:] = torch.rand(num_nodes, num_features - 1)  # Random features for demonstration

# Create a new Data object with the synthetic features
data.x = x  # Adding the synthetic features to the Data object

print(data.x.shape)  # Check the shape of the feature tensor

torch.Size([1000000, 10])


In [10]:
from torch_geometric.data import Data

# Create the Data object with features, edge index, and labels
data = Data(x=x, edge_index=edge_index, y=y)

# Print the Data object to verify its structure
print(f"Data object: {data}")
print(f"Number of nodes: {data.num_nodes}")
print(f"Number of edges: {data.num_edges}")
print(f"Feature tensor shape: {data.x.shape}")
print(f"Edge index shape: {data.edge_index.shape}")
print(f"Labels tensor shape: {data.y.shape}")

Data object: Data(x=[1000000, 10], edge_index=[2, 3334408], y=[1000000])
Number of nodes: 1000000
Number of edges: 3334408
Feature tensor shape: torch.Size([1000000, 10])
Edge index shape: torch.Size([2, 3334408])
Labels tensor shape: torch.Size([1000000])


In [11]:
# Example split (80% train, 10% validation, 10% test)
num_train = int(0.8 * num_nodes)
num_val = int(0.1 * num_nodes)

# Create boolean masks for train, validation, and test splits
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[:num_train] = True

val_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask[num_train:num_train + num_val] = True

test_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask[num_train + num_val:] = True

# Assign masks to the data object
data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask

# Print the masks to verify
print("Training mask:", data.train_mask.sum().item(), "nodes in training set.")
print("Validation mask:", data.val_mask.sum().item(), "nodes in validation set.")
print("Test mask:", data.test_mask.sum().item(), "nodes in test set.")

Training mask: 800000 nodes in training set.
Validation mask: 100000 nodes in validation set.
Test mask: 100000 nodes in test set.


In [14]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Create the model
model = GCN(num_features=num_features, num_classes=2)  # Adjust num_classes as needed

In [15]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()  # Clear gradients
    out = model(data)      # Forward pass
    loss = criterion(out[data.train_mask], data.y[data.train_mask])  # Compute loss
    loss.backward()        # Backpropagation
    optimizer.step()       # Update parameters
    return loss.item()

# Training for a number of epochs
for epoch in range(200):
    loss = train()
    print(f'Epoch {epoch+1}: Loss: {loss:.4f}')

Epoch 1: Loss: 95.6816
Epoch 2: Loss: 85.3128
Epoch 3: Loss: 74.1763
Epoch 4: Loss: 64.7125
Epoch 5: Loss: 55.0502
Epoch 6: Loss: 45.9568
Epoch 7: Loss: 36.7742
Epoch 8: Loss: 28.4627
Epoch 9: Loss: 20.8869
Epoch 10: Loss: 14.1611
Epoch 11: Loss: 9.9347
Epoch 12: Loss: 6.8703
Epoch 13: Loss: 5.7702
Epoch 14: Loss: 5.1379
Epoch 15: Loss: 5.0403
Epoch 16: Loss: 5.0848
Epoch 17: Loss: 5.4152
Epoch 18: Loss: 5.5252
Epoch 19: Loss: 5.7926
Epoch 20: Loss: 6.0269
Epoch 21: Loss: 6.2700
Epoch 22: Loss: 6.3025
Epoch 23: Loss: 6.5503
Epoch 24: Loss: 6.6611
Epoch 25: Loss: 6.8470
Epoch 26: Loss: 6.8154
Epoch 27: Loss: 6.8376
Epoch 28: Loss: 6.7988
Epoch 29: Loss: 6.8834
Epoch 30: Loss: 6.8273
Epoch 31: Loss: 6.7099
Epoch 32: Loss: 6.5625
Epoch 33: Loss: 6.4601
Epoch 34: Loss: 6.4499
Epoch 35: Loss: 6.2420
Epoch 36: Loss: 6.0854
Epoch 37: Loss: 6.0031
Epoch 38: Loss: 5.8664
Epoch 39: Loss: 5.7423
Epoch 40: Loss: 5.5997
Epoch 41: Loss: 5.4771
Epoch 42: Loss: 5.2865
Epoch 43: Loss: 5.1814
Epoch 44: 

In [18]:
class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv3(x, edge_index)
        return F.log_softmax(x, dim=1)

# Initialize with a larger hidden layer
model = GCN(num_features=num_features, hidden_dim=32, num_classes=2)

optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

def train():
    model.train()
    optimizer.zero_grad()  # Clear gradients
    out = model(data)      # Forward pass
    loss = criterion(out[data.train_mask], data.y[data.train_mask])  # Compute loss
    loss.backward()        # Backpropagation
    optimizer.step()       # Update parameters
    return loss.item()

def validate():
    model.eval()
    with torch.no_grad():
        out = model(data)  # Forward pass
        val_loss = criterion(out[data.val_mask], data.y[data.val_mask])  # Compute validation loss
        pred = out[data.val_mask].max(1)[1]  # Get predictions
        correct = pred.eq(data.y[data.val_mask]).sum().item()
        accuracy = correct / data.val_mask.sum().item()  # Calculate accuracy
    return val_loss.item(), accuracy

patience = 10
best_val_loss = float('inf')
epochs_no_improve = 0
for epoch in range(200):
    train_loss = train()
    val_loss, accuracy = validate()

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epochs_no_improve == patience:
        print(f'Early stopping at epoch {epoch+1}')
        break

Early stopping at epoch 11


In [20]:
import torch

# Define the test function
def test():
    model.eval()  # Set model to evaluation mode
    out = model(data)  # Forward pass on the entire dataset
    pred = out.argmax(dim=1)  # Get predictions
    correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()  # Count correct predictions
    acc = correct / data.test_mask.sum()  # Compute accuracy
    test_loss = criterion(out[data.test_mask], data.y[data.test_mask]).item()  # Compute test loss
    return test_loss, acc.item()

# Call the test function after training
test_loss, test_acc = test()
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')

# Saving the trained model to a file
torch.save(model.state_dict(), 'gnn_model.pth')
print('Model saved to gnn_model.pth')

Test Loss: 0.5723, Test Accuracy: 0.7665
Model saved to gnn_model.pth
