In [1]:
!pip install torch torch-geometric -q


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from sklearn.metrics import (
    precision_recall_curve, precision_score, recall_score,
    f1_score, accuracy_score, confusion_matrix
)


In [3]:
edges = pd.read_csv("/content/twitter_combined.txt", sep=" ", header=None, names=["src","dst"])

G = nx.from_pandas_edgelist(edges, "src", "dst", create_using=nx.DiGraph())
G = nx.convert_node_labels_to_integers(G)

print("Nodes:", G.number_of_nodes())
print("Edges:", G.number_of_edges())

Nodes: 58608
Edges: 1282125


In [4]:
deg = nx.degree_centrality(G)
deg_n = np.array(list(deg.values()))

clust = nx.clustering(G.to_undirected())
clust_n = np.array(list(clust.values()))

pr = nx.pagerank(G, alpha=0.85)
pr_n = np.array(list(pr.values()))

def norm(v): return (v - v.min()) / (v.max() - v.min() + 1e-8)

features = np.vstack([norm(deg_n), norm(clust_n), norm(pr_n)]).T
x = torch.tensor(features, dtype=torch.float)
print("Feature Matrix:", x.shape)


Feature Matrix: torch.Size([58608, 3])


In [5]:
pr_vals = np.array(list(pr.values()))
thr = np.quantile(pr_vals, 0.95)

y = torch.tensor((pr_vals >= thr).astype(int), dtype=torch.long)
print("Influencers:", int(y.sum()), "/", len(y))


Influencers: 2931 / 58608


In [6]:
edge_index = torch.tensor(list(G.edges()), dtype=torch.long).t().contiguous()
data = Data(x=x, edge_index=edge_index, y=y)

data


Data(x=[58608, 3], edge_index=[2, 1282125], y=[58608])

In [7]:
num_nodes = data.num_nodes
perm = torch.randperm(num_nodes)

train_size = int(0.7 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask  = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[perm[:train_size]] = True
test_mask[perm[train_size:]] = True


In [8]:
num_pos = int(y.sum())
num_neg = len(y) - num_pos

weight_pos = num_neg / num_pos
weights = torch.tensor([1.0, weight_pos], dtype=torch.float)

weights


tensor([ 1.0000, 18.9959])

In [9]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels=64, out_channels=2, dropout=0.4):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, out_channels)
        self.dropout = dropout

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv3(x, edge_index)
        return x


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data = data.to(device)

model = GraphSAGE(in_channels=data.x.shape[1]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.004, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss(weight=weights.to(device))

print("Training GraphSAGE...")

for epoch in range(1, 201):
    model.train()
    optimizer.zero_grad()

    out = model(data.x, data.edge_index)
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()

    if epoch % 20 == 0:
        model.eval()
        with torch.no_grad():
            probs = torch.softmax(out[test_mask], dim=1)[:,1]
            pred = (probs > 0.5).long()
            acc = (pred == data.y[test_mask]).float().mean().item()
        print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | Test Acc: {acc:.4f}")

print("GraphSAGE Training Complete!")


Training GraphSAGE...
Epoch 020 | Loss: 0.5016 | Test Acc: 0.7103
Epoch 040 | Loss: 0.3909 | Test Acc: 0.8073
Epoch 060 | Loss: 0.3378 | Test Acc: 0.8530
Epoch 080 | Loss: 0.3113 | Test Acc: 0.8701
Epoch 100 | Loss: 0.2756 | Test Acc: 0.8991
Epoch 120 | Loss: 0.2494 | Test Acc: 0.9142
Epoch 140 | Loss: 0.2245 | Test Acc: 0.9269
Epoch 160 | Loss: 0.2129 | Test Acc: 0.9204
Epoch 180 | Loss: 0.1944 | Test Acc: 0.9301
Epoch 200 | Loss: 0.1781 | Test Acc: 0.9425
GraphSAGE Training Complete!


In [12]:
from sklearn.metrics import precision_recall_curve

model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
    probs = torch.softmax(out[test_mask], dim=1)[:,1].cpu().numpy()
    true_y = data.y[test_mask].cpu().numpy()

prec, rec, thr = precision_recall_curve(true_y, probs)
f1s = 2 * (prec * rec) / (prec + rec + 1e-8)
best_idx = f1s.argmax()
best_thr = thr[best_idx] if best_idx < len(thr) else 0.5

pred = (probs > best_thr).astype(int)

print("\n GraphSAGE Results:")
print("Optimal Threshold:", round(best_thr,3))
print("Accuracy:", accuracy_score(true_y, pred))
print("Precision:", precision_score(true_y, pred))
print("Recall:", recall_score(true_y, pred))
print("F1:", f1_score(true_y, pred))
print("Confusion Matrix:\n", confusion_matrix(true_y, pred))



 GraphSAGE Results:
Optimal Threshold: 0.848
Accuracy: 0.975203321389979
Precision: 0.7322297955209348
Recall: 0.823658269441402
F1: 0.7752577319587629
Confusion Matrix:
 [[16395   275]
 [  161   752]]


In [14]:
pagerank_arr = np.array(list(pr.values()))

test_idx = test_mask.nonzero(as_tuple=False).squeeze().cpu().numpy()
pred_idx = set(test_idx[pred == 1])
true_idx = set(np.argsort(pagerank_arr)[-int(0.05 * len(pagerank_arr)):])

overlap = len(pred_idx & true_idx) / len(true_idx)
print(f"Overlap with PageRank Top 5% = {overlap*100:.2f}%")


Overlap with PageRank Top 5% = 25.63%
