In [None]:
!pip install torch_geometric



In [None]:
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

# Load the 'Cora' Planetoid dataset under data/Cora, with NormalizeFeatures
# passed as the dataset transform.
dataset = Planetoid(
    root='data/Cora',
    name='Cora',
    transform=NormalizeFeatures(),
)

# The first element of the dataset is the object every later cell refers to as `data`.
data = dataset[0]


In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two stacked GCNConv layers with ReLU and dropout applied between them.

    Args:
        in_feats: input size of the first GCNConv layer.
        hidden_feats: output size of the first layer / input size of the second.
        num_classes: output size of the second GCNConv layer.
        dropout: probability handed to ``F.dropout`` on the hidden activations.
    """

    def __init__(self, in_feats, hidden_feats, num_classes, dropout):
        super().__init__()
        self.dropout = dropout
        # Layers are created in first-to-second order.
        self.conv1 = GCNConv(in_feats, hidden_feats)
        self.conv2 = GCNConv(hidden_feats, num_classes)

    def forward(self, x, edge_index):
        """Return the second layer's output for node features ``x`` and ``edge_index``.

        No softmax is applied here; the caller receives the raw layer output.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=self.dropout, training=self.training)
        return self.conv2(hidden, edge_index)

# Seed PyTorch's global RNG so that random operations drawing from it (parameter
# initialisation, dropout) repeat from run to run. Re-running this cell restarts
# from the same RNG state and rebuilds the model from scratch.
torch.manual_seed(0)

# GCN with 16 hidden units, sized from the dataset's feature and class counts.
model = GCN(dataset.num_features, 16, dataset.num_classes, dropout=0.5)

# Adam over all model parameters, with learning rate 0.01 and weight decay 5e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)


In [None]:
def train():
    """Run one full-graph optimisation step and return the training loss as a float.

    Uses the notebook globals ``model``, ``optimizer`` and ``data``. The loss is
    cross-entropy computed only on the nodes selected by ``data.train_mask``.
    """
    model.train()
    optimizer.zero_grad()

    logits = model(data.x, data.edge_index)
    labelled = data.train_mask
    loss = F.cross_entropy(logits[labelled], data.y[labelled])

    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def test():
    """Return ``[train_acc, val_acc, test_acc]`` for the current global ``model``.

    Predictions are the arg-max over dimension 1 of the model output for the whole
    graph; each accuracy is the fraction of correct predictions inside one mask.
    """
    model.eval()
    pred = model(data.x, data.edge_index).argmax(dim=1)
    splits = (data.train_mask, data.val_mask, data.test_mask)
    return [
        int((pred[split] == data.y[split]).sum()) / int(split.sum())
        for split in splits
    ]

# Train for 200 epochs; evaluate and report on every 20th epoch only.
for epoch in range(1, 201):
    loss = train()
    if epoch % 20:
        continue
    train_acc, val_acc, test_acc = test()
    print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, '
          f'Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}')


Epoch 020, Loss: 1.6711, Train: 0.8714, Val: 0.6280, Test: 0.6240
Epoch 040, Loss: 1.2894, Train: 0.9071, Val: 0.7100, Test: 0.7190
Epoch 060, Loss: 0.9110, Train: 0.9571, Val: 0.7700, Test: 0.7670
Epoch 080, Loss: 0.7109, Train: 0.9643, Val: 0.7720, Test: 0.7830
Epoch 100, Loss: 0.5672, Train: 0.9714, Val: 0.7940, Test: 0.7960
Epoch 120, Loss: 0.4817, Train: 0.9786, Val: 0.8020, Test: 0.8040
Epoch 140, Loss: 0.4225, Train: 0.9857, Val: 0.8100, Test: 0.8120
Epoch 160, Loss: 0.3962, Train: 0.9857, Val: 0.8040, Test: 0.8240
Epoch 180, Loss: 0.3664, Train: 0.9857, Val: 0.8020, Test: 0.8130
Epoch 200, Loss: 0.3317, Train: 0.9929, Val: 0.8000, Test: 0.8140


In [None]:
import torch
# Show the version string of the torch build loaded in this kernel.
# NOTE(review): presumably used to choose the matching wheel index URL in the next install cell — confirm.
print(torch.__version__)


2.6.0+cu124


In [None]:
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
  -f https://data.pyg.org/whl/torch-2.6.0+cu124.html


Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html


In [None]:
from torch_geometric.nn import Node2Vec
# Node2Vec model over the graph's edge list, producing 64-dimensional embeddings.
node2vec = Node2Vec(
    data.edge_index,
    embedding_dim=64,
    walk_length=20,
    context_size=10,
    walks_per_node=10,
    p=1, q=1,
    sparse=True
)
loader = node2vec.loader(batch_size=128, shuffle=True)

# Dedicated name on purpose: binding this to `optimizer` would overwrite the Adam
# optimizer that train() reads as a global, so re-running the GCN training cell
# after this one would step the wrong optimizer.
node2vec_optimizer = torch.optim.SparseAdam(node2vec.parameters(), lr=0.01)

node2vec.train()
for epoch in range(1, 51):
    # Accumulates the per-batch losses, so the printed value is a sum over
    # batches rather than a mean.
    total_loss = 0
    for pos_rw, neg_rw in loader:
        node2vec_optimizer.zero_grad()
        loss = node2vec.loss(pos_rw, neg_rw)
        loss.backward()
        node2vec_optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch:02d}, Loss: {total_loss:.4f}')

node2vec.eval()
# Detached CPU copy of the learned embedding table as a NumPy array, for the
# scikit-learn experiments in the following cells.
embeddings = node2vec.embedding.weight.detach().cpu().numpy()

Epoch 01, Loss: 122.1980
Epoch 02, Loss: 89.1411
Epoch 03, Loss: 69.9602
Epoch 04, Loss: 56.9911
Epoch 05, Loss: 48.0397
Epoch 06, Loss: 41.2609
Epoch 07, Loss: 36.1559
Epoch 08, Loss: 32.1848
Epoch 09, Loss: 29.2310
Epoch 10, Loss: 27.0399
Epoch 11, Loss: 25.3061
Epoch 12, Loss: 24.0149
Epoch 13, Loss: 23.0232
Epoch 14, Loss: 22.2289
Epoch 15, Loss: 21.6225
Epoch 16, Loss: 21.0690
Epoch 17, Loss: 20.7292
Epoch 18, Loss: 20.4030
Epoch 19, Loss: 20.1429
Epoch 20, Loss: 19.9238
Epoch 21, Loss: 19.7064
Epoch 22, Loss: 19.5767
Epoch 23, Loss: 19.4656
Epoch 24, Loss: 19.3051
Epoch 25, Loss: 19.2427
Epoch 26, Loss: 19.1447
Epoch 27, Loss: 19.0718
Epoch 28, Loss: 19.0018
Epoch 29, Loss: 18.9521
Epoch 30, Loss: 18.8658
Epoch 31, Loss: 18.8059
Epoch 32, Loss: 18.7707
Epoch 33, Loss: 18.7520
Epoch 34, Loss: 18.6913
Epoch 35, Loss: 18.6662
Epoch 36, Loss: 18.6053
Epoch 37, Loss: 18.6216
Epoch 38, Loss: 18.5716
Epoch 39, Loss: 18.5593
Epoch 40, Loss: 18.5560
Epoch 41, Loss: 18.5091
Epoch 42, Loss:

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Node feature matrix as a NumPy array on the CPU.
text_feats = data.x.cpu().numpy()

def evaluate(X):
    """Fit a random forest on the training nodes of ``X`` and return test accuracy.

    Args:
        X: NumPy feature matrix with one row per node, in the same node order
            as ``data``.

    Returns:
        Accuracy of the fitted classifier on the nodes selected by
        ``data.test_mask``.
    """
    # Convert masks and labels to NumPy explicitly. Indexing a NumPy array with a
    # torch tensor (and handing torch label tensors to scikit-learn) relies on
    # implicit conversion, which is not available for tensors held on a non-CPU device.
    train_idx = data.train_mask.cpu().numpy()
    test_idx = data.test_mask.cpu().numpy()
    labels = data.y.cpu().numpy()

    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X[train_idx], labels[train_idx])
    preds = clf.predict(X[test_idx])
    return accuracy_score(labels[test_idx], preds)


# Score three feature sets with the same evaluate() routine: the Node2Vec
# embeddings alone, the node features alone, and both stacked column-wise.
acc_node2vec, acc_text, acc_combined = (
    evaluate(features)
    for features in (embeddings, text_feats, np.hstack([embeddings, text_feats]))
)

print(
    f"Node2Vec-only: {acc_node2vec:.4f}",
    f"Text-only:    {acc_text:.4f}",
    f"Combined:     {acc_combined:.4f}",
    sep="\n",
)

Node2Vec-only: 0.6530
Text-only:    0.5660
Combined:     0.7160


In [None]:
# NumPy copies of the labels and the three split masks, for use with scikit-learn.
y = data.y.cpu().numpy()
train_mask, val_mask, test_mask = (
    mask.cpu().numpy()
    for mask in (data.train_mask, data.val_mask, data.test_mask)
)

# Embedding columns first, then the node-feature columns.
X_combined = np.hstack((embeddings, text_feats))


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# Training rows of the combined feature matrix and their labels.
X_train, y_train = X_combined[train_mask], y[train_mask]

# Linear-kernel SVM; only the C parameter is searched.
svc = SVC(kernel='linear')
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# 5-fold cross-validated grid search scored by accuracy, fitted on the training rows only.
grid = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='accuracy', cv=5)
grid.fit(X_train, y_train)

print("Best SVM C:", grid.best_params_['C'])
print("CV Accuracy:", grid.best_score_)


Best SVM C: 0.1
CV Accuracy: 0.7214285714285714


In [None]:
# Test rows of the combined feature matrix and their labels.
X_test, y_test = X_combined[test_mask], y[test_mask]

# Score the best estimator found by the SVM grid search on the test rows.
best_svc = grid.best_estimator_
test_acc = best_svc.score(X_test, y_test)
print("Test Accuracy:", test_acc)


Test Accuracy: 0.709


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Training rows of the combined feature matrix and their labels.
X_train, y_train = X_combined[train_mask], y[train_mask]

# Search over forest size and maximum tree depth (None leaves depth unrestricted).
param_grid = dict(
    n_estimators=[50, 100, 200, 500],
    max_depth=[None, 10, 20, 50],
)

rf = RandomForestClassifier(random_state=0)

# 5-fold cross-validated grid search scored by accuracy; n_jobs=-1 requests all available cores.
grid_rf = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_rf.fit(X_train, y_train)

print("Best RF params:", grid_rf.best_params_)
print("CV Accuracy:", grid_rf.best_score_)


Best RF params: {'max_depth': None, 'n_estimators': 500}
CV Accuracy: 0.7785714285714285


In [None]:
# Test rows of the combined feature matrix and their labels.
X_test, y_test = X_combined[test_mask], y[test_mask]

# Score the best estimator found by the random-forest grid search on the test rows.
best_rf = grid_rf.best_estimator_
test_acc_rf = best_rf.score(X_test, y_test)
print("Test Accuracy (RF):", test_acc_rf)


Test Accuracy (RF): 0.745


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Random-walk classification with teleportation back to the seed set.
#
# NOTE(review): this cell reads num_classes, y_gcc, seed_nodes_gcc, trans_probs and
# unlabeled_mask, none of which are defined in the cells visible here — confirm they
# are created earlier so the notebook survives Restart & Run All.
# From the usage below: seed_nodes_gcc[c] is the collection of seed nodes for class c,
# and trans_probs[node] unpacks to (neighbours, probabilities) for that node.

# Seed NumPy's global RNG so the walks, and therefore the reported table, are repeatable.
np.random.seed(0)

NUM_WALKS = 1000    # walks started from every seed node
WALK_LENGTH = 100   # visits recorded per walk

teleport_ps = [0.0, 0.1, 0.2]
results = {}

for p in teleport_ps:
    # 1. Visit counts, one row per class and one column per node
    visits = np.zeros((num_classes, len(y_gcc)), dtype=int)

    # 2. For each class c
    for c in range(num_classes):
        seeds = seed_nodes_gcc[c]

        # 3. For each seed in that class
        for seed in seeds:

            # 4. Perform NUM_WALKS random walks of WALK_LENGTH recorded visits each
            for _walk in range(NUM_WALKS):
                cur = seed
                for _step in range(WALK_LENGTH):
                    # 5. Record the visit (the starting seed itself is counted too)
                    visits[c, cur] += 1

                    # 6. With probability p jump to a random seed of the same class,
                    #    otherwise move to a neighbour drawn from trans_probs[cur]
                    if np.random.rand() < p:
                        cur = np.random.choice(seeds)
                    else:
                        nbrs, probs = trans_probs[cur]
                        cur = np.random.choice(nbrs, p=probs)

    # 7. Assign each node to the class with max visits
    y_pred = visits.argmax(axis=0)

    # Evaluate only on the unlabeled nodes
    y_true = y_gcc[unlabeled_mask]
    y_hat  = y_pred[unlabeled_mask]
    acc = accuracy_score(y_true, y_hat)
    f1  = f1_score(y_true, y_hat, average='macro')

    results[p] = (acc, f1)

# Render the per-teleport-probability results as a small text table:
# one header line, then one row per entry of `results` in insertion order.
table_lines = ["p    Accuracy   Macro F1"]
table_lines.extend(
    f"{p:.1f}  {acc:.4f}     {f1:.4f}" for p, (acc, f1) in results.items()
)
print("\n".join(table_lines))


p    Accuracy   Macro F1
0.0  0.7499     0.7151
0.1  0.7469     0.7406
0.2  0.7321     0.7266
