# Day 6 

In [1]:
import torch
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.datasets import TUDataset
from torch_geometric.datasets import Planetoid



## Task 1
Following the PyTorch Geometric introduction tutorial at
https://pytorch-geometric.readthedocs.io/en/latest/get_started/introduction.html

In [2]:
# Toy graph: 3 nodes, 2 undirected edges (0-1 and 1-2). Each undirected edge is
# stored once per direction, hence 4 columns: row 0 = sources, row 1 = targets.
sources = [0, 1, 1, 2]
targets = [1, 0, 2, 1]
edge_index = torch.tensor([sources, targets], dtype=torch.long)
print(edge_index)

# One scalar feature per node, shaped [num_nodes, 1].
x = torch.tensor([-1, 0, 1], dtype=torch.float).unsqueeze(-1)
print(x)

tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])
tensor([[-1.],
        [ 0.],
        [ 1.]])


In [3]:
data = Data(x=x, edge_index=edge_index)
data

Data(x=[3, 1], edge_index=[2, 4])

In [4]:
data.validate(raise_on_error=False)

True

In [5]:
# Same toy graph, this time written as one (source, target) pair per row,
# i.e. a tensor of shape [4, 2].
edge_index = torch.tensor([[0, 1],
                           [1, 0],
                           [1, 2],
                           [2, 1]], dtype=torch.long)

print(edge_index)

x = torch.tensor([[-1], [0], [1]], dtype=torch.float)
print(x)

# Transpose to the [2, num_edges] layout shown in the Data repr below;
# .contiguous() materialises the transposed view in contiguous memory.
data = Data(x=x, edge_index=edge_index.t().contiguous())
data

tensor([[0, 1],
        [1, 0],
        [1, 2],
        [2, 1]])
tensor([[-1.],
        [ 0.],
        [ 1.]])


Data(x=[3, 1], edge_index=[2, 4])

In [6]:
data.validate(raise_on_error=False)

True

In [7]:
print(data.keys())
print(data.edge_index)
print(data.x)

['edge_index', 'x']
tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])
tensor([[-1.],
        [ 0.],
        [ 1.]])


In [8]:
# Iterating a Data object yields (attribute name, value) pairs; only the name is reported.
for key, _value in data:
    print(f'{key} found in data')

x found in data
edge_index found in data


In [9]:
'edge_attr' in data

False

In [10]:
data.num_nodes

3

In [11]:
data.num_edges

4

In [12]:
data.num_node_features

1

In [13]:
data.has_isolated_nodes()

False

In [14]:
data.has_self_loops()

False

In [15]:
data.is_directed()

False

In [16]:
dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')

In [17]:
len(dataset)

600

In [18]:
print(dataset)

ENZYMES(600)


In [19]:
dataset.num_classes

6

In [20]:
dataset.num_node_features

3

In [21]:
dataset.num_node_labels

3

In [22]:
data = dataset[0]
print(data)

Data(edge_index=[2, 168], x=[37, 3], y=[1])


In [23]:
data.is_undirected()

True

In [24]:
train_dataset = dataset[:540]
print(train_dataset)

ENZYMES(540)


In [25]:
test_dataset = dataset[540:]
test_dataset

ENZYMES(60)

In [26]:
dataset = Planetoid(root='/tmp/Cora', name='Cora')

In [27]:
len(dataset)

1

In [28]:
dataset.num_classes

7

In [29]:
dataset.num_node_features

1433

In [30]:
data=dataset[0]
data

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [31]:
# Whether every edge of the graph is stored in both directions.
print(data.is_undirected())

# Number of nodes selected by each predefined split mask, in train / val / test order.
for mask_name in ('train_mask', 'val_mask', 'test_mask'):
    print(data[mask_name].sum().item())


True
140
500
1000


In [32]:
from torch_geometric.utils import scatter
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES', use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# The loop variables are named `batch` / `graph_means` so they do not overwrite
# the notebook-level `data` and `x`. Bare expressions inside a loop body are never
# displayed by the notebook, so only the explicit print is kept.
for batch in loader:
    # batch.batch assigns every node to its graph within the mini-batch;
    # averaging node features per graph gives one row per graph.
    graph_means = scatter(batch.x, batch.batch, dim=0, reduce='mean')
    print(graph_means.size())


torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([32, 21])
torch.Size([24, 21])


In [33]:
# from torch_geometric.datasets import ShapeNet

# dataset = ShapeNet(root='/tmp/ShapeNet', categories=['Airplane'])

# dataset[0]


## Task 2 
Following the tutorial at:
https://ogb.stanford.edu/docs/home/

In [34]:
from ogb.nodeproppred import PygNodePropPredDataset
# DataLoader lives in torch_geometric.loader; the torch_geometric.data alias is deprecated.
from torch_geometric.loader import DataLoader

  from pkg_resources import parse_version


In [35]:
# Override torch.load inside PyG
#
# The dataset loaders call torch.load without `weights_only`; on PyTorch versions
# where that flag defaults to True, loading the pickled processed dataset fails.
#
# SECURITY: weights_only=False lets torch.load unpickle arbitrary Python objects.
# Only use this with dataset files from a trusted source.
#
# Re-running this cell must not stack wrappers. If torch.load is already our
# wrapper, recover the original function it recorded instead of wrapping the
# wrapper (which would make the wrapper call itself and recurse forever).
torch_load = getattr(torch.load, "_unpatched_load", torch.load)


def torch_load_unsafe(*args, **kwargs):
    """Call the original ``torch.load`` with ``weights_only`` forced to False.

    All positional and keyword arguments are forwarded unchanged, except that
    any caller-supplied ``weights_only`` value is overridden.
    """
    kwargs["weights_only"] = False
    return torch_load(*args, **kwargs)


# Remember the original so a later re-run of this cell can find it again.
torch_load_unsafe._unpatched_load = torch_load
torch.load = torch_load_unsafe

from ogb.nodeproppred import PygNodePropPredDataset
dataset = PygNodePropPredDataset(name="ogbn-proteins", root="dataset/")

In [36]:
data = dataset[0]
data

Data(num_nodes=132534, edge_index=[2, 79122504], edge_attr=[79122504, 8], node_species=[132534, 1], y=[132534, 112])

In [37]:
data.num_edges

79122504

In [38]:
data.num_nodes

132534

In [39]:
print(data.y)

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])


## Task 3

First, follow the NetworkX tutorial at
https://networkx.org/documentation/stable/tutorial.html

In [40]:
import networkx as nx
from torch_geometric.utils import to_networkx

### Sample 1000 edges randomly (with seed) from the graph.

In [41]:
# Fix the global RNG seed so the same edges are drawn on every run.
torch.manual_seed(42)  # reproducibility

edge_index = data.edge_index
num_edges = edge_index.shape[1]

# Shuffle all edge positions and keep the first 1000 as the sample;
# `perm` holds the column indices of the sampled edges in the full graph.
perm = torch.randperm(num_edges)[:1000]
sampled_edge_index = edge_index.index_select(1, perm)

### Convert the sampled edges into a networkx object by adding the edges and their edge attributes.

In [42]:
# Endpoints of the sampled edges as plain Python ints (hashable node ids for NetworkX).
src, dst = sampled_edge_index.tolist()
G = nx.Graph()
G.add_edges_from(zip(src, dst))

# Gather the attribute rows with the SAME permutation that selected the edges.
# Indexing edge_attr by the running position 0..999 would attach the features of
# the first 1000 edges of the full graph rather than those of the sampled edges.
sampled_edge_attr = data.edge_attr[perm].tolist()
edge_features = {(u, v): feat for u, v, feat in zip(src, dst, sampled_edge_attr)}
nx.set_edge_attributes(G, edge_features, "feat")

print(G)

Graph with 1966 nodes and 1000 edges


### Calculate the number of connected components

In [43]:
num_components = nx.number_connected_components(G)
print(num_components)


966


### Identify the hub nodes and their degrees.

In [44]:
# Degree of every node, kept in G's node-iteration order.
degree_by_node = {node: G.degree[node] for node in G.nodes}

# Hubs are all nodes that share the maximum degree (0 and no hubs for an empty graph).
highest_degree = max(degree_by_node.values(), default=0)
hubs = [node for node, deg in degree_by_node.items() if deg == highest_degree]

print(hubs, highest_degree)
print(f'There are {len(hubs)} hub nodes with degree {highest_degree}')



[90652, 130319, 88287, 88003, 130125, 77953, 73947, 98122, 113925, 12148, 32807, 36087, 109466, 99902, 67471, 2111, 113062, 99983, 115864, 113829, 94128, 71067, 67341, 4952, 117054, 117451, 119411, 7919, 93150, 110728, 33742, 3104, 35409, 127804] 2
There are 34 hub nodes with degree 2


### Visualize the Graph with plotly

In [45]:
import plotly.graph_objects as go

In [46]:
import networkx as nx
import plotly.graph_objects as go

# Seeded spring layout: maps each node id to its 2-D position.
pos = nx.spring_layout(G, seed=42)

# Every edge contributes [start, end, None]; the None separates one edge's
# line segment from the next within the single line trace.
edge_x, edge_y = [], []
for u, v in G.edges():
    (x0, y0), (x1, y1) = pos[u], pos[v]
    edge_x.extend((x0, x1, None))
    edge_y.extend((y0, y1, None))

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    line=dict(width=3, color='gray'),
    hoverinfo='none',
)

# Node coordinates, in G's node order.
node_x = [pos[node][0] for node in G.nodes()]
node_y = [pos[node][1] for node in G.nodes()]

# Neighbour count per node drives both the marker colour and the hover text.
node_adjacencies = [len(neighbours) for _node, neighbours in G.adjacency()]
node_text = [
    f"Node {node} has {len(neighbours)} connections"
    for node, neighbours in G.adjacency()
]

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers',
    hoverinfo='text',
    text=node_text,
    marker=dict(
        size=10,
        color=node_adjacencies,
        colorscale='YlGnBu',
        showscale=True,
        colorbar=dict(
            thickness=15,
            title='Node Degree',
            xanchor='left',
        ),
    ),
)

# Assemble the figure: edges are drawn first so the node markers sit on top.
layout = go.Layout(
    title='NetworkX Graph with Plotly',
    title_x=0.5,
    showlegend=False,
    hovermode='closest',
    margin=dict(b=0, l=0, r=0, t=40),
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
)
fig = go.Figure(data=[edge_trace, node_trace], layout=layout)

fig.show()


## Task 5

### Create the train, validation, and test datasets (using the available splits)

In [47]:
from torch_geometric.loader import dataloader
from torch_geometric.nn.models import Node2Vec

In [48]:
# Predefined split of the dataset: a mapping with 'train', 'valid' and 'test' entries.
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = (
    split_idx[name] for name in ('train', 'valid', 'test')
)

# Attach the index tensors to the graph object (assigned left to right:
# train, test, val — the order in which they appear in the printed repr).
data.train_idx, data.test_idx, data.val_idx = train_idx, test_idx, valid_idx
print(data)

Data(num_nodes=132534, edge_index=[2, 79122504], edge_attr=[79122504, 8], node_species=[132534, 1], y=[132534, 112], train_idx=[86619], test_idx=[24679], val_idx=[21236])


In [52]:
print(data.edge_index.size())

torch.Size([2, 79122504])


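### Train a Node2Vec model

Note: `Node2Vec` needs either the `pyg-lib` or the `torch-cluster` package installed; without one of them the model cell below raises an `ImportError`.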

In [50]:
# Node2Vec needs the optional 'pyg-lib' or 'torch-cluster' package for its
# random-walk sampling; without one of them this constructor raises ImportError.
model = Node2Vec(
    edge_index=data.edge_index,
    embedding_dim=128,
    walk_length=20,
    context_size=10,
    # `data` carries no node feature matrix, so pass the node count explicitly
    # instead of having it inferred from the largest index in edge_index.
    num_nodes=data.num_nodes,
    # The training cell optimises with torch.optim.SparseAdam, which only accepts
    # sparse gradients, so the embedding table must be created as sparse.
    sparse=True,
)


ImportError: 'Node2Vec' requires either the 'pyg-lib' or 'torch-cluster' package

In [None]:

# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model = model.to(device)
data = data.to(device)

# Loader built by the model itself; the training loop unpacks each of its
# batches into a (pos_rw, neg_rw) pair.
loader = model.loader(
    batch_size=128,      # tune this based on GPU RAM/CPU
    shuffle=True,
    num_workers=4,       # increase if you have CPU cores
)


In [None]:
def train_model(Epochs=100):
    """Train the notebook-level ``model`` on the batches of the notebook-level ``loader``.

    Args:
        Epochs: Number of full passes over ``loader``.

    Returns:
        list[float]: Mean batch loss of each epoch, in order (empty when
        ``Epochs`` is 0).
    """
    model.train()
    # The walk batches are not guaranteed to live on the model's device, so move
    # them explicitly; this is a no-op when they already match.
    target_device = next(model.parameters()).device
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)
    epoch_losses = []
    for epoch in range(Epochs):
        total_loss = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(target_device), neg_rw.to(target_device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # max(..., 1) guards against division by zero for an empty loader.
        epoch_losses.append(total_loss / max(len(loader), 1))
    return epoch_losses


In [None]:
print(data)
print(data.node_species)

# Flatten the [num_nodes, 1] species column into a 1-D vector of species ids.
species_id = data.node_species.squeeze()
print(species_id)

# Collapse runs of equal ids into (id, run length) pairs.
# NOTE(review): these are per-species totals only if each species occupies a
# single contiguous block of nodes — the printed ids suggest so; confirm.
unique_species, counts = torch.unique_consecutive(species_id, return_counts=True)

Data(num_nodes=132534, edge_index=[2, 79122504], edge_attr=[79122504, 8], node_species=[132534, 1], y=[132534, 112])
tensor([[3702],
        [3702],
        [3702],
        ...,
        [7955],
        [7955],
        [7955]])
tensor([3702, 3702, 3702,  ..., 7955, 7955, 7955])


In [None]:
# Walk the species ids and their node counts in lockstep.
for s, n_nodes in zip(unique_species, counts):
    print(f'Species {s} has {n_nodes} nodes')



Species 3702 has 25449 nodes
Species 4932 has 6568 nodes
Species 6239 has 18108 nodes
Species 7227 has 13015 nodes
Species 9606 has 19354 nodes
Species 511145 has 4125 nodes
Species 10090 has 21236 nodes
Species 7955 has 24679 nodes
tensor([  3702,   4932,   6239,   7227,   7955,   9606,  10090, 511145])
