# Deep Learning
## Exercise 11: Graph Neural Networks

In this exercise we will explore graph neural networks. We will lean heavily on the [tutorials](https://pytorch-geometric.readthedocs.io/en/latest/get_started/colabs.html) provided by `pytorch-geometric`. Take a look at those tutorials for more explanations and exercises.

In [None]:
# Helper function for visualization.
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def visualize(h, color):
    """Project the rows of ``h`` to 2-D with t-SNE and show them as a scatter plot.

    Args:
        h: Tensor of embeddings; it is detached, moved to the CPU and
            converted to a NumPy array before being projected.
        color: Per-point values handed to matplotlib as the scatter colours
            (rendered with the "Set2" colormap).
    """
    embeddings = h.detach().cpu().numpy()
    projector = TSNE(n_components=2, init='pca', learning_rate='auto')
    points = projector.fit_transform(embeddings)

    plt.figure(figsize=(10, 10))
    # Tick marks are removed from both axes.
    plt.xticks([])
    plt.yticks([])

    xs, ys = points[:, 0], points[:, 1]
    plt.scatter(xs, ys, s=70, c=color, cmap="Set2")
    plt.show()

### 1. Node Classification

We want to build a Graph Neural Network for node classification. 

The data reading is already implemented for you. As in the official [tutorial](https://colab.research.google.com/drive/14OvFnAXggxB8vM4e8vSURUp1TaKnovzX?usp=sharing), we will use the `Cora` dataset. Each node represents a document, which is described by a bag-of-words feature vector. Two documents are connected if there exists a citation link between them. Each document is classified into one of 7 categories.

In [None]:
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

# Load the Cora citation dataset from ./data/Planetoid, with NormalizeFeatures
# passed as the dataset transform.
dataset = Planetoid(root='./data/Planetoid', name='Cora', transform=NormalizeFeatures())

# Dataset-level summary: number of graphs, feature size and number of classes.
print()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
# Average degree is reported as edges per node.
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
# train_mask marks the nodes used for training; its sum is the training-set size.
print(f'Number of training nodes: {data.train_mask.sum()}')
# Label rate = fraction of all nodes that are training nodes.
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
print(f'Contains self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Note that the training dataset is quite small (only ~5%).

#### 1. Build a Graph Neural Network

Your network should consist of:
- a graph convolutional layer `torch_geometric.nn.GCNConv`, which takes the bag-of-words feature vectors and the edges as input and returns a 16-dim output.
- a ReLU layer
- a dropout layer with dropout probability of 0.3
- a second graph convolutional layer, which maps the dimensions down to the number of classes.

*Hint*: Check out the pytorch_geometric documentation to check the inputs and outputs of the graph convolutional layers.

Visualize your untrained node embeddings using the `visualize` function given above.

In [None]:
import torch
from torch import nn
from torch_geometric.nn import GCNConv

In [None]:
#ToDo: fill in the __init__() and forward() functions. Add arguments if needed.
class NodeClassificationModel(torch.nn.Module):
    """Exercise skeleton for the node-classification network.

    Both methods are placeholders that are meant to be filled in: no layers
    are created yet and ``forward`` performs no computation.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, edge_index):
        # Placeholder: the input features are handed back unchanged and
        # ``edge_index`` is not used yet.
        return x



In [None]:
class NodeClassificationModel(torch.nn.Module):
    """GCN for node classification.

    Architecture: ``GCNConv -> ReLU -> Dropout(0.3)``, followed by
    ``num_hidden_layers`` optional ``GCNConv -> ReLU -> Dropout`` blocks
    (used for the 3rd task), followed by a final ``GCNConv``.

    Args:
        in_features: Size of each input node feature vector.
        hidden_dimensions: Output size of the first and of every hidden
            convolution.
        out_features: Output size of the final convolution.
        num_hidden_layers: Number of additional hidden convolution blocks;
            ``0`` yields the plain two-convolution network.
    """

    def __init__(self, in_features, hidden_dimensions, out_features, num_hidden_layers):
        super().__init__()
        self.conv1 = GCNConv(in_features, hidden_dimensions)
        self.conv2 = GCNConv(hidden_dimensions, out_features)
        self.dropout = nn.Dropout(p=0.3)
        self.relu = nn.ReLU()
        # The hidden layers must live in an nn.ModuleList: modules kept in a
        # plain Python list are not registered, so their weights would be
        # missing from parameters() (never optimised), from state_dict()
        # (never checkpointed) and from .to(device) / print(model).
        self.hidden_layers = nn.ModuleList(
            [GCNConv(hidden_dimensions, hidden_dimensions) for _ in range(num_hidden_layers)]
        )

    def forward(self, x, edge_index):
        """Return the output of the final convolution for every node.

        Args:
            x: Node feature matrix passed to the first convolution.
            edge_index: Graph connectivity passed to every convolution.
        """
        x = self.conv1(x, edge_index)
        x = self.relu(x)
        x = self.dropout(x)

        # Optional deeper variant (3rd task); the loop is a no-op when the
        # module list is empty.
        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x, edge_index)
            x = self.relu(x)
            x = self.dropout(x)

        x = self.conv2(x, edge_index)
        return x

# Build the network with four additional hidden convolution blocks
# (num_hidden_layers=4) and a hidden size of 16, then print its structure.
ncm = NodeClassificationModel(in_features=dataset.num_features, out_features=dataset.num_classes,
                                hidden_dimensions=16, num_hidden_layers=4)
print(ncm)
# Switch to evaluation mode before the forward pass used for plotting.
ncm.eval()

# Forward pass of the untrained network over the whole graph; the per-node
# outputs are plotted with the node labels as colours.
model_output = ncm(data.x, data.edge_index)
visualize(model_output, color=data.y)

#### 2. Train the model
Use cross-entropy loss and the adam optimizer with a learning rate of 0.01 and L2 regularization with a penalty of $5\cdot 10^{-4}$. Train the model for a maximum of 200 epochs while using the validation data to implement early stopping. Evaluate your model accuracy on the test data.

Visualize the embeddings.

In [None]:
#ToDo: Train and Evaluate the model

In [None]:
def train(num_epochs, data, model, optimizer, loss_function, stop_criterion):
    """Train ``model`` full-batch on a single graph with early stopping.

    Each epoch performs one optimisation step on the nodes selected by
    ``data.train_mask`` and then scores the updated model on the nodes
    selected by ``data.val_mask``. Whenever the validation loss improves, the
    weights are checkpointed to ``GCN.pt``. Training stops once more than
    ``stop_criterion`` consecutive epochs pass without improvement. If a
    checkpoint was written, the best weights are loaded back into ``model``
    before returning.

    Args:
        num_epochs: Maximum number of epochs.
        data: Graph object providing ``x``, ``edge_index``, ``y``,
            ``train_mask`` and ``val_mask``.
        model: Module called as ``model(data.x, data.edge_index)``.
        optimizer: Optimiser over the model parameters.
        loss_function: Callable ``loss_function(predictions, targets)``.
        stop_criterion: Number of non-improving epochs tolerated before
            stopping.
    """
    best_val_loss = float('inf')
    counter = 0
    checkpoint_saved = False
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        train_loss = loss_function(out[data.train_mask], data.y[data.train_mask])
        train_loss.backward()
        optimizer.step()

        # Validate with a fresh forward pass in eval mode. Re-using the
        # training output would score dropout-perturbed predictions of the
        # weights *before* the step, while the checkpoint below stores the
        # weights *after* it.
        model.eval()
        with torch.no_grad():
            val_out = model(data.x, data.edge_index)
            val_loss, val_acc = evaluate(val_out, data.y, loss_function, data.val_mask)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'GCN.pt')
            checkpoint_saved = True
            counter = 0
        else:
            counter += 1

        if epoch % 5 == 0:
            print(f"Epoch {epoch} \t Train Loss: {train_loss:.4f} \t Val Loss: {val_loss:.4f} \t Val Acc: {val_acc:.4f}")
        if counter > stop_criterion:
            break

    # Only restore when this run actually wrote a checkpoint; otherwise a
    # missing or stale 'GCN.pt' from an earlier run would be loaded.
    if checkpoint_saved:
        model.load_state_dict(torch.load('GCN.pt'))

def evaluate(model_output, ground_truth, loss_function, split):
    """Compute loss and accuracy on the nodes selected by ``split``.

    Args:
        model_output: Per-node model outputs; the predicted class is the
            argmax over dimension 1.
        ground_truth: Per-node target labels.
        loss_function: Callable ``loss_function(predictions, targets)``.
        split: Mask used to index both outputs and labels.

    Returns:
        Tuple ``(loss, accuracy)`` where accuracy is the number of correct
        predictions in the split divided by ``split.sum()``.
    """
    targets = ground_truth[split]
    split_loss = loss_function(model_output[split], targets)

    predicted = model_output.argmax(dim=1)[split]
    num_correct = predicted.eq(targets).sum()
    return split_loss, num_correct / split.sum()
                  
# Fresh model without extra hidden blocks (num_hidden_layers=0), i.e. the
# two-convolution network, with 16 hidden dimensions.
ncm = NodeClassificationModel(in_features=dataset.num_features, out_features=dataset.num_classes,
                                hidden_dimensions=16, num_hidden_layers=0)
# Adam with lr=0.01; weight_decay=5e-4 is the L2 penalty asked for in the task.
optimizer = torch.optim.Adam(ncm.parameters(), lr=0.01, weight_decay=5e-4)
loss_function = torch.nn.CrossEntropyLoss()
# At most 200 epochs, with an early-stopping criterion of 20.
train(200, data, ncm, optimizer, loss_function, 20)

# Score the trained model on the nodes selected by test_mask and plot its
# per-node outputs coloured by label.
ncm.eval()
model_output = ncm(data.x, data.edge_index)
test_loss, test_acc = evaluate(model_output, data.y, loss_function, data.test_mask)
print(f"Test Accuracy: {test_acc.item()*100:.2f}%")
visualize(model_output, color=data.y)


#### 3. More complex models (optional)

Modify your Graph Neural Network. What happens if you increase the hidden dimension or use more layers?

### 2. Graph Classification

Now that we have successfully built a model to classify nodes in a graph, we turn to graph classification. Here we want to create representations of - and then predict the labels for - entire graphs.

As in the official [tutorial](https://colab.research.google.com/drive/1I8a0DfQ3fI7Njc62__mVXUlcAleUclnb), we use the `MUTAG` dataset. Each graph represents a chemical compound, which has to be classified into one of two classes depending on how it affects a bacterium. The data loading is again done for you. pytorch-geometric also provides its own dataloader, which takes care of any difficulties when creating batches for graphs. Check out the official tutorial if you want to know more.

In [None]:
import torch
from torch_geometric.datasets import TUDataset

# Load the MUTAG dataset from ./data/TUDataset.
dataset = TUDataset(root='./data/TUDataset', name='MUTAG')

# Dataset-level summary, including node and edge feature sizes.
print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')
print(f"Number of node features: {dataset.num_node_features}, Number of edge features: {dataset.num_edge_features}")

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
# Average degree is reported as edges per node.
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
print(f'Contains self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

# Seed the RNG before shuffling the dataset for the split below.
torch.manual_seed(12345)
dataset = dataset.shuffle()

# Split by position: first 100 graphs for training, next 50 for validation,
# the remainder for testing.
train_dataset = dataset[:100]
val_dataset = dataset[100:150]
test_dataset = dataset[150:]

print()
print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of validation graphs: {len(val_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

# `torch_geometric.data.DataLoader` is deprecated; the graph-aware loader
# lives in `torch_geometric.loader`.
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
print()
# Print every training mini-batch once to show what a batched graph looks like.
for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

As you can see, in contrast to the `Cora` dataset, here we have multiple -- but smaller -- graphs and each graph has just one label. Additionally we have edge features (`edge_attr`).

Note that the batched data contains a new attribute `batch`, which maps each node to its respective graph in the batch.

#### 1. Build a Graph Neural Network

To create representations for entire graphs, we first create embeddings for each node, which are then aggregated into an embedding for the entire graph in a so-called **readout layer**.

Build a graph neural network consisting of:
1. A `torch_geometric.nn.GINConv` layer. Use `torch.nn.Linear` for the nn, which maps the input from the number of node features to 64 hidden dimensions.
2. Two further `torch_geometric.nn.GINConv` layers with `torch.nn.Linear`, keeping in and out features at 64 dimensions.
3. Apply ReLU between the convolutional layers.
4. Aggregate the output of the last convolutional layer into a graph representation using `torch_geometric.nn.global_mean_pool`.
5. Apply a dropout layer with dropout probability of 0.5
6. Apply a fully connected layer, where the out features are equal to the number of classes.

*Hint*: Make sure that the pooling layer knows which nodes belong to which graph.

In [None]:
from torch import nn
from torch_geometric.nn import GINConv
from torch_geometric.nn import global_mean_pool

#ToDo: fill in the __init__ and forward functions. Add arguments if needed.
class GraphClassificationModel(torch.nn.Module):
    """Exercise skeleton for the graph-classification network.

    Both methods are placeholders that are meant to be filled in: no layers
    are created yet.
    """

    def __init__(self):
        super().__init__()

    def forward(self):
        # Placeholder: `x` is not assigned anywhere in this method yet.
        return x

# Instantiate the model without arguments and print its module structure.
gcm = GraphClassificationModel()
print(gcm)

In [None]:
from torch import nn
from torch.nn import functional as F
from torch_geometric.nn import GINConv
from torch_geometric.nn import global_mean_pool

class GraphClassificationModel(torch.nn.Module):
    """Graph classifier: three GINConv layers, mean-pool readout, dropout, linear head.

    Args:
        in_features: Input size of the linear map inside the first convolution.
        hidden_dimensions: Output size of every convolution and input size of
            the final linear layer.
        out_features: Output size of the final linear layer.
    """

    def __init__(self, in_features, hidden_dimensions, out_features):
        super().__init__()
        # Each GINConv wraps a single linear map as its inner network.
        self.conv1 = GINConv(nn=nn.Linear(in_features, hidden_dimensions))
        self.conv2 = GINConv(nn=nn.Linear(hidden_dimensions, hidden_dimensions))
        self.conv3 = GINConv(nn=nn.Linear(hidden_dimensions, hidden_dimensions))
        self.fc = nn.Linear(hidden_dimensions, out_features)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x, edge_index, batch):
        """Return the output of the final linear layer for every graph in the batch.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity passed to every convolution.
            batch: Passed to ``global_mean_pool`` to group nodes by graph.
        """
        # Node embeddings: ReLU after the first two convolutions only.
        node_emb = self.relu(self.conv1(x, edge_index))
        node_emb = self.relu(self.conv2(node_emb, edge_index))
        node_emb = self.conv3(node_emb, edge_index)

        # Readout: average the node embeddings of each graph.
        graph_emb = global_mean_pool(node_emb, batch)

        # Classifier head.
        return self.fc(self.dropout(graph_emb))

# Build the classifier with 64 hidden dimensions, sized from the dataset's
# node-feature and class counts, and print its module structure.
gcm = GraphClassificationModel(in_features=dataset.num_node_features, out_features=dataset.num_classes,
                              hidden_dimensions=64)
print(gcm)

#### 2. Train the model
Use cross-entropy loss and the adam optimizer with a learning rate of 0.01 and L2 regularization with a penalty of $5\cdot 10^{-4}$. Train the model for a maximum of 200 epochs while using the validation data to implement early stopping. Evaluate your model accuracy on the test data. 

In [None]:
#ToDo: Implement Training and Evaluation

In [None]:
def train(num_epochs, train_dataloader, val_dataloader, model, optimizer, loss_function, stop_criterion):
    """Train a graph classifier on mini-batches with early stopping.

    After every epoch the model is scored on ``val_dataloader`` via
    ``evaluate``. Whenever the validation loss improves, the weights are
    checkpointed to ``GCM.pt``. Training stops once more than
    ``stop_criterion`` consecutive epochs pass without improvement. If a
    checkpoint was written, the best weights are loaded back into ``model``
    before returning.

    Args:
        num_epochs: Maximum number of epochs.
        train_dataloader: Iterable of batches providing ``x``,
            ``edge_index``, ``batch`` and ``y``.
        val_dataloader: Loader handed to ``evaluate`` after every epoch.
        model: Module called as ``model(x, edge_index, batch)``.
        optimizer: Optimiser over the model parameters.
        loss_function: Callable ``loss_function(predictions, targets)``.
        stop_criterion: Number of non-improving epochs tolerated before
            stopping.
    """
    best_val_loss = float('inf')
    counter = 0
    checkpoint_saved = False
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        num_batches = 0
        for data in train_dataloader:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, data.batch)
            train_loss = loss_function(out, data.y)
            train_loss.backward()
            optimizer.step()
            epoch_loss += train_loss.item()
            num_batches += 1
        # Log the mean over all mini-batches of the epoch rather than only
        # the last batch; NaN marks an epoch that saw no batches at all.
        mean_train_loss = epoch_loss / num_batches if num_batches else float('nan')

        val_loss, val_acc = evaluate(val_dataloader, model, loss_function)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'GCM.pt')
            checkpoint_saved = True
            counter = 0
        else:
            counter += 1

        if epoch % 5 == 0:
            print(f"Epoch {epoch} \t Train Loss: {mean_train_loss:.4f} \t Val Loss: {val_loss:.4f} \t Val Acc: {val_acc:.4f}")
        if counter > stop_criterion:
            break

    # Only restore when this run actually wrote a checkpoint; otherwise a
    # missing or stale 'GCM.pt' from an earlier run would be loaded.
    if checkpoint_saved:
        model.load_state_dict(torch.load('GCM.pt'))

def evaluate(data_loader, model, loss_function):
    """Compute the mean batch loss and the accuracy of ``model`` over a loader.

    Args:
        data_loader: Sized iterable of batches providing ``x``,
            ``edge_index``, ``batch``, ``y`` and ``num_graphs``.
        model: Module called as ``model(x, edge_index, batch)``; it is put
            into evaluation mode.
        loss_function: Callable ``loss_function(predictions, targets)``.

    Returns:
        Tuple ``(loss, accuracy)``: the loss averaged over the batches and
        the fraction of correctly classified graphs.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no batches.
    """
    model.eval()
    cum_loss = 0
    tp = 0
    num_entries = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data in data_loader:
            out = model(data.x, data.edge_index, data.batch)
            loss = loss_function(out, data.y)
            cum_loss += loss.item()

            prediction = out.argmax(dim=1)
            tp += (prediction == data.y).sum()
            # Count graphs, not len(data): the length of a batch object is
            # not the number of graphs it contains.
            num_entries += data.num_graphs
    return cum_loss/len(data_loader), tp/num_entries


# Fresh classifier with 64 hidden dimensions.
gcm = GraphClassificationModel(in_features=dataset.num_node_features, out_features=dataset.num_classes,
                              hidden_dimensions=64)

# Adam with lr=0.01; weight_decay=5e-4 is the L2 penalty asked for in the task.
optimizer = torch.optim.Adam(gcm.parameters(), lr=0.01, weight_decay=5e-4)
loss_function = torch.nn.CrossEntropyLoss()
# At most 200 epochs, with an early-stopping criterion of 20.
train(200, train_loader, val_loader, gcm, optimizer, loss_function, 20)

# Score the trained model on the test loader.
test_loss, test_acc = evaluate(test_loader, gcm, loss_function)
print(f"Accuracy on Test Set: {test_acc:.4f}")


#### 3. Graph Neural Networks with Edge Attributes

As mentioned before, our dataset also contains edge attributes. Now we want to make use of them.

Replace the `torch_geometric.nn.GINConv` layers with `torch_geometric.nn.GINEConv`, which can deal with edge attributes. Train your model again. Can you achieve better performance?

In [None]:
from torch import nn
from torch.nn import functional as F
from torch_geometric.nn import GINEConv
from torch_geometric.nn import global_mean_pool

#ToDo: fill the __init__ and forward functions, add arguments if needed.
class GraphClassificationModelwE(torch.nn.Module):
    """Exercise skeleton for the graph classifier that uses edge attributes.

    Both methods are placeholders that are meant to be filled in: no layers
    are created yet.
    """

    def __init__(self):
        super().__init__()

    def forward(self):
        # Placeholder: `x` is not assigned anywhere in this method yet.
        return x

# Instantiate the model without arguments and print its module structure.
gcmwe = GraphClassificationModelwE()
print(gcmwe)

In [None]:
from torch import nn
from torch.nn import functional as F
from torch_geometric.nn import GINEConv
from torch_geometric.nn import global_mean_pool

class GraphClassificationModelwE(torch.nn.Module):
    """Graph classifier with edge attributes: three GINEConv layers, mean-pool readout, dropout, linear head.

    Args:
        in_features: Input size of the linear map inside the first convolution.
        hidden_dimensions: Output size of every convolution and input size of
            the final linear layer.
        out_features: Output size of the final linear layer.
        edge_features: Passed to every convolution as ``edge_dim``.
    """

    def __init__(self, in_features, hidden_dimensions, out_features, edge_features):
        super().__init__()
        # Each GINEConv wraps a single linear map as its inner network.
        self.conv1 = GINEConv(nn=nn.Linear(in_features, hidden_dimensions), edge_dim=edge_features)
        self.conv2 = GINEConv(nn=nn.Linear(hidden_dimensions, hidden_dimensions), edge_dim=edge_features)
        self.conv3 = GINEConv(nn=nn.Linear(hidden_dimensions, hidden_dimensions), edge_dim=edge_features)
        self.fc = nn.Linear(hidden_dimensions, out_features)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x, edge_index, edge_attr, batch):
        """Return the output of the final linear layer for every graph in the batch.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity passed to every convolution.
            edge_attr: Edge attributes passed to every convolution.
            batch: Passed to ``global_mean_pool`` to group nodes by graph.
        """
        # Node embeddings: ReLU after the first two convolutions only.
        node_emb = self.relu(self.conv1(x, edge_index, edge_attr))
        node_emb = self.relu(self.conv2(node_emb, edge_index, edge_attr))
        node_emb = self.conv3(node_emb, edge_index, edge_attr)

        # Readout: average the node embeddings of each graph.
        graph_emb = global_mean_pool(node_emb, batch)

        # Classifier head.
        return self.fc(self.dropout(graph_emb))

# Build the edge-aware classifier with 64 hidden dimensions, sized from the
# dataset's node-feature, edge-feature and class counts, and print it.
gcmwe = GraphClassificationModelwE(in_features=dataset.num_node_features, out_features=dataset.num_classes,
                              hidden_dimensions=64, edge_features=dataset.num_edge_features)
print(gcmwe)

In [None]:
#ToDo: Implement training and evaluation

In [None]:
def train(num_epochs, train_dataloader, val_dataloader, model, optimizer, loss_function, stop_criterion):
    """Train an edge-attribute graph classifier on mini-batches with early stopping.

    After every epoch the model is scored on ``val_dataloader`` via
    ``evaluate``. Whenever the validation loss improves, the weights are
    checkpointed to ``GCM.pt``. Training stops once more than
    ``stop_criterion`` consecutive epochs pass without improvement. If a
    checkpoint was written, the best weights are loaded back into ``model``
    before returning.

    Args:
        num_epochs: Maximum number of epochs.
        train_dataloader: Iterable of batches providing ``x``,
            ``edge_index``, ``edge_attr``, ``batch`` and ``y``.
        val_dataloader: Loader handed to ``evaluate`` after every epoch.
        model: Module called as ``model(x, edge_index, edge_attr, batch)``.
        optimizer: Optimiser over the model parameters.
        loss_function: Callable ``loss_function(predictions, targets)``.
        stop_criterion: Number of non-improving epochs tolerated before
            stopping.
    """
    best_val_loss = float('inf')
    counter = 0
    checkpoint_saved = False
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        num_batches = 0
        for data in train_dataloader:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, data.edge_attr, data.batch)
            train_loss = loss_function(out, data.y)
            train_loss.backward()
            optimizer.step()
            epoch_loss += train_loss.item()
            num_batches += 1
        # Log the mean over all mini-batches of the epoch rather than only
        # the last batch; NaN marks an epoch that saw no batches at all.
        mean_train_loss = epoch_loss / num_batches if num_batches else float('nan')

        val_loss, val_acc = evaluate(val_dataloader, model, loss_function)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'GCM.pt')
            checkpoint_saved = True
            counter = 0
        else:
            counter += 1

        if epoch % 5 == 0:
            print(f"Epoch {epoch} \t Train Loss: {mean_train_loss:.4f} \t Val Loss: {val_loss:.4f} \t Val Acc: {val_acc:.4f}")
        if counter > stop_criterion:
            break

    # Only restore when this run actually wrote a checkpoint; otherwise a
    # missing or stale 'GCM.pt' from an earlier run would be loaded.
    if checkpoint_saved:
        model.load_state_dict(torch.load('GCM.pt'))

def evaluate(data_loader, model, loss_function):
    """Compute the mean batch loss and the accuracy of ``model`` over a loader.

    Args:
        data_loader: Sized iterable of batches providing ``x``,
            ``edge_index``, ``edge_attr``, ``batch``, ``y`` and
            ``num_graphs``.
        model: Module called as ``model(x, edge_index, edge_attr, batch)``;
            it is put into evaluation mode.
        loss_function: Callable ``loss_function(predictions, targets)``.

    Returns:
        Tuple ``(loss, accuracy)``: the loss averaged over the batches and
        the fraction of correctly classified graphs.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no batches.
    """
    model.eval()
    cum_loss = 0
    tp = 0
    num_entries = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data in data_loader:
            out = model(data.x, data.edge_index, data.edge_attr, data.batch)
            loss = loss_function(out, data.y)
            cum_loss += loss.item()

            prediction = out.argmax(dim=1)
            tp += (prediction == data.y).sum()
            # Count graphs, not len(data): the length of a batch object is
            # not the number of graphs it contains.
            num_entries += data.num_graphs
    return cum_loss/len(data_loader), tp/num_entries


# Fresh edge-aware classifier with 64 hidden dimensions.
gcmwe = GraphClassificationModelwE(in_features=dataset.num_node_features, out_features=dataset.num_classes,
                              hidden_dimensions=64, edge_features=dataset.num_edge_features)

# Adam with lr=0.01 and weight_decay=5e-4, cross-entropy loss.
optimizer = torch.optim.Adam(gcmwe.parameters(), lr=0.01, weight_decay=5e-4)
loss_function = torch.nn.CrossEntropyLoss()
# At most 200 epochs, with an early-stopping criterion of 20.
train(200, train_loader, val_loader, gcmwe, optimizer, loss_function, 20)

# Score the trained model on the test loader.
test_loss, test_acc = evaluate(test_loader, gcmwe, loss_function)
print(f"Accuracy on Test Set: {test_acc:.4f}")
