<a href="https://colab.research.google.com/github/LeoVogiatzis/GNN_based_NILM/blob/main/GNN_based_NILM/tree/main/notebooks/Graph_convolution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git


[K     |████████████████████████████████| 7.9 MB 4.2 MB/s 
[K     |████████████████████████████████| 3.5 MB 4.4 MB/s 
[K     |████████████████████████████████| 407 kB 4.1 MB/s 
[K     |████████████████████████████████| 45 kB 3.1 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [None]:
import networkx as nx
import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Dataset, Data
import numpy as np
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Report the library versions and CUDA availability of the current runtime.
print(f"Torch version: {torch.__version__}")
print(f"Cuda available: {torch.cuda.is_available()}")
print(f"Torch geometric version: {torch_geometric.__version__}")

Torch version: 1.10.0+cu111
Cuda available: True
Torch geometric version: 2.0.2


In [None]:
import io
from google.colab import files

# files.upload() stores the chosen files in the current working directory and
# returns {filename: bytes}. The notebook reads the graph from data/raw/
# (both nx.read_graphml and NilmDataset(root='data')), so every upload is
# also written there; otherwise the following cells raise FileNotFoundError.
RAW_DIR = os.path.join('data', 'raw')
os.makedirs(RAW_DIR, exist_ok=True)

uploaded = files.upload()
for uploaded_name, uploaded_bytes in uploaded.items():
    with open(os.path.join(RAW_DIR, uploaded_name), 'wb') as raw_file:
        raw_file.write(uploaded_bytes)

Saving dishwaser_20.graphml to dishwaser_20.graphml


In [None]:
# Parse the GraphML file into a networkx graph; `G` is reused by later cells
# for node inspection, label extraction and the train/test split.
G = nx.read_graphml('data/raw/dishwaser_20.graphml')

FileNotFoundError: ignored

In [None]:
# Display every node together with its attribute dictionary.
G.nodes(data=True)

In [None]:
# Node identifiers of G converted to integers, in node order.
list(map(int, G.nodes()))

In [None]:
class NilmDataset(Dataset):
    """Single-graph PyG dataset built from one GraphML file.

    The raw file ``<root>/raw/<filename>`` is read with networkx and turned
    into one ``torch_geometric.data.Data`` object containing:

    * ``x``          - the ``drift`` node attribute, shape ``[num_nodes, 1]``
    * ``edge_index`` - both directions of every edge, shape ``[2, 2 * num_edges]``
    * ``edge_attr``  - the ``gaussian_kernel`` edge attribute, one row per
      directed edge (same order as ``edge_index``)
    * ``y``          - the ``state`` node attribute as ``int64`` class ids
    * ``train_mask`` / ``test_mask`` - boolean node masks of a fixed random
      70/30 split

    The object is cached as a single file in ``<root>/processed`` and is
    available as ``self.data`` (with ``self.train_mask`` / ``self.test_mask``
    as shortcuts) whether it was just processed or loaded from the cache.
    """

    # Number of node classes (the value the dataset has always reported).
    num_classes = 2

    # Parameters of the node-level train/test split.
    _TEST_FRACTION = 0.30
    _SPLIT_SEED = 42

    def __init__(self, root, filename, test=False, transform=None, pre_transform=None):
        """
        root = Where the dataset should be stored. This folder is split
        into raw_dir (downloaded dataset) and processed_dir (processed data).
        filename = Name of the GraphML file expected inside raw_dir.
        test = If True the processed graph is cached as data_test_0.pt,
        otherwise as data_0.pt.
        transform / pre_transform = forwarded to the PyG Dataset base class.
        """
        self.test = test
        self.filename = filename
        super(NilmDataset, self).__init__(root, transform, pre_transform)
        # process() is skipped by the base class when the cache already
        # exists, so the graph is always (re)loaded from the processed file
        # here to guarantee that `data` and the masks are populated.
        self.data = torch.load(self.processed_paths[0])
        self.train_mask = self.data.train_mask
        self.test_mask = self.data.test_mask

    @property
    def raw_file_names(self):
        """ If this file exists in raw_dir, the download is not triggered.
            (The download func. is not implemented here)
        """
        return self.filename

    @property
    def processed_file_names(self):
        """ If these files are found in processed_dir, processing is skipped.

        The dataset holds exactly one graph, so exactly one file is listed
        (the same name process() writes and get() reads).
        """
        return ['data_test_0.pt'] if self.test else ['data_0.pt']

    def download(self):
        """Nothing to download: the GraphML file must already be in raw_dir."""
        pass

    def process(self):
        """Convert the raw GraphML file into a Data object and cache it.

        Raises:
            ValueError: if some node lacks the ``drift`` or ``state``
                attribute, because features/labels would no longer line up
                with the nodes.
        """
        self.G = nx.read_graphml(self.raw_paths[0])
        print(len(self.G.nodes), len(self.G.edges))
        n_nodes = self.G.number_of_nodes()

        node_feats = self._get_node_features(self.G)
        edge_feats = self._get_edge_features(self.G)
        edge_index = self._get_adjacency_info(self.G)
        labels = self._get_labels(nx.get_node_attributes(self.G, 'state'))

        # get_node_attributes silently skips nodes without the attribute;
        # fail loudly instead of producing misaligned tensors.
        if node_feats.size(0) != n_nodes or labels.size(0) != n_nodes:
            raise ValueError(
                "every node must carry both a 'drift' and a 'state' attribute "
                f"(nodes={n_nodes}, drift={node_feats.size(0)}, state={labels.size(0)})"
            )

        train_mask, test_mask = self._split_masks(n_nodes)
        data = Data(x=node_feats, edge_index=edge_index,
                    edge_attr=edge_feats, y=labels,
                    train_mask=train_mask, test_mask=test_mask)

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        # A plain Dataset stores one file per graph; there is no collate()
        # step (that belongs to InMemoryDataset).
        torch.save(data, self.processed_paths[0])

    def _split_masks(self, n_nodes):
        """Return boolean (train_mask, test_mask) tensors of length n_nodes.

        Node positions are split with a fixed seed so the split is
        reproducible across runs.
        """
        train_idx, test_idx = train_test_split(np.arange(n_nodes),
                                               test_size=self._TEST_FRACTION,
                                               random_state=self._SPLIT_SEED)
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[torch.as_tensor(train_idx, dtype=torch.long)] = True
        test_mask[torch.as_tensor(test_idx, dtype=torch.long)] = True
        return train_mask, test_mask

    def _get_node_features(self, graph):
        """
        This will return a matrix / 2d array of the shape
        [Number of Nodes, Node Feature size]

        We could also use torch_geometric.from_networkx to create a Data object
        with both adjacency and features, but instead we do it manually here
        """
        all_node_feats = list(nx.get_node_attributes(graph, 'drift').values())

        all_node_feats = np.asarray(all_node_feats)
        all_node_feats = all_node_feats.reshape((-1, 1))
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, graph):
        """ This will return a matrix with the gaussian filter kernel of all
            edges, shape [2 * Number of Edges, 1].

            Each value is repeated twice because every edge is stored in both
            directions by _get_adjacency_info.
        """
        all_edge_feats = []
        for e in graph.edges(data=True):
            all_edge_feats += [[e[2]['gaussian_kernel']], [e[2]['gaussian_kernel']]]

        # view(-1, 1) keeps the [*, 1] shape even for a graph without edges.
        return torch.tensor(all_edge_feats, dtype=torch.float).view(-1, 1)

    def _get_adjacency_info(self, graph):
        """
        Return the edge index in COO format, shape [2, 2 * Number of Edges].

        Endpoints are expressed as node positions (the order of graph.nodes),
        which is the row order of the feature and label tensors. For node ids
        "0".."N-1" stored in order this equals int(node id); for any other
        ids it keeps the indices inside [0, N).

        We could also use torch_geometric.from_networkx to create a Data object
        with both adjacency and features, but instead we do it manually here
        """
        node_position = {node: pos for pos, node in enumerate(graph.nodes)}

        edge_indices = []
        for edge in graph.edges:
            i = node_position[edge[0]]  # get source
            j = node_position[edge[1]]  # get destination
            edge_indices += [[i, j], [j, i]]  # undirected graph

        edge_indices = torch.tensor(edge_indices, dtype=torch.long)
        return edge_indices.t().contiguous().view(2, -1)

    def _get_labels(self, labels):
        """Convert a {node: state} mapping into an int64 label tensor."""
        # int() normalises bool / float / numeric-string attribute values.
        labels = np.asarray([int(value) for value in labels.values()], dtype=np.int64)
        return torch.tensor(labels, dtype=torch.int64)

    def len(self):
        """Number of graphs in the dataset (one processed file per graph)."""
        return len(self.processed_file_names)

    def get(self, idx):
        """ - Equivalent to __getitem__ in pytorch
            - Is not needed for PyG's InMemoryDataset
        """
        if self.test:
            data = torch.load(os.path.join(self.processed_dir, f'data_test_{idx}.pt'))
        else:
            data = torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))
        return data

In [None]:
# Instantiate the dataset rooted at ./data for the GraphML file and print the
# object stored in its `data` attribute.
graph_mains_1 = NilmDataset(root='data', filename='dishwaser_20.graphml')
print(graph_mains_1.data)

In [None]:
# Attach the class count and a reproducible 70/30 node split to the graph.
# `graph_mains_1.data` is addressed directly: the short alias `data` is only
# bound in a later cell, so using it here fails on a fresh kernel.
graph_mains_1.data.num_classes = 2
labels = np.asarray([int(G.nodes[i]['state']) for i in G.nodes]).astype(np.int64)
nodes = np.asarray([int(i) for i in G.nodes()]).astype(np.int64)
# splitting the nodes into train and test (no separate validation set)
X_train, X_test, y_train, y_test = train_test_split(pd.Series(nodes),
                                                    pd.Series(labels),
                                                    test_size=0.30,
                                                    random_state=42)
n_nodes = G.number_of_nodes()
# create train and test masks for data; the Series index holds the node
# positions selected by the split
train_mask = torch.zeros(n_nodes, dtype=torch.bool)
test_mask = torch.zeros(n_nodes, dtype=torch.bool)
train_mask[torch.as_tensor(X_train.index.to_numpy(), dtype=torch.long)] = True
test_mask[torch.as_tensor(X_test.index.to_numpy(), dtype=torch.long)] = True
graph_mains_1.data['train_mask'] = train_mask
graph_mains_1.data['test_mask'] = test_mask

In [None]:
# Display the boolean node mask that selects the training nodes.
graph_mains_1.data.train_mask

In [None]:
# graph_mains_1.data, graph_mains_1.slices = graph_mains_1.collate([graph_mains_1])

In [None]:
# `data` is the short alias for the dataset's graph used by all later cells.
data = graph_mains_1.data
print(data)
print('==============================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
# num_edges counts directed entries of edge_index, so this is edges per node.
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
# print(f'Number of training nodes: {data.train_mask.sum()}')
# print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
# print(f'Is undirected: {data.is_undirected()}')

In [None]:
from IPython.display import Javascript  # Restrict height of output cell.
# Colab-only: cap the height of this cell's output frame at 300px so the long
# edge list printed below stays scrollable.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Print the connectivity transposed, i.e. one (source, target) pair per row.
edge_index = data.edge_index
print(edge_index.t())

In [None]:
# Display the boolean node mask that selects the test nodes.
graph_mains_1.data.test_mask

In [None]:
import torch
from torch.nn import Linear, ReLU
from torch_geometric.nn import GCNConv
import torch.nn.functional as F


class GCN(torch.nn.Module):
    """Two-layer GCN node classifier returning per-node log-probabilities.

    Args:
        in_channels: size of each input node feature vector. When omitted it
            is read from the notebook-level ``data`` graph
            (``data.num_features``) so that ``GCN()`` keeps working.
        hidden_channels: width of the hidden GCN layer (default 1, the width
            this model has always used).
        num_classes: number of output classes. The output layer must have one
            channel per class: with a single channel ``log_softmax`` over
            ``dim=1`` is constantly 0 and ``nll_loss`` rejects the label 1.
    """

    def __init__(self, in_channels=None, hidden_channels=1, num_classes=2):
        super(GCN, self).__init__()
        # Fixed seed so the layer initialisation is reproducible.
        torch.manual_seed(1234)
        if in_channels is None:
            in_channels = data.num_features
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        """Compute class log-probabilities for every node.

        Args:
            x: node feature matrix, one row per node.
            edge_index: graph connectivity in COO format.

        Returns:
            Tensor with one row per node and ``num_classes`` columns holding
            log-probabilities.
        """
        # The inputs passed by the caller are used as-is (no global graph),
        # so the model can be evaluated on any graph / device.
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the graph tensors and the model to the selected device.
data =  data.to(device)

model = GCN().to(device) 
# model = GCN()
print(model)

In [None]:
import torch.nn.functional as F

# Seed torch's RNG before training.
torch.manual_seed(42)

# Optimiser configuration: the optimiser class is looked up by name in
# torch.optim so it can be swapped by editing `optimizer_name` only.
optimizer_name = "Adam"
lr = 1e-1
optimizer = getattr(torch.optim, optimizer_name)(model.parameters(), lr=lr)
# Value used by the training loop below to bound the number of steps.
epochs = 200

def train():
  """Run one full-batch optimisation step on the training nodes.

  Uses the notebook-level ``model``, ``optimizer`` and ``data``.

  Returns:
    float: negative log-likelihood loss on the training nodes for this step.
  """
  model.train()
  optimizer.zero_grad()
  # forward() takes (x, edge_index). Everything is read from `data`, the copy
  # that was moved to `device`, so logits, labels and mask share one device.
  out = model(data.x, data.edge_index)
  loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
  loss.backward()
  optimizer.step()
  return loss.item()

@torch.no_grad()
def test():
  """Evaluate the notebook-level ``model`` on ``data``.

  Returns:
    tuple: (train accuracy, test accuracy), each the fraction of correctly
    classified nodes under ``data['train_mask']`` / ``data['test_mask']``.
  """
  model.eval()
  # forward() takes (x, edge_index); calling model() without them fails.
  logits = model(data.x, data.edge_index)
  pred_all = logits.argmax(dim=1)

  mask1 = data['train_mask']
  acc1 = pred_all[mask1].eq(data.y[mask1]).sum().item() / mask1.sum().item()
  mask = data['test_mask']
  acc = pred_all[mask].eq(data.y[mask]).sum().item() / mask.sum().item()
  return acc1,acc

# range(1, epochs) stops after epochs - 1 steps; the + 1 makes the loop run
# exactly `epochs` optimisation steps (epoch numbers 1..epochs).
for epoch in range(1, epochs + 1):
  train()

train_acc,test_acc = test()

print('#' * 70)
print('Train Accuracy: %s' %train_acc )
print('Test Accuracy: %s' % test_acc)
print('#' * 70)

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from torch_geometric.nn import Node2Vec
from sklearn.model_selection import train_test_split


In [None]:
from torch.nn import Linear, ReLU
from torch_geometric.nn import Sequential, GCNConv
# Input feature size and output size of the stacked model below.
in_channels = 1
out_channels =1
# Two GCN layers (64 channels each) with ReLU, followed by a linear head.
# NOTE: this rebinds `model`, replacing the GCN instance created earlier.
model = Sequential('x, edge_index', [
    (GCNConv(in_channels, 64), 'x, edge_index -> x'),
    ReLU(inplace=True),
    (GCNConv(64, 64), 'x, edge_index -> x'),
    ReLU(inplace=True),
    Linear(64, out_channels),
])

In [None]:
# Show the layer structure of the model defined in the previous cell.
print(model)

In [None]:
# NOTE(review): `model` is the `Sequential` stack built above. `.loader(...)`
# and `SparseAdam` look like leftovers from a Node2Vec workflow (Node2Vec is
# imported earlier but never instantiated) -- confirm whether this cell should
# build a Node2Vec model first, or be replaced by a dense optimiser for the
# Sequential model. TODO confirm before relying on this cell.
loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

In [None]:
# Node labels taken from the `state` attribute of G, cast to int64.
labels = np.asarray([G.nodes[i]['state'] for i in G.nodes]).astype(np.int64)

# assigning colours to node labels: label 0 -> blue, anything else -> red
color_map = []
for i in labels:
    if i == 0:
        color_map.append('blue')
    else: 
        color_map.append('red')  

# project the embeddings to 2D space with t-SNE
# NOTE(review): `list(model)` passes the model object itself, not a matrix of
# node embeddings, and nothing in this notebook produces 128-dimensional
# embeddings -- presumably the output of a trained embedding model was
# intended here. TODO confirm the embedding source.
m = TSNE(learning_rate=20, random_state=42)
tsne_features = m.fit_transform(list(model))

# plot the transformed embeddings, one point per node coloured by its label
plt.figure(figsize=(9,6)) 
plt.scatter(x = tsne_features[:,0], 
            y = tsne_features[:,1],
            c = color_map,
            s =600,
            alpha=0.6)