In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

import torch
import os

from tqdm import tqdm
from torch_geometric.data import Dataset, Data



In [2]:
class MyOwnDataset(Dataset):
    """Two-graph dataset (train and test) built from text files in ``raw_dir``.

    For each split, ``<split>_data.txt`` supplies one row per node (feature
    columns, then a label column, then a node-id column) and
    ``<split>_edges.txt`` supplies the edge list as pairs of node ids.
    ``process`` turns each split into a single ``Data`` object and saves it in
    ``processed_dir``; ``get`` loads it back.
    """

    # Number of leading columns of ``*_data.txt`` that are used as node features.
    NUM_FEATS = 19

    def __init__(self, root, test=False, transform=None, pre_transform=None, pre_filter=None):
        # `test` is kept in the signature for backward compatibility but is not
        # used: both splits are always processed and the split is chosen in `get`.
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        """File names expected in the raw directory."""
        return ['train_data.txt', 'train_edges.txt', 'test_data.txt', 'test_edges.txt']

    @property
    def processed_file_names(self):
        """File names written by ``process``; index 0 is train, index 1 is test."""
        return ['data_train.pt', 'data_test.pt']

    def download(self):
        """Nothing to download; the raw files must already be in ``raw_dir``."""
        pass

    def feature_normalize(self, mx):
        """Row-normalize a sparse matrix (each row is divided by its sum).

        Rows whose sum is zero are left as all zeros instead of becoming inf.
        """
        rowsum = np.array(mx.sum(1))
        r_inv = np.power(rowsum, -1).flatten()
        r_inv[np.isinf(r_inv)] = 0.
        r_mat_inv = sp.diags(r_inv)
        return r_mat_inv.dot(mx)

    def adj_normalize(self, mx):
        """Symmetrically normalize a sparse matrix: D^(-1/2) . A . D^(-1/2).

        D is the diagonal matrix of row sums of ``mx``. Rows with a zero sum
        get a scaling factor of 0 (same convention as ``feature_normalize``)
        so that no inf values leak into the result.
        """
        rowsum = np.array(mx.sum(1))  # Sum each row
        r_inv = np.power(rowsum, -1/2).flatten()  # Inverse square root
        r_inv[np.isinf(r_inv)] = 0.
        r_mat_inv = sp.diags(r_inv)  # Diagonal scaling matrix

        # D^(-1/2).A.D^(-1/2)
        return r_mat_inv.dot(mx).dot(r_mat_inv)

    def sparse_mx_to_torch_sparse_tensor(self, sparse_mx):
        """Return the (row, col) indices of a scipy sparse matrix as a tensor.

        Despite the historical name, no sparse tensor is built and the stored
        values are dropped: the result is a ``2 x nnz`` int64 tensor of
        coordinates, which is what ``organize_data`` uses as ``edge_index``.
        """
        coo = sparse_mx.tocoo()
        return torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))

    def encode_onehot(self, labels):
        """One-hot encode a 1-D sequence of labels as an int32 matrix.

        Column ``k`` corresponds to the ``k``-th distinct label in sorted
        order, so the encoding is reproducible across runs and does not depend
        on set iteration order. The mapping is still derived from the labels
        passed in, so two splits only share class ids when they contain the
        same set of labels.
        """
        classes, inverse = np.unique(np.asarray(labels), return_inverse=True)
        inverse = inverse.reshape(-1)
        labels_onehot = np.zeros((inverse.shape[0], classes.shape[0]), dtype=np.int32)
        labels_onehot[np.arange(inverse.shape[0]), inverse] = 1
        return labels_onehot

    def organize_data(self, which):
        """Build the ``Data`` object for one split from its raw text files.

        Args:
            which: ``'train'`` selects the training files; any other value
                selects the test files.

        Returns:
            ``Data`` with row-normalized float features ``x``, class indices
            ``y`` and ``edge_index`` (coordinates of the adjacency matrix with
            self-loops added).

        Raises:
            ValueError: if the edge file references a node id that does not
                appear in the data file.
        """
        split = 'train' if which == 'train' else 'test'
        data_path = os.path.join(self.raw_dir, split + '_data.txt')
        edges_path = os.path.join(self.raw_dir, split + '_edges.txt')

        idx_features_labels = np.genfromtxt(data_path, dtype=np.dtype(str))

        # Feature columns as a sparse matrix; label is the second-to-last column.
        features = sp.csr_matrix(idx_features_labels[:, 0:self.NUM_FEATS], dtype=np.float32)
        labels = self.encode_onehot(idx_features_labels[:, -2])

        # Node ids are the last column; map them to consecutive row positions
        # so that arbitrary ids can index the adjacency matrix.
        idx = np.array(idx_features_labels[:, -1], dtype=np.int32)
        idx_map = {j: i for i, j in enumerate(idx)}

        edges_unordered = np.genfromtxt(edges_path, dtype=np.int32)
        edge_ids = edges_unordered.flatten()

        # Fail with a readable message instead of the TypeError that a `None`
        # from `idx_map.get` would trigger during the int32 conversion below.
        unknown = ~np.isin(edge_ids, idx)
        if unknown.any():
            sample = np.unique(edge_ids[unknown])[:5].tolist()
            raise ValueError(
                '{} references node ids that are missing from {}, e.g. {}'.format(
                    edges_path, data_path, sample))

        edges = np.array(list(map(idx_map.get, edge_ids)),
                         dtype=np.int32).reshape(edges_unordered.shape)

        # Build adjacency matrix (one entry per listed edge, as given).
        num_nodes = labels.shape[0]
        adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                            shape=(num_nodes, num_nodes),
                            dtype=np.float32)

        print(adj.shape)

        features = self.feature_normalize(features)

        # Self-loops are added before normalizing. Only the coordinates of the
        # result are kept further down; the normalized weights are not stored.
        adj = self.adj_normalize(adj + sp.eye(adj.shape[0]))

        features = torch.FloatTensor(np.array(features.todense()))
        labels = torch.LongTensor(np.where(labels)[1])  # one-hot -> class index
        adj = self.sparse_mx_to_torch_sparse_tensor(adj)

        return Data(x=features, edge_index=adj, y=labels)

    def process(self):
        """Build both splits and save them into ``processed_dir``."""
        for split, file_name in (('train', 'data_train.pt'), ('test', 'data_test.pt')):
            split_data = self.organize_data(split)
            torch.save(split_data, os.path.join(self.processed_dir, file_name))

    def len(self):
        """Number of stored graphs (one per processed file)."""
        return len(self.processed_file_names)

    def get(self, which):
        """Load one processed split.

        Args:
            which: ``'train'`` or ``'test'``, or an integer position into
                ``processed_file_names`` (0 = train, 1 = test). Integer
                positions are what the base class passes when the dataset is
                indexed, e.g. ``dataset[0]``.

        Returns:
            The ``Data`` object saved by ``process`` for that split.

        Raises:
            ValueError: if ``which`` is a string other than the two split names.
            IndexError: if ``which`` is an integer outside the valid range.
        """
        file_names = self.processed_file_names
        if isinstance(which, str):
            by_split = {'train': file_names[0], 'test': file_names[1]}
            if which not in by_split:
                raise ValueError(
                    "which must be 'train', 'test' or an integer index, got {!r}".format(which))
            file_name = by_split[which]
        else:
            file_name = file_names[which]

        # NOTE(review): recent PyTorch releases may default `torch.load` to
        # weights-only unpickling, which can reject `Data` objects - confirm
        # against the installed version.
        return torch.load(os.path.join(self.processed_dir, file_name))

In [3]:
# Dataset root directory, passed to MyOwnDataset as `root`.
filepath = 'data/wtracks/ttbar/'
# Instantiate the dataset; in the recorded run this triggered processing of
# both splits (see the "Processing..." / "Done!" output of this cell).
dataset = MyOwnDataset(filepath)

Processing...


(2038461, 2038461)
(679547, 679547)


Done!


In [4]:
# Training split: node, edge and feature counts, one per line.
data = dataset.get('train')

print(
    data.num_nodes,
    data.num_edges,
    data.num_features,
    data.num_node_features,
    sep='\n',
)

2038461
35235779
19
19


In [5]:
# Test split: node, edge and feature counts, one per line.
data = dataset.get('test')

print(
    data.num_nodes,
    data.num_edges,
    data.num_features,
    data.num_node_features,
    sep='\n',
)

679547
11696875
19
19
