# Reddit Dataset

## Data

In [1]:
from dgl.data import RedditDataset
import json
import numpy as np
import torch

# Load the Reddit dataset through DGL. With verbose=True the loader prints the
# size summary (nodes, edges, features, classes, split sizes) seen in the output.
data = RedditDataset(verbose=True)
# Take the first item of the dataset; the following cells read node features,
# labels and split masks from it.
# NOTE(review): presumably the dataset holds exactly one graph — confirm.
graph = data[0]

Finished data loading.
  NumNodes: 232965
  NumEdges: 114615892
  NumFeats: 602
  NumClasses: 41
  NumTrainingSamples: 153431
  NumValidationSamples: 23831
  NumTestSamples: 55703
Done loading data from cached files.


In [2]:
# Node features and node labels, pulled out of the graph's node data and
# converted to NumPy arrays (same order as the field names below).
node_feats, node_class = (
    graph.ndata[field].numpy() for field in ("feat", "label")
)
# Edge list: stack the tensors returned by graph.edges() along a new leading
# axis, then reverse the axes so that the stacked axis becomes the last one.
edge = torch.stack(graph.edges(), dim=0).numpy().transpose()

In [3]:
from gli.io import save_graph, Attribute

# Describe the node-level attributes as a small spec table of
# (attribute name, values, description, data type); both are stored in the
# "Tensor" format.
node_attrs = [
    Attribute(attr_name, attr_values, attr_doc, attr_dtype, "Tensor")
    for attr_name, attr_values, attr_doc, attr_dtype in (
        (
            "NodeFeature",
            node_feats,
            "Node features of Reddit dataset, incorporating pretrained GloVe CommonCrawl word embeddings.",
            "float",
        ),
        (
            "NodeLabel",
            node_class,
            "Node labels of Reddit dataset, int ranged from 0 to 40.",
            "int",
        ),
    )
]

# Save the graph and keep the returned metadata for inspection in the next cell.
# The BibTeX citation is split across lines with implicit string concatenation;
# the resulting string is unchanged.
metadata = save_graph(
    name="reddit",
    edge=edge,
    num_nodes=graph.num_nodes(),
    node_attrs=node_attrs,
    description="Reddit dataset.",
    cite=(
        "@article{hamilton2017inductive,\n"
        "title={Inductive representation learning on large graphs},\n"
        "author={Hamilton, Will and Ying, Zhitao and Leskovec, Jure},\n"
        "journal={Advances in neural information processing systems},\n"
        "volume={30},\n"
        "year={2017}}"
    ),
)


The metadata.json and graph data (.npz files) are now saved in the current directory.

In [4]:
# Pretty-print the metadata returned by save_graph as JSON (2-space indent)
# so the attribute descriptions and file references can be checked by eye.
print(json.dumps(metadata, indent=2))

{
  "description": "Reddit dataset.",
  "data": {
    "Node": {
      "NodeFeature": {
        "description": "Node features of Reddit dataset, incorporating pretrained GloVe CommonCrawl word embeddings.",
        "type": "float",
        "format": "Tensor",
        "file": "reddit__graph__bfb7717c1f9b72842adc4af257467122.npz",
        "key": "Node_NodeFeature"
      },
      "NodeLabel": {
        "description": "Node labels of Reddit dataset, int ranged from 0 to 40.",
        "type": "int",
        "format": "Tensor",
        "file": "reddit__graph__bfb7717c1f9b72842adc4af257467122.npz",
        "key": "Node_NodeLabel"
      }
    },
    "Edge": {
      "_Edge": {
        "file": "reddit__graph__bfb7717c1f9b72842adc4af257467122.npz",
        "key": "Edge_Edge"
      }
    },
    "Graph": {
      "_NodeList": {
        "file": "reddit__graph__Graph_NodeList__e4f77fbbcc4906feaf9f51e8d2a6da98.sparse.npz"
      }
    }
  },
  "citation": "@article{hamilton2017inductive,\ntitle={Inductiv

## Task

In [5]:
# Turn each split mask stored in the graph's node data into a 1-D NumPy array
# holding the indices of the selected nodes (masks assumed 1-D, one entry per
# node — TODO confirm).
# nonzero(as_tuple=True)[0] always yields a 1-D index tensor for a 1-D mask,
# whereas nonzero().squeeze() would collapse a split containing exactly one
# node into a 0-d scalar.
train_set, val_set, test_set = (
    graph.ndata[mask_key].nonzero(as_tuple=True)[0].numpy()
    for mask_key in ("train_mask", "val_mask", "test_mask")
)

In [6]:
from gli.io import save_task_node_classification

# Save the node-classification task and keep the returned task description
# for inspection in the next cell.
task_data = save_task_node_classification(
    # Task identity.
    name="reddit",
    task_id="1",
    description="Node classification on Reddit dataset.",
    # Learning problem: input attribute(s), prediction target, label count.
    feature=["Node/NodeFeature"],
    target="Node/NodeLabel",
    num_classes=41,
    # Node index arrays for each split.
    train_set=train_set,
    val_set=val_set,
    test_set=test_set,
)

change to int64


The task data (.json and .npz files) are now saved in the current directory.

In [7]:
# Pretty-print the task description returned by save_task_node_classification
# as JSON (2-space indent).
print(json.dumps(task_data, indent=2))

{
  "description": "Node classification on Reddit dataset.",
  "type": "NodeClassification",
  "feature": [
    "Node/NodeFeature"
  ],
  "target": "Node/NodeLabel",
  "num_classes": 41,
  "train_set": {
    "file": "reddit__task_node_classification_1__f966ab3b42876ca118130cd1ea52237f.npz",
    "key": "train_set"
  },
  "val_set": {
    "file": "reddit__task_node_classification_1__f966ab3b42876ca118130cd1ea52237f.npz",
    "key": "val_set"
  },
  "test_set": {
    "file": "reddit__task_node_classification_1__f966ab3b42876ca118130cd1ea52237f.npz",
    "key": "test_set"
  }
}


Test loading the data.

In [8]:
from gli.dataloading import read_gli_graph, read_gli_task, combine_graph_and_task

# Reload the graph and the task from the JSON files in the current directory.
g = read_gli_graph("./metadata.json")
t = read_gli_task("./task_node_classification_1.json")
# Combine graph and task into one dataset object.
# NOTE(review): this rebinds `data`, which the first cell used for the
# RedditDataset object — consider a distinct name to avoid confusion on re-runs.
data = combine_graph_and_task(g, t)
# Display the first item so its node attributes and split masks can be checked.
data[0]

Reddit dataset.
Node classification on Reddit dataset.


Graph(num_nodes=232965, num_edges=114615892,
      ndata_schemes={'NodeFeature': Scheme(shape=(602,), dtype=torch.float32), 'NodeLabel': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})

In [9]:
# Check the dtype of the reloaded training split (torch.int64 in the recorded output).
print(t.split['train_set'].dtype)

torch.int64
