# Cora Example

In [1]:
%load_ext autoreload
%autoreload 2

## Data

In [2]:
import json
import numpy as np
import torch
import scipy.sparse as sparse
from dgl.data import CoraGraphDataset

dataset = CoraGraphDataset()
graph = dataset[0]

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


We can save the data file in only one file `cora.npz`.

In [3]:
# node features
node_feats = sparse.csr_matrix(graph.ndata["feat"].numpy())
# node labels
node_class = graph.ndata["label"].numpy()  # (2708,)
# edge list
edge = torch.stack(graph.edges()).numpy().T
print(edge.shape)

(10556, 2)


## Load raw text for cora dataset

In [4]:
import sys
sys.path.append("../../")
from gli.raw_text_utils import load_data

_, raw_text_dict = load_data(dataset="cora", use_text=True)

print(raw_text_dict.keys())

for key, item in raw_text_dict.items():
    print(key, item[:1])


  from .autonotebook import tqdm as notebook_tqdm


dict_keys(['title', 'abs', 'label'])
title ['Title: The megaprior heuristic for discovering protein sequence patterns  ']
abs ['Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength

In [5]:
from gli.io import save_graph, Attribute

node_attrs = [
    Attribute(
        "NodeFeature",
        node_feats,
        "Node features of Cora dataset, 1/0-valued vectors.",
        "int",
        "SparseTensor",
    ),
    Attribute(
        "NodeLabel",
        node_class,
        "Node labels of Cora dataset, int ranged from 1 to 7.",
        "int",
        "Tensor",
    ),
    Attribute(
        "NodeRawTextTitle",
        raw_text_dict["title"],
        "Raw text of title of each node in Cora dataset, list of strings.",
        "str",
        "List[str]"
    ),
    Attribute(
        "NodeRawTextAbstract",
        raw_text_dict["abs"],
        "Raw text of abstract of each node in Cora dataset, list of strings.",
        "str",
        "List[str]"
    ),
    Attribute(
        "NodeRawTextLabel",
        raw_text_dict["label"],
        "Raw text of label of each node in Cora dataset, list of strings.",
        "str",
        "List[str]"
    )
]


metadata = save_graph(
    name="cora",
    edge=edge,
    num_nodes=graph.num_nodes(),
    node_attrs=node_attrs,
    description="CORA dataset.",
    cite=
    "@inproceedings{yang2016revisiting,\ntitle={Revisiting semi-supervised learning with graph embeddings},\nauthor={Yang, Zhilin and Cohen, William and Salakhudinov, Ruslan},\nbooktitle={International conference on machine learning},\npages={40--48},\nyear={2016},\norganization={PMLR}\n}",
)

The metadata.json and graph data (.npz files) is now saved in the current directory.

In [6]:
# Print metadata
print(json.dumps(metadata, indent=2))

{
  "description": "CORA dataset.",
  "data": {
    "Node": {
      "NodeFeature": {
        "description": "Node features of Cora dataset, 1/0-valued vectors.",
        "type": "int",
        "format": "SparseTensor",
        "file": "cora__graph__Node_NodeFeature__7032c9c380d1889061dcbbcd76b8c427.sparse.npz"
      },
      "NodeLabel": {
        "description": "Node labels of Cora dataset, int ranged from 1 to 7.",
        "type": "int",
        "format": "Tensor",
        "file": "cora__graph__6c912909fa18eff10797210ea5e485fe.npz",
        "key": "Node_NodeLabel"
      },
      "NodeRawTextTitle": {
        "description": "Raw text of title of each node in Cora dataset, list of strings.",
        "type": "str",
        "format": "List[str]",
        "optional file": "cora__graph__Node_NodeRawTextTitle__4a9ad6575f5acfe3b828fe66f072bd5c.optional.npz",
        "key": "Node_NodeRawTextTitle"
      },
      "NodeRawTextAbstract": {
        "description": "Raw text of abstract of each nod

## Task

In [7]:
train_set = graph.ndata["train_mask"].nonzero().squeeze().numpy()
val_set = graph.ndata["val_mask"].nonzero().squeeze().numpy()
test_set = graph.ndata["test_mask"].nonzero().squeeze().numpy()

In [8]:
from gli.io import save_task_node_classification

task_data = save_task_node_classification(
    name="cora",
    description="Node classification on CORA dataset. Planetoid split.",
    feature=["Node/NodeFeature"],
    target="Node/NodeLabel",
    num_classes=7,
    train_set=train_set,
    val_set=val_set,
    test_set=test_set,
    task_id="1"
)

The task data (.json and .npz files) is now saved in the current directory.

In [9]:
print(json.dumps(task_data, indent=2))

{
  "description": "Node classification on CORA dataset. Planetoid split.",
  "type": "NodeClassification",
  "feature": [
    "Node/NodeFeature"
  ],
  "target": "Node/NodeLabel",
  "num_classes": 7,
  "train_set": {
    "file": "cora__task_node_classification_1__41e167258678b585872679839ce9c40f.npz",
    "key": "train_set"
  },
  "val_set": {
    "file": "cora__task_node_classification_1__41e167258678b585872679839ce9c40f.npz",
    "key": "val_set"
  },
  "test_set": {
    "file": "cora__task_node_classification_1__41e167258678b585872679839ce9c40f.npz",
    "key": "test_set"
  }
}


Test loading the data.

In [10]:
from gli.dataloading import read_gli_graph, read_gli_task, combine_graph_and_task

g = read_gli_graph("./metadata.json")
t = read_gli_task("./task_node_classification_1.json")
data = combine_graph_and_task(g, t)
data[0]

CORA dataset.
Node classification on CORA dataset. Planetoid split.


  return torch.sparse_csr_tensor(crow_indices,


Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'NodeFeature': Scheme(shape=(1433,), dtype=torch.float32), 'NodeLabel': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})

Loading data with raw text.

In [11]:
from gli.dataloading import get_gli_dataset

dataset = get_gli_dataset("cora", "NodeClassification", load_raw_text=True, verbose=True)

data = dataset[0]

print(data)


All data files already exist. Skip downloading.
CORA dataset.
All data files already exist. Skip downloading.
Node classification on CORA dataset. Planetoid split.
Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'NodeFeature': Scheme(shape=(1433,), dtype=torch.float32), 'NodeLabel': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})


The raw text are saved in:

In [14]:
data.NodeRawTextTitle[0], data.NodeRawTextAbstract[0], data.NodeRawTextLabel[0]

('Title: The megaprior heuristic for discovering protein sequence patterns  ',
 'Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in

After adding LICENSE and README.md, the dataset directory will be the following.

In [None]:
!tree .