## Sherirto/CSTAG

bash command

```shell
export HF_ENDPOINT=https://hf-mirror.com
huggingface-cli download --repo-type dataset --resume-download Sherirto/CSTAG --local-dir graph-feature-selection/data/CSTAG --local-dir-use-symlinks False
```

In [1]:
CSTAG_DATASET_BASE_PATH = "../data/CSTAG"
# find all .pt files in a directory and its subdirectories, return a list of file paths
import os
import dgl
import numpy  as np
import pandas as pd
import torch

def find_files(dir_path, end_with='.pt'):
    file_list = []
    for root, dirs, files in os.walk(dir_path):
        for file in files:
            if file.endswith(end_with):
                file_list.append(os.path.join(root, file))
    return file_list

find_files(CSTAG_DATASET_BASE_PATH, '.pt')

  from .autonotebook import tqdm as notebook_tqdm


['../data/CSTAG/History/History.pt',
 '../data/CSTAG/Goodreads/Goodreads.pt',
 '../data/CSTAG/Computers/Computers.pt',
 '../data/CSTAG/Children/Children.pt',
 '../data/CSTAG/Fitness/Fitness.pt',
 '../data/CSTAG/Photo/Photo.pt',
 '../data/CSTAG/CitationV8/Citation-2015.pt']

In [2]:
datasets = ["Children", "Computers", "Fitness", "History", "Photo"]
num_data_splits = 10
train_prop = 0.5
valid_prop = 0.25
save_path_base = "../data"

for dataset in datasets:
    # edges
    edges = None
    graph_file = find_files(f"{CSTAG_DATASET_BASE_PATH}/{dataset}", '.pt')
    glist, label_dict = dgl.load_graphs(graph_file[0])
    for i, g in enumerate(glist):
        print(f"Dataset: {dataset}, Graph {i+1}:")
        print(f"Number of nodes: {g.number_of_nodes()}")
        edges = torch.tensor(np.array(g.edges()).T)
        print(f"Number of edges: {g.number_of_edges()}, edges dtype: {edges.dtype}")
    # node labels
    csv_file = find_files(f"{CSTAG_DATASET_BASE_PATH}/{dataset}", '.csv')
    csv_data = pd.read_csv(csv_file[0])
    node_labels = torch.tensor(np.array(csv_data['label'].values))
    print(f"Number of labels: {len(node_labels)}, node_labels dtype: {node_labels.dtype}")
    # node features
    features_file = find_files(f"{CSTAG_DATASET_BASE_PATH}/{dataset}", '.npy')
    node_features = torch.tensor(np.load(features_file[0])).to(torch.float)
    print(f"Number of features: {len(node_features)}, node_features dtype: {node_features.dtype}")
    print(f"Feature dimension: {len(node_features[0])}")
    # Splits
    seed = 0
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    num_nodes = len(node_labels)
    mask_number = torch.zeros(num_data_splits, num_nodes)
    for i in range(num_data_splits):
        mask_number[i] = torch.randperm(num_nodes)
    train_masks = (mask_number<=(train_prop*num_nodes))
    val_masks = (torch.logical_and(mask_number<=((train_prop+valid_prop)*num_nodes),mask_number>(train_prop*num_nodes)))
    test_masks = (mask_number>((train_prop+valid_prop)*num_nodes))
    print(f"mask ratio(train:val:test): {train_masks.sum().item()/num_nodes/num_data_splits:.2f}"\
          f":{val_masks.sum().item()/num_nodes/num_data_splits:.2f}"
            f":{test_masks.sum().item()/num_nodes/num_data_splits:.2f}")
    np.savez(f'{save_path_base}/{dataset}.npz',
            node_features=node_features.numpy(),
            node_labels=node_labels.numpy(),
            edges=edges.numpy(),
            train_masks=train_masks.numpy(),
            val_masks=val_masks.numpy(),
            test_masks=test_masks.numpy())
    print(f"save {dataset} graph, train, val, test masks to {save_path_base}/{dataset}.npz")
    print("-" * 50)

Dataset: Children, Graph 1:
Number of nodes: 76875
Number of edges: 1554578, edges dtype: torch.int64
Number of labels: 76875, node_labels dtype: torch.int64
Number of features: 76875, node_features dtype: torch.float32
Feature dimension: 768
mask ratio(train:val:test): 0.50:0.25:0.25
save Children graph, train, val, test masks to ../data/Children.npz
--------------------------------------------------
Dataset: Computers, Graph 1:
Number of nodes: 87229
Number of edges: 721081, edges dtype: torch.int64
Number of labels: 87229, node_labels dtype: torch.int64
Number of features: 87229, node_features dtype: torch.float32
Feature dimension: 768
mask ratio(train:val:test): 0.50:0.25:0.25
save Computers graph, train, val, test masks to ../data/Computers.npz
--------------------------------------------------
Dataset: Fitness, Graph 1:
Number of nodes: 173055
Number of edges: 1773500, edges dtype: torch.int64
Number of labels: 173055, node_labels dtype: torch.int64
Number of features: 173055, n

## For roman-empire, amazon-ratings, minesweeper, tolokers, questions datasets , please refer to [CLICK THIS LINK](https://github.com/yandex-research/heterophilous-graphs/tree/main/data) for download