In [2]:
from typing import Dict

import numpy as np
import torch
from torch import nn

from pathlib import Path

from download import download_file, extract_tar


class CoraDataset:
    def __init__(self, include_edges: bool = True):
        self.include_edges = include_edges

        # Download dataset
        self._download()

        # Read the paper ids, feature vectors, and labels
        content = np.genfromtxt('cora/cora/cora.content', dtype=np.dtype(str))

        # Load the citations, it's a list of pairs of integers.
        citations = np.genfromtxt('cora/cora/cora.cites', dtype=np.int32)

        features = torch.tensor(np.array(content[:, 1:-1], dtype=np.float32))
        self.features = features / features.sum(dim=1, keepdim=True)

        self.classes = {s: i for i, s in enumerate(set(content[:, -1]))}
        self.labels = torch.tensor([self.classes[i] for i in content[:, -1]], dtype=torch.long)

        paper_ids = np.array(content[:, 0], dtype=np.int32)
        ids_to_idx = {id_: i for i, id_ in enumerate(paper_ids)}

        self.adj_mat = torch.eye(len(self.labels), dtype=torch.bool)

        if self.include_edges:
            for e in citations:
                e1, e2 = ids_to_idx[e[0]], ids_to_idx[e[1]]
                self.adj_mat[e1][e2] = True
                self.adj_mat[e2][e1] = True

    def _download(self):
        data_dir = Path('cora')
        if not data_dir.exists():
            data_dir.mkdir(parents=True)

        tgz_file = data_dir / 'cora.tgz'
        content_file = data_dir / 'cora.content'

        if not content_file.exists():
            download_file('https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz', tgz_file)
            extract_tar(tgz_file, data_dir)


def accuracy(output: torch.Tensor, labels: torch.Tensor):
    """
    A simple function to calculate the accuracy
    """
    return output.argmax(dim=-1).eq(labels).sum().item() / len(labels)



In [17]:
data = CoraDataset(True)
print(data.classes)

{'Neural_Networks': 0, 'Case_Based': 1, 'Probabilistic_Methods': 2, 'Reinforcement_Learning': 3, 'Rule_Learning': 4, 'Genetic_Algorithms': 5, 'Theory': 6}


In [16]:
import matplotlib.pyplot as plt
import networkx as nx

def evaluate_individual(data):
    features = data.features
    labels = data.labels
    edges_adj = data.adj_mat
    #print all shapes
    print(features.shape)
    print(labels.shape)
    print(edges_adj.shape)
    #edges_adj = edges_adj.unsqueeze(-1)

    # Create a graph
    G = nx.from_numpy_array(edges_adj.numpy())

    # Assign labels and features to nodes
    for i, (feat, label) in enumerate(zip(features, labels)):
        G.nodes[i]['feature'] = feat
        G.nodes[i]['label'] = label

    # Draw the graph
    # pos = nx.spring_layout(G)
    # nx.draw_networkx(G, pos, with_labels=False, node_size=5, node_color=[[.5, .5, .5]])
    # node_labels = nx.get_node_attributes(G, 'label')
    # nx.draw_networkx_labels(G, pos, labels=node_labels)
    # plt.show()

    print(features[0])
    print(labels[70])
  
evaluate_individual(data)


torch.Size([2708, 1433])
torch.Size([2708])
torch.Size([2708, 2708])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor(6)
