In [3]:
import torch
import pandas as pd
import random
import torch.nn.functional as F
import numpy as np
from torch_geometric.data import Data

FIRST_YEAR = 1995
LAST_YEAR = 2019
FEATURES = ['pop', 'cpi', 'emp']
NUM_TRAIN = 15 
NUM_VAL = 3
NUM_TEST = 6
NUM_EDGE_FEATURES = 10
EDGE_FEATURES = ['f'+str(i) for i in range(NUM_EDGE_FEATURES)]

# The data is found in the project's Github.
DOWNLOAD_PREFIX = 'https://raw.githubusercontent.com/pboennig/gnns_for_gdp/master/'

def create_data(year):
    '''
    For given year, pull in node features, edge features, and edge index and
    save in a PyG Data object.
    '''
    
    assert(year in range(FIRST_YEAR, LAST_YEAR + 1))
    edges = pd.read_csv(f'{DOWNLOAD_PREFIX}/output/X_EDGE_{year}.csv')

    # generate map from iso_code to ids of form [0, ..., num_unique_iso_codes - 1]
    iso_codes = set(edges['i'])
    iso_codes = iso_codes.union(set(edges['j']))
    iso_code_to_id = {code : i for (i, code) in enumerate(iso_codes)}

    # load in edge index
    edges['i_id'] = edges['i'].map(iso_code_to_id)
    edges['j_id'] = edges['j'].map(iso_code_to_id)
    edge_index = torch.from_numpy(edges[['i_id', 'j_id']].to_numpy(np.int64)).t()
    edge_attr = torch.from_numpy(edges[EDGE_FEATURES].to_numpy(np.float32)) #extract the features from the dataset.
    edge_attr = (edge_attr - edge_attr.mean(axis=0)) / (edge_attr.std(axis=0))
    
    # load in target values
    y_df = pd.read_csv(f'{DOWNLOAD_PREFIX}/output/Y_{year}.csv')
    y_df['id'] = y_df['iso_code'].map(iso_code_to_id)
    y = torch.from_numpy(y_df.sort_values('id')[f'{year+1}'].to_numpy(np.float32)).unsqueeze(1)# get labels as tensor
    y = y.log() # log scale since spread of GDP is large
    
    # load in input features
    x_df = pd.read_csv(f'{DOWNLOAD_PREFIX}/output/X_NODE_{year}.csv')
    x_df['id'] = x_df['iso_code'].map(iso_code_to_id)
    features = ['pop', 'cpi', 'emp']
    x = torch.from_numpy(x_df.sort_values('id').loc[:,features].to_numpy(np.float32))
    x = (x - x.mean(axis=0)) / (x.std(axis=0))  # scale and center data
    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

def evaluate_model(model, data_iter):
    '''
    Accumulate MSE over a data list or loader.
    '''
    return sum([F.mse_loss(model(data), data.y).item() for data in data_iter])

def get_data():
    '''
    Generate data_lists for train, val, and test. These lists can be either loaded into data_loaders
    or indexed directly.
    '''

    data_list = [create_data(year) for year in range(FIRST_YEAR, LAST_YEAR)]
    random.shuffle(data_list)
    data_train = data_list[:NUM_TRAIN]
    data_val = data_list[NUM_TRAIN:NUM_TRAIN+NUM_VAL+1]
    data_test = data_list[NUM_TRAIN+NUM_VAL:]
    return (data_train, data_val, data_test)

In [4]:
data_train, data_val, data_test = get_data() # the function described above, these data are what we'll work with

In [39]:
data_list

NameError: name 'data_list' is not defined

In [44]:
data_train[0].edge_attr[0]

tensor([-0.1315,  0.6238,  0.2239, -0.1391,  0.7561,  0.9039, 20.0175,  0.1796,
        -0.1735, 27.8697])

In [45]:
data_train[0].edge_attr[92]

tensor([ 4.1992e-02,  3.4500e-01,  2.4886e-01,  4.7984e-02,  7.0066e+00,
        -5.5987e-02, -9.7999e-02,  2.5473e+01, -6.0308e-02, -6.7887e-03])

In [42]:
data_train[0].x.shape

torch.Size([194, 3])

In [43]:
data_train[0].edge_index.shape

torch.Size([2, 9802])

In [13]:
index = data_train[0].edge_index.t()
index

tensor([[ 39, 187],
        [ 39, 187],
        [ 39, 187],
        ...,
        [130, 185],
        [130, 165],
        [130, 140]])

In [12]:
np.where(index )

(array([], dtype=int64),)

In [21]:
res = []
for i in index:
    res.append(tuple(i.tolist()))

In [26]:
for idx, i in enumerate(res):
    if res[idx] == (187,39):
        print(idx)

92
93
94
95
96
97
98
99
100
101


In [30]:
res[9]

(39, 187)

In [32]:
data_train[0].edge_attr[0:9]

tensor([[-0.1315,  0.6238,  0.2239, -0.1391,  0.7561,  0.9039, 20.0175,  0.1796,
         -0.1735, 27.8697],
        [-0.1315,  0.6238,  0.2239, -0.1391,  0.7561,  0.9039, 20.0175,  0.1796,
         -0.1735, 27.8697],
        [-0.1315,  0.6238,  0.2239, -0.1391,  0.7561,  0.9039, 20.0175,  0.1796,
         -0.1735, 27.8697],
        [-0.1315,  0.6238,  0.2239, -0.1391,  0.7561,  0.9039, 20.0175,  0.1796,
         -0.1735, 27.8697],
        [-0.1315,  0.6238,  0.2239, -0.1391,  0.7561,  0.9039, 20.0175,  0.1796,
         -0.1735, 27.8697],
        [-0.1315,  0.6238,  0.2239, -0.1391,  0.7561,  0.9039, 20.0175,  0.1796,
         -0.1735, 27.8697],
        [-0.1315,  0.6238,  0.2239, -0.1391,  0.7561,  0.9039, 20.0175,  0.1796,
         -0.1735, 27.8697],
        [-0.1315,  0.6238,  0.2239, -0.1391,  0.7561,  0.9039, 20.0175,  0.1796,
         -0.1735, 27.8697],
        [-0.1315,  0.6238,  0.2239, -0.1391,  0.7561,  0.9039, 20.0175,  0.1796,
         -0.1735, 27.8697]])

In [38]:
data_train[0].edge_attr[92:102]

tensor([[ 4.1992e-02,  3.4500e-01,  2.4886e-01,  4.7984e-02,  7.0066e+00,
         -5.5987e-02, -9.7999e-02,  2.5473e+01, -6.0308e-02, -6.7887e-03],
        [ 4.1992e-02,  3.4500e-01,  2.4886e-01,  4.7984e-02,  7.0066e+00,
         -5.5987e-02, -9.7999e-02,  2.5473e+01, -6.0308e-02, -6.7887e-03],
        [ 4.1992e-02,  3.4500e-01,  2.4886e-01,  4.7984e-02,  7.0066e+00,
         -5.5987e-02, -9.7999e-02,  2.5473e+01, -6.0308e-02, -6.7887e-03],
        [ 4.1992e-02,  3.4500e-01,  2.4886e-01,  4.7984e-02,  7.0066e+00,
         -5.5987e-02, -9.7999e-02,  2.5473e+01, -6.0308e-02, -6.7887e-03],
        [ 4.1992e-02,  3.4500e-01,  2.4886e-01,  4.7984e-02,  7.0066e+00,
         -5.5987e-02, -9.7999e-02,  2.5473e+01, -6.0308e-02, -6.7887e-03],
        [ 4.1992e-02,  3.4500e-01,  2.4886e-01,  4.7984e-02,  7.0066e+00,
         -5.5987e-02, -9.7999e-02,  2.5473e+01, -6.0308e-02, -6.7887e-03],
        [ 4.1992e-02,  3.4500e-01,  2.4886e-01,  4.7984e-02,  7.0066e+00,
         -5.5987e-02, -9.7999e-0

In [36]:
data_train[0].edge_index[:,92:102]

tensor([[187, 187, 187, 187, 187, 187, 187, 187, 187, 187],
        [ 39,  39,  39,  39,  39,  39,  39,  39,  39,  39]])