In [None]:
!pip install ogb --quiet
import numpy as np
import pickle as pkl
import scipy.sparse as sp
import networkx as nx
import sys
import ogb
from ogb.nodeproppred import NodePropPredDataset
print ('python version: ', sys.version_info)

python version:  sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)


In [None]:
# Mount Google Drive at /content/drive; the dataset files read and written
# by the cells below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Check the dataset

In [None]:
# Load the six pickled objects of an already-processed dataset and report
# their types/sizes, as a reference for the format the other datasets must match.
dataset = 'cora'
names = ['feature', 'label', 'graph', 'idx_train', 'idx_eval', 'idx_test']
objects = []
for name in names:
    # `with` closes each file handle as soon as its object has been read.
    with open("/content/drive/My Drive/data/{}/{}.bin".format(dataset, name), 'rb') as f:
        if sys.version_info > (3, 0): # if python==3.x
            # latin1 lets Python 3 read pickles written by Python 2
            objects.append(pkl.load(f, encoding='latin1'))
        else: # if python==2.x
            objects.append(pkl.load(f))
feature, label, graph, idx_train, idx_eval, idx_test = objects

print ("Below shows the type of the stored objects:")
print ("-- feature: type={}, shape={}".format(type(feature), feature.shape))
print ("-- label: type={}, shape={}".format(type(label), label.shape))
print ("-- graph: type={}, node num={}".format(type(graph), len(graph)))
print ("-- idx_train: type={}, size={}".format(type(idx_train), len(idx_train)))
print ("-- idx_eval: type={}, size={}".format(type(idx_eval), len(idx_eval)))
print ("-- idx_test: type={}, size={}".format(type(idx_test), len(idx_test)))

Below shows the type of the stored objects:
-- feature: type=<class 'scipy.sparse.lil.lil_matrix'>, shape=(2708, 1433)
-- label: type=<class 'numpy.ndarray'>, shape=(2708, 7)
-- graph: type=<class 'collections.defaultdict'>, node num=2708
-- idx_train: type=<class 'range'>, size=140
-- idx_eval: type=<class 'range'>, size=500
-- idx_test: type=<class 'list'>, size=1000


In [None]:
# Densify the scipy sparse feature matrix into a plain numpy array,
# then report its shape and type.
dense_features = feature.toarray()
dense_shape, dense_type = dense_features.shape, type(dense_features)
print (dense_shape, dense_type)

(2708, 1433) <class 'numpy.ndarray'>


### Process facebook_page data
* Source link: https://www.kaggle.com/rozemberczki/musae-facebook-pagepage-network?select=musae_facebook_target.csv

In [None]:
import pandas as pd
import io
import sys

# First download the dataset from Kaggle, then upload the CSV files
# from the local drive into this notebook session.
from google.colab import files
uploaded = files.upload()  # indexed by uploaded filename in the next cell; values are decoded there as UTF-8

Saving musae_facebook_edges.csv to musae_facebook_edges.csv
Saving musae_facebook_features.csv to musae_facebook_features.csv
Saving musae_facebook_target.csv to musae_facebook_target.csv


In [None]:
#then read the csv's into pandas dataframes (its what I'm familiar with)
import pandas as pd
import sys
import io
import json
from itertools import groupby

# Each uploaded file is raw bytes: decode it as UTF-8 and parse it with pandas.
# Read order is features, edges, labels.
# (The features file is later grouped by node id, then feature id.)
features_df, edges_df, labels_df = (
    pd.read_csv(io.StringIO(uploaded[csv_name].decode('utf-8')))
    for csv_name in (
        'musae_facebook_features.csv',
        'musae_facebook_edges.csv',
        'musae_facebook_target.csv',
    )
)

# Show the labels table: the target we want is the one-hot encoding of page_type.
# A separate file could hold the label index and label context.
labels_df

Unnamed: 0,id,facebook_id,page_name,page_type
0,0,145647315578475,The Voice of China 中国好声音,tvshow
1,1,191483281412,U.S. Consulate General Mumbai,government
2,2,144761358898518,ESET,company
3,3,568700043198473,Consulate General of Switzerland in Montreal,government
4,4,1408935539376139,Mark Bailey MP - Labor for Miller,politician
...,...,...,...,...
22465,22465,1379955382222841,Kurt Wiegel MdL,politician
22466,22466,1651527995097082,dubdub Stories,company
22467,22467,155369444540412,Ministerio del Interior - Paraguay,government
22468,22468,175067819212798,Tottus Perú,company


In [None]:
#FB LABELS

# One-hot encode the target/label vectors. Only the page_type category is
# encoded; whether any other column should also be a target is an open question.
from sklearn.preprocessing import OneHotEncoder

label = labels_df[["page_type"]]

cat_encoder = OneHotEncoder()
label_1hot = cat_encoder.fit_transform(label)

# fit_transform returns a scipy sparse matrix. The reference 'cora' label file
# holds a dense numpy array, so densify and store the dense array rather than
# discarding the result of .toarray().
label_dense = label_1hot.toarray()

# Column order follows cat_encoder.categories_:
#[company, government, politician, tvshow]

with open('/content/drive/My Drive/Facebook_Data/label.bin', "wb") as f:
    pkl.dump(label_dense, f)


In [None]:
#FB EDGES

# Build the adjacency list {node index: [indices of neighbor nodes]}.
# The graph is undirected and each edge is listed once in the CSV, so every
# edge is recorded in both directions.
edges_list = []
# Iterate over plain Python ints (much faster than DataFrame.iterrows, and the
# resulting pickle does not contain numpy scalar objects).
for src, dst in zip(edges_df['id_1'].tolist(), edges_df['id_2'].tolist()):
    edges_list.append((src, dst))
    if src != dst:
        # A self-loop is its own reverse; adding it again would list the node
        # twice in its own neighbor list.
        edges_list.append((dst, src))

print(edges_list[:5])
# groupby only merges consecutive equal keys, hence the sort by (source, target).
graph_adjacency_list = {k: [v[1] for v in g] for k, g in groupby(sorted(edges_list), lambda e: e[0])}

print(graph_adjacency_list[0]) # sanity check: neighbors of node 0

with open('/content/drive/My Drive/Facebook_Data/graph.bin', "wb") as f:
    pkl.dump(graph_adjacency_list, f)

[(0, 18427), (18427, 0), (1, 21708), (21708, 1), (1, 22208)]
[18427]


In [None]:
#FEATURES (FB)
# Build a sparse binary (node x feature) matrix from the (node_id, feature_id)
# pairs listed in features_df.
numberUniqueFeatIDs = features_df['feature_id'].nunique()
numberUniqueNodeIDs = features_df['node_id'].nunique()

print(features_df['feature_id'].max())
print(features_df['node_id'].max())

print(numberUniqueFeatIDs)
print(numberUniqueNodeIDs)

# Plain Python ints via tolist(); avoids the slow row-wise DataFrame.iterrows.
# Direction does not matter here: this is a list of features per node.
features_list = list(zip(features_df['node_id'].tolist(), features_df['feature_id'].tolist()))

#build the feature dictionary {node_id: [feature ids]}
feature_dict = {k: [v[1] for v in g] for k, g in groupby(sorted(features_list), lambda e: e[0])}

# The IDs are used directly as row/column indices, so the matrix must span
# max ID + 1 in each dimension. The unique-ID count is only equal to that when
# the IDs happen to be contiguous from 0; otherwise indexing would go out of range.
num_rows = int(features_df['node_id'].max()) + 1
num_cols = int(features_df['feature_id'].max()) + 1
featureMat = sp.dok_matrix((num_rows, num_cols))

for node_id, feat_ids in feature_dict.items():
    # set every listed feature column of this node's row to 1
    featureMat[node_id, feat_ids] = 1

print(feature_dict[0])
print(featureMat[0]) # sanity check: row 0 should have ones at feature_dict[0]
#then convert to scipy LIL format, matching the type of the reference feature file
features_sparse = sp.lil_matrix(featureMat)

#write the features matrix to a bin file
with open('/content/drive/My Drive/Facebook_Data/feature.bin', "wb") as f:
    pkl.dump(features_sparse, f)

4713
22469
4714
22470
[143, 236, 874, 901, 1072, 1078, 3133, 3825]
  (0, 143)	1.0
  (0, 236)	1.0
  (0, 874)	1.0
  (0, 901)	1.0
  (0, 1072)	1.0
  (0, 1078)	1.0
  (0, 3133)	1.0
  (0, 3825)	1.0


In [None]:
# train/validation/test split (60% / 20% / 20%) using sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Fixed seed so that re-running the notebook reproduces the same index files.
SPLIT_SEED = 42

# 40% is held out first, then halved into validation and test.
train_set, other_set = train_test_split(labels_df, test_size=0.4, random_state=SPLIT_SEED)
val_set, test_set = train_test_split(other_set, test_size = 0.5, random_state=SPLIT_SEED)

In [None]:
# Save the row indices of each split as plain Python lists.
# .tolist() yields built-in ints, so the pickles contain no numpy scalars.
train_list = train_set.index.values.tolist()
val_list = val_set.index.values.tolist()
test_list = test_set.index.values.tolist()

# Write each list to its own file; `with` closes every handle after the dump.
with open('/content/drive/My Drive/Facebook_Data/idx_train.bin', "wb") as f:
    pkl.dump(train_list, f)
with open('/content/drive/My Drive/Facebook_Data/idx_eval.bin', "wb") as f:
    pkl.dump(val_list, f)
with open('/content/drive/My Drive/Facebook_Data/idx_test.bin', "wb") as f:
    pkl.dump(test_list, f)



### Process amazon_product data
* Source link: https://ogb.stanford.edu/docs/nodeprop/#loader

In [None]:
import pandas as pd
import sys
import io
import json
from itertools import groupby

# Following the instructions on the OGB website to download the data.
dataset = NodePropPredDataset(name = 'ogbn-products')



In [None]:
# Extract the train/valid/test split indices now: `dataset` is deleted at the
# end of this cell and the Amazon index-dump cell needs these three arrays.
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
graph, label = dataset[0] # graph: library-agnostic graph object
with open('/content/drive/My Drive/graph.pkl', "wb") as f:
    pkl.dump(graph, f)
del dataset  # free the loader; graph/label and the split arrays stay referenced

In [None]:
# Report the shapes of the edge index, the node features and the labels.
edge_index_shape = graph['edge_index'].shape
node_feat_shape = graph['node_feat'].shape
print (edge_index_shape, node_feat_shape, label.shape)

(2, 123718280) (2449029, 100) (2449029, 1)


In [None]:
#Amazon Features
print(type(graph['node_feat']))
# Convert the node feature array to scipy LIL format, the same type as the
# feature file of the reference dataset.
a_features_sparse = sp.lil_matrix(graph['node_feat'])

with open('/content/drive/My Drive/Amazon_Data/feature.bin', "wb") as f:
    pkl.dump(a_features_sparse, f)

<class 'numpy.ndarray'>


In [None]:
#Amazon Labels
from sklearn.preprocessing import OneHotEncoder

a_label_df = pd.DataFrame(label, columns = ['label'])
a_label = a_label_df[["label"]]

# The label is an integer class id; the counts below show ids 0 through 46.
print(a_label_df['label'].value_counts())


cat_encoder = OneHotEncoder()
a_label_1hot = cat_encoder.fit_transform(a_label)

# Densify once and reuse the array for both the preview and the dump; the dense
# one-hot matrix is large, so building it twice wastes memory and time.
a_label_dense = a_label_1hot.toarray()
print(a_label_dense) # sanity check of the one-hot rows

with open('/content/drive/My Drive/Amazon_Data/label.bin', "wb") as f:
    pkl.dump(a_label_dense, f)

4     668950
7     172199
6     158771
3     151061
12    131886
2     116043
0     114294
8     110796
1     109832
13    101541
16     83594
21     80795
9      67358
10     52345
18     49019
24     45406
17     42337
5      40715
11     32937
42     32500
15     26911
20     22575
19     17438
23      3653
14      3079
25      3024
28      1969
29      1561
43      1399
22       879
36       630
44       566
26       553
37       514
32       513
31       418
30       277
27       259
34       154
38        91
41        61
35        44
39        37
33        29
45         9
40         6
46         1
Name: label, dtype: int64
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Number of columns once the edge index is transposed to one edge per row.
edges_by_row = graph['edge_index'].transpose()
print(edges_by_row.shape[1])

2


In [None]:
# Scratch check: converting the rows of a 2-D numpy array into a list of tuples.
ini_array = np.array([['manjeet', 'akshat'], ['nikhil', 'akash'], ['eb', 'jb']])

# tolist() first, so the tuples hold built-in str objects rather than numpy scalars.
result = [tuple(row) for row in ini_array.tolist()]
print(result)
print(ini_array.shape)

[('manjeet', 'akshat'), ('nikhil', 'akash'), ('eb', 'jb')]
(3, 2)


In [None]:
#Amazon EDGES

# One (source, target) tuple per column of edge_index. The ids are converted to
# built-in ints so the pickle does not store one numpy scalar object per id.
a_edges_list = [(int(src), int(dst)) for src, dst in zip(graph['edge_index'][0], graph['edge_index'][1])]
with open('/content/drive/My Drive/elist.pkl', "wb") as f:
    pkl.dump(a_edges_list, f)
print('made the list')

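Note: the edge list is pickled to Drive in the cell above so that the Colab runtime can be restarted before the adjacency list is built; keeping everything in memory at once exceeded the available RAM. The next cell therefore re-imports what it needs and reloads the edge list from disk.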

In [None]:
import pandas as pd
import sys
import io
import json
from itertools import groupby
import pickle as pkl
# Reload the edge list that was pickled before the runtime restart.
# The module is imported as `pkl`, so the bare name `pickle` is not defined here.
with open('/content/drive/My Drive/elist.pkl', 'rb') as f:
    a_edges = pkl.load(f)

# Sort in place (sorted() would hold a second full copy of the list in memory);
# groupby only merges consecutive equal keys, so the sort is required.
a_edges.sort()
a_graph_adjacency_list = {k: [v[1] for v in g] for k, g in groupby(a_edges, lambda e: e[0])}

# Release the edge list; in this cell it is bound to `a_edges`.
del a_edges
print('made the dictionary')

with open('/content/drive/My Drive/Amazon_Data/graph.bin', "wb") as f:
    pkl.dump(a_graph_adjacency_list, f)

print('completed dump')

In [None]:
#Amazon train, test, and validation

# Use the split arrays if they are still in this session; after a runtime
# restart they are gone, so fetch the split from the dataset loader again.
try:
    a_splits = {"train": train_idx, "valid": valid_idx, "test": test_idx}
except NameError:
    from ogb.nodeproppred import NodePropPredDataset
    a_splits = NodePropPredDataset(name = 'ogbn-products').get_idx_split()

for split_key, file_stem in (("train", "idx_train"), ("valid", "idx_eval"), ("test", "idx_test")):
    with open('/content/drive/My Drive/Amazon_Data/{}.bin'.format(file_stem), "wb") as f:
        # .tolist() must be called: pickling the bare attribute would store the
        # bound method instead of a list of indices.
        pkl.dump(a_splits[split_key].tolist(), f)

### Process citation data
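* Source link: https://github.com/kimiyoung/planetoid, where `x`, `allx` and `tx` are, respectively, the feature vectors of the labeled training instances, of all training instances (labeled and unlabeled), and of the test instances used for inductive learning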
* Process and split train/eval/test sets based on https://github.com/PetarV-/GAT/blob/master/utils/process.py

In [None]:
def parse_index_file(filename):
    """Parse an index file containing one integer per line.

    Args:
        filename: path of the text file to read.

    Returns:
        list of int, in the order the values appear in the file.
        Blank lines (e.g. a trailing newline at the end of the file) are skipped.

    Raises:
        ValueError: if a non-blank line is not a valid integer.
    """
    index = []
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(filename) as index_file:
        for line in index_file:
            entry = line.strip()
            if entry:
                index.append(int(entry))
    return index
def sample_mask(idx, l):
    """Create a boolean mask of length ``l`` that is True at the positions in ``idx``.

    Args:
        idx: index or sequence of indices to mark (anything numpy accepts for indexing).
        l: length of the mask.

    Returns:
        numpy.ndarray of dtype bool and shape (l,).
    """
    # The `np.bool` alias was removed from NumPy; the built-in `bool` is the
    # supported dtype spelling and works on old and new versions alike.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask

In [None]:
def process_citation_data(dataset_str): # {'pubmed', 'citeseer', 'cora'}
    """Load one raw citation dataset and assemble features, labels and index splits.

    Reads the seven pickled files ``ind.<dataset_str>.<name>`` (x, y, tx, ty,
    allx, ally, graph) and the text file ``ind.<dataset_str>.test.index`` from
    ``./data/.raw_citation/``.

    Args:
        dataset_str: dataset name used in the file names; 'citeseer' gets an
            extra padding step for test indices missing from the index file.

    Returns:
        Tuple ``(features, labels, graph, idx_train, idx_val, idx_test)``:
        features is the LIL sparse matrix of allx stacked on tx with the test
        rows reordered; labels is ally stacked on ty, reordered the same way;
        graph is the unpickled graph object, unchanged; idx_train is
        ``range(len(y))``; idx_val is the range of the next 500 indices;
        idx_test is the sorted list of test indices.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("./data/.raw_citation/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                # latin1 lets Python 3 read pickles written by Python 2
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)    
    # Test indices in the order listed in the index file, and the same indices sorted.
    test_idx_reorder = parse_index_file("./data/.raw_citation/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)
    
#     print (x.shape, y.shape, tx.shape, allx.shape)
#     print (len(test_idx_reorder), test_idx_reorder[:10], test_idx_range[:10])

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        # The extended matrices cover every index from the smallest to the
        # largest test index; rows with no entry in the index file stay all-zero.
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    # Stack the test block below allx/ally, then permute the test rows: the row
    # at each sorted test index is written to the position given by the index
    # file's original order (presumably aligning rows with the graph's node ids
    # -- confirm against the raw data format).
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    
    # Train on the first len(y) rows, validate on the following 500 rows.
    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)
    
    print ("labeled instance total num: ", len(idx_test)+len(idx_train)+len(idx_val))
    print ("instance total num*feature_dim, with class num: ", features.shape, labels.shape[1])
    print ("node num in graph: ", len(graph))

    return features, labels, graph, idx_train, idx_val, idx_test

In [None]:
# Process one citation dataset and store each returned object in its own file.
dataset = 'citeseer'
names = ['feature', 'label', 'graph', 'idx_train', 'idx_eval', 'idx_test']
objects = process_citation_data(dataset) # features, labels, graph, idx_train, idx_val, idx_test
for name, obj in zip(names, objects):
    # `with` closes each output file as soon as its object has been written.
    with open("./data/{}/{}.bin".format(dataset, name), 'wb') as f:
        pkl.dump(obj, f)

labeled instance total num:  1620
instance total num*feature_dim, with class num:  (3327, 3703) 6
node num in graph:  3327
