### Running `node2vec`  on pubmed diabetes data 

In [1]:
%load_ext autoreload
%autoreload 2

from scripts import helper_scripts as sga
from scripts import testing_utils as tu
from scripts.uGLAD import main as uG
from scripts.helper_scripts import process_table, get_partial_correlations, precision_empty, uGLAD_graph
from analytics_utils import get_partial_correlations_from_graph, set_node_attributes

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
import pandas as pd

In [3]:
file = "../../../../externalData/Pubmed-Diabetes/data/Pubmed-Diabetes.NODE.paper.tab"

fh = open(file, "r")
index = 0

datapoints = []
while 1:
    line = fh.readline()
    if not line:
        break
    index += 1
    if index < 3:
        continue
    fields = line.split('\t')
    datapoints.append([int(fields[0]), int(fields[1][-1])]) # paperID, label

paper_cat = pd.DataFrame(datapoints, columns = ['paperID', 'label'])
paper_cat

Unnamed: 0,paperID,label
0,12187484,1
1,2344352,1
2,14654069,1
3,16443886,2
4,2684155,1
...,...,...
19712,17559889,3
19713,8792097,2
19714,17934141,1
19715,18673544,3


In [5]:
paper_cat['label'].value_counts()

2    7875
3    7739
1    4103
Name: label, dtype: int64

In [6]:
cat_list = paper_cat['label'].unique()
print(cat_list)

[1 2 3]


In [13]:
import pickle

fh = open("diabetes_files/pubmed_graph_test.pkl", "rb")
G = pickle.load(fh)
# fh = open("diabetes_files/pubmed_edges.pkl", "rb")
# G_graph_edge_list = pickle.load(fh)


In [14]:
G.nodes(data=True)

NodeDataView({2401586: {}, 17277038: {}, 1107095: {}, 10860187: {}, 17261860: {}, 10459555: {}, 16731860: {}, 16804063: {}, 18828242: {}, 3019804: {}, 1551486: {}, 16776847: {}, 12921782: {}, 8593944: {}, 16813601: {}, 12637977: {}, 14532170: {}, 18981116: {}, 11815512: {}, 11829539: {}, 2533036: {}, 12823237: {}, 8012717: {}, 3214432: {}, 8606491: {}, 2908044: {}, 12453967: {}, 7859932: {}, 1727730: {}, 3371576: {}, 17130480: {}, 12814458: {}, 10643211: {}, 2670641: {}, 8349034: {}, 16123336: {}, 12856125: {}, 3054559: {}, 7309132: {}, 3402522: {}, 12546277: {}, 8432410: {}, 7937925: {}, 18840781: {}, 6384267: {}, 2253403: {}, 8098666: {}, 8477801: {}, 2043812: {}, 12077744: {}, 8593939: {}, 12200073: {}, 15919812: {}, 9049484: {}, 15196191: {}, 11358432: {}, 9754819: {}, 11295462: {}, 15867184: {}, 15999801: {}, 17353506: {}, 6369965: {}, 14646372: {}, 18299186: {}, 12107720: {}, 7861877: {}, 15855328: {}, 1971659: {}, 18556337: {}, 14510863: {}, 12136407: {}, 6441449: {}, 11318596: 

In [15]:
# G_uGLAD = G.copy()
# G = G_uGLAD.copy()
print(len(G.edges()))

44724


#  Running GNN 
1. Run node2vec
2. Train a MLP on known labels
3. Predict the labels for the test data

### Take care of negative weights in G

In [17]:
# G.edges(data=True)

In [16]:
# Dropping the negative correlations
def drop_neg_edges(G):
    remove_edges = []
    for i, e in enumerate(G.edges(data=True)):
        if e[2]['color'] == 'red':
            remove_edges.append(e)
    print(f'Red edges removed {len(remove_edges)}')
    G.remove_edges_from(remove_edges)
    return G
G = drop_neg_edges(G) # Graph prepared for node2vec with positive edges

Red edges removed 24038


### Run node2vec 

In [18]:
from node2vec import Node2Vec
# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=64, weight_key='weight', p=1, q=2)  # Use temp_folder for big graphs
# Embed nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)
# Save embeddings for later use
folder='../../../../externalData/Pubmed-Diabetes/'
model.wv.save_word2vec_format(folder + "node_embeddings_64")
# Save model for later use
model.save(folder + "node2vec_model_64")

Computing transition probabilities: 100%|██████████| 300/300 [00:15<00:00, 18.78it/s]


### Load the node2vec model
- If already pretrained

In [13]:
# # Loading the node2vec model 
# import gensim
# # model = gensim.models.Word2Vec.load(folder + "cora/node2vec_model")
# model = gensim.models.Word2Vec.load(folder + "cora/node2vec_model_64")

### Get the node embeddings of graph 

In [19]:
# Getting the embeddings of all the nodes in graph G
print(f'embedding size {model.vector_size}')
list_nodes = list(map(str, G.nodes()))
embeddings = np.array(model.wv.__getitem__(list_nodes))
print(embeddings.shape)
# Create a node:embedding dict
node_embedding_dict = {n:embeddings[i, :] for i, n in enumerate(list_nodes)}

embedding size 64
(300, 64)


### Get the class labels for the nodes

In [21]:
# Get labels/classes for all the nodes 
def get_all_labels(G, paper_cat):
    # creating the node:label dictionary - 1022969: 'Case_Based', 143801: 'Neural_Networks', 612306: 'Rule_Learning',..
    unknown_cat = 'not available'
    node_attribute_dict = {str(n):unknown_cat for n in G.nodes()}
    # setting the known classes
    node_attribute_known = {}
    attribute_values = {}
    for n in G.nodes:
        attr = paper_cat[paper_cat['paperID']==n].label.iloc[0]
        node_attribute_known[str(n)] = attr
        if attr in attribute_values:
            attribute_values[attr] += 1
        else:
            attribute_values[attr] = 1
    node_attribute_dict = set_node_attributes(node_attribute_dict, node_attribute_known)
    return node_attribute_dict
node_label_dict = get_all_labels(G, paper_cat)

### Prepare the input data for the classifier

In [22]:
# Preparing the input data for the MLP to predict the classes
# Features are node embeddings, predictions are the class labels
X, y = [], []
for n in G.nodes():
    n = str(n)
    X.append(node_embedding_dict[n])
    y.append(node_label_dict[n])
X = pd.DataFrame(X)
# y = pd.get_dummies(pd.DataFrame(y, columns=['class']))
y = pd.DataFrame(y, columns=['class'])
print(X, y)

           0         1         2         3         4         5         6   \
0   -0.151729 -0.517408 -0.066847  0.168869 -0.219188  0.457198 -0.407433   
1   -0.003977 -0.296986  0.162149  0.002492 -0.448699  0.266657  0.094966   
2   -0.053012  0.095236  0.493257  0.011922 -0.156093  0.127414  0.007349   
3    0.062232  0.142866  0.085899  0.048084  0.235555  0.012092 -0.507567   
4    0.112578 -0.038089  0.314876 -0.027883 -0.036866  0.218854  0.483599   
..        ...       ...       ...       ...       ...       ...       ...   
295  0.003339 -0.265717 -0.070580 -0.072964  0.081308 -0.105798 -0.233518   
296  0.121748 -0.214194 -0.222019  0.157270 -0.205384 -0.115900 -0.155127   
297 -0.227092  0.096758  0.093378 -0.503574 -0.201245 -0.505726 -0.106383   
298  0.061585 -0.071133  0.179004  0.090919  0.501293  0.175256 -0.018622   
299  0.159938 -0.218695 -0.193433 -0.008247 -0.158755 -0.111667  0.091431   

           7         8         9   ...        54        55        56  \
0  

### Fit a MLP 

In [24]:
# Fitting a MLP and masking entries 
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score 

def run_node2vec_analysis(X, y):
    results={}  # masked entries: accuracy
    for masked_entries in range(10, 250, 10):
        test_size = masked_entries/len(G.nodes())
        print(f'Num of masked entries {masked_entries}, test size {test_size}')
        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        # fit the mlp
        clf = LogisticRegression(multi_class='ovr', solver='liblinear')
        # clf = MLPClassifier(random_state=1, max_iter=500, hidden_layer_sizes=(50))

        clf = clf.fit(X_train, y_train)

        # Optionally, we can do CV and get the multiple estimators
        # cv_results = cross_val_score(clf, X_train, y_train, cv=5)
        # cv_results = cross_validate(clf, X_train, y_train, return_train_score=True, return_estimator=True)
        # print(cv_results)
        
        # y_test_pred = clf.predict(X_test)
        # # print(y_test_pred)
        # Check the diversity of the predictions
        print(f'train accuracy {clf.score(X_train, y_train)}')
        test_accuracy = clf.score(X_test, y_test)
        results[masked_entries] = test_accuracy
        print(f'test accuracy {test_accuracy}\n')
    return results

def run_node2vec_analysis_paper(X, y, REPEAT=50):
    results={}  # masked entries: accuracy
    num_nodes = X.shape[0]
    for masked_entries in [0.2, 0.4, 0.6]: # 20%, 40%, 60%
        masked_entries = int(masked_entries * num_nodes)
        print(f'Num of masked entries {masked_entries}, repeat {REPEAT} times')
        results[masked_entries] = {}
        acc_tr, acc_test = [], []
        for _ in range(REPEAT):
            test_size = masked_entries/len(G.nodes())
            # , test size {test_size}')
            # Split the data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)#, random_state=42)
            # fit the mlp
            clf = LogisticRegression(multi_class='ovr', solver='liblinear')
            # clf = MLPClassifier(random_state=1, max_iter=500, hidden_layer_sizes=(50))
            clf = clf.fit(X_train, y_train)
            acc_tr.append(clf.score(X_train, y_train))
            acc_test.append(clf.score(X_test, y_test))
        results[masked_entries]['train'] = [np.mean(acc_tr), np.std(acc_tr)]
        results[masked_entries]['test'] = [np.mean(acc_test), np.std(acc_test)]
    print(f'Results {results}\n')
    return results

# results_n2v = run_node2vec_analysis(X, y)
results_n2v_diabetes = run_node2vec_analysis_paper(X, y)

Num of masked entries 60, repeat 50 times
Num of masked entries 120, repeat 50 times
Num of masked entries 180, repeat 50 times
Results {60: {'train': [0.7794166666666666, 0.01807411439354944], 'test': [0.5740000000000001, 0.06575712889109439]}, 120: {'train': [0.8145555555555555, 0.020813397869261938], 'test': [0.5429999999999999, 0.043216509191125865]}, 180: {'train': [0.8631666666666667, 0.02661922863896198], 'test': [0.5147777777777777, 0.03362116470393884]}}



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu