# Init

In [1]:
# When True, the `if verbose:` cells further down also run the slow debug /
# cross-validation scatter plots and save them under tmp/.
verbose=False

In [2]:
%matplotlib inline
#%matplotlib notebook
import os
import pathpy as pp
import numpy as np
import pandas as pd
import math
from collections import defaultdict
import matplotlib.pyplot as plt
#from tqdm import tqdm
from tqdm.notebook import tqdm

In [3]:
from HigherOrderPathGenerator import HigherOrderPathGenerator, CrossValidation_HigherOrderPathGenerator, ABCHigherOrderPathGenerator
from Embedding import ABCEmbedding, HON_DeepWalk_Embedding, HONEM_Embedding, HON_NetMF_Embedding, HON_GraRep_Embedding, HON_Transition_Hierarchical_Embedding
from Visualizations import Visualization, EmbeddingView, Lattice2D_EmbeddingView
from Datasets import init_generator

In [4]:
# Higher-order path generator for the selected dataset; switch datasets by
# activating one of the commented alternatives instead.
gen_HON = init_generator('primaryschool_1.csv')
#gen_HON = init_generator('workplace_30.csv')
#gen_HON = init_generator('hospital_1.csv')

97223 rules read
[]
[]


In [5]:
# First-order counterpart of gen_HON (presumably the same data reduced to
# order-1 rules -- confirm in HigherOrderPathGenerator.to_FON); its id is the
# HON id with ' (FON)' appended.
gen_FON = gen_HON.to_FON(gen_HON._id + ' (FON)')

# Embeddings

In [6]:
class embedding_builder(object):
    """Instantiates an embedding by name and trains it.

    The builder stores the embedding family (`name`), keyword arguments for the
    embedding constructor (`init_para`) and keyword arguments for `train()`
    (`train_para`), so the same configuration can be rebuilt for different
    generators and dimensions.
    """
    def __init__(self, name:str, init_para=None, **train_para):
        """Remember the configuration.

        name: one of 'NetMF', 'GraRep', 'Experiment', 'DeepWalk', 'HONEM'.
        init_para: optional dict of extra constructor keyword arguments.
        train_para: keyword arguments forwarded to `train()`.
        """
        self._name = name
        self._init_para = dict() if init_para is None else init_para
        self._train_para = train_para

    def build(self, gen: ABCHigherOrderPathGenerator, dimension:int = 128):
        """Create the configured embedding for `gen` and return it.

        Raises AssertionError if the configured name is not supported.
        """
        if self._name == 'NetMF':
            emb = HON_NetMF_Embedding(gen, dimension, **self._init_para)
            emb.train(**self._train_para)
        elif self._name == 'GraRep':
            emb = HON_GraRep_Embedding(gen, dimension, **self._init_para)
            emb.train(**self._train_para)
        elif self._name == 'Experiment':
            #emb = HON_Transition_Hierarchical_Embedding(gen, dimension, **self._init_para)
            emb = HON_CV_Transition_Hierarchical_Embedding(gen, dimension, **self._init_para)
            emb.train(**self._train_para)
        elif self._name == 'DeepWalk':
            # NOTE(review): no train() call and init_para/train_para are ignored
            # here -- confirm that this is intended for DeepWalk.
            emb = HON_DeepWalk_Embedding(gen, dimension)
        elif self._name == 'HONEM':
            emb = HONEM_Embedding(gen, dimension)
            emb.train()
        else:
            # Explicit raise (instead of `assert False`) so the check survives
            # `python -O`; the message uses the stored name.
            raise AssertionError('Invalid name %s' % self._name)
        return emb

    @staticmethod
    def get_true_probs(gen: ABCHigherOrderPathGenerator, excluded_edges):
        """Collect the generator's transition probabilities along `excluded_edges`.

        Returns a nested mapping `{source_path: {next_node: prob}}` that only
        contains transitions whose first-order edge `(source_path[-1], next_node)`
        is listed in `excluded_edges`.
        """
        try:
            # One-off conversion: membership is tested once per transition.
            excluded = set(excluded_edges)
        except TypeError:
            excluded = excluded_edges # unhashable entries: fall back to the given container
        res = defaultdict(dict) # use nested dictionaries, because decode is expensive
        for source in gen.source_paths_len1:
            for _, next_node, prob in gen.transition_probs(source):
                if (source[-1], next_node) in excluded:
                    res[source][next_node] = prob
        return res

    def evaluate_pairs(self, true_probs, gen_build: ABCHigherOrderPathGenerator, dimension: int = 128, **kwargs):
        """Build an embedding on `gen_build` and pair true with decoded probabilities.

        `kwargs` are forwarded to `emb.decode_path`. Returns a list with one dict
        per entry of `true_probs`, with keys source, target, true_prob, pred_prob.
        """
        emb = self.build(gen_build, dimension)
        node2str = emb.key2str if emb._symmetric else emb.node2str
        source2str = emb.key2str if emb._symmetric else emb.path2str
        res = []
        for source, probs in true_probs.items():
            source_str = source2str(source)
            # decode once per source path; the result is indexed by node string
            predicted_probs = emb.decode_path(source, **kwargs)
            for next_node, prob in probs.items():
                next_node_str = node2str(next_node)
                res.append(dict(source=source_str, target=next_node_str, true_prob=prob, pred_prob=predicted_probs[next_node_str]))
        return res

    def evaluate(self, true_probs, gen_build: ABCHigherOrderPathGenerator, dimension:int = 128, **kwargs):
        """Build an embedding on `gen_build` and sum squared prediction errors.

        Returns `dict(SSE=..., SSElog=...)`: the sum of squared differences of the
        probabilities and of their base-2 logarithms. Both probabilities are
        floored at 1e-10 before taking the logarithm.
        """
        emb = self.build(gen_build, dimension)
        node2str = emb.key2str if emb._symmetric else emb.node2str
        SSE = 0
        SSElog = 0
        min_prob=1e-10
        for source, probs in true_probs.items():
            predicted_probs = emb.decode_path(source, **kwargs)
            for next_node, prob in probs.items():
                predicted = predicted_probs[node2str(next_node)]
                SSE += (prob - predicted)**2
                SSElog += (math.log2(max(prob,min_prob)) - math.log2(max(predicted,min_prob)))**2
        return dict(SSE=SSE,SSElog=SSElog)

In [7]:
class HON_CV_Transition_Hierarchical_Embedding(HON_Transition_Hierarchical_Embedding):
    """Hierarchical transition embedding that honours cross-validation exclusions.

    Edges held out for cross-validation must not influence training. Methods based
    on (unweighted) matrix factorization via SVD cannot do this, but for weighted
    matrix factorization via SGD it is easy: every update step that touches an
    excluded edge is simply dropped.
    """
    def __init__(self, gen, dimension: int=128, node_hierarchy='calc', seed=None, neg_stationary:bool=True):
        """Initialise the parent embedding, then index the edges to skip.

        The excluded edges of a CrossValidation_HigherOrderPathGenerator are
        translated into (source index, target index) pairs using the values of
        `self._source_paths` and `self._target_nodes`; for any other generator
        type the set stays empty.
        """
        super().__init__(gen, dimension, node_hierarchy, seed, neg_stationary)
        skipped_pairs = set() # warn if it stays empty?
        if type(gen) == CrossValidation_HigherOrderPathGenerator:
            for path, path_index in self._source_paths.items():
                for node, node_index in self._target_nodes.items():
                    # an edge is identified by the last node of the source path and the target node
                    if (path[-1], node) in gen._excluded_edges:
                        skipped_pairs.add((path_index, node_index))
        self._excluded_edges = skipped_pairs

    def _update(self, iu, iv, label, learning_rate):
        """Forward the update to the parent unless (iu, iv) is an excluded edge.

        Excluded edges are skipped for positive and negative samples alike.
        """
        if (iu, iv) not in self._excluded_edges:
            super()._update(iu, iv, label, learning_rate)

In [8]:
# Embedding dimensions compared in the "Compare embeddings" section.
dimensions = [16, 32, 64, 128]

In [9]:
# Keyword arguments forwarded to every emb.decode_path(...) call in this notebook.
decode_args = dict(use_neighborhood=True, no_self_loops=True, normalize=True, step=1) # decode ignores additional parameters

In [10]:
netmf_pairs = {'pairwise': True}
grarep_para = {'num_steps': 2, 'pairwise': True}

# NetMF grid: every window size combined with 1 and 5 negative samples.
# Insertion order is W1/N1, W1/N5, W2/N1, ... and all entries share netmf_pairs.
embedding_builders = {
    f'NetMF(W{window},N{neg})': embedding_builder('NetMF', init_para=netmf_pairs, window_size=window, negative=neg)
    for window in (1, 2, 3, 5)
    for neg in (1, 5)
}
# Disabled configurations:
#    'NetMF(W10,N1)': embedding_builder('NetMF', init_para=netmf_pairs, window_size=10, negative=1),
#    'NetMF(W10,N5)': embedding_builder('NetMF', init_para=netmf_pairs, window_size=10, negative=5),
#    'GraRep(2,N5)': embedding_builder('GraRep', init_para=grarep_para, negative=5), # GraRep behaves like NetMF(W1)
#    'HONEM':  embedding_builder('HONEM'),

In [11]:
# All first-order edges (last node of the source path, next node) reachable
# through the transitions of gen_HON; this list is what the K-fold splits index.
FON_edges = [
    (source_path[-1], target)
    for source_path in gen_HON.source_paths_len1
    for _start, target, _prob in gen_HON.transition_probs(source_path)
]
#FON_edges

## debug builder.evaluate_pairs
Before running extensive simulations, examine the best case - no cross validation:
* `true_probs` normally contains only the transition probabilities along `excluded_edges`; here we want all of them, so we set `excluded_edges=FON_edges`
* the embedding uses the original transition probabilities (gen_HON)
* The embedding approximates the 1-step transitions [NetMF(window_size=1) or Experiment (its SGD equivalent)] instead of using random walks
* After decoding the probabilities, keep only those connected by a FON link and re-normalize

**The outcome is worse than I had hoped for, rendering the whole analysis useless.**

In [12]:
def debug_evaluate_pairs(builder, dimension=128, title=None, only_FON_edges=True, FON_edges=FON_edges, decode_args=decode_args, limit=(0.00001,1)):
    """Scatter true against predicted probabilities without cross validation.

    The embedding is trained on the full `gen_HON`, and the reference
    probabilities cover every edge in `FON_edges`. The result is drawn on
    log-log axes clipped to `limit`, with the diagonal as a guide line.
    `title`, if given, becomes the figure's suptitle. `only_FON_edges` is
    accepted but not used by this function.
    """
    reference_probs = builder.get_true_probs(gen_HON, excluded_edges=FON_edges) # true probabilities for all edges
    # no cross validation: build directly on the unmodified generator
    pairs = builder.evaluate_pairs(reference_probs, gen_HON, dimension, **decode_args)
    pair_frame = pd.DataFrame(pairs).set_index(['source','target'])

    scatter_options = dict(alpha=0.01, logx=True, logy=True, xlim=limit, ylim=limit)
    pair_frame.plot.scatter('true_prob', 'pred_prob', **scatter_options)
    if title is not None:
        plt.suptitle(title)
    plt.gca().plot(limit, limit, '-')

In [13]:
# Optional (slow) sanity plots without cross validation; the timings in the
# trailing comments are the author's measurements.
if verbose:
    %time debug_evaluate_pairs(embedding_builders['NetMF(W1,N1)'], 128, 'HON NetMF(W=1,N=1,R=128)') # 3.5 sec
    plt.savefig('tmp/prob_netmf-w1n1r128.png') # figure B.1 (left)
    %time debug_evaluate_pairs(embedding_builders['NetMF(W3,N5)'], 16, 'HON NetMF(W=3,N=5,R=16)') # 11.5 sec
    plt.savefig('tmp/prob_netmf-w3n5r16.png') # figure B.1 (right)
    #%time debug_evaluate_pairs(embedding_builders['NetMF(W10,N5)'], 16, 'HON NetMF(W=10,N=5,R=16)') # 5 min
    #plt.savefig('tmp/prob_netmf-w310n5r16.png')

    #%time debug_evaluate_pairs(embedding_builders['NetMF(W10,N5)'], 256, 'NetMF(W10,N5)')
    #%time debug_evaluate_pairs(embedding_builder('Experiment', dict(seed=1, neg_stationary=True), steps=1000, negative=1), 256, 'Experiment')

In [14]:
#for name,builder in embedding_builders.items():
#    %time debug_evaluate_pairs(builder, 16, name)

## Cross-validation of probability prediction for a single embedding
Compares true and predicted probabilities using cross validation.

Cross validation means that, for each held-out set of edges, a new instance `gen_XE` of `CrossValidation_HigherOrderPathGenerator` is created in which all information about the transition probabilities along these edges is hidden. An embedding is trained using `gen_XE`, and the (predicted) transition probabilities are decoded from this embedding.

In [15]:
from sklearn.model_selection import KFold
def evaluate_pairs(gen, builder, builder_id, dimension=128, n_splits=20, use_FON=False, plot=False, decode_args=decode_args, limit=(0.00001,1)):
    """Cross-validate the probability prediction of one embedding configuration.

    The module-level `FON_edges` are split into `n_splits` shuffled folds
    (fixed random_state=0). For each fold the held-out edges are hidden in a
    CrossValidation_HigherOrderPathGenerator, an embedding is built on it with
    `builder`, and the decoded probabilities are paired with the true ones
    taken from `gen`.

    gen: generator providing the true probabilities (and the training data).
    builder: embedding_builder used to build/evaluate the embedding.
    builder_id: label used in the plot title.
    use_FON: if True, train on a generator built only from the rules of `gen`
        whose key has length <= 1.
    plot: if True, draw a log-log scatter of true vs. predicted probabilities.
    decode_args: keyword arguments forwarded to the decoding step.
    limit: axis range of the plot and end points of the diagonal.

    Returns a DataFrame indexed by (source, target) with columns true_prob and
    pred_prob.
    """
    gen_build = gen
    if use_FON:
        gen_build = HigherOrderPathGenerator(node_sort_key=gen._node_sort_key, id=gen._id + ' (FON)')
        for key in gen.rule_keys:
            if len(key)>1:
                continue # keep only the rules with a key of length <= 1
            for start, next_node,prob in gen.transition_probs(key):
                gen_build.add_rule(start, next_node, prob)
    res = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    # tqdm wraps the split iterator directly; `total` is needed because
    # kf.split() is a generator without a length.
    for i_split, (_, test_index) in enumerate(tqdm(kf.split(FON_edges), total=n_splits)):
        # Index the edge list directly (the old per-edge `i in test_index`
        # scan was quadratic per fold).
        excluded_edges = [FON_edges[i] for i in test_index]
        true_probs = embedding_builder.get_true_probs(gen, excluded_edges)
        gen_XE = CrossValidation_HigherOrderPathGenerator(gen_build, excluded_edges, '%s exclude_%d' % (gen_build._id,i_split))
        res.extend(builder.evaluate_pairs(true_probs, gen_XE, dimension, **decode_args))
    res = pd.DataFrame(res).set_index(['source','target'])
    if plot:
        res.plot.scatter('true_prob', 'pred_prob', alpha=0.01, logx=True, logy=True, xlim=limit, ylim=limit)
        ax = plt.gca()
        ax.plot(limit,limit, '-') # diagonal = perfect prediction
        plt.gcf().suptitle(('FON' if use_FON else 'HON') +' '+ builder_id + f' CV(k={n_splits})')
    return res

In [16]:
if verbose:
    df = evaluate_pairs(gen_HON, embedding_builders['NetMF(W1,N1)'], 'NetMF(W=1,N=1,R=128)', 128, n_splits=100, use_FON=False, plot=True)
    plt.savefig('tmp/cv-prob_netmf-w1n1r128.png')  # figure B.2 (left)

In [17]:
if verbose:
    df2 = evaluate_pairs(gen_HON, embedding_builders['NetMF(W2,N1)'], 'NetMF(W=2,N=1,R=16)', 16, n_splits=100, use_FON=False, plot=True)
    plt.savefig('tmp/cv-prob_netmf-w2n1r16.png')

In [18]:
if verbose:
    df2 = evaluate_pairs(gen_HON, embedding_builders['NetMF(W3,N5)'], 'NetMF(W=3,N=5,R=16)', 16, n_splits=100, use_FON=False, plot=True)
    plt.savefig('tmp/cv-prob_netmf-w3n5r16.png')  # figure B.2 (right)

# Compare embeddings

In [19]:
from sklearn.model_selection import KFold
# 20-fold cross validation over the first-order edges: for every fold, every
# generator (HON / FON), every embedding configuration and every dimension,
# build the embedding on the generator with the fold's edges hidden and record
# the summed squared errors of the decoded probabilities.
res = []
kf = KFold(n_splits=20, shuffle=True, random_state=0)
cv_generators = [('HON', gen_HON), ('FON', gen_FON)]
n_runs = kf.n_splits * len(cv_generators) * len(embedding_builders) * len(dimensions)
with tqdm(total=n_runs) as progress:
    for i_split, (_, test_index) in enumerate(kf.split(FON_edges)):
        # Index the edge list directly (the old per-edge `i in test_index`
        # scan was quadratic per fold).
        excluded_edges = [FON_edges[i] for i in test_index]
        # the reference probabilities always come from the higher-order generator
        true_probs = embedding_builder.get_true_probs(gen_HON, excluded_edges)
        for gen_name,gen in cv_generators:
            gen_XE = CrossValidation_HigherOrderPathGenerator(gen, excluded_edges, '%s exclude_%d' % (gen_name,i_split))
            for builder_name, builder in embedding_builders.items():
                for dimension in dimensions:
                    progress.set_postfix_str(f'{i_split}: {gen_name} {builder_name} {dimension}')
                    out = dict(model=gen_name, dimension=dimension, embedding=builder_name, split=i_split)
                    out.update(builder.evaluate(true_probs, gen_XE, dimension, **decode_args))
                    res.append(out)
                    progress.update(1)
df = pd.DataFrame(res)
df.head()

HBox(children=(FloatProgress(value=0.0, max=1280.0), HTML(value='')))




Unnamed: 0,model,dimension,embedding,split,SSE,SSElog
0,HON,16,"NetMF(W1,N1)",0,3.811116,27535.852668
1,HON,32,"NetMF(W1,N1)",0,5.826772,29355.270917
2,HON,64,"NetMF(W1,N1)",0,7.298458,31640.753643
3,HON,128,"NetMF(W1,N1)",0,8.006961,33005.995388
4,HON,16,"NetMF(W1,N5)",0,4.222847,28841.761619


In [20]:
# Total error per (model, dimension, embedding), summed over all CV splits.
# Select the columns with a list: the tuple form ['SSE','SSElog'] is deprecated
# and raises on current pandas.
df_sum = df.groupby(['model','dimension','embedding'])[['SSE','SSElog']].sum()
os.makedirs('tmp', exist_ok=True) # make sure the output directory exists on a fresh checkout
df_sum.to_csv('tmp/cv-prob.csv', sep='\t', encoding='utf-16')

  """Entry point for launching an IPython kernel.


In [21]:
# Configurations ranked by total squared error, smallest first.
df_sum['SSE'].sort_values()

model  dimension  embedding   
FON    16         NetMF(W5,N5)     47.792190
                  NetMF(W3,N5)     49.280430
                  NetMF(W5,N1)     49.350664
HON    16         NetMF(W5,N5)     49.384262
                  NetMF(W5,N1)     50.371449
                                     ...    
       128        NetMF(W1,N5)     93.412453
       64         NetMF(W1,N1)    106.354252
FON    64         NetMF(W1,N1)    106.354252
HON    128        NetMF(W1,N1)    132.641770
FON    128        NetMF(W1,N1)    132.641770
Name: SSE, Length: 64, dtype: float64

In [22]:
# Configurations ranked by total squared log2-probability error, smallest first.
df_sum['SSElog'].sort_values()

model  dimension  embedding   
HON    16         NetMF(W5,N1)    524060.605983
       32         NetMF(W5,N1)    526176.746898
       16         NetMF(W3,N1)    531328.282973
       32         NetMF(W3,N1)    538490.524515
       16         NetMF(W2,N1)    539406.553885
                                      ...      
       128        NetMF(W1,N5)    636259.354630
FON    64         NetMF(W1,N1)    656262.486685
HON    64         NetMF(W1,N1)    656262.486685
       128        NetMF(W1,N1)    679515.663336
FON    128        NetMF(W1,N1)    679515.663336
Name: SSElog, Length: 64, dtype: float64