In [1]:
import pickle
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf
from tqdm import tqdm
import pandas as pd
import typing
import scipy
from collections import OrderedDict
from sklearn.manifold import TSNE
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_auc_score

In [2]:
%matplotlib inline

In [3]:
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource

In [4]:
# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

In [5]:
import sys
# Append the current working directory to the module search path.
# NOTE(review): presumably this makes the local `tda` package imported in the
# next cell resolvable — confirm the notebook is launched from the repo root.
sys.path.append(".")

In [6]:
from tda.graph_dataset import get_dataset
from tda.embeddings.weisfeiler_lehman import get_wl_embedding
from tda.embeddings import EmbeddingType, get_embedding

In [7]:
# Baseline entry: the dataset generated with adv=False, stored under key 0.
datasets = dict()
datasets[0] = get_dataset(num_epochs=20, epsilon=0.04, noise=0.0, adv=False)

Loaded successfully model from /tmp/tda/trained_models/mnist_20_epochs.model


Successfully generated dataset of 1000 points (model accuracy 92.0%)


In [8]:
# Generate one adv=True dataset per epsilon on an evenly spaced grid of
# 5 values in [0.01, 0.075]; each one is stored in `datasets` under its epsilon.
epsilon_grid = np.linspace(0.01, 0.075, num=5)

for epsilon in epsilon_grid:
    print(f"Trying epsilon={epsilon}")
    datasets[epsilon] = get_dataset(num_epochs=20, epsilon=epsilon, noise=0.0, adv=True)

Trying epsilon=0.01
Loaded successfully model from /tmp/tda/trained_models/mnist_20_epochs.model


Successfully generated dataset of 1000 points (model accuracy 88.8%)


Trying epsilon=0.026250000000000002
Loaded successfully model from /tmp/tda/trained_models/mnist_20_epochs.model


Successfully generated dataset of 1000 points (model accuracy 80.3%)


Trying epsilon=0.0425
Loaded successfully model from /tmp/tda/trained_models/mnist_20_epochs.model


Successfully generated dataset of 1000 points (model accuracy 67.0%)


Trying epsilon=0.058750000000000004
Loaded successfully model from /tmp/tda/trained_models/mnist_20_epochs.model


Successfully generated dataset of 1000 points (model accuracy 47.8%)


Trying epsilon=0.075
Loaded successfully model from /tmp/tda/trained_models/mnist_20_epochs.model


Successfully generated dataset of 1000 points (model accuracy 32.2%)


In [124]:
def get_vector_from_diagram(dgm, top_k=20):
    """
    Simple attempt at turning a persistence diagram into a fixed-length vector.

    The lifespan (death - birth) of every point is computed, the first one is
    discarded, and the remaining values are sorted in decreasing order. The
    `top_k` largest lifespans are returned; when fewer than `top_k` are
    available the vector is right-padded with 0.0 so that every diagram maps
    to a vector of the same length (needed to stack vectors into a matrix).

    Parameters
    ----------
    dgm : iterable
        Points exposing `birth` and `death` attributes.
    top_k : int, default 20
        Length of the returned vector.

    Returns
    -------
    list
        Exactly `top_k` lifespans in decreasing order, zero-padded if needed.
    """
    # NOTE(review): the first point is dropped unconditionally — presumably it
    # is the essential class with infinite death; confirm against the library
    # producing the diagram.
    lifespans = [dp.death - dp.birth for dp in dgm][1:]
    top = sorted(lifespans, reverse=True)[:top_k]
    # Pad short diagrams; a non-positive multiplier yields an empty list.
    return top + [0.0] * (top_k - len(top))

In [9]:
# Embed the first 100 samples of every dataset with Weisfeiler-Lehman and
# collect one tuple per sample: (embedding, sample[1], sample[2], sample[3],
# epsilon). Fields 1-3 are the ones later exposed as y / y_pred / y_adv.
#
# Alternatives previously tried in place of the call below:
#   - EmbeddingType.AnonymousWalk, params={'steps': 5, 'threshold': 25000}
#   - EmbeddingType.PersistentDiagram, params={'threshold': 25000},
#     followed by get_vector_from_diagram(dgm)
embeddings = []

for epsilon, ds in datasets.items():
    print(f"Trying epsilon={epsilon}")
    n_samples = len(ds[:100])
    for idx in tqdm(range(n_samples)):
        sample = ds[idx]
        embedding = get_embedding(
            embedding_type=EmbeddingType.WeisfeilerLehman,
            graph=sample[0],
            params={'threshold': 20000, 'height': 5, 'hash_size': 50},
        )
        embeddings.append((embedding, sample[1], sample[2], sample[3], epsilon))


  0%|          | 0/100 [00:00<?, ?it/s]

Trying epsilon=0


100%|██████████| 100/100 [01:04<00:00,  1.66it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

Trying epsilon=0.01


100%|██████████| 100/100 [00:59<00:00,  1.57it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

Trying epsilon=0.026250000000000002


100%|██████████| 100/100 [00:54<00:00,  1.84it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

Trying epsilon=0.0425


100%|██████████| 100/100 [00:47<00:00,  2.04it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

Trying epsilon=0.058750000000000004


100%|██████████| 100/100 [00:44<00:00,  2.16it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

Trying epsilon=0.075


100%|██████████| 100/100 [00:42<00:00,  2.34it/s]


In [17]:
# Project the flattened embeddings to 2-D for visualisation.
# t-SNE is stochastic: random_state is pinned so that a Restart & Run All
# reproduces the same layout (and therefore the same figure).
tsne = TSNE(n_components=2, verbose=1, perplexity=10, n_iter=100000, random_state=0)
tsne_features = [np.asarray(e[0]).ravel() for e in embeddings]
viz_data = tsne.fit_transform(tsne_features)

[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 600 samples in 0.001s...
[t-SNE] Computed neighbors for 600 samples in 0.033s...
[t-SNE] Computed conditional probabilities for sample 600 / 600
[t-SNE] Mean sigma: 27.332086
[t-SNE] KL divergence after 250 iterations with early exaggeration: 87.200340
[t-SNE] KL divergence after 9150 iterations: 1.580144


In [18]:
from palettable.colorbrewer.qualitative import Paired_12
from palettable.cartocolors.diverging import Temps_6

def epsilon_color(epsilon):
    """
    Return the Temps_6 hex colour associated with `epsilon`.

    The colour index is the rank of `epsilon` among the sorted keys of the
    module-level `datasets` dict; a ValueError is raised by the lookup when
    `epsilon` is not one of those keys.
    """
    ordered_eps = sorted(datasets)
    rank = ordered_eps.index(epsilon)
    return Temps_6.hex_colors[rank]



# One row per embedded sample: the two t-SNE coordinates, followed by the
# metadata fields stored at positions 1-4 of each `embeddings` tuple.
df = pd.DataFrame({"x1": viz_data[:, 0], "x2": viz_data[:, 1]})

meta_positions = {"y": 1, "y_pred": 2, "y_adv": 3, "epsilon": 4}
for column, position in meta_positions.items():
    df[column] = [record[position] for record in embeddings]

# Colour columns: Paired_12 indexed by the y / y_adv value, and the
# per-epsilon colour from epsilon_color.
df["color"] = [Paired_12.hex_colors[val] for val in df["y"]]
df["color_adv"] = [Paired_12.hex_colors[val] for val in df["y_adv"]]
df["color_eps"] = df["epsilon"].map(epsilon_color)

def epsilon_legend(epsilon):
    """
    Build the legend label "<epsilon> (acc <accuracy>)" for one epsilon.

    Epsilon is truncated (not rounded) to three decimals. The accuracy is
    picked from a hard-coded list by the rank of `epsilon` among the sorted
    keys of the module-level `datasets` dict; the values match the model
    accuracies printed when the datasets were generated, so the list has to
    be updated by hand if the datasets change.
    """
    truncated = int(epsilon * 1000) / 1000
    accuracies = ("92.0%", "88.8%", "80.3%", "67.0%", "47.8%", "32.2%")
    rank = sorted(datasets).index(epsilon)
    return f"{truncated} (acc {accuracies[rank]})"

# Attach the human-readable legend label to each row, then preview the frame.
df["legend"] = df["epsilon"].map(epsilon_legend)

df.head()

Unnamed: 0,x1,x2,y,y_pred,y_adv,epsilon,color,color_adv,color_eps,legend
0,-65.113136,58.388264,7,7,0,0.0,#FF7F00,#A6CEE3,#009392,0.0 (acc 92.0%)
1,-73.430603,11.13229,2,2,0,0.0,#B2DF8A,#A6CEE3,#009392,0.0 (acc 92.0%)
2,-93.846886,36.197308,1,1,0,0.0,#1F78B4,#A6CEE3,#009392,0.0 (acc 92.0%)
3,-84.58107,19.279242,0,0,0,0.0,#A6CEE3,#A6CEE3,#009392,0.0 (acc 92.0%)
4,-77.390648,41.818829,4,4,0,0.0,#FB9A99,#A6CEE3,#009392,0.0 (acc 92.0%)


In [19]:
# Scatter plot of the t-SNE projection; points are coloured and grouped in
# the legend by epsilon.
scatter_source = ColumnDataSource(df)

f = figure(title="Weisfeiler-Lehman steps for various epsilon")
f.scatter(
    x="x1",
    y="x2",
    source=scatter_source,
    marker="circle",
    color="color_eps",
    legend="legend",
)

show(f)

In [20]:
# For every non-zero epsilon, measure how well a OneClassSVM fitted on clean
# embeddings separates held-out clean samples (label 1) from the samples
# generated with that epsilon (label 0), and keep the best ROC AUC over a
# grid of gamma values.
#
# NOTE(review): the best gamma is chosen on the same data the AUC is computed
# on, so the reported figure is an optimistic "best reachable" value rather
# than a held-out estimate.
separability_values = list()

# The clean train/test split depends on neither epsilon nor gamma: build it once.
clean_data = [np.ndarray.flatten(np.array(e[0])) for e in embeddings if e[4] == 0.0]
half = len(clean_data) // 2
train_data = clean_data[:half]
test_data = clean_data[half:]

for epsilon in sorted(datasets.keys()):

    if epsilon == 0.0:
        continue

    # Adversarial samples and the label vector only depend on epsilon.
    bad_data = [np.ndarray.flatten(np.array(e[0])) for e in embeddings if e[4] == epsilon]
    labels = np.concatenate((np.ones(len(test_data)), np.zeros(len(bad_data))))

    roc_values = list()

    for gamma in np.logspace(-6, -3, 10):
        ocs = OneClassSVM(tol=1e-5, gamma=gamma)
        ocs.fit(train_data)

        # Score held-out clean samples followed by adversarial ones, in the
        # same order as `labels`.
        predictions = ocs.score_samples(test_data + bad_data)

        roc_val = roc_auc_score(y_true=labels, y_score=predictions)
        roc_values.append(roc_val)

    separability_values.append((epsilon, np.max(roc_values)))

print(separability_values)

[(0.01, 0.7856), (0.026250000000000002, 0.8720000000000001), (0.0425, 0.8972), (0.058750000000000004, 0.9318), (0.075, 0.9502)]


In [21]:
# Line plot of the best AUC obtained for each epsilon.
eps_axis = [eps for eps, _ in separability_values]
auc_axis = [auc for _, auc in separability_values]

f = figure(
    title="Best AUC reachable for various epsilon",
    x_axis_label="Epsilon",
    y_axis_label="AUC",
)
f.line(x=eps_axis, y=auc_axis, line_width=2, legend="Weisfeiler-Lehman")

show(f)