In [1]:
import generator
import model
import os
import matplotlib.pyplot as plt


### Loss functions and Embedding size

In [2]:
import pickle
import torch
import fastbook
from fastbook import *
from fastai.collab import *
from fastai.tabular.all import *
from fastai.losses import *
import torch.nn as nn


def get_small_emb_sz(dls_df, n_bits=None):
    emb = get_emb_sz(dls_df)   # corresponds to number of nodes in each column
    print("recommended emb size", emb)
    emb = max(emb,key=lambda x:x[0])
    if n_bits == None:
        n_bits = embs[1]
    print("\tusing emb size:", (emb[0], n_bits))
    return [(emb[0], n_bits)]


def MRELoss(inp, targ) -> Tensor:
    inp = torch.flatten(inp).float()
    targ = torch.flatten(targ).float()
    nom = torch.nn.functional.l1_loss(inp, targ)
    noo = torch.Tensor([0]).repeat(targ.size(0)).to(device)
    denom = torch.nn.functional.l1_loss(noo, targ)
    loss = (nom / denom).mean()
    return loss


def CombineLoss(inp, targ) -> Tensor:
    mse_loss = MSELossFlat()
    loss_1 = mse_loss(inp, targ)

    loss_2 = MRELoss(inp, targ)

    return alpha * loss_1 + (1 - alpha) * loss_2

  warn(


In [3]:
# learn.recorder()from fastai2.imports import *
# from fastai2.torch_core import *
# from fastai2.learner import *
# call as: learn.recorder.plot_metrics()
@patch
@delegates(subplots)
def plot_metrics(self: Recorder, nrows=None, ncols=None, figsize=None, **kwargs):
    metrics = np.stack(self.values)
    names = self.metric_names[1:-1]
    n = len(names) - 1
    if nrows is None and ncols is None:
        nrows = int(math.sqrt(n))
        ncols = int(np.ceil(n / nrows))
    elif nrows is None: nrows = int(np.ceil(n / ncols))
    elif ncols is None: ncols = int(np.ceil(n / nrows))
    figsize = figsize or (ncols * 6, nrows * 4)
    fig, axs = subplots(nrows, ncols, figsize=figsize, **kwargs)
    axs = [ax if i < n else ax.set_axis_off() for i, ax in enumerate(axs.flatten())][:n]
    for i, (name, ax) in enumerate(zip(names, [axs[0]] + axs)):
        ax.plot(metrics[:, i], color='#1f77b4' if i == 0 else '#ff7f0e', label='valid' if i > 0 else 'train')
        ax.set_title(name if i > 1 else 'losses')
        ax.legend(loc='best')
    plt.show()
    

In [4]:
def create_model(dataset, loss_function, n_bits, valid_pct, savepath):
    if (not os.path.exists(savepath)):
        os.makedirs(savepath)

    data = np.array(dataset)
    df = pd.DataFrame(data, columns=['src', 'dst', 'label'])
    dls_df = CollabDataLoaders.from_df(df, bs=64, valid_pct=valid_pct)
    
    embs_sz = get_small_emb_sz(dls_df, n_bits)
    # multiply number of nodes by 2. max distance cannot be larger than this
    # this represents the max width of an individual graph
    max_value = df.label.max() * 2
    print("max value is", max_value, "gotten by max label:", df.label.max(), "* embedding matrix size: ", embs_sz[0][0])
    trainer = model.CollabNN(*embs_sz, y_range=(0, max_value))

    learn = Learner(dls_df, trainer, loss_func=loss_function, path=savepath, metrics=[mse, mae, MRELoss])
    
    return learn, df

def train_model(dataset, num_epochs, loss_function, n_bits=None, valid_pct=0.2, savepath="save/default/"):

    learn, df = create_model(dataset, loss_function, n_bits, valid_pct, savepath)

#     SaveModelCallback(with_opt=True)
#     learn.remove_cb(ProgressCallback) #remove (uncomment) progresscallback if running from terminal

    learn.fit_one_cycle(n_epoch=num_epochs, lr_max=5e-3, wd=0.01) 
                        #cbs=[EarlyStoppingCallback(monitor='train_loss', min_delta=0.001, patience=50)])  #ShowGraphCallback(), 
    
    allnodes = set(df.src).union(set(df.dst))
    embs = learn.model.save_embeddings(Tensor(list(allnodes)).to(device).int())

    with open(savepath + "embeddings.pkl", "wb") as outfile:
        pickle.dump(embs, outfile)
    
    learn.export()
    return learn

In [5]:
# shared hyperparams
num_epochs = 300
alpha = 0.5
loss_function = CombineLoss
n_bits = 4
lg_N = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

valid_pct = 0.2

# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


### get the datasets

In [6]:
def get_cycles_dataset(lg_N, train_pct=0.8):
    cy_datapath =  'save/cycle/data/log_' + str(lg_N) + '_data'

    if os.path.isfile(cy_datapath):
        cycles_dataset = generator.load_dataset_from_file(cy_datapath)
    else:
        cycles_dataset = generator.generate_cycles_dataset(lg_N)
        generator.save_dataset_to_file(cycles_dataset, cy_datapath)
    cycles_dataset = np.array(cycles_dataset)
    np.random.shuffle(cycles_dataset)
    cycles_trainset, cycles_testset = cycles_dataset[:int(train_pct*len(cycles_dataset)),:], cycles_dataset[int(train_pct*len(cycles_dataset)):,:]
    
    return cycles_trainset, cycles_testset

In [8]:
def get_graphs_dataset(train_pct=0.8, real_graph_paths=None):
    if real_graph_paths == None:
        real_graph_paths = [
                            "/jumbo/lisp/ike/code/DistanceLabelling/datasets/ENZYMES_g1/ENZYMES_g1.edges",
                            "/jumbo/lisp/ike/code/DistanceLabelling/datasets/ENZYMES_g1/ENZYMES_g1.edges", 
                           "/jumbo/lisp/ike/code/DistanceLabelling/datasets/ENZYMES_g118/ENZYMES_g118.edges"
                           ]
    gr_datapath =  'save/graph/data/ngraphs_' + str(len(real_graph_paths)) + '_data'

    if os.path.isfile(gr_datapath):
        graphs_dataset = generator.load_dataset_from_file(gr_datapath)
    else:
        graphs_dataset = generator.generate_real_graphs_dataset(real_graph_paths)
        generator.save_dataset_to_file(graphs_dataset, gr_datapath)
    graphs_dataset = np.array(graphs_dataset)
    np.random.shuffle(graphs_dataset)
    graphs_trainset, graphs_testset = graphs_dataset[:int(train_pct*len(graphs_dataset)),:], graphs_dataset[int(train_pct*len(graphs_dataset)):,:]
    
    return graphs_trainset, graphs_testset

In [9]:
def get_trees_dataset(lg_N, train_pct=0.8):
    tr_datapath =  'save/tree/data/log_' + str(lg_N) + '_data'

    if os.path.isfile(tr_datapath):
        trees_dataset = generator.load_dataset_from_file(tr_datapath)
    else:
        trees_dataset = generator.generate_trees_dataset(2**lg_N)
        generator.save_dataset_to_file(trees_dataset, tr_datapath)

    trees_dataset = np.array(trees_dataset)
    np.random.shuffle(trees_dataset)
    trees_trainset, trees_testset = trees_dataset[:int(train_pct*len(trees_dataset)),:], trees_dataset[int(train_pct*len(trees_dataset)):,:]

    return trees_trainset, trees_testset

In [11]:
def get_cycle_model(alpha, num_epochs, n_bits, loss_function, lg_N, train_pct):
    # create and train on cycles
    split = train_pct*100
    cy_savepath = 'save/cycle/split_' +str(split)+ '/' +str(alpha)+ '_' + str(lg_N) + '_' + str(num_epochs) + '_'+str(n_bits)+ '/'

    saved = cy_savepath+"export.pkl" 
    if os.path.isfile(saved):
        cycle_model = load_learner(saved, cpu=False)  # learn.load('model')
        print("exact model trained before... retrieved model")
    else:
        cycles_dataset = get_cycles_dataset(lg_N)
        cycle_model = train_model(cycles_dataset, num_epochs, loss_function, n_bits=n_bits, savepath=cy_savepath)
        cycle_model.recorder.plot_metrics()
    return cycle_model

In [12]:
def get_tree_model(alpha, num_epochs, n_bits, loss_function, lg_N, train_pct):
    # create and train on trees
    split = train_pct*100
    tr_savepath = 'save/tree/split_' + str(split) + '/' +str(alpha)+ '_' +str(lg_N)+ '_' +str(num_epochs)+ '_'+str(n_bits)+ '/'
    saved = tr_savepath+"export.pkl"  # "models/model.pth"
    if os.path.isfile(saved):
        tree_model = load_learner(saved, cpu=False)  # learn.load('model')
        print("exact model trained before... retrieved model")
    else:
        trees_dataset = get_trees_dataset(lg_N, train_pct)
        tree_model = train_model(trees_dataset, num_epochs, loss_function, n_bits=n_bits, savepath=tr_savepath)
        print(tree_model.model.node_to_emb)
        tree_model.recorder.plot_metrics()
        
    return tree_model

In [13]:
def get_graph_model(alpha, num_epochs, n_bits, loss_function, train_pct):
    split = train_pct*100
    # create and train on real graphs
    gr_savepath = 'save/graph/split_' + str(split) + '/' + str(alpha) + '_' + str(num_epochs) + '/'
    saved = gr_savepath+"export.pkl"  # "models/model.pth"
    if os.path.isfile(saved):
        graph_model = load_learner(saved, cpu=False)  # learn.load('model')
        print("exact model trained before... retrieved model")
    else:
        graphs_dataset = get_graphs_dataset(train_pct)
        graph_model = train_model(graphs_dataset, num_epochs, loss_function, n_bits=n_bits, savepath=gr_savepath)
        graph_model.recorder.plot_metrics()
    return graph_model

In [14]:
# cycles_dataset[-10:]

In [15]:
# print(cycle_model.model(Tensor([[0, 115]]).to(device).int()))

In [16]:
# cycle_model.get_preds()

In [17]:
# n = 10
testset = trees_testset
tst_data, y_targ = torch.IntTensor(testset[:,:2]).to(device), Tensor(np.array(testset)[:,2]).to(device).int()

# display(data)
# tst = graph_model.embedding_layers(data[:, 0]), graph_model.embedding_layers(data[:, 1])
# tst


preds = tree_model.model(data)

calc_loss = MSELossFlat()
calc_loss(preds, y_targ)
# calc_loss

NameError: name 'trees_testset' is not defined

In [None]:
def plot_bits_to_loss(max_bits, lossfunc):
    plt.plot(xpoints, ypoints)
    plt.show()