In [2]:

import os, sys, math, time
import numpy as np
import numpy.linalg as la
import plotly.graph_objects as go
import plotly.express as ex
from plotly.subplots import make_subplots
import pandas as pd

import json as js
import _pickle as pickle
import bz2
import ray

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.utils.data import Dataset, TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from collections import OrderedDict

from ray import tune
from ray.tune.suggest.bayesopt import BayesOptSearch
import shutil
import tempfile
from ray.tune import CLIReporter, JupyterNotebookReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.integration.pytorch_lightning import TuneReportCallback, \
    TuneReportCheckpointCallback

import pytorch_lightning as pl
from pytorch_lightning.utilities.cloud_io import load as pl_load
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping

from cytoolz import sliding_window
sys.path.append("../")
import func

In [3]:
# Prepare train data
DATA_PATH = "/home/nuoc/Documents/MEX/data"
MODEL_PATH = "/home/nuoc/Documents/MEX/models"
RESULTS_PATH = "/home/nuoc/Documents/MEX/results"

In [4]:
# Test torch lightning + ray tune

class MLP(pl.LightningModule):
    def __init__(self, config:dict=None, dimensions:list=None, loss_fn=None,
                 dataset=None, train_set=None, val_set=None, test_set=None,
                 keep_prob:float=.2, name:str="model", load=False,
                 single_module:int=0):

        super(MLP, self).__init__()
        self.name = name
        self.dimensions = dimensions
        self.keep_prob = keep_prob
        self.single_module = single_module

        if load:
            self.build()
        else:
            self.k = config["k"]
            self.learning_rate = config["lr"]
            dimensions.append(self.k)
            self.dimensions = dimensions
            self.loss_fn = loss_fn
            self.keep_prob = keep_prob          #   %
            self.batch_size = config["batch_size"]

            self.dataset = dataset

            self.train_set = train_set
            self.val_set = val_set
            self.test_set = test_set
            self.best_val_loss = np.inf

            self.build()
            if self.train_set is None:
                self.setup_data([.7, .15, .15])

            self.encoder.apply(self.init_params)
            self.decoder.apply(self.init_params)

    def build(self):
        layer_sizes = list(sliding_window(2, self.dimensions))
        if self.single_module == -1 or self.single_module == 0:
            layers = []

            for i, size in enumerate(layer_sizes):
                layers.append(("fc"+str(i), nn.Linear(size[0], size[1])))
                if i < len(self.dimensions)-2:
                    layers.append(("act"+str(i), nn.ELU()))
                    layers.append(("drop"+str(i+1), nn.Dropout(self.keep_prob)))
            self.encoder = nn.Sequential(OrderedDict(layers))
        else:
            self.encoder = nn.Sequential()

        if self.single_module == 0 or self.single_module == 1:
            layers = []
            for i, size in enumerate(layer_sizes[-1::-1]):
                layers.append(("fc"+str(i), nn.Linear(size[1], size[0])))
                if i < len(self.dimensions)-2:
                    layers.append(("act"+str(i), nn.ELU()))
                    layers.append(("drop"+str(i+1), nn.Dropout(self.keep_prob)))
            self.decoder = nn.Sequential(OrderedDict(layers))
        else:
            self.decoder = nn.Sequential()

    def forward(self, x:torch.Tensor) -> torch.Tensor:
        return self.decoder(self.encoder(x))

    def training_step(self, batch, batch_idx):
        x, y = batch
        prediction = self(x)
        loss = self.loss_fn(prediction, y)

        self.log("ptl/train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch

        prediction = self(x)
        loss = self.loss_fn(prediction, y)

        self.log('ptl/val_loss', loss, prog_bar=True)
        return {"val_loss":loss}

    def test_step(self, batch, batch_idx):
        x, y = batch

        prediction = self(x)
        loss = self.loss_fn(prediction, y)

        self.log('ptl/test_loss', loss, prog_bar=True)
        return {"val_loss":loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        self.log("avg_val_loss", avg_loss)
        if avg_loss < self.best_val_loss:
            self.best_val_loss = avg_loss
            self.save_checkpoint()

    def save_checkpoint(self, checkpoint_dir=MODEL_PATH):
        path = os.path.join(checkpoint_dir, self.name)
        model = {"k":self.k, "dimensions":self.dimensions,"keep_prob":self.keep_prob, "name":self.name,
                 "encoder":self.encoder.state_dict(),
                 "decoder":self.decoder.state_dict()}
        if not os.path.exists(path):
            os.mkdir(path)
        with bz2.BZ2File(os.path.join(path,
                                      str(self.best_val_loss.cpu().numpy())+"."+str(self.k)+".pbz2"), "w") as f:
            pickle.dump(model, f)
    def load(self, state_dict1: 'OrderedDict[str, Tensor]'=None, state_dict2: 'OrderedDict[str, Tensor]'=None,
                        strict: bool = True):

        if self.single_module == -1 or self.single_module == 0:
            self.encoder.load_state_dict(state_dict1, strict)
        if self.single_module == 1 or self.single_module == 0:
            self.decoder.load_state_dict(state_dict2, strict)
    @staticmethod
    def load_checkpoint(filename):
        # return torch.load(os.path.join(checkpoint_dir,filename))
        with bz2.BZ2File(filename, "rb") as f:
            obj = pickle.load(f)
        model = MLP(name=obj["name"], dimensions=obj["dimensions"], load=True)
        model.encoder.load_state_dict(obj["encoder"])
        model.decoder.load_state_dict(obj["decoder"])
        return model
        # self.encoder.load_state_dict(obj["encoder"])
        # self.decoder.load_state_dict(obj["decoder"])
        # self.best_val_loss = obj["val_loss"]

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

    def setup_data(self, split_ratio, N):
        self.n_train_samples= int(split_ratio[0]*N)
        self.n_val_samples= int(split_ratio[1] * N)
        self.n_test_samples= int(N-self.n_train_samples-self.n_val_samples)
        self.train_set, self.val_set, self.test_set = random_split(self.dataset,
                                                                   [self.n_train_samples,
                                                                    self.n_val_samples,
                                                                    self.n_test_samples])

    def train_dataloader(self):
        return DataLoader(self.train_set, batch_size=self.batch_size, pin_memory=True)

    def val_dataloader(self):
        return DataLoader(self.val_set, batch_size=self.batch_size, pin_memory=True)

    def test_dataloader(self):
        return DataLoader(self.test_set, batch_size=self.batch_size, pin_memory=True)

    @staticmethod
    def init_params(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(.01)

In [13]:
class MIX(pl.LightningModule):
    def __init__(self, config:dict=None, model1:nn.Module=None, model2:nn.Module=None,
                 train_set=None, val_set=None, test_set=None,
                 name:str="model"):

        super(MIX, self).__init__()
        self.name = name
        self.model1 = model1
        self.model2 = model2

        self.learning_rate = config["lr"]
        self.loss_fn = config["loss_fn"]
        self.batch_size = config["batch_size"]

        self.train_set = train_set
        self.val_set = val_set
        self.test_set = test_set
        self.best_val_loss = np.inf

    def forward(self, x:torch.Tensor) -> (torch.Tensor,torch.Tensor,torch.Tensor,torch.Tensor):
        k = int(x.size()[-1] / 2)
        input1, input2 = x[:, :k], x[:, k:]
        h1, h2 = self.model1.encoder(input1), self.model2.encoder(input2)
        out1, out2 = self.model1.decoder(h2), self.model2.decoder(h1)

        # outputs = torch.cat((output1, output2), 1) # TODO: might disable
        return out1, out2, h1, h2
    def training_step(self, batch, batch_idx):
        x, y = batch
        out1, out2, h1, h2 = self(x)
        loss = self.loss_fn(out1, out2, h1, h2, y)

        self.log("ptl/train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch

        out1, out2, h1, h2 = self(x)
        loss = self.loss_fn(out1, out2, h1, h2, y)

        self.log('ptl/val_loss', loss, prog_bar=True)
        return {"val_loss":loss}

    def test_step(self, batch, batch_idx):
        x, y = batch

        out1, out2, h1, h2 = self(x)
        loss = self.loss_fn(out1, out2, h1, h2, y)


        self.log('ptl/test_loss', loss, prog_bar=True)
        return {"val_loss":loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        self.log("avg_val_loss", avg_loss)
        if avg_loss < self.best_val_loss:
            self.best_val_loss = avg_loss
            self.save_checkpoint()

    def save_checkpoint(self, checkpoint_dir=MODEL_PATH):
        path = os.path.join(checkpoint_dir, self.name)
        model = {"name":self.name,
                 "model1_name":self.model1.name,
                 "model1_single_module":self.model1.single_module,
                 "model1_state_dict1":self.model1.encoder.state_dict(), "model1_state_dict2":self.model1.decoder.state_dict(),
                 "model2_name":self.model2.name,
                 "model2_single_module":self.model2.single_module,
                 "model2_state_dict1":self.model2.encoder.state_dict(), "model2_state_dict2":self.model2.decoder.state_dict()
                 }
        if not os.path.exists(path):
            os.mkdir(path)
        with bz2.BZ2File(os.path.join(path,
                                      str(self.best_val_loss.cpu().numpy())+"."+str(self.k)+".pbz2"), "w") as f:
            pickle.dump(model, f)

    @staticmethod
    def load_checkpoint(filename):
        # return torch.load(os.path.join(checkpoint_dir,filename))
        with bz2.BZ2File(filename, "rb") as f:
            obj = pickle.load(f)
        model1 = MLP(load=True, name=obj["model1_name"], dimensions=obj["model1_dim"], single_module=obj["model1_single_module"])
        model2 = MLP(load=True, name=obj["model2_name"], dimensions=obj["model2_dim"], single_module=obj["model2_single_module"])
        model1.load(obj["model1_state_dict1"], obj["model1_state_dict2"])
        model2.load(obj["model2_state_dict1"], obj["model2_state_dict2"])

        model = MIX(name=obj["name"], model1=model1, model2=model2)
        return model
        # self.encoder.load_state_dict(obj["encoder"])
        # self.decoder.load_state_dict(obj["decoder"])
        # self.best_val_loss = obj["val_loss"]

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

    def setup_data(self, split_ratio, N):
        self.n_train_samples= int(split_ratio[0]*N)
        self.n_val_samples= int(split_ratio[1] * N)
        self.n_test_samples= int(N-self.n_train_samples-self.n_val_samples)
        self.train_set, self.val_set, self.test_set = random_split(self.dataset,
                                                                   [self.n_train_samples,
                                                                    self.n_val_samples,
                                                                    self.n_test_samples])

    def train_dataloader(self):
        return DataLoader(self.train_set, batch_size=self.batch_size, pin_memory=True)

    def val_dataloader(self):
        return DataLoader(self.val_set, batch_size=self.batch_size, pin_memory=True)

    def test_dataloader(self):
        return DataLoader(self.test_set, batch_size=self.batch_size, pin_memory=True)

    @staticmethod
    def init_params(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(.01)


In [6]:
def train_tune(config, model1, model2,
                 train_set=None, val_set=None, test_set=None,
                 num_epochs=300, num_cpus=24, num_gpus=1, model_name="model"):

    model = MIX(config=config, model1=model1, model2=model2,
                train_set=train_set, val_set=val_set, test_set=test_set, name=model_name)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=num_gpus,
        logger=TensorBoardLogger(save_dir="logs/", name=model_name, version="0.0"),
        progress_bar_refresh_rate=20,
        callbacks=[
            TuneReportCallback({"loss":"avg_val_loss",}, on="validation_end"),
            EarlyStopping(monitor="avg_val_loss")
        ],
        precision=16,
    )
    trainer.fit(model)

def normalise(x:torch.Tensor):
    std = torch.std(x)
    std[std==0] = 1
    return (x - torch.mean(x)) / std


In [7]:
def prepare_data(datasets:list, featureList:list,
                 train_ratio:float=0.8, val_ratio:float=0.2, test_size:int=100, SEED:int=2021):
   # process data
    data = [func.processData(d, featureList, shutdown=False) for d in datasets]
    input_data = [np.vstack(d) for d in data]
    x_tensors = [func.normaliseT(torch.from_numpy(x).float()) for x in input_data]
    y_tensors = [torch.from_numpy(x).float() for x in input_data]

    # prepare datasets
    test_sets = [(x_tensor[-test_size:], y_tensor[-test_size:]) for x_tensor, y_tensor in zip(x_tensors, y_tensors)]
    x_training = torch.vstack([x_tensor[:-test_size] for x_tensor in x_tensors])
    y_training = torch.vstack([y_tensor[:-test_size] for y_tensor in y_tensors])
    dataset = TensorDataset(x_training, y_training)
    N = len(x_training)

    train_ratio = int(train_ratio*N)
    val_ratio = int(val_ratio*N)
    print("Train: ", train_ratio, ", Validation: ", val_ratio)
    train_set, val_set = random_split(dataset, [train_ratio, val_ratio], generator=torch.Generator().manual_seed(SEED))
    return train_set, val_set, test_sets

def train(train_set:Dataset, val_set:Dataset, model1, model2,
          config:dict, EPOCHS:int=300, hidden_dim:int=256,
          n_gpu=1, n_samples=20, model_name="model",
          ):



    scheduler = ASHAScheduler(max_t = EPOCHS, grace_period=1, reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=10)
    analysis = tune.run(
        tune.with_parameters(
            train_tune,
            model1=model1, model2=model2,
            train_set = train_set, val_set = val_set,
            num_epochs = EPOCHS,
            num_gpus=n_gpu,
            model_name=model_name
        ),
        resources_per_trial= {"cpu":1, "gpu":n_gpu},
        metric="loss",
        mode="min",
        config=config,
        num_samples=n_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name=model_name,
        verbose=False,
        checkpoint_freq=0,
        keep_checkpoints_num=1,
        checkpoint_score_attr="loss",
        checkpoint_at_end=True
    )

    print("-"*70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-"*70)

def clean_checkpoints(num_keep=3, path="../../models"):
    saved_checkpoints = []
    for dir, dname, files in os.walk(path):
        for fname in files:
            fname = fname.split(".")
            saved_checkpoints.append(fname)
        print("Num checkpoints in {}: {}".format(dir, len(saved_checkpoints)))

        saved_checkpoints.sort(key = lambda x: x[1])
        for filename in saved_checkpoints[num_keep:]:
            os.remove(os.path.join(dir,".".join(filename)))
        break
    return ".".join(saved_checkpoints[0])

def test(model:torch.nn.Module, test_sets:list, loss_fn, set_names=list,
         save=True, path="../../results", model_name="model"):
    # Intra test performance
    with torch.no_grad():
        df = {}
        if set_names is None: set_names = np.arange(len(test_sets))
        for i,t1 in enumerate(test_sets):
            for j, t2 in enumerate(test_sets):
                x = t1[0]
                y = t2[1]
                out1, out2, h1, h2 = model(x)
                loss = loss_fn(out1, out2, h1, h2, y)
                df["{}-{}".format(set_names[i], set_names[j])] = [loss.cpu().numpy()]
                print("Test encoding {} to {}, MSE={:.2f}".format(set_names[i], set_names[j], loss))
        filepath = os.path.join(path, model_name)
        if not os.path.exists(filepath): os.mkdir(filepath)
        pd.DataFrame(df).to_csv(os.path.join(filepath, "tests.csv"))


In [8]:
def train_multi_model(datapaths1:list, datapaths2:list, featureList:list,config:dict=None,
                      model1=None, model2=None,
                       n_samples:int=30, model_name:str="model", loss_fn=nn.functional.mse_loss,
                       dataset_names:list=None):
    # load data
    datasets1 = [func.load(os.path.join(DATA_PATH,path)) for path in datapaths1]
    datasets2 = [func.load(os.path.join(DATA_PATH,path)) for path in datapaths2]
    train_set1, val_set1, test_set1 = prepare_data(datasets1, featureList)
    train_set2, val_set2, test_set2 = prepare_data(datasets2, featureList)

    train_set = [(torch.cat([x[0],y[0]],dim=0),torch.cat([y[1],x[1]],dim=0)) for x, y in zip(train_set1, train_set2)]
    val_set = [(torch.cat([x[0],y[0]],dim=0),torch.cat([y[1],x[1]],dim=0)) for x, y in zip(val_set1, val_set2)]
    test_set = [(torch.cat([x[0],y[0]],dim=0),torch.cat([y[1],x[1]],dim=0)) for x, y in zip(test_set1, test_set2)]

    train(train_set=train_set, val_set=val_set, config=config, model1=model1, model2=model2,
          n_samples=n_samples, model_name=model_name)

    best_model = clean_checkpoints(path=MODEL_PATH)
    best_model = MIX.load_checkpoint(best_model)
    test(best_model, test_set, loss_fn=mse_loss, set_names=dataset_names, path=RESULTS_PATH)


In [9]:
def mse_loss(out1, out2, h1, h2, y):
    return nn.functional.mse_loss(torch.cat((out1, out2), dim=1), y)

def mse_similarity_loss(out1, out2, h1, h2, y):
    y1, y2 = y[:, :out1.size()[-1]], y[:, out1.size()[-1]:]
    mse1 = nn.functional.mse_loss(out1, y1)
    mse2 = nn.functional.mse_loss(out2, y2)
    similarity_loss = nn.functional.mse_loss(h1, h2)
    return (mse1 + mse2 + similarity_loss) / 3

def mse_similarity_mae_loss(out1, out2, h1, h2, y):
    y1, y2 = y[:, :out1.size()[-1]], y[:, out1.size()[-1]:]
    mse1 = nn.functional.mse_loss(out1, y1)
    mse2 = nn.functional.mse_loss(out2, y2)
    similarity_loss = nn.functional.smooth_l1_loss(h1, h2)
    return (mse1 + mse2 + similarity_loss) / 3

def mse_similarity_kl_loss(out1, out2, h1, h2, y):
    y1, y2 = y[:, :out1.size()[-1]], y[:, out1.size()[-1]:]
    mse1 = nn.functional.mse_loss(out1, y1)
    mse2 = nn.functional.mse_loss(out2, y2)
    similarity_loss = nn.functional.kl_div(h1, h2)
    return (mse1 + mse2 + similarity_loss) / 3

def mse_similarity_nll_loss(out1, out2, h1, h2, y):
    y1, y2 = y[:, :out1.size()[-1]], y[:, out1.size()[-1]:]
    mse1 = nn.functional.mse_loss(out1, y1)
    mse2 = nn.functional.mse_loss(out2, y2)
    similarity_loss = nn.functional.nll_loss(h1, h2)
    return (mse1 + mse2 + similarity_loss) / 3

In [10]:
datapaths1 = ["LOCO_R1-default-locomotion.pbz2",
             "LOCO_R1-default-locomotion-small.pbz2",
             "LOCO_R1-default-locomotion-large.pbz2"]
datapaths2 = ["LOCO_R2-default-locomotion.pbz2",
             "LOCO_R2-default-locomotion-small.pbz2",
             "LOCO_R2-default-locomotion-large.pbz2"]
featureList = ["pos", "rotMat", "velocity"]


In [11]:
model1 = MLP.load_checkpoint("/home/nuoc/Documents/MEX/models/MLP-MLP_R1/0.0016515953.205.pbz2")
model2 = MLP.load_checkpoint("/home/nuoc/Documents/MEX/models/MLP-MLP_R2/MLP-MLP0.0014728174.92.pbz2")

In [14]:
config = {
    "lr": tune.loguniform(1e-3, 1e-7),
    "batch_size":tune.choice([6, 12, 24, 48]),
    "loss_fn":tune.choice([mse_loss, mse_similarity_loss])
}

train_multi_model(datapaths1=datapaths1, datapaths2=datapaths2, featureList=featureList, config=config,
                  model1=model1, model2=model2, n_samples=1,
                   model_name="MIX_1", dataset_names=["Rig1", "Rig2"])


2021-03-31 15:48:47,838	INFO worker.py:664 -- Calling ray.init() again after it has already been called.
2021-03-31 15:48:50,476	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m
2021-03-31 15:48:53,824	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m
2021-03-31 15:48:57,237	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m
2021-03-31 15:49:00,625	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m
2021-03-31 15:49:04,014	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m
2021-03-31 15:49:06,827	INFO ray_trial_executor.py:197 -- Initializing Ray automatically.For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run`.
2021-03-31 15:49:07,473	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m
[2m[36m(pid=1219967)

Train:  3216 , Validation:  804
Train:  3216 , Validation:  804
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]


TuneError: ('Trials did not complete', [_inner_dd80d_00000])

In [None]:
x_tensor = torch.cat([train_set2[:2][0],train_set[:2][0]], dim=1)
y_tensor = torch.cat([train_set[:2][1], train_set2[:2][1]], dim=1)

In [None]:
xx = torch.unsqueeze(x_tensor, 0)
xx.size()

In [None]:
with torch.no_grad():
    pred = mix(x_tensor)
    loss = nn.functional.mse_loss(pred, y_tensor)

In [None]:
    loss.numpy()