In [5]:
import pytorch_lightning as pl
from matplotlib import pyplot as plt
import seaborn as sns
from cca_zoo.deepmodels import (
    DCCA,
    DCCA_NOI,
    DCCA_SDL,
    #BarlowTwins,
    get_dataloaders,
    
)
from cca_zoo.deepmodels.utils import architectures, objectives
from cca_zoo.plotting import pairplot_label
from cca_zoo.data import CCA_Dataset
from cca_zoo.models import CCA
from cca_zoo.model_selection import GridSearchCV
import torch.optim as optim
from torch.utils.data import random_split
from sklearn.model_selection import train_test_split
import os
from scipy.spatial import distance
from scipy.stats import pearsonr
from scipy.stats import fisher_exact
from scipy.stats.contingency import crosstab
from scipy.stats import hypergeom
import random
import pandas as pd
import numpy as np
import pickle
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import seed_everything
from torch import nn
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from ray.tune.integration.pytorch_lightning import tune
os.chdir('../raw_data')

In [6]:
class Files:
    def __init__(self, filename):
        self.file = filename
        
    def write_to_file(self, data):
        with open(self.file, 'wb') as f:
            pickle.dump(data, f) 
        return None
    
    def load_pickle(self):
        data = pd.read_pickle(self.file)
        return data
    
    def load_csv(self, sep, usecols=None):
        data = pd.read_csv(self.file, sep=sep, usecols=usecols)
        return data
        
        

In [18]:
# load df with all info
path = './df_classes_max3_embeddings.pickle'
df_all = Files(path).load_pickle()[:100]#just a subset for scripting
df_all.shape

(100, 8)

In [19]:
class DeepCCA:
    def __init__(self, df_all,batch_size = 768,num_workers = 6,\
                latent_dims=100, epochs=300, lr=0.001): #default dims determined after iteratin 10:50 dims 
        
        self.df_all = df_all
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.v1='ms2ds'
        self.v2 = 'mol2vec'
        
        self.epochs = epochs
        self.sdl_lr = 0.025118864315095822#0.01#lr (picked after running lr_finder)
        self.dcca_lr = 5.623413251903491e-08 #lr
        self.latent_dims=latent_dims
        self.optim = 'sgd'
        self.activation = nn.Tanh()
        self.objective = objectives.CCA
        self.encoder_1_layers = (500,500)
        self.encoder_2_layers = (500,500)
        seed_everything(15)
        
        
    def split_data(self,test_size=0.2,\
                   random_state=None,stratify=None): # thinking of removing this one
        
        if random_state != None and stratify == None:
            train_df, test_df = \
            train_test_split(self.df_all, test_size=test_size, random_state=random_state)
        
        elif random_state == None and stratify != None:
            train_df, test_df = \
            train_test_split(self.df_all, test_size=test_size,stratify=self.df_all[stratify])
        else:
            train_df, test_df = \
            train_test_split(self.df_all, test_size=test_size, random_state=42)
        
        return train_df, test_df 
    
    def gen_views(self,v1='ms2ds',v2='mol2vec'):
        
        #split test, train
        train_df, test_df= self.split_data(test_size=0.2,random_state=42)
        
        #Split train dataset into train and validation set
        train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)
        
        
        #extract the 2 view, v1 == spectra embeddings, v2==structure embeddings
        v1_train, v1_test = np.array([x for x in train_df[v1]]), np.array([x for x in test_df[v1]])
        v2_train, v2_test = np.array([x for x in train_df[v2]]), np.array([x for x in test_df[v2]])

        # validation
        v1_val, v2_val = np.array([x for x in val_df[v1]]), np.array([x for x in val_df[v2]])
        
        #update self dfs
        self.train_df = train_df
        self.test_df = test_df
        self.val_df = val_df
        
        # not memory efficient !!!
        self.v1_train, self.v1_test = v1_train, v1_test
        self.v2_train, self.v2_test = v2_train, v2_test
        self.v1_val, self.v2_val = v1_val, v2_val
        
        return None
    
    def deepcca_encoders(self):
        # define encoders
        self.encoder_1 = architectures.Encoder(
                                      latent_dims = self.latent_dims, 
                                      feature_size = self.v1_size,
                                      layer_sizes = self.encoder_1_layers,
                                      activation = self.activation
                                    )
                                     
        self.encoder_2 = architectures.Encoder(
                                      latent_dims=self.latent_dims, 
                                      feature_size=self.v2_size, 
                                      layer_sizes=self.encoder_2_layers,
                                      activation = self.activation
                                     )
       
        return None#[encoder_1, encoder_2]
        
    
    def deepcca_dataloaders(self):
        
        #v1_train,v1_test, v2_train,v2_test, v1_val, v2_val = \
        self.gen_views(v1=self.v1, v2=self.v2)
        
        #creat CCA dataset 
        train_dataset = CCA_Dataset([self.v1_train, self.v2_train])
        test_dataset = CCA_Dataset([self.v1_test, self.v2_test])
        val_dataset = CCA_Dataset([self.v1_val, self.v2_val])
        
        #update features size
        self.v1_size = self.v1_train.shape[1]
        self.v2_size = self.v2_train.shape[1]
        self.N = len(train_dataset)
        
        #set N (for sdl; equal len train dataset)
        self.N = len(train_dataset)
        
        #loaders
        self.train_loader , self.val_loader = get_dataloaders(train_dataset, \
                                                    val_dataset,batch_size=self.batch_size,\
                                                    num_workers=self.num_workers,drop_last=False)
        self.test_loader = get_dataloaders(test_dataset,\
                                      batch_size=self.batch_size, \
                                      num_workers=self.num_workers,drop_last=False)
        
        
        
        
        return None
    
    
    def train_cca(self): #only for comparison with other deep models
        
       
        
        #define and train cca model
        print('\n','-'*20,'\n Training CCA\n','-'*20)
        cca = CCA(latent_dims=self.latent_dims).fit((self.v1_train, self.v2_train))
        self.cca = cca
        return None#cca
    
    def train_sdl(self, checkpoint=None, logger=None,lam=0.0001,enable_progress_bar=True ):
        
        
        # 2. SDL
        sdl = DCCA_SDL(self.latent_dims,
                       optimizer=self.optim,
                       N=self.N, 
                       encoders = [self.encoder_1,self.encoder_2],
                       lam=0.0001, 
                       lr=self.sdl_lr,
                       dropout=0.05,
                       objective=self.objective) 

        
        
        #define the trainer
        
        self.trainer = pl.Trainer(#default_root_dir=default_root_dir,
                             logger = logger,
                             max_epochs=self.epochs, #enable early stoppage instead of specifiying num epochs                           
                             log_every_n_steps=1,
                             val_check_interval = 1, #`Trainer(val_check_interval=1)` was configured so validation will run after every batch.
                             
                             callbacks=[
                                checkpoint,
                                 #pl.callbacks.early_stopping.EarlyStopping(monitor="val/l2") # early stopage to reduce overfitting
                             ],
                            enable_progress_bar=enable_progress_bar,
                            auto_lr_find = True
                                )#,
        
        #callbacks=[pl.callbacks.early_stopping.EarlyStopping(monitor="train/sdl")])# early stopage to reduce overfitting
        
        print('\n','-'*20,'\n Training SDL\n','-'*20)
        self.trainer.fit(sdl, self.train_loader,self.val_loader)
        self.sdl = sdl
        return None#sdl
    
    def train_dcca(self):
        
       
        
        # 2. DCCA
        dcca = DCCA(self.latent_dims,
                    optimizer=self.optim,
                    encoders = [self.encoder_1,self.encoder_2],
                    lr=self.dcca_lr,
                    objective=self.objective) 

        #train
        #tb_logger = pl_loggers.TensorBoardLogger(save_dir="pl_logs/dcca")
        trainer = pl.Trainer(default_root_dir="./dcca",max_epochs=self.epochs,log_every_n_steps=1)#,
        
        #callbacks=[pl.callbacks.early_stopping.EarlyStopping(monitor="train/sdl")])# early stopage to reduce overfitting

        print('\n','-'*20,'\n Training DCCA\n','-'*20)
        trainer.fit(dcca, self.train_loader,self.val_loader)
        
        self.dcca = dcca
        
        return None #dcca
    
    
    
    def score(self,model,dataset): 
        """
        model: either 'cca', 'dcca', 'sdl'
        dataset: 'train', 'test', or 'val'
        
        returns: correlation 
        """
       # for cca models 
        #m = eval(model)
        
        #specify data to transform
        if dataset == 'train':
            v1,v2, loader = self.v1_train, self.v2_train, self.train_loader
        elif dataset == 'test':
            v1,v2, loader = self.v1_test, self.v2_test, self.test_loader
        elif dataset == 'val':
            v1,v2, loader = self.v1_val, self.v2_val, self.val_loader
        
        if model == 'cca':
            corr = self.cca.score([v1,v2])
        
        if model == 'sdl':
            corr = self.sdl.score(loader)
        
        elif model == 'dcca':
            corr = self.dcca.score(loader)
       
        return corr
    def update_z_scores(self,dataset, z1,z2,cols):
        #update train df with transformed z scores
            if dataset == 'train':
                
                self.train_df[cols[0]] = [x for x in z1]
                self.train_df[cols[1]] = [x for x in z2]
                
            #update test df
            if dataset == 'test':
                self.test_df[cols[0]] = [x for x in z1]
                self.test_df[cols[1]] = [x for x in z2]
            
            #update val df
            if dataset == 'val':
                self.val_df[cols[0]] = [x for x in z1]
                self.val_df[cols[1]] = [x for x in z2]
            return None
                
        
    
    def transform(self,model,dataset):
        """
        model: either 'cca', 'dcca', 'sdl': of course the model must have been fitted :)
        loader: is similar data loader used to train either sdl/dcca
        dataset: either 'train', 'test', 'val'
        
        returns transformed data; view1,view2
        """
        
        #specify data to transform
        if dataset == 'train':
            v1,v2, loader = self.v1_train, self.v2_train, self.train_loader
        elif dataset == 'test':
            v1,v2, loader = self.v1_test, self.v2_test, self.test_loader
        elif dataset == 'val':
            v1,v2, loader = self.v1_val, self.v2_val, self.val_loader
        
        
        #specify the model for transformation
        if model == 'cca':
            z1,z2 = self.cca.transform([v1,v2]) #transform
            self.update_z_scores(dataset,z1,z2,cols=['cca_z1','cca_z2']) # update the df with z scores            
    
            
        if model == 'sdl':
            z1,z2 = self.sdl.transform(loader)
            self.update_z_scores(dataset,z1,z2,cols=['sdl_z1','sdl_z2'])
                
        
        if model == 'dcca':
            z1,z2 = self.dcca.transform(loader)
            self.update_z_scores(dataset,z1,z2,cols=['dcca_z1','dcca_z2'])
        
        
        return None##z1,z2; can be found in self.<df[model_z]>       
   

In [20]:
# Initiate deepcca objec
Models = DeepCCA(df_all)

# generate data loaders and cca v1,v2
Models.deepcca_dataloaders()

# set up the encoders
Models.deepcca_encoders()

Global seed set to 15


In [26]:

#define the metric to monitor 'like the scorer for gridsearch'

metrics = {"loss": "val/l2"}

#creat a callback that will communicate with ray-tune
ray_tune_callback = TuneReportCallback(metrics, on="validation_end")

# Defining a search space!
config = {
     "optimizer": tune.choice(['sgd', 'adam', 'adamw']),
 
     "lr": tune.loguniform(1e-4, 1e-1),
     "batch_size": tune.choice([128, 128*2, 128*3]),
    "latent_dims": tune.choice([10,20,30]), 
    "dropout": tune.choice([0.05,0.1,0.15,0.2,0.25]),
    "activation":tune.choice([
            nn.LeakyReLU(), # already the default in cca_zoo
            nn.ReLU(),
            nn.Sigmoid(),
            nn.Tanh()
            ])
    }




In [27]:
#set up logger
version =  f'testing_ray_tune'
   
experiment_dir = './sdl_logs'
    
checkpoint = ModelCheckpoint(save_last=True,
                                       monitor="val/l2",
                                       mode = 'min')
    
logger = TensorBoardLogger(save_dir=experiment_dir, 
                                   name='ray_tune',
                                   version = version)

In [28]:
def train_tune(config, epochs=10, gpus=0):
    
    
    #set up logger
    version =  f'testing_ray_tune'
   
    experiment_dir = './sdl_logs'
    
    checkpoint = ModelCheckpoint(save_last=True,
                                       monitor="val/l2",
                                       mode = 'min')
    
    logger = TensorBoardLogger(save_dir=experiment_dir, 
                                   name='ray_tune',
                                   version = version)
    
    Models.latent_dims = config['latent_dims']
    
    Models.activation = config['activation'] 
    
    # set up the encoders with the new params
    Models.deepcca_encoders()

    
    sdl = DCCA_SDL(Models.latent_dims,
                optimizer=config['optimizer'],
                N=Models.N, 
                encoders = [Models.encoder_1,Models.encoder_2],
                lam=0.0001, 
                lr=Models.sdl_lr,
                dropout=0.05,
                objective=Models.objective)
    
    trainer = pl.Trainer(#default_root_dir=default_root_dir,
    logger = logger,
    max_epochs=10, #enable early stoppage instead of specifiying num epochs                           
    log_every_n_steps=1,
    val_check_interval = 1, #`Trainer(val_check_interval=1)` was configured so validation will run after every batch.
    callbacks=[
        checkpoint,
        pl.callbacks.early_stopping.EarlyStopping(monitor="val/l2"), # early stopage to reduce overfitting
        ray_tune_callback
              ],
    enable_progress_bar=True,
    auto_lr_find = True
    )
    
    trainer.fit(sdl,Models.train_loader,Models.val_loader)

    return None






In [84]:

# %load_ext tensorboard
# %tensorboard --logdir=~/ray_results/

In [29]:
tuner = tune.Tuner(
    tune.with_resources(train_tune, {"cpu": 1, "extra_cpu": 4}),
    tune_config=tune.TuneConfig(
        metric='loss', # key name of the metrics dict
        mode="min",
        num_samples=5,
    ),
    param_space=config
)

results = tuner.fit()

0,1
Current time:,2023-01-10 12:20:44
Running for:,00:02:14.17
Memory:,4.3/15.4 GiB

Trial name,status,loc,activation,batch_size,dropout,latent_dims,lr,optimizer,iter,total time (s),loss
train_tune_83467_00000,TERMINATED,127.0.0.1:5956,LeakyReLU(negat_ffa0,384,0.1,30,0.0989989,sgd,10,87.093,1.07836
train_tune_83467_00001,TERMINATED,127.0.0.1:17152,Tanh(),384,0.05,10,0.00338178,adam,4,80.5888,16.4923
train_tune_83467_00002,TERMINATED,127.0.0.1:8588,LeakyReLU(negat_fd90,128,0.15,30,0.00129123,sgd,10,80.3961,1.3501
train_tune_83467_00003,TERMINATED,127.0.0.1:10572,ReLU(),128,0.25,20,0.0290575,adam,10,80.5436,21.0767
train_tune_83467_00004,TERMINATED,127.0.0.1:16212,LeakyReLU(negat_4490,256,0.25,20,0.0121429,adam,10,78.6366,28.8088


[2m[36m(train_tune pid=5956)[0m GPU available: False, used: False
[2m[36m(train_tune pid=5956)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_tune pid=5956)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_tune pid=5956)[0m HPU available: False, using: 0 HPUs
[2m[36m(train_tune pid=5956)[0m `Trainer(val_check_interval=1)` was configured so validation will run after every batch.
[2m[36m(train_tune pid=5956)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=5956)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=5956)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(train_tune pid=5956)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=5956)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=5956)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=5956)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=5956)[0m   rank_zero_deprecation(
[2m[36m(train_tune pi

Sanity Checking: 0it [00:00, ?it/s]
Sanity Checking DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 31.83it/s]


[2m[36m(train_tune pid=17152)[0m GPU available: False, used: False
[2m[36m(train_tune pid=17152)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_tune pid=17152)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_tune pid=17152)[0m HPU available: False, using: 0 HPUs
[2m[36m(train_tune pid=17152)[0m `Trainer(val_check_interval=1)` was configured so validation will run after every batch.
[2m[36m(train_tune pid=17152)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=17152)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=17152)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(train_tune pid=17152)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=17152)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=17152)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=17152)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=17152)[0m   rank_zero_deprecation(
[2m[36m(

Sanity Checking: 0it [00:00, ?it/s] 
Epoch 0:   0%|          | 0/2 [00:00<?, ?it/s]                             
Sanity Checking DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 45.14it/s]


[2m[36m(train_tune pid=8588)[0m GPU available: False, used: False
[2m[36m(train_tune pid=8588)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_tune pid=8588)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_tune pid=8588)[0m HPU available: False, using: 0 HPUs
[2m[36m(train_tune pid=8588)[0m `Trainer(val_check_interval=1)` was configured so validation will run after every batch.
[2m[36m(train_tune pid=8588)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=8588)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=8588)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(train_tune pid=8588)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=8588)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=8588)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=8588)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=8588)[0m   rank_zero_deprecation(
[2m[36m(train_tune pi

Sanity Checking: 0it [00:00, ?it/s]
Epoch 0:   0%|          | 0/2 [00:00<?, ?it/s]                             
Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]
Sanity Checking DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 62.48it/s]


[2m[36m(train_tune pid=10572)[0m GPU available: False, used: False
[2m[36m(train_tune pid=10572)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_tune pid=10572)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_tune pid=10572)[0m HPU available: False, using: 0 HPUs
[2m[36m(train_tune pid=10572)[0m `Trainer(val_check_interval=1)` was configured so validation will run after every batch.
[2m[36m(train_tune pid=10572)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=10572)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=10572)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(train_tune pid=10572)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=10572)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=10572)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=10572)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=10572)[0m   rank_zero_deprecation(
[2m[36m(

Sanity Checking: 0it [00:00, ?it/s] 
Epoch 0:   0%|          | 0/2 [00:00<?, ?it/s]                             
Sanity Checking DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 62.34it/s]


[2m[36m(train_tune pid=16212)[0m GPU available: False, used: False
[2m[36m(train_tune pid=16212)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_tune pid=16212)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_tune pid=16212)[0m HPU available: False, using: 0 HPUs
[2m[36m(train_tune pid=16212)[0m `Trainer(val_check_interval=1)` was configured so validation will run after every batch.
[2m[36m(train_tune pid=16212)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=16212)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=16212)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(train_tune pid=16212)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=16212)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=16212)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=16212)[0m   rank_zero_deprecation(
[2m[36m(train_tune pid=16212)[0m   rank_zero_deprecation(
[2m[36m(

Sanity Checking: 0it [00:00, ?it/s] 
Epoch 0:   0%|          | 0/2 [00:00<?, ?it/s]                             
Sanity Checking DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 95.05it/s]
Epoch 0:   0%|          | 0/2 [00:00<?, ?it/s]                             
Epoch 0:  50%|█████     | 1/2 [00:58<00:58, 58.64s/it, loss=2.03, v_num=tune, train/objective=2.030, train/l2=2.020, train/sdl=56.60]


[2m[36m(train_tune pid=5956)[0m 2023-01-10 12:19:55.870864: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
[2m[36m(train_tune pid=5956)[0m 2023-01-10 12:19:55.871393: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


[2m[36m(train_tune pid=17152)[0m Epoch 0:  50%|█████     | 1/2 [00:58<00:58, 58.43s/it]Epoch 0:  50%|█████     | 1/2 [00:58<00:58, 58.43s/it]Epoch 0:  50%|█████     | 1/2 [00:58<00:58, 58.43s/it, loss=1.92, v_num=tune, train/objective=1.920, train/l2=1.910, train/sdl=20.00]


[2m[36m(train_tune pid=17152)[0m 2023-01-10 12:20:04.614511: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
[2m[36m(train_tune pid=17152)[0m 2023-01-10 12:20:04.614843: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


[2m[36m(train_tune pid=5956)[0m 
[2m[36m(train_tune pid=5956)[0m Validation: 0it [00:00, ?it/s][A


Trial name,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,loss,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_tune_83467_00000,2023-01-10_12-20-09,True,,c03ef941841b433d8062a15a4656e0d4,"0_activation=LeakyReLU_negative_slope_0_01,batch_size=384,dropout=0.1000,latent_dims=30,lr=0.0990,optimizer=sgd",LLL,10,1.07836,127.0.0.1,5956,87.093,0.109351,87.093,1673349609,0,,10,83467_00000,0.0155997
train_tune_83467_00001,2023-01-10_12-20-12,True,,4a5fcbb0abe14d58b1333407a30f3764,"1_activation=Tanh,batch_size=384,dropout=0.0500,latent_dims=10,lr=0.0034,optimizer=adam",LLL,4,16.4923,127.0.0.1,17152,80.5888,0.145987,80.5888,1673349612,0,,4,83467_00001,0.0156171
train_tune_83467_00002,2023-01-10_12-20-21,True,,2478e959c45f44548b8ffd03b8d58fd7,"2_activation=LeakyReLU_negative_slope_0_01,batch_size=128,dropout=0.1500,latent_dims=30,lr=0.0013,optimizer=sgd",LLL,10,1.3501,127.0.0.1,8588,80.3961,0.0942624,80.3961,1673349621,0,,10,83467_00002,0.0138371
train_tune_83467_00003,2023-01-10_12-20-32,True,,e8613ab2c52849c78b51d2620eee3e38,"3_activation=ReLU,batch_size=128,dropout=0.2500,latent_dims=20,lr=0.0291,optimizer=adam",LLL,10,21.0767,127.0.0.1,10572,80.5436,0.117228,80.5436,1673349632,0,,10,83467_00003,0.0156507
train_tune_83467_00004,2023-01-10_12-20-42,True,,953c180ac82b424d8f7b041c31bec8bb,"4_activation=LeakyReLU_negative_slope_0_01,batch_size=256,dropout=0.2500,latent_dims=20,lr=0.0121,optimizer=adam",LLL,10,28.8088,127.0.0.1,16212,78.6366,0.156699,78.6366,1673349642,0,,10,83467_00004,0.0312788


[2m[36m(train_tune pid=5956)[0m 
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 0: 100%|██████████| 2/2 [01:12<00:00, 36.06s/it, loss=2.03, v_num=tune, train/objective=2.030, train/l2=2.020, train/sdl=56.60]
Epoch 0: 100%|██████████| 2/2 [01:12<00:00, 36.17s/it, loss=2.03, v_num=tune, train/objective=2.030, train/l2=2.020, train/sdl=56.60]
Epoch 0: 100%|██████████| 2/2 [01:12<00:00, 36.18s/it, loss=2.03, v_num=tune, train/objective=2.030, train/l2=2.020, train/sdl=56.60]
Epoch 1:  50%|█████     | 1/2 [00:00<00:00, 13.52it/s, loss=1.66, v_num=tune, train/objective=1.290, train/l2=1.290, train/sdl=53.20]
Validation: 0it [00:00, ?it/s][Am 
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 1: 100%|██████████| 2/2 [00:00<00:00, 22.77it/s, loss=1.66, v_num=tune, train/objective=1.290, train/l2=1.290, train/sdl=53.20]
Epoch 1: 100%|██████

[2m[36m(train_tune pid=17152)[0m Epoch 3: 100%|██████████| 2/2 [00:00<00:00, 12.31it/s, loss=1.67, v_num=tune, train/objective=1.140, train/l2=1.130, train/sdl=81.30]


2023-01-10 12:20:14,181	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'activation': LeakyReLU(negative_slope=0.01)}


[2m[36m(train_tune pid=8588)[0m Epoch 0:  50%|█████     | 1/2 [00:59<00:59, 59.57s/it]Epoch 0:  50%|█████     | 1/2 [00:59<00:59, 59.57s/it]
[2m[36m(train_tune pid=8588)[0m Epoch 0:  50%|█████     | 1/2 [00:59<00:59, 59.57s/it, loss=1.97, v_num=tune, train/objective=1.970, train/l2=1.960, train/sdl=62.70]


2023-01-10 12:20:16,749	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'activation': Tanh()}
[2m[36m(train_tune pid=8588)[0m 2023-01-10 12:20:16.763087: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
[2m[36m(train_tune pid=8588)[0m 2023-01-10 12:20:16.763380: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


[2m[36m(train_tune pid=8588)[0m 
Validation: 0it [00:00, ?it/s][Am 
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 0: 100%|██████████| 2/2 [01:03<00:00, 31.83s/it, loss=1.97, v_num=tune, train/objective=1.970, train/l2=1.960, train/sdl=62.70]
Epoch 0: 100%|██████████| 2/2 [01:03<00:00, 31.83s/it, loss=1.97, v_num=tune, train/objective=1.970, train/l2=1.960, train/sdl=62.70]
Epoch 1:  50%|█████     | 1/2 [00:00<00:00, 31.13it/s, loss=1.65, v_num=tune, train/objective=1.330, train/l2=1.330, train/sdl=55.50]
Validation: 0it [00:00, ?it/s][Am 
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 1: 100%|██████████| 2/2 [00:00<00:00, 31.32it/s, loss=1.65, v_num=tune, train/objective=1.330, train/l2=1.330, train/sdl=55.50]
Epoch 1: 100%|██████████| 2/2 [00:00<00:00, 31.32it/s, loss=1.65, v_num=tune, train/objective=1.330, train/l2=1.330, t

2023-01-10 12:20:24,429	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'activation': LeakyReLU(negative_slope=0.01)}


[2m[36m(train_tune pid=10572)[0m Epoch 0:  50%|█████     | 1/2 [00:59<00:59, 59.28s/it]Epoch 0:  50%|█████     | 1/2 [00:59<00:59, 59.28s/it]Epoch 0:  50%|█████     | 1/2 [00:59<00:59, 59.28s/it, loss=1.76, v_num=tune, train/objective=1.760, train/l2=1.750, train/sdl=56.00]


[2m[36m(train_tune pid=10572)[0m 2023-01-10 12:20:27.955088: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
[2m[36m(train_tune pid=10572)[0m 2023-01-10 12:20:27.955402: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


[2m[36m(train_tune pid=10572)[0m 
Validation: 0it [00:00, ?it/s][A0m 
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 0: 100%|██████████| 2/2 [01:02<00:00, 31.38s/it, loss=1.76, v_num=tune, train/objective=1.760, train/l2=1.750, train/sdl=56.00]
Epoch 0: 100%|██████████| 2/2 [01:02<00:00, 31.40s/it, loss=1.76, v_num=tune, train/objective=1.760, train/l2=1.750, train/sdl=56.00]
Epoch 1:   0%|          | 0/2 [00:00<?, ?it/s, loss=1.76, v_num=tune, train/objective=1.760, train/l2=1.750, train/sdl=56.00]        
Epoch 1:  50%|█████     | 1/2 [00:00<00:00, 22.25it/s, loss=1.88, v_num=tune, train/objective=2.000, train/l2=1.990, train/sdl=82.40]
Validation: 0it [00:00, ?it/s][A0m 
[2m[36m(train_tune pid=10572)[0m 
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 1: 100%|██████████| 2/2 [00:00<00:00, 35.08it/s, loss=1.88, v_num=tune, 

2023-01-10 12:20:35,086	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'activation': ReLU()}
[2m[36m(train_tune pid=16212)[0m 2023-01-10 12:20:37.494756: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
[2m[36m(train_tune pid=16212)[0m 2023-01-10 12:20:37.494836: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Epoch 0:  50%|█████     | 1/2 [00:51<00:51, 51.69s/it, loss=1.9, v_num=tune, train/objective=1.900, train/l2=1.900, train/sdl=62.40]
[2m[36m(train_tune pid=16212)[0m 
Validation: 0it [00:00, ?it/s][A0m 
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 0: 100%|██████████| 2/2 [00:55<00:00, 27.75s/it, loss=1.9, v_num=tune, train/objective=1.900, train/l2=1.900, train/sdl=62.40]
Epoch 0: 100%|██████████| 2/2 [00:55<00:00, 27.78s/it, loss=1.9, v_num=tune, train/objective=1.900, train/l2=1.900, train/sdl=62.40]
Epoch 0: 100%|██████████| 2/2 [00:55<00:00, 27.78s/it, loss=1.9, v_num=tune, train/objective=1.900, train/l2=1.900, train/sdl=62.40]
Epoch 1:  50%|█████     | 1/2 [00:00<00:00, 34.79it/s, loss=1.91, v_num=tune, train/objective=1.930, train/l2=1.920, train/sdl=81.40]
Validation: 0it [00:00, ?it/s][A0m 
[2m[36m(train_tune pid=16212)[0m 
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation Dat

2023-01-10 12:20:44,959	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'activation': LeakyReLU(negative_slope=0.01)}
2023-01-10 12:20:45,116	INFO tune.py:762 -- Total run time: 134.43 seconds (134.16 seconds for the tuning loop).


In [31]:
results.get_best_result().config

{'optimizer': 'sgd',
 'lr': 0.09899886332345004,
 'batch_size': 384,
 'latent_dims': 30,
 'dropout': 0.1,
 'activation': LeakyReLU(negative_slope=0.01)}

In [13]:
#another way of doing the same step above

# from functools import partial
# sdl_ray = tune.run(
#  partial(train_tune,gpus=4),config=config, num_samples=5, metric="val/l2", mode='min'
#  )

In [14]:
#sdl_ray.get_best_config("loss", 'min')

In [None]:
### END of ray-tune-test ###

In [4]:
class Files:
    def __init__(self, filename):
        self.file = filename
        
    def write_to_file(self, data):
        with open(self.file, 'wb') as f:
            pickle.dump(data, f) 
        return None
    
    def load_pickle(self):
        data = pd.read_pickle(self.file)
        return data
    
    def load_csv(self, sep, usecols=None):
        data = pd.read_csv(self.file, sep=sep, usecols=usecols)
        return data
        
     

In [5]:
tanis = Files('./GNPS_15_12_2021_pos_tanimoto_scores.pickle').load_pickle()

In [6]:
tanis.head()

inchikey14,LFTLOKWAGJYHHR,BQDXDGDOYPUUOD,VEPUCZUJLKAVNM,PXPSEALQIQRPQY,HDZVRBPBPCZCJG,SXJIZQPZESTWLD,JDOFZOKGCYYUER,WGTCMJBJRPKENJ,FCCDDURTIIUXBY,FDLLEBFMOIHMNM,...,RJAHLSXSRQXGGI,VKJTXCWIQDBMLE,NFIHKFSODJJLGC,NHLBOKNHQIEJIH,QABASLXUKXNHMC,XGVJWXAYKUHDOO,MNKNQKOOKLVXDB,CQKNELOTFUSOTP,MHCYVCDXRQGUFW,NMCMVEXMLSARCJ
inchikey14,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LFTLOKWAGJYHHR,1.0,0.057353,0.042969,0.053269,0.069264,0.055453,0.048193,0.053296,0.052863,0.056204,...,0.049612,0.054762,0.053929,0.060065,0.049683,0.05298,0.049046,0.095833,0.050964,0.050159
BQDXDGDOYPUUOD,0.057353,1.0,0.162866,0.215026,0.242169,0.176221,0.29627,0.195915,0.089888,0.46,...,0.185547,0.430151,0.180851,0.218014,0.321244,0.297297,0.272672,0.147776,0.317369,0.253207
VEPUCZUJLKAVNM,0.042969,0.162866,1.0,0.286316,0.113158,0.15231,0.208607,0.264908,0.082418,0.18662,...,0.15748,0.167925,0.233333,0.152523,0.228311,0.251228,0.226978,0.080844,0.289835,0.231393
PXPSEALQIQRPQY,0.053269,0.215026,0.286316,1.0,0.134357,0.185499,0.342992,0.34486,0.08168,0.257655,...,0.243169,0.248216,0.351724,0.207021,0.299073,0.376868,0.351122,0.073363,0.362462,0.374658
HDZVRBPBPCZCJG,0.069264,0.242169,0.113158,0.134357,1.0,0.145266,0.205817,0.136898,0.102,0.240987,...,0.139401,0.275825,0.136986,0.157074,0.223055,0.192202,0.190231,0.166329,0.196615,0.175919


In [20]:
tanis.loc['LFTLOKWAGJYHHR', 'VEPUCZUJLKAVNM']

0.04296875

In [16]:
pd.head()

inchikey14,LFTLOKWAGJYHHR,BQDXDGDOYPUUOD,VEPUCZUJLKAVNM,PXPSEALQIQRPQY,HDZVRBPBPCZCJG,SXJIZQPZESTWLD,JDOFZOKGCYYUER,WGTCMJBJRPKENJ,FCCDDURTIIUXBY,FDLLEBFMOIHMNM,...,RJAHLSXSRQXGGI,VKJTXCWIQDBMLE,NFIHKFSODJJLGC,NHLBOKNHQIEJIH,QABASLXUKXNHMC,XGVJWXAYKUHDOO,MNKNQKOOKLVXDB,CQKNELOTFUSOTP,MHCYVCDXRQGUFW,NMCMVEXMLSARCJ
inchikey14,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LFTLOKWAGJYHHR,1.0,,,,,,,,,,...,,,,,,,,,,
BQDXDGDOYPUUOD,,1.0,,,,,,,,,...,,,,,,,,,,
VEPUCZUJLKAVNM,,,1.0,,,,,,,,...,,,,,,,,,,
PXPSEALQIQRPQY,,,,1.0,,,,,,,...,,,,,,,,,,
HDZVRBPBPCZCJG,,,,,1.0,,,,,,...,,,,,,,,,,
