In [42]:
import pytorch_lightning as pl
from matplotlib import pyplot as plt

from cca_zoo.deepmodels import (
    DCCA,
    DCCA_NOI,
    DCCA_SDL,
    BarlowTwins,
    get_dataloaders,
    
)
from cca_zoo.deepmodels.utils import architectures, objectives
from cca_zoo.plotting import pairplot_label
from cca_zoo.data import CCA_Dataset
from cca_zoo.models import CCA

from sklearn.model_selection import train_test_split
import os
import pandas as pd
import numpy as np


os.chdir('../raw_data')


In [3]:
os.listdir()

['ALL_GNPS.json',
 'ALL_GNPS_15_12_2021_positive_annotated.pickle',
 'ALL_GNPS_15_12_2021_positive_annotated_classifications.txt',
 'all_ms2ds_embedding.pickle',
 'all_spec_embedding.pickle',
 'df_ALL_GNPS_15_12_2021_positive_annotated_embeddings_class.pickle',
 'df_classes_all_embeddings.pickle',
 'df_classes_max3_embeddings.pickle',
 'df_classes_max5_embeddings.pickle',
 'df_classes_unique_embeddings.pickle',
 'GNPS_15_12_2021_ms2ds_embeddings.pickle',
 'lightning_logs',
 'max3_mol2vec_embedding.pickle',
 'max3_ms2ds_embedding.pickle',
 'max3_spec_embedding.pickle',
 'max5_mol2vec_embedding.pickle',
 'max5_ms2ds_embedding.pickle',
 'max5_spec_embedding.pickle',
 'model_300dim.pkl',
 'mol2vec_embedding.pickle',
 'ms2ds_embeddings_unique_GNPS_15_12_2021.pickle',
 'ms2ds_model_GNPS_15_12_2021.hdf5',
 'ms2ds_pred_euc',
 'plot_validation.py',
 'spec2vec_embeddings_unique_GNPS_15_12_2021.pickle',
 'spec2vec_model_GNPS_15_12_2021.model',
 'spec2vec_model_GNPS_15_12_2021.model.syn1neg.npy',


#### Functions to load and or write to pickle files

In [2]:
class Files:
    def __init__(self, filename):
        self.file = filename
        
    def write_to_file(self, data):
        with open(self.file, 'wb') as f:
            pickle.dump(data, f) 
        return None
    
    def load_pickle(self):
        data = pd.read_pickle(self.file)
        return data
    
    def load_csv(self, sep, usecols=None):
        data = pd.read_csv(self.file, sep=sep, usecols=usecols)
        return data
        
        
    

In [53]:
# load df with all info
path = './df_classes_max3_embeddings.pickle'
df_all = Files(path).load_pickle()[:20000] #just a subset for scripting
df_all.shape

(20000, 8)

In [54]:
#split into train and test dataset
train_df, test_df = \
train_test_split(df_all, test_size=0.3, random_state=42)#, stratify=df_all['cf_class'])

#Split train dataset into train and validation set
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

#extract the 2 view, v1 == spectra embeddings, v2==structure embeddings
v1_train, v1_test = np.array([x for x in train_df['ms2ds']]), np.array([x for x in test_df['ms2ds']])
v2_train, v2_test = np.array([x for x in train_df['mol2vec']]), np.array([x for x in test_df['mol2vec']])

v1_val, v2_val = np.array([x for x in val_df['ms2ds']]), np.array([x for x in val_df['mol2vec']])

In [65]:
# What does CCA dataset do????
train_dataset = CCA_Dataset([v1_train, v2_train])
test_dataset = CCA_Dataset([v1_test, v2_test])
val_dataset = CCA_Dataset([v1_val, v2_val])

train_loader, val_loader = get_dataloaders(train_dataset, val_dataset, batch_size=128, num_workers=6,
                                              drop_last=False)
test_loader = get_dataloaders(test_dataset,batch_size=128, num_workers=6,
                                              drop_last=False)

In [62]:
#params
N = len(train_dataset)
latent_dims = 10
epochs = 100
#cca = CCA(latent_dims=latent_dims).fit((X, Y))

# define encoders
encoder_1 = architectures.Encoder(latent_dims=10, feature_size=v1_train.shape[1])
                                     
encoder_2 = architectures.Encoder(latent_dims=10, feature_size=v2_train.shape[1])

# define variant algorithmns
# 1. DCCA
dcca = DCCA(
        latent_dims=latent_dims,
        encoders=[encoder_1, encoder_2],
        objective=objectives.CCA,
        )

# 2. SDL
sdl = DCCA_SDL(latent_dims,
                N=N, encoders=[encoder_1, encoder_2],
                lam=1e-2, lr=1e-3)

# Define the trainer
trainer = pl.Trainer(
        max_epochs=epochs, log_every_n_steps=1)

# train
trainer.fit(sdl, train_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")

  | Name     | Type       | Params
----------------------------------------
0 | encoders | ModuleList | 66.8 K
1 | mse      | MSELoss    | 0     
2 | bns      | ModuleList | 0     
----------------------------------------
66.8 K    Trainable params
0         Non-trainable params
66.8 K    Total params
0.267     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [67]:
dcca.score(test_loader)

array([0.78250995, 0.72444134, 0.67104867, 0.50718818, 0.36996513,
       0.25741073, 0.23203247, 0.1247855 , 0.09177555, 0.05677649])

In [68]:
#score test dataset
# check the validation loop warning