In [1]:
%load_ext autoreload
%autoreload 2

import sys, glob, os, gzip
sys.path.insert(1, os.path.join(sys.path[0], '..'))

import numpy as np
import pandas as pd

import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers.neptune import NeptuneLogger

from ablang_train import ABtokenizer, AbLang, trainingframe
from ablang_train.train_utils import callback_handler, datamodule, arghandler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def set_neptune_logger(args):
    """
    Initialize Neptune logger
    """

    neptune_args = { 'api_key':"eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI0N2Y2YmIxMS02OWM3LTRhY2MtYTQxOC0xODU5N2E0ODFmMzEifQ==",
    'project':"tobiasheol/AbLangTraining",
    'name':args.name,
    'log_model_checkpoints':False,
    }

    return NeptuneLogger(**neptune_args)

In [3]:
# SET ARGUMENTS AND HPARAMS
arguments = arghandler.parse_args(args="", is_test=True)
logger = set_neptune_logger(arguments.model_specific_args)

arguments.model_specific_args.n_encoder_blocks = 1
arguments.model_specific_args.hidden_embed_size = 64
arguments.model_specific_args.n_attn_heads = 8
arguments.model_specific_args.over_sample_data = 1
arguments.model_specific_args.cpus = 4
arguments.model_specific_args.data_path = '../data/single_data/'
arguments.model_specific_args.train_batch_size = 1000
arguments.model_specific_args.effective_batch_size = 1000
arguments.model_specific_args.eval_batch_size = 1
arguments.model_specific_args.val_check_interval = 1
arguments.model_specific_args.use_tkn_dropout = False
arguments.model_specific_args.learning_rate = 4e-02 #4e-04
#arguments.model_specific_args.out_path = #set path

arguments.trainer_args['logger'] = logger
arguments.trainer_args['log_every_n_steps'] = 1


callbacks = callback_handler.CallbackHandler(save_step_frequency=1, 
                                progress_refresh_rate=0, 
                                outpath=arguments.model_specific_args.out_path)

In [4]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    #
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    np.random.seed(seed)
    
    pl.seed_everything(seed)
    
# SET SEED - IMPORTANT FOR MULTIPLE GPUS, OTHERWISE GOOD FOR REPRODUCIBILITY
enforce_reproducibility(arguments.model_specific_args.seed)

Global seed set to 42


In [5]:
# LOAD AND INITIATE DATA
arguments.model_specific_args.mask_technique = 'random'

abrep_dm = datamodule.MyDataModule(arguments.model_specific_args, ABtokenizer) 
# You are supposed to just be able to add abrep to the fit function, but it doesn't work when using multiple GPUs
abrep_dm.setup('fit')

train = abrep_dm.train_dataloader()
val = abrep_dm.val_dataloader()

In [6]:
for batch in train:
    
    print(batch['input'][0])
    print(batch['labels'][:len(batch['input'][0])])
    break

tensor([ 0,  6, 15, 23, 23, 15,  6,  7, 12, 13, 12, 23, 23, 10, 13, 12,  4,  7,
        20, 23, 20,  7, 23, 15, 14, 23, 23, 17,  8, 18, 23, 12, 18, 23, 23,  3,
        19, 15, 23, 10, 14, 13, 12, 23, 12, 23,  6, 23, 16, 14, 20, 16, 16, 18,
         5,  6,  7, 23, 23, 23, 18, 14,  5,  7, 15,  4, 12,  2, 17,  8, 16,  7,
         2,  5,  9,  7,  4, 23,  8, 20, 18, 20, 23,  1,  7,  7, 20,  2, 14, 19,
        23,  8, 14, 15, 17, 18, 11, 14,  4, 15,  4, 23, 23,  5, 11,  8, 23, 13,
         9,  5, 18, 19, 12, 10, 12,  8, 23, 23,  8, 15, 23, 23, 22, 22, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 

In [7]:
# LOAD MODEL
model = trainingframe.TrainingFrame(arguments.model_specific_args, AbLang, ABtokenizer)

# INITIALISE TRAINER
trainer = pl.Trainer(**arguments.trainer_args, callbacks=callbacks())

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1)` was configured so validation will run after every batch.


In [8]:
trainer.fit(model, train, val)


  | Name    | Type             | Params
---------------------------------------------
0 | loss_fn | CrossEntropyLoss | 0     
1 | ablang  | AbLang           | 56.0 K
---------------------------------------------
56.0 K    Trainable params
0         Non-trainable params
56.0 K    Total params
0.224     Total estimated model params size (MB)


https://app.neptune.ai/tobiasheol/AbLangTraining/e/ABLANG-363


Info (NVML): NVML Shared Library Not Found. GPU usage metrics may not be reported. For more information, see https://docs.neptune.ai/help/nvml_error/


Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


  self._container.define(self._path, value, wait)
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [9]:
seq_to_restore = 'EVQLVESGPGLVQPGKSLRLSCVASGFTFSGYGMHWVRQAPGKGLEWIALIIYDESNKYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVFYCAKVKFYDPTAPNDYWGQGTLVTVSS'

In [10]:
model1 = model.ablang
tokenizer1 = model.tokenizer

with torch.no_grad():
    tokenPreds = model1(tokenizer1([seq_to_restore], pad=True, device='cpu'))

tokenMAX = torch.max(torch.nn.Softmax(dim=-1)(tokenPreds), -1)

aaPreds = tokenizer1(tokenMAX[1], encode=False)

unkMatrix = torch.zeros(tokenMAX[0].shape, dtype=torch.long, device='cpu') + 21

aaPreds50 = ['-'.join(tokenizer1(torch.where(tokenMAX[0]<=.5, unkMatrix, tokenMAX[1]).detach(), encode=False)[0].split('<unk>'))]

In [11]:
tokenPreds[0][2]

tensor([ 1.0584, -1.0601, -6.2503, -5.6491,  1.1684, -1.8348,  0.4710, -2.8998,
        -1.2980, -5.3902, -0.3479, -2.5714, -0.6055, -1.2087,  2.3027,  4.5818,
         6.7706, -1.0007, -3.2951, -3.4409,  2.5432, -1.5018,  2.3688, -1.1355])

In [12]:
tokenMAX = torch.max(torch.nn.Softmax(dim=-1)(tokenPreds), -1)
tokenMAX[1]

tensor([[ 0,  0, 16,  0, 16, 16, 18, 13, 16,  0, 16,  5,  4,  4,  0, 16,  0,  0,
         14, 22, 20,  4, 15, 16,  7,  4, 16,  2, 14, 17,  4, 15,  4, 16,  7,  3,
          7,  4, 22,  0,  7,  0,  7,  0,  7, 20,  9, 10,  7,  4,  9, 13, 13,  4,
         16, 16,  0, 15,  0,  4,  4,  7, 16, 13, 16,  0,  7,  8,  9, 14, 13,  4,
         20, 16, 16, 16,  0,  7, 14,  8,  4, 14,  0,  7,  4,  4, 14,  8,  7, 13,
         16, 14,  7,  4,  4,  4,  4,  4,  0, 15,  4,  4,  4, 16,  0, 14,  7,  4,
          7, 16,  4,  7,  7,  0, 16, 14,  9, 16, 14, 15,  0,  0, 22]])

In [13]:
masked_to_restore = '***LVESGPGLVQPGKSLRLSCVASGFTFSGYGMHWVRQAPGKGLEWIALIIYDESNKYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVFYCAKVKFYDPTAPND************'

In [14]:
with torch.no_grad():
    tokenPreds = model1(tokenizer1([masked_to_restore], pad=True, device='cpu'))
    tokenMAX = torch.max(torch.nn.Softmax(dim=-1)(tokenPreds), -1)
    masked_aaPreds = tokenizer1(tokenMAX[1], encode=False)
masked_aaPreds

['<EVQII<PI<III<<I<II>PIVI<II>I>II>IISIIL<<<><ILIQP<LPPIII<EI<<<IGIII>>IPI>III<V>T>A<IIIA><<I><I>>K<<I<>>I<>><IIYWGQGDLVTVSS>']

In [15]:
print(aaPreds[0][1:-1])
print(aaPreds50[0][1:-1])
print(seq_to_restore)

<I<IIYPI<IDKK<I<<A>LKVISKIRAFKVKISHSK><S<S<SLNQSKNPPKII<V<KKSIPI<STNAPKLIII<SATKA<SKKATSPIASKKKKK<VKKKI<ASKSIKSS<IANIAV<<
-I--I-PI<ID--<I<-A>LKVISKI-AFK-K-SHSK-<-<S<----S-NPPKII--<KKSI-I<S--APKLII-<-A-KA<S--A-S-IASKKKK-<-KK-I<ASK-IKSS<IANIAV<<
EVQLVESGPGLVQPGKSLRLSCVASGFTFSGYGMHWVRQAPGKGLEWIALIIYDESNKYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVFYCAKVKFYDPTAPNDYWGQGTLVTVSS
