In [1]:
%load_ext autoreload
%autoreload 2

import os, json
import numpy as np
import pandas as pd

import torch
import pytorch_lightning as pl

from ablang_train import ABtokenizer, AbLang, TrainingFrame, CallbackHandler, AbDataModule, ablang_parse_args

  from neptune.version import version as neptune_client_version
  from neptune import new as neptune


In [2]:
# SET ARGUMENTS AND HPARAMS
arguments = ablang_parse_args(args=["--json_args", "../reports/model_arguments/00_test_heavy.json"])
arguments.model_specific_args.data_path = '../data/single_data/'
arguments.model_specific_args.eval_path = '/vols/bitbucket/olsen/processed_oas_data/nov2022/nov2022-paired-all/'
arguments.model_specific_args.use_moe = True
arguments.model_specific_args

Namespace(name='test', json_args='../reports/model_arguments/00_test_heavy.json', n_encoder_blocks=1, hidden_embed_size=64, n_attn_heads=8, dropout=0.1, use_tkn_dropout=False, loss_fn='CrossEntropy_Loss', a_fn='gelu', fl_gamma=2, use_moe=True, mask_percent=0.15, variable_masking=False, mask_technique='random', change_percent=0.1, leave_percent=0.1, initializer_range=0.02, layer_norm_eps=1e-12, data_path='../data/single_data/', out_path='/data/iraqbabbler/olsen/Documents/projects/AbLang/model-catalogue/paired-ablang/train_ablang_pair/reports/models', eval_path='/vols/bitbucket/olsen/processed_oas_data/nov2022/nov2022-paired-all/', cpus=1, max_fit_batch_size=10, effective_batch_size=20, num_training_steps=1000, warmup_steps=2000, learning_rate=0.0004, cdr3_focus=1, weight_decay=0.01, adam_epsilon=1e-08, adam_betas=[0.9, 0.98], seed=42, eval_batch_size=100, over_sample_data=0, accelerator='cpu', devices=1, precision='32-true', val_check_interval=200, log_every_n_steps=200, enable_checkpoi

In [3]:
callbacks = CallbackHandler(
    save_step_frequency=arguments.model_specific_args.log_every_n_steps, 
    progress_refresh_rate=0, 
    outpath=arguments.model_specific_args.out_path
)

In [4]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    #
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    np.random.seed(seed)
    
    pl.seed_everything(seed)
    
# SET SEED - IMPORTANT FOR MULTIPLE GPUS, OTHERWISE GOOD FOR REPRODUCIBILITY
enforce_reproducibility(arguments.model_specific_args.seed)

Global seed set to 42


In [5]:
# LOAD AND INITIATE DATA
arguments.model_specific_args.mask_technique = 'random'

ablang_dm = AbDataModule(arguments.model_specific_args, ABtokenizer) 
# You are supposed to just be able to add abrep to the fit function, but it doesn't work when using multiple GPUs
ablang_dm.setup('fit')

train = ablang_dm.train_dataloader()
val = ablang_dm.val_dataloader()

In [6]:
for batch in train:
    
    print(batch['input'][0])
    print(batch['labels'][:len(batch['input'][0])])
    break

tensor([ 0, 23, 15, 10, 20, 15,  6,  7, 12, 13, 12, 20, 15, 10, 13, 12,  4,  7,
        23,  2, 20,  7, 11, 15, 14,  7, 12, 17,  8, 17,  7, 23, 18, 12,  1, 23,
        19, 15,  2, 24, 14, 13, 12,  4, 12, 20,  6, 19,  1, 14, 20, 16, 16, 18,
         5,  6,  7,  9,  4, 23, 18, 23,  5,  7, 15,  4, 12, 23, 17,  8, 16,  7,
         2,  5,  9,  7,  4, 23,  8, 20, 18, 20, 10,  1,  7,  7, 20,  2, 14,  6,
         5,  8, 14, 23, 17, 18, 11, 14,  4, 15,  4, 23, 23,  5, 23,  8, 23, 13,
         9,  5, 18, 19, 12, 10, 12,  8, 20, 15,  8, 23,  7, 23, 22, 25,  0,  5,
        16, 15,  1, 23, 10,  8, 13,  7,  8, 20,  7, 23, 23, 15, 23,  5,  2, 15,
        23, 20,  8, 11,  4, 23,  7, 10,  5, 16,  7, 23, 20, 14,  3, 18, 10, 10,
         4, 13, 12,  4, 14, 13,  4,  4, 20, 16, 18, 14, 14,  7,  7, 20, 10,  7,
        12, 15, 13, 23,  2, 17,  7, 12,  7, 12,  7, 12,  8,  5, 17,  8, 20,  8,
        16,  7, 23, 20, 15, 13,  6,  5, 17, 23,  8, 18, 18, 11, 23, 10, 10,  9,
         7,  9, 19,  8, 17, 23, 23, 12, 

In [7]:
# LOAD MODEL
model = TrainingFrame(arguments.model_specific_args, AbLang, ABtokenizer)

# INITIALISE TRAINER
trainer = pl.Trainer(**arguments.trainer_args, callbacks=callbacks())

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [8]:
%%time
trainer.fit(model, train, val)


  | Name    | Type             | Params
---------------------------------------------
0 | loss_fn | CrossEntropyLoss | 0     
1 | ablang  | AbLang           | 548 K 
---------------------------------------------
548 K     Trainable params
4         Non-trainable params
548 K     Total params
2.193     Total estimated model params size (MB)
  self._run_instance = neptune.init_run(**self._neptune_init_args)


https://app.neptune.ai/tobiasheol/AbLangTraining/e/ABLANG-613


/data/iraqbabbler/olsen/miniconda3/envs/ablang-train/lib/python3.9/site-packages/pytorch_lightning/loggers/neptune.py:402: NeptuneUnsupportedType: You're attempting to log a type that is not directly supported by Neptune (<class 'list'>).
        Convert the value to a supported type, such as a string or float, or use stringify_unsupported(obj)
        for dictionaries or collections that contain unsupported values.
        For more, see https://docs.neptune.ai/help/value_of_unsupported_type
  self.run[parameters_key] = params
/data/iraqbabbler/olsen/miniconda3/envs/ablang-train/lib/python3.9/site-packages/pytorch_lightning/loggers/neptune.py:402: NeptuneUnsupportedType: You're attempting to log a type that is not directly supported by Neptune (<class 'NoneType'>).
        Convert the value to a supported type, such as a string or float, or use stringify_unsupported(obj)
        for dictionaries or collections that contain unsupported values.
        For more, see https://docs.neptune.

CPU times: user 18min 34s, sys: 1min 52s, total: 20min 26s
Wall time: 2min 35s


In [10]:
trainer.accumulate_grad_batches

2

In [11]:
seq_to_restore = '<EVQLVESGPGLVQPGKSLRLSCVASGFTFSGYGMHWVRQAPGKGLEWIALIIYDESNKYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVFYCAKVKFYDPTAPNDYWGQGTLVTVSS>|'

In [12]:
model1 = model.ablang
tokenizer1 = model.tokenizer

with torch.no_grad():
    tokenPreds = model1(tokenizer1([seq_to_restore], pad=True, w_extra_tkns=False, device='cpu'))

tokenMAX = torch.max(torch.nn.Softmax(dim=-1)(tokenPreds), -1)

aaPreds = tokenizer1(tokenMAX.indices, mode='decode', device='cpu')

unkMatrix = torch.zeros(tokenMAX[0].shape, dtype=torch.long, device='cpu') + 21

aaPreds50 = ['-'.join(tokenizer1(torch.where(tokenMAX[0]<=.5, unkMatrix, tokenMAX[1]).detach(), mode='decode')[0].split('<unk>'))]

In [13]:
tokenPreds[0][2]

tensor([-3.3180, -1.9939,  0.7972, -1.5904,  0.3748,  0.2131, -0.4509,  0.8056,
         0.3215, -0.0354,  0.7362, -1.5872, -0.1760,  0.2939,  0.5680,  3.2837,
         0.2128,  0.3843,  0.3578, -1.6928,  0.9091, -2.1137, -2.8008, -1.6972,
        -3.2703, -3.3106])

In [14]:
tokenMAX = torch.max(torch.nn.Softmax(dim=-1)(tokenPreds), -1)
tokenMAX[1]

tensor([[ 0,  8, 15, 10, 20, 15,  8,  7, 12, 13, 12, 20, 15, 10, 20, 12,  4,  7,
         20,  2, 20,  7, 11, 15, 14,  7, 12, 17,  8, 17,  7, 12, 14, 12,  1, 20,
          7, 15,  2, 10, 14, 13, 12,  4, 10, 20,  6, 19, 16, 14, 20, 16, 16, 18,
          5,  8,  7,  9,  4,  7,  7, 14,  5,  7, 15,  4, 12,  8, 17,  8, 16,  7,
          8,  5,  9,  7,  4,  9,  8, 20, 18, 20,  8,  1,  7,  7, 10,  8, 14,  6,
          5,  8, 14, 15, 17, 10, 11, 14,  4, 15,  4, 17,  7,  5, 13,  8, 14, 13,
          9,  5, 10, 19, 12,  8, 10,  8, 10, 14,  8, 15,  7,  7, 22, 25]])

In [15]:
masked_to_restore = '***LVESGPGLVQPGKSLRLSCVASGFTFSGYGMHWVRQAPGKGLEWIALIIYDESNKYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVFYCAKVKFYDPTAPND************'
masked_to_restore = '<********************************GMHWVRQAPGKGLEWIALI*********ADSVKGRFTISRDNSKNTLYLQMSSLRA*********************************>|'

In [16]:
with torch.no_grad():
    tokenPreds = model1(tokenizer1([masked_to_restore], pad=True, w_extra_tkns=False, device='cpu'))
    tokenMAX = torch.max(torch.nn.Softmax(dim=-1)(tokenPreds), -1)
    masked_aaPreds = tokenizer1(tokenMAX[1], mode='decode')
masked_aaPreds

['GGGGGGGGGGGGGGGGGVVGGGGTVGGGTTTGGGGGGSGGGGGGGGGGGGGGGGGGGGGGGGSVGGGVGGSGGGSGGGGGGGGSSGGGGGGGTGGGGGTGGGGGTGGGGGGGGSGGGGGGG>']

In [17]:
print(aaPreds[0])
print(aaPreds50[0])
print(seq_to_restore)

TVQLVTSGPGLVQLGKSLRLSCVASGFTFSGAGMLSVRQAPGKQLEWIALIIYDTSNKSSADSVKGTFTISTDNSKNTLYLTMSSQTAEDTAVFQCAKVKFSDPTAPNDQWGTQTQATVSS>
-V-LV-S-PGLVQ-GKSLRLS-VAS--T-S------VRQAP-K-L----LI--D-S-K---DSVK---T---D-SK--L-L--------D--V----KVK--DP--P-D---------V-->
<EVQLVESGPGLVQPGKSLRLSCVASGFTFSGYGMHWVRQAPGKGLEWIALIIYDESNKYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVFYCAKVKFYDPTAPNDYWGQGTLVTVSS>|
