In [1]:
%load_ext autoreload
%autoreload 2

import os, json
import numpy as np
import pandas as pd

import torch
import pytorch_lightning as pl

from ablang_train import ABtokenizer, AbLang, TrainingFrame, CallbackHandler, AbDataModule, ablang_parse_args

  from neptune.version import version as neptune_client_version
  from neptune import new as neptune


In [2]:
# SET ARGUMENTS AND HPARAMS
arguments = ablang_parse_args(args="", is_test=True)

# Overrides applied, in this order, on top of the parsed model arguments.
hparam_overrides = {
    "n_encoder_blocks": 1,
    "hidden_embed_size": 64,
    "n_attn_heads": 8,
    "over_sample_data": 0,
    "cpus": 4,
    # alternative data source: '/vols/bitbucket/olsen/processed_oas_data/nov2022/nov2022-paired-all/'
    "data_path": "../data/single_data/",
    "train_batch_size": 20,
    "accumulate_grad_batches": 2,
    "effective_batch_size": 10,
    "eval_batch_size": 10,
    "val_check_interval": 1,
    "use_tkn_dropout": False,
    "learning_rate": 4e-02,  # alternative: 4e-04
    "change_percent": 1,
    "leave_percent": 0,
    "loss_fn": "Focal_Loss",  # alternative: "CrossEntropy_Loss"
    "a_fn": "gelu",
    # Left unset on purpose (disabled in the original cell):
    # "fl_gamma": 5,
    # "out_path": <set path>,
}

for hparam_name, hparam_value in hparam_overrides.items():
    setattr(arguments.model_specific_args, hparam_name, hparam_value)

In [3]:
# Disabled alternative: parse the arguments from a JSON file (via --json_args)
# instead of using the values assigned in the previous cell.
#arguments = ablang_parse_args(args=["--json_args", "../reports/model_arguments/05_paired_C.json"], is_test=True)
#arguments.model_specific_args.train_batch_size = 10
# Display the model arguments that are currently in effect.
arguments.model_specific_args

Namespace(name='Model', json_args='', n_encoder_blocks=1, hidden_embed_size=64, n_attn_heads=8, dropout=0.1, use_tkn_dropout=False, loss_fn='Focal_Loss', a_fn='gelu', fl_gamma=2, mask_percent=0.15, variable_masking=False, mask_technique='random', change_percent=1, leave_percent=0, initializer_range=0.02, layer_norm_eps=1e-12, data_path='../data/single_data/', out_path='/data/iraqbabbler/olsen/Documents/projects/AbLang/model-catalogue/paired-ablang/train_ablang_pair/reports/models', cpus=4, max_fit_batch_size=256, effective_batch_size=10, num_training_steps=1000, warmup_steps=2000, learning_rate=0.04, cdr3_focus=1, weight_decay=0.01, adam_epsilon=1e-08, adam_betas=[0.9, 0.98], seed=42, eval_batch_size=10, over_sample_data=0, accelerator='cpu', devices=1, val_check_interval=1, pad_tkn=21, start_tkn=0, end_tkn=22, sep_tkn=25, mask_tkn=23, vocab_size=26, accumulate_grad_batches=2, train_batch_size=20, max_steps=1000)

In [4]:
def set_neptune_logger(args):
    """
    Initialize Neptune logger

    The Neptune API token is read from the ``NEPTUNE_API_TOKEN`` environment
    variable rather than being stored in the notebook.

    Parameters
    ----------
    args : namespace-like object
        Only ``args.name`` is read; it is passed to the logger as the run name.

    Returns
    -------
    pl.loggers.neptune.NeptuneLogger
        Logger for the "tobiasheol/AbLangTraining" project with model
        checkpoint logging disabled.

    Raises
    ------
    RuntimeError
        If ``NEPTUNE_API_TOKEN`` is unset or empty.
    """

    # SECURITY: a token used to be hardcoded on this line. Anything that was
    # committed should be treated as leaked and revoked in the Neptune UI.
    api_key = os.environ.get("NEPTUNE_API_TOKEN")
    if not api_key:
        raise RuntimeError(
            "Neptune API token not found: set the NEPTUNE_API_TOKEN environment "
            "variable before creating the logger."
        )

    neptune_args = {
        'api_key': api_key,
        'project': "tobiasheol/AbLangTraining",
        'name': args.name,
        'log_model_checkpoints': False,
    }

    return pl.loggers.neptune.NeptuneLogger(**neptune_args)

# Wire the Neptune logger into the trainer arguments and log on every step.
model_args = arguments.model_specific_args
arguments.trainer_args['logger'] = set_neptune_logger(model_args)
arguments.trainer_args['log_every_n_steps'] = 1


# Callback factory; it is called later, when the trainer is built.
callbacks = CallbackHandler(
    outpath=model_args.out_path,
    save_step_frequency=1,
    progress_refresh_rate=0,
)

In [5]:
def enforce_reproducibility(seed=42):
    """Seed torch, numpy and Lightning with `seed` and pin the cuDNN flags.

    Parameters
    ----------
    seed : int, default 42
        Value handed unchanged to every seeding call.

    Notes
    -----
    Atomic operations can still be non-deterministic, because the order of
    parallel operations is not known.
    """
    # cuDNN: request deterministic behaviour and switch off benchmark mode.
    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False

    # Same seeding order as before: torch, then numpy, then Lightning.
    for seed_fn in (torch.manual_seed, np.random.seed, pl.seed_everything):
        seed_fn(seed)
    
# SET SEED - IMPORTANT FOR MULTIPLE GPUS, OTHERWISE GOOD FOR REPRODUCIBILITY
# The seed comes from the parsed model arguments, not from the function default.
enforce_reproducibility(arguments.model_specific_args.seed)

Global seed set to 42


In [6]:
# BUILD THE DATA MODULE AND ITS DATALOADERS
arguments.model_specific_args.mask_technique = 'random'

ablang_dm = AbDataModule(arguments.model_specific_args, ABtokenizer)

# setup() is called by hand and the dataloaders are pulled out explicitly.
# Author's note: handing "abrep" (presumably the data module - confirm) straight
# to the fit function should work, but does not when using multiple GPUs.
ablang_dm.setup('fit')

train, val = ablang_dm.train_dataloader(), ablang_dm.val_dataloader()

In [7]:
# Show the first input of the first training batch, followed by the leading
# `len(first input)` entries of that batch's labels; stop after one batch.
for batch in train:
    first_input = batch['input'][0]
    print(first_input, batch['labels'][:len(first_input)], sep='\n')
    break

tensor([ 0, 12, 15, 10, 20, 15,  6,  7, 12, 13, 12, 20, 15, 10, 13, 12,  4,  7,
         4, 11, 20,  7, 11, 15, 14,  7, 12, 17,  8, 17,  7, 16, 18, 12,  1,  7,
        19, 15,  2, 24, 14, 13, 12,  4, 12, 20,  6, 19,  1, 14, 20, 16, 16, 18,
         5,  6,  7,  9,  4, 16, 18,  6,  5,  7, 15,  4, 12,  5, 17,  8, 16,  7,
         2,  9,  9,  7,  4,  4,  8, 20, 18, 20, 10,  1,  7,  7, 20,  2, 14,  6,
         5,  8, 14,  8, 17, 18, 11, 14,  4, 15,  4,  3, 16,  5, 14,  8,  2, 13,
         9,  5, 18,  3, 12, 10, 12,  8, 20, 15,  8, 12,  7,  9, 22, 25,  0,  5,
        16, 15,  1,  1, 10,  8, 13,  7,  8, 20,  7,  4, 10, 15,  4,  5,  2, 15,
        11, 20,  8, 11,  4, 10,  7, 10,  5, 16,  7,  8, 20, 14,  3, 18, 10, 10,
         4, 13, 12,  4, 14, 13,  4,  4, 20, 16, 18, 14, 14,  7,  7, 20, 10,  7,
        12, 15, 13, 18,  2, 17,  7, 12,  7, 12,  7, 12,  8,  5, 17,  8, 20,  8,
        16,  7, 14, 20, 15, 13,  6,  5, 17, 12,  8, 18, 18, 11,  9, 10, 10,  9,
         7,  9, 19,  8, 17,  4, 14, 12, 

In [8]:
# LOAD MODEL
# TrainingFrame receives the model arguments together with the AbLang and
# ABtokenizer classes (the classes themselves, not instances).
model = TrainingFrame(arguments.model_specific_args, AbLang, ABtokenizer)

# INITIALISE TRAINER
# `callbacks` is the CallbackHandler instance created earlier; calling it
# presumably returns the list of Lightning callbacks - confirm in ablang_train.
trainer = pl.Trainer(**arguments.trainer_args, callbacks=callbacks())

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# TRAIN: fit the model with the `train` dataloader, validating on `val`.
trainer.fit(model, train, val)


  | Name    | Type       | Params
---------------------------------------
0 | loss_fn | Focal_Loss | 0     
1 | ablang  | AbLang     | 56.1 K
---------------------------------------
56.1 K    Trainable params
4         Non-trainable params
56.1 K    Total params
0.224     Total estimated model params size (MB)
  self._run_instance = neptune.init_run(**self._neptune_init_args)


https://app.neptune.ai/tobiasheol/AbLangTraining/e/ABLANG-552


/data/iraqbabbler/olsen/miniconda3/envs/ablang-train/lib/python3.9/site-packages/pytorch_lightning/loggers/neptune.py:402: NeptuneUnsupportedType: You're attempting to log a type that is not directly supported by Neptune (<class 'list'>).
        Convert the value to a supported type, such as a string or float, or use stringify_unsupported(obj)
        for dictionaries or collections that contain unsupported values.
        For more, see https://docs.neptune.ai/help/value_of_unsupported_type
  self.run[parameters_key] = params


In [None]:
# Reference sequence fed to the trained model in the cells below.
# NOTE(review): '<', '>' and '|' look like start / end / separator markers
# understood by ABtokenizer - confirm against the tokenizer vocabulary.
seq_to_restore = '<EVQLVESGPGLVQPGKSLRLSCVASGFTFSGYGMHWVRQAPGKGLEWIALIIYDESNKYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVFYCAKVKFYDPTAPNDYWGQGTLVTVSS>|'

In [None]:
# Take the bare network and the tokenizer from the Lightning training wrapper.
model1 = model.ablang
tokenizer1 = model.tokenizer

# Put the network in evaluation mode before predicting. torch.no_grad() only
# disables gradient tracking; without eval() any dropout layers stay active
# (the config shows dropout=0.1) and repeated runs give different predictions.
model1.eval()

with torch.no_grad():
    tokenPreds = model1(tokenizer1([seq_to_restore], pad=True, w_extra_tkns=False, device='cpu'))

# Per position: the highest softmax probability (.values) and its token id (.indices).
tokenMAX = torch.max(torch.softmax(tokenPreds, dim=-1), -1)

# Most likely token at every position, decoded back to a string.
aaPreds = tokenizer1(tokenMAX.indices, mode='decode', device='cpu')

# Tensor filled with the placeholder token id used for low-confidence positions.
# NOTE(review): the printed arguments list id 21 as pad_tkn, while the decode
# below splits on '<unk>' - confirm which string the tokenizer emits for 21.
unkMatrix = torch.full(tokenMAX.values.shape, 21, dtype=torch.long, device='cpu')

# Same decode, but positions whose top probability is <= 0.5 are replaced by the
# placeholder id first; every '<unk>' in the decoded string is then shown as '-'.
aaPreds50 = ['-'.join(tokenizer1(torch.where(tokenMAX.values <= .5, unkMatrix, tokenMAX.indices).detach(), mode='decode')[0].split('<unk>'))]

In [None]:
# Inspect the raw model output (before softmax) at index [0][2].
# NOTE(review): presumably first sequence, third position - confirm the output layout.
tokenPreds[0][2]

In [None]:
# Recompute the per-position softmax maximum and display the winning token ids.
tokenMAX = torch.softmax(tokenPreds, dim=-1).max(dim=-1)
tokenMAX.indices

In [None]:
# Masked version of the reference sequence; '*' marks the positions the model
# is asked to fill in.
# Disabled alternative (it was assigned and then immediately overwritten, so it
# never took effect; note it has no leading '<' or trailing '>|'):
# masked_to_restore = '***LVESGPGLVQPGKSLRLSCVASGFTFSGYGMHWVRQAPGKGLEWIALIIYDESNKYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVFYCAKVKFYDPTAPND************'
masked_to_restore = '<********************************GMHWVRQAPGKGLEWIALI*********ADSVKGRFTISRDNSKNTLYLQMSSLRA*********************************>|'

In [None]:
# Make sure the network is in evaluation mode: torch.no_grad() alone leaves any
# dropout layers active, which would make the restored sequence vary per run.
model1.eval()

with torch.no_grad():
    tokenPreds = model1(tokenizer1([masked_to_restore], pad=True, w_extra_tkns=False, device='cpu'))
    # Per position: highest softmax probability (.values) and its token id (.indices).
    tokenMAX = torch.max(torch.softmax(tokenPreds, dim=-1), -1)
    masked_aaPreds = tokenizer1(tokenMAX.indices, mode='decode')
# Display the decoded prediction for the masked sequence.
masked_aaPreds

In [None]:
# One per line: the full prediction, the confidence-filtered prediction
# and the original sequence, for side-by-side comparison.
for shown_sequence in (aaPreds[0], aaPreds50[0], seq_to_restore):
    print(shown_sequence)