In [1]:
import torch
from omegaconf.omegaconf import OmegaConf
from pytorch_lightning import Trainer
from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

In [2]:
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank
from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPDDPPlugin, NLPSaveRestoreConnector
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import StatelessTimer, exp_manager
from nemo.utils.config_utils import update_model_config

    


In [3]:
# Load the stock Megatron GPT config, then override it in place for this
# prompt-tuning run. Each override below touches a distinct key.
#
# NOTE(review): the outputs recorded further down in this notebook report
# max_steps=1000, constant_steps=600 and consumed_samples growing by 4 per
# step, which do not match the values set here -- the saved outputs look like
# they come from an earlier run with different settings. Re-run to refresh.
cfg = OmegaConf.load("conf/megatron_gpt_config.yaml")

# Path of the .nemo checkpoint handed to MegatronGPTModel.restore_from later on.
cfg.restore_from_path = 'megatron_gpt.nemo'

# Trainer settings (this whole section is splatted into Trainer(**cfg.trainer)).
cfg.trainer.gpus = 1
cfg.trainer.max_steps = 3000

# Model settings.
cfg.model.encoder_seq_length = 2048

# Soft-prompt settings.
# prompt_length is presumably the number of soft-prompt tokens -- TODO confirm.
cfg.model.use_soft_prompts = True
cfg.model.prompt_length = 10

# Prompt-tuning data: JSON files for each split; data_prefix is cleared.
cfg.model.data.data_prefix = None
cfg.model.data.train_ds = 'prompt_tuning_ner_train.json'
cfg.model.data.valid_ds = 'prompt_tuning_ner_val.json'
cfg.model.data.test_ds = 'prompt_tuning_ner_test.json'
cfg.model.data.batch_size = 32

# Optimizer and learning-rate schedule.
cfg.model.optim.lr = 2e-4
cfg.model.optim.sched.min_lr = 2e-6
cfg.model.optim.sched.warmup_steps = 100
cfg.model.optim.sched.constant_steps = 1000

In [4]:
# Build the Lightning plugin list; NLPDDPPlugin is always installed, sized by
# the number of nodes given in the trainer config.
plugins = [NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)]

if cfg.get('cluster_type', None) == 'BCP':
    # TorchElasticEnvironment was used here without being imported anywhere in
    # the notebook, so this branch raised NameError. It is imported locally
    # because it is only needed when running on a BCP cluster.
    from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

    plugins.append(TorchElasticEnvironment())

# Every key under cfg.trainer is forwarded to Trainer as a keyword argument.
trainer = Trainer(plugins=plugins, **cfg.trainer)

# Attach NeMo experiment management to the trainer (per the recorded log this
# sets up the log directory, the TensorBoard logger and checkpointing).
exp_manager(trainer, cfg.exp_manager)

# Restore the pretrained checkpoint, passing cfg.model as the config override
# so the prompt-tuning settings applied earlier take effect.
model = MegatronGPTModel.restore_from(cfg.restore_from_path, cfg.model, trainer=trainer)

      rank_zero_deprecation(
    
      rank_zero_deprecation(
    
      rank_zero_deprecation(
    
      rank_zero_deprecation(
    
Using 16bit native Automatic Mixed Precision (AMP)
      rank_zero_deprecation(
    
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
[NeMo W 2021-12-09 08:24:49 exp_manager:558] No version folders would be created under the log folder as 'resume_if_exists' is enabled.
[NeMo W 2021-12-09 08:24:49 exp_manager:411] There was no checkpoint folder at checkpoint_dir :/prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints. Training from scratch.


[NeMo I 2021-12-09 08:24:49 exp_manager:283] Experiments will be logged at /prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt
[NeMo I 2021-12-09 08:24:49 exp_manager:648] TensorboardLogger has been set up


[NeMo W 2021-12-09 08:24:49 exp_manager:879] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 1000. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.
      rank_zero_deprecation(
    


[NeMo I 2021-12-09 08:24:52 tokenizer_utils:188] Getting Megatron tokenizer for pretrained model name: megatron-gpt-345m and custom vocab file: None
[NeMo I 2021-12-09 08:24:52 tokenizer_utils:124] Getting HuggingFace AutoTokenizer with pretrained_model_name: gpt2, vocab_file: /root/.cache/torch/megatron/megatron-gpt-345m_vocab, special_tokens_dict: {}, and use_fast: False


Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using mask_token, but it is not set yet.


[NeMo I 2021-12-09 08:24:56 megatron_gpt_model:531] Padded vocab_size: 50304, original vocab_size: 50257, dummy tokens: 47.
[NeMo I 2021-12-09 08:24:57 tokenizer_utils:188] Getting Megatron tokenizer for pretrained model name: megatron-gpt-345m and custom vocab file: None
[NeMo I 2021-12-09 08:24:57 tokenizer_utils:124] Getting HuggingFace AutoTokenizer with pretrained_model_name: gpt2, vocab_file: /root/.cache/torch/megatron/megatron-gpt-345m_vocab, special_tokens_dict: {}, and use_fast: False


Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using mask_token, but it is not set yet.


[NeMo I 2021-12-09 08:25:01 megatron_gpt_model:531] Padded vocab_size: 50304, original vocab_size: 50257, dummy tokens: 47.
[NeMo I 2021-12-09 08:25:01 save_restore_connector:149] Model MegatronGPTModel was successfully restored from /prompt-tuning/NeMo/examples/nlp/language_modeling/megatron_gpt.nemo.


In [5]:
# Show the prompt tags currently registered on the model; right after the
# restore this is an empty set.
model.get_prompt_table()

set()

In [6]:
# Register a soft prompt under the tag "NER-Yes-No". The second argument is the
# text it is initialized from (presumably via that text's token embeddings --
# TODO confirm against the NeMo implementation).
model.init_prompt_from_text("NER-Yes-No", "named entities yes or no")

In [7]:
# Check that the "NER-Yes-No" tag now appears in the prompt table.
model.get_prompt_table()

{'NER-Yes-No'}

In [8]:
# Register a second soft prompt under the tag "NER-Complete", initialized from
# the given text (same assumption about the second argument -- TODO confirm).
model.init_prompt_from_text("NER-Complete", "name chemicals entities in context")

In [9]:
# Both tags, "NER-Complete" and "NER-Yes-No", should now be listed.
model.get_prompt_table()

{'NER-Complete', 'NER-Yes-No'}

In [10]:
# Freeze the model for prompt tuning. Presumably this leaves only the
# soft-prompt weights trainable -- the following cell prints whichever
# parameters still have requires_grad=True so this can be verified.
model.prompt_tuning_freeze()

In [11]:
# Print every parameter that is still trainable (requires_grad=True), in the
# order model.parameters() yields them.
for param in (p for p in model.parameters() if p.requires_grad):
    print(param)

Parameter containing:
tensor([[-0.0025, -0.0637,  0.0431,  ..., -0.0647, -0.0375,  0.0141],
        [ 0.0106, -0.0536,  0.0301,  ..., -0.0778, -0.0132, -0.0020],
        [-0.0194, -0.0314, -0.0012,  ..., -0.0174, -0.0120, -0.0127],
        ...,
        [-0.0194, -0.0314, -0.0012,  ..., -0.0174, -0.0120, -0.0127],
        [-0.0097, -0.0083,  0.0061,  ...,  0.0364,  0.0232, -0.0137],
        [ 0.0054,  0.0301, -0.0031,  ...,  0.0367, -0.0045,  0.0023]],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([[ 0.0031,  0.0119,  0.0349,  ...,  0.0126,  0.0047, -0.0095],
        [ 0.0109,  0.0207,  0.0084,  ...,  0.0139,  0.0015, -0.0072],
        [-0.0033, -0.0169, -0.0171,  ...,  0.0173,  0.0033, -0.0068],
        ...,
        [-0.0072, -0.0287, -0.0161,  ...,  0.0002,  0.0034, -0.0015],
        [ 0.0021, -0.0283, -0.0269,  ..., -0.0048,  0.0075, -0.0037],
        [-0.0031, -0.0270, -0.0164,  ...,  0.0128,  0.0079, -0.0024]],
       device='cuda:0', requires_grad=True)


In [12]:
# Run training with the trainer built earlier. Per the recorded log this
# tokenizes the train/validation/test datasets, then trains with periodic
# validation and checkpoint saving.
trainer.fit(model)

      rank_zero_deprecation(
    
      rank_zero_deprecation(
    
initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1
I1209 08:25:01.515900 140379858880320 distributed_c10d.py:218] Added key: store_based_barrier_key:1 to store for rank: 0
I1209 08:25:01.517103 140379858880320 distributed_c10d.py:252] Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes.
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 1 processes
----------------------------------------------------------------------------------------------------

I1209 08:25:01.520228 140379858880320 distributed_c10d.py:218] Added key: store_based_barrier_key:2 to store for rank: 0
I1209 08:25:01.521262 140379858880320 distributed_c10d.py:252] Rank 0: Completed store-based barrier for key:store_based_barrier_key:2 with 1 nodes.
I1209 08:25:01.522430 140379858880320 distribu

> initializing tensor model parallel with size 1
> initializing pipeline model parallel with size 1
> initializing data parallel with size 1
[NeMo I 2021-12-09 08:25:01 nlp_overrides:118] mp_rank: 0
[NeMo I 2021-12-09 08:25:01 nlp_overrides:119] dp_rank: 0
[NeMo I 2021-12-09 08:25:01 gpt_prompt_tuning_dataset:57] Loading and tokenizing dataset ... 


17341it [00:06, 2488.03it/s]

[NeMo I 2021-12-09 08:25:08 gpt_prompt_tuning_dataset:78] Skipped 9 sentences, sequence length too long or too short
[NeMo I 2021-12-09 08:25:08 gpt_prompt_tuning_dataset:57] Loading and tokenizing dataset ... 



4141it [00:01, 2595.44it/s]

[NeMo I 2021-12-09 08:25:10 gpt_prompt_tuning_dataset:78] Skipped 2 sentences, sequence length too long or too short
[NeMo I 2021-12-09 08:25:10 gpt_prompt_tuning_dataset:57] Loading and tokenizing dataset ... 



10190it [00:03, 2793.23it/s]

[NeMo I 2021-12-09 08:25:13 gpt_prompt_tuning_dataset:78] Skipped 22 sentences, sequence length too long or too short



LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


[NeMo I 2021-12-09 08:25:13 modelPT:561] Optimizer config = FusedAdam (
    Parameter Group 0
        betas: [0.9, 0.98]
        bias_correction: True
        eps: 1e-08
        lr: 0.0002
        weight_decay: 0.01
    )
[NeMo I 2021-12-09 08:25:13 lr_scheduler:748] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7faaee41b580>" 
    will be used during training (effective maximum steps = 1000) - 
    Parameters : 
    (warmup_steps: 100
    constant_steps: 600
    min_lr: 2.0e-06
    max_steps: 1000
    )
[NeMo I 2021-12-09 08:25:13 nlp_overrides:80] Configuring DDP for model parallelism.



  | Name  | Type     | Params
-----------------------------------
0 | model | GPTModel | 125 M 
-----------------------------------
30.7 K    Trainable params
125 M     Non-trainable params
125 M     Total params
250.586   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

      rank_zero_warn(
    
    
      rank_zero_warn(
    


Training: 0it [00:00, ?it/s]

    
    
I1209 08:25:15.816919 140379858880320 distributed.py:872] Reducer buckets have been rebuilt in this iteration.


Validating: 0it [00:00, ?it/s]

Epoch 0, global step 99: val_loss reached 3.24958 (best 3.24958), saving model to "/prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=3.25-step=99-consumed_samples=396.0.ckpt" as top 10


Validating: 0it [00:00, ?it/s]

Epoch 0, global step 199: val_loss reached 6.03490 (best 3.24958), saving model to "/prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=6.03-step=199-consumed_samples=796.0.ckpt" as top 10


[NeMo I 2021-12-09 08:25:57 nlp_overrides:129] Removing checkpoint: /prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=3.25-step=99-consumed_samples=396.0-last.ckpt


Validating: 0it [00:00, ?it/s]

Epoch 0, global step 299: val_loss reached 5.96162 (best 3.24958), saving model to "/prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.96-step=299-consumed_samples=1196.0.ckpt" as top 10


[NeMo I 2021-12-09 08:26:20 nlp_overrides:129] Removing checkpoint: /prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=6.03-step=199-consumed_samples=796.0-last.ckpt


Validating: 0it [00:00, ?it/s]

Epoch 0, global step 399: val_loss reached 5.91219 (best 3.24958), saving model to "/prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.91-step=399-consumed_samples=1596.0.ckpt" as top 10


[NeMo I 2021-12-09 08:26:43 nlp_overrides:129] Removing checkpoint: /prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.96-step=299-consumed_samples=1196.0-last.ckpt


Validating: 0it [00:00, ?it/s]

Epoch 0, global step 499: val_loss reached 5.90947 (best 3.24958), saving model to "/prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.91-step=499-consumed_samples=1996.0.ckpt" as top 10


[NeMo I 2021-12-09 08:27:07 nlp_overrides:129] Removing checkpoint: /prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.91-step=399-consumed_samples=1596.0-last.ckpt


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 641: val_loss reached 5.78398 (best 3.24958), saving model to "/prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.78-step=641-consumed_samples=2564.0.ckpt" as top 10


[NeMo I 2021-12-09 08:27:37 nlp_overrides:129] Removing checkpoint: /prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.91-step=499-consumed_samples=1996.0-last.ckpt


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 741: val_loss reached 5.62877 (best 3.24958), saving model to "/prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.63-step=741-consumed_samples=2964.0.ckpt" as top 10


[NeMo I 2021-12-09 08:27:58 nlp_overrides:129] Removing checkpoint: /prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.78-step=641-consumed_samples=2564.0-last.ckpt


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 841: val_loss reached 5.43558 (best 3.24958), saving model to "/prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.44-step=841-consumed_samples=3364.0.ckpt" as top 10


[NeMo I 2021-12-09 08:28:22 nlp_overrides:129] Removing checkpoint: /prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.63-step=741-consumed_samples=2964.0-last.ckpt


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 941: val_loss reached 5.20438 (best 3.24958), saving model to "/prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.20-step=941-consumed_samples=3764.0.ckpt" as top 10


[NeMo I 2021-12-09 08:28:45 nlp_overrides:129] Removing checkpoint: /prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.44-step=841-consumed_samples=3364.0-last.ckpt


Saving latest checkpoint...


[NeMo I 2021-12-09 08:28:57 nlp_overrides:129] Removing checkpoint: /prompt-tuning/NeMo/examples/nlp/language_modeling/nemo_experiments/megatron_gpt/checkpoints/megatron_gpt--val_loss=5.20-step=941-consumed_samples=3764.0-last.ckpt
