Updated notebook to fix batch configuration and precision bugs (#4447)
* Updated notebook to fix batch configuration and precision bugs

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Deleted cell outputs

Signed-off-by: Virginia Adams <vadams@nvidia.com>

* Set datasets back to full dataset

Signed-off-by: Virginia Adams <vadams@nvidia.com>

Co-authored-by: Eric Harper <complex451@gmail.com>
vadam5 and ericharper committed Jun 24, 2022
1 parent 4b8fab8 commit 34101ec
Showing 1 changed file with 69 additions and 17 deletions.
86 changes: 69 additions & 17 deletions tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb
@@ -7,7 +7,7 @@
"metadata": {},
"outputs": [],
"source": [
"BRANCH=\"main\""
"BRANCH=\"r1.10.0\""
]
},
{
@@ -848,7 +848,7 @@
"os.environ[\"RANK\"] = '0'\n",
"os.environ[\"WORLD_SIZE\"] = '1'\n",
"\n",
"plugins = [NLPDDPPlugin(find_unused_parameters=False), TorchElasticEnvironment()]\n",
"plugins = [NLPDDPPlugin(find_unused_parameters=False, no_ddp_communication_hook=True), TorchElasticEnvironment()]\n",
"trainer = pl.Trainer(plugins=plugins, **config.trainer)\n",
"\n",
"print(\"Trainer config - \\n\")\n",
@@ -901,7 +901,7 @@
"source": [
"# Set some of the learning parameters\n",
"config.model.optim.lr = 1e-4\n",
"config.model.batch_size = 16"
"config.model.precision = config.trainer.precision"
]
},
{
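The hunk above swaps a hard-coded batch size for a precision sync. The point of the fix, as a hedged sketch: if `config.model.precision` drifts from the trainer's precision, the model's weights and the trainer's autocast setup end up in mismatched dtypes. Something like:

```python
# Keep the model's precision tied to the trainer's (the precision bug
# fixed in this commit was a mismatch between the two).
config.model.optim.lr = 1e-4
config.model.precision = config.trainer.precision  # e.g. 16 for AMP, 32 otherwise

# A defensive check one could add (hypothetical, not part of the commit):
assert config.model.precision == config.trainer.precision, "precision mismatch"
```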
@@ -1009,7 +1009,9 @@
"cell_type": "code",
"execution_count": null,
"id": "74a5a358",
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"response = model.generate(inputs=test_examples, length_params=None)\n",
@@ -1032,15 +1034,27 @@
"We need to update:\n",
"\n",
"1. `name`\n",
"3. `model.restore_path`\n",
"5. `model.existing_tasks`\n",
"6. `model.new_tasks`\n",
"7. `model.data.train_ds`\n",
"8. `model.data.validation_ds`\n",
"2. `model.restore_path`\n",
"3. `model.existing_tasks`\n",
"4. `model.new_tasks`\n",
"5. `model.virtual_prompt_style`\n",
"6. `model.data.train_ds`\n",
"7. `model.data.validation_ds`\n",
"\n",
"Remember that we already set `task_templates` for SQuAD when we were defining the task template for the other two tasks. We would add it here if we had not already set it above."
]
},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "b5ec279d",
+"metadata": {},
+"outputs": [],
+"source": [
+"# Change the experiment name\n",
+"config.name = 'squad_p_tuning'"
+]
+},
{
"cell_type": "markdown",
"id": "6adb09a3",
@@ -1052,13 +1066,10 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b5ec279d",
"id": "2e196967",
"metadata": {},
"outputs": [],
"source": [
"# Change the experiment name\n",
"config.name = 'squad_p_tuning'\n",
"\n",
"# Change restore path from null to the p-tuned model we just finished training\n",
"config.model.restore_path = \"multitask_p_tuned_gpt.nemo\"\n",
"\n",
@@ -1067,6 +1078,25 @@
"config.model.new_tasks = [\"squad\"]"
]
},
+{
+"cell_type": "markdown",
+"id": "4dc088ec",
+"metadata": {},
+"source": [
+"After the first round of p-tuning finishes, ``virtual_prompt_style`` is automatically set to ``inference`` so the prompt learning model is ready to use as soon as training completes. For the second round of p-tuning, we need to set ``virtual_prompt_style`` back to ``p-tuning``."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "c49128a1",
+"metadata": {},
+"outputs": [],
+"source": [
+"# Reset virtual prompt style to \"p-tuning\" from \"inference\"\n",
+"config.model.virtual_prompt_style = \"p-tuning\""
+]
+},
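If one wanted the reset to be safe to re-run, a small hypothetical guard (not part of the commit) could wrap the assignment:

```python
# Hypothetical: only flip the style back if the model is currently in
# inference mode, so re-running the cell is harmless.
if config.model.virtual_prompt_style == "inference":
    config.model.virtual_prompt_style = "p-tuning"
print(config.model.virtual_prompt_style)
```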
{
"cell_type": "code",
"execution_count": null,
@@ -1102,18 +1132,40 @@
"# Limiting the number of validation batches for sake of time\n",
"config.trainer.limit_val_batches = 100\n",
"\n",
"# Adjust learning rate for the task\n",
"config.model.optim.lr = 5e-4\n",
"config.model.optim.sched.min_lr = 1e-5\n",
"config.model.batch_size = 4\n",
"\n",
"# Reset the trainer\n",
"plugins = [NLPDDPPlugin(find_unused_parameters=False), TorchElasticEnvironment()]\n",
"plugins = [NLPDDPPlugin(find_unused_parameters=False, no_ddp_communication_hook=True), TorchElasticEnvironment()]\n",
"trainer = pl.Trainer(plugins=plugins, **config.trainer)\n",
"\n",
"print(\"Trainer config - \\n\")\n",
"print(OmegaConf.to_yaml(config.trainer))"
]
},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "0ac21b0c",
+"metadata": {},
+"outputs": [],
+"source": [
+"from apex.transformer import parallel_state\n",
+"from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator\n",
+"from nemo.utils import AppState\n",
+"\n",
+"app_state = AppState()\n",
+"\n",
+"# Need to reconfigure micro batch calculator with apex for new p-tuning session\n",
+"_reconfigure_microbatch_calculator(\n",
+"    rank=app_state.global_rank,\n",
+"    rampup_batch_size=None,\n",
+"    global_batch_size=config.model.global_batch_size,\n",
+"    micro_batch_size=config.model.micro_batch_size,\n",
+"    data_parallel_size=parallel_state.get_data_parallel_world_size(),\n",
+")"
+]
+},
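The calculator added above enforces the usual Megatron batch relation, global_batch_size = micro_batch_size × num_micro_batches × data_parallel_size. As a rough sanity check under the single-GPU assumption this notebook makes (sketch, not part of the commit):

```python
# global_batch_size must be a multiple of micro_batch_size * data_parallel_size;
# the quotient is the number of gradient-accumulation micro-batches.
dp_size = parallel_state.get_data_parallel_world_size()  # 1 in this notebook
assert config.model.global_batch_size % (config.model.micro_batch_size * dp_size) == 0
num_micro_batches = config.model.global_batch_size // (config.model.micro_batch_size * dp_size)
print(f"{num_micro_batches} micro-batches per global batch")
```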
{
"cell_type": "code",
"execution_count": null,
@@ -1145,7 +1197,7 @@
"execution_count": null,
"id": "1b3d95f1",
"metadata": {
"scrolled": true
"scrolled": false
},
"outputs": [],
"source": [
