diff --git a/README.rst b/README.rst index 06ea863600e7..c2334f5e2336 100644 --- a/README.rst +++ b/README.rst @@ -224,7 +224,7 @@ Install it manually if not using the NVIDIA PyTorch container. git clone https://github.com/ericharper/apex.git cd apex - git checkout nm_v1.11.0 + git checkout nm_v1.13.0 pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./ Transformer Engine diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index a40e5a2ae35f..0604c0287d05 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -106,7 +106,6 @@ model: apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages - use_unified_checkpoint: True # Use model parallel independent checkpointing ## Activation Checkpointing # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. diff --git a/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py b/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py index da620dd434ec..554178e7e95d 100644 --- a/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py +++ b/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py @@ -13,6 +13,7 @@ # limitations under the License. import torch.multiprocessing as mp +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer diff --git a/examples/nlp/language_modeling/megatron_t5_prompt_learning.py b/examples/nlp/language_modeling/megatron_t5_prompt_learning.py index e35c023d7d26..efcf01288a7c 100644 --- a/examples/nlp/language_modeling/megatron_t5_prompt_learning.py +++ b/examples/nlp/language_modeling/megatron_t5_prompt_learning.py @@ -13,6 +13,7 @@ # limitations under the License. 
import torch.multiprocessing as mp +from lightning_lite.plugins.environments import TorchElasticEnvironment from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer diff --git a/tests/collections/tts/test_tts_exportables.py b/tests/collections/tts/test_tts_exportables.py index d7684de732e5..c7083b45f6d7 100644 --- a/tests/collections/tts/test_tts_exportables.py +++ b/tests/collections/tts/test_tts_exportables.py @@ -15,7 +15,6 @@ import tempfile import pytest -import torch from omegaconf import OmegaConf from nemo.collections.tts.models import FastPitchModel, HifiGanModel, RadTTSModel @@ -81,5 +80,4 @@ def test_RadTTSModel_export_to_torchscript(self, radtts_model): model = radtts_model.cuda() with tempfile.TemporaryDirectory() as tmpdir: filename = os.path.join(tmpdir, 'rad.ts') - with torch.cuda.amp.autocast(enabled=True): - model.export(output=filename, verbose=True, check_trace=True) + model.export(output=filename, verbose=True, check_trace=True) diff --git a/tutorials/nlp/Non_English_Downstream_Tasks_(NER).ipynb b/tutorials/nlp/Non_English_Downstream_Tasks_(NER).ipynb deleted file mode 100644 index bfa56e5a2567..000000000000 --- a/tutorials/nlp/Non_English_Downstream_Tasks_(NER).ipynb +++ /dev/null @@ -1,899 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "OETcTQlcguCm" - }, - "outputs": [], - "source": [ - "BRANCH = 'main'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "o_0K1lsW1dj9" - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pC0slAc0h9zN", - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# If you're not using Colab, you might need to upgrade jupyter notebook to avoid the following error:\n", - "# 'ImportError: IProgress not found. Please update jupyter and ipywidgets.'\n", - "\n", - "! pip install ipywidgets\n", - "! 
jupyter nbextension enable --py widgetsnbextension\n", - "\n", - "# Please restart the kernel after running this cell" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dzqD2WDFOIN-" - }, - "outputs": [], - "source": [ - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "import os\n", - "import wget \n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf\n", - "\n", - "import zipfile\n", - "import random\n", - "from glob import glob" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "daYw_Xll2ZR9" - }, - "source": [ - "# Tutorial Overview\n", - "In this tutorial, we will show how to use a pre-trained BERT language model on a non-English downstream task. Here we use the Persian language and the named entity recognition (NER) task as an example. Note that most of the other downstream tasks supported in NeMo should work similarly for other languages. \n", - "\n", - "# Task Description\n", - "NER is the task of detecting and classifying key information (entities) in text.\n", - "For example, in a sentence: `Mary lives in Santa Clara and works at NVIDIA`, we should detect that `Mary` is a person, `Santa Clara` is a location and `NVIDIA` is a company.\n", - "\n", - "In this tutorial we will be using the [BERT language model](https://arxiv.org/abs/1810.04805).\n", - "\n", - "To read more about other topics and downstream tasks that can be done in NeMo, see the [NeMo tutorial page](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/).\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZnuziSwJ1yEB" - }, - "source": [ - "# Dataset\n", - "\n", - "In this tutorial we are going to use the [Persian Arman dataset for our NER task](https://github.com/HaniehP/PersianNER).\n", - "\n", - "Arman is a hand-annotated Persian corpus for the NER task with 250,015 tokens and 7,682 sentences. Using [IOB encoding](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)), tokens are labeled with one of the following named entities or with O. \n", - "\n", - "* event = event\n", - "* fac = facility\n", - "* loc = location\n", - "* org = organization\n", - "* pers = person\n", - "* pro = product\n", - "\n", - "Each of these has a label starting with **B**, which indicates that it is the first token of the named entity, and with **I** for the others. \n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qzcZ3nb_-SVT" - }, - "source": [ - "# NeMo Token Classification Data Format\n", - "\n", - "[TokenClassification Model](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/models/token_classification/token_classification_model.py) in NeMo supports NER and other token-level classification tasks, as long as the data follows the format specified below. \n", - "\n", - "The Token Classification Model requires the data to be split into 2 files: \n", - "* text.txt \n", - "* labels.txt. 
\n", - "\n", - "Each line of the **text.txt** file contains text sequences, where words are separated with spaces, i.e.: \n", - "[WORD] [SPACE] [WORD] [SPACE] [WORD].\n", - "\n", - "The **labels.txt** file contains the corresponding labels for each word in text.txt; the labels are separated with spaces, i.e.:\n", - "[LABEL] [SPACE] [LABEL] [SPACE] [LABEL].\n", - "\n", - "Example of a text.txt file:\n", - "```\n", - "دبیر شورای عالی انقلاب فرهنگی از گنجانده شدن 5 زبان خارجی جدید در برنامه درسی مدارس خبر داد.\n", - "```\n", - "Corresponding labels.txt file:\n", - "```\n", - "O B_ORG I_ORG I_ORG I_ORG O O O O O O O O O O O O O O \n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SL58EWkd2ZVb" - }, - "source": [ - "## Download and preprocess the data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_z2tCEIXZa90" - }, - "source": [ - "You can download the Arman dataset by cloning the following GitHub repository: https://github.com/HaniehP/PersianNER.\n", - "\n", - "After downloading the data, you will see a few files and folders inside a directory named PersianNER. Take ArmanPersoNERCorpus.zip and upload it to `DATA_DIR` (if running in a docker or locally) or use **Files** in Google Colab to upload the files.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "n8HZrDmr12_-" - }, - "outputs": [], - "source": [ - "# path to the folder with the ArmanPersoNERCorpus.zip file (if running locally or in a docker)\n", - "DATA_DIR = \"PATH_TO_FOLDER_WITH_ZIP.ZIP_FILE\"\n", - "WORK_DIR = \"WORK_DIR\"\n", - "\n", - "# adding an empty subfolder for data (otherwise it can interfere with existing folders in DATA_DIR)\n", - "subfolder = f\"{DATA_DIR}/non_eng_NER\"\n", - "\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "os.makedirs(DATA_DIR, exist_ok=True)\n", - "os.makedirs(subfolder, exist_ok=True)\n", - "\n", - "! cp $DATA_DIR/ArmanPersoNERCorpus.zip $subfolder/.\n", - "DATA_DIR = f\"{DATA_DIR}/non_eng_NER\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k1TmF5rrdPMj" - }, - "outputs": [], - "source": [ - "if 'google.colab' in str(get_ipython()):\n", - " from google.colab import files\n", - " uploaded = files.upload() " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HTUKJOownkrF" - }, - "outputs": [], - "source": [ - "if 'google.colab' in str(get_ipython()):\n", - " ! mv ArmanPersoNERCorpus.zip $DATA_DIR/." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NhUzIeF0Yg0l" - }, - "source": [ - "Let's extract the files from the zip archive. This generates three pairs of test and train files, which overlap and are intended to be used in turn as train and test sets. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Y01BdjPRW-7B" - }, - "outputs": [], - "source": [ - "! cd $DATA_DIR && unzip \"ArmanPersoNERCorpus.zip\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qaDgL-sQaX2e" - }, - "source": [ - "Next, we will put all the data into a single file and remove any repeated sentences. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "B0T4CzJvbBJ4" - }, - "outputs": [], - "source": [ - "file_all = os.path.join(DATA_DIR, \"all_data.txt\")\n", - "with open(file_all, \"w\") as f1:\n", - " for filename in glob(f\"{DATA_DIR}/test_fold*.txt\") + glob(f\"{DATA_DIR}/train_fold*.txt\"):\n", - " with open(filename, \"r\", encoding=\"ISO-8859-1\") as f2:\n", - " for line in f2:\n", - " f1.write(line)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VzVuET8HESFB" - }, - "source": [ - "Now, you need to convert this data into a NeMo-compatible format before starting the training process. For this purpose, you can run [examples/nlp/token_classification/data/import_from_iob_format.py](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/token_classification/data/import_from_iob_format.py) on your train and dev files, as follows:\n", - "\n", - "\n", - "\n", - "\n", - "```\n", - "python examples/nlp/token_classification/data/import_from_iob_format.py --data_file PATH_TO_IOB_FORMAT_DATAFILE, e.g., \"DATA_DIR/all_data.txt\"\n", - "```\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ord_6KlkeNl8" - }, - "outputs": [], - "source": [ - "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/nlp/token_classification/data/import_from_iob_format.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "IfSUkxffeSpL" - }, - "outputs": [], - "source": [ - "!python import_from_iob_format.py --data_file $DATA_DIR/all_data.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Aj0rXbYXbivW" - }, - "source": [ - "Now we process the data to remove any repeated sentences and then split it into train and dev sets. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "CgvnTlqzbq5-" - }, - "outputs": [], - "source": [ - "sent_dict = dict()\n", - "line_removed = dict()\n", - "line_counter = 0\n", - "with open(DATA_DIR + \"/text_all_not_repeated.txt\", \"w\") as f1:\n", - " with open(DATA_DIR + \"/text_all_data.txt\", \"r\") as f2:\n", - " for line in f2:\n", - " line_counter += 1\n", - " if line not in sent_dict:\n", - " sent_dict[line] = 1\n", - " f1.write(line)\n", - " else:\n", - " line_removed[line_counter] = 1\n", - "# labels:\n", - "line_counter = 0\n", - "with open(DATA_DIR + \"/labels_all_not_repeated.txt\", \"w\") as f1:\n", - " with open(DATA_DIR + \"/labels_all_data.txt\", \"r\") as f2:\n", - " for line in f2:\n", - " line_counter += 1\n", - " if line_counter not in line_removed:\n", - " f1.write(line)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0cO3crs_gXjt" - }, - "source": [ - "After preprocessing the data and removing repeated sentences, there will be 7,668 valid sentences in total. We will use 85% of it as train and 15% as dev. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7oHQYsMMbugP" - }, - "outputs": [], - "source": [ - "total_data = 7668\n", - "train_share = 0.85\n", - "used_lines_train = dict()\n", - "flag = 1\n", - "count = 0\n", - "while flag:\n", - " idx = random.randint(1, total_data)\n", - " if idx not in used_lines_train:\n", - " used_lines_train[idx] = 1\n", - " count += 1\n", - " if (count/total_data > train_share):\n", - " flag = 0\n", - "\n", - "line_counter = 0\n", - "with open(DATA_DIR + \"/text_train.txt\", \"w\") as f1:\n", - " with open(DATA_DIR + \"/text_dev.txt\", \"w\") as f2:\n", - " with open(DATA_DIR + \"/text_all_not_repeated.txt\", \"r\") as f3:\n", - " for line in f3:\n", - " line_counter += 1\n", - " if (line_counter in used_lines_train):\n", - " f1.write(line)\n", - " else:\n", - " f2.write(line)\n", - "\n", - "line_counter = 0\n", - "with open(DATA_DIR + \"/labels_train.txt\", \"w\") as f1:\n", - " with open(DATA_DIR + \"/labels_dev.txt\", \"w\") as f2:\n", - " with open(DATA_DIR + \"/labels_all_not_repeated.txt\", \"r\") as f3:\n", - " for line in f3:\n", - " line_counter += 1\n", - " if (line_counter in used_lines_train):\n", - " f1.write(line)\n", - " else:\n", - " f2.write(line)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1Q-GWNwDbzKl" - }, - "source": [ - "Finally, we remove the files that are no longer needed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "II20ustub5BF" - }, - "outputs": [], - "source": [ - "print(\"Removed files:\")\n", - "for filename in os.listdir(DATA_DIR):\n", - " if (filename == \"text_dev.txt\" or filename == \"text_train.txt\" or filename == \"labels_dev.txt\" or filename == \"labels_train.txt\"):\n", - " continue\n", - " print(filename)\n", - " os.remove(DATA_DIR + \"/\" + filename)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "U8Ty5_S7Ye8h" - }, - "source": [ - "Now, the data folder should contain these 4 files:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L8vsyh3JZH26" - }, - "source": [ - "\n", - "\n", - "* labels_dev.txt\n", - "* labels_train.txt\n", - "* text_dev.txt\n", - "* text_train.txt\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qB0oLE4R9EhJ" - }, - "outputs": [], - "source": [ - "! ls -l $DATA_DIR" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6UDPgadLN6SG" - }, - "outputs": [], - "source": [ - "# let's take a look at the data \n", - "print('Text:')\n", - "! head -n 5 {DATA_DIR}/text_train.txt\n", - "\n", - "print('\\nLabels:')\n", - "! head -n 5 {DATA_DIR}/labels_train.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_whKCxfTMo6Y" - }, - "source": [ - "# Model configuration\n", - "\n", - "Our Named Entity Recognition model consists of the pretrained [BERT](https://arxiv.org/pdf/1810.04805.pdf) model followed by a Token Classification layer.\n", - "\n", - "The model is defined in a config file which declares multiple important sections. 
They are:\n", - "- **model**: All arguments that are related to the Model - language model, token classifier, optimizer and schedulers, datasets and any other related information\n", - "\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "T1gA8PsJ13MJ" - }, - "outputs": [], - "source": [ - "MODEL_CONFIG = \"token_classification_config.yaml\"\n", - "# download the model's configuration file \n", - "config_dir = WORK_DIR + '/configs/'\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + MODEL_CONFIG):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/token_classification/conf/' + MODEL_CONFIG, config_dir)\n", - "else:\n", - " print('Config file already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mX3KmWMvSUQw" - }, - "outputs": [], - "source": [ - "# this line will print the entire config of the model\n", - "config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZCgWzNBkaQLZ" - }, - "source": [ - "# Fine-tuning the model using the Arman dataset\n", - "\n", - "Let's select a [`bert-base-multilingual-uncased`](https://huggingface.co/bert-base-multilingual-uncased) BERT model and fine-tune it on the Arman dataset.\n", - "\n", - "## Setting up Data within the config\n", - "\n", - "Among other things, the config file contains dictionaries called dataset, train_ds and validation_ds. These are the configurations used to set up the Dataset and DataLoaders for the corresponding data splits.\n", - "\n", - "We assume that both training and evaluation files are in the same directory and use the default names mentioned during the data download step. 
\n", - "So, to start model training, we simply need to specify `model.dataset.data_dir`, as we do below.\n", - "\n", - "Also notice that some config lines, including `model.dataset.data_dir`, have `???` in place of paths; this means that the user is required to specify values for these fields.\n", - "\n", - "Let us now add the data directory path to the config.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LQHCJN-ZaoLp" - }, - "outputs": [], - "source": [ - "# in this tutorial train and dev datasets are located in the same folder, so it is enough to add the path of the data directory to the config\n", - "config.model.dataset.data_dir = DATA_DIR\n", - "\n", - "# if you want to use the full dataset, set NUM_SAMPLES to -1\n", - "NUM_SAMPLES = 1000\n", - "config.model.train_ds.num_samples = NUM_SAMPLES\n", - "config.model.validation_ds.num_samples = NUM_SAMPLES\n", - "\n", - "# for demonstration purposes we're training for only a few epochs\n", - "config.trainer.max_epochs = 5\n", - "print(OmegaConf.to_yaml(config.model))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nB96-3sTc3yk" - }, - "source": [ - "## Building the PyTorch Lightning Trainer\n", - "\n", - "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem.\n", - "\n", - "Let's first instantiate a Trainer object:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1tG4FzZ4Ui60" - }, - "outputs": [], - "source": [ - "print(\"Trainer config - \\n\")\n", - "print(OmegaConf.to_yaml(config.trainer))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "knF6QeQQdMrH" - }, - "outputs": [], - "source": [ - "# let's modify some trainer configs\n", - "# check if a GPU is available and use it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "config.trainer.precision = 16 if torch.cuda.is_available() else 32\n", - "\n", - "# for mixed precision training, uncomment the line below (precision should be set to 16 and amp_level to O1):\n", - "# config.trainer.amp_level = O1\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = None\n", - "\n", - "# set the max number of steps to reduce training time for demonstration purposes of this tutorial\n", - "config.trainer.max_steps = 32\n", - "\n", - "config.exp_manager.exp_dir = WORK_DIR\n", - "trainer = pl.Trainer(**config.trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8IlEMdVxdr6p" - }, - "source": [ - "## Setting up a NeMo Experiment\n", - "\n", - "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "exp_manager(trainer, config.get(\"exp_manager\", None))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8uztqGAmdrYt" - }, - "outputs": [], - "source": [ - "exp_dir = config.exp_manager.exp_dir\n", - "\n", - "# the exp_dir provides a path to the current experiment for easy access\n", - "exp_dir = str(exp_dir)\n", - "exp_dir" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8tjLhUvL_o7_" - }, - "source": [ - "Before initializing the model, we might want to modify some of the model configs. 
For example, we might want to modify the pretrained BERT model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Xeuc2i7Y_nP5" - }, - "outputs": [], - "source": [ - "# get the list of supported BERT-like models; for the complete list of HuggingFace models, see https://huggingface.co/models\n", - "print(nemo_nlp.modules.get_pretrained_lm_models_list(include_external=False))\n", - "\n", - "# specify the BERT-like model you want to use\n", - "PRETRAINED_BERT_MODEL = \"bert-base-multilingual-uncased\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fzNZNAVRjDD-" - }, - "source": [ - "Now, we are ready to initialize our model. During the model initialization call, the dataset and data loaders will be prepared for training and evaluation.\n", - "Also, the pretrained BERT model will be downloaded; note that this can take up to a few minutes depending on the size of the chosen BERT model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NgsGLydWo-6-" - }, - "outputs": [], - "source": [ - "model = nemo_nlp.models.TokenClassificationModel(cfg=config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kQ592Tx4pzyB" - }, - "source": [ - "## Monitoring training progress\n", - "Optionally, you can create a TensorBoard visualization to monitor training progress." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mTJr16_pp0aS" - }, - "outputs": [], - "source": [ - "try:\n", - " from google import colab\n", - " COLAB_ENV = True\n", - "except (ImportError, ModuleNotFoundError):\n", - " COLAB_ENV = False\n", - "\n", - "# Load the TensorBoard notebook extension\n", - "if COLAB_ENV:\n", - " %load_ext tensorboard\n", - " %tensorboard --logdir {exp_dir}\n", - "else:\n", - " print(\"To use tensorboard, please use this notebook in a Google Colab environment.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Fj1pdEdD0Vm3" - }, - "source": [ - "See how the model performs before fine-tuning:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wo1oVGIT0aBZ" - }, - "outputs": [], - "source": [ - "# define the list of queries for inference\n", - "queries = [\n", - " 'حمید طاهایی افزود : برای اجرای این طرحها 0 میلیارد و 0 میلیون ریال اعتبار هزینه شده است . ',\n", - " 'دکتر اصغری دبیر چهارمین همایش انجمن زمین‌شناسی ایران در این زمینه گفت : از مجموع چهار صد مقاله رسیده به دبیرخانه همایش ، يك صد و هشتاد مقاله ظرف مدت دو روز در هشت سالن همایش برگزار شد . 
'\n", - "]\n", - "results = model.add_predictions(queries)\n", - "\n", - "for query, result in zip(queries, results):\n", - " print()\n", - " print(f'Query : {query}')\n", - " print(f'Result: {result.strip()}\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kyElt0Es-aSk" - }, - "outputs": [], - "source": [ - "print(\"Trainer config - \\n\")\n", - "print(OmegaConf.to_yaml(config.trainer))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hUvnSpyjp0Dh" - }, - "outputs": [], - "source": [ - "# start model training\n", - "trainer.fit(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MOrR0PeJqa0j" - }, - "source": [ - "After the training is complete, a `.nemo` file that contains the model's checkpoints and all associated artifacts can be found under `nemo_experiments/token_classification_model/DATE_TIME`" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-lFo27PJ0o3W" - }, - "source": [ - "See how it improves after fine-tuning:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9fNcBnz80rLO" - }, - "outputs": [], - "source": [ - "results = model.add_predictions(queries)\n", - "\n", - "for query, result in zip(queries, results):\n", - " print()\n", - " print(f'Query : {query}')\n", - " print(f'Result: {result.strip()}\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JxBiIKMlH8yv" - }, - "source": [ - "After training for 100 epochs, with the default config and NUM_SAMPLES = -1 (i.e. all data is used), your model performance should look similar to this: \n", - "```\n", - " label precision recall f1 support\n", - " O (label_id: 0) 99.09 99.19 99.14 32867\n", - " B-event (label_id: 1) 67.74 70.00 68.85 90\n", - " B-fac (label_id: 2) 70.89 73.68 72.26 76\n", - " B-loc (label_id: 3) 87.45 82.70 85.01 497\n", - " B-org (label_id: 4) 81.88 87.06 84.39 649\n", - " B-pers (label_id: 5) 94.93 93.36 94.14 542\n", - " B-pro (label_id: 6) 79.31 70.41 74.59 98\n", - " I-event (label_id: 7) 87.38 74.72 80.55 352\n", - " I-fac (label_id: 8) 83.08 77.14 80.00 140\n", - " I-loc (label_id: 9) 77.78 73.39 75.52 124\n", - " I-org (label_id: 10) 86.51 89.93 88.18 834\n", - " I-pers (label_id: 11) 95.30 94.35 94.82 301\n", - " I-pro (label_id: 12) 82.86 86.57 84.67 67\n", - " -------------------\n", - " micro avg 97.78 97.78 97.78 36637\n", - " macro avg 84.17 82.50 83.24 36637\n", - " weighted avg 97.78 97.78 97.77 36637\n", - "```\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VZp9STMHQAp1" - }, - "source": [ - "**References**\n", - "\n", - "1. Devlin, Jacob, et al. \"BERT: Pre-training of deep bidirectional transformers for language understanding.\" arXiv preprint arXiv:1810.04805 (2018).\n", - "\n", - "2. Hanieh Poostchi, Ehsan Zare Borzeshi, Mohammad Abdous, and Massimo Piccardi, \"PersoNER: Persian Named-Entity Recognition,\" The 26th International Conference on Computational Linguistics (COLING 2016), pages 3381–3389, Osaka, Japan, 2016.\n", - "\n", - "3. Hanieh Poostchi, Ehsan Zare Borzeshi, and Massimo Piccardi, \"BiLSTM-CRF for Persian Named-Entity Recognition; ArmanPersoNERCorpus: the First Entity-Annotated Persian Dataset,\" The 11th Edition of the Language Resources and Evaluation Conference (LREC), Miyazaki, Japan, 7-12 May 2018, ISLRN 399-379-640-828-6, ISLRN 921-509-141-609-6." 
- ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Non_English_Downstream_Tasks_(NER).ipynb", - "private_outputs": true, - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 1 -}
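
Aside on the 85/15 split in the removed tutorial: the notebook draws training line indices with an unseeded `random.randint` loop, so the split differs on every run. Below is a minimal, illustrative sketch (not the tutorial's own code) of an equivalent but reproducible seeded split; it assumes the `text_all_not_repeated.txt` and `labels_all_not_repeated.txt` files produced by the deduplication step, and `DATA_DIR` is a hypothetical placeholder path.

```python
import os
import random

# Hypothetical location of the deduplicated files produced by the tutorial's preprocessing step.
DATA_DIR = "PATH_TO_DATA_DIR"

# Read parallel sentence/label files; each sentence line must have exactly one label line.
with open(os.path.join(DATA_DIR, "text_all_not_repeated.txt"), encoding="utf-8") as f:
    texts = f.readlines()
with open(os.path.join(DATA_DIR, "labels_all_not_repeated.txt"), encoding="utf-8") as f:
    labels = f.readlines()
assert len(texts) == len(labels), "text and label files must have the same number of lines"

# Shuffle sentence/label pairs together with a fixed seed so the split is reproducible.
pairs = list(zip(texts, labels))
random.seed(42)
random.shuffle(pairs)

split = int(0.85 * len(pairs))  # 85% train / 15% dev, as in the tutorial
for name, subset in (("train", pairs[:split]), ("dev", pairs[split:])):
    with open(os.path.join(DATA_DIR, f"text_{name}.txt"), "w", encoding="utf-8") as ft, \
         open(os.path.join(DATA_DIR, f"labels_{name}.txt"), "w", encoding="utf-8") as fl:
        for text, label in subset:
            ft.write(text)
            fl.write(label)
```

This writes the same four files (`text_train.txt`, `labels_train.txt`, `text_dev.txt`, `labels_dev.txt`) the token classification config expects, while keeping the split deterministic across runs.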