# Download BioBERT, Prepare Datasets for NER and RE Model Training

In [1]:
import os
import urllib
import azureml
import tempfile
import azureml.core
from azureml.data.datapath import DataPath
from azureml.core import Model, Workspace, Environment, Run, Dataset, Datastore, ScriptRunConfig, Experiment, ComputeTarget
from azureml.data import OutputFileDatasetConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.resource_configuration import ResourceConfiguration

# Report the installed Azure ML SDK version for reproducibility.
sdk_version = azureml.core.VERSION
print("You are currently using version", sdk_version, "of the Azure ML SDK")

You are currently using version 1.40.0 of the Azure ML SDK


In [2]:
# Load the workspace from its saved configuration.
ws = Workspace.from_config()

# Print one identifying property of the workspace per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

Workspace name: udacity-capstone-ws
Azure region: northeurope
Subscription id: fbe09221-d2fa-4355-8174-808a6c0b6925
Resource group: udacity-capstone


In [3]:
# Handle to the workspace datastore named "workspaceblobstore".
blob_store_name = "workspaceblobstore"
data_store = Datastore(ws, blob_store_name)
print(data_store)

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-644b04ba-cd6a-402f-aa72-d1730d303ea8",
  "account_name": "udacitycapston2606602571",
  "protocol": "https",
  "endpoint": "core.windows.net"
}


In [4]:
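# Optional: attach to an existing compute cluster. Left disabled because no visible cell in this notebook references `cpu_cluster`.
# cpu_cluster = ComputeTarget(workspace=ws, name="StandardDS11v2")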

### Download And Register Pretrained BioBERT Model
In this step, BioBERT is used to tokenize the data.

In [1]:
from transformers import AutoModel, AutoTokenizer 

In [2]:
# Load the tokenizer and weights of the large BioBERT checkpoint.
large_checkpoint = "dmis-lab/biobert-large-cased-v1.1"
tokenizer_large = AutoTokenizer.from_pretrained(large_checkpoint)
model_large = AutoModel.from_pretrained(large_checkpoint)

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/467k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

In [3]:
# Load the tokenizer and weights of the base BioBERT checkpoint.
base_checkpoint = "dmis-lab/biobert-base-cased-v1.1"
tokenizer_base = AutoTokenizer.from_pretrained(base_checkpoint)
model_base = AutoModel.from_pretrained(base_checkpoint)

Downloading:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [4]:
# Write both tokenizers to sibling directories of the notebook folder.
large_tokenizer_dir = "../biobert-large-cased-v1.1"
base_tokenizer_dir = "../biobert-base-cased-v1.1"

tokenizer_large.save_pretrained(large_tokenizer_dir)
# Kept as the final expression so the cell still displays the saved file paths.
tokenizer_base.save_pretrained(base_tokenizer_dir)

('../biobert-base-cased-v1.1/tokenizer_config.json',
 '../biobert-base-cased-v1.1/special_tokens_map.json',
 '../biobert-base-cased-v1.1/vocab.txt',
 '../biobert-base-cased-v1.1/added_tokens.json')

In [5]:
# Write the model weights into the same directories the tokenizers were saved to.
checkpoints_to_save = (
    (model_large, "../biobert-large-cased-v1.1"),
    (model_base, "../biobert-base-cased-v1.1"),
)
for checkpoint, output_dir in checkpoints_to_save:
    checkpoint.save_pretrained(output_dir)

In [10]:
# Register the locally saved large BioBERT checkpoint in the workspace model registry.
model = Model.register(workspace=ws,
                       model_name='biobert-large-cased-v1.1',     # Name of the registered model in your workspace.
                       model_path='../biobert-large-cased-v1.1',  # Local directory (tokenizer + weights) to upload and register.
                       model_framework=Model.Framework.PYTORCH,   # Framework used to create the model.
                       # Resource request recorded with the model. The downloaded weights are
                       # about 1.46 GB, so the previous 0.5 GB request could not hold the model
                       # in memory; 4 GB leaves headroom for the runtime.
                       resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=4),
                       description='BioBERT, a biomedical language representation model designed for biomedical text mining tasks such as biomedical named entity recognition, relation extraction, question answering, etc.',
                       tags={'area': 'biobert-large', 'type': 'transformers'})

print('Name:', model.name)
print('Version:', model.version)

Registering model biobert-large-cased-v1.1
Name: biobert-large-cased-v1.1
Version: 1


In [11]:
# Register the locally saved base BioBERT checkpoint under its own model name.
base_model_name = 'biobert-base-cased-v1.1'
base_model_dir = '../biobert-base-cased-v1.1'  # Local directory to upload and register.
base_model_description = 'BioBERT, a biomedical language representation model designed for biomedical text mining tasks such as biomedical named entity recognition, relation extraction, question answering, etc.'

model = Model.register(
    workspace=ws,
    model_name=base_model_name,
    model_path=base_model_dir,
    model_framework=Model.Framework.PYTORCH,
    resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=0.5),
    description=base_model_description,
    tags={'area': 'biobert-base', 'type': 'transformers'},
)

# Echo the name and version of the registered model.
for label, value in (('Name:', model.name), ('Version:', model.version)):
    print(label, value)

Registering model biobert-base-cased-v1.1
Name: biobert-base-cased-v1.1
Version: 1


### Load Data

In [5]:
key = "n2c2_2018"

# Look the dataset up once and fail with a clear message when it is missing.
# Previously a missing key skipped the assignment, so the download line raised
# a NameError (or silently reused a stale `dataset` left in the kernel).
registered_datasets = ws.datasets
if key not in registered_datasets:
    raise KeyError(f"Dataset '{key}' is not registered in workspace '{ws.name}'")
dataset = registered_datasets[key]

# Download a local copy into the parent directory of the notebook folder.
dataset.download(target_path='../', overwrite=False)

# Also mount the dataset at a fresh temporary directory (`mounted_path`).
# NOTE(review): no visible cell stops this mount; call mount_context.stop() when done.
mounted_path = tempfile.mkdtemp()
mount_context = dataset.mount(mounted_path)

mount_context.start()

In [6]:
# Sanity check: list the top-level contents of the downloaded n2c2 dataset directory.
!ls ../n2c2_2018

test  train


In [8]:
key = "ade_corpus"

if key in ws.datasets.keys(): 
        ade_data = ws.datasets[key]

# download the dataset 
ade_data.download(target_path='../', overwrite=False) 

# mount dataset to the temp directory at `mounted_path`
mounted_path = tempfile.mkdtemp()
mount_context = ade_data.mount(mounted_path)

mount_context.start()
!ls ../ade_corpus

ade_split_0_test.json	ade_split_3_train.json	ade_split_7_test.json
ade_split_0_train.json	ade_split_4_test.json	ade_split_7_train.json
ade_split_1_test.json	ade_split_4_train.json	ade_split_8_test.json
ade_split_1_train.json	ade_split_5_test.json	ade_split_8_train.json
ade_split_2_test.json	ade_split_5_train.json	ade_split_9_test.json
ade_split_2_train.json	ade_split_6_test.json	ade_split_9_train.json
ade_split_3_test.json	ade_split_6_train.json


### Generate Data For NER Task

In [None]:
## Create and activate conda env, install requirements and run in terminal:
# !python generate_data.py \
#     --task ner \
#     --input_dir ../n2c2_2018/ \
#     --ade_dir ../ade_corpus/ \
#     --target_dir dataset_ner/ \
#     --max_seq_len 512 \
#     --dev_split 0.1 \
#     --tokenizer biobert-base \
#     --ext txt \
#     --sep " "

### Generate Data For RE Task

In [None]:
# !python generate_data.py \
#     --task re \
#     --input_dir ../n2c2_2018/ \
#     --ade_dir ../ade_corpus/ \
#     --target_dir dataset_re/ \
#     --max_seq_len 512 \
#     --dev_split 0.1 \
#     --tokenizer biobert-base \
#     --ext tsv \
#     --sep tab 

### Register Processed Datasets

In [18]:
# Push the generated NER splits from the local folder to the workspace blob datastore.
ner_local_dir = "./dataset_ner"
ner_remote_dir = "dataset_ner"

data_store = Datastore(ws, "workspaceblobstore")
data_store.upload(src_dir=ner_local_dir, target_path=ner_remote_dir)


Uploading an estimated of 9 files
Uploading ./dataset_ner/devel.txt
Uploaded ./dataset_ner/devel.txt, 1 files out of an estimated total of 9
Uploading ./dataset_ner/labels.txt
Uploaded ./dataset_ner/labels.txt, 2 files out of an estimated total of 9
Uploading ./dataset_ner/test.txt
Uploaded ./dataset_ner/test.txt, 3 files out of an estimated total of 9
Uploading ./dataset_ner/devel.pkl
Uploaded ./dataset_ner/devel.pkl, 4 files out of an estimated total of 9
Uploading ./dataset_ner/train.txt
Uploaded ./dataset_ner/train.txt, 5 files out of an estimated total of 9
Uploading ./dataset_ner/train_dev.txt
Uploaded ./dataset_ner/train_dev.txt, 6 files out of an estimated total of 9
Uploading ./dataset_ner/test.pkl
Uploaded ./dataset_ner/test.pkl, 7 files out of an estimated total of 9
Uploading ./dataset_ner/train.pkl
Uploaded ./dataset_ner/train.pkl, 8 files out of an estimated total of 9
Uploading ./dataset_ner/train_dev.pkl
Uploaded ./dataset_ner/train_dev.pkl, 9 files out of an estimated 

In [16]:
# Push the generated RE splits from the local folder to the workspace blob datastore.
re_local_dir = "./dataset_re"
re_remote_dir = "dataset_re"

data_store = Datastore(ws, "workspaceblobstore")
data_store.upload(src_dir=re_local_dir, target_path=re_remote_dir)


Uploading an estimated of 10 files
Target already exists. Skipping upload for dataset_re/dev.tsv
Target already exists. Skipping upload for dataset_re/dev_rel.pkl
Target already exists. Skipping upload for dataset_re/test.pkl
Target already exists. Skipping upload for dataset_re/test.tsv
Target already exists. Skipping upload for dataset_re/test_labels.tsv
Target already exists. Skipping upload for dataset_re/test_labels_rel.pkl
Target already exists. Skipping upload for dataset_re/test_rel.pkl
Target already exists. Skipping upload for dataset_re/train.pkl
Target already exists. Skipping upload for dataset_re/train.tsv
Target already exists. Skipping upload for dataset_re/train_rel.pkl
Uploaded 0 files


In [19]:
# One single-element DataPath list per uploaded folder on the datastore.
ner_data_store_path, re_data_store_path = (
    [DataPath(data_store, remote_folder)]
    for remote_folder in ("dataset_ner/", "dataset_re/")
)

In [20]:
#Create file datasets
ner_file_dataset = Dataset.File.from_files(path=ner_data_store_path)
re_file_dataset = Dataset.File.from_files(path=re_data_store_path)

In [21]:
#Register Datasets

# Register the NER file dataset in the workspace under a fixed name.
ner_registration = {
    "workspace": ws,
    "name": "ehr_ade_labelled_dataset_ner",
    "description": "Tokenized EHR and ADE labelled dataset splits for NER task",
    "tags": {"file_type": "directory"},
}
register_ner = ner_file_dataset.register(**ner_registration)

In [22]:
# Register the RE file dataset in the workspace under a fixed name.
re_registration = {
    "workspace": ws,
    "name": "ehr_ade_labelled_dataset_re",
    "description": "Tokenized EHR and ADE labelled dataset splits for RE task",
    "tags": {"file_type": "directory"},
}
register_re = re_file_dataset.register(**re_registration)