In [1]:
!pip install stanza transformers torch



In [2]:
import stanza
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch

In [3]:
sample_text = """
Doctor: What symptoms are you experiencing?
Patient: I've been having headaches and a fever for the last three days.
Doctor: Have you taken any medication for it?
Patient: Only some over-the-counter ibuprofen.
"""

In [4]:
stanza.download('en')
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,ner')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [5]:
doc_stanza = nlp_stanza(sample_text)
print("Entities (Stanza):")
for ent in doc_stanza.ents:
    print(f"{ent.text} [{ent.type}]")

Entities (Stanza):
the last three days [DATE]


In [6]:
tokenizer_bio = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model_bio = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-v1.1")
ner_pipeline = pipeline("ner", model=model_bio, tokenizer=tokenizer_bio)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


In [7]:
print("Medical Entities (BioBERT):")
medical_entities = ner_pipeline(sample_text)
for entity in medical_entities:
    print(f"{entity['word']} [{entity['entity']}]")

Medical Entities (BioBERT):
Doctor [LABEL_0]
: [LABEL_0]
What [LABEL_0]
symptoms [LABEL_0]
are [LABEL_1]
you [LABEL_0]
experiencing [LABEL_0]
? [LABEL_0]
Pat [LABEL_0]
##ient [LABEL_0]
: [LABEL_0]
I [LABEL_0]
' [LABEL_0]
ve [LABEL_1]
been [LABEL_1]
having [LABEL_0]
headache [LABEL_0]
##s [LABEL_0]
and [LABEL_0]
a [LABEL_0]
fever [LABEL_0]
for [LABEL_1]
the [LABEL_1]
last [LABEL_1]
three [LABEL_1]
days [LABEL_1]
. [LABEL_0]
Doctor [LABEL_0]
: [LABEL_0]
Have [LABEL_1]
you [LABEL_0]
taken [LABEL_1]
any [LABEL_1]
medication [LABEL_0]
for [LABEL_1]
it [LABEL_0]
? [LABEL_0]
Pat [LABEL_0]
##ient [LABEL_0]
: [LABEL_0]
Only [LABEL_0]
some [LABEL_1]
over [LABEL_1]
- [LABEL_1]
the [LABEL_1]
- [LABEL_1]
counter [LABEL_1]
i [LABEL_0]
##bu [LABEL_0]
##p [LABEL_0]
##ro [LABEL_1]
##fen [LABEL_1]
. [LABEL_0]


In [8]:
bio_model_path = '/content/biobert_model'
model_bio.save_pretrained(bio_model_path)
tokenizer_bio.save_pretrained(bio_model_path)

('/content/biobert_model/tokenizer_config.json',
 '/content/biobert_model/special_tokens_map.json',
 '/content/biobert_model/vocab.txt',
 '/content/biobert_model/added_tokens.json',
 '/content/biobert_model/tokenizer.json')