In [None]:
# Show the working directory; the data_*/, scripts/ and run_ner.py paths used
# further down are all relative to it.
! pwd

In [None]:
# Print the version of the CUDA compiler (nvcc) found on PATH.
! nvcc --version

In [None]:
# Show the GPU status report from the NVIDIA driver tool.
! nvidia-smi

In [None]:
import torch

# Report whether the visible CUDA device clears the FP16 threshold used in this
# notebook; without a GPU only a notice is printed and nothing else is defined.
if not torch.cuda.is_available():
    print("No GPU available")
else:
    device = torch.device("cuda")
    # A bare "cuda" device carries no index, so the capability query is made
    # with index=None (PyTorch then reports on the current device).
    capability = torch.cuda.get_device_capability(device.index)
    # Major compute capability 7 or above is treated as FP16-capable here.
    supports_fp16 = capability[0] >= 7
    print(f"GPU supports FP16: {supports_fp16}")


Install transformers

In [None]:
# !pip install git+https://github.com/huggingface/transformers

Install the other required libraries

In [None]:
# Install the libraries this notebook and its helper scripts rely on.
#
# %pip is used instead of "! pip" so the packages are installed into the
# interpreter that runs this kernel (the one that later does `import wandb`),
# not whichever `pip` happens to be first on PATH.
# Extras are quoted so the shell never interprets the square brackets as a
# glob pattern.
# TODO: pin versions once a known-good set is established.
# %pip install --upgrade transformers
%pip install "transformers[sentencepiece]"
%pip install "transformers[torch]"
# %pip install seqeval
%pip install "seqeval[gpu]"
%pip install conllu
%pip install seaborn
%pip install wandb

Log in to wandb and set parameters

In [None]:
# Log in to Weights & Biases (wandb) from the kernel.
# login() is called without arguments, so no API key is hard-coded in the
# notebook; presumably the key comes from the environment, a stored
# credential, or an interactive prompt — confirm for the target machine.
import wandb
wandb.login()

In [None]:
# %env WANDB_PROJECT=RTB-NER-Transfer-Learning
# %env WANDB_PROJECT=RTB-NER-DEBUG
# %env WANDB_TAGS =Transfer Learning,train,BERT
# %env WANDB_WATCH=all

# Settings to try when running W&B sweeps.
# %env stores everything after "=" verbatim, so the value must be unquoted:
# WANDB_CONSOLE="off" would set the variable to the five characters "off"
# (quotes included) instead of the intended value off.
# Comments stay on their own lines for the same reason: trailing text on a
# %env line becomes part of the value.
%env WANDB_CONSOLE=off
%env WANDB_DISABLE_SERVICE=true

In [None]:
# %env WANDB_PROJECT=WNUT-NER-Transfer-Learning
# %env WANDB_TAGS = ["Transfer Learning", "BERT", "train", "WNUT"]

Preprocess the data

In [None]:
# ! tr '\t' ' ' < data_10/ciat_ner_diseases-output-iob-tags-10_train.txt > data_10/train.txt.tmp
# ! tr '\t' ' ' < data_10/ciat_ner_diseases-output-iob-tags-10_test.txt > data_10/test.txt.tmp
# ! tr '\t' ' ' < data_10/ciat_ner_diseases-output-iob-tags-10_validate.txt > data_10/dev.txt.tmp

In [None]:
# ! python scripts/preprocess.py data_10/train.txt.tmp bert-large-cased 128 > data_10/train.txt
# ! python scripts/preprocess.py data_10/test.txt.tmp bert-large-cased 128 > data_10/test.txt
# ! python scripts/preprocess.py data_10/dev.txt.tmp bert-large-cased 128 > data_10/dev.txt

In [1]:
# ! tr '\t' ' ' < data_30/ciat_ner_diseases-output-iob-tags-30_train.txt > data_30/train.txt.tmp
# ! tr '\t' ' ' < data_30/ciat_ner_diseases-output-iob-tags-30_test.txt > data_30/test.txt.tmp
# ! tr '\t' ' ' < data_30/ciat_ner_diseases-output-iob-tags-30_validate.txt > data_30/dev.txt.tmp

In [2]:
# ! python scripts/preprocess.py data_30/train.txt.tmp bert-large-cased 128 > data_30/bert_large_cased/128/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp bert-large-cased 128 > data_30/bert_large_cased/128/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp bert-large-cased 128 > data_30/bert_large_cased/128/dev.txt
#
# ! python scripts/preprocess.py data_30/train.txt.tmp bert-large-cased 256 > data_30/bert_large_cased/256/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp bert-large-cased 256 > data_30/bert_large_cased/256/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp bert-large-cased 256 > data_30/bert_large_cased/256/dev.txt

In [3]:
# ! python scripts/preprocess.py data_30/train.txt.tmp bert-large-uncased 128 > data_30/bert_large_uncased/128/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp bert-large-uncased 128 > data_30/bert_large_uncased/128/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp bert-large-uncased 128 > data_30/bert_large_uncased/128/dev.txt
#
# ! python scripts/preprocess.py data_30/train.txt.tmp bert-large-uncased 256 > data_30/bert_large_uncased/256/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp bert-large-uncased 256 > data_30/bert_large_uncased/256/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp bert-large-uncased 256 > data_30/bert_large_uncased/256/dev.txt

In [4]:
# ! python scripts/preprocess.py data_30/train.txt.tmp allenai/scibert_scivocab_uncased 128 > data_30/scibert_scivocab_uncased/128/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp allenai/scibert_scivocab_uncased 128 > data_30/scibert_scivocab_uncased/128/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp allenai/scibert_scivocab_uncased 128 > data_30/scibert_scivocab_uncased/128/dev.txt

# ! python scripts/preprocess.py data_30/train.txt.tmp allenai/scibert_scivocab_uncased 256 > data_30/scibert_scivocab_uncased/256/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp allenai/scibert_scivocab_uncased 256 > data_30/scibert_scivocab_uncased/256/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp allenai/scibert_scivocab_uncased 256 > data_30/scibert_scivocab_uncased/256/dev.txt

In [5]:
# ! python scripts/preprocess.py data_30/train.txt.tmp allenai/scibert_scivocab_cased 128 > data_30/scibert_scivocab_cased/128/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp allenai/scibert_scivocab_cased 128 > data_30/scibert_scivocab_cased/128/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp allenai/scibert_scivocab_cased 128 > data_30/scibert_scivocab_cased/128/dev.txt
#
# ! python scripts/preprocess.py data_30/train.txt.tmp allenai/scibert_scivocab_cased 256 > data_30/scibert_scivocab_cased/256/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp allenai/scibert_scivocab_cased 256 > data_30/scibert_scivocab_cased/256/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp allenai/scibert_scivocab_cased 256 > data_30/scibert_scivocab_cased/256/dev.txt

In [6]:
# ! python scripts/preprocess.py data_30/train.txt.tmp microsoft/deberta-v2-xlarge 128 > data_30/deberta_v2_xlarge/128/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp microsoft/deberta-v2-xlarge 128 > data_30/deberta_v2_xlarge/128/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp microsoft/deberta-v2-xlarge 128 > data_30/deberta_v2_xlarge/128/dev.txt
#
# ! python scripts/preprocess.py data_30/train.txt.tmp microsoft/deberta-v2-xlarge 256 > data_30/deberta_v2_xlarge/256/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp microsoft/deberta-v2-xlarge 256 > data_30/deberta_v2_xlarge/256/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp microsoft/deberta-v2-xlarge 256 > data_30/deberta_v2_xlarge/256/dev.txt

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# ! python scripts/preprocess.py data_30/train.txt.tmp roberta-large 128 > data_30/roberta_large/128/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp roberta-large 128 > data_30/roberta_large/128/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp roberta-large 128 > data_30/roberta_large/128/dev.txt
#
# ! python scripts/preprocess.py data_30/train.txt.tmp roberta-large 256 > data_30/roberta_large/256/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp roberta-large 256 > data_30/roberta_large/256/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp roberta-large 256 > data_30/roberta_large/256/dev.txt

In [8]:
# ! python scripts/preprocess.py data_30/train.txt.tmp google/electra-base-discriminator 128 > data_30/electra_base/128/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp google/electra-base-discriminator 128 > data_30/electra_base/128/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp google/electra-base-discriminator 128 > data_30/electra_base/128/dev.txt
#
# ! python scripts/preprocess.py data_30/train.txt.tmp google/electra-base-discriminator 256 > data_30/electra_base/256/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp google/electra-base-discriminator 256 > data_30/electra_base/256/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp google/electra-base-discriminator 256 > data_30/electra_base/256/dev.txt

In [9]:
# ! python scripts/preprocess.py data_30/train.txt.tmp google/electra-large-discriminator 128 > data_30/electra_large/128/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp google/electra-large-discriminator 128 > data_30/electra_large/128/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp google/electra-large-discriminator 128 > data_30/electra_large/128/dev.txt
#
# ! python scripts/preprocess.py data_30/train.txt.tmp google/electra-large-discriminator 256 > data_30/electra_large/256/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp google/electra-large-discriminator 256 > data_30/electra_large/256/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp google/electra-large-discriminator 256 > data_30/electra_large/256/dev.txt

In [10]:
# ! python scripts/preprocess.py data_30/train.txt.tmp KISTI-AI/Scideberta-full 128 > data_30/sciberta_full/128/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp KISTI-AI/Scideberta-full 128 > data_30/sciberta_full/128/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp KISTI-AI/Scideberta-full 128 > data_30/sciberta_full/128/dev.txt
#
# ! python scripts/preprocess.py data_30/train.txt.tmp KISTI-AI/Scideberta-full 256 > data_30/sciberta_full/256/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp KISTI-AI/Scideberta-full 256 > data_30/sciberta_full/256/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp KISTI-AI/Scideberta-full 256 > data_30/sciberta_full/256/dev.txt

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# ! python scripts/preprocess.py data_30/train.txt.tmp microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext 128 > data_30/PubMedBert_base_uncased/128/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext 128 > data_30/PubMedBert_base_uncased/128/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext 128 > data_30/PubMedBert_base_uncased/128/dev.txt
#
# ! python scripts/preprocess.py data_30/train.txt.tmp microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext 256 > data_30/PubMedBert_base_uncased/256/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext 256 > data_30/PubMedBert_base_uncased/256/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext 256 > data_30/PubMedBert_base_uncased/256/dev.txt

In [12]:
# ! python scripts/preprocess.py data_30/train.txt.tmp dmis-lab/biobert-base-cased-v1.2 128 > data_30/biobert_base_cased/128/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp dmis-lab/biobert-base-cased-v1.2 128 > data_30/biobert_base_cased/128/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp dmis-lab/biobert-base-cased-v1.2 128 > data_30/biobert_base_cased/128/dev.txt
#
# ! python scripts/preprocess.py data_30/train.txt.tmp dmis-lab/biobert-base-cased-v1.2 256 > data_30/biobert_base_cased/256/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp dmis-lab/biobert-base-cased-v1.2 256 > data_30/biobert_base_cased/256/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp dmis-lab/biobert-base-cased-v1.2 256 > data_30/biobert_base_cased/256/dev.txt

In [13]:
# Preprocess the data_30 train/test/dev splits for microsoft/deberta-v3-large.
# Arguments to scripts/preprocess.py are: input file, model name, and 128
# (the max-length argument, going by the shell recipe kept in the data_20 cell).
# Inputs are the data_30/*.txt.tmp files written by the tab-to-space `tr` step,
# which is currently commented out, so they must already exist on disk.
#
# The output directory is created first: the shell opens the redirect target
# before the script starts, so a missing directory would make every command
# fail with "No such file or directory".
! mkdir -p data_30/deberta_v3_large/128
! python scripts/preprocess.py data_30/train.txt.tmp microsoft/deberta-v3-large 128 > data_30/deberta_v3_large/128/train.txt
! python scripts/preprocess.py data_30/test.txt.tmp microsoft/deberta-v3-large 128 > data_30/deberta_v3_large/128/test.txt
! python scripts/preprocess.py data_30/dev.txt.tmp microsoft/deberta-v3-large 128 > data_30/deberta_v3_large/128/dev.txt
#
# ! python scripts/preprocess.py data_30/train.txt.tmp microsoft/deberta-v3-large 256 > data_30/deberta_v3_large/256/train.txt
# ! python scripts/preprocess.py data_30/test.txt.tmp microsoft/deberta-v3-large 256 > data_30/deberta_v3_large/256/test.txt
# ! python scripts/preprocess.py data_30/dev.txt.tmp microsoft/deberta-v3-large 256 > data_30/deberta_v3_large/256/dev.txt

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# ! tr '\t' ' ' < data_20/ciat_ner_diseases-output-iob-tags-20_test.txt > data_20/test.txt.tmp
# ! tr '\t' ' ' < data_20/ciat_ner_diseases-output-iob-tags-20_train.txt > data_20/train.txt.tmp
# ! tr '\t' ' ' < data_20/ciat_ner_diseases-output-iob-tags-20_validate.txt > data_20/dev.txt.tmp

# do in shell
# export MAX_LENGTH=128
# export BERT_MODEL=bert-base-cased
# python3 scripts/preprocess.py data_20/train.txt.tmp $BERT_MODEL $MAX_LENGTH > data_20/train.txt
# python3 scripts/preprocess.py data_20/dev.txt.tmp $BERT_MODEL $MAX_LENGTH > data_20/dev.txt
# python3 scripts/preprocess.py data_20/test.txt.tmp $BERT_MODEL $MAX_LENGTH > data_20/test.txt

# ! cat data_20/train.txt data_20/dev.txt data_20/test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > data_20/labels.txt

In [None]:
# ! tr '\t' ' ' < data_4K/ciat_ner_diseases-output-iob-tags-4000_train.txt > data_4K/train.txt.tmp
# ! tr '\t' ' ' < data_4K/ciat_ner_diseases-output-iob-tags-4000_test.txt > data_4K/test.txt.tmp
# ! tr '\t' ' ' < data_4K/ciat_ner_diseases-output-iob-tags-4000_validate.txt > data_4K/dev.txt.tmp

In [None]:
# ! python scripts/preprocess.py data_4K/train.txt.tmp allenai/longformer-base-4096 4096 > data_4K/longformer/4096/train.txt
# ! python scripts/preprocess.py data_4K/test.txt.tmp allenai/longformer-base-4096 4096 > data_4K/longformer/4096/test.txt
# ! python scripts/preprocess.py data_4K/dev.txt.tmp allenai/longformer-base-4096 4096 > data_4K/longformer/4096/dev.txt

Run the training

In [None]:
# Training run: launch run_ner.py as a subprocess with the bert_large_cased
# `_128` JSON config as its only argument.
! python run_ner.py ./data_30/bert_large_cased/train_config_bert_large_cased_128.json

In [None]:
# ! python run_ner.py ./data_30/bert_large_cased/train_config_bert_large_cased_256.json

In [None]:
# Training run: run_ner.py with the bert_large_uncased `_128` JSON config.
! python run_ner.py ./data_30/bert_large_uncased/train_config_bert_large_uncased_128.json

In [None]:
# ! python run_ner.py ./data_30/bert_large_uncased/train_config_bert_large_uncased_256.json

In [None]:
# Training run: run_ner.py with the scibert_scivocab_uncased `_128` JSON config.
! python run_ner.py ./data_30/scibert_scivocab_uncased/train_config_scibert_scivocab_uncased_128.json

In [None]:
# ! python run_ner.py ./data_30/scibert_scivocab_uncased/train_config_scibert_scivocab_uncased_256.json

In [None]:
# Training run: run_ner.py with the scibert_scivocab_cased `_128` JSON config.
! python run_ner.py ./data_30/scibert_scivocab_cased/train_config_scibert_scivocab_cased_128.json

In [None]:
# ! python run_ner.py ./data_30/scibert_scivocab_cased/train_config_scibert_scivocab_cased_256.json

In [None]:
# Training run: run_ner.py with the deberta_v2_xlarge `_128` JSON config.
! python run_ner.py ./data_30/deberta_v2_xlarge/train_config_deberta_v2_xlarge_128.json

In [None]:
# ! python run_ner.py ./data_30/deberta_v2_xlarge/train_config_deberta_v2_xlarge_256.json

In [None]:
# Training run: run_ner.py with the roberta_large `_128` JSON config.
! python run_ner.py ./data_30/roberta_large/train_config_roberta_large_128.json

In [None]:
# ! python run_ner.py ./data_30/roberta_large/train_config_roberta_large_256.json

In [None]:
# Training run: run_ner.py with the electra_base `_128` JSON config.
! python run_ner.py ./data_30/electra_base/train_config_electra_base_128.json

In [None]:
# ! python run_ner.py ./data_30/electra_base/train_config_electra_base_256.json

In [None]:
# Training run: run_ner.py with the electra_large `_128` JSON config.
! python run_ner.py ./data_30/electra_large/train_config_electra_large_128.json

In [None]:
# ! python run_ner.py ./data_30/electra_large/train_config_electra_large_256.json

In [None]:
# Training run: run_ner.py with the sciberta_full `_128` JSON config
# (the directory the KISTI-AI/Scideberta-full preprocessing recipe writes to).
! python run_ner.py ./data_30/sciberta_full/train_config_sciberta_full_128.json

In [None]:
# ! python run_ner.py ./data_30/sciberta_full/train_config_sciberta_full_256.json

In [None]:
# Training run: run_ner.py with the PubMedBert_base_uncased `_128` JSON config.
! python run_ner.py ./data_30/PubMedBert_base_uncased/train_config_PubMedBert_base_uncased_128.json

In [None]:
# ! python run_ner.py ./data_30/PubMedBert_base_uncased/train_config_PubMedBert_base_uncased_256.json

In [None]:
# Training run: run_ner.py with the biobert `_128` JSON config.
# NOTE(review): unlike the other runs, the config filename (biobert_base) does
# not repeat the directory name (biobert_base_cased) — confirm the file exists
# under this exact name.
! python run_ner.py ./data_30/biobert_base_cased/train_config_biobert_base_128.json

In [None]:
# ! python run_ner.py ./data_30/biobert_base_cased/train_config_biobert_base_256.json

In [None]:
# Training run: run_ner.py with the deberta_v3_large `_128` JSON config.
! python run_ner.py ./data_30/deberta_v3_large/train_config_deberta_v3_large_128.json

In [None]:
# ! python run_ner.py ./data_30/deberta_v3_large/train_config_deberta_v3_large_256.json
# # ! python run_ner.py ./data_30/deberta_v3_large/train_config_deberta_v3_large_256_8b.json

In [None]:
# Training run: run_ner.py with the longformer `_4096` JSON config; this is the
# only run that reads from data_4K instead of data_30.
! python run_ner.py ./data_4K/longformer/train_config_longformer_4096.json

In [None]:
# ! python sweep.py

In [None]:
# ! runpodctl stop pod $RUNPOD_POD_ID