<a href="https://colab.research.google.com/github/Luensmann/Bachelorarbeit/blob/main/MultiCorpu_all_Label.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets evaluate
!pip install wandb
!pip install seqeval

In [None]:
from datasets import load_dataset, Features, Sequence, Value, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, EarlyStoppingCallback, IntervalStrategy, Trainer, DataCollatorForTokenClassification
import evaluate
import json
import wandb
import os
import numpy as np


In [None]:
# Log in to the Hugging Face Hub interactively.
# The training cell sets `push_to_hub=True`, so credentials must be available before it runs.
from huggingface_hub import notebook_login
notebook_login()

ModuleNotFoundError: ignored

In [None]:
# labels
# Tag inventory: index 0 is the outside tag "O"; every entity type then
# contributes a consecutive (B-<type>, I-<type>) pair, in the order listed.
labels = ['O'] + [
    f'{prefix}-{entity}'
    for entity in (
        'Gene_Protein',
        'DNAMutation',
        'ProteinMutation',
        'SNP',
        'DNA_modification',
        'RNA',
        'RNA_Mutation',
        'locus',
        'disease',
        'body-part',
        'mutation',
        'Physiology',
        'cohort-patient',
        'size',
        'gender',
        'age',
        'Concepts_Ideas',
        'Disorder',
        'Phenomena',
        'ethnicity',
    )
    for prefix in ('B', 'I')
]


In [None]:
# Load Dataset
# Fetch the corpus and, in the same expression, re-type the "ner_tags" column
# as a sequence of ClassLabel built from `labels`, so the tag names are
# attached to the column's feature definition.
dataset = load_dataset("Brizape/multiCorp_tokenized_split_0404_dev").cast_column(
    "ner_tags",
    Sequence(feature=ClassLabel(names=labels)),
)
dataset

In [None]:
# Copy of the dataset without the "id", "tokens" and "ner_tags" columns.
# This is the version whose "train" and "validation" splits are passed to the Trainer.
# NOTE(review): presumably the remaining columns are the model inputs and aligned labels — confirm against the dataset card.
tokenized_dataset = dataset.remove_columns(["id", "tokens", "ner_tags"])
tokenized_dataset

In [None]:
# Show the schema of the train split (including the ClassLabel-typed "ner_tags" column).
dataset['train'].features

In [None]:
# Tokenizer
# Checkpoint identifier of the pretrained PubMedBERT tokenizer.
tokenizer_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
# `model_max_length` is the keyword the tokenizer actually reads; the previous
# misspelled keywords (`trunction`, `model_max_len`) were not recognised, so the
# 512-token limit was never set. Truncation itself is an argument of the
# tokenizer *call*, not of `from_pretrained`, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, model_max_length=512)

In [None]:
# padding
# Batch collator for token classification, built on the tokenizer above.
# It is passed to the Trainer as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# metric
# Load the "seqeval" metric through `evaluate`; `compute_metrics` calls `seqeval.compute(...)` on it.
seqeval = evaluate.load("seqeval")

In [None]:
# Tag names in id order, read back from the ClassLabel feature of "ner_tags".
# `compute_metrics` uses this list to turn integer ids into tag strings.
label_list = dataset["train"].features["ner_tags"].feature.names
label_list

In [None]:
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

import numpy as np

def compute_metrics(p):
    """Compute entity-level precision, recall, F1 and token accuracy with seqeval.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores and is reduced with ``argmax`` over axis 2;
            ``labels`` holds the gold tag ids, where ``-100`` marks positions
            that must be excluded from scoring.

    Returns:
        dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, taken from seqeval's ``overall_*`` results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions with a real gold label (-100 marks ignored positions).
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    # Name the tagging scheme explicitly: without it, strict mode auto-detects
    # the scheme from the references and raises if the prefixes found there do
    # not form a recognisable set.
    results = seqeval.compute(
        predictions=true_predictions,
        references=true_labels,
        mode='strict',
        scheme='IOB2',
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Tag <-> id lookup tables handed to the model configuration.
# id2label: position in the list below is the class id (0 = 'O', then one
# B-/I- pair per entity type).
id2label = dict(enumerate([
    'O',
    'B-Gene_Protein', 'I-Gene_Protein',
    'B-DNAMutation', 'I-DNAMutation',
    'B-ProteinMutation', 'I-ProteinMutation',
    'B-SNP', 'I-SNP',
    'B-DNA_modification', 'I-DNA_modification',
    'B-RNA', 'I-RNA',
    'B-RNA_Mutation', 'I-RNA_Mutation',
    'B-locus', 'I-locus',
    'B-disease', 'I-disease',
    'B-body-part', 'I-body-part',
    'B-mutation', 'I-mutation',
    'B-Physiology', 'I-Physiology',
    'B-cohort-patient', 'I-cohort-patient',
    'B-size', 'I-size',
    'B-gender', 'I-gender',
    'B-age', 'I-age',
    'B-Concepts_Ideas', 'I-Concepts_Ideas',
    'B-Disorder', 'I-Disorder',
    'B-Phenomena', 'I-Phenomena',
    'B-ethnicity', 'I-ethnicity',
]))
# label2id: the inverse mapping, inserted in alphabetical order of the tag names.
label2id = {
    tag: class_id
    for class_id, tag in sorted(id2label.items(), key=lambda pair: pair[1])
}

In [None]:
# Train one model per learning rate.
# Every iteration builds a fresh model and Trainer, logs to its own W&B run,
# and saves (and, because push_to_hub=True, uploads) under its own run name.
ln_rate = [2e-5, 5e-5, 10e-5]  # 10e-5 is 1e-4

for learnrate in ln_rate:

  # The run name doubles as output directory / Hub repository name and W&B run name.
  runname = "MultiCorp_all_label_" + str(learnrate)

  # Load the pretrained checkpoint anew so no fine-tuned weights carry over
  # from the previous learning rate. The checkpoint is the same one the
  # tokenizer was loaded from, and the head size follows the label map.
  model = AutoModelForTokenClassification.from_pretrained(
      tokenizer_name,
      num_labels=len(id2label),
      id2label=id2label,
      label2id=label2id,
  )

  training_args = TrainingArguments(
      report_to = 'wandb',                     # enable logging to W&B
      output_dir = runname,    # output directory/ name for huggingface hub
      learning_rate=learnrate,
      per_device_train_batch_size=16,
      per_device_eval_batch_size=16,
      weight_decay=0.01,
      evaluation_strategy = 'steps',          # evaluate every `eval_steps` steps (not once per epoch)
      max_steps = 2000,                      # hard upper bound on training steps
      logging_steps = 25,                    # we will log every 25 steps
      eval_steps = 25,                      # we will perform evaluation every 25 steps
      save_steps = 25,                      # checkpoint at the same interval as evaluation
      load_best_model_at_end=True,
      metric_for_best_model = 'eval_loss',
      greater_is_better = False,             # lower eval_loss is better
      push_to_hub=True,
      run_name = runname             # name of the W&B run
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_dataset["train"],
      eval_dataset=tokenized_dataset["validation"],
      tokenizer=tokenizer,
      data_collator=data_collator,
      compute_metrics=compute_metrics,
      # Stop once eval_loss has failed to improve on 2 consecutive evaluations.
      callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
  )

  trainer.train()
  # Close this run so the next learning rate gets a separate W&B run.
  wandb.finish()

  trainer.save_model()

In [None]:
# NOTE(review): this cell repeats the last two statements of the training loop.
# It acts only on the `trainer` left over from the final loop iteration and
# fails on a fresh kernel if the training cell has not been run — consider removing it.
wandb.finish()
trainer.save_model()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▁▇▇███
eval/f1,▁▂▇██▇█
eval/loss,█▃▂▁▁▂▁
eval/precision,▁▆█▇█▇█
eval/recall,▁▁▆█▇▇▇
eval/runtime,▁▃▄▆▇█▆
eval/samples_per_second,█▆▅▃▂▁▃
eval/steps_per_second,█▆▅▃▂▁▃
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███

0,1
eval/accuracy,0.97014
eval/f1,0.74466
eval/loss,0.09525
eval/precision,0.73898
eval/recall,0.75043
eval/runtime,3.2869
eval/samples_per_second,30.728
eval/steps_per_second,2.13
train/epoch,6.73
train/global_step,175.0


Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
To https://huggingface.co/Brizape/SETH_5e-05_0404_ES6_strict_tok
   b4087f7..42d5266  main -> main

   b4087f7..42d5266  main -> main

