# Requirements

In [2]:
!pip install transformers
!pip install datasets
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m117.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

To fine-tune a transformer model, the subsequent steps should be followed:

*   Tokenize data
*   Train the model
*   Evaluate the model







In [17]:
!pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate>=0.20.2 (from transformers[torch])
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3


# Step1: Tokenization

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [2]:
def tokenize_function(example):
   tokenizer_output= tokenizer(example['tokens'],truncation=True,is_split_into_words=True)
   return{**tokenizer_output,'labels':example['ner_tags']}


In [28]:
# Load data
dataset=load_dataset("conll2003")



  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
# Prepare the tokenizer
checkpoint="dslim/bert-base-NER"
tokenizer=AutoTokenizer.from_pretrained(checkpoint)

In [5]:
dataset=dataset.map(tokenize_function,batched=True)
dataset= dataset.remove_columns(['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'])



Map:   0%|          | 0/3250 [00:00<?, ? examples/s]



In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [6]:
# Split the data
train_dataset=dataset['train']
val_dataset=dataset['validation']
test_dataset=dataset['test']

# Step2: FineTuning

In [7]:
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments
import evaluate
import numpy as np
from transformers import Trainer
from transformers import DataCollatorWithPadding
import torch

In [19]:
#Dynamic padding
def my_collate(batch):
  input_ids=[example['input_ids'] for example in batch]
  attention_masks=[example['attention_mask'] for example in batch]
  labels=[example['labels'] for example in batch]
  max_length=max(len(x) for x in input_ids)
  #pad all inputs to max length
  padded_input_ids=[lst+[0]*(max_length-len(lst)) for lst in input_ids]
  padded_attention_masks=[lst+[0]*(max_length-len(lst)) for lst in attention_masks]
  padded_labels=[lst+[0]*(max_length-len(lst)) for lst in labels]

  return {'input_ids':torch.tensor(padded_input_ids),'attention_mask':torch.tensor(padded_attention_masks),'labels':torch.tensor(padded_labels)}

In [9]:
#Trainer API
trainig_args=TrainingArguments('bert_trainer')
model=AutoModelForTokenClassification.from_pretrained(checkpoint)

Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [10]:
def compute_metrics(eval_preds):
  metric=evaluate.load("accuracy")
  logits,labels=eval_preds
  predictions=np.argmax(logits,axis=-1)
  return metric.compute(predictions=predictions,references=labels)

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [20]:
trainer=Trainer(
    model,
    trainig_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=my_collate,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [21]:
trainer.train()

Step,Training Loss
500,0.1503
1000,0.0809
1500,0.0698
2000,0.0574
2500,0.0501
3000,0.043
3500,0.0373
4000,0.0296
4500,0.0291
5000,0.0252


TrainOutput(global_step=5268, training_loss=0.05556457949662263, metrics={'train_runtime': 559.4261, 'train_samples_per_second': 75.297, 'train_steps_per_second': 9.417, 'total_flos': 923954298531210.0, 'train_loss': 0.05556457949662263, 'epoch': 3.0})