<a href="https://colab.research.google.com/github/Kira1108/huggingface-examples/blob/main/PosTagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from IPython.display import clear_output

!pip install transformers
!pip install datasets
!pip install evaluate
!pip install git+https://github.com/Kira1108/huggingface_utils.git
clear_output()

In [2]:
import nltk
import json
from nltk.corpus import brown
from dataclasses import dataclass
from datasets import load_dataset
from huggingface_utils.labels import LabelAligner

**Extract data**

In [3]:
nltk.download('brown')
nltk.download('universal_tagset')
corpus = brown.tagged_sents(tagset = 'universal')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


**Transform Data**

In [4]:
@dataclass
class TokenSequenceWriter:

    fname:str
    fitted:bool = False

    def fit_labels(self, corpus):
        
        if self.fitted:
            return 
        print("Retrieving label information......")
        label_names = list({label for sent in corpus for word, label in sent})
        self.label_names = sorted(label_names)
        self.id2label = {i:l for i,l in enumerate(label_names)}
        self.label2id = {v:k for k,v in self.id2label.items()}
        self.fitted = True

    
    def format_corpus(self,corpus):
        self.fit_labels(corpus)

        data = []
        label_names = set()
        label = set
        for sent in corpus:
            inputs = []
            targets = []
            for in_,tok_ in sent:
                inputs.append(in_)
                targets.append(self.label2id[tok_])
            data.append({
                "tokens":inputs,
                "labels":targets
            })
        return data

    def _to_json(self, data):
        with open(self.fname + ".json",'w') as f:
            for line in data:
                f.write(json.dumps(line) + "\n")

        with open(self.fname + "_labelname.json",'w') as f:
            f.write(json.dumps({'label_names':self.label_names}))

    def write(self, corpus):
        print(f"Writing data to {self.fname}.......")
        self._to_json(self.format_corpus(corpus))
        print(f"Data Write to {self.fname} success.")

In [5]:
datawriter = TokenSequenceWriter("brown")
datawriter.write(corpus)

Writing data to brown.......
Retrieving label information......
Data Write to brown success.


**Load Data**

In [22]:
# data = load_dataset("json",data_files = 'brown.json')
# label_names = json.load(open('brown_labelname.json','r'))['label_names']
# id2label = {i:l for i,l in enumerate(label_names)}
# label2id = {v:k for k,v in id2label.items()}
# data = data['train'].train_test_split(test_size = 0.1)

data = load_dataset("json",data_files = 'brown.json')
data = data['train'].shuffle().select(range(20000))
label_names = json.load(open('brown_labelname.json','r'))['label_names']
id2label = {i:l for i,l in enumerate(label_names)}
label2id = {v:k for k,v in id2label.items()}
data = data.train_test_split(test_size = 0.1)




  0%|          | 0/1 [00:00<?, ?it/s]

**Tokenize Data**

You should create a batch tokenization function like:

```python
from transformers import AutoTokenizer

checkpoint = 'distilbert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

aligner = LabelAligner(label_names = label_names,use_iob = False)

def tokenize_fn(batch):

    tokenized_inputs = tokenizer(
        batch['tokens'], 
        truncation = True, 
        is_split_into_words = True)
    
    labels = batch['labels']

    aligned_labels = [
        aligner(labels = l, word_ids = tokenized_inputs.word_ids(i)) 
        for i,l in enumerate(labels)]
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs
```



Or encapsulate the functionalities somewhere else

In [23]:
from huggingface_utils.tokenize import TokenClassifyTokenizeFn
from transformers import AutoTokenizer
# choose a checkpoint
checkpoint = 'distilbert-base-cased'

# load the default tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# load a label aligner
aligner = LabelAligner(label_names = label_names,use_iob = False)

# create batch tokenize function
tokenize_fn = TokenClassifyTokenizeFn(tokenizer = tokenizer, label_aligner = aligner)

# test out things
# print("Test tokenize function:")
# tokenize_fn(data['train'][:1])

tokenized_dataset = data.map(
    tokenize_fn, 
    batched = True,
    remove_columns = ['tokens'])

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 28996
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde99

  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [24]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 18000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [25]:
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import pipeline
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

**Collator and Metric**

In [26]:
data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer)


**Model and trainer creation**

In [27]:
def flatten(list_of_lists):
    """Simple flatten function"""
    return [val for sublist in list_of_lists for val in sublist]

def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels

    # compute labels from logits
    preds = np.argmax(logits, axis = -1)

    # mask with -100
    fill_with_null = np.where(np.array(labels) == -100, -100, preds)

    # filter out -100
    pred_labels = [[l for l in label if l!= -100] for label in fill_with_null]
    labels = [[l for l in label if l!= -100] for label in labels]
    pred_labels = flatten(pred_labels)
    labels = flatten(labels)

    preds = np.array(preds)
    return {
        "f1":f1_score(labels, pred_labels, average = 'macro'),
        "accuracy":accuracy_score(labels, pred_labels)
    }

import numpy as np
logits = np.random.random((10,3))
labels = np.random.choice([0,1,2], size = (1,10))
print(np.argmax(logits, axis = -1))
print(labels[0])
compute_metrics((logits, labels))

[0 2 2 2 1 1 0 1 2 2]
[2 2 1 0 1 1 1 1 1 1]


{'f1': 0.29523809523809524, 'accuracy': 0.4}

In [28]:
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint, 
    id2label = id2label, 
    label2id = label2id
)

training_args  = TrainingArguments(
    "distilbert_finetuned-ner",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate =2e-5,
    num_train_epochs = 3,
    weight_decay = 0.01
)

trainer = Trainer(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['test'],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": ".",
    "1": "ADJ",
    "2": "ADP",
    "3": "ADV",
    "4": "CONJ",
    "5": "DET",
    "6": "NOUN",
    "7": "NUM",
    "8": "PRON",
    "9": "PRT",
    "10": "VERB",
    "11": "X"
  },
  "initializer_range": 0.02,
  "label2id": {
    ".": 0,
    "ADJ": 1,
    "ADP": 2,
    "ADV": 3,
    "CONJ": 4,
    "DET": 5,
    "NOUN": 6,
    "NUM": 7,
    "PRON": 8,
    "PRT": 9,
    "VERB": 10,
    "X": 11
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos

**Model training**

In [29]:
trainer.train()

***** Running training *****
  Num examples = 18000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6750
  Number of trainable parameters = 65200140
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0657,0.05458,0.953562,0.984542
2,0.0365,0.048835,0.961846,0.986461
3,0.0198,0.049845,0.96655,0.98699


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
Saving model checkpoint to distilbert_finetuned-ner/checkpoint-2250
Configuration saved in distilbert_finetuned-ner/checkpoint-2250/config.json
Model weights saved in distilbert_finetuned-ner/checkpoint-2250/pytorch_model.bin
tokenizer config file saved in distilbert_finetuned-ner/checkpoint-2250/tokenizer_config.json
Special tokens file saved in distilbert_finetuned-ner/checkpoint-2250/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
Saving model checkpoint to distilbert_finetuned-ner/checkpoint-4500
Configuration saved in distilbert_finetuned-ner/checkpoint-4500/config.json
Model weights saved in distilbert_finetuned-ner/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in distilbert_finetuned-ner/checkpoint-4500/tokenizer_config.json
Special tokens file saved in distilbert_finetuned-ner/checkpoint-4500/special_tokens_map.json
***** Running Evaluation *****
 

TrainOutput(global_step=6750, training_loss=0.06167837792855722, metrics={'train_runtime': 223.0145, 'train_samples_per_second': 242.137, 'train_steps_per_second': 30.267, 'total_flos': 696606986751552.0, 'train_loss': 0.06167837792855722, 'epoch': 3.0})

**Model persistence**

In [30]:
trainer.save_model("my-saved-model")

Saving model checkpoint to my-saved-model
Configuration saved in my-saved-model/config.json
Model weights saved in my-saved-model/pytorch_model.bin
tokenizer config file saved in my-saved-model/tokenizer_config.json
Special tokens file saved in my-saved-model/special_tokens_map.json


**Use Model**

In [31]:
ner = pipeline("token-classification", model = 'my-saved-model', aggregation_strategy = 'simple', device = 0)
ner("Bill and John went to Washonton DC yesterday.")

loading configuration file my-saved-model/config.json
Model config DistilBertConfig {
  "_name_or_path": "my-saved-model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": ".",
    "1": "ADJ",
    "2": "ADP",
    "3": "ADV",
    "4": "CONJ",
    "5": "DET",
    "6": "NOUN",
    "7": "NUM",
    "8": "PRON",
    "9": "PRT",
    "10": "VERB",
    "11": "X"
  },
  "initializer_range": 0.02,
  "label2id": {
    ".": 0,
    "ADJ": 1,
    "ADP": 2,
    "ADV": 3,
    "CONJ": 4,
    "DET": 5,
    "NOUN": 6,
    "NUM": 7,
    "PRON": 8,
    "PRT": 9,
    "VERB": 10,
    "X": 11
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float

[{'entity_group': 'NOUN',
  'score': 0.9990263,
  'word': 'Bill',
  'start': 0,
  'end': 4},
 {'entity_group': 'CONJ',
  'score': 0.999728,
  'word': 'and',
  'start': 5,
  'end': 8},
 {'entity_group': 'NOUN',
  'score': 0.99969137,
  'word': 'John',
  'start': 9,
  'end': 13},
 {'entity_group': 'X',
  'score': 0.9998166,
  'word': 'went',
  'start': 14,
  'end': 18},
 {'entity_group': 'PRON',
  'score': 0.9997248,
  'word': 'to',
  'start': 19,
  'end': 21},
 {'entity_group': 'NOUN',
  'score': 0.9993143,
  'word': 'Washonton DC yesterday',
  'start': 22,
  'end': 44},
 {'entity_group': 'ADV',
  'score': 0.99992955,
  'word': '.',
  'start': 44,
  'end': 45}]