<a href="https://colab.research.google.com/github/Kira1108/huggingface-examples/blob/main/PosTagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from IPython.display import clear_output

!pip install transformers
!pip install datasets
!pip install evaluate
!pip install seqeval
!pip install git+https://github.com/Kira1108/huggingface_utils.git
clear_output()

In [2]:
import nltk
import json
from nltk.corpus import brown
from dataclasses import dataclass
from datasets import load_dataset
from huggingface_utils.labels import LabelAligner

**Extract Data**

In [3]:
# Fetch the Brown corpus and the universal tagset resource.
nltk.download("brown")
nltk.download("universal_tagset")

# Tagged sentences, requested with the 'universal' tagset; later cells
# unpack every sentence as a sequence of (word, tag) pairs.
corpus = brown.tagged_sents(tagset="universal")

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


**Transform Data**

In [4]:
@dataclass
class TokenSequenceWriter:
    """Serialise a tagged corpus to JSON files for token classification.

    Two files are produced by :meth:`write`:

    * ``<fname>.json`` - one JSON object per line with the keys ``"tokens"``
      (list of words) and ``"labels"`` (list of integer label ids).
    * ``<fname>_labelname.json`` - ``{"label_names": [...]}`` holding the
      sorted label names; the position of a name in that list is its id.

    Attributes set by :meth:`fit_labels`: ``label_names``, ``id2label`` and
    ``label2id``.
    """

    # Path prefix (without extension) used for both output files.
    fname: str
    # True once the label vocabulary has been collected.
    fitted: bool = False

    def fit_labels(self, corpus):
        """Collect the label vocabulary of ``corpus`` (no-op when already fitted).

        Args:
            corpus: iterable of sentences, each an iterable of
                ``(word, label)`` pairs.
        """
        if self.fitted:
            return
        print("Retrieving label information......")
        self.label_names = sorted({label for sent in corpus for _, label in sent})
        # The ids MUST follow the sorted order: `label_names` is what gets
        # written to disk, and readers rebuild the mapping by enumerating it.
        self.id2label = dict(enumerate(self.label_names))
        self.label2id = {label: idx for idx, label in self.id2label.items()}
        self.fitted = True

    def format_corpus(self, corpus):
        """Turn ``corpus`` into a list of ``{"tokens": ..., "labels": ...}`` dicts.

        The corpus is iterated twice when the writer is not fitted yet (once
        to collect labels, once to encode), so it must be re-iterable.

        Raises:
            KeyError: if a label is not part of the fitted vocabulary.
        """
        self.fit_labels(corpus)

        data = []
        for sent in corpus:
            tokens = []
            label_ids = []
            for word, tag in sent:
                tokens.append(word)
                label_ids.append(self.label2id[tag])
            data.append({
                "tokens": tokens,
                "labels": label_ids
            })
        return data

    def _to_json(self, data):
        """Write the encoded sentences (JSON lines) and the label-name file."""
        with open(self.fname + ".json", 'w', encoding="utf-8") as f:
            for line in data:
                f.write(json.dumps(line) + "\n")

        with open(self.fname + "_labelname.json", 'w', encoding="utf-8") as f:
            f.write(json.dumps({'label_names': self.label_names}))

    def write(self, corpus):
        """Encode ``corpus`` and write both output files, printing progress."""
        print(f"Writing data to {self.fname}.......")
        self._to_json(self.format_corpus(corpus))
        print(f"Data Write to {self.fname} success.")

In [5]:
# Write the tagged corpus to brown.json / brown_labelname.json.
datawriter = TokenSequenceWriter(fname="brown")
datawriter.write(corpus)

Writing data to brown.......
Retrieving label information......
Data Write to brown success.


**Load Data**

In [6]:
# Load the JSON-lines file written above; the rows are read from its 'train' split.
raw_dataset = load_dataset("json", data_files="brown.json")

# Read the label names through a context manager so the file handle is closed.
with open("brown_labelname.json", "r") as f:
    label_names = json.load(f)["label_names"]

# A label's id is its position in `label_names`.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

# Hold out 10% for evaluation; the fixed seed makes the split reproducible.
data = raw_dataset["train"].train_test_split(test_size=0.1, seed=42)
data



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-86fd7c1b4fa17a08/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-86fd7c1b4fa17a08/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 51606
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 5734
    })
})

**Tokenize Data**

Create a batch tokenization function that tokenizes the pre-split words and aligns the word-level labels with the resulting tokens, for example:

```python
from transformers import AutoTokenizer

# Pretrained checkpoint whose tokenizer is loaded below.
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Aligner built from the label names, with IOB handling switched off.
aligner = LabelAligner(label_names=label_names, use_iob=False)

def tokenize_fn(batch):
    """Tokenize a batch of pre-split sentences and attach aligned labels.

    Reads ``batch['tokens']`` and ``batch['labels']``; returns the tokenizer
    output with an added ``'labels'`` entry produced by ``aligner``.
    """
    encoded = tokenizer(
        batch['tokens'],
        is_split_into_words=True,
        truncation=True,
    )

    # One aligner call per sentence, using that sentence's word ids.
    encoded['labels'] = [
        aligner(labels=word_labels, word_ids=encoded.word_ids(idx))
        for idx, word_labels in enumerate(batch['labels'])
    ]
    return encoded
```



Alternatively, encapsulate this functionality elsewhere — for example, with the `TokenClassifyTokenizeFn` helper from `huggingface_utils`:

In [7]:
from huggingface_utils.tokenize import TokenClassifyTokenizeFn
from transformers import AutoTokenizer
# Pretrained checkpoint to fine-tune.
checkpoint = 'distilbert-base-cased'

# Tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Label aligner built from the label names, with IOB handling switched off.
aligner = LabelAligner(label_names=label_names, use_iob=False)

# Callable that `map` applies to each batch below.
tokenize_fn = TokenClassifyTokenizeFn(label_aligner=aligner, tokenizer=tokenizer)

# Sanity check, if needed: tokenize_fn(data['train'][:1])

# Tokenize both splits batch-wise and drop the raw 'tokens' column.
tokenized_dataset = data.map(
    tokenize_fn,
    remove_columns=['tokens'],
    batched=True,
)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

  0%|          | 0/52 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [8]:
# Display the splits and their columns after tokenization.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 51606
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 5734
    })
})

In [9]:
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import pipeline
from huggingface_utils.metrics.ner import NerMetric

In [10]:
# Collator passed to the Trainer below, built around the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Metric callable passed to the Trainer as `compute_metrics`.
compute_metrics = NerMetric(label_names=label_names)

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [11]:
# Token-classification model loaded from the checkpoint, with both label maps attached.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    label2id=label2id,
    id2label=id2label,
)

# Evaluate and save a checkpoint at the end of each of the 3 epochs.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    "distilbert_finetuned-ner",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Train on the 'train' split and evaluate on the held-out 'test' split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics,
)

Downloading:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()

***** Running training *****
  Num examples = 51606
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 19353
  Number of trainable parameters = 65200140
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
