In [1]:
!pip3 install transformers datasets evaluate numpy torch seqeval huggingface_hub
!apt install git-lfs

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=61a32fd4f4ef83ed3c8d6a4e67f1e0c9407aaafd99c0d144b1c04e11b8f5985b
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval, evaluate
Successfully installed evaluate-0.4.3 seqeval-1.2.2
Reading p

In [None]:
!git config --global user.email ""
!git config --global user.name ""

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, path_dataset="eriktks/conll2003", revision=None):
        self.raw_data = load_dataset(path_dataset)
        self.train_set = self.raw_data["train"]
        self.test_set = self.raw_data["test"]
        self.val_set = self.raw_data["validation"]
        self.size = len(self.train_set) + len(self.test_set) + len(self.val_set)
        self.name_tags = self.raw_data["train"].features["ner_tags"].feature.names
        self.num_classes = self.raw_data["train"].features["ner_tags"].feature.num_classes
        print("-"*40, "Information of Dataset", "-"*40)
        print(self.raw_data)
        print("Labels tag name: ", self.name_tags)
        print("Number of tag name: ", self.num_classes)
        print("-"*40, "Information of Dataset", "-"*40)
        
    
    def __len__(self):
        return self.size
    
    def __getitem__(self, index):
        data = self.train_set[index]["tokens"]
        target = self.train_set[index]["ner_tags"]
        return {
            "data_text": data,
            "target_text": target
        }
    
    def illustrate_sample(self, index):
        sample = self[index]
        words = sample["data_text"]
        labels = sample["target_text"]
        line1 = line2 = ""
        for word, label in zip(words, labels):
            name_tag = self.name_tags[label]
            max_length = max(len(name_tag), len(word))
            line1 += word + " "*(max_length - len(word) + 1)
            line2 += name_tag + " "*(max_length - len(name_tag) + 1)
        print("Example " + str(index) + ":\n" + line1 + "\n" + line2)
        
    

In [5]:
from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification
)
from torch.utils.data import DataLoader

class Preprocessing():
    def __init__(self, model_tokenizer="bert-base-cased", batch_size=8, dataset=CustomDataset()):
        self.tokenizer = AutoTokenizer.from_pretrained(model_tokenizer)
        print("-"*50, "Information of Tokenizer", "-"*50)
        print(self.tokenizer)
        print("-"*50, "Information of Tokenizer", "-"*50)
        self.tokenized_train_set, self.tokenized_test_set, self.tokenized_val_set = self.map_tokenize_dataset(dataset=dataset)
        self.data_collator = DataCollatorForTokenClassification(tokenizer=self.tokenizer)
        self.id2label, self.label2id = self.hashmap_id_label(dataset=dataset)
        self.train_loader, self.test_loader, self.val_loader = self.data_loader(batch_size=batch_size)
        self.step_train_loader, self.step_test_loader, self.step_val_loader = len(self.train_loader), len(self.test_loader), len(self.val_loader)
    
    def align_labels_from_tokens(self, name_tags, word_ids):
        """After Tokenizer the length of labels is changed,
        preprocess the labels to new labels
        Args:
            name_tags (list): list of name tags [O, B-xxx, I-xxx]
            word_ids (list): position of tokens

        Returns:
            new labels: list of new labels
        """
        new_labels = []
        current_word = None
        for word_id in word_ids:
            if word_id != current_word:
                # start new token
                current_word = word_id
                label = -100 if word_id is None else name_tags[word_id]
                new_labels.append(label)
            elif word_id == None:
                # special token
                new_labels.append(-100)
            else:
                # word_id same previous word_id
                label = name_tags[word_id]
                # Nếu word_id giống cái trước đó => B-xxx convert to I-xxx 
                # Do token bị tách không có nghĩa luôn được gán B-xxx do cùng word_id với trước đó
                if label % 2 == 1:
                    label += 1
                new_labels.append(label)
        return new_labels
            
    def tokenize_with_align_labels(self, sample):
        tokenized_inputs = self.tokenizer(
            sample["tokens"], 
            truncation=True, 
            is_split_into_words=True
        )
        all_labels = sample["ner_tags"]
        new_labels = []
        for i, labels in enumerate(all_labels):
            word_ids = tokenized_inputs.word_ids(i)
            new_labels.append(self.align_labels_from_tokens(labels, word_ids))
            
        tokenized_inputs["labels"] = new_labels
        return tokenized_inputs
    
    def map_tokenize_dataset(self, dataset):
        tokenized_train_set = dataset.train_set.map(
            self.tokenize_with_align_labels,
            batched=True,
            remove_columns=dataset.train_set.column_names
        )
        tokenized_test_set = dataset.test_set.map(
            self.tokenize_with_align_labels,
            batched=True,
            remove_columns=dataset.test_set.column_names
        )
        tokenized_val_set = dataset.val_set.map(
            self.tokenize_with_align_labels,
            batched=True,
            remove_columns=dataset.val_set.column_names
        )
        return tokenized_train_set, tokenized_test_set, tokenized_val_set
        
    def hashmap_id_label(self, dataset):
        id2label = {i: label for i, label in enumerate(dataset.name_tags)}
        label2id = {label: i for i, label in id2label.items()}
        return id2label, label2id
    
    def data_loader(self, batch_size):
        train_loader = DataLoader(
            self.tokenized_train_set,
            shuffle=True,
            collate_fn=self.data_collator,
            batch_size=batch_size
        )
        
        val_loader = DataLoader(
            self.tokenized_val_set,
            shuffle=False,
            collate_fn=self.data_collator,
            batch_size=batch_size
        )
        
        test_loader = DataLoader(
            self.tokenized_test_set,
            shuffle=False,
            collate_fn=self.data_collator,
            batch_size=batch_size
        )
        return train_loader, test_loader, val_loader
        


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for eriktks/conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/eriktks/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

---------------------------------------- Information of Dataset ----------------------------------------
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
Labels tag name:  ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
Number of tag name:  9
---------------------------------------- Information of Dataset ----------------------------------------


In [6]:
from transformers import (
    TrainingArguments,
    Trainer,
    get_scheduler,
    AutoModelForTokenClassification
)
import evaluate
import torch
import os
#from preprocessing import Preprocessing
#from data_training import CustomDataset
import numpy as np
import evaluate
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
from huggingface_hub import Repository, HfApi, HfFolder

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Used Device: ", device)

class Training():
    def __init__(self, model_name="bert-base-cased", learning_rate=2e-5, epoch=5, 
                 num_warmup_steps=0, name_metric="seqeval", path_tensorboard="data_run", path_save="token_classifier_scratch"):
        self.dataset = CustomDataset()
        self.process = Preprocessing(dataset=self.dataset)
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            id2label=self.process.id2label,
            label2id=self.process.label2id
        )
        print("-"*50, "Information of Model", "-"*50)
        print(self.model)
        print("Parameters: ", int(self.model.num_parameters() / 1000000),  "M")
        print("-"*50, "Information of Model", "-"*50)
        self.epochs = epoch
        self.num_steps = self.epochs * self.process.step_train_loader
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(), 
            lr=learning_rate
        )
        self.lr_scheduler = get_scheduler(
            name="linear",
            optimizer=self.optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=self.num_steps
        )
        self.metric = evaluate.load(name_metric)
        self.writer = SummaryWriter("runs/" + path_tensorboard)
        
        # Define necessary variables
        self.api = HfApi(token="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")
        self.repo_name = path_save  # Replace with your repo name
        self.author = "Chessmen"
        self.repo_id = self.author + "/" + self.repo_name
        self.token = HfFolder.get_token()
        self.repo = self.setup_hf_repo(self.repo_name, self.repo_id, self.token)
        
    def setup_hf_repo(self, local_dir, repo_id, token):
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)
        
        try:
            self.api.repo_info(repo_id)
            print(f"Repository {repo_id} exists. Cloning...")
        except Exception as e:
            print(f"Repository {repo_id} does not exist. Creating...")
            self.api.create_repo(repo_id=repo_id, token=token, private=True)
        
        repo = Repository(local_dir=local_dir, clone_from=repo_id)
        return repo
    
    def save_and_upload(self, epoch, final_commit=False):
        # Save model, tokenizer, and additional files
        self.model.save_pretrained(self.repo_name)
        self.process.tokenizer.save_pretrained(self.repo_name)

        # Push to Hugging Face Hub
        self.repo.git_add(pattern=".")
        commit_message = "Final Commit: Complete fine-tuned model" if final_commit else f"Epoch {epoch}: Update fine-tuned model and metrics"
        self.repo.git_commit(commit_message)
        self.repo.git_push()

        print(f"Model and files pushed to Hugging Face Hub for epoch {epoch}: {self.repo_id}")
    
    def compute_metrics(self, eval_preds):
        logits, labels = eval_preds
        predictions = np.argmax(logits, axis=-1)
        
        # Xoá token đặc biệt và chuyển chúng về name tags
        true_labels = [[self.dataset.name_tags[l] for l in label if l != -100]for label in labels]
        true_predictions = [
            [self.dataset.name_tags[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        all_metrics = self.metric.compute(
            predictions=true_predictions,
            references=true_labels
        )
        return {
            "precision": all_metrics["overall_precision"],
            "recall": all_metrics["overall_recall"],
            "f1": all_metrics["overall_f1"],
            "accuracy": all_metrics["overall_accuracy"],
        }
    
    def postprocess(self, predictions, labels):
        predictions = predictions.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()
        # Xoá token đặc biệt và chuyển chúng về name tags
        true_labels = [[self.dataset.name_tags[l] for l in label if l != -100]for label in labels]
        true_predictions = [
            [self.dataset.name_tags[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        
        return true_predictions, true_labels
    
    def fit(self, flag_step=False):
        progress_bar = tqdm(range(self.num_steps))
        interval = 200
        for epoch in range(self.epochs):
            # training
            self.model.train()
            n_train_samples = 0
            total_train_loss = 0
            for i, batch in enumerate(self.process.train_loader):
                batch = {k: v.to(device) for k, v in batch.items()}
                n_train_samples += len(batch)
                outputs = self.model.to(device)(**batch)
                losses = outputs.loss
                losses.backward()
                
                total_train_loss += round(losses.item(),4)
                
                self.optimizer.step()
                self.lr_scheduler.step()
                self.optimizer.zero_grad()
                progress_bar.update(1)
                if (i + 1) % interval == 0 and flag_step == True:
                    print("Epoch: {}/{}, Iteration: {}/{}, Train Loss: {}".format(
                        epoch + 1,
                        self.epochs,
                        i + 1,
                        self.process.step_train_loader,
                        losses.item())
                    )
                    self.writer.add_scalar('Train/Loss', round(losses.item(),4), epoch * self.process.step_train_loader + i)
            
            # evaluate
            self.model.eval()
            n_val_samples = 0
            total_val_loss = 0
            for i, batch in enumerate(self.process.val_loader):
                batch = {k: v.to(device) for k, v in batch.items()}
                n_val_samples += len(batch)
                with torch.no_grad():
                    outputs = self.model.to(device)(**batch)
                logits = outputs.logits
                losses = outputs.loss
                predictions = torch.argmax(logits, dim=-1)
                
                total_val_loss += round(losses.item(),4)
                
                labels = batch["labels"]
                true_predictions, true_labels = self.postprocess(predictions, labels)
                self.metric.add_batch(predictions=true_predictions, references=true_labels)
                if (i + 1) % interval == 0 and flag_step == True:
                    print("Epoch: {}/{}, Iteration: {}/{}, Val Loss: {}".format(
                        epoch + 1,
                        self.epochs,
                        i + 1,
                        self.process.step_val_loader,
                        losses.item())
                    )
                    self.writer.add_scalar('Val/Loss', round(losses.item(),4), epoch * self.process.step_val_loader + i)         
            
            epoch_train_loss = total_train_loss / n_train_samples
            epoch_val_loss = total_val_loss / n_val_samples
            print(f"train_loss: {epoch_train_loss}  - val_loss: {epoch_val_loss}")
    
            metrics = self.metric.compute()
            print(
                f"epoch {epoch+1}:",
                {
                    key: metrics[f"overall_{key}"]
                    for key in ["precision", "recall", "f1", "accuracy"]
                },
            )
            # Save and upload after each epoch
            final_commit = ((epoch+1) == self.epochs)
            self.save_and_upload((epoch+1), final_commit)
            
        
                

Used Device:  cuda


In [7]:
train = Training()
train.fit()

---------------------------------------- Information of Dataset ----------------------------------------
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
Labels tag name:  ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
Number of tag name:  9
---------------------------------------- Information of Dataset ----------------------------------------


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

-------------------------------------------------- Information of Tokenizer --------------------------------------------------
BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
-

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


-------------------------------------------------- Information of Model --------------------------------------------------
BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
       

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Repository Chessmen/token_classifier_scratch does not exist. Creating...


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Chessmen/token_classifier_scratch into local empty directory.


  0%|          | 0/8780 [00:00<?, ?it/s]

train_loss: 0.03556409453302957  - val_loss: 0.015240909090909091 -
epoch 1: {'precision': 0.9113673805601318, 'recall': 0.9309996634129922, 'f1': 0.921078921078921, 'accuracy': 0.9825601930888327}


Upload file model.safetensors:   0%|          | 1.00/411M [00:00<?, ?B/s]

To https://huggingface.co/Chessmen/token_classifier_scratch
   ef5f6c0..d066e41  main -> main



Model and files pushed to Hugging Face Hub for epoch 1: Chessmen/token_classifier_scratch
train_loss: 0.010366913439635535  - val_loss: 0.014757432432432436 -
epoch 2: {'precision': 0.9274765122795451, 'recall': 0.9469875462807136, 'f1': 0.9371304854692315, 'accuracy': 0.9848560664037205}


Upload file model.safetensors:   0%|          | 1.00/411M [00:00<?, ?B/s]

To https://huggingface.co/Chessmen/token_classifier_scratch
   d066e41..1b9f392  main -> main



Model and files pushed to Hugging Face Hub for epoch 2: Chessmen/token_classifier_scratch
train_loss: 0.005588653189066058  - val_loss: 0.013519103194103195 -
epoch 3: {'precision': 0.9253289473684211, 'recall': 0.9468192527768429, 'f1': 0.9359507569455997, 'accuracy': 0.985636074645317}


Upload file model.safetensors:   0%|          | 1.00/411M [00:00<?, ?B/s]

To https://huggingface.co/Chessmen/token_classifier_scratch
   1b9f392..1335b7a  main -> main



Model and files pushed to Hugging Face Hub for epoch 3: Chessmen/token_classifier_scratch
train_loss: 0.003070045558086566  - val_loss: 0.014535073710073695 -
epoch 4: {'precision': 0.9279249629751523, 'recall': 0.9490070683271625, 'f1': 0.9383476162742325, 'accuracy': 0.9861953258374051}


Upload file model.safetensors:   0%|          | 1.00/411M [00:00<?, ?B/s]

To https://huggingface.co/Chessmen/token_classifier_scratch
   1335b7a..287ccf7  main -> main



Model and files pushed to Hugging Face Hub for epoch 4: Chessmen/token_classifier_scratch
train_loss: 0.0018046981776765466  - val_loss: 0.015159520884520869 -
epoch 5: {'precision': 0.9323878627968337, 'recall': 0.9515314708852238, 'f1': 0.9418624021322671, 'accuracy': 0.9870930711720728}


Upload file model.safetensors:   0%|          | 1.00/411M [00:00<?, ?B/s]

To https://huggingface.co/Chessmen/token_classifier_scratch
   287ccf7..a6705b9  main -> main



Model and files pushed to Hugging Face Hub for epoch 5: Chessmen/token_classifier_scratch


In [44]:
%load_ext tensorboard
%tensorboard --logdir runs