## Make Data set

In [1]:
import pickle

with open('texts.pkl','rb') as f:
  texts = pickle.load(f)

with open('tags.pkl','rb') as f:
  tags = pickle.load(f)

In [2]:
#split into training and validation sets
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('dslim/bert-base-NER')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [4]:
#get tokenized texts
train_encodings = tokenizer(train_texts, is_split_into_words=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, padding=True, truncation=True)

In [5]:
#define the tags and labels
unique_tags = ('O','B-PRODUCT','I-PRODUCT')
tag2id = {tag:id for id,tag in enumerate(unique_tags)}
id2tag = {id:tag for id,tag in enumerate(unique_tags)}

In [6]:
#Tokenizer is sub word so it splits each word into sub tokens thus we need to align the labels to how the tokenizer split the words
def encode_tags(tags_list,encodings,label_all_tokens):

    labels_list = []
    for i, tags in enumerate(tags_list):
        word_ids = encodings.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(tag2id[tags[word_idx-1]])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(tag2id[tags[word_idx-1]] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels_list.append(label_ids)

    return labels_list

In [7]:
#align the labels
train_labels = encode_tags(train_tags,train_encodings,True)
val_labels = encode_tags(val_tags,val_encodings,True)

In [8]:
#make the dataset class that will mimic the DataSet class in hugging face and will produce objects that will work with hugging face when we train
import torch
class FurnitureDataset:
    def __init__(self,encodings,labels):
        self.encodings = encodings
        self. labels = labels

    def __getitem__(self,idx):
        items = {key:torch.tensor(val[idx]) for key,val in self.encodings.items()}
        items['labels'] = torch.tensor(self.labels[idx])
        items.pop('token_type_ids')
        return items

    def __len__(self):
        return len(self.labels)

In [9]:
#make the training and validation datasets
train_dataset = FurnitureDataset(train_encodings,train_labels)
val_dataset = FurnitureDataset(val_encodings,val_labels)

In [10]:
val_dataset[0:5]

{'input_ids': tensor([[  101, 12050,  1106,  ...,     0,     0,     0],
         [  101, 13899,  1306,  ...,  2539,  1665,   102],
         [  101,  3341, 17228,  ...,   122,   119,   102],
         [  101, 16851,  1643,  ...,     0,     0,     0],
         [  101,  3341,  2117,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[-100,    0,    0,  ..., -100, -100, -100],
         [-100,    0,    0,  ...,    0,    0, -100],
         [-100,    0,    0,  ...,    0,    0, -100],
         [-100,    0,    0,  ..., -100, -100, -100],
         [-100,    0,    0,  ..., -100, -100, -100]])}

In [11]:
len(train_dataset)

119

## Make evaluation function (precision,recall,f1,accuracy)

In [13]:
import evaluate
metric = evaluate.load('seqeval')

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [14]:
#create function that takes in predictions and actual labels and computes evaluation metrics
import numpy as np

def compute_metrics(eval_preds):
    logits,labels = eval_preds
    predictions = np.argmax(logits,axis=-1)

    true_labels = [[id2tag[l] for l in label if l!=-100] for label in labels]
    true_predictions = [[id2tag[p] for p,l in zip(pred,label) if l!=-100] for pred,label in zip(predictions,labels)]

    all_metrics = metric.compute(predictions=true_predictions,references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

## Train model

In [15]:
#Load pretrained model from hugging face that we will fine tune and train on our custom data set
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    'dslim/bert-base-NER',
    id2label = id2tag,
    label2id = tag2id,
    ignore_mismatched_sizes=True, #lets us use our own entites
)

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) 

In [16]:
model.config.num_labels

3

In [17]:
#use DataLoader to split our datasets into batches
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(val_dataset, batch_size=8)

In [18]:
#set up optimizer, in this case i chose to use AdamW
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [19]:
#define the number of epochs we will train for and use a linear lr scheduler for dynamic change of the lr during training for better results
from transformers import get_scheduler

num_epochs = 80
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [20]:
#use google collab GPU
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [26]:
#finally we define our training loop, and print the training loss at the end of each epoch
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    print(loss)

  0%|          | 0/1200 [00:00<?, ?it/s]

tensor(0.3430, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.2804, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.3265, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.2104, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.2595, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0623, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0842, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0809, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0781, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0406, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0854, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0197, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0904, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0349, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0711, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0123, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.1388, device='cuda:0', grad_fn=

In [27]:
#save fine tuned model for use
model.save_pretrained('./fine-tuned-NER') #overall 0.0013 training loss
tokenizer.save_pretrained('./fine-tuned-NER')

('./fine-tuned-NER/tokenizer_config.json',
 './fine-tuned-NER/special_tokens_map.json',
 './fine-tuned-NER/vocab.txt',
 './fine-tuned-NER/added_tokens.json',
 './fine-tuned-NER/tokenizer.json')