In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
import plotly.express as px

import pandas as pd
import numpy as np
import transformers
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup, BertConfig
import torch
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import torch.nn as nn
from sklearn.metrics import accuracy_score
import pickle as pkl
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import string
import os


In [3]:
## Loading the dataset
data_path = "ner_datasetreference.csv"
data = pd.read_csv(data_path, encoding= 'latin-1')
data.dropna()
data.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,Thousands,O
1,,of,O
2,,demonstrators,O
3,,have,O
4,,marched,O


In [4]:
## Finding the number of tags in the tag column of the dataset
num_tags = data["Tag"].nunique()
num_tags

17

In [5]:
## Finding the count of each tags in the tag column of the dataset
data['Tag'].value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [6]:
## Plotting the count of each tags

bar = data['Tag'].value_counts()
fig = px.bar(x = bar.index, y = bar.values, title = "Count of Each Tags")

fig.update_xaxes(title_text = "Tag Names", tickangle= 320)
fig.update_yaxes(title_text = "Tag Counts")

fig.show()

In [7]:
class Config:
    CLS = [101]
    SEP = [102]
    VALUE_TOKEN = [-100]
    MAX_LEN = 256
    TRAIN_BATCH_SIZE = 32
    VAL_BATCH_SIZE = 8
    EPOCHS = 3
    TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
data["Sentence #"] = data["Sentence #"].fillna(method='ffill')
enc_tag = LabelEncoder().fit(data['Tag'])
data["Encoded_Tag"] = enc_tag.transform(data["Tag"])
pkl.dump(enc_tag, open('labelenc.pkl', 'wb'))
data.head()

Unnamed: 0,Sentence #,Word,Tag,Encoded_Tag
0,Sentence: 1,Thousands,O,16
1,Sentence: 1,of,O,16
2,Sentence: 1,demonstrators,O,16
3,Sentence: 1,have,O,16
4,Sentence: 1,marched,O,16


In [9]:
data_gr = data.groupby("Sentence #").agg({'Word': list,  'Tag':list, 'Encoded_Tag': list})
data_gr.head()

Unnamed: 0_level_0,Word,Tag,Encoded_Tag
Sentence #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[16, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16..."
Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16..."
Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[16, 16, 7, 16, 16, 16, 16, 16, 2, 16, 16, 16,..."
Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[O, O, O, O, O, O, O, O, O, O, O]","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]"
Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[2, 16, 16, 6, 14, 16, 7, 16, 2, 16, 3, 16, 3,..."


In [10]:
#Train Test Split
train_sent, val_sent, train_tag, val_tag = train_test_split(data_gr['Word'], data_gr['Encoded_Tag'], test_size=0.01, random_state=10)

In [11]:
class Dataset:

  def __init__(self, texts, tags):


    self.texts = texts
    self.tags = tags

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, index):
    texts = self.texts[index]
    tags = self.tags[index]

    #Tokenise
    ids = []
    target_tag = []

    for idx, words in enumerate(texts):
        inputs= Config.TOKENIZER.encode(words, add_special_tokens=False)

        input_len = len(inputs)
        ids.extend(inputs)
        target_tag.extend(input_len * [tags[idx]])

    #To Add Special Tokens, subtract 2 from MAX_LEN
    ids = ids[:Config.MAX_LEN - 2]
    target_tag = target_tag[:Config.MAX_LEN - 2]

    #Add Sepcial Tokens
    ids = Config.CLS + ids + Config.SEP
    target_tags = Config.VALUE_TOKEN + target_tag + Config.VALUE_TOKEN

    mask = [1] * len(ids)
    token_type_ids = [0] * len(ids)

    #Add Padding if the input_len is small

    padding_len = Config.MAX_LEN - len(ids)
    ids = ids + ([0] * padding_len)
    target_tags = target_tags + ([0] * padding_len)
    mask = mask + ([0] * padding_len)
    token_type_ids = token_type_ids + ([0] * padding_len)

    return {
        "ids" : torch.tensor(ids, dtype=torch.long),
        "mask" : torch.tensor(mask, dtype=torch.long),
        "token_type_ids" : torch.tensor(token_type_ids, dtype=torch.long),
        "target_tags" : torch.tensor(target_tags, dtype=torch.long)
      }


In [12]:
device =  "cuda" if torch.cuda.is_available() else "cpu"

In [13]:
class NERBertModel(nn.Module):

    def __init__(self, num_tag):
        super(NERBertModel, self).__init__()
        self.num_tag = num_tag
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert_drop = nn.Dropout(0.3)
        self.out_tag = nn.Linear(768, self.num_tag)

    def forward(self, ids, mask, token_type_ids, target_tags):
        output, _ = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        bert_out = self.bert_drop(output)
        tag = self.out_tag(bert_out)

        #Calculate the loss
        Critirion_Loss = nn.CrossEntropyLoss()
        active_loss = mask.view(-1) == 1
        active_logits = tag.view(-1, self.num_tag)
        active_labels = torch.where(active_loss, target_tags.view(-1), torch.tensor(Critirion_Loss.ignore_index).type_as(target_tags))
        loss = Critirion_Loss(active_logits, active_labels)
        return tag, loss

In [14]:
def train_fn(train_data_loader, model, optimizer, device, scheduler):
    loss_ = 0
    for data in tqdm(train_data_loader, total = len(train_data_loader)):
        for i, j in data.items():
            data[i] = j.to(device)

        #Backward Propagation
        optimizer.zero_grad()
        _, loss = model(**data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        loss_ += loss.item()
    return model, loss_ / len(train_data_loader)



In [15]:
model = NERBertModel(num_tags)
model.train()

# Save the model parameters to a file
model_path = 'ner_bert_model.pth'
torch.save(model.state_dict(), model_path)
print(f'Model saved at path: {model_path}')

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model saved at path: ner_bert_model.pth


In [16]:
def val_fn(val_data_loader, model, optimizer, device, scheduler):
    model.eval()
    loss_ = 0
    for data in tqdm(val_data_loader, total = len(val_data_loader)):
        for i, j in data.items():
            data[i] = j.to(device)
        _, loss = model(**data)
        loss_ += loss.item()
    return loss_ / len(val_data_loader)

In [17]:
train_dataset = Dataset(texts = train_sent, tags = train_tag)
val_dataset = Dataset(texts = val_sent, tags = val_tag)
train_data_loader = DataLoader(train_dataset, batch_size=Config.TRAIN_BATCH_SIZE)
val_data_loader = DataLoader(val_dataset, batch_size=Config.VAL_BATCH_SIZE)


In [18]:
model.load_state_dict(torch.load(model_path))
model.to(device)
print(model)

NERBertModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [19]:
#Function for getparameters
def get_hyperparameters(model, fine_tuning):

    if fine_tuning:
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "gamma", "beta"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.0,
            },
        ]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

    return optimizer_grouped_parameters


In [20]:
FULL_FINETUNING = True
optimizer_grouped_parameters = get_hyperparameters(model, FULL_FINETUNING)
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=3e-5)
num_train_steps = int(len(train_sent) / Config.TRAIN_BATCH_SIZE * Config.EPOCHS)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps
)

In [None]:
for epoch in range(Config.EPOCHS):
    model, train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    val_loss = val_fn(val_data_loader, model, optimizer, device, scheduler)
    print(f"Epoch: {epoch + 1}, Train_loss: {train_loss}, Val_loss: {val_loss}")

100%|██████████| 1484/1484 [32:20<00:00,  1.31s/it]
100%|██████████| 60/60 [00:07<00:00,  7.71it/s]


Epoch: 1, Train_loss: 0.23326724812368338, Val_loss: 0.16115380420039097


100%|██████████| 1484/1484 [31:44<00:00,  1.28s/it]
100%|██████████| 60/60 [00:07<00:00,  7.75it/s]


Epoch: 2, Train_loss: 0.1476638252593356, Val_loss: 0.1503830986097455


 24%|██▍       | 356/1484 [07:36<23:53,  1.27s/it]

In [None]:
def prediction(test_sentence, model, enc_tag):
    for i in list(string.punctuation):
        test_sentence = test_sentence.replace(i, ' ' + i)
    test_sentence = test_sentence.split()
    print(test_sentence)
    Token_inputs = Config.TOKENIZER.encode(test_sentence, add_special_tokens=False)
    print(Token_inputs)
    test_dataset =  Dataset(test_sentence, tags= [[1] * len(test_sentence)])
    num_tag = len(enc_tag.classes_)

    with torch.no_grad():
        data = test_dataset[0]
        for i, j in data.items():
            data[i] = j.to(device).unsqueeze(0)
        tag, _ = model(**data)

        print(enc_tag.inverse_transform(tag.argmax(2).cpu().numpy().reshape(-1))[1:len(Token_inputs)+1])


In [None]:
test_sentence = "Charles was born in France on 19 November 1996."
prediction(test_sentence, model, enc_tag)
