In [1]:
# !pip install transformers
import transformers

In [10]:
from transformers import AutoModel

# Tokenizer and encoder are loaded from the same 'bert-base-uncased' checkpoint and
# kept as module-level objects; the dataset and model cells below read them directly.
tokenizer = transformers.BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)
bert_model = AutoModel.from_pretrained(
    'bert-base-uncased',
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [19]:
## Configuration: paths and hyper-parameters shared by the cells below

# Project folder (looks like a Colab-mounted Drive path). The trailing slash matters
# because file paths are built from it by plain string concatenation.
FOLDER_PATH = '/content/drive/My Drive/Colab Notebooks/Bert_projects/NER/'
TRAINING_FILE = f"{FOLDER_PATH}Data/ner_dataset.csv"
MODEL_PATH = 'model.bin'

# Fixed sequence length of every encoded example (special tokens included).
MAX_LEN = 128

# Optimisation settings.
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 32, 8
EPOCHS = 2

In [25]:
# pandas is imported here because, in file order, this preview is the first cell that
# uses `pd`; without it a fresh "Restart & Run All" fails with a NameError.
import pandas as pd

# Peek at the raw table: one row per word, with its sentence marker, POS and entity tag.
data_ = pd.read_csv(TRAINING_FILE, encoding='latin-1')
data_.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [13]:
import torch
## Data loader


class EntityDataset:
    """Indexable dataset turning word-level sentences into fixed-length BERT inputs.

    Every word is split into word pieces with the module-level ``tokenizer`` and the
    word's POS / entity label is repeated once per piece. Each example is truncated
    to ``MAX_LEN - 2`` pieces, wrapped in the ids 101 / 102 (the special-token ids the
    tokenizer emits for this checkpoint) and right-padded with zeros up to ``MAX_LEN``.
    """

    def __init__(self, texts, pos, tags):
        # texts:    [["hi", ",", "my", "name", "is", "ajay"], ["hello", ...], ...]
        # pos/tags: [[1, 2, 3, 4, 1, 5], [...], ...]  -- one integer label per word
        self.texts = texts
        self.pos = pos
        self.tags = tags

    def __len__(self):
        """Number of sentences held by the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Encode sentence ``item`` and return a dict of ``torch.long`` tensors.

        Keys: "ids", "mask", "token_type_ids", "target_pos", "target_tag".
        """
        words = self.texts[item]
        word_pos = self.pos[item]
        word_tags = self.tags[item]

        ids, target_pos, target_tag = [], [], []
        for idx, word in enumerate(words):
            # e.g. abhishek -> ab ##hi ##sh ##ek : one word may yield several pieces
            pieces = tokenizer.encode(word, add_special_tokens=False)
            n_pieces = len(pieces)
            ids += pieces
            # every piece inherits the label of the word it came from
            target_pos += [word_pos[idx]] * n_pieces
            target_tag += [word_tags[idx]] * n_pieces

        # Leave room for the two special tokens, then add them (their labels are 0).
        budget = MAX_LEN - 2
        ids = [101, *ids[:budget], 102]
        target_pos = [0, *target_pos[:budget], 0]
        target_tag = [0, *target_tag[:budget], 0]

        real_len = len(ids)
        pad = [0] * (MAX_LEN - real_len)

        features = {
            "ids": ids + pad,
            "mask": [1] * real_len + pad,            # 1 = real token, 0 = padding
            "token_type_ids": [0] * real_len + pad,  # single-segment input
            "target_pos": target_pos + pad,
            "target_tag": target_tag + pad,
        }
        return {
            name: torch.tensor(values, dtype=torch.long)
            for name, values in features.items()
        }

In [14]:
## Engine file
from tqdm import tqdm

def train_fn(data_loader, model, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch is a dict whose values are moved to ``device`` and passed to the
    model as keyword arguments; the model is expected to return a 3-tuple whose
    last element is the loss. The optimizer and the scheduler are both stepped
    once per batch.

    Returns the mean of the per-batch losses.
    """
    model.train()
    n_batches = len(data_loader)
    running_loss = 0
    for batch in tqdm(data_loader, total=n_batches):
        # move every tensor of the batch onto the target device, in place
        batch.update({key: value.to(device) for key, value in batch.items()})

        optimizer.zero_grad()
        _tag_logits, _pos_logits, loss = model(**batch)
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
    return running_loss / n_batches


def eval_fn(data_loader, model, device):
    """Evaluate ``model`` on ``data_loader`` and return the mean per-batch loss.

    The model is switched to eval mode and the whole pass runs under
    ``torch.no_grad()``: no backward pass happens here, so building the autograd
    graph would only cost memory and time.

    Each batch is a dict whose values are moved to ``device`` and passed to the
    model as keyword arguments; the model is expected to return a 3-tuple whose
    last element is the loss.
    """
    model.eval()
    final_loss = 0
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            for k, v in data.items():
                data[k] = v.to(device)
            _, _, loss = model(**data)
            final_loss += loss.item()
    return final_loss / len(data_loader)


In [17]:
## Model file
import torch.nn as nn

def loss_fn(output, target, mask, num_labels):
  """Cross-entropy averaged over the positions where ``mask`` equals 1.

  ``output`` is flattened to ``(-1, num_labels)`` and ``target`` / ``mask`` to 1-D.
  Targets at positions where the mask is not 1 (padding) are replaced by the
  criterion's ``ignore_index`` so they contribute nothing to the loss.
  """
  criterion = nn.CrossEntropyLoss()

  flat_logits = output.view(-1, num_labels)
  flat_targets = target.view(-1)
  is_real_token = mask.view(-1) == 1

  # Same dtype/device as the targets, filled with ignore_index (-100 by default).
  ignored = torch.full_like(flat_targets, criterion.ignore_index)
  masked_targets = torch.where(is_real_token, flat_targets, ignored)

  return criterion(flat_logits, masked_targets)


class EntityModel(nn.Module):
  """Shared BERT encoder with two token-level heads: entity tag and POS.

  The encoder is the module-level ``bert_model`` instance (it is not copied, so
  every ``EntityModel`` created in the same session shares those weights until a
  state dict is loaded).

  Args:
    num_tag: number of entity-tag classes.
    num_pos: number of part-of-speech classes.
  """

  def __init__(self, num_tag, num_pos):
    super(EntityModel, self).__init__()
    self.bert = bert_model
    self.num_tag = num_tag
    self.num_pos = num_pos
    self.bert_drop_1 = nn.Dropout(0.3)
    self.bert_drop_2 = nn.Dropout(0.3)
    # Take the feature width from the encoder's config when it has one, so a
    # checkpoint of another size also works; 768 is the previous hard-coded value.
    hidden_size = getattr(getattr(self.bert, "config", None), "hidden_size", 768)
    self.out_tag = nn.Linear(hidden_size, self.num_tag)
    self.out_pos = nn.Linear(hidden_size, self.num_pos)

  def forward(self, ids, mask, token_type_ids, target_pos, target_tag):
    """Return ``(tag_logits, pos_logits, loss)`` for one batch.

    ``loss`` is the mean of the two masked cross-entropy losses computed by
    ``loss_fn`` for the tag head and the POS head.
    """
    outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
    # Index instead of tuple-unpacking: this works both when the encoder returns a
    # plain tuple and when it returns a dict-like ModelOutput (where unpacking
    # would yield the key names rather than the tensors).
    o1 = outputs[0]

    bo_tag = self.bert_drop_1(o1)
    bo_pos = self.bert_drop_2(o1)

    tag = self.out_tag(bo_tag)
    pos = self.out_pos(bo_pos)

    loss_tag = loss_fn(tag, target_tag, mask, self.num_tag)
    loss_pos = loss_fn(pos, target_pos, mask, self.num_pos)

    loss = (loss_tag + loss_pos) / 2

    return tag, pos, loss

In [20]:
## train file

import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


def process_data(data_path):
  """Load the token-level CSV and group it into per-sentence word/label lists.

  The file must have the columns "Sentence #", "Word", "POS" and "Tag". Only the
  first row of each sentence carries a "Sentence #" value, so the column is
  forward-filled before grouping.

  Args:
    data_path: path of the CSV file (read with latin-1 encoding).

  Returns:
    (sentences, pos, tag, enc_pos, enc_tag): three aligned object arrays holding,
    per sentence, the list of words, the list of encoded POS ids and the list of
    encoded tag ids, followed by the two fitted label encoders. Sentences come
    out in the sorted order of their "Sentence #" label (groupby default).
  """
  df = pd.read_csv(data_path, encoding='latin-1')
  # .ffill() replaces fillna(method="ffill"), which recent pandas deprecates.
  df["Sentence #"] = df["Sentence #"].ffill()

  enc_pos = LabelEncoder()
  enc_tag = LabelEncoder()

  # Plain column assignment swaps in the integer codes as a new column instead of
  # writing them into the existing string column.
  df["POS"] = enc_pos.fit_transform(df["POS"])
  df["Tag"] = enc_tag.fit_transform(df["Tag"])

  by_sentence = df.groupby("Sentence #")
  sentences = by_sentence["Word"].apply(list).values
  pos = by_sentence["POS"].apply(list).values
  tag = by_sentence["Tag"].apply(list).values

  return sentences, pos, tag, enc_pos, enc_tag

if __name__ =='__main__':
  # ---- Data ---------------------------------------------------------------
  sentences, pos, tag, enc_pos, enc_tag = process_data(TRAINING_FILE)

  # Persist the fitted encoders so predictions can be mapped back to label names.
  meta_data = {
      "enc_pos":enc_pos,
      "enc_tag":enc_tag
  }

  joblib.dump(meta_data,FOLDER_PATH + "meta.bin")

  num_pos = len(list(enc_pos.classes_))
  num_tag = len(list(enc_tag.classes_))

  # 90/10 split at sentence level; the three arrays are split consistently.
  (
      train_sentences,
      test_sentences,
      train_pos,
      test_pos,
      train_tag,
      test_tag
  ) = train_test_split(sentences,pos,tag, random_state=42, test_size=0.1)

  train_dataset = EntityDataset(
      texts=train_sentences, pos=train_pos, tags=train_tag
  )

  train_data_loader = torch.utils.data.DataLoader(
      train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=4
  )

  valid_dataset = EntityDataset(
      texts=test_sentences, pos=test_pos, tags=test_tag
  )

  valid_data_loader = torch.utils.data.DataLoader(
      valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
  )

  # ---- Model --------------------------------------------------------------
  # Fall back to the CPU so the cell does not crash on a runtime without a GPU.
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = EntityModel(num_tag=num_tag, num_pos=num_pos)
  model.to(device)

  # ---- Optimizer ----------------------------------------------------------
  # Biases and LayerNorm parameters are excluded from weight decay.
  param_optimizer = list(model.named_parameters())
  no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
  optimizer_parameters = [
      {
          "params": [
              p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
          ],
          "weight_decay": 0.001,
      },
      {
          "params": [
              p for n, p in param_optimizer if any(nd in n for nd in no_decay)
          ],
          "weight_decay": 0.0,
      },
  ]

  # train_fn steps the scheduler once per batch, so the schedule length is the
  # number of batches per epoch times the number of epochs. The previous
  # int(len(train_sentences) / TRAIN_BATCH_SIZE * EPOCHS) could fall short of that
  # when the last batch is partial.
  num_train_steps = len(train_data_loader) * EPOCHS
  optimizer = AdamW(optimizer_parameters, lr=3e-5)
  scheduler = get_linear_schedule_with_warmup(
      optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
  )

  # ---- Training loop ------------------------------------------------------
  # Keep only the checkpoint with the lowest validation loss.
  best_loss = np.inf
  for epoch in range(EPOCHS):
    train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    test_loss = eval_fn(valid_data_loader, model, device)
    print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
    if test_loss < best_loss:
      torch.save(model.state_dict(), FOLDER_PATH + MODEL_PATH)
      best_loss = test_loss


  0%|          | 0/1349 [00:00<?, ?it/s][A
  0%|          | 1/1349 [00:01<29:18,  1.30s/it][A
  0%|          | 2/1349 [00:01<24:55,  1.11s/it][A
  0%|          | 3/1349 [00:02<21:53,  1.03it/s][A
  0%|          | 4/1349 [00:03<19:49,  1.13it/s][A
  0%|          | 5/1349 [00:03<18:29,  1.21it/s][A
  0%|          | 6/1349 [00:04<17:26,  1.28it/s][A
  1%|          | 7/1349 [00:05<16:40,  1.34it/s][A
  1%|          | 8/1349 [00:05<16:09,  1.38it/s][A
  1%|          | 9/1349 [00:06<15:50,  1.41it/s][A
  1%|          | 10/1349 [00:07<15:37,  1.43it/s][A
  1%|          | 11/1349 [00:08<15:27,  1.44it/s][A
  1%|          | 12/1349 [00:08<15:21,  1.45it/s][A
  1%|          | 13/1349 [00:09<15:15,  1.46it/s][A
  1%|          | 14/1349 [00:10<15:14,  1.46it/s][A
  1%|          | 15/1349 [00:10<15:14,  1.46it/s][A
  1%|          | 16/1349 [00:11<15:10,  1.46it/s][A
  1%|▏         | 17/1349 [00:12<15:09,  1.46it/s][A
  1%|▏         | 18/1349 [00:12<15:10,  1.46it/s][A
  1%|▏    

Train Loss = 0.13286377820949186 Valid Loss = 0.09646034685739627



  0%|          | 0/1349 [00:00<?, ?it/s][A
  0%|          | 1/1349 [00:01<25:46,  1.15s/it][A
  0%|          | 2/1349 [00:01<22:33,  1.00s/it][A
  0%|          | 3/1349 [00:02<20:22,  1.10it/s][A
  0%|          | 4/1349 [00:03<18:48,  1.19it/s][A
  0%|          | 5/1349 [00:03<17:41,  1.27it/s][A
  0%|          | 6/1349 [00:04<16:54,  1.32it/s][A
  1%|          | 7/1349 [00:05<16:25,  1.36it/s][A
  1%|          | 8/1349 [00:05<16:01,  1.39it/s][A
  1%|          | 9/1349 [00:06<15:48,  1.41it/s][A
  1%|          | 10/1349 [00:07<15:38,  1.43it/s][A
  1%|          | 11/1349 [00:07<15:30,  1.44it/s][A
  1%|          | 12/1349 [00:08<15:24,  1.45it/s][A
  1%|          | 13/1349 [00:09<15:20,  1.45it/s][A
  1%|          | 14/1349 [00:10<15:21,  1.45it/s][A
  1%|          | 15/1349 [00:10<15:18,  1.45it/s][A
  1%|          | 16/1349 [00:11<15:17,  1.45it/s][A
  1%|▏         | 17/1349 [00:12<15:13,  1.46it/s][A
  1%|▏         | 18/1349 [00:12<15:10,  1.46it/s][A
  1%|▏    

Train Loss = 0.058377550695342784 Valid Loss = 0.09535231203539297


In [27]:
def _get_prediction(sentence):
    """Print the predicted entity tag and POS label for each token of ``sentence``.

    Loads the label encoders from ``meta.bin`` and the saved weights from
    ``MODEL_PATH`` (both under ``FOLDER_PATH``), encodes the whitespace-split
    sentence with ``EntityDataset`` and runs one forward pass.

    Args:
        sentence: raw input string.

    Returns:
        None. The split words, the token ids and the decoded predictions are printed.
        Predictions include the two special-token positions and have one entry per
        word piece, not per word.
    """
    meta_data = joblib.load(FOLDER_PATH + "meta.bin")
    enc_pos = meta_data["enc_pos"]
    enc_tag = meta_data["enc_tag"]

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    # Encoded once with special tokens only to know how many positions to display.
    tokenized_sentence = tokenizer.encode(sentence)

    sentence = sentence.split()
    print(sentence)
    print(tokenized_sentence)

    # Dummy all-zero labels: the dataset requires targets, but the loss is discarded.
    test_dataset = EntityDataset(
        texts=[sentence],
        pos=[[0] * len(sentence)],
        tags=[[0] * len(sentence)]
    )

    # Use the GPU when present, otherwise the CPU; map_location lets a checkpoint
    # saved on a GPU be loaded on a CPU-only runtime.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(FOLDER_PATH + MODEL_PATH, map_location=device))
    model.to(device)
    # Inference must run in eval mode: otherwise the dropout layers stay active and
    # the predictions change from call to call.
    model.eval()

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            # add the batch dimension expected by the model
            data[k] = v.to(device).unsqueeze(0)
        tag, pos, _ = model(**data)

        print("Tags---",
            enc_tag.inverse_transform(
                tag.argmax(2).cpu().numpy().reshape(-1)
            )[:len(tokenized_sentence)]
        )
        print("POS----",
            enc_pos.inverse_transform(
                pos.argmax(2).cpu().numpy().reshape(-1)
            )[:len(tokenized_sentence)]
        )

In [28]:
# Smoke test: run the saved model on one short sentence and print its predictions.
_get_prediction('ajay lives in india')

['ajay', 'lives', 'in', 'india']
[101, 19128, 4710, 3268, 1999, 2634, 102]
Tags--- ['B-art' 'B-per' 'B-per' 'O' 'O' 'B-geo' 'B-art']
POS---- ['$' 'NNP' 'NNP' 'VBZ' 'IN' 'NNP' '$']
