In [None]:
!pip install transformers
!pip install pytorch-crf

In [1]:
import joblib
import torch
import torch.nn as nn
import transformers
from torchcrf import CRF

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import model_selection

from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [2]:
class config:
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 64
    VALID_BATCH_SIZE = 64
    EPOCHS = 10
    BASE_MODEL_PATH = "bert-base-uncased"
    MODEL_PATH = "model.bin"
    TRAINING_FILE = "/content/drive/MyDrive/NLP/ner_dataset.csv"
    TOKENIZER = transformers.BertTokenizer.from_pretrained(
        BASE_MODEL_PATH,
        do_lower_case=True
    )

def process_data(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")

    enc_tag = preprocessing.LabelEncoder()
    
    # fit_transform : a function that transforms the str labels into int labels
    df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"]) # and return the int 

    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tag = df.groupby("Sentence #")["Tag"].apply(list).values
    return sentences, tag, enc_tag

class EntityDataset:
    def __init__(self, texts, tags):
        self.texts = texts
        self.tags = tags
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = self.texts[item]
        tags = self.tags[item]

        ids = []
        target_tag =[]

        for i, s in enumerate(text): 
            inputs = config.TOKENIZER.encode(
                s,
                add_special_tokens=False
            )
            # abhishek: ab ##hi ##sh ##ek
            input_len = len(inputs)
            ids.extend(inputs) # add all the sub-words
            target_tag.extend([tags[i]] * input_len) # all the sub-words from one word are labeled with the same tag

        ids = ids[:config.MAX_LEN - 2] # leave space for [CLS][SEP]
        target_tag = target_tag[:config.MAX_LEN - 2]

        ids = [101] + ids + [102] # 101 -> CLS  102 -> SEP
        target_tag = [0] + target_tag + [0]

        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)

        padding_len = config.MAX_LEN - len(ids)

        ids = ids + ([0] * padding_len) # make all the input the same length
        mask = mask + ([0] * padding_len) # 0->acceptable 1->omitted
        token_type_ids = token_type_ids + ([0] * padding_len)
        target_tag = target_tag + ([0] * padding_len)

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_tag": torch.tensor(target_tag, dtype=torch.long),
        }

In [3]:
sentences, tag, enc_tag = process_data(config.TRAINING_FILE)

meta_data = {"enc_tag": enc_tag}
joblib.dump(meta_data, "meta.bin")

num_tag = len(list(enc_tag.classes_))

(train_sentences,teva_sentences,train_tag,teva_tag) = model_selection.train_test_split(sentences, tag, random_state=42, test_size=0.1)
(test_sentences,valid_sentences,test_tag,valid_tag) = model_selection.train_test_split(teva_sentences, teva_tag, random_state=42, test_size=0.5)

train_dataset = EntityDataset(texts=train_sentences, tags=train_tag)
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE)

valid_dataset = EntityDataset(texts=valid_sentences, tags=valid_tag)
valid_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=config.TRAIN_BATCH_SIZE)

test_dataset = EntityDataset(texts=test_sentences, tags=test_tag)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=config.TRAIN_BATCH_SIZE)

In [3]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    final_loss = 0
    i = 0 
    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        optimizer.zero_grad()
        loss = model.Loss_fn(**data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        final_loss += loss.item()
        
        i += 1
        #if i % 100 == 0:
        #  print("train_loss: {}".format(loss.item()))

    return final_loss / len(data_loader)


def eval_fn(data_loader, model, device):
    model.eval()
    final_loss = 0

    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        loss = model.Loss_fn(**data)
        final_loss += loss.item()

    return final_loss / len(data_loader)

def loss_fn(output, target, mask, num_labels):
    lfn = nn.CrossEntropyLoss()
    active_loss = mask.view(-1) == 1
    active_logits = output.view(-1, num_labels)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(lfn.ignore_index).type_as(target)
    )
    loss = lfn(active_logits, active_labels)
    return loss

In [4]:
class EntityModel(nn.Module):
    def __init__(self, num_tag):
        super(EntityModel, self).__init__()
        self.num_tag = num_tag
        self.bert = transformers.BertModel.from_pretrained(
            config.BASE_MODEL_PATH
        )
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(768, self.num_tag)
        self.lstm = nn.LSTM(768, 768//2, num_layers=2, bidirectional=True, batch_first=True, dropout=0.3)
        self.crf = CRF(self.num_tag, batch_first=True)
    
    def forward(self, ids, mask, token_type_ids, target_tag):
        o1, _ = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        seq_out, _ = self.lstm(o1)
        tag = self.fc(seq_out)
        crf_tag = self.crf.decode(tag, mask.bool())
        return crf_tag

    def Loss_fn(self, ids, mask, token_type_ids, target_tag):
        o1, _ = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        seq_out, _ = self.lstm(o1)
        y_pred = self.fc(seq_out)
        loss = -self.crf.forward(y_pred, target_tag, mask.bool(), reduction='mean')
        return loss

In [6]:
device = torch.device("cuda")
model = EntityModel(num_tag=num_tag)
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(
                nd in n for nd in no_decay
            )
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(
                nd in n for nd in no_decay
            )
        ],
        "weight_decay": 0.0,
    },
]

num_train_steps = int(
    len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS
)
optimizer = AdamW(optimizer_parameters, lr=4e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=num_train_steps
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
best_loss = np.inf
for epoch in range(config.EPOCHS):
    train_loss = train_fn(
        train_data_loader, 
        model, 
        optimizer, 
        device, 
        scheduler
    )
    test_loss = eval_fn(
        valid_data_loader,
        model,
        device
    )
    print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
    if test_loss < best_loss:
        torch.save(model.state_dict(), config.MODEL_PATH)
        best_loss = test_loss

In [6]:
device = torch.device("cuda")
model = EntityModel(num_tag=num_tag)
model.load_state_dict(torch.load('/content/drive/MyDrive/NLP/BertLstmCrf.bin'))
model.to(device)

In [8]:
model.eval()

y_labels = []
y_preds = []

for data in tqdm(test_data_loader, total=len(test_data_loader)):
    for k, v in data.items():
        data[k] = v.to(device)
    
    for i in range(len(data["mask"])):
        mask = data["mask"][i].cpu().numpy()
        target = data["target_tag"][i].cpu().numpy()
        temp = []
        for i in range(len(mask)):  
          if mask[i] == 1:
            temp.append(target[i])
        y_preds.append(temp)
        
    
    y_pred = model(**data)
    for sentence in y_pred:
      y_labels.append(sentence)

y_true, y_pred = [], []
for tags in y_labels:
  for tag in tags:
    y_true.append(tag)

for tags in y_preds:
  for tag in tags:
    y_pred.append(tag)

100%|██████████| 38/38 [00:11<00:00,  3.32it/s]


In [9]:
from sklearn.metrics import classification_report
target_names = list(enc_tag.classes_)
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

       B-art       0.99      1.00      1.00      4816
       B-eve       0.13      0.33      0.19         6
       B-geo       0.91      0.84      0.87      3232
       B-gpe       0.93      0.94      0.94       855
       B-nat       0.08      0.33      0.12         3
       B-org       0.68      0.80      0.74      1469
       B-per       0.84      0.86      0.85      1305
       B-tim       0.86      0.92      0.89      1070
       I-art       0.21      0.40      0.28        10
       I-eve       0.00      0.00      0.00         0
       I-geo       0.82      0.68      0.74       545
       I-gpe       0.25      1.00      0.40         3
       I-nat       0.00      0.00      0.00         0
       I-org       0.67      0.73      0.70      1000
       I-per       0.93      0.87      0.90      1783
       I-tim       0.79      0.85      0.82       311
           O       0.99      0.99      0.99     47688

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
meta_data = joblib.load("/content/drive/MyDrive/NLP/meta.bin")
enc_tag = meta_data["enc_tag"]
num_tag = len(list(enc_tag.classes_))

device = torch.device("cuda")

model = EntityModel(num_tag=17)
model.load_state_dict(torch.load("/content/drive/MyDrive/NLP/BertLstmCrf.bin"))
model.to(device)

In [6]:
def inference(input_sentence, model):
    meta_data = joblib.load("/content/drive/MyDrive/NLP/meta.bin")
    enc_tag = meta_data["enc_tag"]

    #input_sentence = "Mr. Trump’s tweets began just moments after a Fox News report by Mike Tobin, a reporter for the network, about protests in Minnesota and elsewhere."
    tokenized_sentence = config.TOKENIZER.encode(input_sentence)

    input_sentence = input_sentence.split()

    test_dataset = EntityDataset(texts=[input_sentence], tags=[[0]*len(input_sentence)])

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        y_pred = model(**data)[0]
        
        tokenized_sentence = config.TOKENIZER.convert_ids_to_tokens(tokenized_sentence)
        tokenized_ner      = enc_tag.inverse_transform(y_pred)[:len(tokenized_sentence)]

        new_ner = []
        new_token = []

        for i in range(1,len(tokenized_sentence)-1):
          if tokenized_sentence[i].startswith("##"):
            new_token[-1] = new_token[-1] + tokenized_sentence[i][2:]
          else:
            new_token.append(tokenized_sentence[i])
            new_ner.append(tokenized_ner[i])

        for k,v in zip(new_token, new_ner):
          print(v, k)

In [7]:
sentence = """Mr. Trump’s tweets began just moments after a Fox 
            News report by Mike Tobin, a reporter for the network, 
            about protests in Minnesota and elsewhere."""

inference(sentence, model)

B-per mr
B-per .
I-per trump
O ’
O s
O tweets
O began
O just
O moments
O after
O a
B-org fox
I-org news
O report
O by
B-per mike
I-per tobin
O ,
O a
O reporter
O for
O the
O network
O ,
O about
O protests
O in
B-geo minnesota
O and
O elsewhere
O .
