<a href="https://colab.research.google.com/github/Jay0073/Transformer-Based-NLP/blob/main/NER-with-PretrainedModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Named Entity Recognition with pretrained model


In [65]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from datasets import ClassLabel, Sequence

In [17]:
# Load text and labels from separate files
def load_data(text_file, labels_file):
    with open(text_file, 'r') as f:
        sentences = f.readlines()

    with open(labels_file, 'r') as f:
        label_lines = f.readlines()

    # Create aligned lists
    sentences = [s.strip().split() for s in sentences]  # Split words in each sentence
    labels = [l.strip().split() for l in label_lines]  # Split labels for each sentence

    return sentences, labels

In [18]:
# Load your data
sentences, labels = load_data('Data/Ner Dataset/text_train.txt', 'Data/Ner Dataset/labels_train.txt')

In [19]:
sentences[1]

['Despite',
 'Bangladesh',
 "'s",
 'highest',
 'total',
 'ever',
 'in',
 'a',
 'limited-overs',
 'match',
 ',',
 'the',
 'Kiwis',
 'were',
 'able',
 'to',
 'win',
 'the',
 'match',
 'by',
 'six',
 'wickets',
 'in',
 'Auckland',
 '.']

In [38]:
# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

In [39]:
tokenizer

BertTokenizerFast(name_or_path='dbmdz/bert-large-cased-finetuned-conll03-english', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [40]:
# Tokenize and align labels
def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = []
    aligned_labels = []
    for sentence, label in zip(sentences, labels):
        encodings = tokenizer(sentence, truncation=True, padding='max_length', is_split_into_words=True, return_tensors='pt')
        # Align labels with tokenized input
        label_ids = [-100] * len(encodings["input_ids"][0])  # Default labels to -100
        for j in range(len(label)):
            if j < len(encodings["input_ids"][0]) - 2:  # Adjust for [CLS] and [SEP]
                label_ids[j + 1] = label[j] if label[j] != "O" else -100  # Retain 'O' labels as -100
        tokenized_inputs.append(encodings)
        aligned_labels.append(label_ids)

    return tokenized_inputs, aligned_labels

In [41]:
inputs, labels = tokenize_and_align_labels(sentences, labels)

In [42]:
# Tokenized input
inputs[0]

{'input_ids': tensor([[  101,  1203,  2512,   112,   188,  5428,  1264,  1144,  2297,   170,
         22407,   118, 14112,  1158,  1782,  1166,  6735,  1107,  1103,  1148,
          1104,  1210,  1141,   118,  1285,  1835,  1116,  1107,  1203,  2512,
           119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [43]:
# Split the data into training and testing sets (80% training, 20% testing)
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    inputs,
    labels,
    test_size=0.2,  # 20% for testing
    random_state=12  # For reproducibility
)

In [73]:
class NERDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

        # Get unique labels and map them to IDs
        unique_labels = set([label for sublist in labels for label in sublist])
        self.label2id = {label: i for i, label in enumerate(unique_labels)}
        self.id2label = {i: label for label, i in self.label2id.items()}

    def __getitem__(self, idx):
        # Access individual elements from the encodings dictionary
        item = {
            'input_ids': self.encodings[idx]['input_ids'][0],  # Access the first element of input_ids
            'attention_mask': self.encodings[idx]['attention_mask'][0],  # Access the first element of attention_mask
            # ...add other required inputs in a similar way...
        }
        # Convert labels to numerical IDs using the label2id mapping
        item['labels'] = torch.tensor([self.label2id.get(label, -100) for label in self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

In [74]:
# Create dataset objects
train_dataset = NERDataset(train_inputs, train_labels)
test_dataset = NERDataset(test_inputs, test_labels)

In [75]:
train_dataset

<__main__.NERDataset at 0x7bd478437810>

In [69]:
# Load the BERT model for token classification
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [76]:
# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)



In [77]:
# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [78]:
trainer.train()

IndexError: Target 11 is out of bounds.

In [None]:
trainer.evaluate()