## Installations

In [1]:
!pip install datasets
# Torch is already installed in colab, but needs to be installed separately otherwise



## Imports

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW, get_scheduler
from datasets import load_dataset
import pandas as pd
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm
from sklearn.metrics import classification_report

## Download Dataset from Hugging face

In [3]:
# Load CoNLL-2003 dataset from Hugging Face
dataset = load_dataset("conll2003")

# Display dataset structure
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


## Visualise a sample input/output in the data

In [4]:
import pandas as pd

# Load the label mapping from the dataset
ner_label_names = dataset["train"].features["ner_tags"].feature.names

# Function to display a sample sentence with labels
def show_sample(dataset, index=0):
    tokens = dataset["train"][index]["tokens"]
    ner_tags = dataset["train"][index]["ner_tags"]
    labels = [ner_label_names[tag] for tag in ner_tags] # Convert indices to corresponding names

    df = pd.DataFrame({"Token": tokens, "NER Label": labels})
    return df

df_sample = show_sample(dataset, index=0)

In [5]:
df_sample

Unnamed: 0,Token,NER Label
0,EU,B-ORG
1,rejects,O
2,German,B-MISC
3,call,O
4,to,O
5,boycott,O
6,British,B-MISC
7,lamb,O
8,.,O


## Dataset Preparation

### Tokenization

#### Tokenize an example for visualization

In [6]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Example sentence
example_text = dataset["train"][0]["tokens"]
example_labels = dataset["train"][0]["ner_tags"]

# Tokenize the sentence
tokens = tokenizer(example_text, is_split_into_words=True, truncation=True)

# Display tokenized output
print("Original Tokens:", example_text)
print("Tokenized:", tokenizer.convert_ids_to_tokens(tokens["input_ids"]))


Original Tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
Tokenized: ['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']


#### Logic to align correspodning NER tags
- This is needed since during tokenization some words might get split into sub words dure to is_split_into_words flag being True

In [7]:
def align_labels_with_tokens(labels, word_ids):
    """
    Aligns the NER labels with tokenized word pieces, assigning the same label to all subwords.
    """
    aligned_labels = []

    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)  # Ignore special tokens, like start and end
        else:
            aligned_labels.append(labels[word_idx])  # Assign the label to all tokens/subwords

    return aligned_labels


# # Tokenize sample sentence and get word_ids
tokens = tokenizer(example_text, is_split_into_words=True, truncation=True)
word_ids = tokens.word_ids() # These correspond to the original sequence in which your input seq was present. Note words that get split will have the same word id.

# Align labels with tokens
aligned_labels = align_labels_with_tokens(example_labels, word_ids)

# Display results
df_aligned = pd.DataFrame({
    "Tokenized Word": tokenizer.convert_ids_to_tokens(tokens["input_ids"]),
    "Aligned Label": [ner_label_names[l] if l != -100 else "IGNORED" for l in aligned_labels]
})

In [8]:
df_aligned

Unnamed: 0,Tokenized Word,Aligned Label
0,[CLS],IGNORED
1,EU,B-ORG
2,rejects,O
3,German,B-MISC
4,call,O
5,to,O
6,boycott,O
7,British,B-MISC
8,la,O
9,##mb,O


#### Main function that will perform the tokenization and ner tags alignment

In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],  # Input tokens
        truncation=True,
        padding="max_length",  # Ensure all sequences are padded to the same length
        max_length=128,  # Limit sequence length
        is_split_into_words=True,
    )

    # Align labels for each example
    aligned_labels = []
    for idx in range(len(examples["tokens"])):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        labels = examples["ner_tags"][idx]
        aligned_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs


#### Tokenize the train, val and test datasets

In [10]:
# Apply tokenization and label alignment to the dataset
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Keep only necessary columns
tokenized_datasets = tokenized_datasets.remove_columns(
    ["tokens", "pos_tags", "chunk_tags", "id", "ner_tags"]
)

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

#### Torchify the tokenised data

In [11]:
# Convert to PyTorch datasets
train_dataset = tokenized_datasets["train"].with_format("torch")
val_dataset = tokenized_datasets["validation"].with_format("torch")
test_dataset = tokenized_datasets["test"].with_format("torch")

In [12]:
train_dataset[0]['labels']

tensor([-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100])

In [13]:
train_dataset[0]['input_ids']

tensor([  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
          119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [14]:
train_dataset[0]['attention_mask']

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

#### Create Data Loaders for fetching a batch of data

In [15]:
# Use DataLoader without additional collation
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=None)
val_loader = DataLoader(val_dataset, batch_size=16, collate_fn=None)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=None)


In [16]:
# Fetch one batch from the training DataLoader
batch = next(iter(train_loader))

# Display the keys and shapes of the batch
for key, value in batch.items():
    print(f"{key}: {value.shape}")


input_ids: torch.Size([16, 128])
token_type_ids: torch.Size([16, 128])
attention_mask: torch.Size([16, 128])
labels: torch.Size([16, 128])


## Model Hyperparameters

In [24]:
num_epochs = 4
lr = 5e-5
num_labels = len(dataset["train"].features["ner_tags"].feature.names)  # Number of unique NER labels

## Load pre-trained BERT model for Token level classification

In [25]:
# Load pretrained BERT model for token classification
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",  # Pretrained BERT model
    num_labels=num_labels
)

# Move model to GPU (if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [26]:
params = sum(param.numel() for param in model.parameters())

In [27]:
params

107726601

## Define Optimization and LR Scheduler

In [28]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define scheduler
num_training_steps = len(train_loader) * num_epochs  # Assuming 3 epochs
lr_scheduler = get_scheduler(
    "linear",  # Linear decay
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)




## Function to evaluate model performance on val/test set

In [29]:
# Function for model evaluation and visualization
def evaluate(model, val_loader, label_names, generate_report= False):
    model.eval()  # Set model to evaluation mode
    all_predictions = []
    all_labels = []
    total_loss = 0

    with torch.no_grad():  # Disable gradient computation
        for batch in tqdm(val_loader, desc="Evaluating"):
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}

            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            logits = outputs.logits  # Raw model outputs
            predictions = torch.argmax(logits, dim=-1)  # Predicted labels

            # Collect predictions and labels
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())

    # Convert predictions and labels to a single flat list for evaluation
    all_predictions = [p for batch in all_predictions for p in batch]
    all_labels = [l for batch in all_labels for l in batch]

    # Print epoch summary
    print(f"Average Loss: {total_loss / len(val_loader):.4f}")

    if generate_report:
      all_labels = [l if l != -100 else 0 for l in all_labels]
      # Print classification report
      print("\nClassification Report:")
      print(classification_report(all_labels, all_predictions, target_names=label_names))

## Main Training Loop

In [30]:
# Training loop with validation
for epoch in range(num_epochs):
    # Training phase
    model.train()  # Set model to training mode
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    total_loss = 0

    for batch in progress_bar:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar
        progress_bar.set_postfix({"Loss": loss.item()})

    # Adjust learning rate
    lr_scheduler.step()

    # Print epoch summary
    print(f"Epoch {epoch + 1} completed. Average Loss: {total_loss / len(train_loader):.4f}")

    # Validation phase
    print("\nStarting Validation...")
    label_names = dataset["train"].features["ner_tags"].feature.names
    evaluate(model, val_loader, label_names)

Epoch 1: 100%|██████████| 878/878 [04:56<00:00,  2.96it/s, Loss=0.226]


Epoch 1 completed. Average Loss: 0.1412

Starting Validation...


Evaluating: 100%|██████████| 204/204 [00:23<00:00,  8.52it/s]


Average Loss: 0.0818


Epoch 2: 100%|██████████| 878/878 [04:56<00:00,  2.96it/s, Loss=0.0944]


Epoch 2 completed. Average Loss: 0.0467

Starting Validation...


Evaluating: 100%|██████████| 204/204 [00:23<00:00,  8.54it/s]


Average Loss: 0.0648


Epoch 3: 100%|██████████| 878/878 [04:56<00:00,  2.96it/s, Loss=0.0063]


Epoch 3 completed. Average Loss: 0.0281

Starting Validation...


Evaluating: 100%|██████████| 204/204 [00:23<00:00,  8.63it/s]


Average Loss: 0.0770


Epoch 4: 100%|██████████| 878/878 [04:56<00:00,  2.96it/s, Loss=0.00921]


Epoch 4 completed. Average Loss: 0.0224

Starting Validation...


Evaluating: 100%|██████████| 204/204 [00:23<00:00,  8.56it/s]


Average Loss: 0.0791


## Generate classification report on test set

In [31]:
evaluate(model, test_loader, label_names, generate_report=True)

Evaluating: 100%|██████████| 216/216 [00:25<00:00,  8.52it/s]


Average Loss: 0.1826

Classification Report:
              precision    recall  f1-score   support

           O       1.00      0.80      0.89    426460
       B-PER       0.17      0.94      0.29      2986
       I-PER       0.19      1.00      0.31      2704
       B-ORG       0.10      0.92      0.17      3524
       I-ORG       0.12      0.90      0.22      1309
       B-LOC       0.15      0.92      0.26      2998
       I-LOC       0.12      0.89      0.22       415
      B-MISC       0.24      0.68      0.35      1266
      I-MISC       0.11      0.58      0.18       322

    accuracy                           0.80    441984
   macro avg       0.24      0.85      0.32    441984
weighted avg       0.97      0.80      0.87    441984



## Save the fine-tuned BERT model

In [32]:
model.save_pretrained("/content/fine_tuned_bert_ner_v2")
tokenizer.save_pretrained("/content/fine_tuned_bert_ner_v2")

('/content/fine_tuned_bert_ner_v2/tokenizer_config.json',
 '/content/fine_tuned_bert_ner_v2/special_tokens_map.json',
 '/content/fine_tuned_bert_ner_v2/vocab.txt',
 '/content/fine_tuned_bert_ner_v2/added_tokens.json',
 '/content/fine_tuned_bert_ner_v2/tokenizer.json')

## Inference

In [33]:
def perform_ner_inference(sentences, label_names, model_path="./fine_tuned_bert_ner_v2", max_length=128, device=None):
    """
    Perform NER inference on a sentence or a list of sentences.
    """
    # Ensure sentences is a list
    if isinstance(sentences, str):
        sentences = [sentences]

    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path)

    # Move the model to the appropriate device
    if device is None:
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()  # Set the model to evaluation mode

    results = []

    for sentence in sentences:
        # Tokenize the sentence
        inputs = tokenizer(
            sentence,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )
        inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to device

        # Perform inference
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)  # Get predicted labels

        # Convert input IDs back to tokens
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

        # Map predictions to NER tags
        predicted_labels = [label_names[label] for label in predictions[0].cpu().numpy()]

        # Filter out special tokens
        filtered_tokens = []
        filtered_predictions = []
        for token, prediction in zip(tokens, predicted_labels):
            if token not in ["[CLS]", "[SEP]", "[PAD]"]:
                filtered_tokens.append(token)
                filtered_predictions.append(prediction)

        # Create a mapping of tokens to their predicted labels
        result = {token: label for token, label in zip(filtered_tokens, filtered_predictions)}
        results.append(result)

    return results


In [34]:
sentences = [
    "My name is Jay Lodha and I'm from Nagpur"
]

# Perform inference
outputs = perform_ner_inference(sentences, label_names)

In [35]:
outputs

[{'My': 'O',
  'name': 'O',
  'is': 'O',
  'Jay': 'B-PER',
  'Lo': 'I-PER',
  '##dha': 'I-PER',
  'and': 'O',
  'I': 'O',
  "'": 'O',
  'm': 'O',
  'from': 'O',
  'Na': 'B-LOC',
  '##g': 'B-LOC',
  '##pur': 'B-LOC'}]