# BiLSTM-CRF with BIO tagging, word embeddings, char-LSTM



In [None]:
!pip install -q transformers torch datasets accelerate scikit-learn

### 🔹 Downloading the Dataset

We begin by downloading the CoNLL-2003 dataset from Kaggle using `kagglehub`.

In [4]:
import kagglehub

# Download latest version
# The value returned by dataset_download is kept in `path`; it is printed below
# and is the source directory that the next cell moves.
path = kagglehub.dataset_download("juliangarratt/conll2003-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/juliangarratt/conll2003-dataset?dataset_version_number=1...


100%|██████████| 960k/960k [00:00<00:00, 104MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/juliangarratt/conll2003-dataset/versions/1





### 🔹 Extracting and Organizing Files

The downloaded files are moved to a specified directory for easy access and further processing.


In [5]:
import os
import shutil

# Target directory that the parsing cells below read from.
DATASET_DIR = "/content/conll2003-dataset"

# shutil.move() places the source *inside* the destination when the destination
# is already an existing directory, so re-running this cell would nest a second
# copy. Only move on the first run to keep the cell idempotent.
if not os.path.isdir(DATASET_DIR):
    shutil.move(path, DATASET_DIR)

DATASET_DIR

'/content/conll2003-dataset'

## Data Preprocessing


### 🔹 Reading the Dataset

A custom parser function reads `.txt` files and extracts sentences and their corresponding NER labels in a structured format.

In [7]:
def read_conll2003(file_path):
    """Parse a CoNLL-2003 style file into parallel token and tag lists.

    Each non-blank line is split on whitespace; the first column is taken as
    the token and the last column as its NER tag. Blank (or whitespace-only)
    lines and lines starting with ``-DOCSTART`` close the current sentence.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL-formatted text file.

    Returns:
        A ``(sentences, labels)`` tuple where ``sentences[i]`` is a list of
        tokens and ``labels[i]`` is the list of tags aligned with it.
    """
    sentences, labels = [], []
    words, tags = [], []

    with open(file_path, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            # A line with no fields is a separator even if it contains stray
            # spaces/tabs; indexing into it would otherwise raise IndexError.
            if not fields or line.startswith("-DOCSTART"):
                if words:
                    sentences.append(words)
                    labels.append(tags)
                    words, tags = [], []
                continue
            words.append(fields[0])
            tags.append(fields[-1])  # Use NER tag (last column)

    # Flush the final sentence when the file does not end with a blank line.
    if words:
        sentences.append(words)
        labels.append(tags)
    return sentences, labels

# testa.txt is loaded as the validation split and testb.txt as the test split.
train_sents, train_labels = read_conll2003("/content/conll2003-dataset/conll2003/train.txt")
val_sents, val_labels = read_conll2003("/content/conll2003-dataset/conll2003/testa.txt")
test_sents, test_labels = read_conll2003("/content/conll2003-dataset/conll2003/testb.txt")

# Show the first training sentence as (token, tag) pairs.
print(f"Example:\n{list(zip(train_sents[0], train_labels[0]))}")


Example:
[('EU', 'B-ORG'), ('rejects', 'O'), ('German', 'B-MISC'), ('call', 'O'), ('to', 'O'), ('boycott', 'O'), ('British', 'B-MISC'), ('lamb', 'O'), ('.', 'O')]


In [8]:
# Number of sentences in the train / validation / test splits.
len(train_sents), len(val_sents), len(test_sents)

(14041, 3250, 3453)

## Vocabulary Construction

### 🔹 Word & Tag Vocabulary

We construct `word2idx` and `tag2idx` mappings to convert text and labels into numerical format suitable for model training.

In [9]:
from collections import Counter
from itertools import chain

PAD = "<PAD>"  # reserved padding token, always placed at index 0
UNK = "<UNK>"  # reserved out-of-vocabulary token, always placed at index 1

def build_vocab(sentences, min_freq=1):
    """Build the word <-> index mappings from tokenized sentences.

    Args:
        sentences: Iterable of token lists.
        min_freq: Minimum number of occurrences a word needs to be kept.

    Returns:
        ``(word2idx, idx2word)`` where ``idx2word`` is a list starting with
        ``PAD`` and ``UNK`` followed by the kept words in sorted order, and
        ``word2idx`` is the inverse dictionary.
    """
    word_counter = Counter(chain.from_iterable(sentences))
    # Skip the reserved tokens: if the data literally contained "<PAD>" or
    # "<UNK>", listing them again would remap them away from indices 0 / 1.
    vocab = {
        word
        for word, freq in word_counter.items()
        if freq >= min_freq and word not in (PAD, UNK)
    }
    idx2word = [PAD, UNK] + sorted(vocab)
    word2idx = {w: i for i, w in enumerate(idx2word)}
    return word2idx, idx2word

def build_tag_map(labels):
    """Assign a contiguous integer id to every distinct tag.

    Args:
        labels: Iterable of tag sequences.

    Returns:
        ``(tag2idx, idx2tag)`` dictionaries; ids follow the sorted order of
        the distinct tags.
    """
    distinct_tags = set()
    for seq in labels:
        distinct_tags.update(seq)

    tag2idx, idx2tag = {}, {}
    for position, tag in enumerate(sorted(distinct_tags)):
        tag2idx[tag] = position
        idx2tag[position] = tag
    return tag2idx, idx2tag

# Both mappings are built from the training split only.
word2idx, idx2word = build_vocab(train_sents)
tag2idx, idx2tag = build_tag_map(train_labels)

### 🔹 Character-Level Vocabulary

Character-level vocabularies are built to support char-level embeddings, enabling better handling of unknown or rare words.

In [10]:
def build_char_vocab(sentences):
    """Build the character <-> index mappings from tokenized sentences.

    Args:
        sentences: Iterable of token lists.

    Returns:
        ``(char2idx, idx2char)`` where ``idx2char`` is a list starting with
        ``PAD`` and ``UNK`` followed by every observed character in sorted
        order, and ``char2idx`` is the inverse dictionary.
    """
    seen = set()
    for word in chain(*sentences):
        seen.update(word)

    idx2char = [PAD, UNK]
    idx2char.extend(sorted(seen))

    char2idx = {}
    for position, symbol in enumerate(idx2char):
        char2idx[symbol] = position
    return char2idx, idx2char

# The character vocabulary is built from the training split only.
char2idx, idx2char = build_char_vocab(train_sents)

## Data Encoding

### 🔹 Encoding Words and Characters

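The already-tokenized sentences are converted into word and character indices using the vocabularies built above; words or characters missing from a vocabulary map to `<UNK>`. Each word's character sequence is padded with `<PAD>` or truncated to a fixed length (`max_word_len`, 15 by default).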

In [None]:
def encode_words(sentences, word2idx):
    """Map every token to its vocabulary id.

    Tokens that are not keys of ``word2idx`` are mapped to the id of ``UNK``.

    Args:
        sentences: Iterable of token lists.
        word2idx: Token -> id dictionary containing an ``UNK`` entry.

    Returns:
        A list of id lists, one per input sentence.
    """
    encoded = []
    for sent in sentences:
        ids = []
        for token in sent:
            ids.append(word2idx.get(token, word2idx[UNK]))
        encoded.append(ids)
    return encoded

def encode_chars(sentences, char2idx, max_word_len=15):
    """Map every token to a fixed-length list of character ids.

    Characters missing from ``char2idx`` map to the id of ``UNK``. Words longer
    than ``max_word_len`` are truncated; shorter ones are right-padded with the
    id of ``PAD``.

    Args:
        sentences: Iterable of token lists.
        char2idx: Character -> id dictionary containing ``PAD`` and ``UNK``.
        max_word_len: Number of character ids emitted per word.

    Returns:
        A nested list indexed as ``[sentence][word][char_position]``.
    """
    def _encode_word(word):
        ids = [char2idx.get(c, char2idx[UNK]) for c in word]
        # A negative pad count yields an empty list, so over-long words are
        # simply truncated by the slice.
        missing = max_word_len - len(ids)
        return ids[:max_word_len] + [char2idx[PAD]] * missing

    return [[_encode_word(word) for word in sent] for sent in sentences]

### 🔹 Encoding Labels

NER tags are encoded into integer labels using the tag-to-index map.

In [None]:
def encode_labels(labels, tag2idx):
    """Map every tag to its integer id.

    Args:
        labels: Iterable of tag sequences.
        tag2idx: Tag -> id dictionary.

    Returns:
        A list of id lists, one per input sequence.

    Raises:
        KeyError: If a tag is not a key of ``tag2idx``.
    """
    encoded = []
    for seq in labels:
        ids = []
        for tag in seq:
            ids.append(tag2idx[tag])
        encoded.append(ids)
    return encoded

In [12]:
# Sanity check: encoded ids next to the original tokens for the first two sentences.
encode_words(train_sents[:2], word2idx), train_sents[:2]

([[6420, 20821, 7229, 14822, 22700, 14673, 5084, 18390, 125], [10721, 4911]],
 [['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
  ['Peter', 'Blackburn']])

## Dataset and DataLoader

### 🔹 Creating a PyTorch Dataset Class

A custom `NERDataset` class is defined to serve word, character, and label sequences in a format suitable for training.

In [None]:
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

class NERDataset(Dataset):
    """Dataset that serves encoded words, characters and labels per sentence.

    All sentences are encoded once in the constructor; ``__getitem__`` only
    wraps the pre-computed id lists in ``torch.long`` tensors.
    """

    def __init__(self, sentences, labels, word2idx, char2idx, tag2idx, max_word_len=15):
        """Encode ``sentences`` and ``labels`` with the given vocabularies.

        Args:
            sentences: Iterable of token lists.
            labels: Iterable of tag sequences aligned with ``sentences``.
            word2idx: Token -> id dictionary.
            char2idx: Character -> id dictionary.
            tag2idx: Tag -> id dictionary.
            max_word_len: Fixed number of character ids per word.
        """
        self.word_ids = encode_words(sentences, word2idx)
        self.char_ids = encode_chars(sentences, char2idx, max_word_len)
        self.tag_ids = encode_labels(labels, tag2idx)

    def __len__(self):
        """Return the number of encoded sentences."""
        return len(self.word_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th sentence as a dict of ``torch.long`` tensors.

        The dict has the keys ``"words"``, ``"chars"`` and ``"labels"``.
        """
        fields = (
            ("words", self.word_ids),
            ("chars", self.char_ids),
            ("labels", self.tag_ids),
        )
        sample = {}
        for key, store in fields:
            sample[key] = torch.tensor(store[idx], dtype=torch.long)
        return sample


### 🔹 Collate Function

A `collate_fn` is defined to pad sequences properly for batch processing.

In [None]:
def collate_fn(batch):
    """Pad a list of dataset samples into batch tensors.

    Args:
        batch: List of dicts with ``"words"``, ``"chars"`` and ``"labels"``
            tensors, as produced by ``NERDataset.__getitem__``.

    Returns:
        ``(words_padded, padded_chars, labels_padded)``. Word and label
        sequences are right-padded with 0 to the longest sentence in the
        batch; the character tensor is zero-filled to the same sentence length.
    """
    words, chars, labels = [], [], []
    for item in batch:
        words.append(item["words"])
        chars.append(item["chars"])
        labels.append(item["labels"])

    # pad_sequence uses 0 as its default padding value.
    words_padded = pad_sequence(words, batch_first=True)
    labels_padded = pad_sequence(labels, batch_first=True)

    # The per-word character length is read from the first sample.
    longest_sent = max(c.shape[0] for c in chars)
    word_len = chars[0].shape[1]
    padded_chars = torch.zeros((len(chars), longest_sent, word_len), dtype=torch.long)
    for row, char_seq in zip(padded_chars, chars):
        # `row` is a view into padded_chars, so this writes in place.
        row[:char_seq.shape[0]] = char_seq

    return words_padded, padded_chars, labels_padded

## Model Architecture

### 🔹 BiLSTM-CRF Overview

We design a model combining word embeddings, character-level BiLSTM, word-level BiLSTM, and a CRF layer for structured prediction.

### 🔹 Word and Character Embeddings

Separate embedding layers are used for words and characters to capture both lexical and sub-word features.

### 🔹 BiLSTM Layers

The concatenated embeddings are passed through a bidirectional LSTM to learn contextual representations.

### 🔹 CRF Layer

A CRF layer is added on top to model tag dependencies and improve sequence labeling performance.

In [14]:
!pip install -q pytorch-crf

In [15]:
import torch.nn as nn
from torchcrf import CRF  # pip install pytorch-crf

class BiLSTM_CRF(nn.Module):
    """Sequence tagger: word embeddings + char BiLSTM -> word BiLSTM -> CRF.

    Each word is represented by its word embedding concatenated with the final
    hidden states of a bidirectional LSTM run over its characters. A second
    bidirectional LSTM runs over the sentence, a linear layer produces per-tag
    emission scores, and a CRF layer scores or decodes tag sequences.
    """

    def __init__(self, vocab_size, char_vocab_size, tagset_size,
                 word_emb_dim=100, char_emb_dim=30, char_hidden_dim=50,
                 word_hidden_dim=200, char_max_len=15):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the word embedding table.
            char_vocab_size: Number of rows in the character embedding table.
            tagset_size: Number of output tags.
            word_emb_dim: Word embedding size.
            char_emb_dim: Character embedding size.
            char_hidden_dim: Hidden size per direction of the character LSTM.
            word_hidden_dim: Hidden size per direction of the sentence LSTM.
            char_max_len: Accepted for interface compatibility; not used by
                any layer (the word length is read from the input in forward).
        """
        super().__init__()

        # Index 0 is the padding row for both embedding tables.
        self.word_emb = nn.Embedding(vocab_size, word_emb_dim, padding_idx=0)
        self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)

        # Character-level encoder.
        self.char_lstm = nn.LSTM(char_emb_dim, char_hidden_dim,
                                 batch_first=True, bidirectional=True)

        # Sentence-level encoder over [word embedding ; char representation].
        sentence_input_dim = word_emb_dim + 2 * char_hidden_dim
        self.lstm = nn.LSTM(sentence_input_dim, word_hidden_dim,
                            batch_first=True, bidirectional=True)

        # Projection of the sentence encoder output to per-tag scores.
        self.hidden2tag = nn.Linear(2 * word_hidden_dim, tagset_size)

        # Structured output layer.
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, words, chars, labels=None, mask=None):
        """Compute the training loss or decode tag sequences.

        Args:
            words: Word ids, unpacked as ``[batch, sent_len]``.
            chars: Character ids, unpacked as ``[batch, sent_len, word_len]``.
            labels: Optional gold tag ids; when given, the loss is returned.
            mask: Optional mask forwarded unchanged to the CRF layer.

        Returns:
            The negated CRF score (``reduction='mean'``) when ``labels`` is
            given, otherwise the result of ``self.crf.decode``.
        """
        batch_size, sent_len = words.shape
        _, _, word_len = chars.shape

        # Run the character LSTM over every word of every sentence at once.
        flat_chars = chars.view(-1, word_len)                      # [B*T, W]
        _, (final_hidden, _) = self.char_lstm(self.char_emb(flat_chars))
        # final_hidden holds one final state per direction: [2, B*T, H].
        per_word = torch.cat((final_hidden[0], final_hidden[1]), dim=-1)
        char_repr = per_word.view(batch_size, sent_len, -1)        # [B, T, 2H]

        # Join word-level and character-level features.
        combined = torch.cat((self.word_emb(words), char_repr), dim=-1)

        lstm_out, _ = self.lstm(combined)
        emissions = self.hidden2tag(lstm_out)                      # [B, T, num_tags]

        if labels is None:
            return self.crf.decode(emissions, mask=mask)
        return -self.crf(emissions, labels, mask=mask, reduction='mean')


In [16]:
def create_mask(words, pad_idx=0):
    """Return a boolean mask marking the non-padding positions of ``words``.

    A ``bool`` mask is returned instead of ``uint8`` because byte tensors are
    deprecated as ``torch.where`` conditions; summing the mask still yields
    the number of real tokens.

    Args:
        words: Tensor of word ids, e.g. ``[batch, sent_len]``.
        pad_idx: Id that marks padding positions.

    Returns:
        A ``torch.bool`` tensor of the same shape as ``words`` that is ``True``
        wherever ``words != pad_idx``.
    """
    return words != pad_idx  # [B, T]

In [17]:
!pip install -q seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


### 🔹 Evaluation Metrics

We use `seqeval` to compute the F1 score, the main metric for evaluating NER performance. A full per-entity classification report is also available from `seqeval`, but that line is commented out in the code below.


In [24]:
from seqeval.metrics import classification_report, f1_score, accuracy_score

def compute_metrics(true_tags, pred_tags):
    """Print and return the F1 score of the predictions.

    Args:
        true_tags: List of gold tag-string sequences.
        pred_tags: List of predicted tag-string sequences aligned with
            ``true_tags``.

    Returns:
        The value computed by seqeval's ``f1_score``, so callers can track it
        (e.g. for model selection) instead of only reading the printed line.
    """
    # For a per-entity breakdown, print(classification_report(true_tags, pred_tags)).
    score = f1_score(true_tags, pred_tags)
    print(f"F1 Score: {score:.4f}")
    return score

## Training the Model


In [25]:
from tqdm import tqdm
import torch.optim as optim
from torch.utils.data import DataLoader

def train_model(model, train_ds, val_ds, tag2idx, idx2tag, epochs=3, batch_size=32, lr=1e-3):
    """Train ``model`` with Adam and evaluate on ``val_ds`` after every epoch.

    Args:
        model: Module returning a scalar loss when called with
            ``(words, chars, labels, mask=mask)``.
        train_ds: Training dataset (shuffled each epoch).
        val_ds: Validation dataset (not shuffled).
        tag2idx: Not used by this function; kept in the signature.
        idx2tag: Id -> tag mapping passed on to ``evaluate_model``.
        epochs: Number of passes over ``train_ds``.
        batch_size: Batch size for both loaders.
        lr: Adam learning rate.
    """
    shared_loader_args = dict(batch_size=batch_size, collate_fn=collate_fn)
    train_loader = DataLoader(train_ds, shuffle=True, **shared_loader_args)
    val_loader = DataLoader(val_ds, shuffle=False, **shared_loader_args)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(1, epochs + 1):
        model.train()  # evaluate_model switches to eval mode, so reset each epoch
        total_loss = 0.0
        progress = tqdm(train_loader, desc=f"Epoch {epoch} - Training", leave=False)

        for words, chars, labels in progress:
            # Build the mask from the word ids before moving the batch.
            mask = create_mask(words).to(device)
            words = words.to(device)
            chars = chars.to(device)
            labels = labels.to(device)

            loss = model(words, chars, labels, mask=mask)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            progress.set_postfix(loss=batch_loss)

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch}: Train Loss = {avg_loss:.4f}")
        evaluate_model(model, val_loader, idx2tag, device)

def evaluate_model(model, loader, idx2tag, device):
    """Decode every batch of ``loader`` and report metrics.

    Predictions and gold labels are converted to tag strings, truncated to the
    unpadded sentence length given by the mask, and handed to
    ``compute_metrics``.

    Args:
        model: Module returning decoded tag-id lists when called with
            ``(words, chars, mask=mask)``.
        loader: Iterable yielding ``(words, chars, labels)`` batches.
        idx2tag: Id -> tag mapping.
        device: Device the inputs are moved to.
    """
    model.eval()
    true_tags, pred_tags = [], []

    eval_iter = tqdm(loader, desc="Evaluating", leave=False)

    with torch.no_grad():
        for words, chars, labels in eval_iter:
            mask = create_mask(words).to(device)
            preds = model(words.to(device), chars.to(device), mask=mask)

            # Number of real (non-padding) tokens in each sentence.
            real_lengths = mask.sum(dim=1).tolist()
            for pred_ids, gold_ids, real_len in zip(preds, labels, real_lengths):
                pred_tags.append([idx2tag[idx] for idx in pred_ids[:real_len]])
                true_tags.append([idx2tag[idx] for idx in gold_ids[:real_len].tolist()])

    compute_metrics(true_tags, pred_tags)


## Inference

### 🔹 Making Predictions

A utility function allows inference on new sentences, predicting NER tags using the trained model.

In [26]:
def infer(model, sentence, word2idx, char2idx, idx2tag, device, max_word_len=15):
    """Tag a whitespace-tokenized sentence with the trained model.

    Args:
        model: Module returning decoded tag-id lists when called with
            ``(words, chars, mask=mask)``.
        sentence: Raw text; tokens are obtained with ``str.split()``.
        word2idx: Token -> id dictionary containing an ``UNK`` entry.
        char2idx: Character -> id dictionary containing ``PAD`` and ``UNK``.
        idx2tag: Id -> tag mapping.
        device: Device the inputs are moved to.
        max_word_len: Fixed number of character ids per word; should match the
            value used when encoding the training data (15 by default).

    Returns:
        A list of ``(token, tag)`` pairs; empty when ``sentence`` contains no
        tokens.
    """
    model.eval()
    tokens = sentence.split()
    if not tokens:
        # An empty id list would produce a float tensor that the embedding
        # layers cannot index, so there is nothing to tag.
        return []

    unk_word, unk_char, pad_char = word2idx[UNK], char2idx[UNK], char2idx[PAD]

    word_ids = torch.tensor([[word2idx.get(w, unk_word) for w in tokens]], dtype=torch.long)

    char_rows = []
    for w in tokens:
        ids = [char2idx.get(c, unk_char) for c in w[:max_word_len]]
        ids.extend([pad_char] * (max_word_len - len(ids)))  # right-pad short words
        char_rows.append(ids)
    char_ids = torch.tensor([char_rows], dtype=torch.long)

    mask = create_mask(word_ids).to(device)
    word_ids, char_ids = word_ids.to(device), char_ids.to(device)

    # No gradients are needed for decoding.
    with torch.no_grad():
        preds = model(word_ids, char_ids, mask=mask)[0]

    return list(zip(tokens, [idx2tag[i] for i in preds]))


In [27]:
# All three splits are encoded with the same word/char/tag mappings.
train_ds = NERDataset(train_sents, train_labels, word2idx, char2idx, tag2idx)
val_ds   = NERDataset(val_sents, val_labels, word2idx, char2idx, tag2idx)
test_ds = NERDataset(test_sents, test_labels, word2idx, char2idx, tag2idx)

In [None]:
# Table sizes come from the vocabularies; the remaining hyperparameters repeat
# the constructor defaults explicitly.
model = BiLSTM_CRF(
    vocab_size=len(word2idx),
    char_vocab_size=len(char2idx),
    tagset_size=len(tag2idx),
    word_emb_dim=100,
    char_emb_dim=30,
    char_hidden_dim=50,
    word_hidden_dim=200,
    char_max_len=15
)

In [None]:
# Train for 10 epochs (the function default is 3); validation F1 is printed
# after every epoch.
train_model(
    model=model,
    train_ds=train_ds,
    val_ds=val_ds,
    tag2idx=tag2idx,
    idx2tag=idx2tag,
    epochs=10,
    batch_size=32,
    lr=1e-3
)



Epoch 1: Train Loss = 4.8780




F1 Score: 0.6871




Epoch 2: Train Loss = 1.5689




F1 Score: 0.7677




Epoch 3: Train Loss = 0.8658




F1 Score: 0.7849




Epoch 4: Train Loss = 0.4519




F1 Score: 0.8163




Epoch 5: Train Loss = 0.2200




F1 Score: 0.8273




Epoch 6: Train Loss = 0.0985




F1 Score: 0.8206




Epoch 7: Train Loss = 0.0404




F1 Score: 0.8340




Epoch 8: Train Loss = 0.0215




F1 Score: 0.8347




Epoch 9: Train Loss = 0.0129




F1 Score: 0.8323




Epoch 10: Train Loss = 0.0079


                                                            

F1 Score: 0.8251




### 🔹 Inference Example

A sample sentence is passed through the model to visualize its NER tagging capability.


In [31]:
import torch

# Define the device in the global scope
# (train_model only creates its device as a local variable).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# infer() splits the sentence on whitespace, so punctuation is written as its own token.
sentence = "Barack Obama was born in Hawaii ."
predicted_tags = infer(model, sentence, word2idx, char2idx, idx2tag, device)
print(predicted_tags)

[('Barack', 'O'), ('Obama', 'I-PER'), ('was', 'O'), ('born', 'O'), ('in', 'O'), ('Hawaii', 'B-LOC'), ('.', 'O')]
