In [None]:
# Install necessary libraries (versions pinned for reproducibility).
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install torch==2.0.0 torchtext==0.15.1


Collecting torch==2.0.0
  Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchtext==0.15.1
  Downloading torchtext-0.15.1-cp310-cp310-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.0)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.0)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.0)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.0)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==2.0.0)
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Co

In [None]:
# Download spacy English model
# The installer output recommends restarting the kernel/runtime afterwards so
# that the freshly installed package's dependencies are picked up.
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Import libraries
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torch.utils.data import Dataset, DataLoader
import spacy
import random
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from collections import Counter


In [None]:
# Pick the compute device: CUDA when a GPU is visible to torch, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [None]:
# Load the spaCy English pipeline downloaded above.
# NOTE(review): `spacy_en` is not referenced by any other cell visible in this
# notebook (tokenization goes through torchtext's `get_tokenizer`) — confirm it
# is still needed.
spacy_en = spacy.load("en_core_web_sm")


In [None]:
# Helper function to parse the dataset
def parse_conll_file(filename):
    """Parse a CoNLL-style column file into space-joined sentences and tag sequences.

    Each non-blank line is split on whitespace; the first field is taken as
    the word and the last field as its NER tag. Blank (or whitespace-only)
    lines mark sentence boundaries. Every non-blank line is treated as a
    token line, including any document-marker lines the file may contain.

    Args:
        filename: Path of the file to read.

    Returns:
        A ``(sentences, tags)`` tuple of equally long lists. ``sentences[i]``
        is the i-th sentence with its words joined by single spaces and
        ``tags[i]`` the matching tags joined the same way.
    """
    sentences = []
    tags = []
    words = []
    word_tags = []

    def flush():
        # Emit the sentence collected so far. Empty sentences (e.g. produced by
        # consecutive blank lines) are skipped: they would become empty CSV
        # cells that cannot be split back into tokens.
        if words:
            sentences.append(" ".join(words))
            tags.append(" ".join(word_tags))
            words.clear()
            word_tags.clear()

    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:  # Sentence boundary (also covers whitespace-only lines)
                flush()
                continue
            words.append(fields[0])       # Word
            word_tags.append(fields[-1])  # NER Tag

    # The file may not end with a blank line; keep the final sentence anyway.
    flush()
    return sentences, tags

# Parse training, validation, and test data.
# In CoNLL-2003, eng.testa is the development (validation) set and eng.testb is
# the held-out test set, so testa feeds validation and testb feeds testing.
train_sentences, train_tags = parse_conll_file("/content/drive/MyDrive/Deep_Learning_Projects/NER/conll2003/eng.train")
val_sentences, val_tags = parse_conll_file("/content/drive/MyDrive/Deep_Learning_Projects/NER/conll2003/eng.testa")
test_sentences, test_tags = parse_conll_file("/content/drive/MyDrive/Deep_Learning_Projects/NER/conll2003/eng.testb")

# Save as CSV files for DataLoader
def save_to_csv(sentences, tags, filename):
    """Write parallel sentence/tag lists to a two-column CSV file.

    Args:
        sentences: Sequence of space-joined sentences.
        tags: Sequence of space-joined tag strings, parallel to ``sentences``.
        filename: Destination CSV path. Missing parent directories are created.

    The file has the header ``sentence,tags`` and no index column.
    """
    # Create the target directory on first use so a fresh run does not fail.
    # A bare filename has an empty dirname, which makedirs would reject.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df = pd.DataFrame({"sentence": sentences, "tags": tags})
    df.to_csv(filename, index=False)

# Persist each split as its own CSV file.
save_to_csv(
    sentences=train_sentences,
    tags=train_tags,
    filename="/content/drive/MyDrive/Deep_Learning_Projects/NER/NER_dataset/train.csv",
)
save_to_csv(
    sentences=val_sentences,
    tags=val_tags,
    filename="/content/drive/MyDrive/Deep_Learning_Projects/NER/NER_dataset/val.csv",
)
save_to_csv(
    sentences=test_sentences,
    tags=test_tags,
    filename="/content/drive/MyDrive/Deep_Learning_Projects/NER/NER_dataset/test.csv",
)


In [None]:
# Define a tokenizer using torchtext's get_tokenizer
# As the prediction output in this notebook shows, "basic_english" lowercases
# the text ("John" -> "john") and splits punctuation into separate tokens
# ("York." -> "york", "."), so its token count can differ from the number of
# whitespace-separated words in the input.
tokenizer = get_tokenizer("basic_english")


In [None]:
# Custom Dataset Class
class CustomNERDataset(Dataset):
    """Dataset of (tokens, tags) pairs read from a two-column CSV file.

    The first CSV column holds a sentence whose words are separated by spaces,
    the second column the per-word tags separated by spaces.

    Args:
        csv_file: Path of the CSV file to read.
        tokenizer: Callable mapping a string to a list of token strings.
    """

    def __init__(self, csv_file, tokenizer):
        # keep_default_na=False / dtype=str: keep cells such as "NA" or "null"
        # as plain strings instead of letting pandas turn them into float NaN,
        # which would break the .split() calls in __getitem__.
        self.data = pd.read_csv(csv_file, keep_default_na=False, dtype=str)
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of sentences (CSV rows)."""
        return len(self.data)

    @staticmethod
    def _continuation_tag(tag):
        """Return the tag for a non-initial piece of a split word.

        A word tagged ``B-X`` that the tokenizer splits into several tokens
        must not start a new entity at every piece, so later pieces get
        ``I-X``. All other tags are propagated unchanged.
        """
        if tag.startswith("B-"):
            return "I-" + tag[2:]
        return tag

    def __getitem__(self, idx):
        """Return ``(tokens, tags)`` for row ``idx`` with one tag per token.

        The tokenizer is applied word by word rather than to the whole
        sentence. A tokenizer may split a word into several tokens or drop it
        entirely; tokenizing per word keeps every produced token attached to
        the tag of the word it came from, instead of truncating two
        differently segmented sequences to a common length.
        """
        words = self.data.iloc[idx, 0].split()
        word_tags = self.data.iloc[idx, 1].split()

        tokens = []
        tags = []
        # zip stops at the shorter sequence if a row is malformed.
        for word, tag in zip(words, word_tags):
            for position, piece in enumerate(self.tokenizer(word)):
                tokens.append(piece)
                tags.append(tag if position == 0 else self._continuation_tag(tag))

        return tokens, tags


In [None]:
# Build vocabularies
def build_vocab(dataset, min_freq=1):
    """Build the token and tag vocabularies from a dataset.

    Args:
        dataset: Iterable yielding ``(tokens, tags)`` pairs of string lists.
        min_freq: Minimum number of occurrences a token needs to get its own
            vocabulary entry. The default of 1 keeps every token; a larger
            value sends rare tokens to ``<unk>``, so that index is also seen
            during training.

    Returns:
        ``(TEXT, TAGS)`` torchtext vocabularies. Unknown tokens resolve to the
        ``<unk>`` index of ``TEXT``; unknown tags resolve to the ``<pad>``
        index of ``TAGS``.
    """
    token_counter = Counter()
    tag_counter = Counter()

    for tokens, tags in dataset:
        token_counter.update(tokens)
        tag_counter.update(tags)

    # Build vocabularies using torchtext
    special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
    TEXT = torchtext.vocab.vocab(token_counter, min_freq=min_freq, specials=special_tokens)
    # Tags are never frequency-filtered: every label must stay addressable.
    TAGS = torchtext.vocab.vocab(tag_counter, min_freq=1, specials=['<pad>'])

    # Set default index for unknown tokens. The specials are always present,
    # so a direct lookup is enough (no need to materialize the full stoi dict).
    TEXT.set_default_index(TEXT['<unk>'])
    TAGS.set_default_index(TAGS['<pad>'])

    return TEXT, TAGS


In [None]:
# Custom collate function for padding
def collate_fn(batch):
    """Turn a list of (tokens, tags) samples into two padded index tensors.

    Tokens are looked up in the module-level ``TEXT`` vocabulary and tags in
    ``TAGS``. Every row is right-padded with the respective ``<pad>`` index up
    to the length of the longest token sequence in the batch.

    Returns:
        A ``(token_ids, tag_ids)`` tuple of ``torch.long`` tensors.
    """
    token_seqs, tag_seqs = zip(*batch)

    # Width of the batch = longest token sequence.
    longest = max(len(seq) for seq in token_seqs)

    def encode(seq, vocab):
        # Map items to indices, then append padding up to the batch width.
        ids = [vocab[item] for item in seq]
        return ids + [vocab['<pad>']] * (longest - len(seq))

    token_rows = [encode(seq, TEXT) for seq in token_seqs]
    tag_rows = [encode(seq, TAGS) for seq in tag_seqs]

    token_ids = torch.tensor(token_rows, dtype=torch.long)
    tag_ids = torch.tensor(tag_rows, dtype=torch.long)
    return token_ids, tag_ids


In [None]:
# BiLSTM Model
class BiLSTMNER(nn.Module):
    """Bidirectional LSTM tagger: embedding -> BiLSTM -> dropout -> linear.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM per direction.
        output_dim: Number of output classes (tags).
        pad_idx: Index passed to the embedding as ``padding_idx``.
        dropout: Dropout probability applied to the LSTM outputs.
            Defaults to 0.5.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, pad_idx, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        # Forward and backward hidden states are concatenated, hence * 2.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        """Return per-token class scores for a batch of index sequences.

        Args:
            text: Integer tensor of token indices, batch dimension first
                (the LSTM is created with ``batch_first=True``).

        Returns:
            Tensor with one score vector of size ``output_dim`` per position.
        """
        embedded = self.embedding(text)
        lstm_out, _ = self.lstm(embedded)
        dropped_out = self.dropout(lstm_out)
        predictions = self.fc(dropped_out)
        return predictions


In [None]:
# Create one dataset per split; all of them share the same tokenizer.
train_dataset, val_dataset, test_dataset = (
    CustomNERDataset(csv_path, tokenizer)
    for csv_path in (
        "/content/drive/MyDrive/Deep_Learning_Projects/NER/NER_dataset/train.csv",
        "/content/drive/MyDrive/Deep_Learning_Projects/NER/NER_dataset/val.csv",
        "/content/drive/MyDrive/Deep_Learning_Projects/NER/NER_dataset/test.csv",
    )
)


In [None]:
# Build vocabularies
# Only the training split is passed in, so tokens that occur solely in the
# validation/test data resolve to the default (<unk>) index set in build_vocab.
TEXT, TAGS = build_vocab(train_dataset)


In [None]:
# Create DataLoaders
BATCH_SIZE = 64
train_loader, val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, **extra_kwargs)
    for split_dataset, extra_kwargs in (
        (train_dataset, {"shuffle": True}),  # only the training split is shuffled
        (val_dataset, {}),
        (test_dataset, {}),
    )
)


In [None]:
# Model parameters
# Layer sizes chosen by hand.
EMBEDDING_DIM, HIDDEN_DIM = 100, 256
# Sizes derived from the vocabularies.
INPUT_DIM, OUTPUT_DIM = len(TEXT), len(TAGS)
# Index of the padding token in the text vocabulary.
PAD_IDX = TEXT['<pad>']


In [None]:
# Define checkpoint directory
# Base directory that save_checkpoint/load_checkpoint join with the checkpoint filename.
CHECKPOINT_DIR = "/content/drive/MyDrive/Deep_Learning_Projects/NER/checkpoints"

# Save model checkpoint
def save_checkpoint(model, optimizer, epoch, loss, filename="checkpoint.pth"):
    """Save model/optimizer state plus bookkeeping values under CHECKPOINT_DIR.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch value stored under the ``'epoch'`` key, as given.
        loss: Loss value stored under the ``'loss'`` key, as given.
        filename: File name inside ``CHECKPOINT_DIR``.
    """
    # Create the directory on first use; torch.save does not create parents.
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    checkpoint_path = os.path.join(CHECKPOINT_DIR, filename)
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, checkpoint_path)
    print(f"Checkpoint saved: {checkpoint_path}")

# Load model checkpoint
def load_checkpoint(model, optimizer, filename):
    """Restore model and optimizer state from a checkpoint in CHECKPOINT_DIR.

    Args:
        model: Module that receives the stored ``'model_state_dict'``.
        optimizer: Optimizer that receives the stored ``'optimizer_state_dict'``.
        filename: File name inside ``CHECKPOINT_DIR``.

    Returns:
        ``(model, optimizer, epoch, loss)`` where ``epoch`` and ``loss`` are
        the values stored in the checkpoint.
    """
    checkpoint_path = os.path.join(CHECKPOINT_DIR, filename)

    # Map stored tensors onto the device the model currently lives on, so a
    # checkpoint written on a GPU runtime can also be loaded on a CPU-only one.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=target_device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    return model, optimizer, epoch, loss


In [None]:
# Training loop with dimension checks
def train_epoch(model, train_loader, optimizer, criterion, device):
    """Run one optimization pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: Module called on the token batch; its output is indexed as
            ``(..., num_classes)`` and flattened to two dimensions.
        train_loader: Iterable yielding ``(tokens, tags)`` tensor pairs.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable taking flattened predictions and tags.
        device: Device the batch tensors are moved to.

    Returns:
        The average of the per-batch losses as a float, or ``nan`` when the
        loader yields no batches.
    """
    model.train()
    epoch_loss = 0.0
    num_batches = 0

    pbar = tqdm(train_loader, desc="Training")

    # Count batches ourselves: the bar's own counter (pbar.n) is only refreshed
    # when the bar redraws, so it is not a reliable divisor for the running mean.
    for num_batches, (tokens, tags) in enumerate(pbar, start=1):
        tokens = tokens.to(device)
        tags = tags.to(device)

        optimizer.zero_grad()

        # Forward pass
        predictions = model(tokens)

        # Flatten predictions and tags so each token is one row for the loss
        num_classes = predictions.shape[-1]
        predictions = predictions.view(-1, num_classes)
        tags = tags.view(-1)

        # Verify shapes before loss calculation
        assert predictions.shape[0] == tags.shape[0], \
            f"Prediction shape {predictions.shape} doesn't match target shape {tags.shape}"

        loss = criterion(predictions, tags)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        pbar.set_postfix(loss=epoch_loss / num_batches)

    if num_batches == 0:
        # No data: the mean is undefined; report nan instead of dividing by zero.
        return float("nan")
    return epoch_loss / num_batches


In [None]:
# Build the model and move it to the selected device
model = BiLSTMNER(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    pad_idx=PAD_IDX,
)
model = model.to(device)

# Optimizer, and a loss that skips padded tag positions
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=TAGS['<pad>'])

# Number of passes over the training data
N_EPOCHS = 5

# Train, reporting the loss and writing one checkpoint per epoch.
# The checkpoint stores the zero-based `epoch`; the file name is one-based.
for epoch in range(N_EPOCHS):
    avg_loss = train_epoch(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    print(f"\nEpoch {epoch+1} Loss: {avg_loss:.4f}")

    save_checkpoint(model, optimizer, epoch, avg_loss, filename=f"checkpoint_epoch_{epoch+1}.pth")


Training: 100%|██████████| 235/235 [36:17<00:00,  9.27s/it, loss=0.702]



Epoch 1 Loss: 0.7025
Checkpoint saved: /content/drive/MyDrive/Deep_Learning_Projects/NER/checkpoints/checkpoint_epoch_1.pth


Training: 100%|██████████| 235/235 [35:09<00:00,  8.98s/it, loss=0.464]



Epoch 2 Loss: 0.4639
Checkpoint saved: /content/drive/MyDrive/Deep_Learning_Projects/NER/checkpoints/checkpoint_epoch_2.pth


Training: 100%|██████████| 235/235 [35:01<00:00,  8.94s/it, loss=0.352]



Epoch 3 Loss: 0.3524
Checkpoint saved: /content/drive/MyDrive/Deep_Learning_Projects/NER/checkpoints/checkpoint_epoch_3.pth


Training: 100%|██████████| 235/235 [35:00<00:00,  8.94s/it, loss=0.281]



Epoch 4 Loss: 0.2807
Checkpoint saved: /content/drive/MyDrive/Deep_Learning_Projects/NER/checkpoints/checkpoint_epoch_4.pth


Training: 100%|██████████| 235/235 [38:29<00:00,  9.83s/it, loss=0.231]



Epoch 5 Loss: 0.2310
Checkpoint saved: /content/drive/MyDrive/Deep_Learning_Projects/NER/checkpoints/checkpoint_epoch_5.pth


Training:   1%|          | 2/235 [00:20<39:51, 10.26s/it, loss=0.209]


KeyboardInterrupt: 

In [None]:
# Evaluation
def evaluate(model, data_loader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    The model is switched to eval mode and no gradients are recorded. Each
    batch's predictions and tags are flattened to one row per token before
    being passed to ``criterion``.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for tokens, tags in tqdm(data_loader, desc="Evaluating"):
            # Forward pass on the target device
            logits = model(tokens.to(device))

            # One row per token for the loss
            flat_logits = logits.view(-1, logits.shape[-1])
            flat_tags = tags.to(device).view(-1)

            batch_losses.append(criterion(flat_logits, flat_tags).item())

    return sum(batch_losses) / len(data_loader)


In [None]:
# Prediction
def predict(model, sentence, tokenizer, TEXT, TAGS, device):
    """Predict and print a tag for every token of a raw sentence.

    Args:
        model: Module returning per-token class scores for a batch of indices.
        sentence: Raw input string.
        tokenizer: Callable mapping the string to a list of tokens.
        TEXT: Token vocabulary, indexable by token string.
        TAGS: Tag vocabulary providing ``get_itos()``.
        device: Device the index tensor is moved to.

    Returns:
        A list of ``(token, predicted_tag)`` tuples; empty when the tokenizer
        produces no tokens.
    """
    model.eval()

    # Tokenize the input sentence
    tokens = tokenizer(sentence)

    predicted_tags = []
    if tokens:  # an empty sequence cannot be fed through the model
        # Convert tokens to indices, using vocabulary's default handling for unknown tokens
        token_indices = [TEXT[token] for token in tokens]
        # Explicit integer dtype: embedding lookups require integer indices.
        token_tensor = torch.tensor([token_indices], dtype=torch.long).to(device)

        with torch.no_grad():
            # Get predictions for the single sentence in the batch
            predictions = model(token_tensor)
            predicted_indices = predictions.argmax(dim=2)[0].tolist()

        # Fetch the index-to-tag table once instead of once per token
        index_to_tag = TAGS.get_itos()
        predicted_tags = [index_to_tag[idx] for idx in predicted_indices]

    # Print results
    print("\nTokens and their predicted tags:")
    for token, tag in zip(tokens, predicted_tags):
        print(f"{token}: {tag}")

    return list(zip(tokens, predicted_tags))


In [None]:
# Evaluate on validation set
# Reports the mean per-batch loss that `evaluate` returns for `val_loader`.
print("\nEvaluating validation set...")
val_loss = evaluate(model, val_loader, criterion, device)
print(f"Validation Loss: {val_loss:.4f}")



Evaluating validation set...


Evaluating: 100%|██████████| 58/58 [00:00<00:00, 76.51it/s]

Validation Loss: 0.4273





In [None]:
# Evaluate on test set
# Reports the mean per-batch loss that `evaluate` returns for `test_loader`.
print("\nEvaluating test set...")
test_loss = evaluate(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.4f}")



Evaluating test set...


Evaluating: 100%|██████████| 55/55 [00:00<00:00, 99.53it/s]

Test Loss: 0.3725





In [None]:
# Making a prediction
# `predict` tokenizes the raw string with `tokenizer`, so the printed tokens
# are the tokenizer's output rather than the original words.
example = "John works at Microsoft in New York."

print("\nPredicting NER tags for:", example)
results = predict(model, example, tokenizer, TEXT, TAGS, device)



Predicting NER tags for: John works at Microsoft in New York.

Tokens and their predicted tags:
john: B-PER
works: I-PER
at: O
microsoft: B-LOC
in: O
new: B-LOC
york: I-LOC
.: O
