<a href="https://colab.research.google.com/github/Ismat-Samadov/colab_notebooks/blob/main/NER_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Toy Azerbaijani corpus for NER. Each sentence is a list of tokens and is
# annotated with BIO-style tags for PERSON, LOCATION and ORGANIZATION entities.
sentences = [
    ["İlham", "Əliyev", "Bakıda", "BMT-nin", "konfransında", "iştirak", "etdi"],
    ["Leyla", "Gəncə", "şəhərində", "Azərsun", "şirkətində", "işləyir"],
    ["Rəşad", "Sumqayıt", "şəhərinə", "səyahət", "etdi"],
    ["Nigar", "və", "Zaur", "İstanbulda", "Türk Hava Yolları", "ofisində", "görüşdülər"],
    ["Samir", "Bakıda", "BP", "şirkətinə", "işə", "daxil", "oldu"]
]

# One tag per token: labels[i][j] annotates sentences[i][j].
labels = [
    ["B-PERSON", "I-PERSON", "B-LOCATION", "B-ORGANIZATION", "O", "O", "O"],
    ["B-PERSON", "B-LOCATION", "O", "B-ORGANIZATION", "O", "O"],
    ["B-PERSON", "B-LOCATION", "O", "O", "O"],
    ["B-PERSON", "O", "B-PERSON", "B-LOCATION", "B-ORGANIZATION", "O", "O"],
    ["B-PERSON", "B-LOCATION", "B-ORGANIZATION", "O", "O", "O", "O"]
]

# Fail fast if the hand-written annotation drifts out of alignment.
assert len(sentences) == len(labels), "each sentence needs a label sequence"
assert all(len(s) == len(t) for s, t in zip(sentences, labels)), \
    "each token needs exactly one label"

# Create vocabulary and label mappings
all_words = [word for sentence in sentences for word in sentence]
unique_words = set(all_words)
# Enumerate in sorted order: iterating a set of strings directly yields a
# different order on each interpreter start (hash randomisation), which would
# give the same word a different id after every kernel restart.
# Ids start at 1 because 0 is reserved for the unknown token below.
word_to_idx = {word: idx for idx, word in enumerate(sorted(unique_words), 1)}
word_to_idx["<UNK>"] = 0  # Unknown token

# Map labels to integers (and back, for turning predictions into tag names)
label_to_idx = {"B-PERSON": 0, "I-PERSON": 1, "B-LOCATION": 2, "B-ORGANIZATION": 3, "O": 4}
idx_to_label = {idx: label for label, idx in label_to_idx.items()}


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentence/label pairs for validation; the fixed
# random_state makes the split repeatable from run to run.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences,
    labels,
    random_state=42,
    test_size=0.2,
)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class NERDataset(Dataset):
    """Map-style dataset that yields one tagged sentence as a pair of id tensors.

    Words missing from ``word_to_idx`` are encoded with the id stored under
    its ``"<UNK>"`` key; tags are looked up directly in ``label_to_idx``.
    """

    def __init__(self, sentences, labels, word_to_idx, label_to_idx):
        """Keep references to the token lists, tag lists and both lookup tables."""
        self.sentences, self.labels = sentences, labels
        self.word_to_idx, self.label_to_idx = word_to_idx, label_to_idx

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` for sentence ``idx`` as 1-D ``torch.long`` tensors."""
        tokens, tag_names = self.sentences[idx], self.labels[idx]
        vocab = self.word_to_idx

        # Out-of-vocabulary tokens fall back to the <UNK> id.
        encoded_words = []
        for token in tokens:
            encoded_words.append(vocab.get(token, vocab["<UNK>"]))

        # Tags have no fallback: an unknown tag name raises KeyError.
        encoded_tags = []
        for tag_name in tag_names:
            encoded_tags.append(self.label_to_idx[tag_name])

        return (
            torch.tensor(encoded_words, dtype=torch.long),
            torch.tensor(encoded_tags, dtype=torch.long),
        )

def pad_collate(batch, padding_value=None, label_padding_value=-100):
    """Collate variable-length ``(word_ids, tag_ids)`` pairs into padded batch tensors.

    Args:
        batch: Sequence of ``(word_ids, tag_ids)`` pairs of 1-D tensors, such as
            the items produced by ``NERDataset.__getitem__``.
        padding_value: Id used to fill the word tensor on the right. When left
            as ``None`` the id of ``"<UNK>"`` in the module-level ``word_to_idx``
            is used, which is the original behaviour.
        label_padding_value: Value used to fill the tag tensor on the right.
            The default ``-100`` matches the ``ignore_index`` used by the loss
            in this notebook, so padded positions do not contribute to training;
            if you change it, change the loss's ``ignore_index`` as well.

    Returns:
        ``(sentences_padded, labels_padded)``, both shaped
        ``(batch, longest_sequence_in_batch)``.
    """
    if padding_value is None:
        # Resolved at call time so the default follows the notebook's vocabulary.
        padding_value = word_to_idx["<UNK>"]

    sentences, labels = zip(*batch)
    sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=padding_value)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=label_padding_value)
    return sentences_padded, labels_padded

# Wrap each split in an NERDataset that shares the same vocabulary and tag maps.
train_dataset, val_dataset = (
    NERDataset(split_sentences, split_labels, word_to_idx, label_to_idx)
    for split_sentences, split_labels in (
        (train_sentences, train_labels),
        (val_sentences, val_labels),
    )
)

# One sentence per batch. Only the training loader reshuffles each epoch;
# the validation loader keeps the dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=pad_collate,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=pad_collate,
)


In [None]:
import torch.nn as nn

class BiLSTM_NER(nn.Module):
    """Embedding -> single-layer bidirectional LSTM -> per-token tag classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output tags per token.
        embedding_dim: Width of each word embedding.
        hidden_dim: Target width of the concatenated forward+backward LSTM
            output. Each direction gets ``hidden_dim // 2`` units, so an odd
            value yields an actual width of ``hidden_dim - 1``.

    Raises:
        ValueError: If ``hidden_dim`` is smaller than 2 (each direction would
            have no hidden units).
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim=64, hidden_dim=128):
        super().__init__()
        if hidden_dim < 2:
            raise ValueError(f"hidden_dim must be at least 2, got {hidden_dim}")

        per_direction = hidden_dim // 2
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, per_direction, num_layers=1, bidirectional=True, batch_first=True)
        # Size the projection from what the LSTM really emits (2 * per_direction).
        # Using hidden_dim directly breaks for odd values, where the LSTM output
        # is one unit narrower than hidden_dim.
        self.hidden2tag = nn.Linear(2 * per_direction, tagset_size)

    def forward(self, sentence):
        """Score every token of every sentence in the batch.

        Args:
            sentence: Integer tensor of word ids, shaped ``(batch, seq_len)``
                (the LSTM is configured with ``batch_first=True``).

        Returns:
            Log-probabilities over tags, shaped ``(batch, seq_len, tagset_size)``.
        """
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        # Normalise over the tag axis, which is always the last one.
        tag_scores = torch.log_softmax(tag_space, dim=-1)
        return tag_scores


In [None]:
import pandas as pd
from sklearn.metrics import classification_report

def train_model(model, train_loader, val_loader, num_epochs=10):
    """Train ``model`` with SGD and report validation metrics after every epoch.

    After each pass over ``train_loader`` the model is scored on ``val_loader``
    via ``evaluate_model`` and the weighted-average precision, recall and F1
    from scikit-learn's ``classification_report`` are recorded. Relies on the
    notebook-level ``idx_to_label`` and ``label_to_idx`` mappings.

    Args:
        model: Module mapping a batch of word ids to per-token tag scores.
        train_loader: Iterable of ``(word_ids, tag_ids)`` training batches.
        val_loader: Iterable of ``(word_ids, tag_ids)`` validation batches.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        A DataFrame with one row per epoch and the columns ``Epoch``, ``Loss``,
        ``Precision``, ``Recall`` and ``F1-score``. The same table is printed.
    """
    # Per-epoch history, one list per reported column.
    history = {"Epoch": [], "Loss": [], "Precision": [], "Recall": [], "F1-score": []}

    # Positions labelled -100 (padding) are excluded from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=-100)
    sgd = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    for step in range(num_epochs):
        epoch = step + 1

        # ---- training pass ----
        model.train()
        batch_losses = []
        for batch_words, batch_tags in train_loader:
            model.zero_grad()
            scores = model(batch_words)

            # The loss wants (N, num_tags) scores and (N,) targets, so fold
            # the batch and sequence axes together.
            flat_scores = scores.view(-1, scores.shape[-1])
            flat_tags = batch_tags.view(-1)

            batch_loss = criterion(flat_scores, flat_tags)
            batch_loss.backward()
            sgd.step()
            batch_losses.append(batch_loss.item())

        mean_loss = sum(batch_losses) / len(train_loader)

        # ---- validation pass ----
        gold_tags, guessed_tags = evaluate_model(model, val_loader, idx_to_label)
        report = classification_report(
            gold_tags,
            guessed_tags,
            labels=list(label_to_idx.keys()),
            zero_division=0,
            output_dict=True,
        )
        weighted = report['weighted avg']

        history["Epoch"].append(f"Epoch {epoch}/{num_epochs}")
        history["Loss"].append(mean_loss)
        history["Precision"].append(weighted['precision'])
        history["Recall"].append(weighted['recall'])
        history["F1-score"].append(weighted['f1-score'])

    # Tabulate the run and echo it as plain text.
    df = pd.DataFrame(history)
    print("\nTraining Progress")
    print(df.to_string(index=False))
    return df


In [None]:
def evaluate_model(model, data_loader, idx_to_label, ignore_index=-100):
    """Run ``model`` over ``data_loader`` and collect gold and predicted tag names.

    Args:
        model: Module mapping a ``(batch, seq_len)`` tensor of word ids to
            ``(batch, seq_len, num_tags)`` scores. It is switched to eval mode
            and left there.
        data_loader: Iterable of ``(word_ids, tag_ids)`` batches.
        idx_to_label: Mapping from integer tag id to tag name.
        ignore_index: Label value marking positions to skip (padding).
            Defaults to ``-100``, the padding value used by ``pad_collate``.

    Returns:
        ``(all_true_labels, all_predictions)``: two flat lists of tag names of
        equal length, aligned position by position, covering every
        non-ignored token in the loader.
    """
    all_predictions = []
    all_true_labels = []

    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        for sentences, labels in data_loader:
            tag_scores = model(sentences)
            predictions = torch.argmax(tag_scores, dim=-1).cpu()
            labels = labels.cpu()

            # Pick gold and predicted ids with the same mask. Truncating the
            # predictions to the count of real labels is only correct when
            # every ignored position sits at the end of the row; a mask keeps
            # the two lists aligned wherever the ignored positions are.
            keep = labels != ignore_index

            # Boolean indexing flattens row by row, so sentence order is kept.
            all_predictions.extend(idx_to_label[p] for p in predictions[keep].tolist())
            all_true_labels.extend(idx_to_label[t] for t in labels[keep].tolist())

    return all_true_labels, all_predictions


In [None]:
# Seed PyTorch's global RNG before building the model so that weight
# initialisation and the training loader's shuffling are repeatable.
torch.manual_seed(42)

# Size the model from the vocabulary and tag set, then initialise it
vocab_size = len(word_to_idx)
tagset_size = len(label_to_idx)
model = BiLSTM_NER(vocab_size, tagset_size)

# Train the model and display training progress
training_progress_df = train_model(model, train_loader, val_loader)

# Final per-class report on the held-out validation split
# (this notebook has no separate test set)
true_labels, predicted_labels = evaluate_model(model, val_loader, idx_to_label)
print(classification_report(true_labels, predicted_labels, labels=list(label_to_idx.keys()), zero_division=0))



Training Progress
      Epoch     Loss  Precision  Recall  F1-score
 Epoch 1/10 1.616464   0.333333     0.5  0.396825
 Epoch 2/10 1.577114   0.250000     0.5  0.333333
 Epoch 3/10 1.519056   0.250000     0.5  0.333333
 Epoch 4/10 1.438615   0.250000     0.5  0.333333
 Epoch 5/10 1.365465   0.250000     0.5  0.333333
 Epoch 6/10 1.290568   0.250000     0.5  0.333333
 Epoch 7/10 1.226007   0.250000     0.5  0.333333
 Epoch 8/10 1.162358   0.250000     0.5  0.333333
 Epoch 9/10 1.107923   0.250000     0.5  0.333333
Epoch 10/10 1.051664   0.250000     0.5  0.333333
                precision    recall  f1-score   support

      B-PERSON       0.00      0.00      0.00         1
      I-PERSON       0.00      0.00      0.00         0
    B-LOCATION       0.00      0.00      0.00         1
B-ORGANIZATION       0.00      0.00      0.00         1
             O       0.50      1.00      0.67         3

      accuracy                           0.50         6
     macro avg       0.10      0.20  