In [None]:
# Import necessary libraries
import pandas as pd
import os
from google.colab import files

# Locations of the three CoNLL-2003 splits (train / test / validation).
# NOTE(review): no visible cell mounts Google Drive at /content/drive — presumably
# it is mounted beforehand (e.g. through the Colab UI); confirm, otherwise opening
# these paths will fail on a fresh runtime.
train_file = '/content/drive/MyDrive/coNLL-2003/train.txt'
test_file = '/content/drive/MyDrive/coNLL-2003/test.txt'
valid_file = '/content/drive/MyDrive/coNLL-2003/valid.txt'

# Load the datasets
def load_data(file_path):
    """Parse a CoNLL-style file into sentences of ``(token, tag)`` pairs.

    Every non-blank line is split on whitespace; its first field is kept as
    the token and its last field as the tag. Blank (or whitespace-only) lines
    and ``-DOCSTART-`` lines close the current sentence and are otherwise
    skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of sentences, each a list of ``(token, tag)`` tuples. Empty
        sentences are never emitted.
    """
    data = []
    sentence = []
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # Treat whitespace-only lines as sentence breaks too; comparing the
            # raw line against a bare newline let them through and then
            # crashed on splits[0] because split() returned an empty list.
            if not stripped or stripped.startswith("-DOCSTART-"):
                if sentence:
                    data.append(sentence)
                    sentence = []
                continue
            splits = stripped.split()
            sentence.append((splits[0], splits[-1]))
    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        data.append(sentence)
    return data

# Parse each split into a list of sentences, every sentence being a list of
# (token, tag) pairs as produced by load_data.
train_data = load_data(train_file)
test_data = load_data(test_file)
valid_data = load_data(valid_file)

# Perform EDA
# Report the number of sentences per split as a quick sanity check of the parse.
print(f"Number of sentences in training data: {len(train_data)}")
print(f"Number of sentences in test data: {len(test_data)}")
print(f"Number of sentences in validation data: {len(valid_data)}")

Number of sentences in training data: 14041
Number of sentences in test data: 3453
Number of sentences in validation data: 3250


In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load SpaCy's small English pipeline; preprocess_text below uses it for
# tokenisation and lemmatisation.
# NOTE(review): presumably the en_core_web_sm package is already installed in
# the runtime — no install step is visible in these cells; confirm.
nlp = spacy.load('en_core_web_sm')

# Preprocess the text
def preprocess_text(text):
    """Return the lemmas of the content words in ``text``.

    The text is lower-cased and run through the module-level ``nlp``
    pipeline. A token contributes its lemma only when its text is not in
    ``STOP_WORDS`` and the token is purely alphabetic (``token.is_alpha``).
    """
    lemmas = []
    for tok in nlp(text.lower()):
        if tok.text in STOP_WORDS:
            continue
        if not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Preprocess the sentences
def preprocess_sentences(data):
    """Return a new dataset in which every token is lower-cased.

    ``data`` is a list of sentences, each a sequence of ``(token, label)``
    pairs. Labels are passed through unchanged and the input is not mutated.
    """
    return [
        [(word.lower(), tag) for word, tag in sent]
        for sent in data
    ]

# Lower-case the tokens of all three splits. The names are rebound to the
# processed copies, so the original-case data is no longer reachable through
# them after this cell runs.
train_data = preprocess_sentences(train_data)
test_data = preprocess_sentences(test_data)
valid_data = preprocess_sentences(valid_data)

In [None]:
def convert_to_spacy_format(data):
    """Convert token/tag sentences into ``(text, {"entities": [...]})`` pairs.

    Tokens are joined with single spaces to build the text, and entity spans
    are expressed as ``(start_char, end_char, entity_type)`` offsets into it.
    Consecutive tokens belonging to the same mention are merged into ONE span:
    an ``I-<type>`` tag extends the open span of the same type, while a
    ``B-`` tag, a change of type, or an ``O`` tag closes it. (Emitting one
    span per token would split every multi-token name into separate
    single-token entities.)

    Args:
        data: List of sentences, each a list of ``(token, label)`` pairs with
            labels such as ``"O"``, ``"B-PER"`` or ``"I-PER"``. A non-``O``
            label without a ``-`` is treated as a one-token entity whose type
            is the whole label.

    Returns:
        A list with one ``(text, {"entities": [(start, end, type), ...]})``
        tuple per input sentence.
    """
    spacy_data = []
    for sentence in data:
        tokens = [token for token, _ in sentence]
        entities = []
        open_span = None  # [start_char, end_char, entity_type] being extended
        offset = 0
        for token, label in sentence:
            token_end = offset + len(token)
            if label == 'O':
                if open_span is not None:
                    entities.append(tuple(open_span))
                    open_span = None
            else:
                prefix, sep, entity_type = label.partition('-')
                if not sep:
                    # No BIO prefix: treat the label itself as the type and
                    # always start a fresh span.
                    prefix, entity_type = 'B', label
                if (
                    open_span is not None
                    and prefix == 'I'
                    and open_span[2] == entity_type
                ):
                    # Continuation of the current mention: grow the span.
                    open_span[1] = token_end
                else:
                    if open_span is not None:
                        entities.append(tuple(open_span))
                    open_span = [offset, token_end, entity_type]
            offset = token_end + 1  # +1 for the joining space
        if open_span is not None:
            entities.append(tuple(open_span))
        spacy_data.append((" ".join(tokens), {"entities": entities}))
    return spacy_data

# Build the (text, {"entities": [...]}) representation of every split; the
# training and evaluation cells below consume these lists.
train_data_spacy = convert_to_spacy_format(train_data)
test_data_spacy = convert_to_spacy_format(test_data)
valid_data_spacy = convert_to_spacy_format(valid_data)

# Check the format of one training example
print(train_data_spacy[0])

('eu rejects german call to boycott british lamb .', {'entities': [(0, 2, 'ORG'), (11, 17, 'MISC'), (34, 41, 'MISC')]})


In [None]:
import spacy
from spacy.training.example import Example

# Create a blank English pipeline (tokenizer only, no trained components)
ner_model = spacy.blank('en')

# Add an untrained NER component to the pipeline
ner = ner_model.add_pipe('ner')

# Register every entity type that occurs in the training annotations.
# Re-adding a label that is already known is harmless.
for _, annotations in train_data_spacy:
    for ent in annotations.get('entities', []):
        ner.add_label(ent[2])

# Baseline training: 10 passes over the data in its stored order, one example
# per update. initialize() is the spaCy v3 name for the deprecated
# begin_training(); it sets up the model weights and returns the optimizer.
optimizer = ner_model.initialize()
for i in range(10):
    losses = {}  # accumulated per pass by update()
    for text, annotations in train_data_spacy:
        example = Example.from_dict(ner_model.make_doc(text), annotations)
        ner_model.update([example], sgd=optimizer, losses=losses)
    print(f"Losses at iteration {i}: {losses}")

# Save the trained baseline pipeline to the "ner_model" directory
ner_model.to_disk("ner_model")

Losses at iteration 0: {'ner': 25279.164906708254}
Losses at iteration 1: {'ner': 16639.567506198935}
Losses at iteration 2: {'ner': 12950.233830539863}
Losses at iteration 3: {'ner': 10973.965105321333}
Losses at iteration 4: {'ner': 9624.510969091}
Losses at iteration 5: {'ner': 8797.047597918116}
Losses at iteration 6: {'ner': 8233.231268975806}
Losses at iteration 7: {'ner': 7556.344762219701}
Losses at iteration 8: {'ner': 7226.671152668854}
Losses at iteration 9: {'ner': 7051.985375567793}


In [None]:
import spacy
from sklearn.metrics import classification_report

# Reload the baseline pipeline from the "ner_model" directory written by the
# training cell, rebinding ner_model to the loaded copy.
ner_model = spacy.load("ner_model")

# Evaluation function to map character positions to token indices
def _token_char_offsets(tokens):
    """Return the ``(start, end)`` character span of each token in a single-space-joined text."""
    offsets = []
    char_pos = 0
    for token in tokens:
        offsets.append((char_pos, char_pos + len(token)))
        char_pos += len(token) + 1  # +1 for the separating space
    return offsets


def _spans_to_token_labels(offsets, spans):
    """Project character-level ``(start, end, label)`` spans onto one label per token."""
    labels = ['O'] * len(offsets)
    for start_char, end_char, label in spans:
        start_idx = None
        end_idx = None
        for i, (token_start, token_end) in enumerate(offsets):
            # First token whose character range contains the span start.
            if start_idx is None and token_start <= start_char < token_end:
                start_idx = i
            # Token in which the span ends (end offset is exclusive).
            if token_start < end_char <= token_end:
                end_idx = i + 1
                break
        # Spans that cannot be aligned to the whitespace tokens are ignored.
        if start_idx is not None and end_idx is not None:
            for i in range(start_idx, end_idx):
                labels[i] = label
    return labels


def evaluate_model(ner_model, data):
    """Print a token-level classification report for ``ner_model`` on ``data``.

    Each text is split on whitespace; gold and predicted character spans are
    mapped onto those tokens, and every token receives the entity type of the
    span covering it (``'O'`` otherwise). The per-token labels of all texts
    are then compared with ``classification_report``.

    Args:
        ner_model: Callable returning a document whose ``ents`` expose
            ``start_char``, ``end_char`` and ``label_``.
        data: Iterable of ``(text, {"entities": [(start, end, label), ...]})``.

    Returns:
        None. The report is printed.
    """
    y_true = []
    y_pred = []
    for text, annotations in data:
        doc = ner_model(text)
        pred_entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

        # Token offsets are computed once per text and shared by both mappings.
        offsets = _token_char_offsets(text.split())
        y_true.extend(_spans_to_token_labels(offsets, annotations['entities']))
        y_pred.extend(_spans_to_token_labels(offsets, pred_entities))

    print(classification_report(y_true, y_pred, zero_division=1))

# Evaluate the baseline model on the test split (prints a token-level report)
evaluate_model(ner_model, test_data_spacy)

              precision    recall  f1-score   support

         LOC       0.73      0.86      0.79      1925
        MISC       0.64      0.76      0.69       918
           O       0.98      0.95      0.97     38323
         ORG       0.63      0.59      0.61      2496
         PER       0.63      0.87      0.73      2773

    accuracy                           0.92     46435
   macro avg       0.72      0.81      0.76     46435
weighted avg       0.93      0.92      0.92     46435



In [None]:
from spacy.util import minibatch, compounding
import random

# Hyperparameters
n_iter = 20
# Batch-size schedule: starts at 4 and grows by a factor of 1.001 per batch, capped at 32.
batch_sizes = compounding(4.0, 32.0, 1.001)

# Training the NER model with hyperparameter tuning
# NOTE: initialize() is the spaCy v3 name for the deprecated begin_training().
# It re-initializes the pipeline, so this run starts from scratch rather than
# continuing from the loaded model (the recorded losses restart at the
# from-scratch level).
optimizer = ner_model.initialize()
# Shuffle a private copy so the shared train_data_spacy list keeps its order
# for any other cell that reads it.
shuffled_train_data = list(train_data_spacy)
for i in range(n_iter):
    random.shuffle(shuffled_train_data)
    losses = {}
    batches = minibatch(shuffled_train_data, size=batch_sizes)
    for batch in batches:
        # One update per minibatch. Updating example by example inside the
        # batch loop meant the batch-size schedule above had no effect.
        examples = [
            Example.from_dict(ner_model.make_doc(text), annotations)
            for text, annotations in batch
        ]
        ner_model.update(examples, sgd=optimizer, losses=losses)
    print(f"Losses at iteration {i}: {losses}")

Losses at iteration 0: {'ner': 25112.164629945495}
Losses at iteration 1: {'ner': 15944.30416615865}
Losses at iteration 2: {'ner': 12657.166979519201}
Losses at iteration 3: {'ner': 10680.233803815318}
Losses at iteration 4: {'ner': 9392.75999691699}
Losses at iteration 5: {'ner': 8499.091049969538}
Losses at iteration 6: {'ner': 7794.64563372191}
Losses at iteration 7: {'ner': 7624.673642367829}
Losses at iteration 8: {'ner': 7328.527135310303}
Losses at iteration 9: {'ner': 7133.846433882183}
Losses at iteration 10: {'ner': 6659.078680311733}
Losses at iteration 11: {'ner': 6540.233494758112}
Losses at iteration 12: {'ner': 6158.41653732479}
Losses at iteration 13: {'ner': 5989.326527768521}
Losses at iteration 14: {'ner': 5877.946050942893}
Losses at iteration 15: {'ner': 5845.147243482898}
Losses at iteration 16: {'ner': 5773.716715156144}
Losses at iteration 17: {'ner': 5588.3435025619265}
Losses at iteration 18: {'ner': 5678.182558758138}
Losses at iteration 19: {'ner': 5628.958

In [None]:
# Evaluate the retrained model on the same test split as the baseline, so the
# two reports are directly comparable.
evaluate_model(ner_model, test_data_spacy)

              precision    recall  f1-score   support

         LOC       0.86      0.80      0.83      1925
        MISC       0.75      0.67      0.71       918
           O       0.96      0.99      0.97     38323
         ORG       0.79      0.65      0.71      2496
         PER       0.92      0.71      0.80      2773

    accuracy                           0.94     46435
   macro avg       0.86      0.76      0.80     46435
weighted avg       0.94      0.94      0.94     46435



In [None]:
import joblib

# Save the model to a file using Joblib
# NOTE(review): no visible cell calls to_disk after the retraining loop, so the
# "ner_model" directory presumably still holds the baseline weights; this file
# is the only visible joblib export of the retrained pipeline — confirm.
# Joblib files are pickle-based: only load them from trusted sources.
joblib.dump(ner_model, "optimized_ner_model.joblib")

['optimized_ner_model.joblib']

In [None]:
import pickle

# Save the model to a file using pickle
# This serialises the same ner_model object that the Joblib cell exports, under
# a different file name.
# Unpickling executes arbitrary code, so only load this file from trusted sources.
model_path = "optimized_ner_model.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(ner_model, f)