In [32]:
import spacy
from spacy.training import Example

# Load English tokenizer, tagger, parser, and NER
# Create a new blank English model
nlp = spacy.blank("en")

# Add the NER pipeline component to the blank model
ner = nlp.add_pipe("ner")

In [31]:
def read_training_data(description_file, labeled_description_file):
    with open(description_file, "r", encoding="utf-8") as file:
        descriptions = file.readlines()
    with open(labeled_description_file, "r", encoding="utf-8") as file:
        labeled_descriptions = file.readlines()
    return [line.strip() for line in descriptions], [line.strip() for line in labeled_descriptions]

# Process training data to extract descriptions and entities
def process_training_data(descriptions, labeled_descriptions):
    TRAIN_DATA = []
    for desc, labeled_desc in zip(descriptions, labeled_descriptions):
        if "\"" in labeled_desc:
            entity_info = labeled_desc.split("\"")
            entity_name = entity_info[1].strip()
            entity_type = entity_info[2].split(" - ")[1]
            start = desc.find(entity_name)
            end = start + len(entity_name)
            TRAIN_DATA.append((desc, {"entities": [(start, end, "ENTITY")]}))
    return TRAIN_DATA

# Define file paths
description_file = r"C:\Users\Lenovo\OneDrive\Desktop\Folders\NaharOm\BSA\Main_Project\ner_train_input.txt"
labeled_description_file = r"C:\Users\Lenovo\OneDrive\Desktop\Folders\NaharOm\BSA\Main_Project\ner_train_label.txt"

# Load training data
descriptions, labeled_descriptions = read_training_data(description_file, labeled_description_file)
TRAIN_DATA = process_training_data(descriptions, labeled_descriptions)

# Define the pipeline components
ner = nlp.get_pipe("ner")

# Add new entity labels to the pipeline
ner.add_label("ENTITY")

# Disable other pipeline components to only train NER
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

# Training the NER model
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.create_optimizer()
    for itn in range(10):
        losses = {}
        for text, annotations in TRAIN_DATA:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)            
        print(losses)

# Save the trained model
nlp.to_disk("ner_model")

# Test the trained model
test_text = "chq paid micr inward clearing  sumit loomba s o sh vire union bank of india  ubi  union bank of india"
doc = nlp(test_text)
for ent in doc.ents:
    print(ent)
    print(ent.text, ent.label_)

{'ner': 18.837906578746566}
{'ner': 18.966169102459208}
{'ner': 10.68156114730256}
{'ner': 16.186866608683307}
{'ner': 8.656663141091348}
{'ner': 10.84857808815708}
{'ner': 8.624519152545513}
{'ner': 10.429577002301448}
{'ner': 11.313666459882821}
{'ner': 14.087052674760411}
union bank of
union bank of BANK
union bank of
union bank of BANK


In [30]:
# Test the trained model
test_text = "funds transfer debit landcraft developers"
doc = nlp(test_text)
for ent in doc.ents:
    print(ent)
    print(ent.text, ent.label_)