In [1]:
import spacy
from spacy.tokens import DocBin
import pickle

In [2]:
nlp = spacy.load("en_core_web_sm")

with open("./training_data/spacy_training_data.pkl", "rb") as file:
    spacy_format_data = pickle.load(file)


# Splitting data
# Calculate the index to split the data
split_index = int(len(spacy_format_data) * 0.80)

# Split the data into training and testing datasets
training_data = spacy_format_data[:split_index]
testing_data = spacy_format_data[split_index:]
# print(training_data[0])
# Check the lengths of the training and testing datasets
print(
    f"Length of train data:{len(training_data)}, \nLength of test data:{len(testing_data)}"
)

# the DocBin will store the example documents --train
db = DocBin()
for sample in training_data:
    doc = nlp(sample[0])
    entities = []
    for start, end, label in sample[1]["entities"]:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            entities.append(span)
    doc.ents = entities
    db.add(doc)
db.to_disk("./training_data/train.spacy")
print("Training data prepared")
# the DocBin will store the example documents --test
db_test = DocBin()
for sample in testing_data:
    doc = nlp(sample[0])
    entities = []
    for start, end, label in sample[1]["entities"]:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            entities.append(span)
    doc.ents = entities
    db_test.add(doc)
db_test.to_disk("./training_data/test.spacy")
print("Testing data prepared")

print("Preprocessing done")

Length of train data:6025, 
Length of test data:1507
Training data prepared
Testing data prepared
Preprocessing done


In [3]:
!python -m spacy init fill-config ./base_config.cfg ./config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [5]:
!python -m spacy train config.cfg --output ./output --paths.train ./training_data/train.spacy --paths.dev ./training_data/test.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     45.95    0.00    0.00    0.00    0.00
  0     200        406.16   2023.67   99.55   99.23   99.86    1.00
  0     400         17.58     88.54   99.91   99.92   99.89    1.00
  0     600          2.71      6.28   99.81   99.63   99.99    1.00
  0     800         12.37     30.95   99.80   99.99   99.62    1.00
  0    1000         89.88    112.59   99.89   99.93   99.85    1.00
  0    1200         16.64     17.86   99.99  100.00   99.99    1.00
  0    1400          3.24      3.87  100.00  100.00  100.00    1.00
  1    1600         33.21     27.72   99.91   99.93   99.88    1.00
  1    1800         15.02     11.37   99.99  1

In [7]:
!python -m spacy evaluate ./output/model-best/ ./training_data/test.spacy

[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   100.00
NER R   100.00
NER F   100.00
SPEED   5073  

[1m

                              P        R        F
PATIENT-ID               100.00   100.00   100.00
PATIENT-NAME             100.00   100.00   100.00
PATIENT-ADDRESS          100.00   100.00   100.00
PATIENT-TYPE             100.00   100.00   100.00
PATIENT-DOB              100.00   100.00   100.00
PATIENT-GENDER           100.00   100.00   100.00
PATIENT-ADMIT-DATE       100.00   100.00   100.00
PATIENT-DISCHARGE-DATE   100.00   100.00   100.00
PATIENT-DOCID            100.00   100.00   100.00



TESTING ON ONE SAMPLE


In [47]:
nlp = spacy.load("./output/model-last/")

if not nlp:
    print("Model is not loaded...")
else:
    text = "Community General Hospital Patient Account# P-9877are Patient Name Runesh Gazane, Address - Vanshaj, Silver Rd, Pune, Type - Inp"
    doc = nlp(text)
    for ent in doc.ents:
        print(f"Word -> {ent.text} -------- Label -> {ent.label_}")

Word -> P-9877are -------- Label -> PATIENT-ID
Word -> Runesh Gazane -------- Label -> PATIENT-NAME
Word -> Vanshaj -------- Label -> PATIENT-TYPE
Word -> Inp -------- Label -> PATIENT-TYPE
