# Importing Libraries

In [8]:
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [23]:
# Download and install the en_core_web_lg pipeline package.
# NOTE(review): nothing in the visible notebook loads this package directly;
# presumably the training config refers to it — confirm against base_config.cfg.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Importing Dataset

In [9]:
import json
from datetime import datetime
# dataset-link: https://huggingface.co/datasets/MongoDB/accident_reports/blob/main/accidents_reports.json
# NOTE(review): the absolute /content path looks environment-specific (presumably
# Colab) — adjust it when running elsewhere.
# The encoding is given explicitly so that decoding the JSON file does not
# depend on the platform's default text encoding.
with open('/content/accidents_reports.json', 'r', encoding='utf-8') as f:
  data = json.load(f)

In [10]:
# Display the first record to inspect the fields each incident carries
data[0]

{'incidentId': 'INC-2024-001',
 'dateTime': '2024-03-08T09:01:41.295149',
 'location': {'site': 'Factory B', 'region': 'East'},
 'type': 'Equipment Failure',
 'description': 'Equipment Failure occurred at Factory B.',
 'severityLevel': 'low',
 'relatedProcedures': ['CHEM-012'],
 'immediateActions': 'Contained spill and alerted hazardous material team',
 'rootCauses': [{'description': 'Inadequate safety checks',
   'category': 'procedural error',
   'preventionRecommendations': 'Review and update safety procedures'}]}

In [12]:
# Function to extract entities and format sentence
def convert_to_spacy_format(data):
    """Render each incident record as a sentence with exact entity offsets.

    The sentence is assembled from literal fragments and field values, and the
    character offsets of every labelled value are recorded while the text is
    built. This guarantees that each span points at the slot the value was
    written into, even when the same string also appears elsewhere in the
    sentence (for example a region name contained in the site name, or two
    root causes that share a category). Searching the finished text for the
    value would return the first occurrence only.

    Args:
        data: Iterable of incident dicts providing ``dateTime`` (ISO format),
            ``type``, ``location`` (``site``/``region``), ``incidentId``,
            ``severityLevel``, ``immediateActions``, ``relatedProcedures``
            (list) and ``rootCauses`` (list of dicts with ``description``,
            ``category`` and ``preventionRecommendations``).

    Returns:
        A list with one ``{"text": str, "entities": [(start, end, label)]}``
        dict per incident. Empty values produce no entity.
    """
    # Output order of the single-valued labels. Entities are emitted in this
    # order (followed by procedure codes and root-cause fields in text order)
    # so the result matches the ordering callers have seen so far.
    leading_labels = (
        "INCIDENT_ID", "DATE", "TIME", "SITE", "REGION",
        "INCIDENT_TYPE", "SEVERITY_LEVEL",
    )

    def _label_rank(entity):
        label = entity[2]
        if label in leading_labels:
            return leading_labels.index(label)
        return len(leading_labels)

    training_data = []

    for incident in data:
        # Parse date and time
        dt = datetime.fromisoformat(incident["dateTime"])
        date_str = dt.strftime("%B %d, %Y")  # "March 08, 2024"
        time_str = dt.strftime("%H:%M")      # "09:01"

        # (fragment, label) pairs in text order; label None means plain text.
        segments = [
            ("On ", None), (date_str, "DATE"),
            (" at ", None), (time_str, "TIME"),
            (", a ", None), (incident["type"], "INCIDENT_TYPE"),
            (" occurred at ", None), (incident["location"]["site"], "SITE"),
            (" in the ", None), (incident["location"]["region"], "REGION"),
            (" region. The incident ID is ", None), (incident["incidentId"], "INCIDENT_ID"),
            (". The severity was ", None), (incident["severityLevel"], "SEVERITY_LEVEL"),
            (". Immediate action: ", None), (incident["immediateActions"], None),
            (". Related procedures: ", None),
        ]

        # Related procedures, comma separated, each one labelled individually
        for position, proc in enumerate(incident["relatedProcedures"]):
            if position:
                segments.append((", ", None))
            segments.append((proc, "PROCEDURE_CODE"))
        segments.append((". ", None))

        # Root causes
        for cause in incident["rootCauses"]:
            segments.extend([
                ("Root cause: ", None), (cause["description"], "ROOT_CAUSE_DESC"),
                (" (Category: ", None), (cause["category"], "ROOT_CAUSE_CATEGORY"),
                ("). Recommendation: ", None),
                (cause["preventionRecommendations"], "PREVENTION_RECOMMEND"),
                (". ", None),
            ])

        # Concatenate the fragments while tracking the running character offset
        pieces = []
        entities = []
        cursor = 0
        for fragment, label in segments:
            fragment = f"{fragment}"  # format values the way an f-string would
            if label is not None and fragment:
                entities.append((cursor, cursor + len(fragment), label))
            pieces.append(fragment)
            cursor += len(fragment)
        text = "".join(pieces)

        # list.sort is stable, so repeated labels keep their text order
        entities.sort(key=_label_rank)

        training_data.append({"text": text, "entities": entities})

    return training_data

In [13]:
# Convert the structured incident records into spaCy-style training examples:
# one {"text": ..., "entities": [...]} dict per incident
spacy_training_data = convert_to_spacy_format(data)

In [14]:
# Pretty-print the first converted example to check its text and entity offsets
import pprint
pprint.pprint(spacy_training_data[0])

{'entities': [(109, 121, 'INCIDENT_ID'),
              (3, 17, 'DATE'),
              (21, 26, 'TIME'),
              (60, 69, 'SITE'),
              (77, 81, 'REGION'),
              (30, 47, 'INCIDENT_TYPE'),
              (140, 143, 'SEVERITY_LEVEL'),
              (236, 244, 'PROCEDURE_CODE'),
              (258, 282, 'ROOT_CAUSE_DESC'),
              (294, 310, 'ROOT_CAUSE_CATEGORY'),
              (329, 364, 'PREVENTION_RECOMMEND')],
 'text': 'On March 08, 2024 at 09:01, a Equipment Failure occurred at Factory '
         'B in the East region. The incident ID is INC-2024-001. The severity '
         'was low. Immediate action: Contained spill and alerted hazardous '
         'material team. Related procedures: CHEM-012. Root cause: Inadequate '
         'safety checks (Category: procedural error). Recommendation: Review '
         'and update safety procedures. '}


# Split the dataset into train and dev

In [15]:
# Split into train and dev: 20% of the examples are held out as the dev set,
# and the fixed random_state keeps the split reproducible across runs
train_data, dev_data = train_test_split(spacy_training_data, test_size=0.2, random_state=42)

# JSON to spaCy NER Format

In [18]:
# Utility function to convert data to .spacy binary format
def create_spacy_binary(data, output_file, nlp):
    """Serialize annotated examples to a ``.spacy`` file through a ``DocBin``.

    Args:
        data: Iterable of dicts with a ``"text"`` string and an ``"entities"``
            list of ``(start_char, end_char, label)`` tuples.
        output_file: Path the ``DocBin`` is written to.
        nlp: spaCy pipeline; only ``nlp.make_doc`` is used, to tokenize
            each text.

    Returns:
        None. The result is the file written to ``output_file``.
    """
    doc_bin = DocBin()
    for training_example in tqdm(data):
        text = training_example['text']
        labels = training_example['entities']
        doc = nlp.make_doc(text)
        ents = []

        for start, end, label in labels:
            # char_span returns None when the character offsets cannot be
            # mapped to a token-aligned span; such annotations are dropped.
            span = doc.char_span(start, end, label=label, alignment_mode='contract')
            if span is None:
                # Say which annotation was dropped so the data can be fixed
                print(
                    f"Skipping entity {label!r} at [{start}:{end}] "
                    f"({text[start:end]!r}): offsets do not align with tokens"
                )
            else:
                ents.append(span)

        # doc.ents must not contain overlapping spans; filter_spans resolves
        # overlaps before the assignment.
        doc.ents = filter_spans(ents)
        doc_bin.add(doc)

    doc_bin.to_disk(output_file)

In [19]:
# Create a blank English pipeline (create_spacy_binary uses it only through
# nlp.make_doc) and save the train/dev sets as .spacy files
nlp = spacy.blank("en")
create_spacy_binary(train_data, "train.spacy", nlp)
create_spacy_binary(dev_data, "dev.spacy", nlp)

100%|██████████| 80/80 [00:00<00:00, 1350.53it/s]
100%|██████████| 20/20 [00:00<00:00, 1285.45it/s]


# NER Model

In [20]:
# https://spacy.io/usage/training

In [21]:
# Expand base_config.cfg into a fully populated training config (config.cfg).
# NOTE(review): base_config.cfg is not created anywhere in the visible notebook;
# presumably it has to be placed in the working directory beforehand — confirm.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [24]:
# Train the pipeline described by config.cfg on train.spacy, evaluate it on
# dev.spacy, and save the results under ./output
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     42.39    0.00    0.00    0.00    0.00
  2     200        238.73   2495.76   87.84   86.43   89.30    0.88
  5     400         45.98    464.29   96.61   93.45  100.00    0.97
  8     600         50.80    479.40   96.61   93.45  100.00    0.97
 12     800        221.39    760.25   96.61   93.45  100.00    0.97
 18    1000         33.79    674.05   96.61   93.45  100.00    0.97
 25    1200         26.63    846.02   96.61   93.45  100.00    0.97
 33    1400         23.21   1043.67   96.61   93.45  100.00    0.97
 44    1600         26.69   1298.35   96.61   93.45  100.00    0.97
 58    1800         40.70   1542.45   96.61   

In [25]:
# Load the model-best pipeline saved under the training output directory
ner_model = spacy.load("output/model-best")

# Testing the model

In [28]:
# Hand-written sentences that follow the same template as the generated
# training text.
# NOTE(review): the incident IDs (e.g. INC789012) and procedure codes (e.g.
# PR-104) are formatted differently from the dataset sample shown earlier
# (INC-2024-001, CHEM-012) — keep this in mind when judging the predictions.
test_sentences = [
    "On April 15, 2024 at 14:30, a Gas Leak occurred at Refinery B in the North region. The incident ID is INC789012. The severity was Critical. Immediate action: Evacuate personnel. Related procedures: PR-104, PR-205. Root cause: Corroded pipeline (Category: Equipment Failure). Recommendation: Schedule regular inspections.",

    "On May 03, 2024 at 08:15, a Chemical Spill occurred at Plant X in the South region. The incident ID is INC456789. The severity was Moderate. Immediate action: Contain spill and notify authorities. Related procedures: PR-210. Root cause: Improper storage (Category: Human Error). Recommendation: Train staff on handling procedures.",

    "On March 22, 2024 at 19:45, a Fire occurred at Unit 3 in the Central region. The incident ID is INC111222. The severity was High. Immediate action: Activate sprinkler system. Related procedures: PR-301, PR-412. Root cause: Electrical short circuit (Category: Technical Fault). Recommendation: Upgrade old wiring.",

    "On January 11, 2024 at 03:00, a Power Failure occurred at Facility Z in the East region. The incident ID is INC333444. The severity was Low. Immediate action: Switch to backup generator. Related procedures: PR-001. Root cause: Grid instability (Category: External Factor). Recommendation: Coordinate with power provider."
]

In [29]:
# Predict and print results
for text in test_sentences:
    doc = ner_model(text)
    print(f"\nText: {text}")
    for ent in doc.ents:
        print(f" → {ent.text} ({ent.label_})")


Text: On April 15, 2024 at 14:30, a Gas Leak occurred at Refinery B in the North region. The incident ID is INC789012. The severity was Critical. Immediate action: Evacuate personnel. Related procedures: PR-104, PR-205. Root cause: Corroded pipeline (Category: Equipment Failure). Recommendation: Schedule regular inspections.
 → April 15, 2024 (DATE)
 → 14:30 (TIME)
 → Gas Leak (INCIDENT_TYPE)
 → Refinery B (SITE)
 → North (REGION)
 → INC789012 (SEVERITY_LEVEL)
 → Critical (SEVERITY_LEVEL)
 → Evacuate personnel (ROOT_CAUSE_DESC)
 → PR-205 (PROCEDURE_CODE)
 → Equipment Failure (ROOT_CAUSE_CATEGORY)
 → Schedule regular inspections (PREVENTION_RECOMMEND)

Text: On May 03, 2024 at 08:15, a Chemical Spill occurred at Plant X in the South region. The incident ID is INC456789. The severity was Moderate. Immediate action: Contain spill and notify authorities. Related procedures: PR-210. Root cause: Improper storage (Category: Human Error). Recommendation: Train staff on handling procedures.
 →

# Entity Visualization

In [30]:
# Highlight color assigned to each custom entity label
colors = dict(
    DATE="#FFD700",                  # gold
    TIME="#87CEFA",                  # light blue
    INCIDENT_TYPE="#FF6347",         # tomato
    SITE="#90EE90",                  # light green
    REGION="#DDA0DD",                # plum
    INCIDENT_ID="#FFA07A",           # light salmon
    SEVERITY_LEVEL="#B0E0E6",        # powder blue
    PROCEDURE_CODE="#00CED1",        # dark turquoise
    ROOT_CAUSE_DESC="#FA8072",       # salmon
    ROOT_CAUSE_CATEGORY="#FFB6C1",   # light pink
    PREVENTION_RECOMMEND="#20B2AA",  # light sea green
)

# Options bundle carrying the color mapping for the entity visualizer
options = dict(colors=colors)

In [33]:
# Render the entities predicted for the first test sentence inline in the
# notebook, using the custom color options
doc = ner_model(test_sentences[0])
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)