In [9]:
# Install spaCy and download the English model if not already installed
!pip install spacy
!python -m spacy download en_core_web_sm

import spacy

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

# Example text
# Text1 — Industrial Safety Incident Example (Hydraulic Leak) Source: SEE Forge Creators of FAT FINGER Aug 23, 2023
text1 = """The personnel directly involved were Alex and Maya. Initial inspection indicates a potential hydraulic fluid leakage. 
Documentation of Incident: An incident report was prepared with details of the event, initial observations, and photographs of the
affected area. Root Cause Analysis: Preliminary findings suggest that inadequate maintenance led to the hydraulic system’s malfunction"""
#text2 — Mining Safety NLP Analysis Context Source:MDPI
text2 = """To achieve the goal of preventing serious injuries and fatalities, it is important for a mine site to analyze site specific 
mine safety data. The advances in natural language processing create an opportunity to develop machine learning tools to automate analysis 
of mine health and safety management systems data without requiring experts at every mine site. If causation and associated details can be
automatically extracted from the safety reports, NLP can be used to quickly gain insight into safety incidents from historical reports that
are filed away in the safety management databases."""
#text3 - Text sample — Adverse Drug Event Definition & NLP Potential Source: Wikipedia
text3 = """Pharmacovigilance, also known as drug safety, is the discipline within pharmaceutical science that addresses the identification, 
evaluation, and mitigation of adverse effects and other drug-related problems associated with pharmaceutical products. A central concern is
adverse drug reactions, defined as harmful and unintended responses to a medicinal product."""
#text4 - Text sample — Medication Error Reports Context Source:Nature
text4 = """Incident reports of medication errors are valuable learning resources for improving patient safety. Pertinent information is often
contained within unstructured free text, which prevents automated analysis and limits the usefulness of these data. NLP can structure this free
text automatically and retrieve relevant past incidents and learning materials"""
#text5 - Text sample — Incident Report Process Source: OHSE
text5 = """Key components of an incident investigation report include: Incident Overview — a brief summary including date, time, location, and
nature of the incident; Background Information — a description of site conditions, equipment, and personnel involved; Summary of Findings — linking
identified root causes and contributing factors; and Recommendations — specific corrective and preventive actions."""
# Process the text
doc1 = nlp(text1)
doc2 = nlp(text2)
doc3 = nlp(text3)
doc4 = nlp(text4)
doc5 = nlp(text5)

print("\n")





[notice] A new release of pip is available: 24.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 4.8 MB/s eta 0:00:03
     --------- ------------------------------ 3.1/12.8 MB 8.8 MB/s eta 0:00:02
     ------------------ --------------------- 6.0/12.8 MB 10.9 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 11.5 MB/s eta 0:00:01
     ----------------------------------- --- 11.8/12.8 MB 11.9 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 11.3 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 24.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip






In [13]:
# 1. Tokenization with Part-of-speech and Dependency Parsing
print("=== Tokenization, POS, and Dependency Parsing ===\n")

docs = {
    "Industrial Safety": doc1,
    "Mining Safety": doc2,
    "Pharmacovigilance": doc3,
    "Medication Errors": doc4,
    "Incident Reporting": doc5
}

for category, doc in docs.items():
    print(f"\n--- {category} ---\n")
    
    for token in doc:
        print(f"{token.text:20} POS: {token.pos_:10} Dep: {token.dep_}")

=== Tokenization, POS, and Dependency Parsing ===


--- Industrial Safety ---

The                  POS: DET        Dep: det
personnel            POS: NOUN       Dep: nsubj
directly             POS: ADV        Dep: advmod
involved             POS: ADJ        Dep: acl
were                 POS: AUX        Dep: ROOT
Alex                 POS: PROPN      Dep: attr
and                  POS: CCONJ      Dep: cc
Maya                 POS: PROPN      Dep: conj
.                    POS: PUNCT      Dep: punct
Initial              POS: ADJ        Dep: amod
inspection           POS: NOUN       Dep: nsubj
indicates            POS: VERB       Dep: ROOT
a                    POS: DET        Dep: det
potential            POS: ADJ        Dep: amod
hydraulic            POS: ADJ        Dep: amod
fluid                POS: NOUN       Dep: compound
leakage              POS: NOUN       Dep: dobj
.                    POS: PUNCT      Dep: punct

                    POS: SPACE      Dep: dep
Documentation        POS

In [15]:
# 2. Named Entity Recognition (NER)
print("=== Named Entities Recognition ===\n")

docs = {
    "Industrial Safety": doc1,
    "Mining Safety": doc2,
    "Pharmacovigilance": doc3,
    "Medication Errors": doc4,
    "Incident Reporting": doc5
}

for category, doc in docs.items():
    print(f"\n--- {category} ---")
    
    for ent in doc.ents:
        print(f"Entity: {ent.text:25} Label: {ent.label_}")

=== Named Entities Recognition ===


--- Industrial Safety ---
Entity: Alex and Maya             Label: WORK_OF_ART
Entity: Documentation of Incident Label: ORG
Entity: Root Cause Analysis:      Label: ORG

--- Mining Safety ---
Entity: NLP                       Label: ORG

--- Pharmacovigilance ---

--- Medication Errors ---
Entity: NLP                       Label: ORG

--- Incident Reporting ---
Entity: Background Information    Label: ORG
Entity: Recommendations           Label: PERSON


In [17]:
# 3. Lemmatization
print("\n=== Lemmatization ===\n")

for category, doc in docs.items():
    print(f"\n--- {category} ---")
    
    for token in doc:
        print(f"{token.text:20} Lemma: {token.lemma_}")


=== Lemmatization ===


--- Industrial Safety ---
The                  Lemma: the
personnel            Lemma: personnel
directly             Lemma: directly
involved             Lemma: involved
were                 Lemma: be
Alex                 Lemma: Alex
and                  Lemma: and
Maya                 Lemma: Maya
.                    Lemma: .
Initial              Lemma: initial
inspection           Lemma: inspection
indicates            Lemma: indicate
a                    Lemma: a
potential            Lemma: potential
hydraulic            Lemma: hydraulic
fluid                Lemma: fluid
leakage              Lemma: leakage
.                    Lemma: .

                    Lemma: 

Documentation        Lemma: Documentation
of                   Lemma: of
Incident             Lemma: Incident
:                    Lemma: :
An                   Lemma: an
incident             Lemma: incident
report               Lemma: report
was                  Lemma: be
prepared             Lem

In [19]:
# 4. Sentence Segmentation
print("=== Sentences ===\n")

for category, doc in docs.items():
    print(f"\n--- {category} ---")
    
    for sent in doc.sents:
        print(sent.text)

=== Sentences ===


--- Industrial Safety ---
The personnel directly involved were Alex and Maya.
Initial inspection indicates a potential hydraulic fluid leakage. 

Documentation of Incident:
An incident report was prepared with details of the event, initial observations, and photographs of the
affected area.
Root Cause Analysis: Preliminary findings suggest that inadequate maintenance led to the hydraulic system’s malfunction

--- Mining Safety ---
To achieve the goal of preventing serious injuries and fatalities, it is important for a mine site to analyze site specific 
mine safety data.
The advances in natural language processing create an opportunity to develop machine learning tools to automate analysis 
of mine health and safety management systems data without requiring experts at every mine site.
If causation and associated details can be
automatically extracted from the safety reports, NLP can be used to quickly gain insight into safety incidents from historical reports that
a

In [21]:
# 5. Stop Word Removal
print("\n=== Stop Word Removal ===\n")

for category, doc in docs.items():
    print(f"\n--- {category} ---")
    
    filtered_tokens = [
        token.text for token in doc
        if not token.is_stop and not token.is_punct
    ]
    
    print("Filtered Tokens:", filtered_tokens)


=== Stop Word Removal ===


--- Industrial Safety ---
Filtered Tokens: ['personnel', 'directly', 'involved', 'Alex', 'Maya', 'Initial', 'inspection', 'indicates', 'potential', 'hydraulic', 'fluid', 'leakage', '\n', 'Documentation', 'Incident', 'incident', 'report', 'prepared', 'details', 'event', 'initial', 'observations', 'photographs', '\n', 'affected', 'area', 'Root', 'Cause', 'Analysis', 'Preliminary', 'findings', 'suggest', 'inadequate', 'maintenance', 'led', 'hydraulic', 'system', 'malfunction']

--- Mining Safety ---
Filtered Tokens: ['achieve', 'goal', 'preventing', 'injuries', 'fatalities', 'important', 'site', 'analyze', 'site', 'specific', '\n', 'safety', 'data', 'advances', 'natural', 'language', 'processing', 'create', 'opportunity', 'develop', 'machine', 'learning', 'tools', 'automate', 'analysis', '\n', 'health', 'safety', 'management', 'systems', 'data', 'requiring', 'experts', 'site', 'causation', 'associated', 'details', '\n', 'automatically', 'extracted', 'safety', 

In [23]:
# 6. Parts of Speech Tagging with Explanation
print("\n=== Parts of Speech (POS) Tagging ===\n")

for category, doc in docs.items():
    print(f"\n--- {category} ---")
    print(f"{'Token':20} {'POS':10} Explanation")
    print("-" * 60)
    
    for token in doc:
        print(f"{token.text:20} {token.pos_:10} {spacy.explain(token.pos_)}")


=== Parts of Speech (POS) Tagging ===


--- Industrial Safety ---
Token                POS        Explanation
------------------------------------------------------------
The                  DET        determiner
personnel            NOUN       noun
directly             ADV        adverb
involved             ADJ        adjective
were                 AUX        auxiliary
Alex                 PROPN      proper noun
and                  CCONJ      coordinating conjunction
Maya                 PROPN      proper noun
.                    PUNCT      punctuation
Initial              ADJ        adjective
inspection           NOUN       noun
indicates            VERB       verb
a                    DET        determiner
potential            ADJ        adjective
hydraulic            ADJ        adjective
fluid                NOUN       noun
leakage              NOUN       noun
.                    PUNCT      punctuation

                    SPACE      space
Documentation        PROPN      prope