In [2]:
import re
import json
import pdfplumber  # pip install pdfplumber

# ----------------------------
# Define keyword lists
# ----------------------------
clinical_terms_keywords = [
    "bleeding", "erosion", "hemorrhoids", "diverticulosis",
    "polyps", "gastritis", "Barrett", "colonoscopy", "biopsy",
    "melanosis", "proctitis"
]

anatomical_locations_keywords = [
    "rectum", "anal verge", "anus", "cecum", "sigmoid colon",
    "ascending colon", "transverse colon", "descending colon",
    "hepatic flexure", "splenic flexure", "stomach", "duodenum",
    "esophagus"
]

procedures_keywords = ["colonoscopy", "EGD", "biopsy", "EGD with biopsy"]

# ----------------------------
# Regex for ICD-10 / CPT codes
# ----------------------------
def extract_icd_codes(text):
    return list(set(re.findall(r'\b[A-Z]\d{1,2}\.?\d*\b', text)))

def extract_cpt_codes(text):
    return list(set(re.findall(r'\b\d{5}\b', text)))

# ----------------------------
# Extract keywords from text
# ----------------------------
def extract_terms(text, keywords):
    return list(set([k for k in keywords if re.search(r'\b' + re.escape(k) + r'\b', text, re.I)]))

# ----------------------------
# Process single report
# ----------------------------
def process_report(text, report_id):
    return {
        "ReportID": report_id,
        "Clinical Terms": extract_terms(text, clinical_terms_keywords),
        "Anatomical Locations": extract_terms(text, anatomical_locations_keywords),
        "Diagnosis": extract_terms(text, clinical_terms_keywords),
        "Procedures": extract_terms(text, procedures_keywords),
        "ICD-10": extract_icd_codes(text),
        "CPT": extract_cpt_codes(text),
        "HCPCS": [],
        "Modifiers": []
    }

# ----------------------------
# Extract text from PDF
# ----------------------------
def extract_text_from_pdf(pdf_path):
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            full_text += page.extract_text() + "\n"
    return full_text

# ----------------------------
# Split reports based on “Report” keyword or section pattern
# ----------------------------
def split_reports(full_text):
    # Assumes reports are separated by 'Report' or 'Diagnosis' headings
    reports = re.split(r'\bReport\s*\d+[:\-]|\bDiagnosis[:\-]', full_text, flags=re.I)
    reports = [r.strip() for r in reports if len(r.strip()) > 50]  # filter empty parts
    return reports

# ----------------------------
# Main execution
# ----------------------------
pdf_path = "D:/Downloads/Input Data for assignment 5.pdf"  # your PDF file
pdf_text = extract_text_from_pdf(pdf_path)
reports_text = split_reports(pdf_text)

json_reports = [process_report(r, f"Report {i+1}") for i, r in enumerate(reports_text)]

# Save to JSON file
with open("clinical_reports_from_pdf.json", "w") as f:
    json.dump(json_reports, f, indent=4)

# Print output nicely
print(json.dumps(json_reports, indent=4))

[
    {
        "ReportID": "Report 1",
        "Clinical Terms": [
            "polyps",
            "diverticulosis",
            "colonoscopy",
            "melanosis",
            "hemorrhoids"
        ],
        "Anatomical Locations": [
            "rectum",
            "cecum"
        ],
        "Diagnosis": [
            "polyps",
            "diverticulosis",
            "colonoscopy",
            "melanosis",
            "hemorrhoids"
        ],
        "Procedures": [
            "colonoscopy"
        ],
        "ICD-10": [
            "K64.8",
            "Z86.0100",
            "K57.90"
        ],
        "CPT": [],
        "HCPCS": [],
        "Modifiers": []
    },
    {
        "ReportID": "Report 2",
        "Clinical Terms": [
            "hemorrhoids",
            "polyps",
            "colonoscopy"
        ],
        "Anatomical Locations": [
            "sigmoid colon",
            "transverse colon",
            "rectum",
            "hepatic flexure",
           

In [3]:
import fitz  # PyMuPDF
import json
import spacy

# ----------------------------
# STEP 1: Load Med7 model
# ----------------------------
# If installed from local tar.gz
nlp = spacy.load("en_core_med7_lg")

# ----------------------------
# STEP 2: Extract text from PDF
# ----------------------------
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

# ----------------------------
# STEP 3: Process text with Med7
# ----------------------------
def extract_medical_entities(text):
    doc = nlp(text)
    entities = {}
    
    # Group entities by label
    for ent in doc.ents:
        label = ent.label_
        if label not in entities:
            entities[label] = []
        entities[label].append(ent.text)
    
    # Remove duplicates
    for label in entities:
        entities[label] = list(set(entities[label]))
    
    return entities

# ----------------------------
# STEP 4: Process PDF reports
# ----------------------------
def process_reports(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    
    # Split multiple reports if needed
    reports = text.split("Report")  # crude split, adjust if needed
    reports = [r.strip() for r in reports if r.strip()]
    
    all_reports = []
    for i, report_text in enumerate(reports):
        report_entities = extract_medical_entities(report_text)
        report_json = {
            "ReportID": f"Report {i+1}",
            "Entities": report_entities
        }
        all_reports.append(report_json)
    
    return all_reports

# ----------------------------
# STEP 5: Save JSON
# ----------------------------
def save_to_json(data, filename="med7_reports.json"):
    with open(filename, "w") as f:
        json.dump(data, f, indent=4)
    print(f"Saved JSON to {filename}")

# ----------------------------
# STEP 6: Main driver
# ----------------------------
if __name__ == "__main__":
    pdf_path = "clinicalreport.pdf"  # Your PDF file path
    reports_data = process_reports(pdf_path)
    save_to_json(reports_data)
    print(json.dumps(reports_data, indent=4))


OSError: [E053] Could not read config file from C:\Users\heman\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\en_core_med7_lg\en_core_med7_lg-0.0.3\config.cfg

In [4]:
import fitz  # PyMuPDF
import spacy
import json
import re

# ---------- 1. Extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

# ---------- 2. Load proper scispaCy NER model ----------
def load_model():
    print("Loading scispaCy NER model...")
    # This model has actual NER for diseases and chemicals
    nlp = spacy.load("en_ner_bc5cdr_md")
    print("✅ Model loaded successfully!")
    return nlp

# ---------- 3. Extract entities ----------
def extract_entities(text, nlp):
    doc = nlp(text)

    result = {
        "Clinical Terms": [],
        "Anatomical Locations": [],
        "Diagnosis": [],
        "Procedures": [],
        "ICD-10": [],
        "CPT": [],
        "HCPCS": []
    }

    # Map scispaCy labels to our JSON categories
    label_map = {
        "DISEASE": "Clinical Terms",
        "CHEMICAL": "Clinical Terms"
    }

    # NER extraction
    for ent in doc.ents:
        label = label_map.get(ent.label_, None)
        if label and ent.text not in result[label]:
            result[label].append(ent.text)

    # Regex-based extraction for codes
    result["ICD-10"] = list(set(re.findall(r"\b[A-Z]\d{2}(?:\.\d+)?\b", text)))
    result["CPT"] = list(set(re.findall(r"\b\d{4,5}\b", text)))
    result["HCPCS"] = list(set(re.findall(r"\bJ\d{4}\b", text)))

    return result

# ---------- 4. Main function ----------
def main():
    # Example text (replace with PDF if needed)
    text = """The patient was diagnosed with myocardial infarction and prescribed aspirin. 
    An echocardiogram showed left ventricular hypertrophy. 
    ICD-10: I21.9, I51.7. CPT: 93306. HCPCS: J0120."""

    nlp = load_model()
    entities = extract_entities(text, nlp)

    # Save JSON
    with open("clinical_output.json", "w") as f:
        json.dump(entities, f, indent=4)

    print("\n✅ Extracted Entities in JSON format:\n")
    print(json.dumps(entities, indent=4))

if __name__ == "__main__":
    main()

Loading scispaCy NER model...


OSError: [E050] Can't find model 'en_ner_bc5cdr_md'. It doesn't seem to be a Python package or a valid path to a data directory.

In [1]:
import fitz  # PyMuPDF
import spacy
import json
import re

# ---------- 1. Extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

# ---------- 2. Load proper scispaCy NER model ----------
def load_model():
    print("Loading scispaCy NER model...")
    nlp = spacy.load("en_ner_bc5cdr_md")  # ✅ This model recognizes diseases and chemicals
    print("✅ Model loaded successfully!")
    return nlp

# ---------- 3. Extract entities ----------
def extract_entities(text, nlp):
    doc = nlp(text)

    result = {
        "Clinical Terms": [],
        "Anatomical Locations": [],
        "Diagnosis": [],
        "Procedures": [],
        "ICD-10": [],
        "CPT": [],
        "HCPCS": []
    }

    # Map scispaCy labels to your JSON categories
    label_map = {
        "DISEASE": "Clinical Terms",
        "CHEMICAL": "Clinical Terms"
    }

    # NER extraction
    for ent in doc.ents:
        label = label_map.get(ent.label_, None)
        if label and ent.text not in result[label]:
            result[label].append(ent.text)

    # Regex-based extraction for codes
    result["ICD-10"] = list(set(re.findall(r"\b[A-Z]\d{2}(?:\.\d+)?\b", text)))
    result["CPT"] = list(set(re.findall(r"\b\d{4,5}\b", text)))
    result["HCPCS"] = list(set(re.findall(r"\bJ\d{4}\b", text)))

    return result

# ---------- 4. Main function ----------
def main():
    # Example text (you can replace this with PDF text)
    text = """The patient was diagnosed with myocardial infarction and prescribed aspirin. 
    An echocardiogram showed left ventricular hypertrophy. 
    ICD-10: I21.9, I51.7. CPT: 93306. HCPCS: J0120."""

    # Load the NER model
    nlp = load_model()

    # Extract entities
    entities = extract_entities(text, nlp)

    # Save JSON
    with open("clinical_output.json", "w") as f:
        json.dump(entities, f, indent=4)

    # Print output
    print("\n✅ Extracted Entities in JSON format:\n")
    print(json.dumps(entities, indent=4))

if __name__ == "__main__":
    main()


Loading scispaCy NER model...


OSError: [E050] Can't find model 'en_ner_bc5cdr_md'. It doesn't seem to be a Python package or a valid path to a data directory.

In [3]:
import fitz  # PyMuPDF
import spacy
import json
import re

# ---------- 1. Extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

# ---------- 2. Load scispaCy model ----------
def load_model():
    print("Loading scispaCy model...")
    nlp = spacy.load("en_core_sci_lg")  # Easy to install, works immediately
    print("✅ Model loaded successfully!")
    return nlp

# ---------- 3. Map entity labels to JSON fields ----------
def map_entity_label(ent_text, ent_label):
    """
    Map scispaCy entity labels to your JSON fields
    """
    label_map = {
        "DISEASE_OR_SYNDROME": "Diagnosis",
        "SYMPTOM": "Clinical Terms",
        "CHEMICAL": "Clinical Terms",
        "ANATOMICAL_SYSTEM": "Anatomical Locations",
        "BODY_PART": "Anatomical Locations",
        "PROCEDURE": "Procedures",
        "TEST": "Procedures",
        "DIAGNOSIS": "Diagnosis"
    }
    return label_map.get(ent_label, None)

# ---------- 4. Extract entities ----------
def extract_entities(text, nlp):
    doc = nlp(text)

    result = {
        "Clinical Terms": [],
        "Anatomical Locations": [],
        "Diagnosis": [],
        "Procedures": [],
        "ICD-10": [],
        "CPT": [],
        "HCPCS": []
    }

    for ent in doc.ents:
        mapped_field = map_entity_label(ent.text, ent.label_)
        if mapped_field:
            if ent.text not in result[mapped_field]:
                result[mapped_field].append(ent.text)

    # Regex-based extraction for codes
    result["ICD-10"] = list(set(re.findall(r"\b[A-Z]\d{2}(?:\.\d+)?\b", text)))
    result["CPT"] = list(set(re.findall(r"\b\d{4,5}\b", text)))
    result["HCPCS"] = list(set(re.findall(r"\bJ\d{4}\b", text)))

    return result

# ---------- 5. Main function ----------
def main():
    text = """Atypical chest pain. Right upper quadrant (RUQ) abdominal pain. 
    Mild gastritis. Barrett’s esophagus. Biopsies for H. pylori. 
    No ulcers or masses. EGD (esophagogastroduodenoscopy) performed. 
    ICD-10: R07.89, R10.11, K29.70. CPT: 43239. HCPCS: J3490."""

    nlp = load_model()
    entities = extract_entities(text, nlp)

    # Save JSON
    with open("clinical_output.json", "w") as f:
        json.dump(entities, f, indent=4)

    # Print output
    print("\n✅ Extracted Entities in JSON format:\n")
    print(json.dumps(entities, indent=4))

if __name__ == "__main__":
    main()


Loading scispaCy model...
✅ Model loaded successfully!

✅ Extracted Entities in JSON format:

{
    "Clinical Terms": [],
    "Anatomical Locations": [],
    "Diagnosis": [],
    "Procedures": [],
    "ICD-10": [
        "K29.70",
        "R10.11",
        "R07.89"
    ],
    "CPT": [
        "43239"
    ],
    "HCPCS": [
        "J3490"
    ]
}


In [7]:
import fitz  # PyMuPDF
import spacy
import json
import re

# ---------- 1. Extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

# ---------- 2. Load scispaCy model ----------
def load_model():
    print("Loading scispaCy model...")
    nlp = spacy.load("en_core_sci_lg")
    print("✅ Model loaded successfully!")
    return nlp

# ---------- 3. Keywords for better extraction ----------
CLINICAL_TERMS_KEYWORDS = [
    "chest pain", "gastritis", "barrett’s esophagus", "ulcers", "myocardial infarction", "asthma", "diabetes", "hypertension"
]
PROCEDURES_KEYWORDS = [
    "EGD", "endoscopy", "colonoscopy", "biopsy", "echocardiogram"
]
ANATOMICAL_LOCATIONS_KEYWORDS = [
    "right upper quadrant", "left ventricle", "abdominal", "esophagus", "rectum"
]
DIAGNOSIS_KEYWORDS = [
    "gastritis", "barrett’s esophagus", "myocardial infarction", "hypertension", "diabetes"
]

# ---------- 4. Extract entities ----------
def extract_entities(text, nlp):
    doc = nlp(text)

    result = {
        "Clinical Terms": [],
        "Anatomical Locations": [],
        "Diagnosis": [],
        "Procedures": [],
        "ICD-10": [],
        "CPT": [],
        "HCPCS": []
    }

    # 1️⃣ Add NER entities (model may catch some)
    for ent in doc.ents:
        # We put everything in Clinical Terms first
        if ent.text not in result["Clinical Terms"]:
            result["Clinical Terms"].append(ent.text)

    # 2️⃣ Add keywords for missing entries
    for kw in CLINICAL_TERMS_KEYWORDS:
        if re.search(r"\b" + re.escape(kw) + r"\b", text, re.IGNORECASE):
            if kw not in result["Clinical Terms"]:
                result["Clinical Terms"].append(kw)

    for kw in PROCEDURES_KEYWORDS:
        if re.search(r"\b" + re.escape(kw) + r"\b", text, re.IGNORECASE):
            if kw not in result["Procedures"]:
                result["Procedures"].append(kw)

    for kw in ANATOMICAL_LOCATIONS_KEYWORDS:
        if re.search(r"\b" + re.escape(kw) + r"\b", text, re.IGNORECASE):
            if kw not in result["Anatomical Locations"]:
                result["Anatomical Locations"].append(kw)

    for kw in DIAGNOSIS_KEYWORDS:
        if re.search(r"\b" + re.escape(kw) + r"\b", text, re.IGNORECASE):
            if kw not in result["Diagnosis"]:
                result["Diagnosis"].append(kw)

    # 3️⃣ Regex extraction for codes
    result["ICD-10"] = list(set(re.findall(r"\b[A-Z]\d{2}(?:\.\d+)?\b", text)))
    result["CPT"] = list(set(re.findall(r"\b\d{5}\b", text)))
    result["HCPCS"] = list(set(re.findall(r"\bJ\d{4}\b", text)))

    return result

# ---------- 5. Main function ----------
def main():
    text = """Report 1: 
Diagnosis: 
Z86.0100 History of colon polyps 
Z86.0100 - Personal history of colonic polyps K64.8 - Internal hemorrhoids K57.90 – 
Diverticulosis 
Procedure: 
Procedure Code Colonoscopy 
Anesthesia Type : Monitored Anesthesia Care ASA Class : II  
Lactated Ringers - Solution, Intravenous as directed - 350 00 , Last Administered By: 
Smith, George At 1041 on 07/07/2025 Lidocaine HCI 2 % Solution, IV - 20 00 , Last 
Administered By: Smith, George At 1023 on 07/07/2025 Propofol 500 MG/50ML 
Emulsion, Intravenous - 240 00 , Last Administered By: Smith, George At 1041 on 
07/07/2025 
Colonoscopy PROCEDURE : There was nothing precluding endoscopy on history or 
physical exam. Informed consent was obtained with risks and benefits explained to the 
patient. The patient tolerated the procedure well. There no immediate complications. 
The patient was placed in left lateral decubitus position. A rectal exam was performed. 
The pediatric colonoscope was inserted into the rectum and carefully advanced to the 
cecum. The cecum was identified by the ileocecal valve, the triradiate fold and 
appendiceal orifice. Careful inspection was made as the colonoscope was removed 
including retroflexion in the rectum. Findings- The preparation was good. There was 
melanosis coli in the proximal colon. There was moderate sigmoid diverticulosis. 
Internal hemorrhoids were seen. IMPRESSION : The patient is an 82-year-old female 
with history of colon polyps. Today's exam did not reveal any polyps. she did have 
melanosis coli, diverticulosis and internal hemorrhoids. PLAN : No routine colonoscopy 
Colonoscopy The patient tolerated the procedure without complications .The 
colonoscopy was uneventful gastic
"""

    nlp = load_model()
    entities = extract_entities(text, nlp)

    # Save JSON
    with open("clinical_output.json", "w") as f:
        json.dump(entities, f, indent=4)

    # Print JSON output
    print("\n✅ Extracted Entities in JSON format:\n")
    print(json.dumps(entities, indent=4))

if __name__ == "__main__":
    main()


Loading scispaCy model...
✅ Model loaded successfully!

✅ Extracted Entities in JSON format:

{
    "Clinical Terms": [
        "Report",
        "Diagnosis",
        "Z86.0100",
        "History of colon polyps",
        "Personal history",
        "colonic polyps",
        "K64.8 - Internal hemorrhoids K57.90",
        "Diverticulosis",
        "Procedure",
        "Procedure Code",
        "Colonoscopy",
        "Anesthesia Type",
        "Monitored Anesthesia Care",
        "ASA Class : II  \n",
        "Lactated Ringers",
        "Solution",
        "Intravenous",
        "directed",
        "Administered",
        "Smith",
        "George",
        "Lidocaine",
        "IV",
        "Propofol",
        "MG/50ML",
        "Emulsion",
        "PROCEDURE",
        "endoscopy",
        "history",
        "physical exam",
        "Informed consent",
        "risks",
        "benefits",
        "patient",
        "tolerated",
        "procedure",
        "complications",
        "left 

In [8]:
# Install necessary libraries if not installed
# pip install transformers torch

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import json

# ------------------- 1. Load BioBERT NER Model -------------------
model_name = "d4data/biobert_v1.1_pubmed_ner"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# ------------------- 2. Define report category rules -------------------
# You can expand this mapping based on your project
REPORT_CATEGORIES = {
    "Cardiology": ["hypertension", "myocardial infarction", "echocardiogram", "left ventricle"],
    "Gastroenterology": ["gastritis", "colonoscopy", "barrett’s esophagus", "ulcers", "diverticulosis"],
    "Endocrinology": ["diabetes", "metformin", "insulin"]
}

# ------------------- 3. Extract entities -------------------
def extract_entities(text):
    entities = ner_pipeline(text)
    
    result = {
        "Clinical Terms": [],
        "Procedures": [],
        "Diagnosis": [],
    }

    # Map entities to categories
    for ent in entities:
        label = ent['entity_group']
        word = ent['word']
        
        if label in ['DISEASE', 'Disease', 'Condition']:
            if word not in result["Clinical Terms"]:
                result["Clinical Terms"].append(word)
        elif label in ['PROCEDURE', 'Surgery', 'Treatment']:
            if word not in result["Procedures"]:
                result["Procedures"].append(word)
        elif label in ['DIAGNOSIS', 'Finding']:
            if word not in result["Diagnosis"]:
                result["Diagnosis"].append(word)

    return result

# ------------------- 4. Classify report -------------------
def classify_report(entities):
    # Count matches per category
    scores = {category: 0 for category in REPORT_CATEGORIES}
    all_terms = entities["Clinical Terms"] + entities["Procedures"] + entities["Diagnosis"]
    
    for category, keywords in REPORT_CATEGORIES.items():
        for term in all_terms:
            term_lower = term.lower()
            for kw in keywords:
                if kw.lower() in term_lower:
                    scores[category] += 1
    
    # Pick category with highest matches
    predicted_category = max(scores, key=scores.get)
    return predicted_category if scores[predicted_category] > 0 else "Unknown"

# ------------------- 5. Main -------------------
def main():
    medical_text = """
    Patient was diagnosed with hypertension and prescribed Metformin. 
    A laparoscopic appendectomy was performed last week. 
    Follow-up diagnosis shows improvement in diabetes symptoms.
    """

    entities = extract_entities(medical_text)
    report_category = classify_report(entities)

    output = {
        "Entities": entities,
        "Predicted_Report_Category": report_category
    }

    # Save JSON
    with open("clinical_report_output.json", "w") as f:
        json.dump(output, f, indent=4)

    # Print JSON output
    print(json.dumps(output, indent=4))

if __name__ == "__main__":
    main()


OSError: d4data/biobert_v1.1_pubmed_ner is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

In [9]:
import fitz  # PyMuPDF
import spacy
import json
import re

# ---------- 1. Extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

# ---------- 2. Load scispaCy model ----------
def load_model():
    print("Loading scispaCy model...")
    nlp = spacy.load("en_core_sci_lg")
    print("✅ Model loaded successfully!")
    return nlp

# ---------- 3. Load your JSON format as reference ----------
def load_reference_json(json_file):
    with open(json_file, "r") as f:
        data = json.load(f)

    reference = {
        "Clinical Terms": set(),
        "Anatomical Locations": set(),
        "Diagnosis": set(),
        "Procedures": set(),
        "ICD-10": set(),
        "CPT": set(),
        "HCPCS": set()
    }

    for entry in data:
        for key in reference.keys():
            reference[key].update(entry.get(key, []))

    return reference

# ---------- 4. Extract entities using NER + JSON keywords + regex ----------
def extract_entities(text, nlp, reference):
    doc = nlp(text)

    result = {key: [] for key in reference.keys()}

    # 1️⃣ NER extraction (put all recognized entities into Clinical Terms)
    for ent in doc.ents:
        if ent.text not in result["Clinical Terms"]:
            result["Clinical Terms"].append(ent.text)

    # 2️⃣ JSON keyword matching
    for category, items in reference.items():
        for kw in items:
            if re.search(r"\b" + re.escape(kw) + r"\b", text, re.IGNORECASE):
                if kw not in result[category]:
                    result[category].append(kw)

    # 3️⃣ Regex for codes (in case they appear in text but not in JSON)
    result["ICD-10"] = list(set(result["ICD-10"]) | set(re.findall(r"\b[A-Z]\d{2}(?:\.\d+)?\b", text)))
    result["CPT"] = list(set(result["CPT"]) | set(re.findall(r"\b\d{5}\b", text)))
    result["HCPCS"] = list(set(result["HCPCS"]) | set(re.findall(r"\b[JAB]\d{4}\b", text)))

    return result

# ---------- 5. Main ----------
def main():
    pdf_path = "clinicalreport.pdf"   # Replace with your PDF path
    json_reference = "json.txt"  # Your JSON file (the one you sent earlier)

    nlp = load_model()
    reference = load_reference_json(json_reference)
    text = extract_text_from_pdf(pdf_path)
    entities = extract_entities(text, nlp, reference)

    # Save output
    with open("clinical_output.json", "w") as f:
        json.dump(entities, f, indent=4)

    print("\n✅ Extracted Entities:\n")
    print(json.dumps(entities, indent=4))

if __name__ == "__main__":
    main()


Loading scispaCy model...
✅ Model loaded successfully!

✅ Extracted Entities:

{
    "Clinical Terms": [
        "Report",
        "Diagnosis",
        "Z86.0100",
        "History of colon polyps",
        "Personal history",
        "colonic polyps",
        "K64.8 - Internal hemorrhoids K57.90",
        "Diverticulosis",
        "Procedure",
        "Procedure Code",
        "Anesthesia",
        "Monitored Anesthesia Care",
        "ASA Class",
        "II  \n \n",
        "Lactated Ringers",
        "Solution",
        "Intravenous",
        "directed",
        "Administered",
        "Smith",
        "George",
        "Lidocaine",
        "IV",
        "Propofol",
        "MG/50ML",
        "Emulsion",
        "Colonoscopy",
        "PROCEDURE",
        "endoscopy",
        "history",
        "physical exam",
        "Informed consent",
        "risks",
        "bene\ufb01ts",
        "patient",
        "tolerated",
        "procedure",
        "complications",
        "left late

In [11]:
import fitz  # PyMuPDF
import spacy
import json
import re
import os

# ---------- 1. Extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

# ---------- 2. Load scispaCy model ----------
def load_model():
    print("Loading scispaCy model...")
    nlp = spacy.load("en_core_sci_lg")
    print("✅ Model loaded successfully!")
    return nlp

# ---------- 3. Load JSON from TXT ----------
def load_json_from_txt(txt_file):
    with open(txt_file, "r") as f:
        content = f.read()
        data = json.loads(content)  # Parse as JSON
    return data

# ---------- 4. Build reference keywords ----------
def build_reference_keywords(data):
    reference = {key: set() for key in ["Clinical Terms", "Anatomical Locations", "Diagnosis", "Procedures", "ICD-10", "CPT", "HCPCS"]}
    for entry in data:
        for key in reference.keys():
            reference[key].update(entry.get(key, []))
    return reference

# ---------- 5. Extract entities ----------
def extract_entities(text, nlp, reference):
    doc = nlp(text)
    result = {key: [] for key in reference.keys()}

    # NER extraction
    for ent in doc.ents:
        if ent.text not in result["Clinical Terms"]:
            result["Clinical Terms"].append(ent.text)

    # Keyword matching from JSON reference
    for category, items in reference.items():
        for kw in items:
            if re.search(r"\b" + re.escape(kw) + r"\b", text, re.IGNORECASE):
                if kw not in result[category]:
                    result[category].append(kw)

    # Regex extraction for codes
    result["ICD-10"] = list(set(result["ICD-10"]) | set(re.findall(r"\b[A-Z]\d{2}(?:\.\d+)?\b", text)))
    result["CPT"] = list(set(result["CPT"]) | set(re.findall(r"\b\d{5}\b", text)))
    result["HCPCS"] = list(set(result["HCPCS"]) | set(re.findall(r"\b[JAB]\d{4}\b", text)))

    return result

# ---------- 6. Process multiple PDFs ----------
def process_reports(pdf_folder, txt_json_file, output_file="merged_clinical_output.json"):
    nlp = load_model()
    json_data = load_json_from_txt(txt_json_file)
    reference = build_reference_keywords(json_data)

    all_results = {}

    for filename in os.listdir(pdf_folder):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            print(f"Processing report: {filename}")
            text = extract_text_from_pdf(pdf_path)
            entities = extract_entities(text, nlp, reference)
            all_results[filename] = entities

    # Save merged results
    with open(output_file, "w") as f:
        json.dump(all_results, f, indent=4)

    print(f"\n✅ All reports processed! Merged JSON saved as '{output_file}'.")

# ---------- 7. Main ----------
if __name__ == "__main__":
    pdf_folder = "clinicalreport.pdf"       # Folder containing all report PDFs
    txt_json_file = "json.txt"  # TXT file containing JSON reference
    process_reports(pdf_folder, txt_json_file)


Loading scispaCy model...
✅ Model loaded successfully!


NotADirectoryError: [WinError 267] The directory name is invalid: 'clinicalreport.pdf'

In [1]:
import fitz  # PyMuPDF
import spacy
import json
import re

# ---------- 1. Extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

# ---------- 2. Load scispaCy model ----------
def load_model():
    print("Loading scispaCy model...")
    nlp = spacy.load("en_core_sci_lg")
    print("✅ Model loaded successfully!")
    return nlp

# ---------- 3. Load JSON reference from TXT ----------
def load_json_from_txt(txt_file):
    with open(txt_file, "r") as f:
        content = f.read()
        data = json.loads(content)  # Parse JSON
    return data

# ---------- 4. Build reference keywords ----------
def build_reference_keywords(data):
    reference = {key: set() for key in ["Clinical Terms", "Anatomical Locations", "Diagnosis", "Procedures", "ICD-10", "CPT", "HCPCS"]}
    for entry in data:
        for key in reference.keys():
            reference[key].update(entry.get(key, []))
    return reference

# ---------- 5. Process a single report ----------
def process_report(text, nlp, reference, report_id=None):
    doc = nlp(text)
    result = {key: [] for key in reference.keys()}
    if report_id:
        result["Report ID"] = report_id

    # 1️⃣ NER extraction
    for ent in doc.ents:
        if ent.text not in result["Clinical Terms"]:
            result["Clinical Terms"].append(ent.text)

    # 2️⃣ Keyword extraction using JSON reference
    for category, items in reference.items():
        for kw in items:
            if re.search(r"\b" + re.escape(kw) + r"\b", text, re.IGNORECASE):
                if kw not in result[category]:
                    result[category].append(kw)

    # 3️⃣ Regex for codes if not present in JSON
    result["ICD-10"] = list(set(result["ICD-10"]) | set(re.findall(r"\b[A-Z]\d{2}(?:\.\d+)?\b", text)))
    result["CPT"] = list(set(result["CPT"]) | set(re.findall(r"\b\d{5}\b", text)))
    result["HCPCS"] = list(set(result["HCPCS"]) | set(re.findall(r"\b[JAB]\d{4}\b", text)))

    return result

# ---------- 6. Main ----------
def main():
    pdf_path = "clinicalreport.pdf"  # Your PDF
    txt_json_file = "json.txt"  # TXT file with JSON reference

    nlp = load_model()
    json_data = load_json_from_txt(txt_json_file)
    reference = build_reference_keywords(json_data)

    # Extract text from PDF
    text = extract_text_from_pdf(pdf_path)

    # Split into individual reports
    reports = re.split(r'\bReport\s*\d+:', text, flags=re.IGNORECASE)
    reports = [r.strip() for r in reports if r.strip()]

    all_json = []
    for i, report_text in enumerate(reports):
        report_json = process_report(report_text, nlp, reference, report_id=f"Report {i+1}")
        all_json.append(report_json)

    # Save all reports to JSON
    with open("clinical_reports.json", "w") as f:
        json.dump(all_json, f, indent=4)

    print("\n✅ Extracted reports JSON:\n")
    print(json.dumps(all_json, indent=4))

if __name__ == "__main__":
    main()


Loading scispaCy model...


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


✅ Model loaded successfully!

✅ Extracted reports JSON:

[
    {
        "Clinical Terms": [
            "Diagnosis",
            "Z86.0100",
            "History of colon polyps",
            "Personal history",
            "colonic polyps",
            "K64.8 - Internal hemorrhoids K57.90",
            "Diverticulosis",
            "Procedure",
            "Procedure Code",
            "Anesthesia",
            "Monitored Anesthesia Care",
            "ASA Class",
            "II  \n \n",
            "Lactated Ringers",
            "Solution",
            "Intravenous",
            "directed",
            "Administered",
            "Smith",
            "George",
            "Lidocaine",
            "IV",
            "Propofol",
            "MG/50ML",
            "Emulsion",
            "Colonoscopy",
            "PROCEDURE",
            "endoscopy",
            "history",
            "physical exam",
            "Informed consent",
            "risks",
            "bene\ufb01ts",
  

In [2]:
import fitz  # PyMuPDF
import spacy
import json
import re

# ---------- 1. Extract text from PDF ----------
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

# ---------- 2. Load scispaCy model ----------
def load_model():
    print("Loading scispaCy model...")
    nlp = spacy.load("en_core_sci_lg")  # Or any lightweight model if memory issues
    print("✅ Model loaded successfully!")
    return nlp

# ---------- 3. Load JSON reference from TXT ----------
def load_json_from_txt(txt_file):
    with open(txt_file, "r", encoding="utf-8") as f:
        content = f.read()
        data = json.loads(content)  # Parse JSON
    return data

# ---------- 4. Build reference keywords ----------
def build_reference_keywords(data):
    reference = {key: set() for key in ["Clinical Terms", "Anatomical Locations", "Diagnosis", "Procedures", "ICD-10", "CPT", "HCPCS"]}
    for entry in data:
        for key in reference.keys():
            reference[key].update(entry.get(key, []))
    return reference

# ---------- 5. Process a single report ----------
def process_report(text, nlp, reference, report_id=None):
    doc = nlp(text)
    result = {key: [] for key in reference.keys()}
    if report_id:
        result["Report ID"] = report_id

    # 1️⃣ NER extraction
    for ent in doc.ents:
        if ent.text not in result["Clinical Terms"]:
            result["Clinical Terms"].append(ent.text)

    # 2️⃣ Keyword extraction using JSON reference
    for category, items in reference.items():
        for kw in items:
            if re.search(r"\b" + re.escape(kw) + r"\b", text, re.IGNORECASE):
                if kw not in result[category]:
                    result[category].append(kw)

    # 3️⃣ Regex for codes if not present in JSON
    result["ICD-10"] = list(set(result["ICD-10"]) | set(re.findall(r"\b[A-Z]\d{2}(?:\.\d+)?\b", text)))
    result["CPT"] = list(set(result["CPT"]) | set(re.findall(r"\b\d{5}\b", text)))
    result["HCPCS"] = list(set(result["HCPCS"]) | set(re.findall(r"\b[JAB]\d{4}\b", text)))

    # 4️⃣ Remove duplicates from Clinical Terms if present in other categories
    other_fields = set(result["Diagnosis"] + result["Procedures"] + result["ICD-10"] + result["CPT"] + result["HCPCS"])
    result["Clinical Terms"] = [term for term in result["Clinical Terms"] if term not in other_fields]

    return result

# ---------- 6. Main ----------
def main():
    pdf_path = "clinicalreport.pdf"  # Your PDF path
    txt_json_file = "json.txt"       # TXT file containing JSON reference

    # Load NLP model
    nlp = load_model()

    # Load reference JSON
    json_data = load_json_from_txt(txt_json_file)
    reference = build_reference_keywords(json_data)

    # Extract text from PDF
    text = extract_text_from_pdf(pdf_path)

    # Split into individual reports (assuming "Report 1:", "Report 2:", etc.)
    reports = re.split(r'\bReport\s*\d+:', text, flags=re.IGNORECASE)
    reports = [r.strip() for r in reports if r.strip()]

    all_json = []
    for i, report_text in enumerate(reports):
        report_json = process_report(report_text, nlp, reference, report_id=f"Report {i+1}")
        all_json.append(report_json)

    # Save all reports to JSON
    with open("clinical_reports.json", "w", encoding="utf-8") as f:
        json.dump(all_json, f, indent=4)

    print("\n✅ Extracted reports JSON:\n")
    print(json.dumps(all_json, indent=4))

if __name__ == "__main__":
    main()


Loading scispaCy model...
✅ Model loaded successfully!

✅ Extracted reports JSON:

[
    {
        "Clinical Terms": [
            "Diagnosis",
            "History of colon polyps",
            "Personal history",
            "colonic polyps",
            "K64.8 - Internal hemorrhoids K57.90",
            "Diverticulosis",
            "Procedure",
            "Procedure Code",
            "Anesthesia",
            "Monitored Anesthesia Care",
            "ASA Class",
            "II  \n \n",
            "Lactated Ringers",
            "Solution",
            "Intravenous",
            "directed",
            "Administered",
            "Smith",
            "George",
            "Lidocaine",
            "IV",
            "Propofol",
            "MG/50ML",
            "Emulsion",
            "PROCEDURE",
            "endoscopy",
            "history",
            "physical exam",
            "Informed consent",
            "risks",
            "bene\ufb01ts",
            "patient",
    