# Traditional Based Natural Langauge Processing

## Imported the required library

##### * fitz -> Required to read and extract text from the pdf files
##### * re -> Used for regular expressions to find patterns like ICD-10 codes, CPT codes, or keywords in text.
##### * json -> Used to create and save structured JSON output from the extracted clinical information. 

In [17]:
import fitz  
import re
import json

### Extract text from PDF

In [18]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

### Define keyword lexicons (Traditional NLP)

In [19]:
clinical_terms_keywords = [
    "bleeding", "erosion", "hemorrhoids", "diverticulosis", "polyps", "gastritis",
    "Barrett", "colonoscopy", "biopsy", "melanosis", "proctitis", "ulcer", "pain",
    "chest pain", "abdominal pain", "Barrett's esophagus"
]

anatomical_locations_keywords = [
    "rectum", "anal verge", "anus", "cecum", "sigmoid colon", "ascending colon",
    "transverse colon", "descending colon", "hepatic flexure", "splenic flexure",
    "stomach", "duodenum", "esophagus", "ileum", "colon", "right upper quadrant"
]

procedures_keywords = [
    "colonoscopy", "EGD", "EGD with biopsy", "biopsy", "retroflexion"
]

### Regex for code extraction

In [20]:
def extract_icd_codes(text):
    return list(set(re.findall(r'\b[A-Z]\d{1,2}\.?\d*\b', text)))

def extract_cpt_codes(text):
    return list(set(re.findall(r'\b\d{5}\b', text)))

def extract_hcpcs_codes(text):
    return list(set(re.findall(r'\b[A-Z]\d{4}\b', text)))

### Keyword search using Traditional NLP

In [21]:
def extract_terms(text, keywords):
    found = []
    for k in keywords:
        if re.search(r'\b' + re.escape(k) + r'\b', text, re.IGNORECASE):
            found.append(k)
    return list(set(found))

### Process report into structured JSON

In [22]:
def process_report(text, report_id="Report"):
    return {
        "ReportID": report_id,
        "Clinical Terms": extract_terms(text, clinical_terms_keywords),
        "Anatomical Locations": extract_terms(text, anatomical_locations_keywords),
        "Diagnosis": extract_terms(text, clinical_terms_keywords),
        "Procedures": extract_terms(text, procedures_keywords),
        "ICD-10": extract_icd_codes(text),
        "CPT": extract_cpt_codes(text),
        "HCPCS": extract_hcpcs_codes(text)
    }

### Main Function

In [24]:
def main():
    # Provide your PDF file path here 👇
    pdf_path = "clinicalreport.pdf"

    # Extract text from PDF
    text = extract_text_from_pdf(pdf_path)

    # If your PDF has multiple reports, you can split by keyword like “Report”
    reports = re.split(r'\bReport\s*\d+:', text, flags=re.IGNORECASE)
    reports = [r.strip() for r in reports if r.strip()]

    all_json = []
    for i, report_text in enumerate(reports):
        report_json = process_report(report_text, report_id=f"Report {i+1}")
        all_json.append(report_json)

    # Save to JSON
    with open("clinical_reports_traditional.json", "w") as f:
        json.dump(all_json, f, indent=4)

    # Print neatly
    print(json.dumps(all_json, indent=4))

if __name__ == "__main__":
    main()

[
    {
        "ReportID": "Report 1",
        "Clinical Terms": [
            "polyps",
            "diverticulosis",
            "melanosis",
            "colonoscopy",
            "hemorrhoids"
        ],
        "Anatomical Locations": [
            "colon",
            "cecum",
            "rectum"
        ],
        "Diagnosis": [
            "polyps",
            "diverticulosis",
            "melanosis",
            "colonoscopy",
            "hemorrhoids"
        ],
        "Procedures": [
            "colonoscopy"
        ],
        "ICD-10": [
            "K57.90",
            "Z86.0100",
            "K64.8"
        ],
        "CPT": [],
        "HCPCS": []
    },
    {
        "ReportID": "Report 2",
        "Clinical Terms": [
            "polyps",
            "colonoscopy",
            "hemorrhoids"
        ],
        "Anatomical Locations": [
            "sigmoid colon",
            "ascending colon",
            "descending colon",
            "ileum",
            "cec