In [8]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install torch --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# Cell 1: Installations
# %pip (unlike !pip) installs into the environment of the running kernel.
# NOTE(review): this cell's recorded output shows the en_core_sci_sm v0.5.1 model
# requiring spacy<3.5.0, which pip tried to build from source on CPython 3.12 and
# failed. Confirm a compatible Python / model release before relying on this cell.
%pip install scispacy
%pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
%pip install neo4j




[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz (15.9 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting spacy<3.5.0,>=3.4.1 (from en_core_sci_sm==0.5.1)
  Using cached spacy-3.4.4.tar.gz (1.2 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'


  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [530 lines of output]
      Collecting setuptools
        Using cached setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
      Collecting cython<3.0,>=0.25
        Using cached Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Using cached cymem-2.0.11-cp312-cp312-win_amd64.whl.metadata (8.8 kB)
      Collecting preshed<3.1.0,>=3.0.2
        Using cached preshed-3.0.10-cp312-cp312-win_amd64.whl.metadata (2.5 kB)
      Collecting murmurhash<1.1.0,>=0.28.0
        Using cached murmurhash-1.0.13-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
      Collecting thinc<8.2.0,>=8.1.0
        Using cached thinc-8.1.12.tar.gz (190 kB)
        Installing build dependencies: started
        Installing build dependencies: finished with status 'done'
        Getting requirements to build wheel: started
        Getti




[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import csv
import os
import torch
from transformers import pipeline
import re

# --- YOUR PREDEFINED SCHEMA ---
# Node labels the graph is allowed to contain.
ENTITIES = [
    'Patient',
    'Condition',
    'Procedure',
    'Medication',
    'Allergy',
]
# Relationship types that link the Patient node to the other nodes.
RELATIONS = [
    'has_condition',
    'underwent_procedure',
    'prescribed_medication',
    'has_allergy',
]

# --- UNSTRUCTURED TEXT ---
# Sample clinical note fed to the extractor in the main block.
PATIENT_TEXT = """
A 40-year-old male, John Doe, presented with complaints of mild fatigue and was
subsequently diagnosed with Hypertension. His cholesterol levels were elevated,
indicating High Cholesterol. The patient mentioned a past surgical history of an
Appendectomy. He is currently on a daily dose of Aspirin. Mr. Doe also noted
a known allergy to Penicillin, which causes a severe rash.
"""

def _find_patient_name(text):
    """Return the name captured by the ``male, <name>,`` pattern in ``text``.

    The pattern carries no word boundary, so it also matches inside
    ``female, <name>,``. Falls back to "Unknown Patient" when nothing matches.
    """
    match = re.search(r"male, ([\w\s]+),", text)
    return match.group(1).strip() if match else "Unknown Patient"


def _find_allergen(text, ner_results):
    """Return the word of the chemical/gene entity nearest the first "allergy" mention.

    Returns None when ``text`` never contains "allergy" (case-insensitive) or when
    ``ner_results`` holds no CHEMICAL / GENE_OR_GENE_PRODUCT entity.
    """
    allergy_pos = text.lower().find("allergy")
    if allergy_pos == -1:
        return None
    candidates = [
        ent for ent in ner_results
        if ent['entity_group'] in ('CHEMICAL', 'GENE_OR_GENE_PRODUCT')
    ]
    if not candidates:
        return None
    # min() keeps the first of several equally distant entities.
    return min(candidates, key=lambda ent: abs(ent['start'] - allergy_pos))['word']


def _map_entity(entity_text, entity_label, allergen, entities_schema):
    """Translate a model label into ``(node_label, relation_type)``.

    Returns ``(None, None)`` when the model label has no mapping or the target
    node label is absent from ``entities_schema``.
    """
    if entity_label == "DISEASE" and 'Condition' in entities_schema:
        return 'Condition', 'has_condition'
    if entity_label == "CHEMICAL":
        if entity_text == allergen and 'Allergy' in entities_schema:
            return 'Allergy', 'has_allergy'
        if 'Medication' in entities_schema:
            return 'Medication', 'prescribed_medication'
    # Note: no label is mapped to "Procedure"; that requires a different model.
    return None, None


def extract_information_hf_public(text, entities_schema, relations_schema,
                                  model_name="dslim/bert-base-NER"):
    """
    Extracts a patient-centred graph from free text with a public Hugging Face
    NER model.

    A Patient node is always created (named "Unknown Patient" when no name is
    found). Entities the model labels DISEASE or CHEMICAL become Condition,
    Medication or Allergy nodes linked to the patient. Nodes and relationships
    are each emitted once, even when an entity is mentioned several times.

    Args:
        text: The clinical note to analyse.
        entities_schema: Collection of allowed node labels; a mapping is skipped
            when its node label is not in this collection.
        relations_schema: Accepted for interface compatibility; currently not
            consulted.
        model_name: Hugging Face model identifier passed to ``pipeline``. The
            default is a general-purpose NER checkpoint (PER/ORG/LOC/MISC labels
            per its model card), so it yields no DISEASE/CHEMICAL entities; pass a
            biomedical model emitting those labels to get Condition/Medication/
            Allergy nodes.

    Returns:
        A ``(nodes, relationships)`` tuple. ``nodes`` is a list of dicts with keys
        'id', 'name', 'label'; ``relationships`` is a list of dicts with keys
        'start_id', 'type', 'end_id'.
    """
    # --- Check for CUDA and set the device ---
    device = 0 if torch.cuda.is_available() else -1
    if device == 0:
        print("✅ CUDA GPU found. Running on GPU for faster performance.")
    else:
        print("⚠️ CUDA GPU not found. Running on CPU. This may be slow.")

    # Report the model actually loaded instead of a fixed (and wrong) model family.
    print(f"🔬 Initializing Hugging Face NER pipeline with model '{model_name}'...")

    ner_pipeline = pipeline(
        "ner",
        model=model_name,
        aggregation_strategy="simple",
        device=device
    )
    print("  - Pipeline loaded. Starting extraction...")

    ner_results = ner_pipeline(text)

    patient_name = _find_patient_name(text)
    patient_id = f"p_{patient_name.replace(' ', '_')}"
    nodes = [{'id': patient_id, 'name': patient_name, 'label': 'Patient'}]
    seen_nodes = {patient_id}
    print(f"  - Identified Patient: {patient_name}")

    # Smarter Allergy Detection
    allergen = _find_allergen(text, ner_results)
    print(f"  - Detected Allergen: {allergen}")

    relationships = []
    seen_relationships = set()

    for entity in ner_results:
        entity_text = entity['word']
        entity_label = entity['entity_group']

        node_label, relation_type = _map_entity(
            entity_text, entity_label, allergen, entities_schema
        )
        if not (node_label and relation_type):
            continue

        clean_entity_text = entity_text.replace(' ##', '')
        node_id = f"{node_label[0].lower()}_{clean_entity_text.replace(' ', '_')}"

        if node_id not in seen_nodes:
            nodes.append({'id': node_id, 'name': clean_entity_text, 'label': node_label})
            seen_nodes.add(node_id)

        # A repeated mention must not produce a duplicate relationship row.
        relationship_key = (patient_id, relation_type, node_id)
        if relationship_key in seen_relationships:
            continue
        seen_relationships.add(relationship_key)
        relationships.append({'start_id': patient_id, 'type': relation_type, 'end_id': node_id})
        print(f"    -> Mapped: ({clean_entity_text}, Model Label: {entity_label}, Your Label: {node_label})")

    if ner_results and not relationships:
        # Surfaces a model/label mismatch instead of silently returning only the patient.
        model_labels = sorted({ent['entity_group'] for ent in ner_results})
        print(f"⚠️ No entity matched the schema mapping; model '{model_name}' produced labels: {model_labels}")

    print("✅ Hugging Face Extraction complete.")
    return nodes, relationships


def save_for_bulk_import(nodes, relationships, output_dir="."):
    """Write nodes and relationships to `nodes.csv` and `relationships.csv`.

    The files use the header names `:ID`, `name`, `:LABEL` and
    `:START_ID`, `:TYPE`, `:END_ID` respectively. Existing files are overwritten.

    Args:
        nodes: Iterable of dicts with keys 'id', 'name' and 'label'.
        relationships: Iterable of dicts with keys 'start_id', 'type' and 'end_id'.
        output_dir: Directory that receives the two files; created if missing.
            Defaults to the current working directory.
    """
    print("📄 Saving data into robust CSV format...")
    os.makedirs(output_dir, exist_ok=True)

    node_rows = [{':ID': n['id'], 'name': n['name'], ':LABEL': n['label']} for n in nodes]
    # newline='' lets the csv module control line endings itself.
    with open(os.path.join(output_dir, 'nodes.csv'), 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=[':ID', 'name', ':LABEL'])
        writer.writeheader()
        writer.writerows(node_rows)
    print("  - `nodes.csv` created successfully.")

    rel_rows = [
        {':START_ID': r['start_id'], ':TYPE': r['type'], ':END_ID': r['end_id']}
        for r in relationships
    ]
    with open(os.path.join(output_dir, 'relationships.csv'), 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=[':START_ID', ':TYPE', ':END_ID'])
        writer.writeheader()
        writer.writerows(rel_rows)
    print("  - `relationships.csv` created successfully.")


# --- MAIN EXECUTION BLOCK ---
if __name__ == "__main__":
    nodes, relationships = extract_information_hf_public(PATIENT_TEXT, ENTITIES, RELATIONS)

    # The extractor always emits a Patient node (falling back to "Unknown Patient"),
    # so `nodes` is never empty; an empty relationship list is what actually
    # signals that nothing was extracted from the text.
    if not relationships:
        print("\nNo information was extracted. Exiting.")
    else:
        print("\nExtracted Nodes:", [n['name'] for n in nodes])
        print("Extracted Relationships:", len(relationships), "total")
        save_for_bulk_import(nodes, relationships)

⚠️ CUDA GPU not found. Running on CPU. This may be slow.
🔬 Initializing Hugging Face pipeline with a public BioBERT model...


OSError: dmis-lab/biobert-base-cased-v1.2-ner is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`