In [1]:
import pandas as pd
import os

# Load the entity-extraction results from processed_df.csv (path is relative to
# the notebook's working directory).
extracted_entities = pd.read_csv("processed_df.csv")

# Preview the first rows to confirm the file parsed as expected.
print(extracted_entities.head())

          note_id                                              input  \
0  16002318-DS-17  <SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...   
1   15638884-DS-4  <SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...   
2  12435705-DS-14  <SEX> M <SERVICE> MEDICINE <ALLERGIES> ibuprof...   
3   12413577-DS-4  <SEX> F <SERVICE> OBSTETRICS/GYNECOLOGY <ALLER...   
4  17967161-DS-29  <SEX> M <SERVICE> SURGERY <ALLERGIES> lisinopr...   

                                            entities  \
0  {'PROBLEM': ['101', '7 pound weight loss', 'a ...   
1  {'PROBLEM': ['+', '-', '1 cm area', 'a " cyst ...   
2  {'PROBLEM': ['a 0. 7 x 0. 7 x 0. 7 cm simple c...   
3  {'PROBLEM': ['a third - degree uterine prolaps...   
4  {'PROBLEM': ['101', 'abuse', 'acute pancreatit...   

                                            problems  \
0  ['101', '7 pound weight loss', 'a fever', 'a l...   
1  ['+', '-', '1 cm area', 'a " cyst "', 'a 2cm d...   
2  ['a 0. 7 x 0. 7 x 0. 7 cm simple cyst', 'a 2. ...   
3  ['a

In [9]:
# Load the reduced-text table from processed_reduced_texts.csv (path is relative
# to the notebook's working directory).
reduced_text = pd.read_csv("processed_reduced_texts.csv")

# Preview the first rows to confirm the file parsed as expected.
print(reduced_text.head())

          note_id                                              input  \
0  16002318-DS-17  <SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...   
1   15638884-DS-4  <SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...   
2  12435705-DS-14  <SEX> M <SERVICE> MEDICINE <ALLERGIES> ibuprof...   
3   12413577-DS-4  <SEX> F <SERVICE> OBSTETRICS/GYNECOLOGY <ALLER...   
4  17967161-DS-29  <SEX> M <SERVICE> SURGERY <ALLERGIES> lisinopr...   

                                              target  input_tokens  \
0  This is a ___ yo F admitted to the hospital af...          1195   
1  Mr. ___ is a ___ yo man with CAD with prior MI...          3496   
2  Mr. ___ is a ___ w/ Ph+ve ALL on dasatanib and...          5591   
3  On ___, Ms. ___ was admitted to the gynecology...          1119   
4  Mr. ___ underwent an angiogram on ___ which sh...          3307   

   target_tokens                                       reduced_text  \
0             75  <|begin_of_text|><SEX> F <SERVICE> SURGERY <AL...   
1   

In [11]:
# Number of distinct values in each column of reduced_text.
reduced_text.nunique()

note_id              100
input                100
target               100
input_tokens         100
target_tokens         91
reduced_text         100
importance_scores    100
dtype: int64

In [12]:
# Number of distinct values in each column of extracted_entities.
extracted_entities.nunique()

note_id       100
input         100
entities      100
problems      100
treatments    100
tests          99
dtype: int64

In [13]:
# The two frames cover the same notes exactly when the symmetric difference of
# their note_id sets is empty (row order and duplicates are ignored).
same_note_ids_set = not (set(reduced_text["note_id"]) ^ set(extracted_entities["note_id"]))

print(
    "Both DataFrames have the same note_id values (ignoring order)."
    if same_note_ids_set
    else "The note_id values are different."
)


Both DataFrames have the same note_id values (ignoring order).


In [14]:
# Inner-join the reduced text onto the extracted entities, keyed by note_id.
# Only the listed columns are carried over from each frame; `input` is taken
# from reduced_text alone so the result has a single `input` column.
merged_df = reduced_text[["note_id", "input", "reduced_text"]].merge(
    extracted_entities[["note_id", "entities", "problems", "treatments", "tests"]],
    how="inner",
    on="note_id",
)

# Preview the joined table.
print(merged_df.head())


          note_id                                              input  \
0  16002318-DS-17  <SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...   
1   15638884-DS-4  <SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...   
2  12435705-DS-14  <SEX> M <SERVICE> MEDICINE <ALLERGIES> ibuprof...   
3   12413577-DS-4  <SEX> F <SERVICE> OBSTETRICS/GYNECOLOGY <ALLER...   
4  17967161-DS-29  <SEX> M <SERVICE> SURGERY <ALLERGIES> lisinopr...   

                                        reduced_text  \
0  <|begin_of_text|><SEX> F <SERVICE> SURGERY <AL...   
1  <|begin_of_text|><SEX> M <SERVICE> MEDICINE <A...   
2  <|begin_of_text|><SEX> M <SERVICE> MEDICINE <A...   
3  <|begin_of_text|><SEX> F <SERVICE> OBSTETRICS/...   
4  <|begin_of_text|><SEX> M <SERVICE> SURGERY <AL...   

                                            entities  \
0  {'PROBLEM': ['101', '7 pound weight loss', 'a ...   
1  {'PROBLEM': ['+', '-', '1 cm area', 'a " cyst ...   
2  {'PROBLEM': ['a 0. 7 x 0. 7 x 0. 7 cm simple c...   
3  {'P

In [15]:
# Number of distinct values in each column of the merged table.
merged_df.nunique()

note_id         100
input           100
reduced_text    100
entities        100
problems        100
treatments      100
tests            99
dtype: int64

In [16]:
# Save the merged DataFrame to merged_data.csv in the working directory;
# index=False keeps the row index out of the file.
merged_df.to_csv("merged_data.csv", index=False)
print("Merged DataFrame saved to 'merged_data.csv'.")


Merged DataFrame saved to 'merged_data.csv'.


In [17]:
# Print the raw `problems` cell of the row at position 5.
# NOTE(review): after the CSV load this value appears to be the text of a Python
# list rather than a list object - confirm before iterating over it.
print(merged_df['problems'].iloc[5])

['/ yellow / malodorous drainage', '5 x 8. 6 cm heterogeneous lesion', '_', 'a 3. 9 cm hypoenhancing lesion in segment', 'a 7', 'a bowel movement', 'a hematoma', 'a moderate right pleural effusion', 'a simple cyst', 'a stroke', 'abdominal aortic aneurysm', 'additional hypodense lesions in the spleen', 'additional small hematomas', 'albuteral _ _', 'allergies', 'anicteric', 'any bulging', 'associated fluid collection', 'atrial fibrillation', 'beefy red', 'bibasilar atelectasis', 'bright red blood', 'cce', 'cramps in your abdomen or legs', 'decrease in urination', 'dehydration', 'dizziness', 'dry mouth', 'edema of the rectus musculature', 'erogeneous lesion', 'extended constipation', 'extraluminal contrast', 'fever greater than 101', 'finding', 'fluid collections', 'focal lesions', 'focal renal lesions', 'genetic polyposis', 'hematoma', 'heterogeneous internal signal', 'his large _ _ _ hernia', 'homogenous attenuation', 'hydronephrosis', 'hypertension', 'inability to tolerate', 'inabilit

In [2]:
# Show the first four rows of extracted_entities.
extracted_entities.head(4)

Unnamed: 0,note_id,input,entities,problems,treatments,tests
0,16002318-DS-17,<SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...,"{'PROBLEM': ['101', '7 pound weight loss', 'a ...","['101', '7 pound weight loss', 'a fever', 'a l...","['abdominal exercises', 'albuterol sulfate', '...","['b12', 'bmi', 'calcium', 'physical exam']"
1,15638884-DS-4,<SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...,"{'PROBLEM': ['+', '-', '1 cm area', 'a "" cyst ...","['+', '-', '1 cm area', 'a "" cyst ""', 'a 2cm d...","['a bankart repair', 'a nicotine patch', 'a st...","['.', '_', 'a', 'a ct scan', 'absbaso', 'abseo..."
2,12435705-DS-14,<SEX> M <SERVICE> MEDICINE <ALLERGIES> ibuprof...,{'PROBLEM': ['a 0. 7 x 0. 7 x 0. 7 cm simple c...,"['a 0. 7 x 0. 7 x 0. 7 cm simple cyst', 'a 2. ...","['2', 'a prolonged course', 'ampicillin', 'ant...","['16s rdna primer set', 'aa', 'abl', 'acid fas..."
3,12413577-DS-4,<SEX> F <SERVICE> OBSTETRICS/GYNECOLOGY <ALLER...,{'PROBLEM': ['a third - degree uterine prolaps...,"['a third - degree uterine prolapse', 'abnorma...","['a stool softener', 'acetaminophen', 'admissi...","['hct', 'hgb', 'mch', 'mchc', 'mcv', 'nadr', '..."


In [3]:
# Print the full `input` text of the row at position 5.
print(extracted_entities['input'].iloc[5])

<SEX> M <SERVICE> SURGERY <ALLERGIES> Codeine / Tetracycline <ATTENDING> ___ ___ Complaint: Large ___ hernia and sessile polyp <MAJOR SURGICAL OR INVASIVE PROCEDURE> ___ and midline hernia repair, left component separation, retrorectus repair of the stomal hernia primary midline repair with the above-mentioned anterior left-sided component separation and placement of mesh and panniculectomy <HISTORY OF PRESENT ILLNESS> Mr. ___ is a ___ year old male with history of rectal cancer and a longstanding left-sided colostomy. He had a colonoscopy in ___ of this year where a large polyp that was unable to be extracted by endoscope was found by Dr. ___. He was referred to Dr. ___ surgical resection of the polyp. Per last clinic note by Dr. ___: It has been 2 months since I last saw Mr. ___ for his ___ hernia and a month since he underwent colonoscopy with identification of a low-grade cancer proximal to his end colostomy. Since he is last seen there are no changes in his health and well-being. 

### Connect to the Neo4j knowledge graph

In [29]:
## Graphdb configuration
# SECURITY: the database password used to be hard-coded in this cell. It is now
# taken from the NEO4J_PASSWORD environment variable, falling back to an
# interactive prompt, so it never ends up in the saved notebook. The password
# that was previously committed here should be treated as leaked and rotated.
import os
from getpass import getpass

from langchain_community.graphs import Neo4jGraph

# URI and username are not secrets; the environment may override the defaults.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://8a886660.databases.neo4j.io")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

# Export the resolved values so anything reading the NEO4J_* environment
# variables sees the same settings as this notebook.
os.environ["NEO4J_URI"] = NEO4J_URI
os.environ["NEO4J_USERNAME"] = NEO4J_USERNAME
os.environ["NEO4J_PASSWORD"] = NEO4J_PASSWORD

# Open the connection used by the retrieval cells in this notebook.
neo4j_graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

In [30]:
# Display the graph object's repr to confirm it was constructed.
neo4j_graph

<langchain_community.graphs.neo4j_graph.Neo4jGraph at 0x7f415af9b070>

In [8]:
# Inspect what the graph currently contains: node counts grouped by label set,
# then relationship counts grouped by type, both sorted largest first.
check_query = """
MATCH (n)
WITH labels(n) as nodeTypes, count(n) as count
RETURN nodeTypes, count
ORDER BY count DESC
"""

try:
    # Node labels and how many nodes carry each label combination.
    structure = neo4j_graph.query(check_query)
    print("Node structure:", structure)
    
    # Check relationships: count every directed relationship by its type.
    rel_query = """
    MATCH ()-[r]->()
    RETURN type(r) as relationship_type, count(r) as count
    ORDER BY count DESC
    """
    relationships = neo4j_graph.query(rel_query)
    print("\nRelationship counts:", relationships)
    
except Exception as e:
    # Any failure in either query is reported instead of raised, so a failure
    # of the second query still leaves the first result printed above.
    print(f"Error checking structure: {e}")

Node structure: [{'nodeTypes': ['Diagnosis'], 'count': 1425}, {'nodeTypes': ['Drug'], 'count': 618}, {'nodeTypes': ['Procedure'], 'count': 352}, {'nodeTypes': ['Patient'], 'count': 92}]

Relationship counts: [{'relationship_type': 'PRESCRIBED', 'count': 5334}, {'relationship_type': 'HAS_DIAGNOSIS', 'count': 3061}, {'relationship_type': 'HAS_PROCEDURE', 'count': 608}]


In [5]:
!pip install langchain -q neo4j python-dotenv langchain_community

In [26]:
# Print the raw `entities` cell of the row at position 8.
print(merged_df['entities'].iloc[8])

{'PROBLEM': ['a worsening', 'abdominal pain', 'acute distress', 'anxiety', 'arthralgias', 'chest pain', 'chronic corticosteroid', 'clubbing', 'copd exacerbation', 'cough', 'cvas in', 'cyanosis', 'decreased breath sounds bilaterally', 'diarrhea', 'difficulty', 'difficulty breathing', 'diminished breath sounds bilateral', 'faint bibasilar rales', 'faint expiratory wheeze', 'faint wheezing', 'focal consolidation', 'gallops', 'gradually worsening dyspnea', 'guarding', 'hypertension', 'hypertension anxiety', 'lipped', 'lower extremity edema', 'melanoma', 'mi', 'mild to moderate dyspnea', 'much difficulty', 'multiple vertebral fractures', 'murmurs', 'myalgias', 'nasal congestion', 'nausea / vomiting', 'non - distended', 'non - tender', 'organomegaly', 'osteoporosis', 'palpitations', 'phase', 'pillow orthopnea', 'pleural effusion', 'pulmonary', 'purse', 'purseped', 'rales', 'rebound tenderness', 'rhinorrhea', 'rubs', 'sclera anicteric', 'sore throat', 'sputum production', 'subjective fevers']

In [23]:
# Show the first ten rows of the merged table.
merged_df.head(10)

Unnamed: 0,note_id,input,reduced_text,entities,problems,treatments,tests
0,16002318-DS-17,<SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...,<|begin_of_text|><SEX> F <SERVICE> SURGERY <AL...,"{'PROBLEM': ['101', '7 pound weight loss', 'a ...","['101', '7 pound weight loss', 'a fever', 'a l...","['abdominal exercises', 'albuterol sulfate', '...","['b12', 'bmi', 'calcium', 'physical exam']"
1,15638884-DS-4,<SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...,<|begin_of_text|><SEX> M <SERVICE> MEDICINE <A...,"{'PROBLEM': ['+', '-', '1 cm area', 'a "" cyst ...","['+', '-', '1 cm area', 'a "" cyst ""', 'a 2cm d...","['a bankart repair', 'a nicotine patch', 'a st...","['.', '_', 'a', 'a ct scan', 'absbaso', 'abseo..."
2,12435705-DS-14,<SEX> M <SERVICE> MEDICINE <ALLERGIES> ibuprof...,<|begin_of_text|><SEX> M <SERVICE> MEDICINE <A...,{'PROBLEM': ['a 0. 7 x 0. 7 x 0. 7 cm simple c...,"['a 0. 7 x 0. 7 x 0. 7 cm simple cyst', 'a 2. ...","['2', 'a prolonged course', 'ampicillin', 'ant...","['16s rdna primer set', 'aa', 'abl', 'acid fas..."
3,12413577-DS-4,<SEX> F <SERVICE> OBSTETRICS/GYNECOLOGY <ALLER...,<|begin_of_text|><SEX> F <SERVICE> OBSTETRICS/...,{'PROBLEM': ['a third - degree uterine prolaps...,"['a third - degree uterine prolapse', 'abnorma...","['a stool softener', 'acetaminophen', 'admissi...","['hct', 'hgb', 'mch', 'mchc', 'mcv', 'nadr', '..."
4,17967161-DS-29,<SEX> M <SERVICE> SURGERY <ALLERGIES> lisinopr...,<|begin_of_text|><SEX> M <SERVICE> SURGERY <AL...,"{'PROBLEM': ['101', 'abuse', 'acute pancreatit...","['101', 'abuse', 'acute pancreatitis', 'angina...","['a', 'a 3 mm x 40 mm balloon percutaneous tra...","['angap', 'blood', 'blood calcium', 'blood ck ..."
5,16956007-DS-20,<SEX> M <SERVICE> SURGERY <ALLERGIES> Codeine ...,<|begin_of_text|><SEX> M <SERVICE> SURGERY <AL...,"{'PROBLEM': ['/ yellow / malodorous drainage',...","['/ yellow / malodorous drainage', '5 x 8. 6 c...","['a completion colectomy', 'a new ileostomy', ...","['a chest cat scan', 'absbaso', 'abseos', 'abs..."
6,16919911-DS-15,<SEX> F <SERVICE> MEDICINE <ALLERGIES> Penicil...,<|begin_of_text|><SEX> F <SERVICE> MEDICINE <A...,"{'PROBLEM': ['a', 'abdominal distention', 'abd...","['a', 'abdominal distention', 'abdominal pain'...","['5l fluid', 'a therapeutic thoracentesis', 'a...","['02 sat', 'alkphos', 'angap', 'ast', 'blood',..."
7,15682570-DS-25,<SEX> M <SERVICE> MEDICINE <ALLERGIES> No Know...,<|begin_of_text|><SEX> M <SERVICE> MEDICINE <A...,"{'PROBLEM': ['a small heart attack', 'a snf', ...","['a small heart attack', 'a snf', 'acute blood...","['a left pectoral pacemaker', 'an antibiotic',...","['anion gap', 'anisocyt', 'art po2', 'bands', ..."
8,12135369-DS-24,<SEX> F <SERVICE> MEDICINE <ALLERGIES> Compazi...,<|begin_of_text|><SEX> F <SERVICE> MEDICINE <A...,"{'PROBLEM': ['a worsening', 'abdominal pain', ...","['a worsening', 'abdominal pain', 'acute distr...","['( dua', ')', '1000', 'admissionmission medic...","['angap', 'baso', 'bicarbonate', 'blood glucos..."
9,11906321-DS-20,<SEX> M <SERVICE> NEUROSURGERY <ALLERGIES> Pat...,<|begin_of_text|><SEX> M <SERVICE> NEUROSURGER...,"{'PROBLEM': ['- defined, rounded 21 mm area of...","['- defined, rounded 21 mm area of contrast en...","['a shower cap', 'admission', 'advil', 'any an...","['- to - shin', 'alternating movements', 'anio..."


In [24]:
# Test connection by running a simple query: fetch at most five arbitrary nodes
# and print each returned record on its own line.
query = "MATCH (n) RETURN n LIMIT 5"
results = neo4j_graph.query(query)
print("Sample Nodes from Graph:")
for result in results:
    print(result)


Sample Nodes from Graph:
{'n': {'id': 10000032}}
{'n': {'id': 10001217}}
{'n': {'id': 10001725}}
{'n': {'id': 10002428}}
{'n': {'id': 10002495}}


In [27]:
# Re-check the number of distinct values per column of the merged table.
merged_df.nunique()

note_id         100
input           100
reduced_text    100
entities        100
problems        100
treatments      100
tests            99
dtype: int64

In [37]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: the access token used to be hard-coded in this cell. It is now read
# from the HF_TOKEN environment variable, falling back to an interactive prompt,
# so it never ends up in the saved notebook. The token that was previously
# committed here should be treated as leaked and revoked.
login(os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [45]:
!pip install -q transformers huggingface_hub
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes

In [46]:
import torch
from tqdm import tqdm
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
import os

In [47]:
# Allocator setting for better CUDA memory management; it is written to the
# process environment before the model is loaded.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

# Quantization settings handed to the model loader: 8-bit weights, with fp32
# CPU offload enabled.
quantization_config = BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload=True,
    load_in_8bit=True,
)
print("Environment setup and quantization configuration done.")


Environment setup and quantization configuration done.


In [48]:
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the quantized causal LM and its tokenizer, advancing a two-step progress
# bar (one step for the model, one for the tokenizer).
print("Loading model and tokenizer...")
with tqdm(total=2, desc="Initializing Model and Tokenizer", unit="step") as pbar:
    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    # NOTE(review): output_attentions / return_dict_in_generate are passed to
    # from_pretrained here rather than to generate(); presumably they end up on
    # the model configuration - confirm how this affects what generate() returns.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        output_attentions=True,  # Enable attention outputs for AGTD
        return_dict_in_generate=True  # Ensures attention outputs are generated
    )
    pbar.update(1)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Pad on the left and reuse the end-of-sequence token id as the pad token id.
    tokenizer.padding_side = 'left'
    tokenizer.pad_token_id = tokenizer.eos_token_id
    pbar.update(1)
print("Model and tokenizer loaded successfully.")


Loading model and tokenizer...


Initializing Model and Tokenizer:   0%|          | 0/2 [00:00<?, ?step/s]


RuntimeError: Failed to import transformers.integrations.bitsandbytes because of the following error (look up to see its traceback):
cannot import name '_TrimmedRelease' from 'packaging.version' (/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/packaging/version.py)

In [33]:
def _as_entity_list(value):
    """Coerce one entity argument into a list of non-blank strings.

    Accepts a real list/tuple/set, the text of a Python list such as
    "['fever', 'cough']" (the form a list column takes after a CSV round trip),
    a single plain term, or a missing value (None / NaN), which yields [].
    """
    import ast  # local import: only this helper needs it

    if isinstance(value, str):
        text = value.strip()
        if not text:
            return []
        try:
            # literal_eval only evaluates Python literals, never arbitrary code.
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = text
        # Anything that is not a collection literal is treated as one term.
        value = parsed if isinstance(parsed, (list, tuple, set)) else [text]
    elif not isinstance(value, (list, tuple, set)):
        # None, NaN and other scalars carry no entities.
        return []

    terms = []
    for item in value:
        if item is None:
            continue
        term = item if isinstance(item, str) else str(item)
        # A blank term would make CONTAINS match every node, so drop it.
        if term.strip():
            terms.append(term)
    return terms


def retrieve_context_from_kg(problems, treatments, tests, graph):
    """
    Retrieve context from the Knowledge Graph for given entities.

    The entity terms are sent to Neo4j as query parameters instead of being
    spliced into the Cypher text, so quotes or other special characters inside
    a term cannot break or alter the query.

    Args:
        problems (list): List of problems (a stringified list is also accepted).
        treatments (list): List of treatments (a stringified list is also accepted).
        tests (list): List of tests (a stringified list is also accepted).
        graph (Neo4jGraph): Neo4j graph instance; must provide
            ``query(cypher, params=...)`` returning a list of dict records.

    Returns:
        dict: Retrieved context with keys "problems", "treatments" and "tests",
        each a list of matching node names. All three lists are empty when no
        usable entity was supplied or when the query fails (the failure is
        printed, not raised).
    """
    params = {
        "problems": _as_entity_list(problems),
        "treatments": _as_entity_list(treatments),
        "tests": _as_entity_list(tests),
    }

    # Skip the round trip entirely if there is nothing to look up.
    if not any(params.values()):
        return {"problems": [], "treatments": [], "tests": []}

    # any() over an empty parameter list is false, so an entity kind without
    # terms contributes no rows. Tests are matched through HAS_PROCEDURE, the
    # relationship type the graph actually contains.
    # NOTE(review): assumes Diagnosis/Drug/Procedure nodes expose a `name`
    # property - confirm against the graph schema.
    cypher_query = """
    MATCH (p:Patient)-[:HAS_DIAGNOSIS]->(d:Diagnosis)
    WHERE any(term IN $problems WHERE d.name CONTAINS term)
    RETURN d.name AS Entity, 'Diagnosis' AS Type
    UNION
    MATCH (p:Patient)-[:PRESCRIBED]->(t:Drug)
    WHERE any(term IN $treatments WHERE t.name CONTAINS term)
    RETURN t.name AS Entity, 'Treatment' AS Type
    UNION
    MATCH (p:Patient)-[:HAS_PROCEDURE]->(test:Procedure)
    WHERE any(term IN $tests WHERE test.name CONTAINS term)
    RETURN test.name AS Entity, 'Test' AS Type
    """

    # Execute query; retrieval is best effort, so a failure yields empty context.
    try:
        results = graph.query(cypher_query, params=params)
    except Exception as e:
        print(f"Query failed: {e}")
        return {"problems": [], "treatments": [], "tests": []}

    # Route each record into the context bucket that corresponds to its Type.
    bucket_for_type = {"Diagnosis": "problems", "Treatment": "treatments", "Test": "tests"}
    context = {"problems": [], "treatments": [], "tests": []}
    for record in results:
        bucket = bucket_for_type.get(record["Type"])
        if bucket is not None:
            context[bucket].append(record["Entity"])

    return context


In [34]:
def prepare_rag_input(reduced_text, context):
    """
    Build the prompt that pairs the knowledge-graph context with the text to summarize.

    Args:
        reduced_text (str): Text to summarize.
        context (dict): Retrieved KG context; maps a section name (problems,
            treatments, tests) to a list of strings.

    Returns:
        str: Prompt consisting of a "Context:" part with one capitalized section
        per non-empty entry, an "Input:" part holding the text, and the task line.
    """
    # One "<Name>:" section per non-empty context entry, items one per line.
    sections = [
        f"{label.capitalize()}:\n" + "\n".join(items) + "\n\n"
        for label, items in context.items()
        if items
    ]

    return "".join([
        "Context:\n",
        *sections,
        f"Input:\n{reduced_text}\n\n",
        "Task: Summarize the input while considering the context.",
    ])


In [35]:
def generate_summary(input_text, model, tokenizer, generation_params):
    """
    Generate a summary using the LLaMA model.

    Args:
        input_text (str): Combined input of reduced text and KG context.
        model: Pretrained LLaMA model.
        tokenizer: Tokenizer for the LLaMA model.
        generation_params (dict): Parameters for text generation.

    Returns:
        str: The generated summary, i.e. only the newly generated text; the
        prompt that a decoder-only model echoes back is removed.
    """
    # Tokenize the input (truncated to 2048 tokens) and move it to the model's device.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=2048).to(model.device)

    # Generate the summary
    outputs = model.generate(**inputs, **generation_params)

    # generate() returns either the token ids directly or, when
    # return_dict_in_generate is enabled, an output object that carries them in
    # `.sequences`; support both forms.
    sequences = getattr(outputs, "sequences", outputs)

    # Decoder-only models return prompt + continuation, so skip the prompt
    # tokens. Encoder-decoder models return the continuation only.
    is_encoder_decoder = getattr(getattr(model, "config", None), "is_encoder_decoder", False)
    prompt_length = 0 if is_encoder_decoder else len(inputs["input_ids"][0])
    generated_ids = sequences[0][prompt_length:]

    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


In [36]:
import ast

# Define generation parameters for LLaMA
generation_params = {
    "do_sample": True,
    "top_p": 0.8,
    "temperature": 0.7,
    "top_k": 50,
    "max_new_tokens": 200,
    "repetition_penalty": 1.2,
}


def _parse_entity_cell(cell):
    """Turn one entity cell of merged_df into a list.

    The entity columns were loaded from CSV, so each cell holds the text of a
    Python list (e.g. "['fever', 'cough']"), not a list object. Iterating over
    that text would yield single characters. Cells that are missing or cannot
    be parsed as a list yield [].
    """
    if isinstance(cell, (list, tuple)):
        return list(cell)
    if not isinstance(cell, str):
        return []  # e.g. NaN for an empty CSV field
    try:
        # literal_eval only evaluates Python literals, never arbitrary code.
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return []
    return list(parsed) if isinstance(parsed, (list, tuple)) else []


# Process each row
for _, row in merged_df.iterrows():
    # Step 1: Extract entities and reduced text.
    # The text goes into its own name so the `reduced_text` DataFrame loaded
    # earlier is not overwritten, which would break a re-run of earlier cells.
    problems = _parse_entity_cell(row["problems"])
    treatments = _parse_entity_cell(row["treatments"])
    tests = _parse_entity_cell(row["tests"])
    note_reduced_text = row["reduced_text"]

    # Step 2: Retrieve KG context
    context = retrieve_context_from_kg(problems, treatments, tests, neo4j_graph)

    # Step 3: Prepare RAG input
    rag_input = prepare_rag_input(note_reduced_text, context)

    # Step 4: Generate summary
    summary = generate_summary(rag_input, model, tokenizer, generation_params)

    # Step 5: Print the result
    print("Summarized Text:")
    print(summary)


[#A3D6]  _: <CONNECTION> error: Failed to read from defunct connection ResolvedIPv4Address(('34.66.78.163', 7687)) (ResolvedIPv4Address(('34.66.78.163', 7687))): OSError('No data')
Unable to retrieve routing information
Transaction failed and will be retried in 1.145510783112604s (Unable to retrieve routing information)


NameError: name 'model' is not defined