In [1]:
pip install pandas sentence-transformers faiss-cpu numpy

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-

In [2]:
# Install required dependencies
!pip install transformers
!pip install sentence-transformers
!pip install faiss-cpu
!pip install pandas

# Import necessary libraries
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd

# Load the combined EHR data
df = pd.read_csv("combined_ehr.csv")

# Initialize models
summarizer = pipeline("summarization", model="t5-small")
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Define summary generation functions
def generate_patient_summary(row):
    """Generate a summary of patient demographics and vital signs."""
    patient_text = (
        f"Patient ID: {row['PatientID']}, Age: {row['Age']}, Sex: {row['Sex']}, "
        f"Ethnicity: {row['Ethnicity']}, Language: {row['Language']}, "
        f"Vitals: {row['vitals_list']}, Diagnoses: {row['diagnoses_list']}"
    )
    return summarizer(patient_text, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]

def generate_history_summary(row):
    """Generate a summary of the patient's medical history."""
    if pd.notna(row['conditions_list']) and pd.notna(row['conditions_ICD10_list']):
        conditions = row['conditions_list'].split(';')
        icd10 = row['conditions_ICD10_list'].split(';')
        history_text = "Medical History: " + ", ".join(
            [f"{cond.strip()} (ICD10: {code.strip()})" for cond, code in zip(conditions, icd10)]
        )
    else:
        history_text = "No medical history."
    return summarizer(history_text, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]

def generate_allergy_summary(row):
    """Generate a summary of the patient's allergies."""
    if pd.notna(row['allergies_list']) and pd.notna(row['allergy_reactions_list']):
        allergens = row['allergies_list'].split(';')
        reactions = row['allergy_reactions_list'].split(';')
        allergy_text = "Allergies: " + ", ".join(
            [f"{allergen.strip()} (Reaction: {reaction.strip()})" for allergen, reaction in zip(allergens, reactions)]
        )
    else:
        allergy_text = "No allergies."
    return summarizer(allergy_text, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]

def generate_lab_summary(row):
    """Generate a summary of the patient's lab test results."""
    if pd.notna(row['lab_tests_list']) and pd.notna(row['lab_values_list']):
        tests = row['lab_tests_list'].split(';')
        values = row['lab_values_list'].split(';')
        lab_text = "Lab Tests: " + ", ".join(
            [f"{test.strip()}: {value.strip()}" for test, value in zip(tests, values)]
        )
    else:
        lab_text = "No lab tests."
    return summarizer(lab_text, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]

# Initialize FAISS index and storage lists
index = faiss.IndexFlatL2(384)  # 384 is the embedding dimension for 'all-MiniLM-L6-v2'
embedding_list = []
metadata = []
summary_list = []

# Process all patients in the dataset
for _, row in df.iterrows():
    patient_id = row['PatientID']

    # Generate summaries
    patient_summary = generate_patient_summary(row)
    history_summary = generate_history_summary(row)
    allergy_summary = generate_allergy_summary(row)
    lab_summary = generate_lab_summary(row)

    # Store summaries in a dictionary
    summaries = {
        "patient_summary": patient_summary,
        "history_summary": history_summary,
        "allergy_summary": allergy_summary,
        "lab_summary": lab_summary
    }

    # Generate embeddings and collect metadata
    for summary_type, summary in summaries.items():
        embedding = embedder.encode(summary)
        embedding_list.append(embedding)
        metadata.append((patient_id, summary_type))
        summary_list.append(summary)

# Convert embeddings to numpy array and add to FAISS index
embedding_array = np.array(embedding_list).astype('float32')
index.add(embedding_array)

# Define retrieval function
def retrieve_summaries(query, top_k=10):
    """Retrieve the top-k most relevant summaries for a given query."""
    query_embedding = embedder.encode([query]).astype('float32')
    distances, indices = index.search(query_embedding, top_k)
    results = [(metadata[i], summary_list[i]) for i in indices[0]]
    return results

# Example usage
query = "medical history of patient 1"
results = retrieve_summaries(query)
print(f"Query: {query}")
for (patient_id, summary_type), summary_text in results:
    print(f"Patient {patient_id} - {summary_type}: {summary_text}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Your max_length is set to 50, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Your max_length is set to 50, but your input_length is only 17. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Both `m

Query: medical history of patient 1
Patient 10 - history_summary: medical history: type 2 diabetes (ICD10: E11.9)
Patient 15 - history_summary: medical history: type 2 diabetes (ICD10: E11.9)
Patient 27 - history_summary: medical history: type 2 diabetes (ICD10: E11.9)


In [5]:
# Save the FAISS index to disk
import faiss
faiss.write_index(index, "ehr_embeddings.faiss")

In [None]:
# Load the FAISS index from disk
index = faiss.read_index("ehr_embeddings.faiss")

In [6]:
# Save summaries and metadata to a CSV file
import csv

with open("summaries_metadata.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["PatientID", "SummaryType", "SummaryText"])
    for (patient_id, summary_type), summary_text in zip(metadata, summary_list):
        writer.writerow([patient_id, summary_type, summary_text])

In [18]:
def retrieve_summaries(query, top_k=30):
    """Retrieve the top-k most relevant summaries for a given query."""
    query_embedding = embedder.encode([query]).astype('float32')
    distances, indices = index.search(query_embedding, top_k)
    results = [(metadata[i], summary_list[i]) for i in indices[0]]
    return results

In [22]:
# Example usage
query = "allergy_summary of Patient ID 1"
results = retrieve_summaries(query)
print(f"Query: {query}")
for (patient_id, summary_type), summary_text in results:
    print(f"Patient {patient_id} - {summary_type}: {summary_text}")

Query: allergy_summary of Patient ID 1
Patient 12 - history_summary: medical history: Asthma (ICD10: J45.909)
Patient 48 - history_summary: medical history: Asthma (ICD10: J45.909)
Patient 51 - history_summary: medical history: Asthma (ICD10: J45.909)
Patient 81 - history_summary: medical history: Asthma (ICD10: J45.909), COPD (IDC10: j44.9) .
Patient 58 - history_summary: medical history: Type 2 Diabetes (ICD10: E11.9), Asthma (IDC10: J45.909).
Patient 29 - patient_summary: Patient ID: 29, Age: 34, Sex: F, Ethnicity: African, Language: German, Vitals: 2021-01-07: HR=99, BP=125/86, Temp=97.9, SpO2=97, Diagnoses: Asthma .
Patient 6 - allergy_summary: allergies: Peanuts (Reaction: Anaphylaxis)
Patient 13 - allergy_summary: allergies: Peanuts (Reaction: Anaphylaxis)
Patient 18 - allergy_summary: allergies: Peanuts (Reaction: Anaphylaxis)
Patient 20 - allergy_summary: allergies: Peanuts (Reaction: Anaphylaxis)
Patient 38 - allergy_summary: allergies: Peanuts (Reaction: Anaphylaxis)
Patient

In [23]:
# Install required dependencies (run in your environment if needed)
# !pip install transformers sentence-transformers faiss-cpu pandas

from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
import re

# Load your EHR data (replace with your actual data source)
df = pd.read_csv("combined_ehr.csv")

# Initialize models
summarizer = pipeline("summarization", model="t5-base")  # Better summarization with t5-base
embedder = SentenceTransformer('pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb')  # Medical-specific embeddings

# Summary generation with patient ID and type embedded
def generate_patient_summary(row):
    patient_id = row['PatientID']
    patient_text = (
        f"Patient {patient_id}'s profile summary: Age: {row['Age']}, Sex: {row['Sex']}, "
        f"Ethnicity: {row['Ethnicity']}, Language: {row['Language']}, "
        f"Vitals: {row['vitals_list']}, Diagnoses: {row['diagnoses_list']}"
    )
    return summarizer(patient_text, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]

def generate_history_summary(row):
    patient_id = row['PatientID']
    if pd.notna(row['conditions_list']) and pd.notna(row['conditions_ICD10_list']):
        conditions = row['conditions_list'].split(';')
        icd10 = row['conditions_ICD10_list'].split(';')
        history_text = f"Patient {patient_id}'s medical history summary: " + ", ".join(
            [f"{cond.strip()} (ICD10: {code.strip()})" for cond, code in zip(conditions, icd10)]
        )
    else:
        history_text = f"Patient {patient_id}'s medical history summary: No history recorded."
    return summarizer(history_text, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]

def generate_allergy_summary(row):
    patient_id = row['PatientID']
    if pd.notna(row['allergies_list']) and pd.notna(row['allergy_reactions_list']):
        allergens = row['allergies_list'].split(';')
        reactions = row['allergy_reactions_list'].split(';')
        allergy_text = f"Patient {patient_id}'s allergy summary: " + ", ".join(
            [f"{allergen.strip()} (Reaction: {reaction.strip()})" for allergen, reaction in zip(allergens, reactions)]
        )
    else:
        allergy_text = f"Patient {patient_id}'s allergy summary: No allergies recorded."
    return summarizer(allergy_text, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]

def generate_lab_summary(row):
    patient_id = row['PatientID']
    if pd.notna(row['lab_tests_list']) and pd.notna(row['lab_values_list']):
        tests = row['lab_tests_list'].split(';')
        values = row['lab_values_list'].split(';')
        lab_text = f"Patient {patient_id}'s lab test summary: " + ", ".join(
            [f"{test.strip()}: {value.strip()}" for test, value in zip(tests, values)]
        )
    else:
        lab_text = f"Patient {patient_id}'s lab test summary: No lab tests recorded."
    return summarizer(lab_text, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]

# Initialize FAISS index
index = faiss.IndexFlatL2(768)  # BioBERT embedding dimension
embedding_list = []
metadata = []
summary_list = []

# Generate summaries and embeddings
for _, row in df.iterrows():
    patient_id = row['PatientID']
    summaries = {
        "patient_summary": generate_patient_summary(row),
        "history_summary": generate_history_summary(row),
        "allergy_summary": generate_allergy_summary(row),
        "lab_summary": generate_lab_summary(row)
    }
    for summary_type, summary in summaries.items():
        embedding = embedder.encode(summary)
        embedding_list.append(embedding)
        metadata.append((str(patient_id), summary_type))
        summary_list.append(summary)

# Add embeddings to FAISS index
embedding_array = np.array(embedding_list).astype('float32')
index.add(embedding_array)

# Expanded keywords for summary types
summary_type_keywords = {
    'patient_summary': ['patient profile', 'profile', 'patient details', 'demographics', 'vitals', 'info'],
    'history_summary': ['medical history', 'history', 'past diagnoses', 'conditions', 'illnesses'],
    'allergy_summary': ['allergies', 'allergy', 'reactions', 'intolerances', 'allergic'],
    'lab_summary': ['lab tests', 'lab results', 'blood work', 'test results', 'labs']
}

# Extract patient ID from query
def extract_patient_id(query):
    match = re.search(r'patient (\d+)', query, re.IGNORECASE)
    return match.group(1) if match else None

# Detect summary type from query
def detect_summary_type(query):
    query_lower = query.lower()
    for summary_type, keywords in summary_type_keywords.items():
        if any(keyword in query_lower for keyword in keywords):
            return summary_type
    return None

# Optimized retrieval function
def retrieve_summaries(query, top_k=100, final_k=3):
    patient_id = extract_patient_id(query)
    summary_type = detect_summary_type(query)

    # Generate query embedding
    query_embedding = embedder.encode([query]).astype('float32')
    distances, indices = index.search(query_embedding, top_k)
    results = [(metadata[i], summary_list[i], distances[0][j]) for j, i in enumerate(indices[0])]

    # Filter by patient ID if specified
    if patient_id:
        results = [r for r in results if r[0][0] == patient_id]
    else:
        return [("Error", "Please specify a patient ID (e.g., 'patient 4')")]

    # Filter by summary type if detected
    if summary_type:
        results = [r for r in results if r[0][1] == summary_type]

    # Sort by similarity and take top results
    results.sort(key=lambda x: x[2])
    top_results = results[:final_k]

    # Fallback: return any summaries for the patient if no type-specific matches
    if not top_results:
        fallback_results = [(meta, summary, distance) for meta, summary, distance in results if meta[0] == patient_id]
        fallback_results.sort(key=lambda x: x[2])
        top_results = fallback_results[:final_k]

    return [(meta, summary) for meta, summary, _ in top_results] if top_results else [("No matches", "No relevant summaries found")]

# Example usage
if __name__ == "__main__":
    query = "lab results of patient 4"
    results = retrieve_summaries(query)
    print(f"Query: {query}")
    for (patient_id, summary_type), summary_text in results:
        print(f"Patient {patient_id} - {summary_type}: {summary_text}")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Your max_length is set to 50, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Your max_length is set to 50, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
Both `m

Query: lab results of patient 4


ValueError: too many values to unpack (expected 2)

In [None]:
# Save the FAISS index to disk
import faiss
faiss.write_index(index, "ehr_embeddings.faiss")

In [None]:
# Load the FAISS index from disk
index = faiss.read_index("ehr_embeddings.faiss")

In [None]:
# Save summaries and metadata to a CSV file
import csv

with open("summaries_metadata.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["PatientID", "SummaryType", "SummaryText"])
    for (patient_id, summary_type), summary_text in zip(metadata, summary_list):
        writer.writerow([patient_id, summary_type, summary_text])

In [None]:
def retrieve_summaries(query, top_k=30):
    """Retrieve the top-k most relevant summaries for a given query."""
    query_embedding = embedder.encode([query]).astype('float32')
    distances, indices = index.search(query_embedding, top_k)
    results = [(metadata[i], summary_list[i]) for i in indices[0]]
    return results