In [1]:
import os
import json
from langchain import LLMChain, PromptTemplate
from langchain.tools import tool
import requests
from openai import OpenAI
from langchain_community.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import jsonschema_specifications
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.agents import Tool, initialize_agent, AgentType
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

# SECURITY: an OpenAI API key used to be hardcoded on this line. A key that has
# been committed to a notebook must be treated as leaked and revoked; the key is
# now taken from the environment, with an interactive prompt as a fallback.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass  # local import: only needed for the fallback prompt

    os.environ['OPENAI_API_KEY'] = getpass("OpenAI API key: ")
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# Embedding model shared by the index-building and index-loading cells; both
# must use the same model, otherwise stored vectors and query vectors differ.
embedding_model_name = "text-embedding-ada-002"
embeddings = OpenAIEmbeddings(model=embedding_model_name)


# Load the clinical-trial records that will be embedded into the vector store.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open('Clinical_Trials_Data.json', encoding='utf-8') as f:
    all_trials = json.load(f)

  embeddings = OpenAIEmbeddings(model=embedding_model_name)


In [2]:
# Maps the US Core extension URLs handled below to the patient_data key they fill.
_US_CORE_EXTENSION_FIELDS = {
    'http://hl7.org/fhir/us/core/StructureDefinition/us-core-race': 'race',
    'http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity': 'ethnicity',
}


def _dig(node, *path):
    """Follow ``path`` (dict keys and list indices) into ``node``.

    Returns the value found, or None as soon as a step is missing, out of
    range, or applied to a value of the wrong container type.
    """
    for step in path:
        if isinstance(step, int):
            if not isinstance(node, list) or not -len(node) <= step < len(node):
                return None
        elif not isinstance(node, dict) or step not in node:
            return None
        node = node[step]
    return node


# Function to extract patient data
def extract_patient_data(file_path):
    """Summarise a FHIR bundle JSON file into plain Python structures.

    Args:
        file_path: Path to a JSON file whose top-level object holds the
            bundle's resources under the ``entry`` key.

    Returns:
        A dict with the keys ``patient_data`` (dict) and ``conditions``,
        ``care_plans``, ``encounters``, ``diagnostic_reports``,
        ``observations``, ``procedures`` (lists of dicts, in bundle order).
        Any field that is absent from a resource is reported as None instead
        of raising, so partially populated resources are still included.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        bundle = json.load(file)

    # A bundle without entries simply produces empty results.
    entries = _dig(bundle, 'entry') or []

    # Variables to hold extracted data
    patient_data = {}
    conditions = []
    care_plans = []
    encounters = []
    diagnostic_reports = []
    observations = []
    procedures = []

    for entry in entries:
        resource = _dig(entry, 'resource')
        if not isinstance(resource, dict):
            # Entries without an embedded resource carry nothing to extract.
            continue
        resource_type = resource.get('resourceType')

        if resource_type == "Patient":
            patient_data['name'] = _dig(resource, 'name', 0, 'family')
            patient_data['gender'] = resource.get('gender')
            patient_data['birthDate'] = resource.get('birthDate')

            # race / ethnicity keys are only added when the extension is present.
            for ext in resource.get('extension') or []:
                field = _US_CORE_EXTENSION_FIELDS.get(_dig(ext, 'url'))
                if field is not None:
                    patient_data[field] = _dig(ext, 'extension', 0, 'valueCoding', 'display')
            patient_data['address'] = _dig(resource, 'address', 0, 'city')

        elif resource_type == "Condition":
            conditions.append({
                'condition': _dig(resource, 'code', 'coding', 0, 'display'),
                'clinicalStatus': _dig(resource, 'clinicalStatus', 'coding', 0, 'code'),
                'verificationStatus': _dig(resource, 'verificationStatus', 'coding', 0, 'code'),
                'onsetDateTime': resource.get('onsetDateTime')
            })

        elif resource_type == "CarePlan":
            care_plans.append({
                # Only the first activity is summarised; an activity without an
                # inline ``detail`` yields None rather than a KeyError.
                'plan': _dig(resource, 'activity', 0, 'detail', 'code', 'coding', 0, 'display'),
                'status': resource.get('status'),
                'start': _dig(resource, 'period', 'start')
            })

        elif resource_type == "Encounter":
            encounters.append({
                'type': _dig(resource, 'type', 0, 'coding', 0, 'display'),
                'date': _dig(resource, 'period', 'start')
            })

        elif resource_type == "DiagnosticReport":
            diagnostic_reports.append({
                'report': _dig(resource, 'code', 'coding', 0, 'display'),
                'effectiveDateTime': resource.get('effectiveDateTime')
            })

        elif resource_type == "Observation":
            observations.append({
                'observation': _dig(resource, 'code', 'coding', 0, 'display'),
                # Only quantity-valued observations carry a value/unit here.
                'value': _dig(resource, 'valueQuantity', 'value'),
                'unit': _dig(resource, 'valueQuantity', 'unit'),
                'effectiveDateTime': resource.get('effectiveDateTime')
            })

        elif resource_type == "Procedure":
            procedures.append({
                'procedure': _dig(resource, 'code', 'coding', 0, 'display'),
                'status': resource.get('status'),
                'performedDateTime': resource.get('performedDateTime')
            })

    return {
        'patient_data': patient_data,
        'conditions': conditions,
        'care_plans': care_plans,
        'encounters': encounters,
        'diagnostic_reports': diagnostic_reports,
        'observations': observations,
        'procedures': procedures
    }

In [3]:
# Placeholder for the FAISS index: it stays None until the batch loop in this
# cell creates the store from the first batch of trial texts.
vector_store = None

def batch_data(data_list, batch_size):
    """Yield successive slices of ``data_list`` holding at most ``batch_size`` items.

    Args:
        data_list: A sliceable sequence supporting ``len()`` (list, tuple, str, ...).
        batch_size: Maximum number of items per batch; must be >= 1.

    Yields:
        Consecutive slices of ``data_list``; the final one may be shorter.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative size would
            otherwise produce no batches at all and silently drop every item.
            Because this is a generator, the error surfaces on first iteration.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(data_list), batch_size):
        yield data_list[start:start + batch_size]

# Adjust the batch size as needed (number of trials embedded per request).
batch_size = 1

# Always rebuild from scratch so re-running this cell cannot append the same
# trials to a store left over from a previous execution.
vector_store = None

# Process batches of trial data
for batch in batch_data(all_trials, batch_size):
    texts = []
    metadatas = []
    for trial in batch:
        # Extract required fields from the trial data
        description = trial.get('descriptionModule', {}).get('briefSummary', '')
        location = trial.get('locationModule', {})
        eligibility = trial.get('eligibilityModule', {})
        nct_id = trial.get('identificationModule', {}).get('nctId', '')

        # Combine description and eligibility criteria into the text that gets
        # embedded. NOTE(review): ``eligibility`` is interpolated as the whole
        # module object, not just its criteria text - confirm this is intended.
        combined_text = f"NCTId:\n{nct_id}\nDescription:\n{description}\nEligibility Criteria:\n{eligibility}"

        # Append the text and metadata
        texts.append(combined_text)
        metadatas.append({
            'nctId': nct_id,
            'eligibility': eligibility,
            'location': location,
        })

    if vector_store is None:
        # First batch: create the vector store from the texts and embeddings
        vector_store = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
    else:
        # Later batches: add the new texts and metadata to the existing store
        vector_store.add_texts(texts, metadatas=metadatas)

# Without any trials there is no store to save; fail with a clear message
# instead of an AttributeError on None.
if vector_store is None:
    raise ValueError("No clinical trials were loaded; cannot build the FAISS vector store.")

# Optional: Save the FAISS vector store to disk for future use
vector_store.save_local("faiss_store")

In [4]:
# Reload the index saved under "faiss_store", using the same `embeddings`
# object that was used to build it.
# NOTE(review): allow_dangerous_deserialization=True explicitly opts in to
# unsafe deserialization of the saved store (presumably pickle-based - confirm
# against the LangChain docs). Only load a "faiss_store" directory that this
# notebook itself produced; never one obtained from an untrusted source.
vector_store = FAISS.load_local("faiss_store",embeddings,allow_dangerous_deserialization=True)
# Retriever interface over the store, consumed by the RetrievalQA chain below.
retriever = vector_store.as_retriever()

# Disabled debugging snippet for inspecting raw retrieval results.
# NOTE(review): `patient_info` is not defined anywhere in the visible cells.
# k = 10  # Number of top documents to retrieve (increase this for more results)
# results = retriever.get_relevant_documents(patient_info, k=k)

In [5]:
# Create a custom prompt template for the RetrievalQA chain.
# It declares two input variables: {question} is placed under
# "Patient Information" and {context} under "Context". The template asks the
# model to list eligible trials as NCT ID / Brief Summary / Eligibility Criteria.
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are an assistant that helps find clinical trials for patients.

Given the following patient information and context, provide a list of all the clinical trials that the patient is eligible for, in the following format:

NCT ID: [nctId]
Brief Summary: [descriptionModule]
Eligibility Criteria: [eligibilityModule]

Patient Information: {question}

Context:
{context}

Answer:
"""
)

In [6]:
# Initialize your language model. The same `llm` object is passed both to the
# RetrievalQA chain here and to the agent created in a later cell.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# Create a RetrievalQA chain that answers from documents returned by
# `retriever`, using the custom `prompt_template` instead of the default prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    # chain_type is left at the library default (the explicit setting below is disabled).
    # chain_type="stuff"  # 'stuff' is suitable for simple concatenation
    chain_type_kwargs={"prompt": prompt_template}
)

  llm = ChatOpenAI(model_name="gpt-4o", temperature=0)


In [7]:
def _run_trial_lookup(q):
    """Forward a free-text patient query to the RetrievalQA chain and return its answer."""
    # A plain one-argument wrapper is kept on purpose: it exposes exactly one
    # positional parameter to the Tool, just like the lambda it replaces.
    return qa_chain.run(q)


# The agent's only tool: retrieval-backed clinical-trial lookup.
clinical_trial_tool = Tool(
    name="Clinical Trial Retriever",
    func=_run_trial_lookup,
    description="Use this tool to fetch relevant clinical trials based on patient information.",
)
tools = [clinical_trial_tool]

# Zero-shot ReAct agent that decides when to call the tool; verbose output
# prints the intermediate reasoning and tool calls.
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

  agent = initialize_agent(


In [8]:
# Function to interact with OpenAI API
def generate_patient_data_response(prompt, model="gpt-4o", openai_client=None):
    """Ask a chat model to rewrite extracted patient data as clean, structured text.

    Args:
        prompt: User message sent to the model (the patient data plus instructions).
        model: Chat model name; defaults to the previously hard-coded "gpt-4o".
        openai_client: Object exposing ``chat.completions.create(...)``. When
            omitted, the notebook's module-level ``client`` is used, which is
            the original behaviour.

    Returns:
        The ``content`` of the first choice's message in the completion response.
    """
    # Resolve the client lazily so the function still works with the global
    # defined in the setup cell, yet can be given a substitute client.
    active_client = client if openai_client is None else openai_client
    response = active_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant tasked with reading patient data and outputting only the translated patient data, without including any additional information or explanations. Please structure the output in a clean and organized format."},
            {"role": "user", "content": prompt}
        ],
    )
    return response.choices[0].message.content

# Location of the patient's FHIR bundle. Set the PATIENT_FHIR_PATH environment
# variable to point at another file; the default is the original sample path,
# which only exists on the original author's machine.
patient_fhir_path = os.environ.get(
    "PATIENT_FHIR_PATH",
    "/Users/vijaykirandegala/Downloads/FHIR_STU4_Filtering/Sample_FHIR_Data/STU1/female/Magdalen131_Moen819_e209d094-31e4-4fc0-a314-0a2b2c6feaef.json",
)
extracted_data = extract_patient_data(patient_fhir_path)

# Ask the model for a narrative summary of the extracted data; that summary
# becomes the query handed to the agent.
prompt = f"""Patient data: {extracted_data}. Write a detailed description of the patient’s clinical data to match them to relevant clinical trials. The final paragraph should be well-organized, coherent, and written in a narrative style to help an LLM efficiently retrieve relevant clinical trials through a RAG system.
"""
# The variable name keeps its original spelling so any code that refers to it still works.
readble_patient_data = generate_patient_data_response(prompt)
if not readble_patient_data:
    # An empty or None summary would otherwise reach the agent as the text "None".
    raise ValueError("The model returned no patient summary; cannot query the agent.")
query = f"""{readble_patient_data}"""

# Get the response from the agent
response = agent.run(query)
print(response)

  response = agent.run(query)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo find relevant clinical trials for Moen819, I should use the Clinical Trial Retriever tool, focusing on her active conditions: obesity and malignant neoplasm of the breast. These conditions are the most pertinent for identifying suitable clinical trials.

Action: Clinical Trial Retriever
Action Input: "Female, born 1958, White, obesity, malignant neoplasm of breast, Quincy"[0m
Observation: [36;1m[1;3mBased on the provided patient information and context, the patient is eligible for the following clinical trial:

NCT ID: NCT04365569  
Brief Summary: More than 65% of breast cancer survivors are overweight and less than one-third participate in recommended levels of physical exercise. Obese breast cancer survivors have been found to have greater than a two-fold increase in mortality compared to women with normal body mass index (BMI). The current standard for weight loss interventions involves in-person counseling. However,