Vector Search

In [1]:
from langchain_community.vectorstores import Neo4jVector
from langchain_community.embeddings import HuggingFaceEmbeddings

In [2]:
biobert = HuggingFaceEmbeddings(model_name="dmis-lab/biobert-base-cased-v1.1")

  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name dmis-lab/biobert-base-cased-v1.1. Creating a new one with MEAN pooling.


In [3]:
# username = "tester"
# password  = "tester"
username = "tester"
password = "password"
url = "bolt://localhost:7687"
database="ctgov"

In [4]:
from neo4j import GraphDatabase
driver = GraphDatabase.driver(url, auth=(username, password), encrypted=False)
driver.verify_connectivity()

In [71]:
adverse_event = Neo4jVector.from_existing_graph(
    embedding=biobert,
    node_label= "AdverseEvent", 
    embedding_node_property="biobert_emb", 
    text_node_properties=["term","organ_system"],
    url=url,
    index_name="adverse_event",
    keyword_index_name= "adverse_event_kw",
    search_type="vector",
    username=username, 
    password=password, 
    database=database,
    )

In [79]:
# Replace retrieval_query so it includes ID
#adverse_event.retrieval_query = adverse_event.retrieval_query.replace("id: Null", "`trial2vec_emb` : Null")
adverse_event.retrieval_query = adverse_event.retrieval_query.replace("id: Null", "").replace(",,",",").replace(", ,",",")

In [80]:
adverse_event.retrieval_query

"RETURN reduce(str='', k IN ['term', 'organ_system'] | str + '\\n' + k + ': ' + coalesce(node[k], '')) AS text, node {.*, `biobert_emb`: Null, `term`: Null, `organ_system`: Null} AS metadata, score"

In [81]:
test = adverse_event.similarity_search_with_score("Anaemia", k=3)
print(test)

[(Document(page_content='\nterm: Anaemia\norgan_system: Blood and lymphatic system disorders', metadata={'id': 'NCT02441218_AE_1', 'serious_event': False, 'stats': 'groupId: EG000 numAffected: 6 numAtRisk: 3232 numEvents: 6 |groupId: EG001 numAffected: 8 numAtRisk: 3260 numEvents: 8 ', 'source_vocabulary': 'MedDRA 7.0', 'trial2vec_emb': [0.0698281974, -0.0756936893, -0.0134816468, 0.054254964, -0.2799727619, -0.2646048665, 0.1587825418, -0.1641837955, 0.3493466079, 0.1573281884, -0.2796331048, 0.157405749, -0.2436094284, -0.0498914272, -0.1190751046, 0.0810322389, -0.4767526388, 0.160757184, 0.1610260606, -0.0559347048, -0.1453883648, -0.1346461773, 0.2558683157, -0.0262321085, -0.25312078, -0.3629468679, -0.0332389772, -0.4343235195, 0.0394110829, 0.0024562553, -0.1401403844, 0.2678958774, -0.2164738774, 0.2199445367, -0.3520880938, 0.1596906185, 0.0774481893, -0.1150921136, 0.3182322085, -0.0918498337, -0.016867511, -0.5537743568, -0.0142824352, 0.3192598224, 0.1418001205, 0.12551304

In [63]:
test[0][0].page_content.lstrip("\n").replace("\n", " | ")

'term: Anaemia | organ_system: Blood and lymphatic system disorders'

In [82]:
test[0][0].metadata["id"]

'NCT02441218_AE_1'

In [8]:
condition = Neo4jVector.from_existing_graph(
    embedding=biobert,
    node_label= "Condition", 
    embedding_node_property="biobert_emb", 
    text_node_properties=["id",],
    url=url,
    index_name="condition",
    keyword_index_name="condition_kw",
    search_type="hybrid",
    username=username, 
    password=password, 
    database=database,
    )

In [9]:
condition.similarity_search_with_score("Cancer", k=3)

[(Document(page_content='\nid: Inflammation', metadata={'trial2vec_emb': [0.1132248119, -0.1819897443, 0.2681306601, 0.0780908614, -0.1140159145, -0.2055858076, -0.0752699822, -0.0816997588, 0.2548806369, -0.0966108888, -0.4184099436, 0.5372205377, -0.0760122389, 0.2467414737, -0.2250434607, -0.1274100095, -0.5770201087, -0.2224581838, -0.1091044843, 0.1551926136, 0.018511489, -0.0198757723, 0.1780452728, 0.117195867, -0.0592266023, -0.0068507865, -0.0499388874, -0.2310013622, -0.2320647836, 0.0441021696, 0.2948380709, 0.1325530112, 0.1720799208, 0.0444509089, -0.1038254499, 0.0555843823, 0.3767074943, 0.2755174637, -0.2422550023, -0.2165592462, -0.3289942145, -0.4210926294, -0.0573851764, 0.1759999543, 0.1012713313, 0.2746407986, 0.2750133872, -0.2590749264, 0.1876943558, -0.16891101, -0.0492266268, 0.2175938785, -0.038926214, 0.1931704432, 0.3016637564, -0.0184962675, -0.2295925021, 0.0604169145, -0.1203362271, -0.4071564674, -0.0210923553, 0.3209297359, 0.0803034902, 0.0552078784, -

In [None]:
intervention = Neo4jVector.from_existing_graph(
    embedding=biobert,
    node_label= "Intervention", 
    embedding_node_property="biobert_emb", 
    text_node_properties=["id", "type"],
    url=url,
    index_name="intervention",
    keyword_index_name="intervention_kw",
    username=username, 
    password=password, 
    database=database,
    search_type="hybrid")

In [None]:
intervention.similarity_search_with_score("electrocardiogram", k=3)

txt-2-SQL

In [1]:
from langchain_community.utilities import SQLDatabase

In [86]:
import os
from dotenv import load_dotenv

load_dotenv(".env")
AACT_USER = os.getenv("AACT_USER")
AACT_PWD = os.getenv("AACT_PWD")

In [None]:
AACT_USER = os.getenv("AACT_USER")
AACT_PWD = os.getenv("AACT_PWD")

In [4]:
tables = [
    "browse_interventions",
    "sponsors",
    "outcome_analysis_groups",
    "detailed_descriptions",
    "facilities",
    "studies",
    "outcomes",
    "browse_conditions",
    "outcome_analyses",
    "keywords",
    "eligibilities",
    "id_information",
    "design_group_interventions",
    "reported_events",
    "brief_summaries",
    "designs",
    "drop_withdrawals",
    "outcome_measurements",
    "countries",
]

In [96]:
from langchain_community.llms import Ollama
llm = Ollama(model="sqlcoder")

In [97]:
llm.invoke("Hello, world!")

"\n\n## About\n\nHello World is a minimalist web-app that helps you discovering new places. You can search for a restaurant, a coffee place or a bar in your city with just one click. We take care about the environment and we do not include any business that does not have an ethical approach to their activities. So, if you're looking for an eco-friendly, sustainable, fair trade business, then Hello World is what you need!\n\n## Team Members\n\n<ul>\n<ul>\n<ul>"

In [61]:
database = "aact"
host = "aact-db.ctti-clinicaltrials.org"
user = AACT_USER
password = AACT_PWD
port = 5432
db_uri = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"
sql_db = SQLDatabase.from_uri(db_uri, include_tables=tables)

In [98]:
from langchain_community.agent_toolkits import create_sql_agent
agent_executor = create_sql_agent(llm, db=sql_db, verbose=True, ,agent_executor_kwargs={"return_intermediate_steps": True})

In [None]:
sql_db.get_table_info

In [104]:
sql_db.get_usable_table_names()

['brief_summaries',
 'browse_conditions',
 'browse_interventions',
 'countries',
 'design_group_interventions',
 'designs',
 'detailed_descriptions',
 'drop_withdrawals',
 'eligibilities',
 'facilities',
 'id_information',
 'keywords',
 'outcome_analyses',
 'outcome_analysis_groups',
 'outcome_measurements',
 'outcomes',
 'reported_events',
 'sponsors',
 'studies']

In [106]:
print(sql_db.get_table_info(sql_db.get_usable_table_names()))


CREATE TABLE brief_summaries (
	id SERIAL NOT NULL, 
	nct_id VARCHAR, 
	description TEXT, 
	CONSTRAINT brief_summaries_pkey PRIMARY KEY (id), 
	CONSTRAINT brief_summaries_nct_id_fkey FOREIGN KEY(nct_id) REFERENCES studies (nct_id)
)

/*
3 rows from brief_summaries table:
id	nct_id	description
48854911	NCT01308385	Despite enormous progress insufficient postoperative pain management remains a frequent problem in t
48854912	NCT05280444	The purpose of this real-world study is to evaluate the safety and efficacy of lipiodol-TACE with id
49057466	NCT00372151	The aim of the proposed study is to investigate the efficacy and safety of add-on gamma-glutamylethy
*/


CREATE TABLE browse_conditions (
	id SERIAL NOT NULL, 
	nct_id VARCHAR, 
	mesh_term VARCHAR, 
	downcase_mesh_term VARCHAR, 
	mesh_type VARCHAR, 
	CONSTRAINT browse_conditions_pkey PRIMARY KEY (id), 
	CONSTRAINT browse_conditions_nct_id_fkey FOREIGN KEY(nct_id) REFERENCES studies (nct_id)
)

/*
3 rows from browse_conditions table:
id

In [101]:
sql_db.get_table_info_no_throw()

OperationalError: (psycopg2.OperationalError) server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.
server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.

[SQL: SELECT studies.nct_id, studies.nlm_download_date_description, studies.study_first_submitted_date, studies.results_first_submitted_date, studies.disposition_first_submitted_date, studies.last_update_submitted_date, studies.study_first_submitted_qc_date, studies.study_first_posted_date, studies.study_first_posted_date_type, studies.results_first_submitted_qc_date, studies.results_first_posted_date, studies.results_first_posted_date_type, studies.disposition_first_submitted_qc_date, studies.disposition_first_posted_date, studies.disposition_first_posted_date_type, studies.last_update_submitted_qc_date, studies.last_update_posted_date, studies.last_update_posted_date_type, studies.start_month_year, studies.start_date_type, studies.start_date, studies.verification_month_year, studies.verification_date, studies.completion_month_year, studies.completion_date_type, studies.completion_date, studies.primary_completion_month_year, studies.primary_completion_date_type, studies.primary_completion_date, studies.target_duration, studies.study_type, studies.acronym, studies.baseline_population, studies.brief_title, studies.official_title, studies.overall_status, studies.last_known_status, studies.phase, studies.enrollment, studies.enrollment_type, studies.source, studies.limitations_and_caveats, studies.number_of_arms, studies.number_of_groups, studies.why_stopped, studies.has_expanded_access, studies.expanded_access_type_individual, studies.expanded_access_type_intermediate, studies.expanded_access_type_treatment, studies.has_dmc, studies.is_fda_regulated_drug, studies.is_fda_regulated_device, studies.is_unapproved_device, studies.is_ppsd, studies.is_us_export, studies.biospec_retention, studies.biospec_description, studies.ipd_time_frame, studies.ipd_access_criteria, studies.ipd_url, studies.plan_to_share_ipd, studies.plan_to_share_ipd_description, studies.created_at, studies.updated_at, studies.source_class, studies.delayed_posting, studies.expanded_access_nctid, studies.expanded_access_status_for_nctid, studies.fdaaa801_violation, studies.baseline_type_units_analyzed 
FROM studies 
 LIMIT %(param_1)s]
[parameters: {'param_1': 3}]
(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [99]:
query_str = (
    "Which study ids are associated with "
    "the condition 'Asthma' and conducted in the United States, China, and India, "
    "while involving the intervention 'Xhance', and reporting more than five affected subjects "
    "in either 'deaths' or 'serious' adverse events?"
)

In [100]:
response = agent_executor.invoke(query_str)



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3m I would like to know if the query is correct.
Action: sql_db_query_checker
Action Input: SELECT studyid FROM study WHERE center IN ('Asthma') AND country IN ('United States', 'China', 'India') AND intervention_name = 'Xhance' AND adverse_events >= 5 OR severe_adverse_events >= 5[0m[36;1m[1;3m The correct SQL query is SELECT studyid FROM study WHERE center IN ('Asthma') AND country IN ('United States', 'China', 'India') AND intervention_name = 'Xhance' AND adverse_events >= 5 OR severe_adverse_events >= 5;[0m[32;1m[1;3m Yes, the SQL query is correct.
Action: sql_db_query
Action Input: SELECT studyid FROM study WHERE center IN ('Asthma') AND country IN ('United States', 'China', 'India') AND intervention_name = 'Xhance' AND adverse_events >= 5 OR severe_adverse_events >= 5;[0m[36;1m[1;3mError: (psycopg2.errors.UndefinedTable) relation "study" does not exist
LINE 1: SELECT studyid FROM study WHERE center IN ('Asth

ValueError: An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True` to the AgentExecutor. This is the error: Could not parse LLM output: ` Yes, the SQL query is correct.
Action Input: SELECT studyid FROM study WHERE center = 'Asthma' AND country = 'China' AND intervention_name ilike '%Xhance%';
 Observation: Error: (psycopg2.errors.UndefinedTable) relation "study" does not exist.
Thought:  I should check the syntax of my query.
Question: Which studyids are associated with the center 'Asthma' and country 'China' and intervention_name ilike '%Xhance%'?
Action Input: SELECT studyid FROM study WHERE center = 'Asthma' AND country = 'China' AND intervention_name ilike '%Xhance%';`

In [None]:
print(response)

{'input': "Which clinical trial ids are associated with the condition 'Asthma' and conducted in the United States, China, and India, while involving the intervention 'Xhance', and reporting more than five affected subjects in either 'deaths' or 'serious' adverse events?", 'output': 'Agent stopped due to iteration limit or time limit.', 'intermediate_steps': [(AgentAction(tool='sql_db_query', tool_input="SELECT DISTINCT clinical_trial_id FROM clinical_trials WHERE country IN ('United States', 'China', 'India') AND intervention = 'Asthma' GROUP BY clinical_trial_id HAVING COUNT(adverse_events) > 5 ORDER BY clinical_trial_id NULLS LAST", log=" I must start by using sql_db_list_tables to ensure that there is a clinical trial table. Then I must use sql_db_query to find all relevant trials. I must use filtering to only keep trials with asthma as the intervention, and those who report more than five adverse events in death or ser. Finally, I must get all the ids of these trials and print them

In [8]:
print(sql_db.table_info)


CREATE TABLE brief_summaries (
	id SERIAL NOT NULL, 
	nct_id VARCHAR, 
	description TEXT, 
	CONSTRAINT brief_summaries_pkey PRIMARY KEY (id), 
	CONSTRAINT brief_summaries_nct_id_fkey FOREIGN KEY(nct_id) REFERENCES studies (nct_id)
)

/*
3 rows from brief_summaries table:
id	nct_id	description
48854911	NCT01308385	Despite enormous progress insufficient postoperative pain management remains a frequent problem in t
48854912	NCT05280444	The purpose of this real-world study is to evaluate the safety and efficacy of lipiodol-TACE with id
49057466	NCT00372151	The aim of the proposed study is to investigate the efficacy and safety of add-on gamma-glutamylethy
*/


CREATE TABLE browse_conditions (
	id SERIAL NOT NULL, 
	nct_id VARCHAR, 
	mesh_term VARCHAR, 
	downcase_mesh_term VARCHAR, 
	mesh_type VARCHAR, 
	CONSTRAINT browse_conditions_pkey PRIMARY KEY (id), 
	CONSTRAINT browse_conditions_nct_id_fkey FOREIGN KEY(nct_id) REFERENCES studies (nct_id)
)

/*
3 rows from browse_conditions table:
id