In [1]:
from tqdm import tqdm

import numpy as np
import os

from utils_neo4j import Neo4jApp
from utils_llm import GraphRAG
from stark_qa import load_qa, load_skb
from utils import save_dict_to_file, load_dict_from_file, change_qa_dataset_to_tuple
from utils_evaluation import collect_responses

from dotenv import load_dotenv

load_dotenv()


def _require_env(name):
    """Return environment variable ``name``, raising if it is not set.

    ``os.getenv`` returns ``None`` for a missing variable without any warning;
    this helper turns that into an immediate, descriptive error instead.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's value as a string (an empty string is accepted).

    Raises:
        RuntimeError: If ``name`` is absent from the environment.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set "
            "(define it in the environment or in the file read by load_dotenv())."
        )
    return value


dataset_name = os.getenv("dataset_name")
# data_path is combined with the result file name when the results are saved at
# the end of the notebook; if it were None that step would raise a TypeError
# only after the long response-collection run, so check it up front.
data_path = _require_env("data_path")

# Load params for Graph
# scheme / host_name / port are interpolated into the connection URI, where a
# missing value would silently become the literal text "None".
scheme = _require_env("scheme")
host_name = _require_env("host_name")
port = _require_env("port")
# NOTE(review): user / password / database are passed straight to Neo4jApp;
# whether None is acceptable there is not visible here, so they stay optional.
user = os.getenv("user")
password = os.getenv("password")
database = os.getenv("database")


  from .autonotebook import tqdm as notebook_tqdm


Loading from /Users/louislacombe/CODE/quantmetry/stark/prime/processed!


In [2]:
# Load the retrieval dataset (human-generated evaluation set) and pass it
# through the project's tuple-conversion helper in one expression.
qa_dataset = change_qa_dataset_to_tuple(
    load_qa(dataset_name, human_generated_eval=True)
)

Use file from /Users/louislacombe/.cache/huggingface/hub/datasets--snap-stanford--stark/snapshots/7b0352c7dcefbf254478c203bcfdf284a08866ac/qa/prime/stark_qa/stark_qa_human_generated_eval.csv.


In [3]:
# Assemble the connection URI as "<scheme>://<host_name>:<port>", build the
# Neo4jApp client from it and the credentials, then call its connection check
# before any further cell relies on the graph.
uri = "{}://{}:{}".format(scheme, host_name, port)
graph = Neo4jApp(uri, user, password, database)
graph.verify_connection()

In [4]:
# GraphRAG instance bound to the graph client, with both the add_similarity and
# relation_score options enabled (their exact effect is defined in utils_llm).
graph_rag = GraphRAG(
    graph=graph,
    add_similarity=True,
    relation_score=True,
)

In [5]:
# File name under which this run's results are saved; it is combined with
# data_path in the final save cell of the notebook.
name_dict = "07_08_SP_SIM_entity_finder_exact_match_more_sim_60k.json"

In [6]:
# Run collect_responses over the QA dataset with the GraphRAG instance. The
# returned object lives only in kernel memory until the save cell writes it out.
# NOTE(review): the recorded run below took about 25 minutes for 109 questions,
# so a kernel restart before saving loses all of this work.
more_results = collect_responses(qa_dataset, graph_rag)


  0%|          | 0/109 [00:00<?, ?it/s]

len of before limit and question similarity:  [9, 7, 7]
question:  I have pharyngitis and chemosis. What skin disease might I have?
number of tokens:  19421


  1%|          | 1/109 [00:12<22:00, 12.23s/it]

len of before limit and question similarity:  [5, 7, 7]
question:  mixed mucinous and nonmucinous bronchioloalveolar adenocarcinoma is a subtype of what disease?
number of tokens:  23017


  2%|▏         | 2/109 [00:25<22:50, 12.81s/it]

len of before limit and question similarity:  [5, 5, 7, 5, 5, 5, 5, 5]
question:  Which pathway is a promising therapeutic target in breast cancer that renders the protein insensitive to inhibition by the WNT antagonist DKK1 and interacts with KREMEN, DKK family genes?
number of tokens:  57891


  3%|▎         | 3/109 [00:50<32:38, 18.47s/it]

len of before limit and question similarity:  [5, 5, 7, 5, 5]
question:  Which gene is involved in vesicle transport and locate in kinetochore, and in the pathway of antigen processing?
number of tokens:  28264


  4%|▎         | 4/109 [01:06<30:35, 17.48s/it]

len of before limit and question similarity:  [5, 5, 5, 9, 5, 5]
question:  What drug with ~300 Da weight that stimulates the immune system has synergy with a few different acids?
number of tokens:  42305


  5%|▍         | 5/109 [01:23<29:54, 17.25s/it]

len of before limit and question similarity:  [5, 7, 7, 7, 5, 5]
question:  Which gene is present in endometrium, adipose tissue, colon and is a pseudogene on chromosome 9?
number of tokens:  42020


  6%|▌         | 6/109 [02:05<44:04, 25.68s/it]

len of before limit and question similarity:  [5, 5, 7]
question:  What variant of sugar (yum!) hits a protein involved in carbohydrate binding?
number of tokens:  17597


  6%|▋         | 7/109 [02:12<33:28, 19.69s/it]

len of before limit and question similarity:  [7, 5]
question:  My friend has been prescribed Tasonermin, what diseases might they have?
number of tokens:  8247


  7%|▋         | 8/109 [02:19<25:55, 15.40s/it]

len of before limit and question similarity:  [9]
question:  Why do prostate cancer cases rise in agriculturally intensive areas in California?
number of tokens:  5067


  8%|▊         | 9/109 [02:25<20:47, 12.48s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 7]
question:  What is the gene that causes significant bleeding if defected and interact with cellular response to manganese ion and ATP binding?
number of tokens:  26613


  9%|▉         | 10/109 [02:38<20:57, 12.70s/it]

len of before limit and question similarity:  [5, 5, 7]
question:  I remember I heard of a drug whos name sounds like "depression" and is used to treat nocturia. What is it?
number of tokens:  8104


 10%|█         | 11/109 [02:48<19:19, 11.83s/it]

len of before limit and question similarity:  [5, 7, 5, 9, 7, 7]
question:  Which HIV drug targets CCR5 and has side effects like Alopecia, Syncope, Chronic sinusitis?
number of tokens:  33737


 11%|█         | 12/109 [03:01<19:59, 12.36s/it]

len of before limit and question similarity:  [7, 7]
question:  I have visual loss and abnormal electroretinogram, what I got?
number of tokens:  12285


 12%|█▏        | 13/109 [03:08<17:09, 10.73s/it]

len of before limit and question similarity:  [5, 7, 5]
question:  I was diagnosed with RA and i want to avoid elevated hepatic transaminase, which drug should I avoid?
number of tokens:  10081


 13%|█▎        | 14/109 [03:15<15:06,  9.55s/it]

len of before limit and question similarity:  [9, 5, 5]
question:  Is there a new formulation of ciprofloxacin to be used as radioimaging agent for diagnosis?
number of tokens:  13153


 14%|█▍        | 15/109 [03:24<14:42,  9.39s/it]

len of before limit and question similarity:  [5, 5, 5, 5]
question:  A coding mutation in an oligomer that interacts with a lysomsomal transport pathways would disrupt which pathway?
number of tokens:  22325


 15%|█▍        | 16/109 [03:34<14:59,  9.67s/it]

len of before limit and question similarity:  [7, 7]
question:  What type of disease is a paraphilic disorder?
number of tokens:  10136


 16%|█▌        | 17/109 [03:40<13:03,  8.52s/it]

len of before limit and question similarity:  [5, 7, 7]
question:  This protein interacts with HBQ1 and SIRT5
number of tokens:  14625


 17%|█▋        | 18/109 [03:46<11:53,  7.84s/it]

len of before limit and question similarity:  [7, 5, 5, 5]
question:  My dad said his head hurts and his muscles have been feeling really weak. He also told me he saw stars yesterday. What might be wrong with him?
number of tokens:  29620


 17%|█▋        | 19/109 [03:59<13:58,  9.31s/it]

len of before limit and question similarity:  [5, 7]
question:  What is a drug to treat sclerosing cholangitis
number of tokens:  6055


 18%|█▊        | 20/109 [04:05<12:16,  8.28s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 7, 7, 7]
question:  Which gene involve in the process of action potentials in vertebrate neurons after AHP, and is present in numerous cellular contexts such as amygdala, epithelium,  cerebral cortex, etc
number of tokens:  53313


 19%|█▉        | 21/109 [04:24<16:59, 11.58s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 7]
question:  I have inflammation in my gums, and it turns sollwne and puffy. Which disease could potentially be the issue?
number of tokens:  32319


 20%|██        | 22/109 [04:39<17:59, 12.40s/it]

len of before limit and question similarity:  [5, 5, 5, 5]
question:  What drug would you give to a patient whose grandmother suffered from glutathione reductase deficiency?
number of tokens:  19110


 21%|██        | 23/109 [04:49<16:57, 11.84s/it]

len of before limit and question similarity:  [5, 5, 5, 9]
question:  Is there an Fe-containing compound that works with an antibiotic that is contraindicated for gout?
number of tokens:  18575


 22%|██▏       | 24/109 [05:01<16:41, 11.78s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 9, 5, 5, 7, 7, 7, 7]
question:  I am a doctor, and I have this patient with the following symptoms: a lack of T lymphocytes, lack of immunodeficiency, severe respiratory infections. The patient has the following biomarkers enriched: JAK3, RAG2, CTLA4. What disease is this?
number of tokens:  61370


 23%|██▎       | 25/109 [05:31<24:01, 17.16s/it]

len of before limit and question similarity:  [7, 7, 7, 7, 7, 7, 7, 7]
question:  Which drug has a synergistic interaction with Cloranolol, Hydrolyzed Cephalothin, Vorapaxar, Guanethidine, Ertapenem, Lomerizine, Cadralazine, and Pentobarbital?
number of tokens:  44960


 24%|██▍       | 26/109 [05:48<24:00, 17.35s/it]

len of before limit and question similarity:  [5, 7, 7, 7]
question:  This protein is found in the vastus lateralis and cingulate cortex and intestine
number of tokens:  32745


 25%|██▍       | 27/109 [06:00<21:27, 15.70s/it]

len of before limit and question similarity:  [7, 7]
question:  Is MTND5P11 expressed in any part of the brain?
number of tokens:  8023


 26%|██▌       | 28/109 [06:10<18:44, 13.89s/it]

len of before limit and question similarity:  [7, 7, 7, 5]
question:  I have GJB2, GJA1, GJB1 enriched, which pathway is this?
number of tokens:  15236


 27%|██▋       | 29/109 [06:17<15:58, 11.99s/it]

len of before limit and question similarity:  [5, 5, 7, 7, 7]
question:  Which gene is involved in chromosome positioning, and interact with MTUS1, SLC25A2, and PDE3A?
number of tokens:  24057


 28%|██▊       | 30/109 [06:30<15:57, 12.12s/it]

len of before limit and question similarity:  [5, 5, 5, 5]
question:  What's an amino acid variant that can target a protein in rod cells in my patient's eyes?
number of tokens:  25349


 28%|██▊       | 31/109 [06:40<14:49, 11.40s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 5, 5, 5]
question:  I think I have the flu but it's not flu season. Also I'm really tired and my temperature is over 100. I was recently in France and drank a lot of tap water there. Any idea why I might be sick?
number of tokens:  47446


 29%|██▉       | 32/109 [06:58<17:22, 13.54s/it]

len of before limit and question similarity:  [7, 5]
question:  TBXAS1 can be affected by what drug?
number of tokens:  5482


 30%|███       | 33/109 [07:05<14:46, 11.67s/it]

len of before limit and question similarity:  [5, 5, 7]
question:  The production of interleukin 10 (IL-10) contributes to the development of which disease?
number of tokens:  13097


 31%|███       | 34/109 [07:14<13:21, 10.69s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 9, 5, 5, 5, 5, 5, 7, 7, 7]
question:  I am a doctor and I ran NICU and I have a baby with severe infections in pneumonia, diiarrhea and skin rashes, deafness. The blood test shows increased marker of IL7R, CD3D. Which disease is this?
number of tokens:  61404


 32%|███▏      | 35/109 [07:52<23:26, 19.00s/it]

len of before limit and question similarity:  [5, 5, 7]
question:  Which gene is intronless, and is associated with Hermansky-Pudlak syndrome when disrupted?
number of tokens:  12598


 33%|███▎      | 36/109 [08:02<19:37, 16.13s/it]

len of before limit and question similarity:  [5, 7, 7, 7, 7]
question:  Which gene is evolved to respond to gram-negative bacterial infections and interact with CD14, PSMA3, TSC22D4?
number of tokens:  26915


 34%|███▍      | 37/109 [08:19<19:41, 16.40s/it]

len of before limit and question similarity:  [7, 5, 5, 5]
question:  A ROS1 tumor mutation might lead to which carcinomas?
number of tokens:  19164


 35%|███▍      | 38/109 [08:31<17:52, 15.10s/it]

len of before limit and question similarity:  [5, 7, 7, 7, 5]
question:  What is a gene target of Cyanocobalamin that encodes for GTPase activity and protein homodimerization activity in the Propionyl-CoA catabolism pathway?
number of tokens:  32839


 36%|███▌      | 39/109 [08:42<16:09, 13.85s/it]

len of before limit and question similarity:  [5, 5, 7]
question:  What's an example of a gene that is part of the RNA recognition motif family and is involved in transcription coactivator activity?
number of tokens:  11062


 37%|███▋      | 40/109 [08:50<13:53, 12.09s/it]

len of before limit and question similarity:  [7, 7, 7, 5]
question:  Which pathway involves APEX1, POLB and locates in nucleoplasm and is part of the resolution of AP sites?
number of tokens:  21745


 38%|███▊      | 41/109 [09:01<13:24, 11.83s/it]

len of before limit and question similarity:  [5, 5, 7, 5, 5, 5]
question:  Which protein is involved in regulation of keratinocyte differentiation and transcription by RNA polymerase II and is in chromosome 17?
number of tokens:  39978


 39%|███▊      | 42/109 [09:16<14:16, 12.78s/it]

len of before limit and question similarity:  [7, 5, 5, 5]
question:  I'm currently taking tremelimumab for HIV. Do you know of any antibody-like molecules that would work well with my current treatment?
number of tokens:  14398


 39%|███▉      | 43/109 [09:27<13:22, 12.15s/it]

len of before limit and question similarity:  [5, 7]
question:  What is a membrane-crossing protein that interacts with RETREG3?
number of tokens:  5018


 40%|████      | 44/109 [09:32<10:55, 10.09s/it]

len of before limit and question similarity:  [7, 5]
question:  Hermansky-Pudlak syndrome is associated with what protein?
number of tokens:  11696


 41%|████▏     | 45/109 [09:38<09:26,  8.86s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 5, 5]
question:  An expecting couple performed genotyping on their unborn child and discovered SNPs on chromosome 16. The fetus displays some abnormal bone developement. What can the obstetrician report to the parents?
number of tokens:  43064


 42%|████▏     | 46/109 [09:53<11:16, 10.74s/it]

len of before limit and question similarity:  [7, 5]
question:  CSNK1A1 interacts with what other protein?
number of tokens:  9693


 43%|████▎     | 47/109 [09:59<09:33,  9.25s/it]

len of before limit and question similarity:  [5, 5, 7]
question:  What diseases is exposure to 2,3',4,4',5-pentachlorobiphenyl associated with?
number of tokens:  16197


 44%|████▍     | 48/109 [10:06<08:39,  8.52s/it]

len of before limit and question similarity:  [5, 5]
question:  Where in the cell will I find immunoglobulin lambda variable 2-18?
number of tokens:  5158


 45%|████▍     | 49/109 [10:11<07:39,  7.65s/it]

len of before limit and question similarity:  [7, 5, 5]
question:  PAN2 and various CNOTs are all involved in what biological process?
number of tokens:  12211


 46%|████▌     | 50/109 [10:21<08:03,  8.20s/it]

len of before limit and question similarity:  [7, 5, 7, 5, 7, 5, 5]
question:  Which skin disease is caused by a mutation in AAGAB or COL14A1 gene, and super rare, with  epidermal hyperkeratosis in palms and soles?
number of tokens:  61311


 47%|████▋     | 51/109 [10:46<12:46, 13.22s/it]

len of before limit and question similarity:  [5, 5, 7]
question:  A doctor might prescribe what drug in addition to Atosiban?
number of tokens:  9275


 48%|████▊     | 52/109 [10:53<10:48, 11.37s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 7]
question:  Which gene is part of a post-splicing multiprotein complex, exhibit RNA helicase and hydrolase, present only in the cytoplasm?
number of tokens:  26035


 49%|████▊     | 53/109 [11:03<10:27, 11.21s/it]

len of before limit and question similarity:  [5, 7, 5, 7, 7]
question:  Which drug can treat migraine by blocking CGRP binding and targets CALCRL, CALCB?
number of tokens:  23045


 50%|████▉     | 54/109 [11:16<10:39, 11.63s/it]

len of before limit and question similarity:  [7, 5, 5]
question:  GTF2F2 initiates transcription with what polymerase?
number of tokens:  11710


 50%|█████     | 55/109 [11:26<09:55, 11.02s/it]

len of before limit and question similarity:  [7, 7, 7, 7, 7, 7, 7, 7, 7, 5]
question:  RPS6KA4, TUBA1A, KIF4A, CLTC, SHTN1, ACTB, SH3GL2, RPS6KA6, and TUBB8 are part of which pathway?
number of tokens:  62292


 51%|█████▏    | 56/109 [11:52<13:47, 15.61s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 9, 5]
question:  My mom got into ICU unfortunately and had ventilator on. The doctor said she got COPD and pneumonia, which drug should the doctor give my mom?
number of tokens:  34136


 52%|█████▏    | 57/109 [12:07<13:29, 15.56s/it]

len of before limit and question similarity:  [7, 5, 5, 5, 7]
question:  I'm on prednisolone for juvenile arthritis. What's a possible protein that my drug is targeting based on its relation to this disease?
number of tokens:  31387


 53%|█████▎    | 58/109 [12:20<12:20, 14.52s/it]

len of before limit and question similarity:  [5, 5, 5]
question:  Are there genes on chromosome 12 involved in gene transcription?
number of tokens:  12851


 54%|█████▍    | 59/109 [12:31<11:22, 13.65s/it]

len of before limit and question similarity:  [5, 7, 5]
question:  I have severe insomnia and I have pyoureter, what insomnia drugs that I CANNOT have?
number of tokens:  9923


 55%|█████▌    | 60/109 [12:40<10:00, 12.25s/it]

len of before limit and question similarity:  [5, 7, 7, 7, 7, 7, 7]
question:  Which gene is present in spinal cord, nasal cavity mucosa, intestine, biceps brachii, tendon and involved in cell migration?
number of tokens:  61352


 56%|█████▌    | 61/109 [13:02<12:08, 15.17s/it]

len of before limit and question similarity:  [7, 5, 7]
question:  BTNL8 is part of which pathway of the adaptive Immune System?
number of tokens:  16474


 57%|█████▋    | 62/109 [13:11<10:27, 13.34s/it]

len of before limit and question similarity:  [7, 5, 5]
question:  The protein encoded by HIF3A is associated with negative regulation of what?
number of tokens:  18035


 58%|█████▊    | 63/109 [13:20<09:10, 11.97s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 9, 7, 7, 7, 7, 5]
question:  I have a patient with the following symptoms: young, tropical regions, resttrctive cardimyopathy, hypertesnion,  Prolonged QRS complex, Left atrial enlargement. What disease should I diagnose?
number of tokens:  61560


 59%|█████▊    | 64/109 [13:46<12:15, 16.33s/it]

len of before limit and question similarity:  [7, 5, 5, 5, 5]
question:  I have a cardiovascular disease and my WGS result tells me that I have a mutation in TBXAS1 gene. Which drug should I have?
number of tokens:  24528


 60%|█████▉    | 65/109 [14:03<12:03, 16.45s/it]

len of before limit and question similarity:  [7, 5]
question:  Are there any rare diseases related to ERF?
number of tokens:  8201


 61%|██████    | 66/109 [14:12<10:07, 14.14s/it]

len of before limit and question similarity:  [5, 5, 7, 5]
question:  Which gene is related to platelet function and is involved in rRNA catabolic process and if disrupted, can cause excessive blooding?
number of tokens:  19469


 61%|██████▏   | 67/109 [14:22<09:08, 13.05s/it]

len of before limit and question similarity:  [7, 7]
question:  bronchogenic carcinoma is a subtype of what disease?
number of tokens:  9364


 62%|██████▏   | 68/109 [14:29<07:31, 11.02s/it]

len of before limit and question similarity:  [7, 7, 5]
question:  My doctor told me that I have a disease whose cause is unknown but that can lead to heart failure. She also told me that I have unusual brain lactate levels, whatever that means. Can you guess what I'm sick with?
number of tokens:  17234


 63%|██████▎   | 69/109 [14:39<07:09, 10.74s/it]

len of before limit and question similarity:  [7, 5]
question:  Chlorhexidine can treat what gum disease?
number of tokens:  5853


 64%|██████▍   | 70/109 [14:44<05:50,  8.98s/it]

len of before limit and question similarity:  [5, 5, 5, 7, 7, 5]
question:  Which gene locates in chomorosme 4, and is part of the complex that initiates GPI in the brain and is associated with hypercoagulability syndrome?
number of tokens:  42851


 65%|██████▌   | 71/109 [15:04<07:55, 12.51s/it]

len of before limit and question similarity:  [5, 5, 5, 7, 7, 7]
question:  I have a patient with WGS result showing mutaiton in KRT85 gene, and has Atrichia, Photophobia, and Sparse hair, which does he got?
number of tokens:  35798


 66%|██████▌   | 72/109 [15:25<09:13, 14.97s/it]

len of before limit and question similarity:  [5, 5, 7, 5]
question:  I am interested in genes implicated in visual disorders such as myopia that relate to retinoid processing
number of tokens:  18551


 67%|██████▋   | 73/109 [15:38<08:34, 14.28s/it]

len of before limit and question similarity:  [5, 5, 7, 7, 5]
question:  Which gene encodes the GIP-GnT complex and is present in temporal lobe and prefrontal cortex and associated with hyperphosphatasia?
number of tokens:  26892


 68%|██████▊   | 74/109 [15:51<08:04, 13.84s/it]

len of before limit and question similarity:  [5, 5]
question:  Acute Lymphocytic Leukemia is associated with what gene?
number of tokens:  7780


 69%|██████▉   | 75/109 [15:56<06:25, 11.35s/it]

len of before limit and question similarity:  [7, 7, 5]
question:  My breast cancer is expressing too much MAP3K1. What drug can I take to help?
number of tokens:  11254


 70%|██████▉   | 76/109 [16:03<05:27,  9.93s/it]

len of before limit and question similarity:  [9, 7]
question:  Amoxicillin is often used to treat what disease?
number of tokens:  9498


 71%|███████   | 77/109 [16:09<04:39,  8.75s/it]

len of before limit and question similarity:  [5, 5, 7]
question:  Are there any investigational drugs targeting the same gene fumagillin?
number of tokens:  13821


 72%|███████▏  | 78/109 [16:19<04:48,  9.29s/it]

len of before limit and question similarity:  [5, 5]
question:  What environmental factors might impact microrna 222?
number of tokens:  5161


 72%|███████▏  | 79/109 [16:27<04:20,  8.70s/it]

len of before limit and question similarity:  [5, 7]
question:  What are the challenges of treating a FLT3 mutation in Acute Myeloid Leukemia
number of tokens:  8356


 73%|███████▎  | 80/109 [16:34<03:58,  8.22s/it]

len of before limit and question similarity:  [5, 5, 5]
question:  What is a gene involved in transcription by RNA polymerase II
number of tokens:  10534


 74%|███████▍  | 81/109 [16:43<03:55,  8.40s/it]

len of before limit and question similarity:  [5, 5, 7, 5, 5]
question:  If a patient can't metabolize vitamins and cofactors, what's an example of a more specific pathway that might be interrupted?
number of tokens:  35856


 75%|███████▌  | 82/109 [16:58<04:40, 10.40s/it]

len of before limit and question similarity:  [5, 5, 5, 9, 5]
question:  Complex machine learning methods like alpha fold could help scientists study protein repair and which other pathways?
number of tokens:  33151


 76%|███████▌  | 83/109 [17:11<04:52, 11.24s/it]

len of before limit and question similarity:  [5, 5, 7, 7, 5]
question:  Which gene in chromosme 11 and is iinvolved in heart left ventricle, liver and is a pseudogene?
number of tokens:  36229


 77%|███████▋  | 84/109 [17:38<06:40, 16.00s/it]

len of before limit and question similarity:  [7, 9, 5]
question:  What complex of antibiotics is Lactulose and Warfarin both examples of?
number of tokens:  10795


 78%|███████▊  | 85/109 [17:49<05:51, 14.65s/it]

len of before limit and question similarity:  [5, 5, 7, 5]
question:  What is the name of the drug that both (1) has synergistic interaction with Tenoxicam, and (2) has molecular weight 120.36?
number of tokens:  18470


 79%|███████▉  | 86/109 [17:59<04:59, 13.03s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 5, 5, 5, 5, 5]
question:  Which biological pathway, identified with a stable identifier "R-HSA-8853333" in REACTOME, specifically focuses on the signaling mechanisms of FGFR2 fusions associated with various types of cancers and includes events like dimerization and autophosphorylation of these fusions?
number of tokens:  61132


 80%|███████▉  | 87/109 [18:20<05:40, 15.47s/it]

len of before limit and question similarity:  [5, 5, 5, 7]
question:  What is a human pathway that involves defective translocation of RB1 mutants, is associated with cancer, and has been externally reviewed?
number of tokens:  26050


 81%|████████  | 88/109 [18:29<04:47, 13.69s/it]

len of before limit and question similarity:  [7, 7, 7, 7]
question:  What is a rare type of cancer that can develop in the small intestine, causing symptoms like abdominal pain and weight loss?
number of tokens:  28130


 82%|████████▏ | 89/109 [18:42<04:26, 13.30s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 5, 5]
question:  What signaling pathway is abnormally activated by FLT3 ITD mutations in cancer cells, leading to increased expression of genes that promote cell survival and proliferation?
number of tokens:  31076


 83%|████████▎ | 90/109 [18:57<04:24, 13.90s/it]

len of before limit and question similarity:  [5, 5, 9]
question:  Which drug targets a protein that is associated with acute promyelocytic leukemia
number of tokens:  18011


 83%|████████▎ | 91/109 [19:04<03:33, 11.84s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 5, 9, 5, 5]
question:  What is a group of rare genetic disorders characterized by a distinctive "molar tooth sign" on brain imaging, along with neurological symptoms like developmental delay, intellectual disability, and abnormal eye movements, sometimes accompanied by additional organ system abnormalities?
number of tokens:  61397


 84%|████████▍ | 92/109 [19:27<04:19, 15.26s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 5, 7, 7]
question:  What is the signaling pathway that regulates the development of the testes in the male embryo, and interacts with both protein WT1 and GATA4?
number of tokens:  51659


 85%|████████▌ | 93/109 [19:45<04:16, 16.01s/it]

len of before limit and question similarity:  [5, 5, 7, 7]
question:  What is the name of the gene whose expression is related to intestine, but is not CHD7?
number of tokens:  17447


 86%|████████▌ | 94/109 [19:57<03:41, 14.74s/it]

len of before limit and question similarity:  [5, 7, 7, 7, 7]
question:  Which biological process interacts with GCNT2, STK39, FOXC1 and CHMP4B?
number of tokens:  24645


 87%|████████▋ | 95/109 [20:10<03:19, 14.24s/it]

len of before limit and question similarity:  [5, 7, 5, 5, 5, 7]
question:  What is an investigational small molecule drug that aims to treat multiple sclerosis by blocking the interaction between VCAM-1 and alpha4-integrins to prevent immune cell migration into the central nervous system?
number of tokens:  34374


 88%|████████▊ | 96/109 [20:24<03:05, 14.28s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 5, 5]
question:  What is a gene that hosts a cluster of at least six microRNAs involved in processes like cell survival and proliferation, and is often amplified in some cancers?
number of tokens:  30981


 89%|████████▉ | 97/109 [20:44<03:08, 15.74s/it]

len of before limit and question similarity:  [5, 7, 5, 7, 5, 5, 5, 5, 5, 5, 5, 7, 7, 5]
question:  What signaling pathway is triggered when the innate immune system detects viral RNA in the cytoplasm, leading to the activation of transcription factor IRF7 by the TRAF6 protein complex to induce the production of type I interferons? This pathway helps defend against RNA viruses like influenza, measles, and hepatitis C.
number of tokens:  61602


 90%|████████▉ | 98/109 [21:20<04:02, 22.03s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 5]
question:  What are the initial steps in the assembly of the RNA polymerase II preinitiation complex at a gene promoter? This involves the sequential binding of general transcription factors and ultimately enables transcription to begin.
number of tokens:  24220


 91%|█████████ | 99/109 [21:40<03:33, 21.30s/it]

len of before limit and question similarity:  [7, 5, 5, 5]
question:  Which disease relates to the Neoplasm of the inner ear phenotype?
number of tokens:  26453


 92%|█████████▏| 100/109 [21:53<02:49, 18.86s/it]

len of before limit and question similarity:  [5, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5]
question:  Let's study the genetics of limb development and wonder how mutations in certain genes can lead to limb abnormalities. What is an example of a gene belonging to the homeobox family of transcription factors that, when deleted along with other nearby HOXD genes, is associated with severe limb defects?
number of tokens:  61512


 93%|█████████▎| 101/109 [22:32<03:20, 25.02s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
question:  I am wondering how connective tissues like skin, bone and tendons get their strength and structure. What biochemical pathway involves the synthesis of collagen and its modification by enzymes like hydroxylases to enable the formation of strong collagen fibrils?
number of tokens:  61032


 94%|█████████▎| 102/109 [23:06<03:13, 27.58s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 5, 5, 5]
question:  Imagine you're a detective trying to diagnose a very rare genetic disorder based on a unique combination of symptoms. What syndrome is characterized by the triad of Klippel-Feil anomaly (cervical vertebral fusion), Duane syndrome (bilateral abducens nerve palsy with eye retraction), and congenital deafness?
number of tokens:  6284


 94%|█████████▍| 103/109 [23:21<02:22, 23.75s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 5, 7]
question:  Imagine you're a pharmacologist on a quest to find an experimental drug to treat high blood pressure. What is an investigational adrenergic alpha-1 receptor antagonist that has been studied for the treatment of hypertension?
number of tokens:  22124


 95%|█████████▌| 104/109 [23:32<01:40, 20.13s/it]

len of before limit and question similarity:  [5, 7, 5, 5, 5, 5]
question:  What's the name of the cellular pathway that takes place in the nucleolus, where RNA polymerase I transcribes ribosomal DNA into the 45S precursor RNA that's processed into mature ribosomal RNAs?
number of tokens:  42547


 96%|█████████▋| 105/109 [23:48<01:15, 18.77s/it]

len of before limit and question similarity:  [5, 5, 5, 7, 7, 7, 5, 7, 5, 5, 7, 5, 5, 9]
question:  What cellular pathway involves the repression of E2F target genes like Cyclin A, CDK1, and E2F1 by RBL1 (p107) and RBL2 (p130) in complex with the histone deacetylase HDAC1 during G0 and early G1 phases of the cell cycle?
number of tokens:  61705


 97%|█████████▋| 106/109 [24:19<01:07, 22.44s/it]

len of before limit and question similarity:  [5, 5, 5, 5, 5, 5, 5, 5, 5]
question:  Help me. I am trying to diagnose a patient with persistent joint pain, and I suspect a condition where the bone is dying due to compromised blood supply, often linked to factors like steroid use, alcohol abuse, or underlying diseases - what's the name of this sneaky bone-killing culprit?
number of tokens:  61371


 98%|█████████▊| 107/109 [24:42<00:45, 22.68s/it]

len of before limit and question similarity:  [5, 5]
question:  Which drug targets PDEIVB?
number of tokens:  5025


 99%|█████████▉| 108/109 [24:48<00:17, 17.48s/it]

len of before limit and question similarity:  [5, 5, 7, 5, 5, 5, 5, 9, 9, 9, 5]
question:  What host defense pathway involves proteins like S100A7, S100A8/A9, lactoferrin, and lipocalin-2 sequestering metal ions such as zinc, manganese, and iron to limit their availability to invading microbes?
number of tokens:  61494


100%|██████████| 109/109 [25:15<00:00, 13.90s/it]


In [7]:
# # for item in np.arange(4):
# #     more_results = collect_responses(qa_dataset, graph_rag, dict_=more_results)

In [8]:
# results_dict = load_dict_from_file(data_path + name_dict)
# more_results = collect_responses(qa_dataset, graph_rag, dict_=results_dict)

In [9]:
# Persist the collected responses. os.path.join adds a path separator only when
# data_path does not already end with one, so the file lands inside the results
# directory whether or not the configured path has a trailing slash (plain
# string concatenation would otherwise glue the file name onto the directory
# name).
save_dict_to_file(more_results, os.path.join(data_path, name_dict))

Dictionary saved to ../data/results/07_08_SP_SIM_entity_finder_exact_match_more_sim_60k.json
