## Part 1: Filtering data

### Part 1a: Rough filtering with Custom Query Signature

In [2]:
from collections import OrderedDict
import re

import polars as pl
from urllib import parse

In [17]:
ALLOWED_PARAMS = ("q", "fields", "scopes")  # need only these to test LLM initially
OPS = ["AND", "OR", "NOT", "(", ")", "?", "*", r"\d+-\d+", '"', "~", "^", "TO", "+"]  # most of ES ops

def parse_encoded_uri(encoded_uri):
    uri = parse.unquote(encoded_uri)
    uri_components = parse.urlparse(uri)

    path = uri_components.path.removesuffix("/")
    if not path.startswith("/v1"):
        path = None

    return path, uri_components

def filter_uri_params(encoded_uri):
    path, uri_components = parse_encoded_uri(encoded_uri)
    params = parse.parse_qsl(uri_components.query)

    params = filter(lambda x: x[0] in ALLOWED_PARAMS, params)

    # hacky use of unquote and quote, can be more efficient by iterating over params
    return f"{path}?{parse.unquote(parse.urlencode(list(params), quote_via=parse.quote))}"

def get_uri_signature(encoded_uri):
    path, uri_components = parse_encoded_uri(encoded_uri)
    params = parse.parse_qs(uri_components.query) # NOTE: qs vs qsl

    # NOTE: we assume len(params["param"]) == 1
    # these are just rough filters
    q_str = params["q"][0] if "q" in params else ""
    ops_pattern = re.compile(r'(' + '|'.join(map(re.escape, OPS)) + r')')
    field_pattern = re.compile(r'([a-zA-Z\._]+)\d*:')

    q_fields = field_pattern.findall(q_str)
    q_ops = ops_pattern.findall(q_str)

    fields = params["fields"][0] if "fields" in params else ""
    scopes = params["scopes"][0] if "scopes" in params else ""
    # take only /v3/query or /v3/gene from the path, then sort and join all other fields
    sig = ("/".join(path.split("/")[:3]), fields, ''.join(sorted(q_ops)), ''.join(sorted(q_fields)), ''.join(sorted(scopes)))
    return tuple(map(lambda s: s.lower(), sig))

print(get_uri_signature("/v1/gene/1023/?fields=symbol,entrezgene&q=(entrezgene:dipeptidyl%20peptidase%20%28DPP%29-4*)"))
print(get_uri_signature("/v1/query?species=human&q=chr3%3A108324182-108324285%3A%20"))
print(get_uri_signature("/v1/query/?species=human&fields=symbol&q=Feasibility+of+leadless+left+ventricular+septal+pacing+with+the+WiSE-CRT+system+to+target+the+left+bundle+branch+area:+A+porcine+model+and+multicenter+patient+experience"))
print(filter_uri_params("/v1/query/?entrezonly=true&species=human&fields=symbol&q=Feasibility+of+leadless+left+ventricular+septal+pacing+with+the+WiSE-CRT+system+to+target+the+left+bundle+branch+area:+A+porcine+model+and+multicenter+patient+experience"))
print(filter_uri_params("/v1/query?q=CHRNA1&scopes=symbol&fields=name,symbol,entrezgene,ensembl.gene,pathway&species=human"))

('/v1/gene', 'symbol,entrezgene', '(())*', 'entrezgene', '')
('/v1/query', '', '', 'chr', '')
('/v1/query', 'symbol', '', 'area', '')
/v1/query?fields=symbol&q=Feasibility of leadless left ventricular septal pacing with the WiSE-CRT system to target the left bundle branch area: A porcine model and multicenter patient experience
/v1/query?q=CHRNA1&scopes=symbol&fields=name,symbol,entrezgene,ensembl.gene,pathway


In [18]:
encoded_uri = "/v3/query/?fields=symbol,entrezgene&q=dipeptidyl%20peptidase%20%28DPP%29-4"
uri = parse.unquote(encoded_uri)
uri_components = parse.urlparse(uri)
parse.urlencode(parse.parse_qsl(uri_components.query), quote_via=parse.quote)

'fields=symbol%2Centrezgene&q=dipeptidyl%20peptidase%20%28DPP%29-4'

In [19]:
parse.urlencode({'fields': 'symbol,entrezgene', 'q': 'dipeptidyl peptidase (DPP)-4'}, quote_via=parse.quote)

'fields=symbol%2Centrezgene&q=dipeptidyl%20peptidase%20%28DPP%29-4'

In [20]:
path, uri_components = parse_encoded_uri("/v3/query?fields=symbol%2Centrezgene&q=1")
parse.parse_qs(uri_components.query)

{'fields': ['symbol,entrezgene'], 'q': ['1']}

In [41]:
import pandas as pd
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=24)

df = pd.read_csv("data/logs/mychem_log_paths.csv")

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [42]:
df.head()

Unnamed: 0,log.path
0,/v1/query?q=lomitapide&fields=chembl.drug_indi...
1,/v1/metadata
2,/v1/metadata
3,/v1/metadata/fields
4,/v1/metadata


In [43]:
# use new.path to construct query_sig
# unquote log.path for easier parsing for the downstream LLM
df["new.path"] = df["log.path"].parallel_apply(filter_uri_params)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9636), Label(value='0 / 9636'))), …

In [44]:
df["query_sig"] = df["new.path"].parallel_apply(get_uri_signature)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9636), Label(value='0 / 9636'))), …

In [45]:
df["log.path"] = df["log.path"].parallel_apply(parse.unquote)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9636), Label(value='0 / 9636'))), …

In [46]:
pl_df = pl.from_pandas(df)
grp = pl_df.group_by("query_sig")

In [47]:
filtered = grp.head(10)  # optionally get longest queries

In [48]:
# make np arrays render on single line in CSV
import numpy as np
np.set_printoptions(threshold=np.inf)
np.set_printoptions(linewidth=np.inf)

filtered.to_pandas().to_csv("data/logs/mychem_filtered.csv", index=False)

### Part 1b: Validating queries with GET

#### We only consider queries with at least one result as they are certainly correct. We may also take a small portion of negative class for more robustness.

In [49]:
import httpx
import pandas as pd
from tqdm import tqdm
import json

df = pd.read_csv("data/logs/mychem_filtered.csv").drop_duplicates(subset=["log.path"])
df.head()

Unnamed: 0,query_sig,log.path,new.path
0,['/v1/query' 'drugcentral.drug_use.reduce_risk...,/v1/query?q=_exists_:drugcentral.drug_use.redu...,/v1/query?q=_exists_:drugcentral.drug_use.redu...
6,['/v1/query' 'drugcentral.drug_use.off_label_u...,/v1/query?q=_exists_:drugcentral.drug_use.off_...,/v1/query?q=_exists_:drugcentral.drug_use.off_...
12,['/v1/query' 'pharmgkb.smiles' '' '_exists_' ''],/v1/query?q=_exists_:pharmgkb.smiles&fields=ph...,/v1/query?q=_exists_:pharmgkb.smiles&fields=ph...
17,['/v1/query' 'pharmgkb.xrefs.pubchem.sid' '' '...,/v1/query?q=_exists_:pharmgkb.xrefs.pubchem.si...,/v1/query?q=_exists_:pharmgkb.xrefs.pubchem.si...
22,['/v1/query' 'gsrs.approvedby' '' '_exists_' ''],/v1/query?q=_exists_:gsrs.approvedby&fields=gs...,/v1/query?q=_exists_:gsrs.approvedby&fields=gs...


In [50]:
def fetch_url(url):
    try:
        response = httpx.get(url)
        return response.status_code, response.text
    except httpx.HTTPStatusError as e:
        return e.response.status_code, str(e)
    except Exception as e:
        return None, str(e)

In [51]:
HOST_NAME = "http://mychem.info"
results = []
for url in tqdm(df["log.path"], desc="Fetching URLs"):
    status_code, content = fetch_url(HOST_NAME + url)
    results.append((url, status_code, content))

Fetching URLs: 100%|██████████████████████████████████████████| 1825/1825 [05:13<00:00,  5.82it/s]


In [52]:
res_df = pd.DataFrame(results, columns=["query", "status", "response"])

In [53]:
results[1]

('/v1/query?q=_exists_:drugcentral.drug_use.off_label_use.snomed_concept_id&fields=drugcentral.drug_use.off_label_use.snomed_concept_id',
 200,
 '{"took":16,"total":729,"max_score":1.0,"hits":[{"_id":"CESYKOGBSMNBPD-UHFFFAOYSA-N","_score":1.0,"drugcentral":{"_license":"http://bit.ly/2SeEhUy","drug_use":{"off_label_use":[{"snomed_concept_id":427649000},{"snomed_concept_id":15771004}]}}},{"_id":"AKUJBENLRBOFTD-QZIXMDIESA-N","_score":1.0,"drugcentral":{"_license":"http://bit.ly/2SeEhUy","drug_use":{"off_label_use":[{"snomed_concept_id":88854002},{"snomed_concept_id":46775006}]}}},{"_id":"BIVUTZYWJNTGDG-GTMVCPGISA-N","_score":1.0,"drugcentral":{"_license":"http://bit.ly/2SeEhUy","drug_use":{"off_label_use":[{"snomed_concept_id":18081009},{"snomed_concept_id":7520000},{"snomed_concept_id":2707005},{"snomed_concept_id":17732003},{"snomed_concept_id":80960004},{"snomed_concept_id":58750007},{"snomed_concept_id":36753006},{"snomed_concept_id":111835002}]}}},{"_id":"AQNDDEOPVVGCPG-UHFFFAOYSA-N"

In [54]:
def create_query_labels(df: pd.DataFrame):
    """Return filtered df of queries with 200 status code and >0 results only"""
    filtered_res = []

    for row in tqdm(df.itertuples()):
        if row.status == 200:
            try:
                resp = json.loads(row.response)
                if resp.get("total", 0) > 0:
                    filtered_res.append({"query": row.query, "hits": resp.get("total", 0)})
            except (json.JSONDecodeError, AttributeError) as e:
                print("Err", row)
                continue

    return pd.DataFrame(filtered_res)

In [55]:
pos_df = create_query_labels(res_df)   

363it [00:00, 497.32it/s]

Err Pandas(Index=238, query='/v1/api', status=200.0, response='<!DOCTYPE html>\n<html class="bg-dark">\n  <head>\n    <title id="title"></title>\n    <meta charset="utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <meta name="HandheldFriendly" content="True" />\n    <meta property="og:locale" content="en_US" />\n    <!-- General Metadata -->\n    <meta property="og:image:type" content="image/jpg">\n    <meta property="og:image:width" content="1200">\n    <meta property="og:image:height" content="630">\n    <meta property="og:type" content="website">\n    <meta property="fb:app_id" content="" />\n    <meta name="twitter:card" content="summary" />\n    <meta name="msapplication-TileColor" content="#000000">\n    <meta name="theme-color" content="#000000">\n    <!-- Unique for each site -->\n    <meta id="featured" property="og:image" content="">\n    <meta id="ogurl" property="og:url" content="">\n    <meta id="url" property="og:site_name" conten

1451it [00:02, 879.09it/s]

Err Pandas(Index=1229, query='/v1/chem/DB00001?fields=drugbank', status=200.0, response='[{"_id":"FIBJDTSHOUXTKV-BRHMIFOHSA-N","_version":1,"drugbank":{"_license":"https://bit.ly/3Hikpvm","accession_number":["BTD00024","BIOD00024"],"cas":"138068-37-8","id":"DB00001","inchi_key":"","name":"Lepirudin","synonyms":["[Leu1, Thr2]-63-desulfohirudin","Desulfatohirudin","Hirudin variant-1","Lepirudin","Lepirudin recombinant","R-hirudin"],"unii":"Y43GF64R34"}},{"_id":"XYWBJDRHGNULKG-OUMQNGNKSA-N","_version":1,"drugbank":{"_license":"https://bit.ly/3Hikpvm","accession_number":[""],"cas":"120993-53-5","id":"DB11095","inchi_key":"","name":"Desirudin","synonyms":["63-Desulfohirudin","63-Desulfohirudin (Hirudo Medicinalis Isoform HV1)","63-Desulfohirudin (recombinant)","Desirudin","Desirudin recombinant","Desirudina","Hirudin desirudin"],"unii":"U0JZ726775"}},{"_id":"CHEMBL1201662","_version":1},{"_id":"CHEMBL1201666","_version":1},{"_id":"OTQCKZUSUGYWBD-BRHMIFOHSA-N","_version":1}]')
Err Pandas(Ind

1825it [00:03, 497.66it/s]


In [56]:
pos_df.to_csv("data/logs/mychem_hits.csv", index=False)

In [None]:
fetch_url(HOST_NAME + "/v3/query?" + "q=ensemblgene:ENSG00000105376 AND HGNC:5348")

## Part 2: Generating instruction-query pairs from a bigger LLM

In [1]:
import os

from llama_cpp import Llama, LLAMA_SPLIT_MODE_NONE, LlamaGrammar
import pandas as pd
from tqdm import tqdm
import json

In [None]:
llm = Llama(
    model_path=os.path.expanduser("/home/atubati/vendor/weights_llama3.1/Llama-3.1-Nemotron-70B-Instruct-HF-Q6_K/Llama-3.1-Nemotron-70B-Instruct-HF-Q6_K-00001-of-00002.gguf"),
    n_gpu_layers=-1, # Uncomment to use GPU acceleration
    # seed=1337, # Uncomment to set a specific seed
    n_ctx=16384, # Uncomment to increase the context window
    # split_mode=LLAMA_SPLIT_MODE_NONE  # Uncomment to use single-GPU
)

llama_model_loader: additional 1 GGUFs metadata loaded.
llama_model_loader: loaded meta data with 44 key-value pairs and 724 tensors from /home/atubati/vendor/weights_llama3.1/Llama-3.1-Nemotron-70B-Instruct-HF-Q6_K/Llama-3.1-Nemotron-70B-Instruct-HF-Q6_K-00001-of-00002.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.1 70B Instruct
llama_model_loader: - kv   3:                       general.organization str              = Meta Llama
llama_model_loader: - kv   4:                           general.finetune str              = Instruct
llama_model_loader: - kv   5:                           general.basename str        

In [3]:
# data = pd.read_csv("data/one_year_unique_sig.csv")["log.path"].values
data = pd.read_csv("data/logs/mygene_hits.csv")["query"].values
data.shape

(4208,)

In [5]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model_name = 'Snowflake/snowflake-arctic-embed-l'
model_kwargs = {"device": "cuda:0"}
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=model_kwargs)

vectorstore = FAISS.load_local(folder_path="data/rag", index_name="faiss_index", embeddings=embeddings, allow_dangerous_deserialization=True)
retriever = vectorstore.as_retriever(search_kwargs={"k": 160})  # 160 to accommodate 10 pairs

def process_retrieved_docs(doc_batches):
    return ["\n\n".join([doc.page_content for doc in doc_batch]) for doc_batch in doc_batches]

In [6]:
with open("data/prompts/datagen_prompt.md") as fd:
    base_prompt = fd.read()

with open("gene_query_docs.txt") as doc_fd:
    docs = doc_fd.read()

# with open("data/original/compact_desc_with_context.csv") as desc_fd:
#     description = desc_fd.read()

In [7]:
output_schema = {
                    "type": "object",
                    "properties": {
                        "instructions": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    # "query": {"type": "string"},
                                    "description": {"type": "string"},
                                    "instruction": {"type": "string"}
                                },
                                "required": ["description", "instruction"]
                            }
                        }
                    },
                    "required": ["instructions"]
                }

In [None]:
file_name = "data/ft/mygene_inst_query_pairs.csv"

BATCH_SIZE = 10  # NOTE: change in prompt also if changed
for idx in tqdm(range(0, len(data), BATCH_SIZE)):
    prompt = f"{base_prompt}\n" + "\n".join(data[idx:(idx + BATCH_SIZE)])
    context = process_retrieved_docs(retriever.batch([prompt]))[0]

    if idx == 0:
        print(prompt, context)

    output = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": f"\nUse the documentation and schema to complete the user-given task. Docs: {docs}\n\nSchema: {context}"},
            {"role": "user", "content": prompt},
        ],
        # grammar=query_grammar
        response_format={
            "type": "json_object",
            "schema": output_schema
        },
    )

    resp = json.loads(output["choices"][0]["message"]["content"])
    pd.DataFrame(resp["instructions"]).to_csv(file_name, mode="a", index=False, header=not os.path.exists(file_name))

  0%|                                                                     | 0/421 [00:00<?, ?it/s]

You need to generate instructions for API queries, which will be paired with each query to fine-tune a smaller LLM assistant. For each of the 10 provided queries, describe what the user is aiming to accomplish with the query. Then, create a concise instruction that would lead the smaller LLM to generate that exact query.

Please follow these requirements:

1. Output strictly in JSON format with two fields: "description" and "instruction".
2. Ensure all instructions are in clear, professional English.
3. Use varied and precise verbs in the descriptions and instructions to enhance the model's generalization abilities. This is crucial!
4. Instructions must be suitable for a small language model, so avoid any actions or requests it cannot fulfill, such as setting alarms or producing non-text outputs.
5. Limit each instruction to a single sentence.

Here are the 10 API queries, one per line:
/v3/query?q=ensembl.gene:ENSG00000001497&fields=transcripts
/v3/query?q=symbol:DAND5
/v3/query?q=sym

CUDA error: out of memory
  current device: 1, in function alloc at /tmp/pip-install-5jh73lxy/llama-cpp-python_154b89b9064e4f7295e5b6693d8361d9/vendor/llama.cpp/ggml/src/ggml-cuda.cu:380
  cuMemCreate(&handle, reserve_size, &prop, 0)
/tmp/pip-install-5jh73lxy/llama-cpp-python_154b89b9064e4f7295e5b6693d8361d9/vendor/llama.cpp/ggml/src/ggml-cuda.cu:106: CUDA error
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
resp = json.loads(output["choices"][0]["message"]["content"])

In [None]:
len(resp["instructions"])

In [None]:
resp["instructions"]

In [None]:
file_name = "data/ft/inst_query_pairs.csv"
pd.DataFrame(resp["instructions"]).to_csv(file_name, mode="a", index=False, header=not os.path.exists(file_name))