## Part 1: Filtering data

In [None]:
from collections import OrderedDict
import re

import polars as pl
from urllib import parse

In [None]:
# Query-string parameters kept by filter_uri_params; every other parameter is dropped.
# Only these are needed to test the LLM initially.
ALLOWED_PARAMS = ("q", "fields", "species")

# Tokens searched for in the `q` string when building a query signature
# (most of the ES query operators).
# The range entry is a raw string: "\d" inside a normal literal is an invalid
# escape sequence and triggers a warning on recent Python versions. The value
# of the string is unchanged.
OPS = ["AND", "OR", "NOT", "(", ")", "?", "*", r"\d+-\d+", '"', "~", "^", "TO", "+"]

def parse_encoded_uri(encoded_uri):
    """Split a percent-encoded request URI into a decoded path and its parsed components.

    The URI is parsed *before* any percent-decoding. Decoding first would turn
    encoded delimiters inside values (``%26`` -> ``&``, ``%23`` -> ``#``,
    ``%3D`` -> ``=``, ``%2B`` -> ``+``) into live separators, which splits or
    truncates the query string.

    Args:
        encoded_uri: Request path plus optional query string,
            e.g. ``"/v3/query/?q=cdk2&fields=symbol"``.

    Returns:
        A ``(path, uri_components)`` tuple. ``path`` is the percent-decoded path
        with one trailing "/" removed, or ``None`` when it does not start with
        "/v3". ``uri_components`` is the ``urlparse`` result of the URI as
        given; its ``query`` is still encoded and is decoded by
        ``parse.parse_qs`` / ``parse.parse_qsl``.
    """
    uri_components = parse.urlparse(encoded_uri)

    # Only the path is decoded here; the query is left for parse_qs/parse_qsl.
    path = parse.unquote(uri_components.path).removesuffix("/")
    if not path.startswith("/v3"):
        path = None

    return path, uri_components

def filter_uri_params(encoded_uri):
    """Rebuild a request URI keeping only the query parameters in ALLOWED_PARAMS.

    Returns:
        ``"<path>?<query>"`` where ``<path>`` is the path returned by
        parse_encoded_uri (the literal text "None" if that path is None) and
        ``<query>`` holds the kept key/value pairs, in their original order,
        percent-decoded.
    """
    path, uri_components = parse_encoded_uri(encoded_uri)

    kept_pairs = [
        (key, value)
        for key, value in parse.parse_qsl(uri_components.query)
        if key in ALLOWED_PARAMS
    ]

    # Hacky: encode with quote() and immediately unquote() to get a readable
    # query string; iterating over the pairs directly would be more efficient.
    encoded_query = parse.urlencode(kept_pairs, quote_via=parse.quote)
    decoded_query = parse.unquote(encoded_query)
    return f"{path}?{decoded_query}"

def get_uri_signature(encoded_uri):
    """Build a rough, lower-cased signature used to group similar query URIs.

    Args:
        encoded_uri: Request path plus query string.

    Returns:
        A 5-tuple of lower-cased strings:
        ``(path prefix, fields, sorted operators, sorted q field names, sorted species)``.
        The path prefix is the first two path segments (e.g. "/v3/query" or
        "/v3/gene"), or "" when parse_encoded_uri rejects the path.
    """
    path, uri_components = parse_encoded_uri(encoded_uri)
    params = parse.parse_qs(uri_components.query)  # NOTE: qs (dict of lists) vs qsl (list of pairs)

    # NOTE: we assume len(params["param"]) == 1; only the first value is used.
    # These are just rough filters.
    q_str = params["q"][0] if "q" in params else ""
    fields = params["fields"][0] if "fields" in params else ""
    species = params["species"][0] if "species" in params else ""

    # NOTE(review): every OPS entry goes through re.escape, so the `\d+-\d+`
    # entry matches that literal text rather than numeric ranges. Confirm intent.
    ops_pattern = re.compile(r'(' + '|'.join(map(re.escape, OPS)) + r')')
    field_pattern = re.compile(r'([a-zA-Z\._]+)\d*:')

    q_fields = field_pattern.findall(q_str)
    q_ops = ops_pattern.findall(q_str)

    # parse_encoded_uri returns None for paths outside /v3; fall back to an
    # empty prefix instead of raising AttributeError on None.split.
    # Otherwise take only /v3/query or /v3/gene from the path.
    path_prefix = "/".join(path.split("/")[:3]) if path is not None else ""

    # NOTE(review): `species` is a string, so sorted(species) sorts its
    # characters (e.g. "human" -> "ahmnu"), and `fields` is used unsorted.
    # Confirm this is the intended normalisation.
    sig = (
        path_prefix,
        fields,
        ''.join(sorted(q_ops)),
        ''.join(sorted(q_fields)),
        ''.join(sorted(species)),
    )
    return tuple(part.lower() for part in sig)

# Smoke checks on hand-picked URIs: a gene lookup with a trailing slash, an
# encoded range-style query, and a free-text (paper title) query.
_free_text_uri = "/v3/query/?species=human&fields=symbol&q=Feasibility+of+leadless+left+ventricular+septal+pacing+with+the+WiSE-CRT+system+to+target+the+left+bundle+branch+area:+A+porcine+model+and+multicenter+patient+experience"
_signature_examples = (
    "/v3/gene/1023/?fields=symbol,entrezgene&q=(entrezgene:dipeptidyl%20peptidase%20%28DPP%29-4*)",
    "/v3/query?species=human&q=chr3%3A108324182-108324285%3A%20",
    _free_text_uri,
)
for _example_uri in _signature_examples:
    print(get_uri_signature(_example_uri))

# The same free-text URI after dropping parameters outside ALLOWED_PARAMS.
print(filter_uri_params(_free_text_uri))

In [None]:
# Round-trip experiment: decode a URI, parse it, then re-encode its query
# pairs with quote() as the quoting function.
encoded_uri = "/v3/query/?fields=symbol,entrezgene&q=dipeptidyl%20peptidase%20%28DPP%29-4"
uri = parse.unquote(encoded_uri)
uri_components = parse.urlparse(uri)
query_pairs = parse.parse_qsl(uri_components.query)
parse.urlencode(query_pairs, quote_via=parse.quote)

In [None]:
# Encode an explicit dict of already-decoded parameters, passing quote() as
# the quoting function via quote_via.
parse.urlencode({'fields': 'symbol,entrezgene', 'q': 'dipeptidyl peptidase (DPP)-4'}, quote_via=parse.quote)

In [None]:
# Check how parse_qs reports a `fields` value whose comma is percent-encoded (%2C).
path, uri_components = parse_encoded_uri("/v3/query?fields=symbol%2Centrezgene&q=1")
parse.parse_qs(uri_components.query)

In [None]:
import pandas as pd
from pandarallel import pandarallel

# Initialise pandarallel so the .parallel_apply calls in later cells are available.
# NOTE(review): nb_workers=24 is hard-coded; confirm it suits the machine this runs on.
pandarallel.initialize(progress_bar=True, nb_workers=24)

# Request-log export; later cells read and rewrite its "log.path" column.
df = pd.read_csv("data/mygene_report_last_365_days.csv")

In [None]:
# Peek at the first rows of the raw log export.
df.head()

In [None]:
# Plan for this cell and the two that follow:
#   1. new.path  = log.path reduced to the parameters in ALLOWED_PARAMS (this cell)
#   2. query_sig = signature built from new.path
#   3. log.path  is unquoted for easier parsing by the downstream LLM
df["new.path"] = df["log.path"].parallel_apply(filter_uri_params)

In [None]:
# Signature tuple per row, computed from the filtered URI; used as the grouping key below.
df["query_sig"] = df["new.path"].parallel_apply(get_uri_signature)

In [None]:
# Overwrite log.path with its percent-decoded form.
# NOTE: not idempotent. Re-running this cell decodes the column again, and
# re-running the new.path cell afterwards would filter the already-decoded text.
df["log.path"] = df["log.path"].parallel_apply(parse.unquote)

In [None]:
# Keep one representative row per query signature.
pl_df = pl.from_pandas(df)
# maintain_order=True keeps groups in order of first appearance, so the
# exported file has the same row order on every run (the default group order
# is not guaranteed to be stable).
grp = pl_df.group_by("query_sig", maintain_order=True)
filtered = grp.first()  # optionally get longest queries

In [None]:
# make np arrays render on single line in CSV
import numpy as np

# One call sets both options: threshold=inf disables summarisation of long
# arrays, linewidth=inf disables line wrapping.
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

# NOTE(review): this writes to the working directory, while the commented-out
# loader in Part 2 reads "data/one_year_unique_sig.csv". Confirm the intended location.
unique_sig_df = filtered.to_pandas()
unique_sig_df.to_csv("one_year_unique_sig.csv", index=False)

## Part 2: Generating instruction-query pairs from a bigger LLM

In [1]:
import os

from llama_cpp import Llama, LLAMA_SPLIT_MODE_NONE, LlamaGrammar
import pandas as pd

In [2]:
# Load the Llama 3.1 70B Instruct GGUF weights from the current user's home directory.
llm = Llama(
    model_path=os.path.expanduser("~/vendor/weights_llama3.1/Meta-Llama-3.1-70B-Instruct-Q4_K_L.gguf"),
    n_gpu_layers=-1, # GPU offload is enabled; -1 presumably means "all layers" (confirm in llama-cpp-python docs)
    # seed=1337, # Uncomment to set a specific seed
    n_ctx=16384, # Enlarged context window; the prompt built below evaluated to 8933 tokens in the recorded run
    # split_mode=LLAMA_SPLIT_MODE_NONE  # Uncomment to use single-GPU
)

llama_model_loader: loaded meta data with 33 key-value pairs and 724 tensors from /home/atubati/vendor/weights_llama3.1/Meta-Llama-3.1-70B-Instruct-Q4_K_L.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 70B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str              = 70B
llama_model_loader: - kv   6:                            general.license str              = lla

In [3]:
# Input queries for generation, taken from the "log.path" column as an array.
# The commented line is the alternative input ("one_year_unique_sig.csv");
# the active line loads "sample_logs.csv" instead.
# data = pd.read_csv("data/one_year_unique_sig.csv")["log.path"].values
data = pd.read_csv("data/sample_logs.csv")["log.path"].values
data.shape

(10,)

In [4]:
# All three files are read as UTF-8 explicitly; without `encoding`, open()
# uses the platform's locale default, which can differ between machines.

# Instruction template; the batch of queries is appended to it in the generation loop.
with open("data/prompts/datagen_prompt.md", encoding="utf-8") as fd:
    base_prompt = fd.read()

# API documentation text, passed to the model in the system message.
with open("gene_query_docs.txt", encoding="utf-8") as doc_fd:
    docs = doc_fd.read()

# Field description text, passed to the model as the "Schema" in the system message.
with open("data/original/compact_desc.csv", encoding="utf-8") as desc_fd:
    description = desc_fd.read()

In [5]:
# JSON Schema for the model's reply: an object with an "instructions" array
# whose items each carry three required string properties.
_INSTRUCTION_KEYS = ["query", "imperative", "question"]

output_schema = {
    "type": "object",
    "properties": {
        "instructions": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {key: {"type": "string"} for key in _INSTRUCTION_KEYS},
                "required": list(_INSTRUCTION_KEYS),
            },
        },
    },
    "required": ["instructions"],
}

In [6]:
BATCH_SIZE = 10  # NOTE: change in prompt also if changed
for idx in range(0, len(data), BATCH_SIZE):
    # The batch of queries is appended line-wise to the instruction template.
    prompt = f"{base_prompt}\n" + "\n".join(data[idx:(idx + BATCH_SIZE)])
    print(prompt)
    output = llm.create_chat_completion(
        # The system message goes first: chat templates expect the system turn
        # before the user turn it is meant to condition.
        messages=[
            {"role": "system", "content": f"Use this documentation for help: {docs}\n\nand this Schema: {description}"},
            {"role": "user", "content": prompt},
        ],
        # grammar=query_grammar
        # Constrain the reply to JSON matching output_schema.
        response_format={
            "type": "json_object",
            "schema": output_schema
        },
    )
    # Only the first batch is generated; without this break `output` would be
    # overwritten by each later batch.
    break

You need to generate instructions for API queries and the instruction-query pairs will be used to fine-tune a smaller LLM assistant. Generate two instructions for each of the 10 given queries, one imperative and one question-style.

Here are the requirements:
1. Strictly output a JSON.
2. Use the documentation and summary of fields in the database as reference.
3. The instructions must be in English.
4. Use diverse verbs in your responses to generalize better. This is very important!
5. A small language model should be able to complete the instruction. For example, do not ask the assistant to create any visual or audio output. For another example, do not ask the assistant to wake you up at 5pm or set a reminder because it cannot perform any action.
6. Each instruction must be 3 sentences long at the most.

Here are the 10 API queries line-wise:
/v3/query?q=ensemblgene:7087	ICAM5	19	19p13.2	protein-coding	TLCN	HGNC:5348	ENSG00000105376
/v3/query?q=symbol:CDK1+AND+taxid:9606&fields=go.BP


llama_print_timings:        load time =    2602.08 ms
llama_print_timings:      sample time =   15006.67 ms /   939 runs   (   15.98 ms per token,    62.57 tokens per second)
llama_print_timings: prompt eval time =   31112.94 ms /  8933 tokens (    3.48 ms per token,   287.12 tokens per second)
llama_print_timings:        eval time =   84759.76 ms /   938 runs   (   90.36 ms per token,    11.07 tokens per second)
llama_print_timings:       total time =  135948.72 ms /  9871 tokens


In [7]:
# Decode the JSON text carried by the first choice of the chat completion.
import json
resp = json.loads(output["choices"][0]["message"]["content"])

In [8]:
# Sanity check: number of generated entries (compare with the number of queries sent).
len(resp["instructions"])

10

In [9]:
# Inspect the generated query / imperative / question triples.
resp["instructions"]

[{'query': '/v3/query?q=ensemblgene:7087\tICAM5\t19\t19p13.2\tprotein-coding\tTLCN\tHGNC:5348\tENSG00000105376',
  'imperative': 'Find the gene with Ensembl gene ID 7087 and return its symbol, name, chromosome, genomic position, type of gene, alias, HGNC ID, and Ensembl gene ID.',
  'question': 'What are the symbol, name, chromosome, genomic position, type of gene, alias, HGNC ID, and Ensembl gene ID of the gene with Ensembl gene ID 7087?'},
 {'query': '/v3/query?q=symbol:CDK1+AND+taxid:9606&fields=go.BP&species=human&size=1',
  'imperative': 'Find the human gene with symbol CDK1 and return its Gene Ontology Biological Process information.',
  'question': 'What is the Gene Ontology Biological Process information of the human gene with symbol CDK1?'},
 {'query': '/v3/query?q=p53&species=human&fields=symbol,name,entrezgene,pantherdb&size=1',
  'imperative': 'Find the human gene with symbol p53 and return its symbol, name, Entrez gene ID, and PANTHER database information.',
  'question': 

In [None]:
# Persist the generated triples, one row per entry.
# NOTE(review): the model's echoed `query` can differ from the input line (the
# recorded output shows parameters such as `species` and `size` added to a
# query). Consider validating against `data` before using these pairs.
pd.DataFrame(resp["instructions"]).to_csv("data/ft/sample_pairs.csv", index=False)