## Part 1: Filtering data

In [3]:
from collections import OrderedDict
import re

import polars as pl
from urllib import parse

In [14]:
ALLOWED_PARAMS = ("q", "fields", "species")  # need only these to test LLM initially
OPS = ["AND", "OR", "NOT", "(", ")", "?", "*", "\d+-\d+", '"', "~", "^", "TO", "+"]  # most of ES ops

def parse_encoded_uri(encoded_uri):
    uri = parse.unquote(encoded_uri)
    uri_components = parse.urlparse(uri)

    path = uri_components.path.removesuffix("/")
    if not path.startswith("/v3"):
        path = None

    return path, uri_components

def filter_uri_params(encoded_uri):
    path, uri_components = parse_encoded_uri(encoded_uri)
    params = parse.parse_qsl(uri_components.query)

    params = filter(lambda x: x[0] in ALLOWED_PARAMS, params)

    # hacky use of unquote and quote, can be more efficient by iterating over params
    return f"{path}?{parse.unquote(parse.urlencode(list(params), quote_via=parse.quote))}"

def get_uri_signature(encoded_uri):
    path, uri_components = parse_encoded_uri(encoded_uri)
    params = parse.parse_qs(uri_components.query) # NOTE: qs vs qsl

    # NOTE: we assume len(params["param"]) == 1
    # these are just rough filters
    q_str = params["q"][0] if "q" in params else ""
    ops_pattern = re.compile(r'(' + '|'.join(map(re.escape, OPS)) + r')')
    field_pattern = re.compile(r'([a-zA-Z\._]+)\d*:')

    q_fields = field_pattern.findall(q_str)
    q_ops = ops_pattern.findall(q_str)

    fields = params["fields"][0] if "fields" in params else ""
    species = params["species"][0] if "species" in params else ""
    # take only /v3/query or /v3/gene from the path, then sort and join all other fields
    sig = ("/".join(path.split("/")[:3]), fields, ''.join(sorted(q_ops)), ''.join(sorted(q_fields)), ''.join(sorted(species)))
    return tuple(map(lambda s: s.lower(), sig))

print(get_uri_signature("/v3/gene/1023/?fields=symbol,entrezgene&q=(entrezgene:dipeptidyl%20peptidase%20%28DPP%29-4*)"))
print(get_uri_signature("/v3/query?species=human&q=chr3%3A108324182-108324285%3A%20"))
print(get_uri_signature("/v3/query/?species=human&fields=symbol&q=Feasibility+of+leadless+left+ventricular+septal+pacing+with+the+WiSE-CRT+system+to+target+the+left+bundle+branch+area:+A+porcine+model+and+multicenter+patient+experience"))
print(filter_uri_params("/v3/query/?species=human&fields=symbol&q=Feasibility+of+leadless+left+ventricular+septal+pacing+with+the+WiSE-CRT+system+to+target+the+left+bundle+branch+area:+A+porcine+model+and+multicenter+patient+experience"))

ParseResult(scheme='', netloc='', path='/v3/query', params='', query='q=ensemblgene:7087ICAM51919p13.2protein-codingTLCNHGNC:5348ENSG00000105376', fragment='')
/v3/query%3Fq%3Densemblgene%3A7087%09ICAM5%0919%0919p13.2%09protein-coding%09TLCN%09HGNC%3A5348%09ENSG00000105376?


In [None]:
encoded_uri = "/v3/query/?fields=symbol,entrezgene&q=dipeptidyl%20peptidase%20%28DPP%29-4"
uri = parse.unquote(encoded_uri)
uri_components = parse.urlparse(uri)
parse.urlencode(parse.parse_qsl(uri_components.query), quote_via=parse.quote)

In [None]:
parse.urlencode({'fields': 'symbol,entrezgene', 'q': 'dipeptidyl peptidase (DPP)-4'}, quote_via=parse.quote)

In [None]:
path, uri_components = parse_encoded_uri("/v3/query?fields=symbol%2Centrezgene&q=1")
parse.parse_qs(uri_components.query)

In [7]:
import pandas as pd
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=24)

df = pd.read_csv("data/mygene_report_last_365_days.csv")

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [8]:
df.head()

Unnamed: 0,log.path
0,"/v3/gene/558?fields=summary,symbol"
1,"/v3/gene/2047?fields=summary,symbol"
2,"/v3/gene/8438?fields=summary,symbol"
3,"/v3/gene/2064?fields=summary,symbol"
4,"/v3/gene/4609?fields=summary,symbol"


In [9]:
# use new.path to construct query_sig
# unquote log.path for easier parsing for the downstream LLM
df["new.path"] = df["log.path"].parallel_apply(filter_uri_params)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1304447), Label(value='0 / 1304447…

In [10]:
df["query_sig"] = df["new.path"].parallel_apply(get_uri_signature)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1304447), Label(value='0 / 1304447…

In [11]:
df["log.path"] = df["log.path"].parallel_apply(parse.unquote)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1304447), Label(value='0 / 1304447…

In [12]:
pl_df = pl.from_pandas(df)
grp = pl_df.group_by("query_sig")
filtered = grp.first()  # optionally get longest queries

In [15]:
# make np arrays render on single line in CSV
import numpy as np
np.set_printoptions(threshold=np.inf)
np.set_printoptions(linewidth=np.inf)

filtered.to_pandas().to_csv("one_year_unique_sig.csv", index=False)

## Part 2: Generating instruction-query pairs from a bigger LLM

In [1]:
import os

from llama_cpp import Llama, LLAMA_SPLIT_MODE_NONE, LlamaGrammar
import pandas as pd

In [None]:
llm = Llama(
    model_path=os.path.expanduser("~/vendor/weights_llama3.1/Meta-Llama-3.1-70B-Instruct-Q4_K_L.gguf"),
    n_gpu_layers=-1, # Uncomment to use GPU acceleration
    # seed=1337, # Uncomment to set a specific seed
    n_ctx=2048, # Uncomment to increase the context window
    # split_mode=LLAMA_SPLIT_MODE_NONE  # Uncomment to use single-GPU
)

In [2]:
data = pd.read_csv("data/one_year_unique_sig.csv")[""]
df.head()

Unnamed: 0,query_sig,log.path,new.path
0,['/v3/query' 'symbol' '' 'antipsychotics' 'ahm...,/v3/query/?species=human&fields=symbol&q=Cardi...,/v3/query?species=human&fields=symbol&q=Cardio...
1,['/v3/query' '' '' 'codingtlcnhgncensemblgene'...,/v3/query?q=ensemblgene:7087\tICAM5\t19\t19p13...,/v3/query?q=ensemblgene:7087ICAM51919p13.2prot...
2,['/v3/query' '' '' 'ensemblgenepseudogeneglulp...,/v3/query?q=ensemblgene:392305\tGLULP4\t9\t9p1...,/v3/query?q=ensemblgene:392305GLULP499p13.3pse...
3,['/v3/query' '' '' 'ensemblgeneotherpcdhgchgnc...,/v3/query?q=ensemblgene:56118\tPCDHGCT\t5\t5q3...,/v3/query?q=ensemblgene:56118PCDHGCT55q31other...
4,['/v3/query' 'symbol' '()' 'e' 'ahmnu'],/v3/query/?species=human&fields=symbol&q=EMERG...,/v3/query?species=human&fields=symbol&q=EMERGI...


In [None]:
output = llm.create_chat_completion(
    messages=[
        # {"role": "system", "content": f"You are an expert in computational biology. Read the following documentation for further use. {docs}"},
        {"role": "system", "content": f"You are an expert in computational biology. Using your experience, summarize the following fields in 2 sentences found in a unified database."},
        {"role": "user", "content": "1. pantherdb.EcoGene\n2. reporter.MarGene-1_0"}
    ],
    # grammar=query_grammar
)