In [7]:
# Import statements
import os
import requests
import pandas as pd
from langchain_community.tools import BaseTool
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from simpletransformers.language_representation import RepresentationModel

import pystow

# Both working directories are resolved through the same pystow module key.
_ASSAYCTX_MODULE = pystow.module("AssayCTX")
DATA_DIR = _ASSAYCTX_MODULE.join("data")
LLM_DIR = _ASSAYCTX_MODULE.join("models")

In [2]:
# Define the assay-description embedding tool

class AssayDescription(BaseTool):
    """Tool that appends language-model embeddings of assay descriptions to a table.

    NOTE(review): no ``name``, ``description`` or ``_run`` is declared in this
    class; only the static helper below is visible. Confirm whether the class
    is meant to be instantiated as a tool.
    """

    @staticmethod
    def get_assay_desc(
        df: pd.DataFrame,
        model_type: str = "bert",
        model_name: str = "dmis-lab/biobert-base-cased-v1.2",
        query_col: str = "assay_description",
    ) -> pd.DataFrame:
        """Return ``df`` with one embedding column per dimension appended.

        Each distinct value of ``df[query_col]`` is encoded once and the
        resulting vector is attached to every row carrying that description.

        Args:
            df: Table holding the descriptions to embed.
            model_type: Model family passed to ``RepresentationModel``
                (case-insensitive). Only ``"bert"`` is supported. ``None``
                disables embedding and returns ``df`` unchanged.
            model_name: Model identifier passed to ``RepresentationModel``.
            query_col: Column of ``df`` containing the description text.

        Returns:
            A new frame with the columns of ``df`` followed by the embedding
            columns, named ``"<model_type>_<i>"``. The row index of ``df`` is
            preserved.

        Raises:
            ValueError: If ``model_type`` is not a supported model family.
        """
        # The None check must precede .lower(); otherwise None raises
        # AttributeError and this early return can never be reached.
        if model_type is None:
            return df

        model_type = model_type.lower()
        if model_type not in ["bert"]:
            raise ValueError(f"Unsupported model type: {model_type}")

        # Encode every distinct description only once.
        assay_descriptions = df[query_col].unique().tolist()

        model = RepresentationModel(
            model_type=model_type, model_name=model_name, use_cuda=False
        )
        embeddings = model.encode_sentences(assay_descriptions, combine_strategy="mean")

        # Key the embeddings by description text, then expand them back to one
        # row per input row. Concatenating the unique-description matrix
        # directly would join on index labels and misalign rows whenever
        # descriptions repeat or df does not have a 0..n-1 index.
        embedding_table = pd.DataFrame(embeddings, index=assay_descriptions).add_prefix(
            model_type + '_'
        )
        row_embeddings = embedding_table.reindex(df[query_col].tolist())
        row_embeddings.index = df.index

        return pd.concat([df, row_embeddings], axis=1)

In [3]:
from topic_information import retrieve_topic, topic_information

# Gather the unique AID values of every filtered dataset listed in `names`.
names = ['slcs']
all_ids = []
for name in names:
    df = pd.read_csv(DATA_DIR / f'filtered_{name}.csv')
    chembl_ids = df['AID'].unique().tolist()
    all_ids += chembl_ids

# Look up topics for the collected IDs; `topic_info` is consumed by later
# cells through its `Representation` column.
name = 'slcs'
df_topic = retrieve_topic(all_ids, name)
topic_info = topic_information(df_topic, name)

             chembl_id assay_type  \
776      CHEMBL2346464          B   
2486      CHEMBL806135          B   
2613      CHEMBL755917          B   
3891      CHEMBL680555          B   
4745      CHEMBL747997          B   
...                ...        ...   
1136593  CHEMBL5230261          B   
1136779  CHEMBL3853443          B   
1137234  CHEMBL2317558          B   
1137684  CHEMBL2411005          B   
1139021  CHEMBL1066748          B   

                                               description  olr_cluster_None  
776      Displacement of [3H]-Citalopram from human SER...                 1  
2486     Displacement of [3H]- paroxetine from serotoni...               433  
2613     Inhibition of [3H]norepinephrine uptake at the...               938  
3891     Inhibitory activity against human glycine tran...               875  
4745     Displacement of [3H]-nisoxatine from norepinep...                 1  
...                                                    ...               ...  
11

No sentence-transformers model found with name dmis-lab/biobert-base-cased-v1.2. Creating a new one with mean pooling.


   assay_type  Count in dataset  Topic  Count  \
98          B                 1    909    149   
78          B                 1    632    222   
74          B                 1    585    242   
31          B                 1    168    753   
33          B                 1    186    697   
..        ...               ...    ...    ...   
96          B                93    890    153   
41          B               182    259    547   
59          B               184    433    336   
32          B               258    178    723   
0           B               437      1  14757   

                                       Representation  
98  [locomotor, ulceration, rats, synaptic, locomo...  
78  [sds, page, proteomes, profiling, labeling, ir...  
74  [progesterone, cv, hpr, cotransfection, androg...  
31  [occupancy, ex, autoradiography, vivo, sprague...  
33  [baf3, tel, f3, ba, celltiter, necroptosis, gl...  
..                                                ...  
96  [uptake, gaba, 

In [10]:
from langchain_experimental.chat_models import Llama2Chat

from os.path import expanduser

from langchain_community.llms import LlamaCpp


# Load the local Llama-2 7B chat GGUF file from LLM_DIR through LlamaCpp.
llm = LlamaCpp(
    model_path=f'{LLM_DIR}/llama-2-7b-chat.Q4_0.gguf',
    streaming=False,
)
# Wrap the LlamaCpp instance in Llama2Chat; later cells use it as `model`.
model = Llama2Chat(llm=llm)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /zfsdata/data/linde/AssayCTX/models/llama-2-7b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attent

In [21]:
# Keyword list stored under index 0 of topic_info's Representation column
# (a label-based lookup when the frame's index is integer-typed).
representation = topic_info['Representation'][0]
print(representation)

['displacement', '3h', 'from', 'scintillation', 'counting', '125i', 'membranes', 'dopamine', 'radioligand', 'receptor']


In [22]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI

# Prompt asking the model which experiment type a topic's keywords describe.
prompt_template = "You are a medicinal chemist. I found these keywords {key_words}. Together they refer to a type of pharmacological experiment. Do you know which experiment?"
prompt = PromptTemplate(
    # Must name the placeholder that actually appears in the template.
    input_variables=["key_words"], template=prompt_template
)
llm = model
chain = prompt | llm | StrOutputParser()

# Ask up to `max_guesses` times. `guessed` records whether an accepted answer
# was obtained.
guessed = False
max_guesses = 5
for wrong_guesses in range(1, max_guesses + 1):
    # Pass the template variable explicitly instead of relying on a bare
    # string being wrapped into the prompt's single input variable.
    response = chain.invoke({"key_words": ", ".join(representation)})
    # A reply containing "however" is treated as a non-answer and retried.
    if "however" not in response.lower():
        guessed = True
        print(response)
        break
else:
    # Reached only when no attempt produced an accepted answer.
    print("The chosen llm can not make sense of the input keywords.")

Llama.generate: 146 prefix-match hit, remaining 62 prompt tokens to eval

llama_print_timings:        load time =     379.86 ms
llama_print_timings:      sample time =     155.65 ms /   256 runs   (    0.61 ms per token,  1644.69 tokens per second)
llama_print_timings: prompt eval time =    5688.43 ms /    62 tokens (   91.75 ms per token,    10.90 tokens per second)
llama_print_timings:        eval time =   22082.87 ms /   255 runs   (   86.60 ms per token,    11.55 tokens per second)
llama_print_timings:       total time =   28177.24 ms /   317 tokens


  Thank you for providing me with the keywords related to a pharmacological experiment. Based on these keywords, I can infer that the experiment is likely a displacement experiment using a radioligand such as 125I to measure the binding affinity of a dopamine receptor.
Here's how I arrived at this conclusion:
* "Displacement" refers to the process of competing for a specific binding site on a protein or receptor with another molecule, in this case, 125I.
* "3h" likely indicates the time period over which the experiment is conducted.
* "From" could be used to indicate the starting point of the experiment, but without more context, it's difficult to determine its exact meaning.
* "Scintillation" refers to the measurement of light produced by a radiation detection device, such as a Geiger counter. This suggests that the experiment involves some form of radioactive tracer.
* "Counting" is likely used to indicate the method of measuring the binding affinity of the radioligand to the dopamin