# EX1

# Setup

In [1]:
import os
import numpy as np
import pandas as pd
import subprocess
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
import chromadb
from itertools import combinations

In [2]:
from BRAD import llms
from BRAD import brad
from BRAD import rag

[nltk_data] Downloading package words to /home/jpic/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jpic/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jpic/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Load LLM + Database

In [3]:
# llm = llms.load_nvidia(nvidia_model = 'meta/llama3-70b-instruct', temperature=0)
llm = llms.load_llama('/nfs/turbo/umms-indikar/shared/projects/RAG/models/llama-2-70b-orca-200k.Q3_K_S.gguf')

In [5]:
llm

LlamaCpp(verbose=False, callbacks=<langchain_core.callbacks.manager.CallbackManager object at 0x14c84de2c950>, client=<llama_cpp.llama.Llama object at 0x14c9502ad890>, model_path='/nfs/turbo/umms-indikar/shared/projects/RAG/models/llama-2-70b-orca-200k.Q3_K_S.gguf', n_ctx=4096, max_tokens=1000, temperature=0.0)

In [12]:
import torch
seed = 0
if torch.cuda.is_available():
    generator = torch.Generator('cuda').manual_seed(seed)
else:
    generator = torch.Generator().manual_seed(seed)

In [14]:
persist_directory = '/nfs/turbo/umms-indikar/shared/projects/RAG/databases/DigitalLibrary-10-June-2024/'
embeddings_model = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5', model_kwargs = {'device': 'cpu'}) #, model_kwargs={'CUDA_LAUNCH_BLOCKING':1})
db_name = "DigitalLibrary"
_client_settings = chromadb.PersistentClient(path=(persist_directory + db_name))
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings_model, client=_client_settings, collection_name=db_name)

Tue Jun 11 10:08:50 2024 INFO Load pretrained SentenceTransformer: BAAI/bge-base-en-v1.5
Tue Jun 11 10:08:52 2024 INFO Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Tue Jun 11 10:08:53 2024 INFO Collection DigitalLibrary is not created.


## Code for Running the Experiment

In [20]:
def runPrompt(prompt,
              llm,                                         # preloaded LLM
              vectordb,                                    # preloaded vector database
              numArticles      = 5,                        # Number of documents used with RAG
              numHidden        = 2,                        # Number of unused documents with RAG
              numNoRagCalls    = 3,                        # Number of trials without the RAG
              ragOutputFile    ='EX-10-JUNE-2024-RAG.csv', # Outputs of the LLM with the RAG
              llmOnlyOutputFile='EX-10-JUNE-2024-LLM.csv'):# Outputs of the LLM without the RAG
    # Database look up
    documentSearch = vectordb.similarity_search_with_relevance_scores(prompt, k=numArticles)
    docs, scores = rag.getDocumentSimilarity(documentSearch)

    # Make the RAG chain
    chain = load_qa_chain(llm, chain_type="stuff", verbose = True)

    # Output information
    outputs = {
        'prompt':[],
        'response':[],
    }
    for i in range(numHidden):
        outputs['hiddenText-' + str(i)] = []
        outputs['hiddenRef-' + str(i)] = []
        outputs['hiddenScore-' + str(i)] = []
    for i in range(numArticles - numHidden):
        outputs['usedText-' + str(i)] = []
        outputs['usedRef-' + str(i)] = []
        outputs['usedScore-' + str(i)] = []

    # Iterate over combinations of documents
    for usedDocsIdxs in list(combinations(np.arange(numArticles), numArticles-numHidden)):
        print(usedDocsIdxs)
        # Organize docs
        usedDocs, usedScores, hiddenDocs, hiddenScores = [], [], [], []
        for idx in range(numArticles):
            if idx in usedDocsIdxs:
                usedDocs.append(docs[idx])
                usedScores.append(scores[idx])
            else:
                hiddenDocs.append(docs[idx])
                hiddenScores.append(scores[idx])

        # Call LLM
        response   = chain({"input_documents": usedDocs, "question": prompt})

        # Organize the output
        outputs['prompt'].append(prompt)
        outputs['response'].append(response['output_text'])
        usedI, hiddenI = 0, 0
        for idx in range(numArticles):
            if idx in usedDocsIdxs:
                outputs['usedText-'  + str(usedI)].append(docs[idx].page_content)
                outputs['usedRef-'   + str(usedI)].append(docs[idx].metadata)
                outputs['usedScore-' + str(usedI)].append(scores[idx])
                usedI += 1
            else:
                print('Hidden=' + str(idx))
                outputs['hiddenText-'  + str(hiddenI)].append(docs[idx].page_content)
                outputs['hiddenRef-'   + str(hiddenI)].append(docs[idx].metadata)
                outputs['hiddenScore-' + str(hiddenI)].append(scores[idx])
                hiddenI += 1
        print(hiddenI)
        print(usedI)

    # Save RAG output to file
    df = pd.DataFrame(outputs)
    if os.path.isfile(ragOutputFile):
        # File exists, append to it
        df.to_csv(ragOutputFile, mode='a', header=False, index=False)
    else:
        # File does not exist, create a new file
        df.to_csv(ragOutputFile, mode='w', header=True, index=False)

    # No RAG
    outputs = {
        'prompt':[prompt],
    }
    for i in range(numNoRagCalls):
        # Call LLM
        response   = chain({"input_documents": [], "question": prompt})
        outputs['response-'+str(i)] = response['output_text']

    # Save RAG output to file
    df = pd.DataFrame(outputs)
    if os.path.isfile(llmOnlyOutputFile):
        # File exists, append to it
        df.to_csv(llmOnlyOutputFile, mode='a', header=False, index=False)
    else:
        # File does not exist, create a new file
        df.to_csv(llmOnlyOutputFile, mode='w', header=True, index=False)



In [16]:
prompt = 'Compare and contrast the utility of dynamical systems and control theory perspectives relative to machine learning models of cellular reprogramming. What are the advantages and disadvantages of each perspective and which data are utilized in each framework?'
documentSearch = vectordb.similarity_search_with_relevance_scores(prompt, k=5)
docs, scores = rag.getDocumentSimilarity(documentSearch)

In [17]:
docs

[Document(page_content='a classical prob-\nlem of control theory (3). The difﬁculty arises in the fact that\nthe dynamics—and even proper representations of the cell state\nand inputs—are not well deﬁned in the context of cellular repro-\ngramming. Nevertheless, it seems natural to treat reprogram-\nming as a problem in control theory, with the ﬁnal state being\nthe desired reprogrammed cell. In this paper, we provide such a\nframework based on empirical data and demonstrate the poten-tial of this framework to provide insights into cellular repro-\ngramming (4).\nOur goal is to mathematically identify TFs that can directly\nreprogram human ﬁbroblasts into a desired target cell type. As\npart of our methodology, we create a model for the natural\ndynamics of proliferating human ﬁbroblasts, using time series\ndata collected throughout the cell cycle. We couple data from\nbioinformatics with methods of mathematical control theory—a\nframework that we dub data-guided control (DGC). We use 

# Run LLM Calls

In [None]:
# load questions
df = pd.read_csv('EX-10-June-2024-Qs.csv')
prompts =  df['Question']
for i, prompt in enumerate(prompts):
    print(prompt)
    runPrompt(prompt,
              llm,                                         # preloaded LLM
              vectordb,                                    # preloaded vector database
              numArticles      = 5,                        # Number of documents used with RAG
              numHidden        = 2,                        # Number of unused documents with RAG
              numNoRagCalls    = 2,                        # Number of trials without the RAG
              ragOutputFile    ='EX-10-JUNE-2024-RAG-llama2-70b.csv', # Outputs of the LLM with the RAG
              llmOnlyOutputFile='EX-10-JUNE-2024-LLM-llama2-70b.csv') # Outputs of the LLM without the RAG
    
    

Gene regulation is often described from the perspective of a dynamical system, where gene expression or other measures are used to construct a state space model. In deterministic dynamical systems, the state variable contains sufficient information to determine the future of the system. What experimental measurements do you believe are sufficient to determine the state of a cell?
(0, 1, 2)


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

2014 ); however, a central goal of the ﬁeld remains to move
beyond such a qualitative, metaphorical conceptualization to-
ward more quantitative, predictive models.
Mathematical modeling, especially in conjunction with dynam-
ical systems theories ( Brauer and Kribs, 2015 ), provides a
powerful tool for gai

In [31]:
df = pd.read_csv('EX-10-JUNE-2024-LLM.csv')
df.columns

Index(['prompt', 'response-0', 'response-1', 'response-2'], dtype='object')

In [32]:
list(df['response-0'])

["I don't know. The question is quite specific and requires expertise in gene regulation and dynamical systems. While I can understand the context, I'm not aware of a definitive answer to this question. Determining the state of a cell is a complex task, and it's likely that multiple experimental measurements would be required to accurately determine the state of a cell. However, I'm not aware of a specific set of measurements that are widely accepted as sufficient to determine the state of a cell."]

In [33]:
list(df['response-1'])

["I don't know. The question is quite specific and requires expertise in gene regulation and dynamical systems. While I can understand the context, I'm not aware of a definitive answer to this question. Determining the state of a cell is a complex task, and it's likely that multiple experimental measurements would be required to accurately capture the state of a cell. However, I'm not aware of a specific set of measurements that are widely accepted as sufficient to determine the state of a cell."]

In [34]:
list(df['response-2'])

["I don't know. The question is quite specific and requires expertise in gene regulation and dynamical systems. While I can understand the context, I'm not aware of a definitive answer to this question. Determining the state of a cell is a complex task, and it's likely that multiple experimental measurements would be required to capture the complexity of cellular behavior. However, I'm not aware of a specific set of measurements that are widely accepted as sufficient to determine the state of a cell."]

In [9]:
for i in list(combinations(np.arange(5), 3))[0]:
    print(i)
0 in list(combinations(np.arange(5), 3))[0]

0
1
2


True

In [10]:
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA

In [11]:
help(ChatNVIDIA)

Help on class ChatNVIDIA in module langchain_nvidia_ai_endpoints.chat_models:

class ChatNVIDIA(langchain_core.language_models.chat_models.BaseChatModel)
 |  ChatNVIDIA(*, name: Optional[str] = None, cache: ForwardRef('Union[BaseCache, bool, None]') = None, verbose: bool = None, callbacks: ForwardRef('Callbacks') = None, tags: Optional[List[str]] = None, metadata: Optional[Dict[str, Any]] = None, custom_get_token_ids: Optional[Callable[[str], List[int]]] = None, callback_manager: Optional[langchain_core.callbacks.base.BaseCallbackManager] = None, base_url: str = 'https://integrate.api.nvidia.com/v1', model: str = 'mistralai/mixtral-8x7b-instruct-v0.1', temperature: Optional[float] = None, max_tokens: Optional[int] = 1024, top_p: Optional[float] = None, seed: Optional[int] = None, stop: Optional[Sequence[str]] = None) -> None
 |  
 |  NVIDIA chat model.
 |  
 |  Example:
 |      .. code-block:: python
 |  
 |          from langchain_nvidia_ai_endpoints import ChatNVIDIA
 |  
 |  
 |    

# Code for LLM Calls

In [None]:
def crossValidationOfDocumentsExperiment(chain, docs, scores, prompt, chatstatus):
    scores = list(scores)
    outputs = {
        'prompt':[],
        'response':[],
        'hidden':[],
        'hiddenRef':[],
        'hiddenScore':[]
    }
    for i in range(len(docs) - 1):
        outputs['known' + str(i)] = []
        outputs['knownRef' + str(i)] = []
        outputs['knownScore' + str(i)] = []
    for i in range(len(docs)):
        # query the model
        usedDocs = docs[:i] + docs[i + 1:]
        usedScores = scores[:i] + scores[i + 1:]
        hiddenDoc = docs[i]
        hiddenScore = scores[i]
        response   = chain({"input_documents": usedDocs, "question": prompt})
        # save the info
        outputs['prompt'].append(prompt)
        outputs['response'].append(response['output_text'])
        outputs['hidden'].append(hiddenDoc.page_content)
        outputs['hiddenRef'].append(hiddenDoc.metadata)
        outputs['hiddenScore'].append(scores[i])
        for j in range(len(docs) - 1):
            outputs['known' + str(j)].append(usedDocs[j].page_content)
            outputs['knownRef' + str(j)].append(usedDocs[j].metadata)
            outputs['knownScore' + str(j)].append(scores[j])

    df = pd.DataFrame(outputs)
    # Check if the file exists
    if os.path.isfile(chatstatus['experiment-output']):
        # File exists, append to it
        df.to_csv(chatstatus['experiment-output'], mode='a', header=False, index=False)
    else:
        # File does not exist, create a new file
        df.to_csv(chatstatus['experiment-output'], mode='w', header=True, index=False)

# Overlap Percentage

In [14]:
Q = pd.read_csv('Question Classification - Sheet1.csv')

In [16]:
Q['Question']

0      Gene regulation is often described from the pe...
1      Compare and contrast the utility of dynamical ...
2      Synthetic lethality occurs when the knockout o...
3      The addition of transcription factors have bee...
4      RNAi is used to silence different genes. How l...
                             ...                        
225                                                  NaN
226                                                  NaN
227                                                  NaN
228                                                  NaN
229                                                  NaN
Name: Question, Length: 230, dtype: object

In [18]:
K = 5
docs = []
for q in Q['Question']:
    print(q)
    qdocs = vectordb.similarity_search_with_relevance_scores(q, k=K)
    docs.append(qdocs)

Gene regulation is often described from the perspective of a dynamical system, where gene expression or other measures are used to construct a state space model. In deterministic dynamical systems, the state variable contains sufficient information to determine the future of the system. What experimental measurements do you believe are sufficient to determine the state of a cell?
Compare and contrast the utility of dynamical systems and control theory perspectives relative to machine learning models of cellular reprogramming. What are the advantages and disadvantages of each perspective and which data are utilized in each framework?
Synthetic lethality occurs when the knockout of two genes is lethal to a cell while the knockout of either gene separately would not kill the cell. The Yeast-2-Hybrid assay is an experimental technique that can determine if two proteins bind with one another by transplanting the proteins to a yeast cell and attaching them to reporter genes. Are these two id

AttributeError: 'float' object has no attribute 'replace'

In [35]:
def check_overlap(str1, str2):
    max_overlap = min(len(str1), len(str2))
    
    # Check overlap from end of str1 to beginning of str2
    for k in range(50, max_overlap + 1):
        if str1[-k:] == str2[:k]:
            return True # (str1, str2, str1[-k:], 'end-to-beginning')
    
    # Check overlap from end of str2 to beginning of str1
    for k in range(50, max_overlap + 1):
        if str2[-k:] == str1[:k]:
            return True #(str2, str1, str2[-k:], 'end-to-beginning')
    
    return False

In [43]:
c = 0
for qdocs in docs:
    c += 1
    sources = []
    for d in qdocs:
        sources.append(d[0].metadata['source'])
    if len(list(set(sources))) != K:
        numOverlaps = 0
        for i in range(K):
            for j in range(i+1,K):
                if qdocs[i][0].metadata['source'] == qdocs[j][0].metadata['source']:
                    if check_overlap(qdocs[i][0].page_content, qdocs[j][0].page_content):
                        numOverlaps += 1
        print(numOverlaps)
        if numOverlaps > 0:
            break
            print(qdocs)
print(c)
# print(c / len(docs))

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
7
22


In [42]:
qdocs

[(Document(page_content='in press as: Paull et al., A modular master regulator landscape controls cancer transcriptional identity, Cell (2021),\nhttps://doi.org/10.1016/j.cell.2020.11.045\nArticle', metadata={'page': 10, 'source': '/nfs/turbo/umms-indikar/shared/projects/RAG/papers/DigitalLibrary-9-June-2024/A modular master regulator landscape controls.pdf'}),
  0.7005485892295837),
 (Document(page_content='cite this article in press as: Paull et al., A modular master regulator landscape controls cancer transcriptional identity, Cell (2021),\nhttps://doi.org/10.1016/j.cell.2020.11.045\nArticle', metadata={'page': 2, 'source': '/nfs/turbo/umms-indikar/shared/projects/RAG/papers/DigitalLibrary-9-June-2024/A modular master regulator landscape controls.pdf'}),
  0.6858721971511841),
 (Document(page_content='as: Paull et al., A modular master regulator landscape controls cancer transcriptional identity, Cell (2021),\nhttps://doi.org/10.1016/j.cell.2020.11.045\nArticle', metadata={'page': 2

In [29]:
K = 5

In [21]:
docs[189]

[(Document(page_content='we proceed with our analysis of hypergraph products, we n eed to\nintroduce some speciﬁc notations:\nLet⊛n\ni=1Hi= (V,E) = (×n\ni=1V(Hi),E(⊛n\ni=1Hi)) be an arbitrary hypergraph\nproduct. The projectionpj:V→V(Hj) is deﬁned by v= (v1,...,v n)↦→vj. We\nwill callvjthej-th coordinate of the vertex v∈V. For a given vertex w∈V(H)\ntheHj-layer through wis the partial hypergraph of H\nHw\nj=⟨{v∈V(H)|pk(v) =pk(w) fork̸=j}⟩.\nIf for a hypergraph product ⊛n\ni=1HiholdsHi∼=Hfor alli= 1,...,nwe will\ndenote this hypergraph simply by H⊛n.\nLetUdenote the unit element, if one exists, of an arbitrary product ⊛, i.e.,\nH=H⊛Ufor all hypergraphs H. Since all hypergraph products considered here\nhave vertex set V1×V2the unit must always be a hypergraph with a single vertex.\nA hypergraphis said to be primeif the identity H=H1⊛H2implies that H1∼=U', metadata={'page': 7, 'source': '/nfs/turbo/umms-indikar/shared/projects/RAG/papers/DigitalLibrary-9-June-2024/A Survey on Hypergraph P