# Searching the Hardwired Genome like the Wikipedia Game

**Problem**: Given Gene A and Gene B, find the most direct path of influence from Gene A to Gene B.

The naive approach to this is Breadth First Search. Using an LLM, we aim to find the path faster (or with fewer searches) and to identify missing connections in our network

In [27]:
import os
import numpy as np
import pandas as pd
import getpass
import difflib
import requests
import random
import string
import wikipediaapi
import logging
from scipy.sparse import csr_matrix


from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA
from langchain.memory import ConversationBufferMemory

from HWG import build

In [10]:
log = logging.getLogger(__name__)
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
log.info('Logging check!')

INFO:__main__:Logging check!


# Set up LLM Bot

In [52]:
def firstOrderInteractions(HWG, gene):
    df = HWG['geneIndexTable']
    index = df.loc[df['symbol'] == gene].index
    # print(index)
    index = index.tolist()[0]
    # print(index)
    firstOrderIndices = HWG['A'][:,index].nonzero()[0].tolist()
    # print(len(firstOrderIndices))
    # print(firstOrderIndices)
    return df['symbol'].loc[firstOrderIndices].tolist()

fom = firstOrderInteractions(HWG, 'PCNA')
len(fom)

326

In [78]:
def getHWGTemplate():
    template = """You must are playing the WikiGame on a network of genes, and you must find a chain of
genes that are related to one another in order to connect a source gene to a target gene. Your current 
gene is {current} and you must select a new gene that will be closer to the target gene {target1}. You can
select any of the following genes:

{{links}}

If the target gene of {target2} is available, you should select the target gene {target3}. Otherwise, select
a next gene that is likely to be closer or fewer connections to the gene {target4}. If no gene appear relevant
you must still choose a gene to try next. You cannot say None. You have already visited the following genes,
which should be avoided in the future:

{visited}

Format your output as:
Next gene=<topic here>
"""
    return template

In [93]:
class WikiGameLLMBot():
    def __init__(self, HWG,
                 start_gene = None,
                 target_gene = None,
                 model_name='meta/llama3-70b-instruct',
                 temperature=0.1,
                 rag=True
                ):

        assert start_gene != target_gene, "Please enter different start and target topics."
        
        ################################################################
        #
        #    Save some things
        #
        ################################################################
        
        self.HWG           = HWG
        self.start_gene    = start_gene
        self.target_gene   = target_gene
        self.current_gene  = self.start_gene
        self.visited       = [self.start_gene]
        self.rag           = rag

        ################################################################
        #
        #    Load NVIDIA model and chatbot history
        #
        ################################################################

        if not os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
            nvidia_api_key = getpass.getpass("Enter your NVIDIA API key: ")
            assert nvidia_api_key.startswith("nvapi-"), f"{nvidia_api_key[:5]}... is not a valid key"
            os.environ["NVIDIA_API_KEY"] = nvidia_api_key
        else:
            nvidia_api_key = os.environ["NVIDIA_API_KEY"]
            
        self.llm = ChatNVIDIA(model       = model_name,
                              api_key     = nvidia_api_key,
                              temperature = temperature,
                             )
        self.memory = ConversationBufferMemory(ai_prefix="System")

    def get_filled_templates(self, alltitles, allsummaries):
        i = 0
        k = 200 # batch size
        templates, titles, summaries  = [], [], []
        while i < len(alltitles):
            titlesTemplate    = alltitles[i:i+k]
            titles.append(titlesTemplate)
            if self.rag:
                summariesTemplate = allsummaries[i:i+k]
                summaries.append(summariesTemplate)
                templates.append(self.get_filled_template(titlesTemplate, summaries=summariesTemplate))
            else:
                templates.append(self.get_filled_template(titlesTemplate))
            i += k
        return templates, titles, summaries
    
    def get_filled_template(self, genes, summaries=None):
        template = getHWGTemplate()
        visitedGenes = ""
        for gene in self.visited:
            visitedGenes += '- ' + str(gene) + '\n'
        template = template.format(target1 = self.target_gene,
                                   target2 = self.target_gene,
                                   target3 = self.target_gene,
                                   target4 = self.target_gene,
                                   current = self.current_gene,
                                   visited = visitedGenes
                                  )
        if self.rag:
            link_table = """Topic\tDescription
--------\t-----------
"""
            for i in range(len(titles)):
                link_table += ("title:" + titles[i] + '\t' + summaries[i] + '\n')
        else:
            link_table = """Gene
----------
"""
            for i in range(len(genes)):
                if genes[i] is not None:
                    link_table += ("gene:" + str(genes[i]) + '\n')

        template = template.format(links = link_table)
        return template
    
    def take_turn(self):
        # Get all first order interactions from current genes
        firstOrderGenes = firstOrderInteractions(self.HWG, self.current_gene)
        
        # For now RAG must be false
        if self.rag:
            print('Got all titles')
            for i, title in enumerate(page_titles):
                print(title + '\t\t' + str(i) + '/' + str(len(page_titles)))
                pages.append(self.wiki_wiki.page(title))
    
            # get the summaries of these pages
            titles, summaries = [], []
            page_title_lookup = {}
            for page in pages:
                titles.append(page.title)
                summaries.append(get_page_summary(page))
                page_title_lookup[titles[-1]] = page
        else:
            summaries = []

        print(f'len(firstOrderGenes)={len(firstOrderGenes)}')
        print(f'len(summaries)={len(summaries)}')
        if self.rag:
            print(f'len(page_title_lookup.keys())={len(page_title_lookup.keys())}')
            print('page_title_lookup.keys()')
            print(list(page_title_lookup.keys()))

        templates, geneSets, summarySets = self.get_filled_templates(firstOrderGenes, summaries)
        bestGenes, bestSummaries = [], []
        for i in range(len(templates)):
            template = templates[i]

#            print('\n\n\n')
            print("Template")
            print(template)
#            print('\n\n\n')            
            print('Gene Set')
            print(geneSets[i])
#            print('\n\n\n')

            response = self.llm.invoke(template)
            print("Response")
            print(response)
            print(' ')
            print("Response.content")
            print(response.content)
#            print('\n\n\n')
            print("Parsed Response")
            proposedPage = response.content.split('=')[1]
            print(proposedPage)
#            print('\n\n\n')

            print("Most similar page")
            log.info(f'proposedPage={proposedPage}')
            log.info(f'geneSets[i]={geneSets[i]}')
            most_similar = difflib.get_close_matches(proposedPage, geneSets[i], n=1)[0]
            print(most_similar)
#            print('\n\n\n')
            print('\n\n\n')

            bestGenes.append(most_similar)
            if self.rag:
                bestSummaries.append(get_page_summary(page_title_lookup[most_similar]))

        # reduce it further
        if len(bestGenes) > 1:
            template = self.get_filled_template(bestGenes, bestSummaries)
            print('Map Reduce Template')
            print(template)
            response = self.llm.invoke(template)

            print("Response")
            print(response)
            print(' ')
            print("Response.content")
            print(response.content)

            print("Parsed Response")
            proposedGene = response.content.split('=')[1]
            print(proposedPage)
            
            print("Most similar page")
            most_similar = difflib.get_close_matches(proposedGene, bestGenes, n=1)[0]
        else:
            most_similar = bestGenes[0]

        # reset the current page
        print('Selected Gene')
        print(most_similar)
        if self.rag:
            self.current_gene = page_title_lookup[most_similar]
        else:
            self.current_gene = most_similar

        self.visited.append(self.current_gene)
        
        if self.target_gene == self.current_gene:
            return True

        return False


# Load Hardwired Genome Network

In [80]:
from HWG import build
HWG = build.loadHWG()

Loading HWG for threshold 600...
HWG loaded successfully.


In [81]:
HWG['geneIndexTable']

Unnamed: 0,Stable ID,symbol,Transcription Factor,Gene start (bp),Gene end (bp),Chromosome/scaffold name,Protein stable ID
0,ENSG00000000003,TSPAN6,False,100627108.0,100639991.0,X,ENSP00000362111
1,ENSG00000000005,TNMD,False,100584936.0,100599885.0,X,ENSP00000362122
2,ENSG00000000419,DPM1,False,50934867.0,50959140.0,20,ENSP00000507119
3,ENSG00000000457,SCYL3,False,,,,
4,ENSG00000000460,FIRRM,False,,,,
...,...,...,...,...,...,...,...
41318,ENSG00000181404,WASHC1,False,,,,
41319,ENSG00000276581,SPATA31A5,False,,,,
41320,ENSG00000278848,TP53TG3F,False,,,,
41321,ENSG00000279782,PPIAL4F,False,,,,


# Play Game

In [94]:
bot = WikiGameLLMBot(HWG,
                 start_gene = 'MYOD1',
                 target_gene = 'PCNA',
                 model_name='meta/llama3-70b-instruct',
                 temperature=0.1,
                 rag=False)

In [95]:
found = False
i = 0
while not found:
    found = bot.take_turn()
    print(bot.current_gene)
    i += 1
    if i > 5:
        break

len(firstOrderGenes)=123
len(summaries)=0
Template
You must are playing the WikiGame on a network of genes, and you must find a chain of
genes that are related to one another in order to connect a source gene to a target gene. Your current 
gene is MYOD1 and you must select a new gene that will be closer to the target gene PCNA. You can
select any of the following genes:

Gene
----------
gene:KDM1A
gene:CREBBP
gene:IFRD1
gene:MYH13
gene:PAX7
gene:IGF1
gene:TBPL1
gene:HDAC4
gene:MEF2A
gene:TCF3
gene:AP1M1
gene:MYH7B
gene:MEF2C
gene:SMARCD3
gene:CHRD
gene:ESR1
gene:MYH7
gene:SIRT1
gene:CABIN1
gene:DDX17
gene:EP300
gene:PRMT5
gene:SNW1
gene:SIX4
gene:SMAD7
gene:SUV39H1
gene:CTCF
gene:CKM
gene:EZH2
gene:MYH1
gene:MYH3
gene:PPARGC1A
gene:MYF6
gene:MYF5
gene:MYL2
gene:GAPDH
gene:MAPK14
gene:MDFI
gene:TBP
gene:SRF
gene:KAT2B
gene:ID2
gene:HDAC1
gene:MEF2D
gene:ID3
gene:CREB1
gene:MYB
gene:TCF21
gene:MYOG
gene:TWIST1
gene:EGR2
gene:BHLHE41
gene:RUNX2
gene:MYH2
gene:ID1
gene:SIX1
gene:SMARCA4
g

INFO:__main__:proposedPage=EP300
INFO:__main__:geneSets[i]=['KDM1A', 'CREBBP', 'IFRD1', 'MYH13', 'PAX7', 'IGF1', 'TBPL1', 'HDAC4', 'MEF2A', 'TCF3', 'AP1M1', 'MYH7B', 'MEF2C', 'SMARCD3', 'CHRD', 'ESR1', 'MYH7', 'SIRT1', 'CABIN1', 'DDX17', 'EP300', 'PRMT5', 'SNW1', 'SIX4', 'SMAD7', 'SUV39H1', 'CTCF', 'CKM', 'EZH2', 'MYH1', 'MYH3', 'PPARGC1A', 'MYF6', 'MYF5', 'MYL2', 'GAPDH', 'MAPK14', 'MDFI', 'TBP', 'SRF', 'KAT2B', 'ID2', 'HDAC1', 'MEF2D', 'ID3', 'CREB1', 'MYB', 'TCF21', 'MYOG', 'TWIST1', 'EGR2', 'BHLHE41', 'RUNX2', 'MYH2', 'ID1', 'SIX1', 'SMARCA4', 'CSRP3', 'CDKN1C', 'CDH15', 'TNNI2', 'NFATC1', 'MYH8', 'MYH10', 'FST', 'CDK4', 'CHRND', 'PAX3', 'GATA4', 'CDK9', 'GTF2B', 'MSTN', 'FGF2', 'RB1', 'TCF12', 'NCOR1', 'MYOCD', 'TP53', 'SMAD4', 'AKT1', 'LMO4', 'HES6', 'SETD7', 'FOXO1', 'UTRN', 'GRIP1', 'FBXO32', 'KLHL40', 'TRIM63', 'NEUROD1', 'IGFN1', 'HAND2', 'TBXT', 'HEY1', 'TAF3', 'MAPK7', 'SMAD3', 'IGF2', 'CTNNB1', 'MYL1', 'STAT3', 'KAT5', 'SMAD2', 'ASCL3', 'JUN', 'MSC', 'EXOC3L1', 'BHLHA15', 

Response
content='Next gene=EP300' response_metadata={'role': 'assistant', 'content': 'Next gene=EP300', 'token_usage': {'prompt_tokens': 903, 'total_tokens': 909, 'completion_tokens': 6}, 'model_name': 'meta/llama3-70b-instruct'} id='run-d1341987-da3c-4865-8854-a496210ea52d-0' role='assistant'
 
Response.content
Next gene=EP300
Parsed Response
EP300
Most similar page
EP300




Selected Gene
EP300
EP300
len(firstOrderGenes)=675
len(summaries)=0
Template
You must are playing the WikiGame on a network of genes, and you must find a chain of
genes that are related to one another in order to connect a source gene to a target gene. Your current 
gene is EP300 and you must select a new gene that will be closer to the target gene PCNA. You can
select any of the following genes:

Gene
----------
gene:NFYA
gene:PLXND1
gene:KDM1A
gene:POLR2J
gene:CREBBP
gene:KMT2E
gene:ITGA2B
gene:ETV1
gene:PAX6
gene:TFAP2B
gene:TFAP2D
gene:MED24
gene:MLXIPL
gene:BAZ1B
gene:BRCA1
gene:ERCC1
gene:NR1H4
gene:WWTR1


INFO:__main__:proposedPage=CREBBP
INFO:__main__:geneSets[i]=['NFYA', 'PLXND1', 'KDM1A', 'POLR2J', 'CREBBP', 'KMT2E', 'ITGA2B', 'ETV1', 'PAX6', 'TFAP2B', 'TFAP2D', 'MED24', 'MLXIPL', 'BAZ1B', 'BRCA1', 'ERCC1', 'NR1H4', 'WWTR1', 'MNAT1', 'RUNX3', 'AQR', 'NR1H3', 'PIAS1', 'MED17', 'POLR2B', 'HDAC9', 'ERCC8', 'ARID1B', 'FOXP3', 'THRAP3', 'CUL1', 'KMT2C', 'CAMK2B', 'CREB3L3', 'HDAC7', 'HIPK2', 'SPEN', 'TLE2', 'ME1', 'NFYC', 'SPI1', 'POLR1H', 'PKM', 'CBFB', 'HDAC4', 'MEF2A', 'POLR1A', 'SIRT2', 'RORA', 'CAMK2A', 'ING3', 'TCF3', 'MBD3', 'PRKACA', 'SREBF1', 'UBE2D1', 'TP63', 'PTGS2', 'GLI2', 'NOTCH3', 'EED', 'ACTB', 'XAB2', 'RARB', 'GTF3C1', 'SIRT6', 'EDN1', 'TNRC6C', 'TP73', 'SMARCA2', 'SRCAP', 'HSP90AA1', 'RBL1', 'TCF7', 'MEF2C', 'PGR', 'KAT6A', 'PPIE', 'REST', 'NCOA1', 'ABCB1', 'BAX', 'TFAP2C', 'AURKA', 'ANAPC5', 'SIRT4', 'TNRC6A', 'ESR1', 'CEBPE', 'SUPT16H', 'AGO1', 'HDAC6', 'SUCO', 'BRPF3', 'SIRT1', 'SETD1A', 'POLR2E', 'MED15', 'SMARCB1', 'CABIN1', 'MAPK1', 'POLR2F', 'DDX17', 'TNRC6B', 'RB

Response
content='Next gene=CREBBP' response_metadata={'role': 'assistant', 'content': 'Next gene=CREBBP', 'token_usage': {'prompt_tokens': 1373, 'total_tokens': 1380, 'completion_tokens': 7}, 'model_name': 'meta/llama3-70b-instruct'} id='run-6430309c-b4fc-4ccd-b936-78088a21c17b-0' role='assistant'
 
Response.content
Next gene=CREBBP
Parsed Response
CREBBP
Most similar page
CREBBP




Template
You must are playing the WikiGame on a network of genes, and you must find a chain of
genes that are related to one another in order to connect a source gene to a target gene. Your current 
gene is EP300 and you must select a new gene that will be closer to the target gene PCNA. You can
select any of the following genes:

Gene
----------
gene:EPAS1
gene:SUMO1
gene:HDAC1
gene:ASH1L
gene:MEF2D
gene:GADD45A
gene:PRDM2
gene:ZBTB17
gene:TFAP2E
gene:RBBP5
gene:ID3
gene:CDC20
gene:PROX1
gene:ARID1A
gene:RPA2
gene:CTSD
gene:KMT2A
gene:APOA1
gene:CREB1
gene:MYB
gene:MED28
gene:FOXO3
gene:CCND2
gene:SET
ge

INFO:__main__:proposedPage=RPA2
INFO:__main__:geneSets[i]=['EPAS1', 'SUMO1', 'HDAC1', 'ASH1L', 'MEF2D', 'GADD45A', 'PRDM2', 'ZBTB17', 'TFAP2E', 'RBBP5', 'ID3', 'CDC20', 'PROX1', 'ARID1A', 'RPA2', 'CTSD', 'KMT2A', 'APOA1', 'CREB1', 'MYB', 'MED28', 'FOXO3', 'CCND2', 'SET', 'HOXB1', 'IFNA6', 'IFNA8', 'EGR1', 'NFYB', 'COPS5', 'GTF3A', 'HNRNPA2B1', 'TWIST1', 'NEUROG3', 'EGR2', 'MED13L', 'ATF1', 'NR4A1', 'CDK2', 'NMI', 'AGO2', 'NCOA3', 'SNAI1', 'RBPJL', 'PCK1', 'HIF3A', 'TRERF1', 'SIRT5', nan, 'H2BC11', 'MED20', 'CDKN1A', 'SOX4', 'DEK', 'RUNX2', 'IRF1', 'SOX9', 'IL1B', 'PAX8', 'POLR1B', 'GTF2F1', 'MED1', 'CITED1', 'MAX', 'ID1', 'AGO3', 'THRA', 'NR1D1', 'IRF3', 'PRMT1', 'STAT5A', 'HSPA2', 'NUP214', 'BCL11B', 'KLF2', 'SMARCA4', 'ATF4', 'VGF', 'IRF5', 'MYOD1', 'CCNT1', 'FOXA1', 'ASH2L', 'EPO', 'JUND', 'HELZ2', 'HBZ', 'MED18', 'DNMT1', 'NFATC1', 'NR1H2', 'RARA', 'PIAS3', 'DNAJB1', 'PPARG', 'MYBBP1A', 'RPA1', 'H3-3B', 'GPS2', 'PCNA', 'CDK8', 'MED10', 'KRAS', 'E2F5', 'BMAL1', 'HSD17B4', 'MED6', 'C

Response
content='Next gene=RPA2' response_metadata={'role': 'assistant', 'content': 'Next gene=RPA2', 'token_usage': {'prompt_tokens': 1367, 'total_tokens': 1373, 'completion_tokens': 6}, 'model_name': 'meta/llama3-70b-instruct'} id='run-d312637c-ddbc-4dc1-ad87-27600a2c8f28-0' role='assistant'
 
Response.content
Next gene=RPA2
Parsed Response
RPA2
Most similar page


TypeError: object of type 'float' has no len()