# Wikipedia Game with BRAD

In [1]:
import os
import numpy as np
import pandas as pd
import getpass
import difflib
import requests
import random
import string
import wikipediaapi
import logging

from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA
from langchain.memory import ConversationBufferMemory

from BRAD import brad

  from tqdm.autonotebook import tqdm, trange


In [2]:
log = logging.getLogger(__name__)
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
log.info('What is up?!')

INFO:__main__:What is up?!


In [13]:
class WikiGameLLMBot():
    def __init__(self, wiki_wiki,
                 start_topic = None,
                 target_topic = None,
                 model=None,
                 model_name='meta/llama3-70b-instruct',
                 temperature=0.1,
                 rag=True
                ):

        assert start_topic != target_topic, "Please enter different start and target topics."
        
        ################################################################
        #
        #    Save some things
        #
        ################################################################
        
        self.wiki_wiki     = wiki_wiki
        self.start_topic   = start_topic
        self.target_topic  = target_topic
        self.target_page   = self.wiki_wiki.page(self.target_topic)
        self.start_page    = self.wiki_wiki.page(self.start_topic)
        self.current_topic = self.start_page
        self.current_page  = self.start_page
        self.visited       = [self.start_page]
        self.rag           = rag

        ################################################################
        #
        #    Load NVIDIA model and chatbot history
        #
        ################################################################

        if model is not None:
            self.llm = model
        else:
            if not os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
                nvidia_api_key = getpass.getpass("Enter your NVIDIA API key: ")
                assert nvidia_api_key.startswith("nvapi-"), f"{nvidia_api_key[:5]}... is not a valid key"
                os.environ["NVIDIA_API_KEY"] = nvidia_api_key
            else:
                nvidia_api_key = os.environ["NVIDIA_API_KEY"]
                
            self.llm = ChatNVIDIA(model       = model_name,
                                  api_key     = nvidia_api_key,
                                  temperature = temperature,
                                 )
            self.memory = ConversationBufferMemory(ai_prefix="System")

    def get_filled_templates(self, alltitles, allsummaries):
        i = 0
        k = 100 # batch size
        templates, titles, summaries  = [], [], []
        while i < len(alltitles):
            titlesTemplate    = alltitles[i:i+k]
            titles.append(titlesTemplate)
            if self.rag:
                summariesTemplate = allsummaries[i:i+k]
                summaries.append(summariesTemplate)
                templates.append(self.get_filled_template(titlesTemplate, summaries=summariesTemplate))
            else:
                templates.append(self.get_filled_template(titlesTemplate))
            i += k
        return templates, titles, summaries
    
    def get_filled_template(self, titles, summaries=None):
        template = self.getWikiTemplate()
        visitedPages = ""
        for pg in self.visited:
            visitedPages += '- ' + str(pg.title) + '\n'
        template = template.format(target1 = self.target_topic,
                                   target2 = self.target_topic,
                                   target3 = self.target_topic,
                                   target4 = self.target_topic,
                                   current = self.current_topic,
                                   visited = visitedPages
                                  )
        if self.rag:
            link_table = """Topic\tDescription
--------\t-----------
"""
            for i in range(len(titles)):
                link_table += ("title:" + titles[i] + '\t' + summaries[i] + '\n')
        else:
            link_table = """Topic
----------
"""
            for i in range(len(titles)):
                link_table += ("title:" + titles[i] + '\n')

        template = template.format(links = link_table)
        return template

    def get_page_summary(self, wiki_page):
        """
        Retrieves a brief summary of a given Wikipedia page.
    
        This function takes a Wikipedia page object and returns the summary of the page. However, rather than 
        returning the entire summary, it returns only the first few lines. This is particularly useful for 
        getting a quick overview or introduction to the page's content without needing to process the entire 
        summary text.
    
        Parameters
        ----------
        wiki_page : WikipediaPage object
            A Wikipedia page object from which the summary is to be extracted. The object should have a 'summary' 
            attribute containing the text of the page's summary.
    
        Returns
        -------
        str
            A string containing the first few lines of the Wikipedia page's summary. The exact number of lines 
            returned is set to 5 in this implementation.
        """
        # return just the first few lines if there are multiple
        return wiki_page.summary[:100] + "..."
        # return ". ".join(wiki_page.summary.split("\n")[:1]) + "..."
    
    def getWikiTemplate(self):
        template = """/force RAG You must are playing the Wikipedia Game where you must find a chain of
Wikipedia pages that connect a source topic to a target topic. Your current topic is {current} and
you must select a new wikipedia page closer to the target topic {target1}. You can select a topic from
the following list:
    
{{links}}
    
If the target topic of {target2} is available, you should select the target topic {target3}. Otherwise, select
a next topic that is likely to be closer or fewer connections to the topic {target4}. If no topics appear relevant
you must still choose a topic to do next. You cannot say None. You have already visited the following pages,
which should be avoided in the future:

{visited}

Format your output as:
Next topic=<topic here>
    """
        return template
    
    def take_turn(self):
        # Get all linked pages
        print(type(self.current_page))
        page_titles = list(self.current_page.links)

        badpages = ['Category', 'Help', 'Wikipedia', 'Portal', 'List', 'Talk', 'Template']
        badpages_lower = [bad.lower() for bad in badpages]
        page_titles = [title for title in page_titles if not any(title.lower().startswith(bad) for bad in badpages_lower)]
        
        pages = []
        if self.rag:
            print('Got all titles')
            for i, title in enumerate(page_titles):
                print(title + '\t\t' + str(i) + '/' + str(len(page_titles)))
                pages.append(self.wiki_wiki.page(title))
    
            # get the summaries of these pages
            titles, summaries = [], []
            page_title_lookup = {}
            for page in pages:
                titles.append(page.title)
                summaries.append(self.get_page_summary(page))
                page_title_lookup[titles[-1]] = page
        else:
            titles = page_titles
            summaries = []
            page_title_lookup = []

        print(f'len(titles)={len(titles)}')
        print(f'len(summaries)={len(summaries)}')
        if self.rag:
            print(f'len(page_title_lookup.keys())={len(page_title_lookup.keys())}')

        if self.rag:
            print('page_title_lookup.keys()')
            print(list(page_title_lookup.keys()))

        templates, titleSets, summarySets = self.get_filled_templates(titles, summaries)
        bestTitles, bestSummaries = [], []
        for i in range(len(templates)):
            template = templates[i]

            print("Template")
            print(template)
            print('Title Set')
            print(titleSets)

            response = self.llm.invoke(template)
            print("Response")
            print(response)
            print("Parsed Response")
            proposedPage = response.content.split('=')[1]
            print(proposedPage)

            print("Most similar page")
            most_similar = difflib.get_close_matches(proposedPage, titleSets[i], n=1)[0]
            print(most_similar)
            print('\n\n\n')

            bestTitles.append(most_similar)
            if self.rag:
                bestSummaries.append(self.get_page_summary(page_title_lookup[most_similar]))

        # reduce it further
        if len(bestTitles) > 1:
            template = self.get_filled_template(bestTitles, bestSummaries)
            print('Map Reduce Template')
            print(template)
            response = self.llm.invoke(template)

            print("Response")
            print(response)

            print("Parsed Response")
            proposedPage = response.content.split('=')[1]
            print(proposedPage)
            
            print("Most similar page")
            most_similar = difflib.get_close_matches(proposedPage, bestTitles, n=1)[0]
        else:
            most_similar = bestTitles[0]
            
        # reset the current page
        print('Selected Page')
        print(most_similar)
        if self.rag:
            self.current_page = page_title_lookup[most_similar]
        else:
            self.current_page = self.wiki_wiki.page(most_similar)

        self.visited.append(self.current_page)
        
        if self.target_page.fullurl == self.current_page.fullurl:
            return True

        return False


In [4]:
global wiki_wiki
random_string = 'XXX' # generate_random_string(10)
wiki_wiki = wikipediaapi.Wikipedia(
    f'WikiBot-{random_string} (https://www.linkedin.com/in/kmaurinjones/)',
    'en',
    timeout = 30
    )

INFO:wikipediaapi:Wikipedia: language=en, user_agent: WikiBot-XXX (https://www.linkedin.com/in/kmaurinjones/) (Wikipedia-API/0.6.0; https://github.com/martin-majlis/Wikipedia-API/), extract_format=1


In [10]:
from BRAD import llms
from BRAD import brad
llm = llms.load_llama()
bot = brad.chatbot(llm=llm)


Would you like to use a database with BRAD [Y/N]?


 N


[32m2024-07-10 19:42:55 INFO semantic_router.utils.logger local[0m


Welcome to RAG! The chat log from this conversation will be saved to /home/jpic/BRAD/2024-07-10_19-41-35/log.json. How can I help?


In [14]:
llm = bot.to_langchain()

In [15]:
bot = WikiGameLLMBot(wiki_wiki,
                 start_topic = 'Hot air balloon',
                 target_topic = 'CRISPR',
                 model = llm,
                 model_name=None, # 'meta/llama3-70b-instruct',
                 temperature=0.1,
                 rag=False)

In [16]:
found = False
i = 0
while not found:
    found = bot.take_turn()
    print(bot.current_page.title)
    i += 1
    if i > 25:
        break

INFO:wikipediaapi:Request URL: https://en.wikipedia.org/w/api.php?action=query&prop=links&titles=Hot air balloon&pllimit=500


<class 'wikipediaapi.WikipediaPage'>


INFO:root:RAG


len(titles)=206
len(summaries)=0
Template
/force RAG You must are playing the Wikipedia Game where you must find a chain of
    Wikipedia pages that connect a source topic to a target topic. Your current topic is Hot air balloon (id: 38173, ns: 0) and
    you must select a new wikipedia page closer to the target topic CRISPR. You can select a topic from
    the following list:
    
    Topic
----------
title:1900 Olympics
title:1989 Alice Springs hot air balloon crash
title:2011 Somerset hot air balloon crash
title:2012 Carterton hot air balloon crash
title:2012 Ljubljana Marshes hot air balloon crash
title:2013 Luxor hot air balloon crash
title:2016 Lockhart hot air balloon crash
title:2018 Luxor hot air balloon crash
title:2021 Albuquerque hot air balloon crash
title:Abbey of Saint Gall
title:Aerodynamic drag
title:Air balloon (disambiguation)
title:Aircraft
title:Aircraft registration
title:Airworthiness
title:Albuquerque, New Mexico
title:Alice Springs, Northern Territory
title:Alt

INFO:root:


route



INFO:root:RAG
INFO:root:

PLANNER:

{'module': 'RAG', 'steps': [{'llm': "\x1b[1mLlamaCpp\x1b[0m\nParams: {'model_path': '/nfs/turbo/umms-indikar/shared/projects/RAG/models/llama-2-7b-chat.Q8_0.gguf', 'suffix': None, 'max_tokens': 1000, 'temperature': 0.0, 'top_p': 0.95, 'logprobs': None, 'echo': False, 'stop_sequences': [], 'repeat_penalty': 1.1, 'top_k': 40}", 'memory': 'None', 'prompt': "input_variables=['history', 'input'] template='Current conversation: {history}\\n\\n\\nNew Input: \\n{input}'", 'input': "Context: You are BRAD (Bioinformatic Retrieval Augmented Data), a chatbot specializing in biology,\nbioinformatics, genetics, and data science. You can be connected to a text database to augment your answers\nbased on the literature with Retrieval Augmented Generation, or you can use several additional modules including\nsearching the web for new articles, searching Gene Ontology or Enrichr bioinformatics databases, running snakemake\nand matlab pipelines, or


[1m> Finished chain.[0m


Please select a new wikipedia page closer to the target topic CRISPR from the list above.
Response
True
Parsed Response


AttributeError: 'str' object has no attribute 'content'