In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import autogen
from autogen import register_function, AssistantAgent, UserProxyAgent
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from autogen.agentchat.contrib.qdrant_retrieve_user_proxy_agent import QdrantRetrieveUserProxyAgent
from autogen.agentchat.contrib.web_surfer import WebSurferAgent
from chromadb.utils import embedding_functions
from langchain.text_splitter import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from ARGO import ArgoWrapper
from CustomLLMAutogen2 import ARGO_LLM
from typing import Dict, List, Any
import pandas as pd
from dotenv import load_dotenv
import glob


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
load_dotenv()

True

In [4]:
# All available models
config_list = [
        {
            'model': 'gpt-3.5-turbo-16k',
            'tags': ['gpt3.5']
        },
        {
            'model': 'Argo',
            'api_type': 'argo',
            'argo_client': ARGO_LLM(argo=ArgoWrapper,model_type='gpt4', temperature = 0.3),
            'tags': ['argo']
        },
        {
            'model': 'NA',
            'api_key': 'NA',
            'base_url': 'http://140.221.70.43:5005/llm/v1',
            'tags': ['local']
        }
]
# Filters the models based on the tags. Filters models
filter_dict = {'tags': ['gpt3.5']}
config_list = autogen.filter_config(config_list, filter_dict)

In [5]:
# Sets up configuration for agents
llm_config = {
    "config_list": config_list, 
    "cache_seed": None, # Ensures differing responses
    "timeout": 600,
    "seed": 44,
    "temperature": 0.2, # Temperature max is 2
}
llm_config_gen = {
    "config_list": 
    [
        {
            'model': 'gpt-4-turbo',
        }
    ], 
    "cache_seed": None, # Ensures differing responses
    "timeout": 600,
    "seed": 44,
    "temperature": 1.0, # Temperature max is 2
    "response_format": { "type": "json_object" },
}
llm_config

{'config_list': [{'model': 'gpt-3.5-turbo-16k', 'tags': ['gpt3.5']}],
 'cache_seed': None,
 'timeout': 600,
 'seed': 44,
 'temperature': 0.2}

In [6]:
googleai_embedding_function= embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key = os.environ["GOOGLE_API_KEY"])

In [7]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", "\r", "\t"])

In [8]:
def termination_msg(x):
    return isinstance(x, dict) and "TERMINATE" == str(x.get("content", ""))[-9:].upper()

In [9]:
problem = f'''For each paper, generate 3 unique and extremely difficult to answer multiple choice questions with 5 choices each.
The answerer does not have access to the paper, you cannot require context for the question.
These should be general knowledge questions with supporting evidence from the paper.
All context required to answer the question must be provided within the question statement.
The question statement cannot include 'in this study', 'in this paper', 'according to the paper', etc.
There should be exactly one correct answer.
The incorrect answers must be difficult to distinguish from the correct answer, however they cannot be correct.
The incorrect answers are 'distractors' that are designed to be confuse the large language model that is answering the question.
'''

# problem = f'''For every paper in docs, generate 3 unique and extremely confusing to answer multiple choice questions with 5 choices each.
# Each question and its choices are designed to trick the answerer.
# The answerer does not have access to the paper, you cannot require context for the question.
# These should be general knowledge questions with supporting evidence from the paper.
# All context required to answer the question must be provided within the question statement.
# There should be exactly one correct answer.
# The incorrect answers must be difficult to distinguish from the correct answer, however they cannot be correct.
# The incorrect answers are 'distractors' that are designed to be confuse the large language model that is answering the question.
# '''

In [10]:
SCHEMA = {
  "question": "The generated question",
  "correct_answer": "The correct answer",
  "distractors": [
    "Incorrect answer #1", "Incorrect answer #2", "Incorrect Answer #3", "Incorrect Answer #4"
  ],
  "skills": f'''Choose the necessary skills for answering the question using at least one of the following options 
  {["Generalization", "Basic comprehension", "Summarization", "Interpolation/extrapolation", "Cross-domain application", "Reasoning", "General knowledge", "Fundamental domain science concepts", "Understanding identifiers/notation", "Understanding evolution of ideas"]}
  Must be in the form of a {list}''',
  "domains": f'''Choose the most applicable domains for the question using at least one of the following options
  {["physics", "material science", "biology", "chemistry", "computer science", "mathematics", "climate"]}
  Must be in the form of a {list}''',
  "difficulty": "Choose the difficulty of the question using one of the following options ['easy', 'medium', 'hard'] ",
  "doi": "Identify the digital object identifier (DOI) of the paper and provide it here. It will be the link of the form doi.org",
  "author": {
      "name": "Jose A. Tandoc",
      "affiliation": "Jose A. Tandoc",
      "position": "Student",
      "orcid": "NA"
  },
  "support": "",
  "comments": "generated question"
}

In [11]:
def initiate_RAG_agent(docs: List[str], collection: str):
    retrieval_assistant = QdrantRetrieveUserProxyAgent(
        name="assistant",
        human_input_mode="NEVER",
        default_auto_reply="Reply `TERMINATE` if the task is done.",
        # max_consecutive_auto_reply=10,
        retrieve_config={
            "task": "qa",
            "docs_path": docs,
            "custom_text_split_function": text_splitter.split_text,
            "embedding_funcion": googleai_embedding_function,
            "client": QdrantClient(url="https://7f9bbc68-cbea-48e0-9841-2dd23f878d28.us-east4-0.gcp.cloud.qdrant.io", api_key= os.environ["QDRANT_API_KEY"]),
            "collection_name": collection,
            "get_or_create": True,
        },
        code_execution_config=False,
        description="Assistant who has extra content retrieval power for solving difficult problems.",
    )
    return retrieval_assistant

expert = AssistantAgent(
    "Expert",
    system_message=f'''You are an expert on {problem} Assist in answering the problem. Then, put the information in a list using the following schema: {SCHEMA}. Follow the instructions of the SCHEMA.
    You should return a python {Dict}: {{'question1': SCHEMA, 'question2': SCHEMA, 'question3': SCHEMA}}. Do not change the author, question, affiliation, or comments fields. This response must be a JSON object.
    Reply 'TERMINATE' in the end when everything is done.''',
    llm_config=llm_config_gen,
    human_input_mode="NEVER",  # Never ask for human input.
    description="Expert in question generation.",
)

format_verifier = autogen.ConversableAgent(
    "format_verifier",
    system_message=f'''You are a format verifier that ensures that the expert's response is a python dictionary of dictionaries: {Dict[Dict, Any]}. 
    Reply 'VALID' if the format is a singular JSON object.
    Otherwise, reply 'WRONG' and prompt expert for a new format.''',
    llm_config=llm_config,
    human_input_mode="NEVER",  # Never ask for human input.
    description="Verifier who can verify the correctness of expert's questions.",
)

generality_verifier = autogen.ConversableAgent(
    "generality_verifier",
    system_message=f'''You are a generality verifier that ensures that a given question is not paper-specific. That is, it can be answered without reading the paper.
    If the question is too specific, you prompt the expert for another question.
    Reply 'VALID' if the response is general enough to be answered without reading the paper and pass the JSON object to the question_generation_automator. Otherwise, reply 'WRONG' and provide feedback on how to improve the question.''',
    llm_config=llm_config,
    human_input_mode="NEVER",
    description="Generality Verifier who ensures that each question is general enough to be answered without reading the given paper.",
)

automator = AssistantAgent(
    'question_generation_automator',
    system_message=f'''DO NOT UPDATE CONTEXT. You are an automator that takes the JSON object produced by the expert and makes a function call to the add_questions function.
    Reply 'TERMINATE' in the end when everything is done.''',
    llm_config=llm_config,
    human_input_mode="NEVER",
    description="Automator utilizes functions to automate the process of generating questions.",
)

user_proxy = autogen.ConversableAgent(
    name="User",
    system_message=f'''You are a user that can execute the add_questions function to add the generated questions to the CSV file.''',
    is_termination_msg=termination_msg,
    human_input_mode="NEVER",
    code_execution_config=False,
    description="The proxy that adds the question to the CSV file using the add_questions function"
)

# web_surfer = WebSurferAgent(
#     "web_surfer",
#     system_message=f'''You are a web surfer that can find the DOI of a paper using the paper's title.
#     Reply 'TERMINATE' in the end when everything is done''',
#     llm_config=llm_config,
#     summarizer_llm_config=llm_config,
#     browser_config={"viewport_size": 4096, "bing_api_key": os.environ["BING_API_KEY"]},
#     description="Web Surfer who can search the web for information.",
# )

In [12]:
def add_question(question: str, correct_answer: str, distractors: List[str], skills: str, domains: str, difficulty: str, doi: str, author: str, comments: str, affiliation: str, position: str) -> str:
    cur = pd.DataFrame([{
        "question": question,
        "correct_answer": correct_answer,
        "distractors": distractors,
        "skills": skills,
        "domains": domains,
        "difficulty": difficulty,
        "doi": doi,
        "author": author,
        "comments": comments,
        "affiliation": affiliation,
        "position": position,
    }])
    if os.path.exists(r"generatedQuestions/generated_questions.csv"):
        df = pd.read_csv(r"generatedQuestions/generated_questions.csv")
        df = pd.concat([df, cur])
        df.to_csv(r"generatedQuestions/generated_questions.csv", index=False)
    else:
        cur.to_csv(r"generatedQuestions/generated_questions.csv", index=False)
    return "Question added to the CSV file."

In [13]:
def add_questions(question1: Dict, question2: Dict, question3: Dict) -> str:
    q1 = pd.DataFrame([question1])
    q2 = pd.DataFrame([question2])
    q3 = pd.DataFrame([question3])
    cur = pd.concat([q1, q2, q3], ignore_index=True)
    if os.path.exists(r"generatedQuestions/generated_questions.csv"):
        cur.to_csv(r"generatedQuestions/generated_questions.csv", mode='a', header=False, index=False)
    else:
        cur.to_csv(r"generatedQuestions/generated_questions.csv", mode='w', index=False)
    return 'Finished'

In [14]:
register_function(
    add_questions,
    caller=automator,
    executor=user_proxy,
    name="add_questions",
    description="Adds all generated questions to the CSV file. Arguments required: question1=Dict, question2=Dict, question3=Dict",
)

In [15]:
docs = glob.glob('../papers/*.pdf')
doc_count = len(docs)

In [16]:
def _reset_agents(retrieval_assistant):
    retrieval_assistant.reset()
    expert.reset()
    # format_verifier.reset()
    # generality_verifier.reset()
    automator.reset()
    user_proxy.reset()
    #web_surfer.reset()

In [17]:
for idx, doc in enumerate(docs[:1]):
    retrieval_assistant = initiate_RAG_agent([doc], f'doc{idx}')
    def state_transition(last_speaker, groupchat):
        messages = groupchat.messages
        if last_speaker is retrieval_assistant:
            return expert
        elif last_speaker is expert:
            if "UPDATE CONTEXT" in messages[-1]["content"]:
                return retrieval_assistant
            return automator
        # elif last_speaker is format_verifier:
        #     if "VALID" in messages[-1]["content"]:
        #         return generality_verifier
        #     else:
        #         return expert
        # elif last_speaker is generality_verifier:
        #     if "VALID" in messages[-1]["content"]:
        #         return automator 
        #     else:
        #         return expert
        elif last_speaker is automator:
            return user_proxy
        elif last_speaker is user_proxy:
            return None
    def rag_chat(retrieval_assistant):
        _reset_agents(retrieval_assistant)
        groupchat = autogen.GroupChat(
            agents=[retrieval_assistant, expert, automator, user_proxy], messages=[], max_round=20,
            speaker_selection_method=state_transition,
            send_introductions=True, # Provides information on each agent in the group chat to the manager.
        )
        
        manager = autogen.GroupChatManager(groupchat=groupchat, llm_config={
            "config_list": config_list, 
            "cache_seed": None, # Ensures differing responses
            "timeout": 600,
            "seed": 42,
            }
        )

        # Start chatting with retrieval_assistant as this is the user proxy agent.
        retrieval_assistant.initiate_chat(
            manager,
            message=retrieval_assistant.message_generator,
            problem=problem,
            n_results=3,
        )
    rag_chat(retrieval_assistant)


Trying to create collection.


2024-07-11 10:03:38,438 - autogen.agentchat.contrib.qdrant_retrieve_user_proxy_agent - INFO - Found 21 chunks.[0m


[32mAdding content of doc 20 to context.[0m
[32mAdding content of doc 19 to context.[0m
[32mAdding content of doc 12 to context.[0m
[33massistant[0m (to chat_manager):

You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the
context provided by the user.
If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.
You must give as short an answer as possible.

User's question is: For each paper, generate 3 unique and extremely difficult to answer multiple choice questions with 5 choices each.
The answerer does not have access to the paper, you cannot require context for the question.
These should be general knowledge questions with supporting evidence from the paper.
All context required to answer the question must be provided within the question statement.
The question statement cannot include 'in this study', 'in this paper', 'according to the paper', etc.
There should be exactly 

In [48]:
from json import loads, dumps
import requests
import pandas as pd
df = pd.read_csv(r"generatedQuestions/generated_questions.csv", index_col=False)
length = df.shape[0]
headers = {
            "Content-Type": "application/json"
        }
TEST_URL = 'https://web.cels.anl.gov/projects/auroragptquestions/api/test_question'
SUBMIT_URL = 'https://web.cels.anl.gov/projects/auroragptquestions/api/question'
for idx in range(1):
    data = dumps(df.iloc[idx].to_dict())
    response = requests.post(TEST_URL, headers=headers, data=data)
    print(response)
df.iloc[idx].to_dict()
# for idx in range(length):
#     data = df.iloc[idx].to_json()
#     response = requests.post(SUBMIT_URL, headers=headers, data=data)
#     print(response)
    

<Response [500]>


{'question': 'Which antibody is often used as a control in cell line experiments due to its ubiquitous expression in eukaryotic cells?',
 'correct_answer': 'GAPDH',
 'distractors': "['Tubulin', 'Actin', 'Myc', 'Hemoglobin']",
 'skills': "['General knowledge']",
 'domains': "['biology']",
 'difficulty': 'hard',
 'doi': 'doi.org/10.1038/nature12345',
 'author': "{'name': 'Jose A. Tandoc', 'affiliation': 'Argonne', 'position': 'Student', 'orcid': 'NA'}",
 'support': nan,
 'comments': 'generated question'}

In [47]:
import ast
data = df.iloc[1]
data['doi'] = '1234TEST'
data['distractors'] = ast.literal_eval(data["distractors"])
data['skills'] = ast.literal_eval(data["skills"])
data['domains'] = ast.literal_eval(data["domains"])
data['author'] = ast.literal_eval(data["author"])
data['support'] = ""
data.to_json()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['doi'] = '1234TEST'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['distractors'] = ast.literal_eval(data["distractors"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['skills'] = ast.literal_eval(data["skills"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['domains'] = ast.literal_eva

'{"question":"In a Bayesian statistical framework, what is the term used to describe the initial distributions input into the model prior to incorporating observed data?","correct_answer":"Priors","distractors":["Posterior","Likelihood","Conjugate","Monte Carlo"],"skills":["General knowledge","Understanding identifiers\\/notation"],"domains":["mathematics"],"difficulty":"hard","doi":"1234TEST","author":{"name":"Jose A. Tandoc","affiliation":"Argonne","position":"Student","orcid":"NA"},"support":"","comments":"generated question"}'

In [45]:
DATA = data.to_json()
response = requests.post(TEST_URL, headers=headers, data=DATA)
DATA.replace("'", "\"")

'{"question":"In a Bayesian statistical framework, what is the term used to describe the initial distributions input into the model prior to incorporating observed data?","correct_answer":"Priors","distractors":["Posterior","Likelihood","Conjugate","Monte Carlo"],"skills":["General knowledge","Understanding identifiers\\/notation"],"domains":["mathematics"],"difficulty":"hard","doi":"1234TEST","author":"{"name": "Jose A. Tandoc", "affiliation": "Argonne", "position": "Student", "orcid": "NA"}","support":"","comments":"generated question"}'

<Response [200]>


In [None]:
# user_proxy = autogen.ConversableAgent(
#     name="User",
#     llm_config=False,
#     is_termination_msg=lambda msg: msg.get("content") is not None and "TERMINATE" in msg["content"],
#     human_input_mode="NEVER",
# )

# assistant = autogen.ConversableAgent(
#     name="Assistant",
#     system_message="You are a helpful AI assistant. "
#     "You help with JSON Formatting"
#     "Return 'TERMINATE' when the task is done.",
#     llm_config=llm_config,
# )
# test = []
# def json_return(first_name: str, last_name: str, email: str) -> Dict:
#     form = {
#         'first_name': first_name,
#         'last_name': last_name,
#         'email': email,
#     }
#     test.append(form)
#     return form

# register_function(
#     json_return,
#     caller=assistant,  # The assistant agent can suggest calls to the calculator.
#     executor=user_proxy,  # The user proxy agent can execute the calculator calls.
#     name="json_return",  # By default, the function name is used as the tool name.
#     description="A json object returner",  # A description of the tool.
# )
# form = {
#     'first_name': 'first name',
#     'last_name': 'last name',
#     'email': 'email',
# }
# chat_result = user_proxy.initiate_chat(assistant, message= f'''Produce a json object using the following format: {form} given this information: Alec Tandoc, metandoc@gmail.com''', max_turns=3)