In [1]:
from autogpt.commands.web_selenium import browse_website, scrape_text_with_selenium_no_agent
import json
from util import sanitize_filename
import os
from prompts import get_predicted_usefulness_of_text_prompt
from collections import defaultdict
from llm import chat_openai
from autogpt.commands.web_search import web_search_ddg
from datetime import datetime

In [2]:
# Top-level research question for this run. It is also sanitized into the
# per-project folder name under autoscious_logs/, so editing the text changes
# where every later cell reads and writes its files.
# NOTE(review): "Kitsatospor setae" looks like a misspelling of
# "Kitasatospora setae" - confirm before changing, because existing logs are
# stored under the folder derived from the current spelling.
search_query = "How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?"

In [3]:
# File-name-safe form of the research question; used below as the project
# folder name in every autoscious_logs/... path.
search_query_file_safe = sanitize_filename(search_query)

In [4]:
# Load the research-question decomposition for this project.
# The cells below read decomposition['key_drivers'][<id>]['hypotheses'][<id>]['key_questions'].
with open(f'autoscious_logs/{search_query_file_safe}/decompositions/improved_decomposition.json', 'r') as f:
    decomposition = json.load(f)

In [5]:
# Get the key questions (a dict of question id -> question text).
# Only driver '1' / hypothesis '1' is read here; the ids are hard-coded.
key_questions = decomposition['key_drivers']['1']['hypotheses']['1']['key_questions']
print(key_questions)

{'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?', '2': 'How does ECR enzyme activity compare to other bacteria?', '3': 'What factors influence ECR enzyme activity in Kitsatospor setae bacteria?', '4': 'What are the potential applications or implications of high ECR enzyme activity in Kitsatospor setae bacteria?'}


In [6]:
# Create the sources folder tree used by the cells below:
#   sources/full_text  - full text of sources worth reading in detail
#   sources/kq<N>      - per-key-question working files (N is zero-based,
#                        while the key question ids in the JSON start at '1')
# The previous version tested for 'sources/...' but created 'source/...',
# so the folders that later cells write into were never created.
# exist_ok=True keeps the cell safe to re-run; makedirs also creates the
# parent 'sources' directory.
sources_dir = f'autoscious_logs/{search_query_file_safe}/sources'
os.makedirs(f'{sources_dir}/full_text', exist_ok=True)

for key in key_questions.keys():
    os.makedirs(f'{sources_dir}/kq{int(key)-1}', exist_ok=True)

In [7]:
# Selects the prompt variant in get_initial_search_queries_prompt, which
# handles exactly two values: "General" and "OpenAlex".
search_engine = "General"
# search_engine = "OpenAlex"

### Getting key questions and the decomposition and context

In [8]:
# Build one decomposition per key question: each entry keeps the top-level
# fields of the full decomposition but narrows 'key_drivers' down to a single
# driver -> hypothesis -> key question path.
context = "Enoyl-CoA carboxylase/reductase enzymes (ECRs)"
key_question_decomposition_list = []
for driver_key, driver_value in decomposition['key_drivers'].items():
    for hypothesis_key, hypothesis_value in driver_value['hypotheses'].items():
        for question_key, question_value in hypothesis_value['key_questions'].items():
            # Innermost level first: the one question under its hypothesis.
            single_hypothesis = {
                'hypothesis': hypothesis_value['hypothesis'],
                'key_questions': {question_key: question_value},
            }
            single_driver = {
                'driver': driver_value['driver'],
                'hypotheses': {hypothesis_key: single_hypothesis},
            }
            # Shallow copy of the top level with 'key_drivers' replaced in place.
            key_question_decomposition_list.append(
                {**decomposition, 'key_drivers': {driver_key: single_driver}}
            )
print("Key questions decomposition list: ", key_question_decomposition_list)

Key questions decomposition list:  [{'project_question': 'How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?', 'project_objective': 'To determine the efficiency of ECR enzymes in Kitsatospor setae bacteria', 'key_drivers': {'1': {'driver': 'ECR enzyme activity', 'hypotheses': {'1': {'hypothesis': 'ECR enzyme activity is high in Kitsatospor setae bacteria', 'key_questions': {'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?'}}}}}}, {'project_question': 'How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?', 'project_objective': 'To determine the efficiency of ECR enzymes in Kitsatospor setae bacteria', 'key_drivers': {'1': {'driver': 'ECR enzyme activity', 'hypotheses': {'1': {'hypothesis': 'ECR enzyme activity is high in Kitsatospor setae bacteria', 'key_questions': {'2': 'How does ECR enzyme activity compare to other bacteria?'}}}}}}, {'project_question': 'How efficiently do the ECR enzymes work,

### Coming up with many good search queries

In [33]:
def get_initial_search_queries_prompt(context, key_question_decomposition, search_engine):
  """Build the LLM prompt that asks for ~3 search queries for one key question.

  Args:
    context: Background text inserted under "Context:".
    key_question_decomposition: Decomposition narrowed to a single key
      question; it is interpolated with its default string formatting.
    search_engine: "General" (web search engine wording) or "OpenAlex"
      (research-paper database / n-gram index wording).

  Returns:
    The prompt string. For the two supported engines the text is identical
    to what the previous two hand-written templates produced.

  Raises:
    ValueError: if search_engine is not one of the supported values. The
      previous version fell through and returned None, which only failed
      later when the "prompt" was sent to the model.
  """
  # The two prompt variants differ only in what the queries are optimized for.
  optimization_targets = {
    "General": "best search engine results",
    "OpenAlex": "best results from a research paper database that searches using an index of word sequences called n-grams",
  }
  if search_engine not in optimization_targets:
    raise ValueError(
      f"Unsupported search_engine {search_engine!r}; expected one of {sorted(optimization_targets)}"
    )
  target = optimization_targets[search_engine]

  # NOTE(review): the ```json fence in the template is never closed; left as-is
  # so the prompt sent to the model does not change.
  return (
    "\n"
    "Context:\n"
    f"{context}\n"
    "\n"
    "Research question decomposition\n"
    f"{key_question_decomposition}\n"
    "\n"
    "Task:\n"
    "For the key question, write a clear and comprehensive but short (around 3 queries) list of search keyword queries "
    f"optimized for {target}, so that you can confidently and quickly surface the most relevant information to determine the best answer to the question. \n"
    "\n"
    "The output should be in JSON format: \n"
    "```json\n"
    "{\n"
    '  1: "<insert query>",\n'
    "  etc.\n"
    "}\n"
    "\n"
    "Respond only with the output, with no explanation or conversation.\n"
  )

In [34]:
# Preview the prompt generated for the first key question.
print(get_initial_search_queries_prompt(context, key_question_decomposition_list[0], search_engine))


Context:
Enoyl-CoA carboxylase/reductase enzymes (ECRs)

Research question decomposition
{'project_question': 'How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?', 'project_objective': 'To determine the efficiency of ECR enzymes in Kitsatospor setae bacteria', 'key_drivers': {'1': {'driver': 'ECR enzyme activity', 'hypotheses': {'1': {'hypothesis': 'ECR enzyme activity is high in Kitsatospor setae bacteria', 'key_questions': {'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?'}}}}}}

Task:
For the key question, write a clear and comprehensive but short (around 3 queries) list of search keyword queries optimized for best search engine results, so that you can confidently and quickly surface the most relevant information to determine the best answer to the question. 

The output should be in JSON format: 
```json
{
  1: "<insert query>",
  etc.
}

Respond only with the output, with no explanation or conversation.



In [35]:
# TODO: Iterate through all key questions, or just ask to generate an initial set of search queries considering all key questions. Then when we iterate, we can see what facts have been added to what papers, and what unanswered questions we should focus our queries on.

context = "Enoyl-CoA carboxylase/reductase enzymes (ECRs)"

# For every single-question decomposition: ask the model for search queries,
# parse its reply as JSON and store it in that key question's folder.
for decomposition_idx, key_question_decomposition in enumerate(key_question_decomposition_list):
    queries_prompt = get_initial_search_queries_prompt(context, key_question_decomposition, search_engine)
    # The first element of chat_openai's return value is parsed as the JSON reply.
    completion_text = chat_openai(queries_prompt, model="gpt-3.5-turbo")[0]
    key_question_initial_search_queries = json.loads(completion_text)

    queries_path = f'autoscious_logs/{search_query_file_safe}/sources/kq{decomposition_idx}/initial_search_queries.json'
    with open(queries_path, 'w') as f:
        json.dump(key_question_initial_search_queries, f, indent=2)

Prompt:  
Context:
Enoyl-CoA carboxylase/reductase enzymes (ECRs)

Research question decomposition
{'project_question': 'How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?', 'project_objective': 'To determine the efficiency of ECR enzymes in Kitsatospor setae bacteria', 'key_drivers': {'1': {'driver': 'ECR enzyme activity', 'hypotheses': {'1': {'hypothesis': 'ECR enzyme activity is high in Kitsatospor setae bacteria', 'key_questions': {'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?'}}}}}}

Task:
For the key question, write a clear and comprehensive but short (around 3 queries) list of search keyword queries optimized for best search engine results, so that you can confidently and quickly surface the most relevant information to determine the best answer to the question. 

The output should be in JSON format: 
```json
{
  1: "<insert query>",
  etc.
}

Respond only with the output, with no explanation or conversation.



Completion info:  {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "{\n  \"1\": \"ECR enzyme activity Kitsatospor setae bacteria\",\n  \"2\": \"Factors affecting ECR enzyme activity\",\n  \"3\": \"Regulation of ECR enzyme activity in bacteria\"\n}",
        "role": "assistant"
      }
    }
  ],
  "created": 1691161687,
  "id": "chatcmpl-7jqhLghbedyUReTc9IosfSPoPyySk",
  "model": "gpt-3.5-turbo-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 45,
    "prompt_tokens": 276,
    "total_tokens": 321
  }
}
Prompt:  
Context:
Enoyl-CoA carboxylase/reductase enzymes (ECRs)

Research question decomposition
{'project_question': 'How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?', 'project_objective': 'To determine the efficiency of ECR enzymes in Kitsatospor setae bacteria', 'key_drivers': {'1': {'driver': 'ECR enzyme activity', 'hypotheses': {'1': {'hypothesis': 'ECR enzyme 

### Web search given search keywords

In [36]:
import os
from dotenv import load_dotenv
from googleapiclient.discovery import build
load_dotenv()

True

In [37]:
import time
def google_search_raw(search_term, cse_id, **kwargs):
    """Run one query through the Google "customsearch" v1 client and return the raw result items.

    Args:
        search_term: Query string, passed as ``q``.
        cse_id: Custom search engine id, passed as ``cx``.
        **kwargs: Extra parameters forwarded unchanged to ``cse().list(...)``.

    Returns:
        The list stored under "items" in the response, or an empty list when
        the response has no "items" key. Items are returned whole (callers in
        this notebook read their 'title', 'link' and 'snippet' fields).
    """
    # The API key is read from the DEV_KEY environment variable.
    service = build("customsearch", "v1", developerKey=os.getenv('DEV_KEY'))
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()

    search_results = res.get("items", [])
    # Pause one second after every request (presumably to stay under the API rate limit).
    time.sleep(1)

    # The previous version also built an unused list of item["link"] values,
    # which raised KeyError for any item without a "link" field; it is removed
    # because the full items are what gets returned.
    return search_results

In [38]:
def search_google(search_query, num_results=8):
    """Search Google for ``search_query`` via google_search_raw with this notebook's fixed settings.

    Args:
        search_query: Query string.
        num_results: Number of results to request, sent as ``num``. Defaults
            to 8, the value that used to be hard-coded here.
            NOTE(review): the Custom Search API documents ``num`` as 1-10;
            this function does not validate it.

    Returns:
        Whatever google_search_raw returns for the query (the raw result items).
    """
    # The engine id comes from MY_CSE_ID; lr/cr restrict results to English / US.
    return google_search_raw(
        search_query,
        os.getenv('MY_CSE_ID'),
        num=num_results,
        lr="lang_en",
        cr="countryUS",
    )

In [39]:
MAX_RETRIES = 3  # defined here but not referenced anywhere in this cell

# Run every stored search query: DuckDuckGo first, Google only when DuckDuckGo
# returns nothing. Results are saved per query in the key question's folder.
for decomposition_idx, key_question_decomposition in enumerate(key_question_decomposition_list):
    kq_dir = f'autoscious_logs/{search_query_file_safe}/sources/kq{decomposition_idx}'
    with open(f'{kq_dir}/initial_search_queries.json', 'r') as f:
        key_question_initial_search_queries = json.load(f)

    for idx, query in key_question_initial_search_queries.items():
        print("query: ", query)

        # DDG
        web_search_res = json.loads(web_search_ddg(query))
        if len(web_search_res) == 0:
            print("trying google search!")
            # Google items are renamed to the DDG-style keys used downstream
            # ('link' -> 'href', 'snippet' -> 'body').
            web_search_res_raw = search_google(query)
            web_search_res = [
                {'title': hit['title'], 'href': hit['link'], 'body': hit['snippet']}
                for hit in web_search_res_raw
            ]

        # save web search results
        results_path = f'{kq_dir}/initial_search_results_query_{idx}.json'
        with open(results_path, 'w') as f:
            json.dump(web_search_res, f, indent=2)

query:  ECR enzyme activity Kitsatospor setae bacteria
query:  Factors affecting ECR enzyme activity
trying google search!
query:  Regulation of ECR enzyme activity in bacteria
query:  ECR enzyme activity in Kitsatospor setae bacteria
query:  Comparison of ECR enzyme activity in Kitsatospor setae bacteria with other bacteria
trying google search!
query:  Factors affecting ECR enzyme activity in Kitsatospor setae bacteria
query:  Factors influencing ECR enzyme activity in Kitsatospor setae bacteria
query:  ECR enzyme regulation in Kitsatospor setae bacteria
trying google search!
query:  ECR enzyme kinetics in Kitsatospor setae bacteria
query:  ECR enzyme activity Kitsatospor setae bacteria
query:  Efficiency of ECR enzymes in Kitsatospor setae bacteria
trying google search!
query:  Applications of high ECR enzyme activity in Kitsatospor setae bacteria


### Reading type 1: filtering unlikely relevant sources based on title and body

In [40]:
def get_filtering_web_results_ratings(context, decomposition, web_search_res):
    """Build the prompt that asks the model to rate search results per key question.

    The three arguments are interpolated with their default string formatting
    under the "Context:", "Research question decomposition:" and
    "Search results:" headings. The prompt requests a JSON object mapping each
    result href to [key question index, relevance score] pairs (or -1).
    """
    prompt_lines = [
        "",  # the prompt starts with a newline
        "Context:",
        f"{context}",
        "",
        "Research question decomposition:",
        f"{decomposition}",
        "",
        "Search results:",
        f"{web_search_res}",
        "",
        "Task:",
        "Based on the key questions and each search result's title and body content, identify which key question indexes the search result content is likely most useful for or -1 if it isn't useful for any key question, and for each key question index identified, assign a relevance score of the search result content using a 5-point Likert scale, with 1 being very irrelevant to 5 being very relevant to the key question.",
        "",
        "The output should be in JSON format: ",
        "```json",
        "{",
        "  <'insert href'>: [['insert key question index', 'relevance score'], etc.],",
        "  etc.",
        "}",
        "```",
        "",
        "Respond only with the output, with no explanation or conversation.",
        "",  # ...and ends with one
    ]
    return "\n".join(prompt_lines)

In [41]:
from collections import defaultdict

# For every stored query: ask the model to rate its search results, then
# regroup the reply as {relevance score -> [urls]} ('-1' collects the results
# marked as not useful) and save it next to the search results.
# NOTE(review): the prompt is given the full `decomposition`, not the
# single-question `key_question_decomposition` of this iteration - confirm
# that this is intended.
for decomposition_idx, key_question_decomposition in enumerate(key_question_decomposition_list):
    kq_dir = f'autoscious_logs/{search_query_file_safe}/sources/kq{decomposition_idx}'
    with open(f'{kq_dir}/initial_search_queries.json', 'r') as f:
        key_question_initial_search_queries = json.load(f)

    for query_idx, query in key_question_initial_search_queries.items():
        # load web search results
        with open(f'{kq_dir}/initial_search_results_query_{query_idx}.json', 'r') as f:
            web_search_res = json.loads(f.read())

        if web_search_res != []:
            # filter web results based on title and body
            ratings_prompt = get_filtering_web_results_ratings(context, decomposition, web_search_res)
            filtered_web_results = json.loads(chat_openai(ratings_prompt, model="gpt-3.5-turbo")[0])
        else:
            # Nothing to rate: skip the model call and save an empty grouping.
            filtered_web_results = {}

        ratings_url_dict = defaultdict(list)
        for url, ratings in filtered_web_results.items():
            if '-1' in str(ratings):
                # Misc & unrelated
                ratings_url_dict['-1'].append(url)
                continue
            for rating in ratings:
                # rating is a [key question index, relevance score] pair; group by score.
                ratings_url_dict[str(rating[1])].append(url)

        # save filtered search results
        with open(f'{kq_dir}/rated_web_results_query_{int(query_idx)}.json', 'w') as f:
            json.dump(ratings_url_dict, f, indent=2)

Prompt:  
Context:
Enoyl-CoA carboxylase/reductase enzymes (ECRs)

Research question decomposition:
{'project_question': 'How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?', 'project_objective': 'To determine the efficiency of ECR enzymes in Kitsatospor setae bacteria', 'key_drivers': {'1': {'driver': 'ECR enzyme activity', 'hypotheses': {'1': {'hypothesis': 'ECR enzyme activity is high in Kitsatospor setae bacteria', 'key_questions': {'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?', '2': 'How does ECR enzyme activity compare to other bacteria?', '3': 'What factors influence ECR enzyme activity in Kitsatospor setae bacteria?', '4': 'What are the potential applications or implications of high ECR enzyme activity in Kitsatospor setae bacteria?'}}}}}}

Search results:
[{'title': 'Awakening the Sleeping Carboxylase Function of Enzymes: Engineering the ...', 'href': 'https://pubs.acs.org/doi/10.1021/jacs.9b03431', 'body': 'Howe

Completion info:  {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "{\n  \"https://pubs.acs.org/doi/10.1021/jacs.9b03431\": [[\"1\", 5]],\n  \"https://www6.slac.stanford.edu/news/2022-04-29-how-soil-microbe-could-rev-artificial-photosynthesis\": [[\"1\", 4]],\n  \"https://pubmed.ncbi.nlm.nih.gov/31243147/\": [[\"1\", 5]],\n  \"https://scitechdaily.com/bacterial-enzyme-converts-co2-into-carbon-compounds-20x-faster-than-photosynthesis/\": [[\"1\", 5]],\n  \"https://www.researchgate.net/figure/Reaction-scheme-and-structural-organization-of-the-K-setae-ECR-complex-a_fig1_360182999\": [[\"1\", 5]],\n  \"https://phys.org/news/2022-04-soil-microbe-rev-artificial-photosynthesis.html\": [[\"1\", 4]],\n  \"https://pubs.acs.org/doi/10.1021/acsomega.2c01559\": -1,\n  \"https://annalsmicrobiology.biomedcentral.com/articles/10.1007/BF03175056\": -1\n}",
        "role": "assistant"
      }
    }
  ],
  "created": 1691161740,
  "id": "chatcm

### Reading type 2 & 3: filtering based on skimming and sampling from each source, and only saving most relevant sources for fact extraction and quotes.

In [9]:
# COMPLETE code for predicting usefulness of very relevant (5) and relevant (4) results.
from autogpt.commands.web_selenium import scrape_text_with_selenium_no_agent

# Length, in characters, of each sampled chunk (used to slice the scraped text).
CHUNK_SIZE = 1000
# Fraction of a source's chunks to sample; the resulting count is additionally capped by MAX_CHUNKS.
SAMPLING_FACTOR = 0.1 # Also cap it so it falls under the max token limit
# Despite the name this is a character budget: 2500 tokens at an assumed 4 characters per token.
MAX_TOKENS = 2500 * 4 # 1 token = 4 chars, 2500 + 500 (prompt) tokens is high for GPT3.5
# Upper bound on the number of sampled chunks per source (10 with the values above).
MAX_CHUNKS = int(MAX_TOKENS / CHUNK_SIZE)
# Background text inserted under "Context:" in the prompts (same value as in the earlier cells).
context = "Enoyl-CoA carboxylase/reductase enzymes (ECRs)"

In [10]:
def get_sample_chunks(text, CHUNK_SIZE, num_chunk_samples):
    """Sample up to ``num_chunk_samples`` evenly spaced chunks from ``text``.

    Chunks start at offsets 0, step, 2*step, ... where
    step = len(text) // num_chunk_samples (at least 1), and each chunk is at
    most ``CHUNK_SIZE`` characters long.

    Args:
        text: Source text to sample from.
        CHUNK_SIZE: Maximum length of each chunk, in characters.
        num_chunk_samples: Maximum number of chunks to return.

    Returns:
        A list of chunk strings. Empty when ``text`` is empty or
        ``num_chunk_samples`` is not positive (the previous version raised
        ZeroDivisionError for 0 samples and ValueError when the text was
        shorter than the number of samples).
    """
    if not text or num_chunk_samples <= 0:
        return []

    # A step of 0 would make range() raise, so never go below 1.
    step_size = max(1, len(text) // num_chunk_samples)
    # Slicing the range keeps only the first num_chunk_samples start offsets.
    chunk_starts = range(0, len(text), step_size)[:num_chunk_samples]
    return [text[start:start + CHUNK_SIZE] for start in chunk_starts]

In [11]:
# Need to determine how useful the text is likely to be for answering the key questions
def get_predicted_usefulness_of_text_prompt(context, decomposition, sample_text_chunks):
    """Build the prompt that asks the model how useful a source's full text is likely to be.

    The arguments are interpolated with their default string formatting under
    the "Context:", "Research question decomposition:" and
    "Sample text chunks from the source text:" headings. The prompt requests a
    JSON object mapping each key question index to a 1-5 usefulness score.

    Note: this definition shadows the function of the same name imported from
    ``prompts`` at the top of the notebook.
    """
    # NOTE(review): "very unlikely to be usefulness" in the Deliverables line
    # reads like a typo for "useful"; kept verbatim so the prompt is unchanged.
    prompt_lines = [
        "",  # the prompt starts with a newline
        "Task: ",
        "Based on the research question decomposition key questions and the sample text chunks of the source text, the goal is to identify how useful reading the full source text would be to extract direct quoted facts and determine the best answer to any of the key questions. ",
        "",
        "Context:",
        f"{context}",
        "",
        "Research question decomposition:",
        f"{decomposition}",
        "",
        "Sample text chunks from the source text:",
        f"{sample_text_chunks}",
        "",
        "Deliverables:",
        "For each key question, assign a predicted usefulness score of the full source text using a 5-point Likert scale, with 1 being very unlikely to be usefulness to 5 being very likely useful and containing facts that answer the key question.",
        "",
        "The output should be of the following JSON format",
        "{",
        "    <insert key question index>: <insert predicted usefulness>,",
        "   etc.",
        "}",
        "",
        "",
        "Respond only with the output, with no explanation or conversation.",
        "",  # ...and ends with one
    ]
    return "\n".join(prompt_lines)

In [12]:
def get_sample_chunks(text, CHUNK_SIZE, num_chunk_samples):
    """Sample up to ``num_chunk_samples`` evenly spaced chunks from ``text``.

    Chunks start at offsets 0, step, 2*step, ... where
    step = len(text) // num_chunk_samples (at least 1), and each chunk is at
    most ``CHUNK_SIZE`` characters long.

    NOTE(review): this notebook defines get_sample_chunks in two cells; this
    later definition is the one in effect after a full run. Consider keeping
    only one.

    Args:
        text: Source text to sample from.
        CHUNK_SIZE: Maximum length of each chunk, in characters.
        num_chunk_samples: Maximum number of chunks to return.

    Returns:
        A list of chunk strings. Empty when ``text`` is empty or
        ``num_chunk_samples`` is not positive (the previous version raised
        ZeroDivisionError for 0 samples and ValueError when the text was
        shorter than the number of samples).
    """
    if not text or num_chunk_samples <= 0:
        return []

    # A step of 0 would make range() raise, so never go below 1.
    step_size = max(1, len(text) // num_chunk_samples)
    # Slicing the range keeps only the first num_chunk_samples start offsets.
    chunk_starts = range(0, len(text), step_size)[:num_chunk_samples]
    return [text[start:start + CHUNK_SIZE] for start in chunk_starts]

def find_title(url, web_search_inf):
    """Return the "title" of the first search result whose "href" equals ``url``.

    ``web_search_inf`` is an iterable of dicts with "href" and "title" keys.
    Returns None when no result matches.
    """
    matching_titles = (entry["title"] for entry in web_search_inf if entry["href"] == url)
    return next(matching_titles, None)

In [19]:
# Skim every source rated '5' in the title/body filtering step: scrape the
# page, sample a few chunks, ask the model to predict how useful the full text
# would be for the key questions, and keep the full text of sources that get a
# predicted usefulness of 5.
# NOTE(review): the prompt is given the full `decomposition`, not the
# single-question `key_question_decomposition` of this iteration - confirm
# that this is intended.
full_text_dir = f'autoscious_logs/{sanitize_filename(search_query)}/sources/full_text'

for decomposition_idx, key_question_decomposition in enumerate(key_question_decomposition_list):
    with open(f'autoscious_logs/{search_query_file_safe}/sources/kq{decomposition_idx}/initial_search_queries.json', 'r') as f:
        key_question_initial_search_queries = json.load(f)

    for query_idx, query in key_question_initial_search_queries.items():
        # open filtered search results
        with open(f'autoscious_logs/{search_query_file_safe}/sources/kq{decomposition_idx}/rated_web_results_query_{int(query_idx)}.json', 'r') as f:
            ratings_url_dict = json.loads(f.read())

        # open web search info to extract metadata
        with open(f'autoscious_logs/{search_query_file_safe}/sources/kq{decomposition_idx}/initial_search_results_query_{int(query_idx)}.json', 'r') as f:
            web_search_info = json.load(f)

        for rating, urls in ratings_url_dict.items():
            if rating == '5': # TODO: figure out a good way to iterate and go to 4 after
                # Start with iterating through 4s and 5s of ratings_url_dict
                folder_path = f'autoscious_logs/{search_query_file_safe}/sources/kq{decomposition_idx}/predicted_usefulness_{rating}'
                os.makedirs(folder_path, exist_ok=True)

                for rating_source_idx, url in enumerate(urls):
                    print("key question", decomposition_idx, "query ", query_idx, "rating_source_idx", rating_source_idx, "Skimming url:", url)

                    # find_title returns None when the url is not among the
                    # stored results; fall back to the url so the file name and
                    # the header line written below can still be built.
                    title = find_title(url, web_search_info) or url
                    full_text_path = f'{full_text_dir}/{sanitize_filename(title)}.txt'
                    usefulness_path = f'{folder_path}/query_{query_idx}_url_index_{rating_source_idx}.json'

                    # Ensure the url hasn't already been visited
                    if not os.path.exists(full_text_path) and not os.path.exists(usefulness_path):
                        text = scrape_text_with_selenium_no_agent(url, None, search_engine='chrome')

                        # Only evaluate websites you're able to scrape
                        if text and text != "No information found":
                            total_chunks = len(text) / CHUNK_SIZE
                            # Always sample at least one chunk: for texts shorter than
                            # CHUNK_SIZE / SAMPLING_FACTOR characters the scaled count
                            # rounds down to 0, which used to crash the sampling step.
                            num_chunk_samples = max(1, min(int(total_chunks * SAMPLING_FACTOR), MAX_CHUNKS))
                            sample_chunks = get_sample_chunks(text, CHUNK_SIZE, num_chunk_samples)
                            print("len(sample_chunks)", len(sample_chunks))

                            # Get predicted usefulness based on sample chunks
                            predicted_usefulness_results = json.loads(chat_openai(get_predicted_usefulness_of_text_prompt(context, decomposition, sample_chunks), model="gpt-3.5-turbo")[0])

                            # save filtered search results
                            with open(usefulness_path, 'w') as f:
                                json.dump(predicted_usefulness_results, f, indent=2)

                            # Check if any scores were (4 or) 5, because then we should save the full text
                            pred_usefulness = predicted_usefulness_results.values()
                            if 5 in pred_usefulness or '5' in pred_usefulness:
                                # Make sure the target folder exists before writing into it.
                                os.makedirs(full_text_dir, exist_ok=True)
                                with open(full_text_path, 'w', encoding='utf-8') as f:
                                    f.write(title + '\n')
                                    f.write(url + '\n')
                                    f.write(text)
                    else:
                        print("URL or text already visited!")

key question 0 query  1 rating_source_idx 0 Skimming url: https://pubs.acs.org/doi/10.1021/jacs.9b03431
URL or text already visited!
key question 0 query  1 rating_source_idx 1 Skimming url: https://pubmed.ncbi.nlm.nih.gov/31243147/
URL or text already visited!
key question 0 query  1 rating_source_idx 2 Skimming url: https://www.researchgate.net/figure/Reaction-scheme-and-structural-organization-of-the-K-setae-ECR-complex-a_fig1_360182999
URL or text already visited!
key question 1 query  1 rating_source_idx 0 Skimming url: https://pubs.acs.org/doi/10.1021/jacs.9b03431
URL or text already visited!
key question 1 query  1 rating_source_idx 1 Skimming url: https://pubmed.ncbi.nlm.nih.gov/31243147/
URL or text already visited!
key question 1 query  1 rating_source_idx 2 Skimming url: https://www.researchgate.net/figure/Reaction-scheme-and-structural-organization-of-the-K-setae-ECR-complex-a_fig1_360182999
URL or text already visited!
key question 1 query  3 rating_source_idx 0 Skimming u