In [1]:
import json
from util import sanitize_filename
import os
from collections import defaultdict
from llm import chat_openai
from autogpt.commands.web_search import web_search_ddg
from datetime import datetime

In [2]:
import os

chunk_size = 4000 # How many characters to read per chunk in website text
overlap = 25 # How much overlap between chunks
MAX_TOKENS = 10000 # Roughly 3K tokens, $0.10 per MAX_TOKENs, max tokens to read per key question
search_query = "How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?"

In [3]:
# Can be moved to util.py
# Get the list of key questions
def get_key_questions_list_string(search_query_file_safe):
    file_path = f'autoscious_logs/{search_query_file_safe}/decompositions/improved_decomposition.json'

    with open(file_path, 'r') as f:
        question_decomposition = json.load(f)
    key_questions = question_decomposition['key_drivers']['1']['hypotheses']['1']['key_questions']
    print("\nkey_questions\n", key_questions)

    numbered_key_questions_string = ""
    for key, value in key_questions.items():
        first_two_words = ' '.join(value.split())
        numbered_key_questions_string += f'{int(key) - 1}. {first_two_words}\n'

    print("\nnumbered_key_questions_string\n", numbered_key_questions_string)
    return key_questions, numbered_key_questions_string

In [4]:
# 1) Get key questions
search_query_file_safe = sanitize_filename(search_query)
key_questions_dict, key_questions_list_string = get_key_questions_list_string(search_query_file_safe)
print("key_questions_dict", key_questions_dict)

# 2) create facts folder and subfolders
facts_folder_path = f'autoscious_logs/{sanitize_filename(search_query)}/facts'
if not os.path.exists(facts_folder_path):
    os.makedirs(facts_folder_path)
for kq_idx in key_questions_dict:
    kq_facts_folder_path = f'{facts_folder_path}/kq{int(kq_idx)-1}'
    if not os.path.exists(kq_facts_folder_path):
        os.makedirs(kq_facts_folder_path)
    with open(f'{kq_facts_folder_path}/facts.txt', 'a') as f:
        pass


key_questions
 {'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?', '2': 'How does ECR enzyme activity compare to other bacteria?', '3': 'What factors influence ECR enzyme activity in Kitsatospor setae bacteria?', '4': 'What are the potential applications or implications of high ECR enzyme activity in Kitsatospor setae bacteria?'}

numbered_key_questions_string
 0. What is the level of ECR enzyme activity in Kitsatospor setae bacteria?
1. How does ECR enzyme activity compare to other bacteria?
2. What factors influence ECR enzyme activity in Kitsatospor setae bacteria?
3. What are the potential applications or implications of high ECR enzyme activity in Kitsatospor setae bacteria?

key_questions_dict {'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?', '2': 'How does ECR enzyme activity compare to other bacteria?', '3': 'What factors influence ECR enzyme activity in Kitsatospor setae bacteria?', '4': 'What are the potential appli

In [5]:
# Can be broken to prompts.py and util.py
import os

def extract_facts_from_website_text(search_query_file_safe, key_questions_list_string, website_title, website_text, website_url):
    seed_initial_question_decomposition_prompt = f'''
Key questions (index : question): 
{key_questions_list_string}

Task: 
Extract and output as many accurate direct quotes from the text that are relevant to answering the key questions and its most relevant key question index. Format as a JSON.
```json
{{
  "1": {{
    "quote": "",
    "index": 0
  }},
  etc.
}}
```

Text: {website_text}

Respond only with the output, with no explanation or conversation.
'''
    # Ask GPT the prompt
    print("seed_initial_question_decomposition_prompt", seed_initial_question_decomposition_prompt)
    res = chat_openai(seed_initial_question_decomposition_prompt, model="gpt-3.5-turbo")
    print("Extracted quotes: ", res[0])

    # Save the quote to the corresponding key question index file
    res_json = json.loads(res[0])
    for key, value in res_json.items():
        index = value['index']
        quote = value['quote']

        # Only log if there is a quote
        if quote:
          file_name = f'autoscious_logs/{search_query_file_safe}/facts/kq{index}/facts.txt'

          with open(file_name, 'a', encoding='utf-8') as f:
              f.write(quote.replace('/"', '"') + f"[{website_url}]" + os.linesep)

    return

In [6]:
# Can be moved to util
def chunk_text(text: str, chunk_size: int, overlap: int) -> list[str]:
    chunks = []
    for i in range(0, len(text), chunk_size - overlap):
        chunks.append(text[i : i + chunk_size])
        print("len(chunk): ", text[i : i + chunk_size])
    return chunks

In [7]:
# Go through each full text rated highly to extract facts from
full_text_folder_path = f'autoscious_logs/{sanitize_filename(search_query)}/sources/full_text'

# Loop through every file in the directory
for filename in os.listdir(full_text_folder_path):
    curr_tokens = 0
    if filename.endswith('.txt'):
        file_path = os.path.join(full_text_folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            title = file.readline().strip()
            url = file.readline().strip()
            text = file.read()
        
        # Break the text into chunk to extract information from
        chunks = chunk_text(text, chunk_size, overlap)
        for i, chunk in enumerate(chunks):
            print(f"Chunk: {i} / {len(chunks)}")
            extract_facts_from_website_text(search_query_file_safe, key_questions_list_string, title, chunk, url)

            curr_tokens += len(chunk)
            if curr_tokens > MAX_TOKENS:
                print("Max tokens reached in chunks!")
                break

len(chunk):  Back to Top
Skip to main content
The .gov means it’s official.
Federal government websites often end in .gov or .mil. Before
sharing sensitive information, make sure you’re on a federal
government site.
The site is secure.
The https:// ensures that you are connecting to the
official website and that any information you provide is encrypted
and transmitted securely.
Access keys
NCBI Homepage
MyNCBI Homepage
Main Content
Main Navigation
Journal List
Virulence
v.8(6); 2017
PMC5626244
As a library, NLM provides access to scientific literature. Inclusion in an NLM database does not imply endorsement of, or agreement with,
the contents by NLM or the National Institutes of Health.
Learn more about our disclaimer.
Virulence. 2017; 8(6): 938–958. Published online 2016 Oct 20.
doi: 10.1080/21505594.2016.1250995PMCID: PMC5626244PMID: 27763824Blue light treatment of Pseudomonas aeruginosa: Strong bactericidal activity, synergism with antibiotics and inactivation of virulence factorsGr

Completion info:  {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "{\n  \"0\": {\n    \"quote\": \"Pseudomonas aeruginosa is among the most common pathogens responsible for both acute and chronic infections of high incidence and severity.\",\n    \"index\": 1\n  },\n  \"1\": {\n    \"quote\": \"P. aeruginosa resistance to conventional antimicrobials has increased rapidly over the past decade.\",\n    \"index\": 1\n  }\n}",
        "role": "assistant"
      }
    }
  ],
  "created": 1691117585,
  "id": "chatcmpl-7jfE1tVFpbTJjtWiTMo2o4dudtD0g",
  "model": "gpt-3.5-turbo-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 86,
    "prompt_tokens": 1181,
    "total_tokens": 1267
  }
}
Extracted quotes:  {
  "0": {
    "quote": "Pseudomonas aeruginosa is among the most common pathogens responsible for both acute and chronic infections of high incidence and severity.",
    "index": 1
  },
  "1": {
    "quote