In [3]:
import json
from util import sanitize_filename
import os
from collections import defaultdict
from llm import chat_openai
from datetime import datetime

In [4]:
import os

chunk_size = 4000 # How many characters to read per chunk in website text
overlap = 25 # How much overlap between chunks
MAX_TOKENS = 10000 # Roughly 3K tokens, $0.10 per MAX_TOKENs, max tokens to read per key question
search_query = "How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?"

In [5]:
# Can be moved to util.py
# Get the list of key questions
def get_key_questions_list_string(search_query_file_safe):
    file_path = f'autoscious_logs/{search_query_file_safe}/decompositions/improved_decomposition.json'

    with open(file_path, 'r') as f:
        question_decomposition = json.load(f)
    key_questions = question_decomposition['key_drivers']['1']['hypotheses']['1']['key_questions']
    print("\nkey_questions\n", key_questions)

    numbered_key_questions_string = ""
    for key, value in key_questions.items():
        first_two_words = ' '.join(value.split())
        numbered_key_questions_string += f'{int(key) - 1}. {first_two_words}\n'

    print("\nnumbered_key_questions_string\n", numbered_key_questions_string)
    return key_questions, numbered_key_questions_string

In [6]:
# 1) Get key questions
search_query_file_safe = sanitize_filename(search_query)
key_questions_dict, key_questions_list_string = get_key_questions_list_string(search_query_file_safe)
print("key_questions_dict", key_questions_dict)

# 2) create facts folder and subfolders
facts_folder_path = f'autoscious_logs/{sanitize_filename(search_query)}/facts'
if not os.path.exists(facts_folder_path):
    os.makedirs(facts_folder_path)
for kq_idx in key_questions_dict:
    kq_facts_folder_path = f'{facts_folder_path}/kq{int(kq_idx)-1}'
    if not os.path.exists(kq_facts_folder_path):
        os.makedirs(kq_facts_folder_path)
    with open(f'{kq_facts_folder_path}/facts.txt', 'w') as f:
        pass


key_questions
 {'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?', '2': 'How does ECR enzyme activity compare to other bacteria?', '3': 'What factors influence ECR enzyme activity in Kitsatospor setae bacteria?', '4': 'What are the potential applications or implications of high ECR enzyme activity in Kitsatospor setae bacteria?'}

numbered_key_questions_string
 0. What is the level of ECR enzyme activity in Kitsatospor setae bacteria?
1. How does ECR enzyme activity compare to other bacteria?
2. What factors influence ECR enzyme activity in Kitsatospor setae bacteria?
3. What are the potential applications or implications of high ECR enzyme activity in Kitsatospor setae bacteria?

key_questions_dict {'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?', '2': 'How does ECR enzyme activity compare to other bacteria?', '3': 'What factors influence ECR enzyme activity in Kitsatospor setae bacteria?', '4': 'What are the potential appli

In [7]:
# Can be broken to prompts.py and util.py
import os

def extract_facts_from_website_text(search_query_file_safe, key_questions_list_string, website_title, website_text, website_url):
    seed_initial_question_decomposition_prompt = f'''
Key questions (index : question): 
{key_questions_list_string}

Task: 
Extract accurate and useful direct quotes from the source text that are relevant to answering the key questions and also output the quote's most relevant key question index. Format as a JSON.
```json
{{
  "<insert unique quote index>": {{
    "reasoning": <one sentence explaining how this direct quote helps answer the key question>
    "quote": "<insert direct quote>",
    "index": <insert most relevant key question index>
  }},
  etc.
}}
```

Source text: {website_text}

Respond only with the output, with no explanation or conversation.
'''
    # Ask GPT the prompt
    print("seed_initial_question_decomposition_prompt", seed_initial_question_decomposition_prompt)
    res = chat_openai(seed_initial_question_decomposition_prompt, model="gpt-3.5-turbo")
    print("Extracted quotes: ", res[0])

    # Save the quote to the corresponding key question index file
    res_json = json.loads(res[0])
    for key, value in res_json.items():
        index = value['index']
        quote = value['quote']

        # Only log if there is a quote
        if quote:
          file_name = f'autoscious_logs/{search_query_file_safe}/facts/kq{index}/facts.txt'

          with open(file_name, 'a', encoding='utf-8') as f:
              f.write(quote.replace('/"', '"') + f"[{website_url}]" + os.linesep)

    return

In [8]:
# Can be moved to util
def chunk_text(text: str, chunk_size: int, overlap: int) -> list[str]:
    chunks = []
    for i in range(0, len(text), chunk_size - overlap):
        chunks.append(text[i : i + chunk_size])
        print("len(chunk): ", text[i : i + chunk_size])
    return chunks

In [9]:
# Go through each full text rated highly to extract facts from
full_text_folder_path = f'autoscious_logs/{sanitize_filename(search_query)}/sources/full_text'

# Loop through every file in the directory
for filename in os.listdir(full_text_folder_path):
    curr_tokens = 0
    if filename.endswith('.txt'):
        file_path = os.path.join(full_text_folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            title = file.readline().strip()
            url = file.readline().strip()
            text = file.read()
        
        # Break the text into chunk to extract information from
        chunks = chunk_text(text, chunk_size, overlap)
        for i, chunk in enumerate(chunks):
            print(f"Chunk: {i} / {len(chunks)}")
            extract_facts_from_website_text(search_query_file_safe, key_questions_list_string, title, chunk, url)

            curr_tokens += len(chunk)
            if curr_tokens > MAX_TOKENS:
                print("Max tokens reached in chunks!")
                break

len(chunk):  ContentsSignificanceAbstractResultsDiscussionMaterials and MethodsData AvailabilityAcknowledgmentsSupporting InformationReferencesInformation & AuthorsMetrics & CitationsView OptionsReferencesMediaShareSignificanceCarboxylases capture and convert CO2, which makes them key enzymes in photosynthesis and the global carbon cycle. However, the question how enzymes bind atmospheric CO2 is still unsolved. We studied enoyl-CoA carboxylases/reductases (Ecrs), the fastest CO2-fixing enzymes in nature, using structural biology, biochemistry, and advanced computational methods. Ecrs create a highly specific CO2-binding pocket with 4 amino acids at the active site. The pocket controls the fate of the gaseous molecule during catalysis and shields the catalytic center from oxygen and water. This exquisite control makes Ecrs highly efficient carboxylases outcompeting RuBisCO, the key enzyme of photosynthesis, by an order of magnitude. Our findings define the atomic framework for the futur

Completion info:  {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "{\n  \"0\": {\n    \"reasoning\": \"This quote provides information about the level of ECR enzyme activity in Kitsatospor setae bacteria.\",\n    \"quote\": \"Ecrs create a highly specific CO2-binding pocket with 4 amino acids at the active site.\",\n    \"index\": 0\n  },\n  \"1\": {\n    \"reasoning\": \"This quote compares the catalytic efficiency and fidelity of ECR enzymes to RuBisCO, the key enzyme of photosynthesis.\",\n    \"quote\": \"which outcompete the plant enzyme RuBisCO in catalytic efficiency and fidelity by more than an order of magnitude.\",\n    \"index\": 1\n  },\n  \"2\": {\n    \"reasoning\": \"This quote explains the factors required to create a highly efficient CO2-fixing enzyme in Kitsatospor setae bacteria.\",\n    \"quote\": \"4 amino acids, N81, F170, E171, and H365, are required to create a highly efficient CO2-fixing enzyme.\",\n