In [42]:
from llm import chat_openai
import json
from datetime import datetime

In [32]:
import re

def sanitize_filename(filename):
    return re.sub(r'[^a-zA-Z0-9]', '_', filename)

In [9]:
search_query = "How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?"

In [33]:
search_query_file_safe = sanitize_filename(search_query)

## MECE question decomposition

In [10]:
seed_initial_question_decomposition_prompt = f'''
Research project question: {search_query}

Task: Decompose this research question into 1) project objective: what should be the outcome of the project, 2) key drivers: what are the key problem drivers to achieve the project objective, 3) hypotheses: what can already be hypothesized about the key drivers, 4) key questions: what analyses need to be conducted to verify, falsify, or change the hypotheses?

Rules: Be as mutually exclusive, completely exhaustive (MECE) as possible. The output should be in nested JSON format of 
```json
{{
  "project_objective": "",
  "key_drivers": {{
    "1": {{
      "driver": "",
      "hypotheses": {{
          "1": {{
            "hypothesis": "",
            "key_questions": {{
                "1": "", 
                "2": "",
                etc.
            }},
          }},
          etc.
      }},
    "2" : {{}},
    etc.
    }}"",
  }},
}}
```
Respond only with the output, with no explanation or conversation.
'''

In [11]:
print("seed_initial_question_decomposition_prompt", seed_initial_question_decomposition_prompt)

seed_initial_question_decomposition_prompt 
Research project question: How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?

Task: Decompose this research question into 1) project objective: what should be the outcome of the project, 2) key drivers: what are the key problem drivers to achieve the project objective, 3) hypotheses: what can already be hypothesized about the key drivers, 4) key questions: what analyses need to be conducted to verify, falsify, or change the hypotheses?

Rules: Be as mutually exclusive, completely exhaustive (MECE) as possible. The output should be in nested JSON format of 
```json
{
  "project_objective": "",
  "key_drivers": {
    "1": {
      "driver": "",
      "hypotheses": {
          "1": {
            "hypothesis": "",
            "key_questions": {
                "1": "", 
                "2": "",
                etc.
            },
          },
          etc.
      },
    "2" : {},
    etc.
    }"",
  },
}
```
Respon

In [12]:
res = chat_openai(seed_initial_question_decomposition_prompt, model="gpt-3.5-turbo")
print(res[0])

Prompt:  
Research project question: How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?

Task: Decompose this research question into 1) project objective: what should be the outcome of the project, 2) key drivers: what are the key problem drivers to achieve the project objective, 3) hypotheses: what can already be hypothesized about the key drivers, 4) key questions: what analyses need to be conducted to verify, falsify, or change the hypotheses?

Rules: Be as mutually exclusive, completely exhaustive (MECE) as possible. The output should be in nested JSON format of 
```json
{
  "project_objective": "",
  "key_drivers": {
    "1": {
      "driver": "",
      "hypotheses": {
          "1": {
            "hypothesis": "",
            "key_questions": {
                "1": "", 
                "2": "",
                etc.
            },
          },
          etc.
      },
    "2" : {},
    etc.
    }"",
  },
}
```
Respond only with the output, with no ex

Completion info:  {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "{\n  \"project_objective\": \"To determine the efficiency of ECR enzymes in Kitsatospor setae bacteria\",\n  \"key_drivers\": {\n    \"1\": {\n      \"driver\": \"ECR enzyme activity\",\n      \"hypotheses\": {\n        \"1\": {\n          \"hypothesis\": \"ECR enzyme activity is high in Kitsatospor setae bacteria\",\n          \"key_questions\": {\n            \"1\": \"What is the level of ECR enzyme activity in Kitsatospor setae bacteria?\",\n            \"2\": \"How does ECR enzyme activity vary under different conditions?\",\n            \"3\": \"Are there any factors that enhance or inhibit ECR enzyme activity?\"\n          }\n        }\n      }\n    }\n  }\n}",
        "role": "assistant"
      }
    }
  ],
  "created": 1690744745,
  "id": "chatcmpl-7i6ETD80Zlsi2RIIvZyM7UuZzfl1q",
  "model": "gpt-3.5-turbo-0613",
  "object": "chat.completion",
  "usage"

In [13]:
question_decomposition = json.loads(res[0])

In [28]:
question_decomposition['search_query'] = search_query

In [29]:
print(question_decomposition)

{'project_objective': 'To determine the efficiency of ECR enzymes in Kitsatospor setae bacteria', 'key_drivers': {'1': {'driver': 'ECR enzyme activity', 'hypotheses': {'1': {'hypothesis': 'ECR enzyme activity is high in Kitsatospor setae bacteria', 'key_questions': {'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?', '2': 'How does ECR enzyme activity vary under different conditions?', '3': 'Are there any factors that enhance or inhibit ECR enzyme activity?'}}}}}, 'search_query': 'How efficiently do the ECR enzymes work, especially in Kitsatospor setae bacteria?'}


In [15]:
key_questions = question_decomposition['key_drivers']['1']['hypotheses']['1']['key_questions']
print(key_questions)

{'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?', '2': 'How does ECR enzyme activity vary under different conditions?', '3': 'Are there any factors that enhance or inhibit ECR enzyme activity?'}


In [47]:
with open(f'autoscious_logs/decompositions/{search_query_file_safe}.json', 'w') as f:
    json.dump(question_decomposition, f, indent=2)

FileNotFoundError: [Errno 2] No such file or directory: 'autoscious_logs/decompositions/How_efficiently_do_the_ECR_enzymes_work__especially_in_Kitsatospor_setae_bacteria_.json'

## Search

In [84]:
with open(r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\question_answering\autoscious_logs\decompositions\How_efficiently_do_the_ECR_enzymes_work__especially_in_Kitsatospor_setae_bacteria_.json', 'r') as f:
        question_decomposition = json.load(f)
key_questions = question_decomposition['key_drivers']['1']['hypotheses']['1']['key_questions']
print(key_questions)

{'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?', '2': 'How does ECR enzyme activity vary under different conditions?', '3': 'Are there any factors that enhance or inhibit ECR enzyme activity?'}


In [37]:
key_questions['1']

'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?'

In [17]:
import json
import time
from itertools import islice
from duckduckgo_search import DDGS
DUCKDUCKGO_MAX_ATTEMPTS = 3

In [20]:
# Taken from AutoGPT

def safe_google_results(results: str | list) -> str:
    """
        Return the results of a Google search in a safe format.

    Args:
        results (str | list): The search results.

    Returns:
        str: The results of the search.
    """
    if isinstance(results, list):
        safe_message = json.dumps(
            [result.encode("utf-8", "ignore").decode("utf-8") for result in results]
        )
    else:
        safe_message = results.encode("utf-8", "ignore").decode("utf-8")
    return safe_message

def web_search_ddg(query: str, num_results: int = 8) -> str:
    print("searching from duck duck go!")
    """Return the results of a Google search

    Args:
        query (str): The search query.
        num_results (int): The number of results to return.

    Returns:
        str: The results of the search.
    """
    search_results = []
    attempts = 0

    while attempts < DUCKDUCKGO_MAX_ATTEMPTS:
        if not query:
            return json.dumps(search_results)

        results = DDGS().text(query)
        search_results = list(islice(results, num_results))

        if search_results:
            break

        time.sleep(1)
        attempts += 1

    results = json.dumps(search_results, ensure_ascii=False, indent=4)
    return safe_google_results(results)

In [21]:
web_search_res = web_search_ddg(key_questions['1'])

searching from duck duck go!


In [22]:
web_search_res

'[\n    {\n        "title": "Intersubunit Coupling Enables Fast CO2-Fixation by Reductive ...",\n        "href": "https://pubs.acs.org/doi/10.1021/acscentsci.2c00057",\n        "body": "Here we used a combination of ambient-temperature X-ray free electron laser (XFEL) and cryogenic synchrotron experiments to study the structural organization of the ECR from Kitasatospora setae. The K. setae ECR is a homotetramer that differentiates into a pair of dimers of open- and closed-form subunits in the catalytically active state. Using ..."\n    },\n    {\n        "title": "In the active centre of carbon dioxide conversion",\n        "href": "https://www.mpg.de/13672605/0705-terr-138345-nature-s-fantastic-four",\n        "body": "They analyzed the ECR from the bacterium Kitasatospora setae, currently the fastest known carboxylase. Using a combined approach of structural biology, biochemistry and computer simulations, they were able to understand for the first time how the enzyme binds and conve

In [51]:
# save to main
file_name_safe = sanitize_filename(key_questions['1'])

with open(f'autoscious_logs/web_search/{file_name_safe}.json', 'w') as f:
    json.dump(web_search_res, f, indent=2)

## Browse websites

In [46]:
from autogpt.commands.web_selenium import browse_website

NoConsoleScreenBufferError: Found xterm-color, while expecting a Windows console. Maybe try to run this program using "winpty" or run it in cmd.exe instead. Or otherwise, in case of Cygwin, use the Python executable that is compiled for Cygwin.

## Extract relevant facts and evidence for key question

In [None]:
# Testing and reading website code is in browse_websites.py

In [111]:
with open(r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\question_answering\autoscious_logs\decompositions\How_efficiently_do_the_ECR_enzymes_work__especially_in_Kitsatospor_setae_bacteria_.json', 'r') as f:
        question_decomposition = json.load(f)
key_questions = question_decomposition['key_drivers']['1']['hypotheses']['1']['key_questions']
print(key_questions)

{'1': 'What is the level of ECR enzyme activity in Kitsatospor setae bacteria?', '2': 'How does ECR enzyme activity vary under different conditions?', '3': 'Are there any factors that enhance or inhibit ECR enzyme activity?'}


In [113]:
numbered_key_questions_string = ""
# Iterate over each item in the dictionary
for key, value in key_questions.items():
    # Extract the first two words from the question
    first_two_words = ' '.join(value.split())
    # Add the new line to the string, using the key as the number and the first two words as the question
    numbered_key_questions_string += f'{key}. {first_two_words}\n'

print(numbered_key_questions_string)

1. What is the level of ECR enzyme activity in Kitsatospor setae bacteria?
2. How does ECR enzyme activity vary under different conditions?
3. Are there any factors that enhance or inhibit ECR enzyme activity?



In [53]:
key_question = "What is the level of ECR enzyme activity in Kitsatospor setae bacteria?"

In [92]:
# Open the file in read mode ('r')
with open(r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\question_answering\autoscious_logs\web_search\What_is_the_level_of_ECR_enzyme_activity_in_Kitsatospor_setae_bacteria_\In_the_active_centre_of_carbon_dioxide_conversion_complete_text.txt', 'r', encoding='utf-8') as file:
    # Read the entire contents of the file
    website_text = file.read()

# Now, 'text' contains the contents of the file
print(website_text)

In order to overcome the climate crisis, two measures are required: The reduction of carbon dioxide (CO2) emissions, and removal of CO2 from the earth atmosphere. The latter is the goal of Tobias Erb's research group at the Max Planck Institute for Terrestrial Microbiology in Marburg. Their approaches not only aim to benefit climate protection, but also to secure sustainability in the long term: to filter CO2 from the air and make it usable for technology.
The Carboxylase active site: Four amino acids are important for CO2 binding in highly efficient CO2 fixing enzymes.
© Max-Planck-Institute for Terrestrial Microbiology/Erb
The Carboxylase active site: Four amino acids are important for CO2 binding in highly efficient CO2 fixing enzymes.
© Max-Planck-Institute for Terrestrial Microbiology/Erb
Filtering CO2 efficiently from the air - nature can do this through photosynthesis, converting CO2 into biomass. Unlike industrial technologies which can only use the gas in a highly concentrated

In [128]:
seed_initial_question_decomposition_prompt = f'''
Key questions (index : question): 
{numbered_key_questions_string}

Task: 
Extract and output any accurate direct quotes from the text that are relevant to the key questions and its most relevant key question index. Rank from most relevant to least. Format as a JSON.
```json
{{
  "1": {{
    "quote": "",
    "index": 0
  }},
  etc.
}}
```

Text: {website_text}

Respond only with the output, with no explanation or conversation.
'''

In [129]:
res = chat_openai(seed_initial_question_decomposition_prompt, model="gpt-3.5-turbo")
print(res[0])

Prompt:  
Key questions (index : question): 
1. What is the level of ECR enzyme activity in Kitsatospor setae bacteria?
2. How does ECR enzyme activity vary under different conditions?
3. Are there any factors that enhance or inhibit ECR enzyme activity?


Task: 
Extract and output any accurate direct quotes from the text that are relevant to the key questions and its most relevant key question index. Rank from most relevant to least. Format as a JSON.
```json
{
  "1": {
    "quote": "",
    "index": 0
  },
  etc.
}
```

Text: In order to overcome the climate crisis, two measures are required: The reduction of carbon dioxide (CO2) emissions, and removal of CO2 from the earth atmosphere. The latter is the goal of Tobias Erb's research group at the Max Planck Institute for Terrestrial Microbiology in Marburg. Their approaches not only aim to benefit climate protection, but also to secure sustainability in the long term: to filter CO2 from the air and make it usable for technology.
The Ca

Completion info:  {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "{\n  \"1\": {\n    \"quote\": \"We were surprised to learn that only four amino acids are sufficient to provide control over the CO2 molecule\u201d, explains Gabriele Stoffel, postdoctoral fellow in the Erb department and first author of the study. \u201cThree amino acids \u2013 one asparagine, glutamate and a histidine \u2013 hold the CO2 in place from two sides. Another amino acid, a phenylalanine, shields the bound CO2 from water, which would inhibit the reaction\u201d, says Stoffel.\",\n    \"index\": 2\n  },\n  \"2\": {\n    \"quote\": \"Iria Bernhardsgr\u00fctter succeeded by exchanging amino acids to correct the \\\"miscasts\\\" in PCS. Immediately, the efficiency of CO2 increased to around 20 percent. Now the second aspect was targeted, namely shielding the binding site from water. Iria Bernhardsgr\u00fctter was also able to solve this problem: anothe

In [130]:
print(res[0], type(res[0]))

{
  "1": {
    "quote": "We were surprised to learn that only four amino acids are sufficient to provide control over the CO2 molecule”, explains Gabriele Stoffel, postdoctoral fellow in the Erb department and first author of the study. “Three amino acids – one asparagine, glutamate and a histidine – hold the CO2 in place from two sides. Another amino acid, a phenylalanine, shields the bound CO2 from water, which would inhibit the reaction”, says Stoffel.",
    "index": 2
  },
  "2": {
    "quote": "Iria Bernhardsgrütter succeeded by exchanging amino acids to correct the \"miscasts\" in PCS. Immediately, the efficiency of CO2 increased to around 20 percent. Now the second aspect was targeted, namely shielding the binding site from water. Iria Bernhardsgrütter was also able to solve this problem: another amino acid replacement blocked the water's access to the binding site. The combination of both changes led to a carboxylation rate of almost 95%. Similar experiments with AER increased 

In [132]:
res_json = json.loads(res[0])

In [123]:
print(type(res_json['3']['quote']))

<class 'str'>


In [133]:
res_json


{'1': {'quote': 'We were surprised to learn that only four amino acids are sufficient to provide control over the CO2 molecule”, explains Gabriele Stoffel, postdoctoral fellow in the Erb department and first author of the study. “Three amino acids – one asparagine, glutamate and a histidine – hold the CO2 in place from two sides. Another amino acid, a phenylalanine, shields the bound CO2 from water, which would inhibit the reaction”, says Stoffel.',
  'index': 2},
 '2': {'quote': 'Iria Bernhardsgrütter succeeded by exchanging amino acids to correct the "miscasts" in PCS. Immediately, the efficiency of CO2 increased to around 20 percent. Now the second aspect was targeted, namely shielding the binding site from water. Iria Bernhardsgrütter was also able to solve this problem: another amino acid replacement blocked the water\'s access to the binding site. The combination of both changes led to a carboxylation rate of almost 95%. Similar experiments with AER increased CO2-conversion effic

In [137]:
import os
# iterate over the dictionary
for key, value in res_json.items():
    # get the index and quote
    index = value['index']
    quote = value['quote']

    # create a file name with the format 'f_{index}.txt'
    file_name = f'autoscious_logs/kd_1/h_1/f_{index}.txt'

    # open the file in append mode ('a') to add the quote
    # if you want to overwrite the existing file (if any), use 'w' mode
    with open(file_name, 'a', encoding='utf-8') as f:
        f.write(quote.replace('/"', '"') + os.linesep)  # write the quote to the file and add a new line