### Import necessary libraries and functions

In [1]:
import json
import re
import os
from collections import defaultdict
from datetime import datetime
from llm import chat_openai

In [2]:
def sanitize_filename(filename, max_length=100):
    sanitized = re.sub(r'[^a-zA-Z0-9]', '_', filename)
    return sanitized[:max_length]

In [3]:
# create logs folder
logs_folder_path = f'autoscious_logs/'
if not os.path.exists(logs_folder_path):
    os.makedirs(logs_folder_path)

In [98]:
# Hard coded for now, can ask ChatGPT to just extract this from the first 10000 chars of a website because the abstract / approach will be there if there is one
approach_description = '''Carbon capture is one of the foremost methods for curtailing greenhouse gas emissions. Incumbent technologies are inherently inefficient due to thermal energy losses, large footprint, or degradation of sorbent material. We report a solid-state faradaic electro-swing reactive adsorption system comprising an electrochemical cell that exploits the reductive addition of CO2 to quinones for carbon capture. The reported device is compact and flexible, obviates the need for ancillary equipment, and eliminates the parasitic energy losses by using electrochemically activated redox carriers. An electrochemical cell with a polyanthraquinone–carbon nanotube composite negative electrode captures CO2 upon charging via the carboxylation of reduced quinones, and releases CO2 upon discharge. The cell architecture maximizes the surface area exposed to gas, allowing for ease of stacking of the cells in a parallel passage contactor bed. We demonstrate the capture of CO2 both in a sealed chamber and in an adsorption bed from inlet streams of CO2 concentrations as low as 0.6% (6000 ppm) and up to 10%, at a constant CO2 capacity with a faradaic efficiency of >90%, and a work of 40–90 kJ per mole of CO2 captured, with great durability of electrochemical cells showing <30% loss of capacity after 7000 cylces.'''

### [PoC Complete, hard-coded approach description] Extract key problems from the paper

In [None]:
# Assumption: you already have an approach description that you want to improve in a txt file
# Goal: based on that txt file description, try to extract out key problems via searching through the entire text or the web
# DRI: Kevin
# Progress: incomplete code

In [4]:
# Hard coded from now, taken from web search section below

search_query = "Solid-state faradaic electro-swing reactive adsorption limitations"
search_query_file_safe = sanitize_filename(search_query)

In [5]:
with open(f'autoscious_logs/Solid_state_faradaic_electro_swing_reactive_adsorption_limitations/sources/full_text/Faradaic_electro_swing_reactive_adsorption_for_CO_2_capture.txt', 'r') as f:
    initial_approach_paper_text = f.read()

In [18]:
import os

chunk_size = 40000 # How many characters to read per chunk in website text, 4 chars per 1 token. 10K tokens = 40K chars
overlap = 25 # How much overlap between chunks
MAX_TOKENS = 40000 # 10K -- Roughly 3K tokens, $0.10 per MAX_TOKENs, max tokens to read per key question -- 40K tokens is $0.40

In [8]:
# 2) create facts folder and subfolders
problems_folder_path = f'autoscious_logs/{search_query_file_safe}/problems'
if not os.path.exists(problems_folder_path):
    os.makedirs(problems_folder_path)

In [51]:
# Can be broken to prompts.py and util.py
import os



def extract_problems_from_text(approach_description, excerpt):
    extract_problems_from_excerpt_prompt = f'''
Ultimate goal: 
Propose a carbon capture system that is more efficient than SOTA in terms of GJ energy / ton of CO2 captured.

Task:
Based only on the "related information", list up to 10 potential limitations and problems with the approach. Do not include any general limitations and problems with the approach that is only found in the "approach description" section, but not in the "related information" section. Be clear, comprehensive, and concise. For each potential limitation and problem, cite quotes from related information that helped you extract it.

Approach description:
{approach_description}

Related information:
{excerpt}
'''
    # Ask GPT the prompt
    res = chat_openai(extract_problems_from_excerpt_prompt, model="gpt-3.5-turbo")
    print("\n\nExcerpt to extract from: \n", excerpt, "\n\nExtracted problems: \n", res[0])

    return res[0]

In [46]:
from rank_bm25 import BM25Okapi
import re

def chunk_text(text: str, key_question: str, chunk_size: int, overlap: int = 0) -> list[str]:
    """
    Splits a text into overlapping chunks and ranks them using BM25 based on relevance to a key question.
    
    Args:
    - text (str): The source text.
    - key_question (str): The key question to rank the chunks by.
    - chunk_size (int): The size of each chunk.
    - num_chunk_samples (int): The number of top-ranked chunks to return.
    - overlap (int): The size of the overlap between chunks. Default is 0.
    
    Returns:
    - list[str]: The top-ranked chunks based on BM25 scores.
    """

    # 1. Split text into overlapping chunks
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - overlap)]

    # # 2. Tokenize the chunks
    # tokenized_chunks = [re.findall(r"\w+", chunk) for chunk in chunks]

    # # 3. Initialize BM25
    # bm25 = BM25Okapi(tokenized_chunks)

    # # 4. Query BM25 with the key question
    # tokenized_question = re.findall(r"\w+", key_question)
    # scores = bm25.get_scores(tokenized_question)

    # # 5. Sort chunks by BM25 scores
    # sorted_chunks = [chunk for _, chunk in sorted(zip(scores, chunks), key=lambda pair: pair[0], reverse=True)]

    # # 6. Return top num_chunk_samples chunks

    # # Just read the first chunks and hope that the abstract is there
    return chunks

In [60]:
# update key problems list
import os

def update_problems_list(approach_description, excerpt_problems, problems_list):
    update_problems_list_prompt = f'''
Ultimate goal: 
Propose a carbon capture system that is more efficient than SOTA in terms of GJ energy / ton of CO2 captured.

Task:
Prioritize the entire list of problems based on biggest bottleneck to increasing the approach's carbon capture efficiency. Be comprehensive, specific, and concise.

Approach description:
{approach_description}

Below include all the problems to prioritize:
{problems_list}

{excerpt_problems}
'''
    # Ask GPT the prompt
    res = chat_openai(update_problems_list_prompt, model="gpt-3.5-turbo")
    print("\n\nPrompt: \n", update_problems_list_prompt, "\n\nUpdated list of problems: \n", res[0])

    return res[0]

In [11]:
# with open(f'autoscious_logs/{search_query_file_safe}/sources/keywords_query.txt', 'r') as f:
#     keywords_query = f.read().strip('"')
# keywords_query = "Propose a carbon capture system that is more efficient than SOTA in terms of GJ energy / ton of CO2 captured."
# search_query = "carbon capture"

# Dummy keywords_query, Probably not the best keyword, but really just trying to extract the most relevant information

In [61]:
# This method can partially extract the answer, but not the exact table passage which is critical NOR does it prioritize the list of facts

# # Loop through every file in the directory, just goes in order
# approach_findings_dict = {}
# for filename in os.listdir(full_text_folder_path):
#     curr_tokens = 0
#     if filename.endswith('.txt'):
#         file_path = os.path.join(full_text_folder_path, filename)
#         with open(file_path, 'r', encoding='utf-8') as file:
#             title = file.readline().strip()
#             url = file.readline().strip()
#             text = file.read()
        
#         # Break the text into chunk to extract information from
#         # chunks = chunk_text(text, keywords_query, chunk_size, overlap)

#         print("Looking at: ", title)


# Chunk the text into parts / DEBUGGING FOR NOW
chunks = chunk_text(initial_approach_paper_text, "", chunk_size, overlap)

curr_tokens = 0
extracted_problems_list = ""
for i, chunk in enumerate(chunks):
    print(f"Chunk: {i} / {len(chunks)}")
    extracted_problems = extract_problems_from_text(approach_description, chunk)

    if '1. ' in extracted_problems:
        extracted_problems = extracted_problems[extracted_problems.index('1. '):]

    print("\n\nexcept:\n", chunk, "\n\nextracted_problems\n", extracted_problems)

    # Save the approach description and main findings

    # TODO: Run another ChatCompletion to update the list of problems, to stay organized
    # Later: should you add references to the problems? Not for Proof of Concept
    extracted_problems_list = update_problems_list(approach_description, extracted_problems, extracted_problems_list)

    curr_tokens += len(chunk)

    # TODO: change limit, but currently unsure what limit is good based on time and typicaly paper size
    # if curr_tokens > MAX_TOKENS:
    #     print("Max tokens reached in chunks!")
    #     break

Chunk: 0 / 2


Excerpt to extract from: 
 Faradaic electro-swing reactive adsorption for CO 2 capture
https://pubs.rsc.org/en/content/articlehtml/2019/ee/c9ee02412c
ViewÂ PDFÂ VersionPreviousÂ ArticleNextÂ Article Open Access Article
This Open Access Article is licensed under a Creative Commons Attribution-Non Commercial 3.0 Unported Licence DOI:Â 10.1039/C9EE02412C
(Paper)
Energy Environ. Sci., 2019, 12, 3530-3547Faradaic electro-swing reactive adsorption for CO2 captureâ€ 
Sahag
Voskian
and
T. Alan
Hatton
*
Department of Chemical Engineering, Massachusetts Institute of Technology, Cambridge, MA 02139, USA. E-mail: tahatton@mit.edu
Received
28th July 2019
, Accepted 30th September 2019First published on 1st October 2019AbstractCarbon capture is one of the foremost methods for curtailing greenhouse gas emissions. Incumbent technologies are inherently inefficient due to thermal energy losses, large footprint, or degradation of sorbent material. We report a solid-state faradaic electro-s

In [62]:
print(extracted_problems_list)

Prioritized list of problems based on biggest bottleneck to increasing the approach's carbon capture efficiency:

1. Mass transfer limitations: The transport of CO2 into the porous electrode is the rate-limiting step in the carbon capture process. This can result in slower capture rates and broader breakthrough profiles.

2. Variation in channel width: Imperfect packing of the electrochemical cells in the flow device can lead to variation in channel width, causing uneven distribution of flow and overlapping breakthrough profiles.

3. Sluggish electron transfer: The CVs of the polyanthraquinone-carbon nanotube (PAQ-CNT) composite electrodes show sluggish electron transfer, which can affect the overall efficiency of the carbon capture system.

4. Electrode degradation: Over time, the electrochemical cells may experience degradation, resulting in a loss of capacity. This can be due to electromigration of shorter polymer chains from the electrode and into the electrolyte.

5. Energy requir

In [70]:
import re
# Use regex to match lines starting with numbers followed by a period
points = re.findall(r'(\d+\..+?)(?=\n\n\d+\.|\n\n$)', extracted_problems_list, re.DOTALL)

# Create a list to store each point
final_list = [point.strip() for point in points if point.strip()]

print(final_list)

['1. Mass transfer limitations: The transport of CO2 into the porous electrode is the rate-limiting step in the carbon capture process. This can result in slower capture rates and broader breakthrough profiles.', '2. Variation in channel width: Imperfect packing of the electrochemical cells in the flow device can lead to variation in channel width, causing uneven distribution of flow and overlapping breakthrough profiles.', '3. Sluggish electron transfer: The CVs of the polyanthraquinone-carbon nanotube (PAQ-CNT) composite electrodes show sluggish electron transfer, which can affect the overall efficiency of the carbon capture system.', '4. Electrode degradation: Over time, the electrochemical cells may experience degradation, resulting in a loss of capacity. This can be due to electromigration of shorter polymer chains from the electrode and into the electrolyte.', '5. Energy requirements: The energy required for the capture and release of CO2 can vary depending on the flux of CO2 and

In [72]:
# save approach main findings dict
with open(f'autoscious_logs/{sanitize_filename(search_query)}_extracted_problems_from_paper_by_excerpt_problems.json', 'w') as f:
    json.dump(final_list, f, indent=4)

# TODO: need to fix because the last problem gets cut off, but it should be fine for now

In [None]:
# Assumption: you have context that contains the objective, existing approach and problems
# Goal: based on the context, you want to know what the most important question to answer is
# DRI: Kevin
# Progress: this is an alternative strategy that is tangential to the current approach, but it does help us come up with a search query

In [3]:
# GPT template
prompt = f'''
Context:
{context}

Existing useful facts:
[]

Given the context and existing facts, what is the most sensitive and important question to answer and research facts for in order to overcome the first and most important limitation and achieve the objective?

The output should be in JSON format: 
```json
{{
  "key_question": "<insert key question>",
}}

Respond only with the output, with no explanation or conversation. I expect a first-rate answer.
'''
search_query = json.loads(chat_openai(prompt, model="gpt-3.5-turbo")[0])["key_question"]
print(search_query)

What are the most cost-effective and energy-efficient carbon capture technologies currently available?


In [69]:
search_query_file_safe = sanitize_filename(search_query[:50])
search_engine = "academic"

In [70]:
folder_path = f'autoscious_logs/{search_query_file_safe}'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [71]:
folder_path = f'autoscious_logs/{search_query_file_safe}/sources'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [72]:
def get_initial_search_queries_prompt(key_question, search_engine):
  return f'''
Key question:
{key_question}

Task:
For the key question, write a clear and comprehensive but short (1 query) list of search queries optimized for best search engine results, so that you can confidently and quickly surface the most relevant information to determine the best answer to the question. Extract a string of search keywords query from the key question.

The output should be in JSON format: 
```json
{{
  "1": "<insert query>",
  "keywords_query": "<insert keywords>"
}}

Respond only with the output, with no explanation or conversation.
'''

In [73]:
# for decomposition_idx, key_question_decomposition in enumerate(key_question_decomposition_list):
key_question_initial_search_queries = json.loads(chat_openai(get_initial_search_queries_prompt(search_query, search_engine), model="gpt-3.5-turbo")[0])

keywords_query = key_question_initial_search_queries.pop('keywords_query')

with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'w') as f:
    json.dump(key_question_initial_search_queries, f, indent=2)

with open(f'autoscious_logs/{search_query_file_safe}/sources/keywords_query.txt', 'w') as f:
    json.dump(keywords_query, f)

print("keywords_query: ", keywords_query)

In [9]:
with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'r') as f:
    key_question_initial_search_queries = json.loads(f.read())

with open(f'autoscious_logs/{search_query_file_safe}/sources/keywords_query.txt', 'r') as f:
    keywords_query = json.loads(f.read())

### [PoC, P2] Web search for additional key problems with approach, searching for and using search keywords

In [None]:
# Assumption: you have a search query that you want GPT to search for papers for online
# Goal: Based on the search query, you want to search for papers online
# DRI: Kevin
# Progress: This should work, although you need to be careful with the SERPAPI because there's only 100 calls per month. Ideally, you make one call and just play with those search results after the txt information is saved to files

In [None]:
# Hard code the search_query because gpt-4 >> gpt-3.5
# Hard code result from prompt:
'''
Ultimate goal: 
Propose a carbon capture system that is more efficient than SOTA in terms of GJ energy / ton of CO2 captured.

Approach description:
The improved approach would still employ a solid-state faradaic electro-swing reactive adsorption system, but it would incorporate a quinone electrode designed to exploit hydrogen-bonding and/or protonation as described in the research paper. By selecting quinones and conditions that are optimized for these interactions, it's possible to control reduction potentials and mechanisms. This could lead to lowered activation energies, and thus a reduction in the overall energy cost per ton of CO2 captured, aiming to beat the state-of-the-art efficiency metrics.

You want to search for research papers that may highlight the key problems and limitations with this approach. What search queries would be best?
'''

# OR to find problems above and below

'''
Ultimate goal: 
Propose a carbon capture system that is more efficient than SOTA in terms of GJ energy / ton of CO2 captured.

Task:
You want to search for research papers that may highlight the key problems and limitations with this approach. What search queries would be best? Rank from most likely to lead to search results that contain information about key problems and limitations.

Approach description:
Carbon capture is one of the foremost methods for curtailing greenhouse gas emissions. Incumbent technologies are inherently inefficient due to thermal energy losses, large footprint, or degradation of sorbent material. We report a solid-state faradaic electro-swing reactive adsorption system comprising an electrochemical cell that exploits the reductive addition of CO2 to quinones for carbon capture. The reported device is compact and flexible, obviates the need for ancillary equipment, and eliminates the parasitic energy losses by using electrochemically activated redox carriers. An electrochemical cell with a polyanthraquinone–carbon nanotube composite negative electrode captures CO2 upon charging via the carboxylation of reduced quinones, and releases CO2 upon discharge. The cell architecture maximizes the surface area exposed to gas, allowing for ease of stacking of the cells in a parallel passage contactor bed. We demonstrate the capture of CO2 both in a sealed chamber and in an adsorption bed from inlet streams of CO2 concentrations as low as 0.6% (6000 ppm) and up to 10%, at a constant CO2 capacity with a faradaic efficiency of >90%, and a work of 40–90 kJ per mole of CO2 captured, with great durability of electrochemical cells showing <30% loss of capacity after 7000 cylces.
'''

search_query = "Solid-state faradaic electro-swing reactive adsorption limitations"
search_query_file_safe = sanitize_filename(search_query)

In [None]:
# # SERP Google Scholar Code below
# # Only 100 credits per month, save or load in from json
# import requests

# api_key = '81bc4f7012f748724058db4c0d481c21646f2d36d191cd91e4a9fed77699242c'
# num_results = 20  # Set to the maximum allowed, which is 20

# url = f"https://serpapi.com/search?engine=google_scholar&q={search_query}&api_key={api_key}&num={num_results}"

# response = requests.get(url)
# g_scholar_serp_results = response.json()['organic_results']

In [None]:
# # save web search results
# with open(f'autoscious_logs/{sanitize_filename(search_query)}.json', 'w') as f:
#     json.dump(g_scholar_serp_results, f, indent=2)

# load web search results
with open(f'autoscious_logs/{search_query_file_safe}.json', 'r') as f:
    g_scholar_serp_results = json.loads(f.read())

In [None]:
# import os
# from dotenv import load_dotenv
# from googleapiclient.discovery import build
# import json
# from scholarly import scholarly
# from scholarly import ProxyGenerator

# load_dotenv()

# # Set up a ProxyGenerator object to use free proxies
# # This needs to be done only once per session
# pg = ProxyGenerator()
# pg.FreeProxies()
# scholarly.use_proxy(pg)

In [None]:
import requests
from PyPDF2 import PdfReader 
from io import BytesIO

def try_getting_pdf(url):
    try:
        response = requests.get(url, verify=True)
        f = BytesIO(response.content)
        pdf = PdfReader(f)
        return True
    except:
        print("Could not get pdf")
        return False

# Get the PDF content
def try_getting_pdf_content(url):
    try:
        print("trying url:", url)
        response = requests.get(url, verify=True)
        f = BytesIO(response.content)
        pdf = PdfReader(f)
        content = ""

        for i in range(len(pdf.pages)):
            page = pdf.pages[i]
            text = page.extract_text()
            content += text

        print("SUCCESSSS")
        return content
    except:
        print("Error getting PDF content")
        return ""

In [None]:
# Debug just for checking what g_scholar_serp_results variable looks like
# for res in g_scholar_serp_results:
#     if "resources" in res.keys() and res['resources'][0]['file_format'] == "PDF":
#         print("\n", res['position'], try_getting_pdf_content(res['resources'][0]['link'])[:100], res['title'])
#     else:
#         print("\n", res['position'], "no pdf", res['title'])

In [None]:
import time
def google_search_raw(search_term, cse_id, **kwargs):
    service = build("customsearch", "v1", developerKey=os.getenv('DEV_KEY'))
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()

    search_results = res.get("items", [])
    time.sleep(1)

    # Create a list of only the URLs from the search results
    search_results_links = [item["link"] for item in search_results]
    return search_results

def search_google(search_query):
    num_google_searches = 8
    results = google_search_raw(search_query, os.getenv('MY_CSE_ID'), num=num_google_searches, lr="lang_en", cr="countryUS")
    return results

In [None]:
# Just creating folder
os.makedirs(f'autoscious_logs/{search_query_file_safe}')
os.makedirs(f'autoscious_logs/{search_query_file_safe}/sources')
os.makedirs(f'autoscious_logs/{search_query_file_safe}/sources/full_text')

In [None]:
# SERP
from util import scrape_text_with_selenium_no_agent
MAX_RETRIES = 3

for res in g_scholar_serp_results:
    title = res["title"]
    url = res["link"]
    print("title: ", title, "url: ", url)

    if title and not os.path.exists(f'autoscious_logs/{search_query_file_safe}/sources/full_text/{sanitize_filename(title)}.txt'):
        if "resources" in res.keys() and res['resources'][0]['file_format'] == "PDF":
            print("TRYING PDF!")
            pdf_text = try_getting_pdf_content(res['resources'][0]['link'])
            if pdf_text:
                print("PDF WORKED!")
                # Record pdf text
                with open(str(f'autoscious_logs/{search_query_file_safe}/sources/full_text/{sanitize_filename(title)}.txt'), 'w', encoding='utf-8') as f:
                    f.write(title + '\n')
                    f.write(res['resources'][0]['link'] + '\n')
                    f.write(pdf_text)
                print("PDF WORKED! 2")
            else:
                print("PDF DIDN'T WORK, TRYING SELENIUM")
                text = scrape_text_with_selenium_no_agent(url, None, search_engine='chrome')

                # Record pdf text
                with open(str(f'autoscious_logs/{search_query_file_safe}/sources/full_text/{sanitize_filename(title)}.txt'), 'w', encoding='utf-8') as f:
                    f.write(title + '\n')
                    f.write(url + '\n')
                    f.write(text)
        else:
            print("TRYING SELENIUM")
            text = scrape_text_with_selenium_no_agent(url, None, search_engine='chrome')

            # Record pdf text
            with open(str(f'autoscious_logs/{search_query_file_safe}/sources/full_text/{sanitize_filename(title)}.txt'), 'w', encoding='utf-8') as f:
                f.write(title + '\n')
                f.write(url + '\n')
                f.write(text)

title:  Faradaic electro-swing reactive adsorption for CO 2 capture url:  https://pubs.rsc.org/en/content/articlehtml/2019/ee/c9ee02412c
title:  Carbon Capture Beyond Amines: CO2 Sorption at Nucleophilic Oxygen Sites in Materials url:  https://onlinelibrary.wiley.com/doi/abs/10.1002/cnma.202200436
title:  Mechanochemical Impregnation of a Redox-Active Guest into a Metal–Organic Framework for Electrochemical Capture of CO2 url:  https://pubs.acs.org/doi/abs/10.1021/acssuschemeng.3c00133
TRYING PDF!
trying url: https://www.researchgate.net/profile/Samuel-Wenger/publication/371262323_Mechanochemical_Impregnation_of_a_Redox-Active_Guest_into_a_Metal-Organic_Framework_for_Electrochemical_Capture_of_CO_2/links/647d43ddb3dfd73b77626ebe/Mechanochemical-Impregnation-of-a-Redox-Active-Guest-into-a-Metal-Organic-Framework-for-Electrochemical-Capture-of-CO-2.pdf
Error getting PDF content
PDF DIDN'T WORK, TRYING SELENIUM
Going through url:  https://pubs.acs.org/doi/abs/10.1021/acssuschemeng.3c00133

### [PoC, complete for single problem] Given approach's problems, come up with search keywords to potentially address each problem

In [None]:
# Focused on problem 7 for now because it's a specific example to latch to and perfect

In [96]:
# save approach main findings dict
with open(f'autoscious_logs/{sanitize_filename(search_query)}_extracted_problems_from_paper_by_excerpt_problems.json', 'r') as f:
    problem_list = json.loads(f.read())
print(problem_list)

['1. Mass transfer limitations: The transport of CO2 into the porous electrode is the rate-limiting step in the carbon capture process. This can result in slower capture rates and broader breakthrough profiles.', '2. Variation in channel width: Imperfect packing of the electrochemical cells in the flow device can lead to variation in channel width, causing uneven distribution of flow and overlapping breakthrough profiles.', '3. Sluggish electron transfer: The CVs of the polyanthraquinone-carbon nanotube (PAQ-CNT) composite electrodes show sluggish electron transfer, which can affect the overall efficiency of the carbon capture system.', '4. Electrode degradation: Over time, the electrochemical cells may experience degradation, resulting in a loss of capacity. This can be due to electromigration of shorter polymer chains from the electrode and into the electrolyte.', '5. Energy requirements: The energy required for the capture and release of CO2 can vary depending on the flux of CO2 and

In [97]:
problem_list[6]

'7. Sensitivity to oxygen: The presence of oxygen in the feed stream can cause oxidation of the reduced quinones, reducing the overall efficiency of the carbon capture system. Careful tuning of the electron density of the quinone polymer is required to avoid reactions with oxygen.'

In [126]:
# Later: itreate through each problem to get a search query for each
get_search_query_prompt = f'''Task:
You want to search for research papers that may address the given problem with this approach. What search query would be best?

Given problem: 
{problem_list[6]}

Respond only with the output, with no explanation or conversation.'''
print("prompt: ", get_search_query_prompt)
search_query_res = chat_openai(get_search_query_prompt)[0].strip('"')

prompt:  Task:
You want to search for research papers that may address the given problem with this approach. What search query would be best?

Given problem: 
7. Sensitivity to oxygen: The presence of oxygen in the feed stream can cause oxidation of the reduced quinones, reducing the overall efficiency of the carbon capture system. Careful tuning of the electron density of the quinone polymer is required to avoid reactions with oxygen.

Respond only with the output, with no explanation or conversation.


In [127]:
search_query_res

'carbon capture system oxygen sensitivity quinone polymer'

### [PoC complete] Web search given search keywords

In [None]:
# Assumption: you have a search query that you want GPT to search for papers for online
# Goal: Based on the search query, you want to search for papers online
# DRI: Kevin
# Progress: This should work, although you need to be careful with the SERPAPI because there's only 100 calls per month. Ideally, you make one call and just play with those search results after the txt information is saved to files

In [136]:
# Search query from section above
search_query = search_query_res
search_query_file_safe = sanitize_filename(search_query)

In [137]:
# SERP Google Scholar Code below
# Only 100 credits per month, save or load in from json
import requests
import os
g_scholar_file_path = f'autoscious_logs/{search_query_file_safe}.json'

if os.path.exists(g_scholar_file_path):
    # load web search results
    with open(g_scholar_file_path, 'r') as f:
        g_scholar_serp_results = json.loads(f.read())
else:
    api_key = '81bc4f7012f748724058db4c0d481c21646f2d36d191cd91e4a9fed77699242c'
    num_results = 20  # Set to the maximum allowed, which is 20

    url = f"https://serpapi.com/search?engine=google_scholar&q={search_query}&api_key={api_key}&num={num_results}"

    response = requests.get(url)
    g_scholar_serp_results = response.json()['organic_results']

    # save web search results
    with open(g_scholar_file_path, 'w') as f:
        json.dump(g_scholar_serp_results, f, indent=2)

In [12]:
# import os
# from dotenv import load_dotenv
# from googleapiclient.discovery import build
# import json
# from scholarly import scholarly
# from scholarly import ProxyGenerator

# load_dotenv()

# # Set up a ProxyGenerator object to use free proxies
# # This needs to be done only once per session
# pg = ProxyGenerator()
# pg.FreeProxies()
# scholarly.use_proxy(pg)

In [133]:
import requests
from PyPDF2 import PdfReader 
from io import BytesIO

def try_getting_pdf(url):
    try:
        response = requests.get(url, verify=True)
        f = BytesIO(response.content)
        pdf = PdfReader(f)
        return True
    except:
        print("Could not get pdf")
        return False

# Get the PDF content
def try_getting_pdf_content(url):
    try:
        print("trying url:", url)
        response = requests.get(url, verify=True)
        f = BytesIO(response.content)
        pdf = PdfReader(f)
        content = ""

        for i in range(len(pdf.pages)):
            page = pdf.pages[i]
            text = page.extract_text()
            content += text

        print("SUCCESSSS")
        return content
    except:
        print("Error getting PDF content")
        return ""

In [None]:
# Debug just for checking what g_scholar_serp_results variable looks like
# for res in g_scholar_serp_results:
#     if "resources" in res.keys() and res['resources'][0]['file_format'] == "PDF":
#         print("\n", res['position'], try_getting_pdf_content(res['resources'][0]['link'])[:100], res['title'])
#     else:
#         print("\n", res['position'], "no pdf", res['title'])

In [10]:
# import time
# def google_search_raw(search_term, cse_id, **kwargs):
#     service = build("customsearch", "v1", developerKey=os.getenv('DEV_KEY'))
#     res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()

#     search_results = res.get("items", [])
#     time.sleep(1)

#     # Create a list of only the URLs from the search results
#     search_results_links = [item["link"] for item in search_results]
#     return search_results

# def search_google(search_query):
#     num_google_searches = 8
#     results = google_search_raw(search_query, os.getenv('MY_CSE_ID'), num=num_google_searches, lr="lang_en", cr="countryUS")
#     return results

In [138]:
# Just creating folder
os.makedirs(f'autoscious_logs/{search_query_file_safe}')
os.makedirs(f'autoscious_logs/{search_query_file_safe}/sources')
os.makedirs(f'autoscious_logs/{search_query_file_safe}/sources/full_text')

In [139]:
# SERP
from util import scrape_text_with_selenium_no_agent
MAX_RETRIES = 3

for res in g_scholar_serp_results:
    title = res["title"]
    url = res["link"]
    print("title: ", title, "url: ", url)

    if title and not os.path.exists(f'autoscious_logs/{search_query_file_safe}/sources/full_text/{sanitize_filename(title)}.txt'):
        if "resources" in res.keys() and res['resources'][0]['file_format'] == "PDF":
            print("TRYING PDF!")
            pdf_text = try_getting_pdf_content(res['resources'][0]['link'])
            if pdf_text:
                print("PDF WORKED!")
                # Record pdf text
                with open(str(f'autoscious_logs/{search_query_file_safe}/sources/full_text/{sanitize_filename(title)}.txt'), 'w', encoding='utf-8') as f:
                    f.write(title + '\n')
                    f.write(res['resources'][0]['link'] + '\n')
                    f.write(pdf_text)
                print("PDF WORKED! 2")
            else:
                print("PDF DIDN'T WORK, TRYING SELENIUM")
                text = scrape_text_with_selenium_no_agent(url, None, search_engine='chrome')

                # Record pdf text
                with open(str(f'autoscious_logs/{search_query_file_safe}/sources/full_text/{sanitize_filename(title)}.txt'), 'w', encoding='utf-8') as f:
                    f.write(title + '\n')
                    f.write(url + '\n')
                    f.write(text)
        else:
            print("TRYING SELENIUM")
            text = scrape_text_with_selenium_no_agent(url, None, search_engine='chrome')

            # Record pdf text
            with open(str(f'autoscious_logs/{search_query_file_safe}/sources/full_text/{sanitize_filename(title)}.txt'), 'w', encoding='utf-8') as f:
                f.write(title + '\n')
                f.write(url + '\n')
                f.write(text)

title:  Electrochemically Mediated Direct CO2 Capture by a Stackable Bipolar Cell url:  https://chemistry-europe.onlinelibrary.wiley.com/doi/abs/10.1002/cssc.202102533
TRYING PDF!
trying url: https://chemistry-europe.onlinelibrary.wiley.com/doi/pdf/10.1002/cssc.202102533
Error getting PDF content
PDF DIDN'T WORK, TRYING SELENIUM
Going through url:  https://chemistry-europe.onlinelibrary.wiley.com/doi/abs/10.1002/cssc.202102533
select firefox options!
hard coding chrome
setting up chrome driver
Driver is getting url
set timeout!
Page loaded within 15 seconds
Driver got url
Driver has found page source
Handing off to Beautiful Soup!
done extractin
Text:  Opens in a new window Opens an external website Opens an external website in a new window
This website utilizes technologies such as cookies to enable essential site functionality, as well as for analytics, personalization, and targeted advertising purposes. To learn more, view the following link:
Privacy Policy
Skip to Article Content
S

In [None]:
# Assumption: you have search results from a search engine
# Goal: based on initial search result titles and snipppets, you want to filter out the most relevant search results
# DRI: Kevin
# Progress: this is a good approach for optimizing the amount of search results we scrape and read through with the LLM, but optimization happens after we have a feasible and workable complete approach

In [14]:
def get_filtering_web_results_ratings(key_question, web_search_res):
    return f'''
Key question:
{key_question}

Task:
Based on the key question and each search result's title and body content, reason and assign a predicted usefulness score of the search result's content and potential useful references to answering the key question using a 5-point Likert scale, with 1 being very not useful, 2 being not useful, 3 being somewhat useful, 4 being useful, 5 being very useful.

Search results:
{web_search_res}

The output should be in JSON format: 
```json
{{
  'href': 'relevance score',
  etc.
}}
```

Respond only with the output, with no explanation or conversation.
'''

In [15]:
from collections import defaultdict

with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'r') as f:
    key_question_initial_search_queries = json.load(f)

for query_idx, query in key_question_initial_search_queries.items():
    # load web search results
    with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_results_query_{query_idx}.json', 'r') as f:
        web_search_res = json.loads(f.read())
    
    filtered_web_results = {}
    if web_search_res != []:
        # filter web results based on title and body
        filtered_web_results = json.loads(chat_openai(get_filtering_web_results_ratings(search_query, web_search_res), model="gpt-3.5-turbo")[0])

    ratings_url_dict = defaultdict(list)
    for url, rating in filtered_web_results.items():
        ratings_url_dict[str(rating)].append(url)

    # save filtered search results
    with open(f'autoscious_logs/{search_query_file_safe}/sources/rated_web_results_query_{int(query_idx)}.json', 'w') as f:
        json.dump(ratings_url_dict, f, indent=2)

In [None]:
# Assumption: you have scraped text from the search results
# Goal: based on initial search result full text, you want to filter out the most relevant search results by taking the most relevant chunks of the full text and checking for how relevant they are to the search query
# DRI: Kevin
# Progress: this is a good approach for optimizing the amount of search results we scrape and read through with the LLM, but optimization happens after we have a feasible and workable complete approach

In [12]:
# COMPLETE code for predicting usefulness of very relevant (5) and relevant (4) results.
from util import scrape_text_with_selenium_no_agent

CHUNK_SIZE = 1000
SAMPLING_FACTOR = 0.1 # Also cap it so it falls under the max token limit
MAX_TOKENS = 2500 * 4 # 1 token = 4 chars, 2500 + 500 (prompt) tokens is high for GPT3.5
MAX_CHUNKS = int(MAX_TOKENS / CHUNK_SIZE)
# context = "Enoyl-CoA carboxylase/reductase enzymes (ECRs)"

In [17]:
def get_sample_chunks(text, CHUNK_SIZE, num_chunk_samples):
    step_size = len(text) // num_chunk_samples

    chunks = []
    for i in range(0, len(text), step_size):
        chunk = text[i:i+CHUNK_SIZE]
        chunks.append(chunk)

        # Break after getting the required number of chunks
        if len(chunks) >= num_chunk_samples:
            break

    return chunks

In [18]:
# Need to determine how useful the text is likely to be for answering the key questions
def get_predicted_usefulness_of_text_prompt(key_question, sample_text_chunks):
    return f'''
Key question:
{key_question}

Task: 
Based on the key question and the sample text chunks of the source text, the goal is to identify how useful reading the full source text would be to extract direct quoted facts or references to determine the best answer to the key question. 

Deliverable:
Assign a predicted usefulness score of the full source text using a 5-point Likert scale, with 1 being very unlikely to be usefulness, 2 being unlikely to be useful, 3 being somewhat likely to be useful, 4 being likely to be useful, and 5 being very likely useful and containing facts or references that answer the key question.

Sample text chunks from the source text:
{sample_text_chunks}

The output should be of the following JSON format
{{
    ""predicted_usefulness: <insert predicted usefulness rating>,
   etc.
}}


Respond only with the output, with no explanation or conversation.
'''

In [19]:
from rank_bm25 import BM25Okapi
import re

def get_most_relevant_chunks_with_bm25(key_question, text, CHUNK_SIZE, num_chunk_samples):
    # 1. Split text into chunks
    chunks = [text[i:i+CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]

    # 2. Tokenize the chunks
    tokenized_chunks = [re.findall(r"\w+", chunk) for chunk in chunks]

    # 3. Initialize BM25
    bm25 = BM25Okapi(tokenized_chunks)

    # 4. Query BM25 with the key question
    tokenized_question = re.findall(r"\w+", key_question)
    scores = bm25.get_scores(tokenized_question)

    # 5. Sort chunks by BM25 scores
    sorted_chunks = [chunk for _, chunk in sorted(zip(scores, chunks), key=lambda pair: pair[0], reverse=True)]

    # 6. Return top num_chunk_samples chunks
    return sorted_chunks[:num_chunk_samples]

In [20]:
def find_title(url, web_search_info):
    for item in web_search_info:
        if item["href"] == url:
            return item["title"]
    return None

def check_pdf(url, web_search_info):
    for item in web_search_info:
        if "pdf" in item.keys() and item["pdf"]:
            return True
    print("Not pdf")
    return False

In [29]:
folder_path = f'autoscious_logs/{search_query_file_safe}/sources/full_text'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Skimming through each highly relevant paper from skimming
with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'r') as f:
    key_question_initial_search_queries = json.load(f)

for query_idx, query in key_question_initial_search_queries.items():
    # open filtered search results
    with open(f'autoscious_logs/{search_query_file_safe}/sources/rated_web_results_query_{int(query_idx)}.json', 'r') as f:
        ratings_url_dict = json.loads(f.read())

    # open web search info to extract metadata
    with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_results_query_{int(query_idx)}.json', 'r') as f:
        web_search_info = json.load(f)
    
    for rating, urls in ratings_url_dict.items():
        if rating == '5' or rating == '4' or rating == '3': # Scraping all useful websites to skim through
            # Start with iterating through 4s and 5s of ratings_url_dict
            folder_path = f'autoscious_logs/{search_query_file_safe}/sources/predicted_usefulness_{rating}'
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)

            for rating_source_idx, url in enumerate(urls):
                print("query ", query_idx, "rating_source_idx", rating_source_idx, "Skimming url:", url)

                # Ensure the url hasn't already been visited
                title = find_title(url, web_search_info)
                if title and not os.path.exists(f'autoscious_logs/{search_query_file_safe}/sources/full_text/{sanitize_filename(title)}.txt') and not os.path.exists(f'{folder_path}/query_{query_idx}_url_index_{rating_source_idx}.json'):

                    # Check if it's a pdf or not
                    if try_getting_pdf(url):
                        print("PDF found!")
                        text = try_getting_pdf_content(url)
                    else:
                        text = scrape_text_with_selenium_no_agent(url, None, search_engine='firefox')

                    # Only evaluate websites you're able to scrape
                    if text and text != "No information found":
                        # Use full text if it's short
                        if len(text) > 10000:
                            total_chunks = len(text) / CHUNK_SIZE
                            num_chunk_samples = min(int(total_chunks * SAMPLING_FACTOR), MAX_CHUNKS)
                            # sample_chunks = get_sample_chunks(text, CHUNK_SIZE, num_chunk_samples)
                            sample_chunks = get_most_relevant_chunks_with_bm25(keywords_query, text, CHUNK_SIZE, num_chunk_samples) # Using BM25 to search for keywords instead of general query
                            print("len(sample_chunks)", len(sample_chunks))
                        else:
                            sample_chunks = [text]
                        print("len(text)", len(text), "sample chunks: ", sample_chunks)

                        # Get predicted usefulness based on sample chunks
                        predicted_usefulness_results = json.loads(chat_openai(get_predicted_usefulness_of_text_prompt(search_query, sample_chunks), model="gpt-3.5-turbo")[0])

                        # save filtered search results
                        with open(f'{folder_path}/query_{query_idx}_url_index_{rating_source_idx}.json', 'w') as f:
                            predicted_usefulness_results['title'] = title
                            predicted_usefulness_results['url'] = url
                            json.dump(predicted_usefulness_results, f, indent=2)
                        
                        # Check if any scores were (4 or) 5, because then we should save the full text
                        pred_usefulness = predicted_usefulness_results.values()

                        # TODO: perhaps make this more dynamic
                        if 5 in pred_usefulness or '5' in pred_usefulness or 4 in pred_usefulness or '4' in pred_usefulness:
                        # DEBUG: Just looking at the scraping results
                            with open(f'autoscious_logs/{search_query_file_safe}/sources/full_text/{sanitize_filename(title)}.txt', 'w', encoding='utf-8') as f:
                                f.write(title + '\n')
                                f.write(url + '\n')
                                f.write(text)
                else:
                    print("URL or text already visited!")

query  1 rating_source_idx 0 Skimming url: https://climate.mit.edu/ask-mit/how-efficient-carbon-capture-and-storage
Could not get pdf
Going through url:  https://climate.mit.edu/ask-mit/how-efficient-carbon-capture-and-storage
select firefox options!
Driver is getting url
set timeout!
Page loaded within 15 seconds
Driver got url
Driver has found page source
Handing off to Beautiful Soup!
done extractin
Text:  Skip to main content
Close
Main navigation
Climate 101
What We Know
What Can Be Done
Climate Primer
Explore
Explainers
Ask MIT Climate
Podcast
For Educators
MIT Action
News
Events
Resources
Search
Have a question?
Ask us!
Ask MIT ClimateHow efficient is carbon capture and storage?
Most carbon capture technologies aim to stop at least 90% of the CO2 in smokestacks from reaching the atmosphere. But as the technology approaches 100% efficiency, it gets more expensive and takes more energy to captur
query  1 rating_source_idx 1 Skimming url: https://sequestration.mit.edu/pdf/David_and

In [None]:
# Assumption: you have search results with each text scraped from a search engine
# Goal: based on scraped search results, you want to extract important facts from the text
# DRI: Kevin
# Progress: this only supports scraping from the first 10000 chars to get the abstract, but this might not be sufficient to get information like limitations. This is an open problem of how to smartly extract text without reading the whole thing. Alternatively, we can just read the whole thing and just cap at extremely high.

In [73]:
import os

chunk_size = 40000 # How many characters to read per chunk in website text
overlap = 25 # How much overlap between chunks
MAX_TOKENS = 80000 # 10K -- Roughly 3K tokens, $0.10 per MAX_TOKENs, max tokens to read per key question -- 40K tokens is $0.40

In [74]:
# 2) create facts folder and subfolders
facts_folder_path = f'autoscious_logs/{search_query_file_safe}/facts'
if not os.path.exists(facts_folder_path):
    os.makedirs(facts_folder_path)

In [12]:
# IMPORTANT: Need to go through BM25 of each paper and test if they contain key problems like this: https://chat.openai.com/share/83ae523e-6c10-424f-92c7-935b3f8e3b8a
import os

def extract_facts_from_website_text(search_query_file_safe, key_question, website_title, website_text, website_url):
    # 1. Extract designs
#     extract_designs_prompt = f'''
# Ultimate goal: 
# Propose a carbon capture system that is more efficient than SOTA in terms of GJ energy / ton of CO2 captured.

# Task:
# Extract all carbon capture system designs and their efficiency metrics from the paper text.

# The output should be in JSON format: 
# ```json
# {{
#   "<design idx>": "<insert system design and efficiency metrics description>",
#   etc.
# }}
# ```

# Paper text: 
# {website_text}

# Respond only with the output, with no explanation or conversation.
# '''
#     # Ask GPT the prompt
#     print("extract_designs_prompt", extract_designs_prompt)
#     res = chat_openai(extract_designs_prompt, model="gpt-3.5-turbo")
#     print("Extracted designs: ", res[0])
#     res_json = json.loads(res[0])

#     key_problems = []
#     for approach_description in res_json.values():
    approach_description = website_text
    identify_key_problem_five_whys_prompt = f'''
Ultimate goal: 
Propose a carbon capture system that is more efficient than SOTA in terms of GJ energy / ton of CO2 captured.

Task:
Based only on the "related information", list top potential limitations and problems with the approach. Do not include any general limitations and problems with the approach that is only found in the "approach description" section, but not in the "related information" section. Be clear and concise.

Approach description:
{approach_description}

Related information:
{website_text}
'''
    # Ask GPT the prompt
    print("identify_key_problem_five_whys_prompt", identify_key_problem_five_whys_prompt)
    res = chat_openai(identify_key_problem_five_whys_prompt, model="gpt-3.5-turbo")
    print("Extracted problems: ", res[0])

    # Save the quote to the corresponding key question index file
    extrated_problems = res[0]

    return (approach_description, extrated_problems)

In [7]:
from rank_bm25 import BM25Okapi
import re

def chunk_text(text: str, key_question: str, chunk_size: int, overlap: int = 0) -> list[str]:
    """
    Splits a text into overlapping chunks and ranks them using BM25 based on relevance to a key question.
    
    Args:
    - text (str): The source text.
    - key_question (str): The key question to rank the chunks by.
    - chunk_size (int): The size of each chunk.
    - num_chunk_samples (int): The number of top-ranked chunks to return.
    - overlap (int): The size of the overlap between chunks. Default is 0.
    
    Returns:
    - list[str]: The top-ranked chunks based on BM25 scores.
    """

    # 1. Split text into overlapping chunks
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - overlap)]

    # 2. Tokenize the chunks
    tokenized_chunks = [re.findall(r"\w+", chunk) for chunk in chunks]

    # 3. Initialize BM25
    bm25 = BM25Okapi(tokenized_chunks)

    # 4. Query BM25 with the key question
    tokenized_question = re.findall(r"\w+", key_question)
    scores = bm25.get_scores(tokenized_question)

    # 5. Sort chunks by BM25 scores
    sorted_chunks = [chunk for _, chunk in sorted(zip(scores, chunks), key=lambda pair: pair[0], reverse=True)]

    # 6. Return top num_chunk_samples chunks

    # Just read the first chunks and hope that the abstract is there
    return chunks

In [57]:
# with open(f'autoscious_logs/{search_query_file_safe}/sources/keywords_query.txt', 'r') as f:
#     keywords_query = f.read().strip('"')
# keywords_query = "Propose a carbon capture system that is more efficient than SOTA in terms of GJ energy / ton of CO2 captured."
# search_query = "carbon capture"

# Dummy keywords_query, Probably not the best keyword, but really just trying to extract the most relevant information

In [18]:
# Debug: testing what passages are extracted from BM25
file_path = r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\research_machine_v4_designer_evaluator\autoscious_logs\Solid_state_faradaic_electro_swing_reactive_adsorption_limitations\sources\full_text\Faradaic_electro_swing_reactive_adsorption_for_CO_2_capture.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    title = file.readline().strip()
    url = file.readline().strip()
    text = file.read()
filename = os.path.basename(file_path)

approach_findings_dict = {}
chunks = chunk_text(text, search_query, chunk_size, overlap)
for i, chunk in enumerate(chunks):
    print(f"Chunk: {i} / {len(chunks)}")
    print(chunk)

    approach_description, main_findings = extract_facts_from_website_text(search_query_file_safe, search_query, title, chunk, url)
    print("\n\nextracted_problems\n", main_findings)

    # Save the approach description and main findings
    approach_findings_dict[i] = main_findings
    
    print()
    print()

Chunk: 0 / 20
View PDF VersionPrevious ArticleNext Article Open Access Article
This Open Access Article is licensed under a Creative Commons Attribution-Non Commercial 3.0 Unported Licence DOI: 10.1039/C9EE02412C
(Paper)
Energy Environ. Sci., 2019, 12, 3530-3547Faradaic electro-swing reactive adsorption for CO2 capture†
Sahag
Voskian
and
T. Alan
Hatton
*
Department of Chemical Engineering, Massachusetts Institute of Technology, Cambridge, MA 02139, USA. E-mail: tahatton@mit.edu
Received
28th July 2019
, Accepted 30th September 2019First published on 1st October 2019AbstractCarbon capture is one of the foremost methods for curtailing greenhouse gas emissions. Incumbent technologies are inherently inefficient due to thermal energy losses, large footprint, or degradation of sorbent material. We report a solid-state faradaic electro-swing reactive adsorption system comprising an electrochemical cell that exploits the reductive addition of CO2 to quinones for carbon capture. The reported de

In [19]:
# save approach extracted problems dict
with open(f'autoscious_logs/{sanitize_filename(search_query)}_extracted_problems_by_file_name.json', 'w') as f:
    json.dump(approach_findings_dict, f, indent=2)

In [None]:
# This method can partially extract the answer, but not the exact table passage which is critical NOR does it prioritize the list of facts
# Go through each full text rated highly to extract facts from
full_text_folder_path = f'autoscious_logs/{search_query_file_safe}/sources/full_text'

# Loop through every file in the directory, just goes in order
approach_findings_dict = {}
for filename in os.listdir(full_text_folder_path):
    curr_tokens = 0
    if filename.endswith('.txt'):
        file_path = os.path.join(full_text_folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            title = file.readline().strip()
            url = file.readline().strip()
            text = file.read()
        
        # Break the text into chunk to extract information from
        # chunks = chunk_text(text, keywords_query, chunk_size, overlap)

        print("Looking at: ", title)

        # Max out GPT3.5 and hopefully the abstract is in there
        chunks = [text[:10000]]

        for i, chunk in enumerate(chunks):
            print(f"Chunk: {i} / {len(chunks)}")
            approach_description, main_findings = extract_facts_from_website_text(search_query_file_safe, search_query, title, chunk, url)

            print("\n\napproach_description\n", approach_description, "\n\nmain_findings\n", main_findings)

            # Save the approach description and main findings
            approach_findings_dict[filename] = (approach_description, main_findings)

            curr_tokens += len(chunk)
            if curr_tokens > MAX_TOKENS:
                print("Max tokens reached in chunks!")
                break

In [None]:
print(approach_findings_dict)

In [16]:
# save approach main findings dict
with open(f'autoscious_logs/{sanitize_filename(search_query)}_approaches_findings_by_file_name.json', 'w') as f:
    json.dump(approach_findings_dict, f, indent=2)

### [PoC complete] Extract main findings from each paper

In [140]:
import os

chunk_size = 40000 # How many characters to read per chunk in website text
overlap = 25 # How much overlap between chunks
MAX_TOKENS = 80000 # 10K -- Roughly 3K tokens, $0.10 per MAX_TOKENs, max tokens to read per key question -- 40K tokens is $0.40

In [141]:
# 2) create facts folder and subfolders
facts_folder_path = f'autoscious_logs/{search_query_file_safe}/facts'
if not os.path.exists(facts_folder_path):
    os.makedirs(facts_folder_path)

In [142]:
# Can be broken to prompts.py and util.py
import os

def extract_facts_from_website_text(search_query_file_safe, key_question, website_title, website_text, website_url):
    # 1. Extract designs
#     extract_designs_prompt = f'''
# Ultimate goal: 
# Propose a carbon capture system that is more efficient than SOTA in terms of GJ energy / ton of CO2 captured.

# Task:
# Extract all carbon capture system designs and their efficiency metrics from the paper text.

# The output should be in JSON format: 
# ```json
# {{
#   "<design idx>": "<insert system design and efficiency metrics description>",
#   etc.
# }}
# ```

# Paper text: 
# {website_text}

# Respond only with the output, with no explanation or conversation.
# '''
#     # Ask GPT the prompt
#     print("extract_designs_prompt", extract_designs_prompt)
#     res = chat_openai(extract_designs_prompt, model="gpt-3.5-turbo")
#     print("Extracted designs: ", res[0])
#     res_json = json.loads(res[0])

#     key_problems = []
#     for approach_description in res_json.values():
    approach_description = website_text
    identify_key_problem_five_whys_prompt = f'''
Ultimate goal: 
Propose a carbon capture system that is more efficient than SOTA in terms of GJ energy / ton of CO2 captured.

Approach description:
{approach_description}

Use the 5 whys until the answers are tautological to understand the approach's main findings, challenges, and anything of particular note.
'''
    # Ask GPT the prompt
    print("identify_key_problem_five_whys_prompt", identify_key_problem_five_whys_prompt)
    res = chat_openai(identify_key_problem_five_whys_prompt, model="gpt-3.5-turbo")

    # Save the quote to the corresponding key question index file
    five_why_res = res[0]
    print("\nFIVE WHYS TO UNDERSTAND AN APPROACH'S MAIN FINDINGS: \n", five_why_res)

    # Extract key insights regardless
    main_findings = chat_openai("Given this, what are the main findings, what were the challenges, anything of particular note? Only output the main findings, challenges, and noteworthy things, do not provide suggestions.", model="gpt-3.5-turbo", chat_history=[
        {
            "role": "assistant",
            "content": f"{five_why_res}"
        }
    ])[0]
        
    print("\nMAIN FINDINGS FROM PAPER: \n", main_findings)

    return (approach_description, main_findings)

In [153]:
# This method extracts out main findings to be seen if it solves key problems of a different paper later on
MAX_PAPERS_TO_EXTRACT_FROM = 3 # for debugging

# Go through each full text rated highly to extract facts from
full_text_folder_path = f'autoscious_logs/{search_query_file_safe}/sources/full_text'

# Loop through every file in the directory, just goes in order
approach_findings_dict = {}
count = 0
for filename in os.listdir(full_text_folder_path):
    curr_tokens = 0
    if filename.endswith('.txt'):
        file_path = os.path.join(full_text_folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            title = file.readline().strip()
            url = file.readline().strip()
            text = file.read()
        
        # Break the text into chunk to extract information from
        # chunks = chunk_text(text, keywords_query, chunk_size, overlap)

        print("Looking at: ", title)

        # Max out GPT3.5 and hopefully the abstract is in there
        chunks = [text[:10000]]

        for i, chunk in enumerate(chunks):
            print(f"Chunk: {i} / {len(chunks)}")
            approach_description, main_findings = extract_facts_from_website_text(search_query_file_safe, search_query, title, chunk, url)

            print("\n\napproach_description\n", approach_description, "\n\nmain_findings\n", main_findings)

            # Save the approach description and main findings
            approach_findings_dict[filename] = (approach_description, main_findings)

            curr_tokens += len(chunk)
            if curr_tokens > MAX_TOKENS:
                print("Max tokens reached in chunks!")
                break

    count += 1
    if count > MAX_PAPERS_TO_EXTRACT_FROM:
        break

Looking at:  Analysis of technologies for carbon dioxide capture from the air
Chunk: 0 / 1
identify_key_problem_five_whys_prompt 
Ultimate goal: 
Propose a carbon capture system that is more efficient than SOTA in terms of GJ energy / ton of CO2 captured.

Approach description:
Citation: Leonzio, G.; Fennell, P .S.;
Shah, N. Analysis of Technologies for
Carbon Dioxide Capture from the Air.
Appl. Sci. 2022 ,12, 8321. https://
doi.org/10.3390/app12168321
Academic Editor: Leonarda
Francesca Liotta
Received: 19 July 2022
Accepted: 16 August 2022
Published: 19 August 2022
Publisher’s Note: MDPI stays neutral
with regard to jurisdictional claims in
published maps and institutional afﬁl-
iations.
Copyright: © 2022 by the authors.
Licensee MDPI, Basel, Switzerland.
This article is an open access article
distributed under the terms and
conditions of the Creative Commons
Attribution (CC BY) license (https://
creativecommons.org/licenses/by/
4.0/).
applied  
sciences 
Review
Analysis of T echnolo

In [154]:
print(approach_findings_dict)

{'Analysis_of_technologies_for_carbon_dioxide_capture_from_the_air.txt': ('Citation: Leonzio, G.; Fennell, P .S.;\nShah, N. Analysis of Technologies for\nCarbon Dioxide Capture from the Air.\nAppl. Sci. 2022 ,12, 8321. https://\ndoi.org/10.3390/app12168321\nAcademic Editor: Leonarda\nFrancesca Liotta\nReceived: 19 July 2022\nAccepted: 16 August 2022\nPublished: 19 August 2022\nPublisher’s Note: MDPI stays neutral\nwith regard to jurisdictional claims in\npublished maps and institutional afﬁl-\niations.\nCopyright: © 2022 by the authors.\nLicensee MDPI, Basel, Switzerland.\nThis article is an open access article\ndistributed under the terms and\nconditions of the Creative Commons\nAttribution (CC BY) license (https://\ncreativecommons.org/licenses/by/\n4.0/).\napplied  \nsciences \nReview\nAnalysis of T echnologies for Carbon Dioxide Capture from the Air\nGrazia Leonzio *\n , Paul S. Fennell and Nilay Shah\nDepartment of Chemical Engineering, Imperial College London, South Kensington, L

In [155]:
# save approach main findings dict
with open(f'autoscious_logs/{sanitize_filename(search_query)}_approaches_findings_by_file_name.json', 'w') as f:
    json.dump(approach_findings_dict, f, indent=2)

### [PoC complete, only for hard coded problems instead of per problem, need GPT4 for accurate matching though] Check for if key problem has been solved by existing ideas

In [None]:
# Assumption: you have scraped text and main findings from the search results
# Goal: based on scraped text and main findings, you want to see if a research paper's approach solves the key problem of the initial approach
# DRI: Kevin
# Progress: this system works, but the prompts may be altered to be better.

In [156]:
with open(f'autoscious_logs/{sanitize_filename(search_query)}_approaches_findings_by_file_name.json', 'r') as f:
    approach_findings_dict = json.loads(f.read())

In [157]:
print(approach_findings_dict['AOxygen_Stable_Electrochemical_CO2_Capture_and_Concentration_with_Quinones_Using_Alcohol_Additives.txt'][1])

Main Findings:
- The researchers developed a molecular redox carrier-based electrochemically driven CO2 capture and concentration (eCCC) system.
- The addition of ethanol as an alcohol additive improved the stability and performance of the system.
- The proposed eCCC system using TCQ and ethanol showed a minimum energy requirement of 21 kJ/mol to concentrate CO2 from 10 to 100%.
- The efficiency of the eCCC system was found to be twice as efficient as state-of-the-art thermal amine capture systems and other reported redox carrier-based systems.

Challenges:
- Most reported electrochemically driven CCC systems are not practical for use due to their sensitivity to O2, which limits stability and effectiveness.

Noteworthy:
- The use of alcohol additives to tailor the redox properties of quinone/alcohol combinations is a general approach that can be applied to other CO2-capture applications.
- The proposed eCCC system offers the potential for higher theoretical efficiencies compared to cur

In [158]:
# Assume that the approach has already been analyzed for their key problem, and that we only need to input the main findings into prompt_3
# 8/24/23 Approach 6 section has this documented

approach_paper_ratings = {}
count = 0
for key, value in approach_findings_dict.items():
    # if count >= 10:
    #     break
    main_findings = value[1]

    print(count, "\nEvaluating paper: ", key, "\nmain findings: ", main_findings)

    prompt_1 = '''
Ultimate goal: 
Propose a carbon capture system that is more efficient than SOTA in terms of GJ energy / ton of CO2 captured.

Approach description:
We report a solid-state faradaic electro-swing reactive adsorption system comprising an electrochemical cell that exploits the reductive addition of CO2 to quinones for carbon capture. The reported device is compact and flexible, obviates the need for ancillary equipment, and eliminates the parasitic energy losses by using electrochemically activated redox carriers. An electrochemical cell with a polyanthraquinone–carbon nanotube composite negative electrode captures CO2 upon charging via the carboxylation of reduced quinones, and releases CO2 upon discharge. The cell architecture maximizes the surface area exposed to gas, allowing for ease of stacking of the cells in a parallel passage contactor bed. We demonstrate the capture of CO2 both in a sealed chamber and in an adsorption bed from inlet streams of CO2 concentrations as low as 0.6% (6000 ppm) and up to 10%, at a constant CO2 capacity with a faradaic efficiency of >90%, and a work of 40–90 kJ per mole of CO2 captured, with great durability of electrochemical cells showing <30 percent loss of capacity after 7000 cycles.

Use the 5 whys until the answers are tautological to understand why the approach is not more efficient. Think step by step.
    '''

    response_1 = '''
Your proposed carbon capture system using a solid-state faradaic electro-swing reactive adsorption system is intriguing. You've cited numerous advantages, including high Faradaic efficiency, low energy consumption, and operational durability. To investigate why the system might not be more efficient, let's apply the "5 Whys" methodology.

Why is the energy required 40-90 kJ per mole of CO2 captured?

The energy cost likely comes from the electrochemical reactions needed to transform quinones for capturing and releasing CO2.

Why do electrochemical reactions require this much energy?

Reductive addition of CO2 to quinones involves changes in the electronic states of the molecules, which inherently require energy to facilitate these transformations.

Why do changes in electronic states require energy?

The energy is needed to overcome activation barriers for the electrochemical reactions, including the work needed to align the molecules in a way that allows the reaction to proceed.

Why are there activation barriers to overcome?

Activation barriers are inherent in chemical reactions due to the need to disrupt existing bonds and form new ones, which requires an initial energy input.

Why is energy required to disrupt existing bonds and form new ones?

This is tautological: Bonds have a certain energy profile, and to alter this profile, energy must be added or released. Changing bonds alters the potential energy landscape of the molecules involved.

If you're looking for ways to improve efficiency:

Reducing Activation Energy: Investigate catalysts or more reactive forms of quinones that could lower the activation energy.

Optimizing Cell Architecture: Ensure that the cell architecture allows for maximum surface area exposure while minimizing resistive losses.

Efficiency in Electrical Circuit: Ensure that the electrical connections and current collectors are designed to minimize resistive losses, which would add to the energy costs.

Operational Parameters: Temperature, pressure, and flow rate could be optimized to facilitate reactions and minimize energy use.

Material Longevity: While you've stated less than 30 percent loss of capacity after 7000 cycles, further material research could improve this, effectively spreading the energy cost over more cycles.

By addressing these points, you might be able to create an even more efficient system.
    '''

    prompt_2 = '''
Given this, what is the key problem needed to increase this approach's efficiency and beat SOTA? Only output the key problem, do not provide suggestions.
    '''

    response_2 = '''
The key problem to increase this approach's efficiency is the high energy requirement of 40-90 kJ per mole of CO2 captured, primarily due to the activation energy barriers in the electrochemical reactions involving quinones.
    '''

    prompt_3 = f'''
A research paper's main findings:
{main_findings}

Given this, reiterate the approach, key problem, analyze the extent to which the approach's key problem is solved by the research paper's main findings, and then rate on a 5-point Likert scale with 1 = approach's key problem is definitely not, 2 = approach's key problem is likely not, 3 = approach's key problem might be, 4 = approach's key problem is probably, and 5 = approach's key problem is definitely solved by the research paper's main findings. Think step by step.
    '''

    # see if the key problem has been solved
    response_3 = chat_openai(prompt_3, model="gpt-4", chat_history=[
        {
            "role": "user",
            "content": prompt_1
        },
        {
            "role": "assistant",
            "content": response_1
        },
        {
            "role": "user",
            "content": prompt_2
        },
        {
            "role": "assistant",
            "content": response_2
        }
    ])[0]
    print("RESPONSE: ", response_3)

    approach_paper_ratings[key] = response_3

    count += 1

0 
Evaluating paper:  Analysis_of_technologies_for_carbon_dioxide_capture_from_the_air.txt 
main findings:  Main Findings:
1. Direct air capture (DAC) technologies, particularly absorption and adsorption, have been extensively studied and show promise for CO2 capture from the atmosphere.
2. DAC offers advantages such as independence from CO2 source location, versatility for long-term storage or short-term utilization, and potential to address emissions from small sources.
3. Thermodynamic challenges, including the low concentration of CO2 in the atmosphere, result in high energy requirements and costs for DAC systems.
4. Reducing costs and improving efficiency are crucial for large-scale DAC deployment to make the technology economically viable and scalable.

Challenges:
1. High energy requirements and associated costs due to the dilute nature of CO2 in the atmosphere.
2. Technological advancements needed to enhance the efficiency and cost-effectiveness of DAC systems.
3. Scaling up DA

RESPONSE:  The approach mentioned in the research paper is Direct Air Capture (DAC) technologies, particularly absorption and adsorption, for CO2 capture from the atmosphere. The key problem identified in the paper is the high energy requirements and associated costs due to the dilute nature of CO2 in the atmosphere.

Analyzing the extent to which the research paper's main findings address the key problem:

1. The main findings acknowledge the high energy requirements and costs associated with DAC systems due to the low concentration of CO2 in the atmosphere. (Partial acknowledgement)

2. The paper highlights the need for technological advancements to enhance the efficiency and cost-effectiveness of DAC systems. (Acknowledgement)

3. The research paper emphasizes the importance of reducing costs and improving efficiency for large-scale DAC deployment to make the technology economically viable and scalable. (Acknowledgement)

Based on this analysis, I would rate the extent to which the 

In [159]:
# save approach main findings dict
with open(f'autoscious_logs/{sanitize_filename(search_query)}_approaches_paper_ratings_by_file_name.json', 'w') as f:
    json.dump(approach_paper_ratings, f, indent=2)

#### [Skip] Rerank facts.txt file based on relevancy

In [None]:
# Assumption: you have a txt file of fact
# Goal: based on the facts, you want to prioritize the facts from most relevant to least
# DRI: Kevin
# Progress: this is an old strategy, but could be relevant for prioritizing problems and approaches by using a propmt

In [37]:
file_name = f'autoscious_logs/{search_query_file_safe}/facts/facts.txt'

with open(file_name, 'r', encoding='utf-8') as f:
    facts_list = f.read()

In [47]:
def get_rerank_facts_list_prompt(facts_list):
  return f'''
Context:
{context}

Key question: 
{search_query}

Task:
Rerank the following facts based on how well they answer the key question. The more useful, specific, and correct, the better. The best answer should be at the top, and the worst answer should be at the bottom of the list. Use direct quotes and do not change the wording of the facts. Leave out facts that are not relevant to the key question.

Current facts list:
{facts_list}

The output should be a JSON list of facts:
```json
['<insert fact>', etc.]
```

Respond only with the output, with no explanation or conversation. I expect a first-rate answer.
'''
reranked_facts_list = json.loads(chat_openai(get_rerank_facts_list_prompt(facts_list), model="gpt-3.5-turbo")[0])
print(reranked_facts_list)

['The most cost-effective and energy-efficient carbon capture technologies currently available include direct air capture (DAC) and carbon capture and storage (CCS) technologies.', 'The most cost-effective and energy-efficient carbon capture technologies currently available include amine-based absorption, membrane separation, and cryogenic separation.', 'The most cost-effective and energy-efficient carbon capture technologies currently available are post-combustion capture and pre-combustion capture.', 'The most cost-effective and energy-efficient carbon capture technologies currently available are MEA scrubbing of flue gas in Pulverized Coal (PC) power plants and Natural Gas Combined Cycle (NGCC) power plants.', 'The cost of carbon capture varies depending on the source of the carbon captured, the distance to the storage site, and the nature of the storage site itself. However, the Carbon Capture & Storage Association (CCSA) estimated that earlier CCS projects in the power sector woul

In [48]:
file_name = f'autoscious_logs/{search_query_file_safe}/facts/facts_reranked.txt'

with open(file_name, 'w', encoding='utf-8') as f:
    for answer in reranked_facts_list:
        f.write(answer + os.linesep)

In [49]:
with open( f'autoscious_logs/{search_query_file_safe}/facts/facts_reranked.txt', 'r', encoding='utf-8') as f:
    reranked_facts_list = f.read()

In [50]:
print(reranked_facts_list)

The most cost-effective and energy-efficient carbon capture technologies currently available include direct air capture (DAC) and carbon capture and storage (CCS) technologies.

The most cost-effective and energy-efficient carbon capture technologies currently available include amine-based absorption, membrane separation, and cryogenic separation.

The most cost-effective and energy-efficient carbon capture technologies currently available are post-combustion capture and pre-combustion capture.

The most cost-effective and energy-efficient carbon capture technologies currently available are MEA scrubbing of flue gas in Pulverized Coal (PC) power plants and Natural Gas Combined Cycle (NGCC) power plants.

The cost of carbon capture varies depending on the source of the carbon captured, the distance to the storage site, and the nature of the storage site itself. However, the Carbon Capture & Storage Association (CCSA) estimated that earlier CCS projects in the power sector would cost bet

In [None]:
# Assumption: you have prioritized facts
# Goal: based on the prioritized facts, use a LLM to evaluate what the best answer to the research question is
# DRI: Kevin
# Progress: this is old when I was trying to extract answers to a research question, but can be adapted to checking how much a key problem is solved by an approach

In [52]:
file_name = f'autoscious_logs/{search_query_file_safe}/facts/facts_reranked.txt'

with open(file_name, 'r', encoding='utf-8') as f:
    facts_list_reranked = f.read()

In [53]:
# split the string into a list of sentences by '\n\n'
sentences = facts_list_reranked.split('\n\n')

# create a dictionary where the key is the index + 1 (to start at 1) and the value is the sentence
facts_dict = {i+1: sentence for i, sentence in enumerate(sentences) if sentence.strip()}

print(facts_dict)

{1: 'The most cost-effective and energy-efficient carbon capture technologies currently available include direct air capture (DAC) and carbon capture and storage (CCS) technologies.', 2: 'The most cost-effective and energy-efficient carbon capture technologies currently available include amine-based absorption, membrane separation, and cryogenic separation.', 3: 'The most cost-effective and energy-efficient carbon capture technologies currently available are post-combustion capture and pre-combustion capture.', 4: 'The most cost-effective and energy-efficient carbon capture technologies currently available are MEA scrubbing of flue gas in Pulverized Coal (PC) power plants and Natural Gas Combined Cycle (NGCC) power plants.', 5: 'The cost of carbon capture varies depending on the source of the carbon captured, the distance to the storage site, and the nature of the storage site itself. However, the Carbon Capture & Storage Association (CCSA) estimated that earlier CCS projects in the po

In [62]:
# Improvement: it should cite direct quote facts, so that the direct quote and url can be retrieved in the answer.

prompt = f'''
Context:
{context}

Key question: 
{search_query}

Task:
You are an evaluator. Given the key question, context, and facts, what is the best answer to the key question? Outline both the proposition of why it is a first-rate answer, and the opposition of what the answer's biggest weaknesses might be to make it not a first-rate answer. Make sure you cite facts in your answers, and keep in mind the higher level objective in the context. Then, give a recommendation of the next most sensitive and important question to answer and research facts for in order to overcome the first and most important limitation and better achieve the objective?

The output should be in JSON format: 
```json
{{
  "best_answer": "<insert best answer based on facts to key question with fact citations>",
  "opposition":  "<insert reasons why the best answer might not be a first-rate answer to the key question>",
  "proposition": "<insert reasons why the best answer might be a first-rate answer to the key question>",
  "conclusion": "<insert yes or no if the best answer is a first-rate answer to the key question, and then your reasoning>",
  "next_question": "<insert next question recommendation or leave empty if we have already overcome with limitation>"
}}
```
Use a double square bracket around the index of the fact to cite it. For example, [[1]] means fact 1.

Search queries tried:
[{search_query}]

Most relevant facts (ordered from most to least relevant):
{facts_dict}

Respond only with the output, with no explanation or conversation.
'''
verifier_feedback = json.loads(chat_openai(prompt, model="gpt-3.5-turbo")[0])
print(verifier_feedback)

{'best_answer': 'The most cost-effective and energy-efficient carbon capture technologies currently available include direct air capture (DAC) and carbon capture and storage (CCS) technologies [[1]]. These technologies have the potential to capture large amounts of CO2 and can be deployed in various locations, making them scalable solutions. Additionally, amine-based absorption, membrane separation, and cryogenic separation are considered to be cost-effective and energy-efficient carbon capture technologies [[2]].', 'opposition': 'The biggest weakness of DAC and CCS technologies is their high cost per metric ton of CO2 captured [[5]]. The energy consumption of these technologies is also significant, requiring a substantial amount of electricity [[2]].', 'proposition': 'The most cost-effective and energy-efficient carbon capture technologies currently available are direct air capture (DAC) and carbon capture and storage (CCS) technologies. These technologies have the potential to captur

In [63]:
with open(f'autoscious_logs/{search_query_file_safe}/facts/facts.json', 'r', encoding='utf-8') as f:
    facts_json = json.load(f)

In [64]:
import re
import string

def replace_references(feedback, facts_dict, facts_json):
    def replacer(match):
        index = int(match.group(1))
        fact = facts_dict[index]
        
        # Find the corresponding URL for the fact in facts_json
        fact_stripped = fact.strip(string.punctuation)
        quote_url = None
        for key in facts_json:
            if fact_stripped in key:
                quote_url = facts_json[key]
                break
            
        if quote_url:
            return f'[{quote_url}]'
        else:
            return f'[]'

    feedback['best_answer'] = re.sub(r'\[\[(\d+)\]\]', replacer, feedback['best_answer'])
    feedback['proposition'] = re.sub(r'\[\[(\d+)\]\]', replacer, feedback['proposition'])
    feedback['opposition'] = re.sub(r'\[\[(\d+)\]\]', replacer, feedback['opposition'])
    feedback['conclusion'] = re.sub(r'\[\[(\d+)\]\]', replacer, feedback['conclusion'])

    return feedback

verifier_feedback_updated = replace_references(verifier_feedback, facts_dict, facts_json)
print(verifier_feedback_updated)


{'best_answer': 'The most cost-effective and energy-efficient carbon capture technologies currently available include direct air capture (DAC) and carbon capture and storage (CCS) technologies [On Monday, carbon capture firm Climeworks added another plant to this list, opening its third direct air capture (DAC) plant in Troia, Italy.[https://www.power-technology.com/features/carbon-capture-cost/]]. These technologies have the potential to capture large amounts of CO2 and can be deployed in various locations, making them scalable solutions. Additionally, amine-based absorption, membrane separation, and cryogenic separation are considered to be cost-effective and energy-efficient carbon capture technologies [Amine-based absorption, membrane separation, and cryogenic separation are among the most cost-effective and energy-efficient carbon capture technologies currently available.[https://www.power-technology.com/features/carbon-capture-cost/]].', 'opposition': 'The biggest weakness of DAC

In [65]:
file_name = f'autoscious_logs/{search_query_file_safe}/facts/evaluation.json'

with open(file_name, 'w', encoding='utf-8') as f:
    json.dump(verifier_feedback_updated, f, indent=4)

In [67]:
verifier_feedback_updated['next_question']

'What are the ongoing research and development efforts to improve the cost-effectiveness and energy efficiency of carbon capture technologies?'

In [68]:
search_query = verifier_feedback_updated['next_question']