In [10]:
# key question, potentially add metrics?
context = '''Objective: 
Carbon capture 1 Gt of CO2 per year ASAP

Existing approach and problems:
{'approach': 'Direct Air Capture (DAC) with Carbon Storage', 'proposition': 'DAC with carbon storage is an ideal approach as it directly captures CO2 from the atmosphere and stores it in a secure manner. It has the potential to capture large amounts of CO2 and can be deployed in various locations, making it a scalable solution.', 'opposition': 'The biggest weakness of DAC with carbon storage is its high cost per metric ton of CO2 captured. The energy consumption is also significant, requiring a substantial amount of electricity. Additionally, the storage capacity of certain methods may be limited, requiring frequent transportation and storage infrastructure.', 'limitations': {'1': {'limitation': 'High cost per metric ton of CO2 captured', 'metric': 'Cost per metric ton of CO2 captured'}, '2': {'limitation': 'Significant energy consumption', 'metric': 'Energy consumption'}, '3': {'limitation': 'Limited storage capacity', 'metric': 'Storage capacity'}}}

First principles:
Efficiency: The ideal approach should focus on improving the efficiency of the carbon capture process to reduce the cost per metric ton of CO2 captured. This can be achieved by optimizing the design and operation of the capture technology, minimizing energy requirements, and maximizing the utilization of captured CO2.
'''

In [11]:
import json
from util import sanitize_filename, get_predicted_usefulness_of_text_prompt, web_search_ddg
import os
from collections import defaultdict
from llm import chat_openai
from datetime import datetime

### Decompose objective, existing approach, and first principle needed to be addressed into key questions

In [3]:
# GPT template
prompt = f'''
Context:
{context}

Existing useful facts:
[]

Given the context and existing facts, what is the most sensitive and important question to answer and research facts for in order to overcome the first and most important limitation and achieve the objective?

The output should be in JSON format: 
```json
{{
  "key_question": "<insert key question>",
}}

Respond only with the output, with no explanation or conversation. I expect a first-rate answer.
'''
search_query = json.loads(chat_openai(prompt, model="gpt-3.5-turbo")[0])["key_question"]
print(search_query)

What are the most cost-effective and energy-efficient carbon capture technologies currently available?


In [69]:
search_query_file_safe = sanitize_filename(search_query[:50])
search_engine = "academic"

In [70]:
folder_path = f'autoscious_logs/{search_query_file_safe}'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [71]:
folder_path = f'autoscious_logs/{search_query_file_safe}/sources'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [72]:
def get_initial_search_queries_prompt(key_question, search_engine):
  return f'''
Key question:
{key_question}

Task:
For the key question, write a clear and comprehensive but short (1 query) list of search queries optimized for best search engine results, so that you can confidently and quickly surface the most relevant information to determine the best answer to the question. Extract a string of search keywords query from the key question.

The output should be in JSON format: 
```json
{{
  "1": "<insert query>",
  "keywords_query": "<insert keywords>"
}}

Respond only with the output, with no explanation or conversation.
'''

In [73]:
# for decomposition_idx, key_question_decomposition in enumerate(key_question_decomposition_list):
key_question_initial_search_queries = json.loads(chat_openai(get_initial_search_queries_prompt(search_query, search_engine), model="gpt-3.5-turbo")[0])

keywords_query = key_question_initial_search_queries.pop('keywords_query')

with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'w') as f:
    json.dump(key_question_initial_search_queries, f, indent=2)

with open(f'autoscious_logs/{search_query_file_safe}/sources/keywords_query.txt', 'w') as f:
    json.dump(keywords_query, f)

print("keywords_query: ", keywords_query)

In [9]:
with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'r') as f:
    key_question_initial_search_queries = json.loads(f.read())

with open(f'autoscious_logs/{search_query_file_safe}/sources/keywords_query.txt', 'r') as f:
    keywords_query = json.loads(f.read())

### Web search given search keywords

In [None]:
# SERP Google Scholar Code below
# # Only 100 credits per month, save or load in from json
# import requests

# api_key = '81bc4f7012f748724058db4c0d481c21646f2d36d191cd91e4a9fed77699242c'
# query = 'carbon capture'
# num_results = 20  # Set to the maximum allowed, which is 20

# url = f"https://serpapi.com/search?engine=google_scholar&q={query}&api_key={api_key}&num={num_results}"

# response = requests.get(url)
# g_scholar_serp_results = response.json()['organic_results']

In [None]:
# # save web search results
# with open(f'autoscious_logs/{sanitize_filename(query)}.json', 'w') as f:
#     json.dump(g_scholar_serp_results, f, indent=2)

In [52]:
search_query_file_safe = sanitize_filename('carbon capture')

In [12]:
import os
from dotenv import load_dotenv
from googleapiclient.discovery import build
import json
from scholarly import scholarly
from scholarly import ProxyGenerator

load_dotenv()

# Set up a ProxyGenerator object to use free proxies
# This needs to be done only once per session
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg)

In [50]:
import requests
from PyPDF2 import PdfReader 
from io import BytesIO

def try_getting_pdf(url):
    try:
        response = requests.get(url, verify=True)
        f = BytesIO(response.content)
        pdf = PdfReader(f)
        return True
    except:
        print("Could not get pdf")
        return False

# Get the PDF content
def try_getting_pdf_content(url):
    try:
        response = requests.get(url, verify=True)
        f = BytesIO(response.content)
        pdf = PdfReader(f)
        content = ""

        for i in range(len(pdf.pages)):
            page = pdf.pages[i]
            text = page.extract_text()
            content += text

        print("SUCCCESSS")
        return content[:100]
    except:
        print("Error getting PDF content")
        return ""

In [51]:
for res in response.json()['organic_results']:
    if "resources" in res.keys() and res['resources'][0]['file_format'] == "PDF":
        print("\n", res['position'], try_getting_pdf_content(res['resources'][0]['link']), res['title'])
    else:
        print("\n", res['position'], "no pdf", res['title'])


 0 no pdf Progress in carbon capture technologies
Error getting PDF content

 1  The outlook for improved carbon capture technology

 2 no pdf Carbon capture and utilization update

 3 no pdf Carbon capture
SUCCCESSS

 4 1 
 Outlook of Carbon Capture Technology and Challenges  1 
Tabbi Wilberforce1, A. Baroutaji2*, Bass Outlook of carbon capture technology and challenges
SUCCCESSS

 5 Contents  Page
Making new-build power plants  
CO2 capture ready 2
A new report says power plants ca Carbon capture and storage
Error getting PDF content

 6  Carbon capture and storage update
SUCCCESSS

 7 HIGHLIGHTS
The newsletter is compiled by the 
National Energy Technology Lab -
oratory to provide in Carbon capture

 8 no pdf Carbon capture and sequestration
Error getting PDF content

 9  Carbon capture and storage: how green can black be?

 10 no pdf Carbon capture

 11 no pdf Carbon capture and storage
SUCCCESSS

 12 Feature 
April  1, 2001 / Volume 35 , Issue 7 / pp. 148 A – 153 A
Copyright © 20

In [14]:
import time
def google_search_raw(search_term, cse_id, **kwargs):
    service = build("customsearch", "v1", developerKey=os.getenv('DEV_KEY'))
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()

    search_results = res.get("items", [])
    time.sleep(1)

    # Create a list of only the URLs from the search results
    search_results_links = [item["link"] for item in search_results]
    return search_results

def search_google(search_query):
    num_google_searches = 8
    results = google_search_raw(search_query, os.getenv('MY_CSE_ID'), num=num_google_searches, lr="lang_en", cr="countryUS")
    return results

In [74]:
MAX_RETRIES = 3

# for decomposition_idx, key_question_decomposition in enumerate(key_question_decomposition_list):
with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'r') as f:
    key_question_initial_search_queries = json.load(f)

for idx, query in key_question_initial_search_queries.items():
    print("query: ", query)
    # query = "ECR enzyme efficiency in k setae" # Hard coded to get the results I want

    web_search_res = []
    if search_engine == "academic":
        print("trying academic search")
        try:
            scholar_res_gen = scholarly.search_pubs(query)

            for res in scholar_res_gen:
                item = {}
                item['title'] = res['bib']['title']
                if try_getting_pdf(res['eprint_url']):
                    item['href'] = res['eprint_url']
                    item['pdf'] = True
                else:
                    item['href'] = res['pub_url']
                    item['pdf'] = False
                item['body'] = res['bib']['abstract']
                web_search_res += [item]
                print("Adding academic paper: ", item)
        except: 
            print("Exception, trying normal search")
    if web_search_res == []:
        # DDG
        print("trying normal search")
        web_search_res = json.loads(web_search_ddg(query))
        if len(web_search_res) == 0:
            print("trying google search!")
            # Google
            web_search_res_raw = search_google(query) # google uses 'link' instead of 'href'
            web_search_res = [{
                'title': web_search_res_raw[i]['title'], 
                'href': web_search_res_raw[i]['link'], 
                'body': web_search_res_raw[i]['snippet'],
                'pdf': False
                } for i in range(len(web_search_res_raw))
            ]

    # save web search results
    with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_results_query_{idx}.json', 'w') as f:
        json.dump(web_search_res, f, indent=2)

query:  ongoing research and development efforts to improve cost-effectiveness and energy efficiency of carbon capture technologies
trying academic search


In [None]:
# SERP
MAX_RETRIES = 3

# for decomposition_idx, key_question_decomposition in enumerate(key_question_decomposition_list):
with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'r') as f:
    key_question_initial_search_queries = json.load(f)

for idx, query in key_question_initial_search_queries.items():
    print("query: ", query)
    # query = "ECR enzyme efficiency in k setae" # Hard coded to get the results I want

    web_search_res = []
    if search_engine == "academic":
        print("trying academic search")
        try:
            scholar_res_gen = scholarly.search_pubs(query)

            for res in scholar_res_gen:
                item = {}
                item['title'] = res['bib']['title']
                if try_getting_pdf(res['eprint_url']):
                    item['href'] = res['eprint_url']
                    item['pdf'] = True
                else:
                    item['href'] = res['pub_url']
                    item['pdf'] = False
                item['body'] = res['bib']['abstract']
                web_search_res += [item]
                print("Adding academic paper: ", item)
        except: 
            print("Exception, trying normal search")
    if web_search_res == []:
        # DDG
        print("trying normal search")
        web_search_res = json.loads(web_search_ddg(query))
        if len(web_search_res) == 0:
            print("trying google search!")
            # Google
            web_search_res_raw = search_google(query) # google uses 'link' instead of 'href'
            web_search_res = [{
                'title': web_search_res_raw[i]['title'], 
                'href': web_search_res_raw[i]['link'], 
                'body': web_search_res_raw[i]['snippet'],
                'pdf': False
                } for i in range(len(web_search_res_raw))
            ]

    # save web search results
    with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_results_query_{idx}.json', 'w') as f:
        json.dump(web_search_res, f, indent=2)

### Reading type 1: filtering unlikely relevant sources based on title and body

In [14]:
def get_filtering_web_results_ratings(key_question, web_search_res):
    return f'''
Key question:
{key_question}

Task:
Based on the key question and each search result's title and body content, reason and assign a predicted usefulness score of the search result's content and potential useful references to answering the key question using a 5-point Likert scale, with 1 being very not useful, 2 being not useful, 3 being somewhat useful, 4 being useful, 5 being very useful.

Search results:
{web_search_res}

The output should be in JSON format: 
```json
{{
  'href': 'relevance score',
  etc.
}}
```

Respond only with the output, with no explanation or conversation.
'''

In [15]:
from collections import defaultdict

with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'r') as f:
    key_question_initial_search_queries = json.load(f)

for query_idx, query in key_question_initial_search_queries.items():
    # load web search results
    with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_results_query_{query_idx}.json', 'r') as f:
        web_search_res = json.loads(f.read())
    
    filtered_web_results = {}
    if web_search_res != []:
        # filter web results based on title and body
        filtered_web_results = json.loads(chat_openai(get_filtering_web_results_ratings(search_query, web_search_res), model="gpt-3.5-turbo")[0])

    ratings_url_dict = defaultdict(list)
    for url, rating in filtered_web_results.items():
        ratings_url_dict[str(rating)].append(url)

    # save filtered search results
    with open(f'autoscious_logs/{search_query_file_safe}/sources/rated_web_results_query_{int(query_idx)}.json', 'w') as f:
        json.dump(ratings_url_dict, f, indent=2)

### Reading type 2 & 3

In [16]:
# COMPLETE code for predicting usefulness of very relevant (5) and relevant (4) results.
from util import scrape_text_with_selenium_no_agent

CHUNK_SIZE = 1000
SAMPLING_FACTOR = 0.1 # Also cap it so it falls under the max token limit
MAX_TOKENS = 2500 * 4 # 1 token = 4 chars, 2500 + 500 (prompt) tokens is high for GPT3.5
MAX_CHUNKS = int(MAX_TOKENS / CHUNK_SIZE)
# context = "Enoyl-CoA carboxylase/reductase enzymes (ECRs)"

In [17]:
def get_sample_chunks(text, CHUNK_SIZE, num_chunk_samples):
    step_size = len(text) // num_chunk_samples

    chunks = []
    for i in range(0, len(text), step_size):
        chunk = text[i:i+CHUNK_SIZE]
        chunks.append(chunk)

        # Break after getting the required number of chunks
        if len(chunks) >= num_chunk_samples:
            break

    return chunks

In [18]:
# Need to determine how useful the text is likely to be for answering the key questions
def get_predicted_usefulness_of_text_prompt(key_question, sample_text_chunks):
    return f'''
Key question:
{key_question}

Task: 
Based on the key question and the sample text chunks of the source text, the goal is to identify how useful reading the full source text would be to extract direct quoted facts or references to determine the best answer to the key question. 

Deliverable:
Assign a predicted usefulness score of the full source text using a 5-point Likert scale, with 1 being very unlikely to be usefulness, 2 being unlikely to be useful, 3 being somewhat likely to be useful, 4 being likely to be useful, and 5 being very likely useful and containing facts or references that answer the key question.

Sample text chunks from the source text:
{sample_text_chunks}

The output should be of the following JSON format
{{
    ""predicted_usefulness: <insert predicted usefulness rating>,
   etc.
}}


Respond only with the output, with no explanation or conversation.
'''

In [19]:
from rank_bm25 import BM25Okapi
import re

def get_most_relevant_chunks_with_bm25(key_question, text, CHUNK_SIZE, num_chunk_samples):
    # 1. Split text into chunks
    chunks = [text[i:i+CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]

    # 2. Tokenize the chunks
    tokenized_chunks = [re.findall(r"\w+", chunk) for chunk in chunks]

    # 3. Initialize BM25
    bm25 = BM25Okapi(tokenized_chunks)

    # 4. Query BM25 with the key question
    tokenized_question = re.findall(r"\w+", key_question)
    scores = bm25.get_scores(tokenized_question)

    # 5. Sort chunks by BM25 scores
    sorted_chunks = [chunk for _, chunk in sorted(zip(scores, chunks), key=lambda pair: pair[0], reverse=True)]

    # 6. Return top num_chunk_samples chunks
    return sorted_chunks[:num_chunk_samples]

In [20]:
def find_title(url, web_search_info):
    for item in web_search_info:
        if item["href"] == url:
            return item["title"]
    return None

def check_pdf(url, web_search_info):
    for item in web_search_info:
        if "pdf" in item.keys() and item["pdf"]:
            return True
    print("Not pdf")
    return False

In [29]:
folder_path = f'autoscious_logs/{search_query_file_safe}/sources/full_text'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Skimming through each highly relevant paper from skimming
with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'r') as f:
    key_question_initial_search_queries = json.load(f)

for query_idx, query in key_question_initial_search_queries.items():
    # open filtered search results
    with open(f'autoscious_logs/{search_query_file_safe}/sources/rated_web_results_query_{int(query_idx)}.json', 'r') as f:
        ratings_url_dict = json.loads(f.read())

    # open web search info to extract metadata
    with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_results_query_{int(query_idx)}.json', 'r') as f:
        web_search_info = json.load(f)
    
    for rating, urls in ratings_url_dict.items():
        if rating == '5' or rating == '4' or rating == '3': # Scraping all useful websites to skim through
            # Start with iterating through 4s and 5s of ratings_url_dict
            folder_path = f'autoscious_logs/{search_query_file_safe}/sources/predicted_usefulness_{rating}'
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)

            for rating_source_idx, url in enumerate(urls):
                print("query ", query_idx, "rating_source_idx", rating_source_idx, "Skimming url:", url)

                # Ensure the url hasn't already been visited
                title = find_title(url, web_search_info)
                if title and not os.path.exists(f'autoscious_logs/{search_query_file_safe}/sources/full_text/{sanitize_filename(title)}.txt') and not os.path.exists(f'{folder_path}/query_{query_idx}_url_index_{rating_source_idx}.json'):

                    # Check if it's a pdf or not
                    if try_getting_pdf(url):
                        print("PDF found!")
                        text = try_getting_pdf_content(url)
                    else:
                        text = scrape_text_with_selenium_no_agent(url, None, search_engine='firefox')

                    # Only evaluate websites you're able to scrape
                    if text and text != "No information found":
                        # Use full text if it's short
                        if len(text) > 10000:
                            total_chunks = len(text) / CHUNK_SIZE
                            num_chunk_samples = min(int(total_chunks * SAMPLING_FACTOR), MAX_CHUNKS)
                            # sample_chunks = get_sample_chunks(text, CHUNK_SIZE, num_chunk_samples)
                            sample_chunks = get_most_relevant_chunks_with_bm25(keywords_query, text, CHUNK_SIZE, num_chunk_samples) # Using BM25 to search for keywords instead of general query
                            print("len(sample_chunks)", len(sample_chunks))
                        else:
                            sample_chunks = [text]
                        print("len(text)", len(text), "sample chunks: ", sample_chunks)

                        # Get predicted usefulness based on sample chunks
                        predicted_usefulness_results = json.loads(chat_openai(get_predicted_usefulness_of_text_prompt(search_query, sample_chunks), model="gpt-3.5-turbo")[0])

                        # save filtered search results
                        with open(f'{folder_path}/query_{query_idx}_url_index_{rating_source_idx}.json', 'w') as f:
                            predicted_usefulness_results['title'] = title
                            predicted_usefulness_results['url'] = url
                            json.dump(predicted_usefulness_results, f, indent=2)
                        
                        # Check if any scores were (4 or) 5, because then we should save the full text
                        pred_usefulness = predicted_usefulness_results.values()

                        # TODO: perhaps make this more dynamic
                        if 5 in pred_usefulness or '5' in pred_usefulness or 4 in pred_usefulness or '4' in pred_usefulness:
                        # DEBUG: Just looking at the scraping results
                            with open(f'autoscious_logs/{search_query_file_safe}/sources/full_text/{sanitize_filename(title)}.txt', 'w', encoding='utf-8') as f:
                                f.write(title + '\n')
                                f.write(url + '\n')
                                f.write(text)
                else:
                    print("URL or text already visited!")

query  1 rating_source_idx 0 Skimming url: https://climate.mit.edu/ask-mit/how-efficient-carbon-capture-and-storage
Could not get pdf
Going through url:  https://climate.mit.edu/ask-mit/how-efficient-carbon-capture-and-storage
select firefox options!
Driver is getting url
set timeout!
Page loaded within 15 seconds
Driver got url
Driver has found page source
Handing off to Beautiful Soup!
done extractin
Text:  Skip to main content
Close
Main navigation
Climate 101
What We Know
What Can Be Done
Climate Primer
Explore
Explainers
Ask MIT Climate
Podcast
For Educators
MIT Action
News
Events
Resources
Search
Have a question?
Ask us!
Ask MIT ClimateHow efficient is carbon capture and storage?
Most carbon capture technologies aim to stop at least 90% of the CO2 in smokestacks from reaching the atmosphere. But as the technology approaches 100% efficiency, it gets more expensive and takes more energy to captur
query  1 rating_source_idx 1 Skimming url: https://sequestration.mit.edu/pdf/David_and

### Extract facts

In [30]:
import os

chunk_size = 4000 # How many characters to read per chunk in website text
overlap = 25 # How much overlap between chunks
MAX_TOKENS = 40000 # 10K -- Roughly 3K tokens, $0.10 per MAX_TOKENs, max tokens to read per key question -- 40K tokens is $0.40

In [31]:
# 2) create facts folder and subfolders
facts_folder_path = f'autoscious_logs/{search_query_file_safe}/facts'
if not os.path.exists(facts_folder_path):
    os.makedirs(facts_folder_path)

In [32]:
# Can be broken to prompts.py and util.py
import os

def extract_facts_from_website_text(search_query_file_safe, key_question, website_title, website_text, website_url):
    seed_initial_question_decomposition_prompt = f'''
Key question: 
{key_question}

Task:
What's the best answer based on the source text? Give me as specific and correct of an answer as possible. Then, quote the section of the source text that supports your answer. 

The output should be in JSON format: 
```json
{{
  "best_answer": "<insert best answer>",
  "quote": "<insert quote>",
}}
```

Source text: 
{website_text}

Respond only with the output, with no explanation or conversation.
'''
    # Ask GPT the prompt
    print("seed_initial_question_decomposition_prompt", seed_initial_question_decomposition_prompt)
    res = chat_openai(seed_initial_question_decomposition_prompt, model="gpt-3.5-turbo")
    print("Extracted quotes: ", res[0])

    # Save the quote to the corresponding key question index file
    res_json = json.loads(res[0])
    # for key, value in res_json.items():
    answer = res_json.get("best_answer", "")
    quote = res_json.get("quote", "")

    # Only log if there is a quote
    if quote and type(quote) == str:
        file_name = f'autoscious_logs/{search_query_file_safe}/facts/facts.txt'

        with open(file_name, 'a', encoding='utf-8') as f:
            f.write(answer + os.linesep)

        # Save the best answer and quote into a JSON for reference retrieval later
        json_file_name = f'autoscious_logs/{search_query_file_safe}/facts/facts.json'
        if os.path.exists(json_file_name):
            with open(json_file_name, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
        else:
            data = {}

        # Update the dictionary and save it back
        data[answer] = quote.replace('/"', '"') + f"[{website_url}]"
        
        with open(json_file_name, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=4)

    return

In [33]:
from rank_bm25 import BM25Okapi
import re

def chunk_text(text: str, key_question: str, chunk_size: int, overlap: int = 0) -> list[str]:
    """
    Splits a text into overlapping chunks and ranks them using BM25 based on relevance to a key question.
    
    Args:
    - text (str): The source text.
    - key_question (str): The key question to rank the chunks by.
    - chunk_size (int): The size of each chunk.
    - num_chunk_samples (int): The number of top-ranked chunks to return.
    - overlap (int): The size of the overlap between chunks. Default is 0.
    
    Returns:
    - list[str]: The top-ranked chunks based on BM25 scores.
    """

    # 1. Split text into overlapping chunks
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - overlap)]

    # 2. Tokenize the chunks
    tokenized_chunks = [re.findall(r"\w+", chunk) for chunk in chunks]

    # 3. Initialize BM25
    bm25 = BM25Okapi(tokenized_chunks)

    # 4. Query BM25 with the key question
    tokenized_question = re.findall(r"\w+", key_question)
    scores = bm25.get_scores(tokenized_question)

    # 5. Sort chunks by BM25 scores
    sorted_chunks = [chunk for _, chunk in sorted(zip(scores, chunks), key=lambda pair: pair[0], reverse=True)]

    # 6. Return top num_chunk_samples chunks
    return sorted_chunks


In [34]:
with open(f'autoscious_logs/{search_query_file_safe}/sources/keywords_query.txt', 'r') as f:
    keywords_query = f.read().strip('"')

In [35]:
# This method can partially extract the answer, but not the exact table passage which is critical NOR does it prioritize the list of facts
# Go through each full text rated highly to extract facts from
full_text_folder_path = f'autoscious_logs/{search_query_file_safe}/sources/full_text'

# Loop through every file in the directory, just goes in order
for filename in os.listdir(full_text_folder_path):
    curr_tokens = 0
    if filename.endswith('.txt'):
        file_path = os.path.join(full_text_folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            title = file.readline().strip()
            url = file.readline().strip()
            text = file.read()
        
        # Break the text into chunk to extract information from
        chunks = chunk_text(text, keywords_query, chunk_size, overlap)
        for i, chunk in enumerate(chunks):
            print(f"Chunk: {i} / {len(chunks)}")
            extract_facts_from_website_text(search_query_file_safe, search_query, title, chunk, url)

            curr_tokens += len(chunk)
            if curr_tokens > MAX_TOKENS:
                print("Max tokens reached in chunks!")
                break

Chunk: 0 / 6
seed_initial_question_decomposition_prompt 
Key question: 
What are the most cost-effective and energy-efficient carbon capture technologies currently available?

Task:
What's the best answer based on the source text? Give me as specific and correct of an answer as possible. Then, quote the section of the source text that supports your answer. 

The output should be in JSON format: 
```json
{
  "best_answer": "<insert best answer>",
  "quote": "<insert quote>",
}
```

Source text: 

By downloading this case study, you acknowledge that GlobalData may share your information with GlobalData and that your personal data will be used as described in their Privacy Policy Submit
Visit our Privacy Policy for more information about our services, how GlobalData may use, process and share your personal data, including information on your rights in respect of your personal data and how you can unsubscribe from future marketing communications.
Our services are intended for corporate sub

#### Rerank facts.txt file based on relevancy

In [37]:
file_name = f'autoscious_logs/{search_query_file_safe}/facts/facts.txt'

with open(file_name, 'r', encoding='utf-8') as f:
    facts_list = f.read()

In [47]:
def get_rerank_facts_list_prompt(facts_list):
  return f'''
Context:
{context}

Key question: 
{search_query}

Task:
Rerank the following facts based on how well they answer the key question. The more useful, specific, and correct, the better. The best answer should be at the top, and the worst answer should be at the bottom of the list. Use direct quotes and do not change the wording of the facts. Leave out facts that are not relevant to the key question.

Current facts list:
{facts_list}

The output should be a JSON list of facts:
```json
['<insert fact>', etc.]
```

Respond only with the output, with no explanation or conversation. I expect a first-rate answer.
'''
reranked_facts_list = json.loads(chat_openai(get_rerank_facts_list_prompt(facts_list), model="gpt-3.5-turbo")[0])
print(reranked_facts_list)

['The most cost-effective and energy-efficient carbon capture technologies currently available include direct air capture (DAC) and carbon capture and storage (CCS) technologies.', 'The most cost-effective and energy-efficient carbon capture technologies currently available include amine-based absorption, membrane separation, and cryogenic separation.', 'The most cost-effective and energy-efficient carbon capture technologies currently available are post-combustion capture and pre-combustion capture.', 'The most cost-effective and energy-efficient carbon capture technologies currently available are MEA scrubbing of flue gas in Pulverized Coal (PC) power plants and Natural Gas Combined Cycle (NGCC) power plants.', 'The cost of carbon capture varies depending on the source of the carbon captured, the distance to the storage site, and the nature of the storage site itself. However, the Carbon Capture & Storage Association (CCSA) estimated that earlier CCS projects in the power sector woul

In [48]:
file_name = f'autoscious_logs/{search_query_file_safe}/facts/facts_reranked.txt'

with open(file_name, 'w', encoding='utf-8') as f:
    for answer in reranked_facts_list:
        f.write(answer + os.linesep)

In [49]:
with open( f'autoscious_logs/{search_query_file_safe}/facts/facts_reranked.txt', 'r', encoding='utf-8') as f:
    reranked_facts_list = f.read()

In [50]:
print(reranked_facts_list)

The most cost-effective and energy-efficient carbon capture technologies currently available include direct air capture (DAC) and carbon capture and storage (CCS) technologies.

The most cost-effective and energy-efficient carbon capture technologies currently available include amine-based absorption, membrane separation, and cryogenic separation.

The most cost-effective and energy-efficient carbon capture technologies currently available are post-combustion capture and pre-combustion capture.

The most cost-effective and energy-efficient carbon capture technologies currently available are MEA scrubbing of flue gas in Pulverized Coal (PC) power plants and Natural Gas Combined Cycle (NGCC) power plants.

The cost of carbon capture varies depending on the source of the carbon captured, the distance to the storage site, and the nature of the storage site itself. However, the Carbon Capture & Storage Association (CCSA) estimated that earlier CCS projects in the power sector would cost bet

### Evaluator / verifier

In [52]:
file_name = f'autoscious_logs/{search_query_file_safe}/facts/facts_reranked.txt'

with open(file_name, 'r', encoding='utf-8') as f:
    facts_list_reranked = f.read()

In [53]:
# split the string into a list of sentences by '\n\n'
sentences = facts_list_reranked.split('\n\n')

# create a dictionary where the key is the index + 1 (to start at 1) and the value is the sentence
facts_dict = {i+1: sentence for i, sentence in enumerate(sentences) if sentence.strip()}

print(facts_dict)

{1: 'The most cost-effective and energy-efficient carbon capture technologies currently available include direct air capture (DAC) and carbon capture and storage (CCS) technologies.', 2: 'The most cost-effective and energy-efficient carbon capture technologies currently available include amine-based absorption, membrane separation, and cryogenic separation.', 3: 'The most cost-effective and energy-efficient carbon capture technologies currently available are post-combustion capture and pre-combustion capture.', 4: 'The most cost-effective and energy-efficient carbon capture technologies currently available are MEA scrubbing of flue gas in Pulverized Coal (PC) power plants and Natural Gas Combined Cycle (NGCC) power plants.', 5: 'The cost of carbon capture varies depending on the source of the carbon captured, the distance to the storage site, and the nature of the storage site itself. However, the Carbon Capture & Storage Association (CCSA) estimated that earlier CCS projects in the po

In [62]:
# Improvement: it should cite direct quote facts, so that the direct quote and url can be retrieved in the answer.

prompt = f'''
Context:
{context}

Key question: 
{search_query}

Task:
You are an evaluator. Given the key question, context, and facts, what is the best answer to the key question? Outline both the proposition of why it is a first-rate answer, and the opposition of what the answer's biggest weaknesses might be to make it not a first-rate answer. Make sure you cite facts in your answers, and keep in mind the higher level objective in the context. Then, give a recommendation of the next most sensitive and important question to answer and research facts for in order to overcome the first and most important limitation and better achieve the objective?

The output should be in JSON format: 
```json
{{
  "best_answer": "<insert best answer based on facts to key question with fact citations>",
  "opposition":  "<insert reasons why the best answer might not be a first-rate answer to the key question>",
  "proposition": "<insert reasons why the best answer might be a first-rate answer to the key question>",
  "conclusion": "<insert yes or no if the best answer is a first-rate answer to the key question, and then your reasoning>",
  "next_question": "<insert next question recommendation or leave empty if we have already overcome with limitation>"
}}
```
Use a double square bracket around the index of the fact to cite it. For example, [[1]] means fact 1.

Search queries tried:
[{search_query}]

Most relevant facts (ordered from most to least relevant):
{facts_dict}

Respond only with the output, with no explanation or conversation.
'''
verifier_feedback = json.loads(chat_openai(prompt, model="gpt-3.5-turbo")[0])
print(verifier_feedback)

{'best_answer': 'The most cost-effective and energy-efficient carbon capture technologies currently available include direct air capture (DAC) and carbon capture and storage (CCS) technologies [[1]]. These technologies have the potential to capture large amounts of CO2 and can be deployed in various locations, making them scalable solutions. Additionally, amine-based absorption, membrane separation, and cryogenic separation are considered to be cost-effective and energy-efficient carbon capture technologies [[2]].', 'opposition': 'The biggest weakness of DAC and CCS technologies is their high cost per metric ton of CO2 captured [[5]]. The energy consumption of these technologies is also significant, requiring a substantial amount of electricity [[2]].', 'proposition': 'The most cost-effective and energy-efficient carbon capture technologies currently available are direct air capture (DAC) and carbon capture and storage (CCS) technologies. These technologies have the potential to captur

In [63]:
with open(f'autoscious_logs/{search_query_file_safe}/facts/facts.json', 'r', encoding='utf-8') as f:
    facts_json = json.load(f)

In [64]:
import re
import string

def replace_references(feedback, facts_dict, facts_json):
    def replacer(match):
        index = int(match.group(1))
        fact = facts_dict[index]
        
        # Find the corresponding URL for the fact in facts_json
        fact_stripped = fact.strip(string.punctuation)
        quote_url = None
        for key in facts_json:
            if fact_stripped in key:
                quote_url = facts_json[key]
                break
            
        if quote_url:
            return f'[{quote_url}]'
        else:
            return f'[]'

    feedback['best_answer'] = re.sub(r'\[\[(\d+)\]\]', replacer, feedback['best_answer'])
    feedback['proposition'] = re.sub(r'\[\[(\d+)\]\]', replacer, feedback['proposition'])
    feedback['opposition'] = re.sub(r'\[\[(\d+)\]\]', replacer, feedback['opposition'])
    feedback['conclusion'] = re.sub(r'\[\[(\d+)\]\]', replacer, feedback['conclusion'])

    return feedback

verifier_feedback_updated = replace_references(verifier_feedback, facts_dict, facts_json)
print(verifier_feedback_updated)


{'best_answer': 'The most cost-effective and energy-efficient carbon capture technologies currently available include direct air capture (DAC) and carbon capture and storage (CCS) technologies [On Monday, carbon capture firm Climeworks added another plant to this list, opening its third direct air capture (DAC) plant in Troia, Italy.[https://www.power-technology.com/features/carbon-capture-cost/]]. These technologies have the potential to capture large amounts of CO2 and can be deployed in various locations, making them scalable solutions. Additionally, amine-based absorption, membrane separation, and cryogenic separation are considered to be cost-effective and energy-efficient carbon capture technologies [Amine-based absorption, membrane separation, and cryogenic separation are among the most cost-effective and energy-efficient carbon capture technologies currently available.[https://www.power-technology.com/features/carbon-capture-cost/]].', 'opposition': 'The biggest weakness of DAC

In [65]:
file_name = f'autoscious_logs/{search_query_file_safe}/facts/evaluation.json'

with open(file_name, 'w', encoding='utf-8') as f:
    json.dump(verifier_feedback_updated, f, indent=4)

In [67]:
verifier_feedback_updated['next_question']

'What are the ongoing research and development efforts to improve the cost-effectiveness and energy efficiency of carbon capture technologies?'

In [68]:
search_query = verifier_feedback_updated['next_question']