In [1]:
# Key question for this run: the objective, the existing approach with its known
# limitations, and the first principle to optimise for. It is passed as the key
# question to the prompt builders below, and its first 50 characters name the log folder.
search_query = '''Objective: 
Carbon capture 1 Gt of CO2 per year ASAP

Existing approach and problems:
{'approach': 'Direct Air Capture (DAC) with Carbon Storage', 'proposition': 'DAC with carbon storage is an ideal approach as it directly captures CO2 from the atmosphere and stores it in a secure manner. It has the potential to capture large amounts of CO2 and can be deployed in various locations, making it a scalable solution.', 'opposition': 'The biggest weakness of DAC with carbon storage is its high cost per metric ton of CO2 captured. The energy consumption is also significant, requiring a substantial amount of electricity. Additionally, the storage capacity of certain methods may be limited, requiring frequent transportation and storage infrastructure.', 'limitations': {'1': {'limitation': 'High cost per metric ton of CO2 captured', 'metric': 'Cost per metric ton of CO2 captured'}, '2': {'limitation': 'Significant energy consumption', 'metric': 'Energy consumption'}, '3': {'limitation': 'Limited storage capacity', 'metric': 'Storage capacity'}}}

First principles:
Efficiency: The ideal approach should focus on improving the efficiency of the carbon capture process to reduce the cost per metric ton of CO2 captured. This can be achieved by optimizing the design and operation of the capture technology, minimizing energy requirements, and maximizing the utilization of captured CO2.
'''

In [2]:
import json
from util import sanitize_filename, get_predicted_usefulness_of_text_prompt, web_search_ddg
import os
from collections import defaultdict
from llm import chat_openai
from datetime import datetime

In [3]:
# Name of this run's log folder, derived from the first 50 characters of the query.
search_query_file_safe = sanitize_filename(search_query[:50])
# "academic" makes the search cell try scholarly first; any other value goes straight to the normal web search.
search_engine = "academic"

In [4]:
# Root log folder for this query. exist_ok=True replaces the separate
# exists() check, so there is no window between checking and creating.
folder_path = f'autoscious_logs/{search_query_file_safe}'
os.makedirs(folder_path, exist_ok=True)

In [5]:
# Folder holding search queries, search results and ratings for this query.
# exist_ok=True replaces the separate exists() check (no check-then-create race).
folder_path = f'autoscious_logs/{search_query_file_safe}/sources'
os.makedirs(folder_path, exist_ok=True)

In [6]:
def get_initial_search_queries_prompt(key_question, search_engine):
  """Build the LLM prompt that turns the key question into search queries.

  Args:
    key_question: Free-text question/objective, inserted verbatim into the prompt.
    search_engine: Accepted for call compatibility; not used in the prompt text.

  Returns:
    A prompt string asking for a JSON object with one numbered query ("1")
    and a "keywords_query" entry.
  """
  # The ```json fence around the format example is closed explicitly, matching
  # the other prompts in this notebook.
  return f'''
Key question:
{key_question}

Task:
For the key question, write a clear and comprehensive but short (1 query) list of search queries optimized for best search engine results, so that you can confidently and quickly surface the most relevant information to determine the best answer to the question. Extract a string of search keywords query from the key question.

The output should be in JSON format: 
```json
{{
  "1": "<insert query>",
  "keywords_query": "<insert keywords>"
}}
```

Respond only with the output, with no explanation or conversation.
'''

In [14]:
# Ask the model for the initial search queries and parse its JSON reply.
initial_queries_prompt = get_initial_search_queries_prompt(search_query, search_engine)
initial_queries_reply = chat_openai(initial_queries_prompt, model="gpt-3.5-turbo")
key_question_initial_search_queries = json.loads(initial_queries_reply[0])

# The keyword string is split off and stored separately from the numbered queries.
keywords_query = key_question_initial_search_queries.pop('keywords_query')

with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'w') as queries_file:
    json.dump(key_question_initial_search_queries, queries_file, indent=2)

with open(f'autoscious_logs/{search_query_file_safe}/sources/keywords_query.txt', 'w') as keywords_file:
    json.dump(keywords_query, keywords_file)

In [14]:
# Reload the saved queries and keywords from disk so later cells can run
# without calling the model again.
with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'r') as queries_file:
    key_question_initial_search_queries = json.load(queries_file)

with open(f'autoscious_logs/{search_query_file_safe}/sources/keywords_query.txt', 'r') as keywords_file:
    keywords_query = json.load(keywords_file)

### Web search given search keywords

In [9]:
import os
from dotenv import load_dotenv
from googleapiclient.discovery import build
import json
from scholarly import scholarly
from scholarly import ProxyGenerator

# Load environment variables from a .env file; the Google search helpers below
# read DEV_KEY and MY_CSE_ID through os.getenv.
load_dotenv()

# Set up a ProxyGenerator object to use free proxies
# This needs to be done only once per session
pg = ProxyGenerator()
# NOTE(review): the return value of FreeProxies() is ignored here, so a failed
# proxy setup would go unnoticed; confirm whether it reports success.
pg.FreeProxies()
# Make scholarly use the proxy generator configured above.
scholarly.use_proxy(pg)

In [4]:
import requests
from PyPDF2 import PdfReader 
from io import BytesIO

def try_getting_pdf(url):
    """Report whether the resource at `url` can be opened as a PDF.

    Args:
        url: Address to download.

    Returns:
        True when the downloaded bytes are accepted by PdfReader; False when
        the download fails or the content cannot be parsed as a PDF.
    """
    try:
        # The download is inside the try so a network error or an invalid URL
        # yields False instead of escaping to the caller. The timeout keeps a
        # stalled server from hanging the cell indefinitely.
        response = requests.get(url, timeout=30)
        PdfReader(BytesIO(response.content))
        return True
    except Exception:  # not a bare except, so KeyboardInterrupt still stops the cell
        print("Could not get pdf")
        return False

# Get the PDF content
def try_getting_pdf_content(url):
    """Download `url` and return the concatenated text of every PDF page.

    Args:
        url: Address of the PDF to download.

    Returns:
        The text of all pages joined in page order, or "" when the download
        or the PDF parsing fails.
    """
    try:
        # Download inside the try (with a timeout) so network failures are
        # reported as "" like parsing failures are.
        response = requests.get(url, timeout=30)
        pdf = PdfReader(BytesIO(response.content))
        # `or ""` keeps a page whose extract_text() yields nothing from
        # discarding the text of all other pages.
        return "".join(page.extract_text() or "" for page in pdf.pages)
    except Exception:  # not a bare except, so KeyboardInterrupt still stops the cell
        print("Error getting PDF content")
        return ""

In [11]:
import time
def google_search_raw(search_term, cse_id, **kwargs):
    """Run one Google Custom Search request and return its raw result items.

    Args:
        search_term: Query string sent as `q`.
        cse_id: Custom search engine id sent as `cx`.
        **kwargs: Extra parameters forwarded unchanged to `cse().list(...)`.

    Returns:
        The list stored under "items" in the response, or [] when the
        response has no "items" key.
    """
    # The API key is read from the DEV_KEY environment variable.
    service = build("customsearch", "v1", developerKey=os.getenv('DEV_KEY'))
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()

    search_results = res.get("items", [])
    # One-second pause after every request (presumably to stay under API rate limits).
    time.sleep(1)

    # The previous unused list of item["link"] values was dropped: it was never
    # returned and could raise KeyError for an item without a "link".
    return search_results

def search_google(search_query):
    """Search Google for `search_query` and return the raw result items.

    The search engine id comes from the MY_CSE_ID environment variable. The
    request asks for 8 results and passes lr="lang_en" and cr="countryUS".
    """
    return google_search_raw(
        search_query,
        os.getenv('MY_CSE_ID'),
        num=8,
        lr="lang_en",
        cr="countryUS",
    )

In [12]:
MAX_RETRIES = 3  # NOTE(review): not referenced anywhere in the visible notebook

# Load the search queries saved by the query-generation cell.
with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'r') as f:
    key_question_initial_search_queries = json.load(f)

# For every query: try the academic search first (when enabled), then the normal
# web search, then Google, and save whatever was found as JSON.
for idx, query in key_question_initial_search_queries.items():
    print("query: ", query)

    web_search_res = []
    if search_engine == "academic":
        print("trying academic search")
        try:
            # NOTE(review): the result generator is consumed until it is exhausted;
            # consider capping the number of results if searches run long.
            scholar_res_gen = scholarly.search_pubs(query)

            for res in scholar_res_gen:
                bib = res.get('bib', {})
                # .get instead of [] so one record without an eprint URL, pub URL
                # or abstract no longer raises KeyError and aborts the whole pass.
                eprint_url = res.get('eprint_url')
                is_pdf = bool(eprint_url) and try_getting_pdf(eprint_url)
                href = eprint_url if is_pdf else res.get('pub_url')
                if not href:
                    # Without any URL the result cannot be rated or scraped later.
                    continue
                web_search_res.append({
                    'title': bib.get('title', ''),
                    'href': href,
                    'pdf': is_pdf,
                    'body': bib.get('abstract', ''),
                })
        except Exception as e:
            # Show the cause: a silent failure here is otherwise impossible to diagnose.
            print("Exception, trying normal search", repr(e))
    if web_search_res == []:
        # DDG
        print("trying normal search")
        web_search_res = json.loads(web_search_ddg(query))
        if len(web_search_res) == 0:
            print("trying google search!")
            # Google
            web_search_res_raw = search_google(query) # google uses 'link' instead of 'href'
            web_search_res = [{
                'title': item['title'],
                'href': item['link'],
                'body': item.get('snippet', ''),  # some results come without a snippet
                'pdf': False
                } for item in web_search_res_raw
            ]

    # save web search results
    with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_results_query_{idx}.json', 'w') as f:
        json.dump(web_search_res, f, indent=2)

query:  Carbon capture technologies
trying academic search
Exception, trying normal search
trying normal search


### Reading type 1: filtering out sources unlikely to be relevant, based on title and body

In [13]:
def get_filtering_web_results_ratings(key_question, web_search_res):
    """Build the LLM prompt that rates each search result's predicted usefulness.

    Args:
        key_question: Free-text question, inserted verbatim into the prompt.
        web_search_res: Search results; inserted into the prompt via their
            string representation.

    Returns:
        A prompt string asking for a JSON object that maps each result's href
        to a score on a 5-point scale.
    """
    # The format example uses double quotes: the reply is parsed with
    # json.loads, which rejects the single-quoted form shown previously.
    return f'''
Key question:
{key_question}

Task:
Based on the key question and each search result's title and body content, reason and assign a predicted usefulness score of the search result's content and potential useful references to answering the key question using a 5-point Likert scale, with 1 being very not useful, 2 being not useful, 3 being somewhat useful, 4 being useful, 5 being very useful.

Search results:
{web_search_res}

The output should be in JSON format: 
```json
{{
  "<insert href>": "<insert usefulness score>",
  etc.
}}
```

Respond only with the output, with no explanation or conversation.
'''

In [14]:
from collections import defaultdict

# Rate every query's search results with the LLM and group the URLs by rating.
with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'r') as queries_file:
    key_question_initial_search_queries = json.load(queries_file)

for query_idx, query in key_question_initial_search_queries.items():
    # load web search results
    with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_results_query_{query_idx}.json', 'r') as results_file:
        web_search_res = json.load(results_file)

    # filter web results based on title and body; with no results there is nothing to rate
    if web_search_res != []:
        ratings_prompt = get_filtering_web_results_ratings(search_query, web_search_res)
        filtered_web_results = json.loads(chat_openai(ratings_prompt, model="gpt-3.5-turbo")[0])
    else:
        filtered_web_results = {}

    # Turn {url: rating} into {rating: [urls]}; ratings become string keys.
    ratings_url_dict = defaultdict(list)
    for url, rating in filtered_web_results.items():
        ratings_url_dict[str(rating)].append(url)

    # save filtered search results
    with open(f'autoscious_logs/{search_query_file_safe}/sources/rated_web_results_query_{int(query_idx)}.json', 'w') as rated_file:
        json.dump(ratings_url_dict, rated_file, indent=2)

### Reading type 2 & 3

In [5]:
# COMPLETE code for predicting usefulness of very relevant (5) and relevant (4) results.
from util import scrape_text_with_selenium_no_agent

CHUNK_SIZE = 1000 # Characters per chunk (chunks are made by slicing the text string)
SAMPLING_FACTOR = 0.1 # Fraction of a text's chunks to sample; the count is also capped by MAX_CHUNKS to stay under the token limit
MAX_TOKENS = 2500 * 4 # Despite the name this is a character budget: 1 token = 4 chars, 2500 + 500 (prompt) tokens is high for GPT3.5
MAX_CHUNKS = int(MAX_TOKENS / CHUNK_SIZE) # Most chunks that fit in the character budget
# context = "Enoyl-CoA carboxylase/reductase enzymes (ECRs)"

In [6]:
def get_sample_chunks(text, CHUNK_SIZE, num_chunk_samples):
    """Take evenly spaced sample chunks from `text`.

    Args:
        text: Source string to sample from.
        CHUNK_SIZE: Maximum number of characters per chunk.
        num_chunk_samples: Maximum number of chunks to return.

    Returns:
        Up to `num_chunk_samples` chunks, in text order. The list is empty
        when `text` is empty or no samples are requested.
    """
    # Nothing to sample; also avoids dividing by zero below.
    if not text or num_chunk_samples <= 0:
        return []

    # Clamp to 1: when the text has fewer characters than requested samples the
    # integer division gives 0, which range() rejects as a step.
    step_size = max(1, len(text) // num_chunk_samples)

    chunk_starts = range(0, len(text), step_size)
    chunks = [text[start:start + CHUNK_SIZE] for start in chunk_starts]

    # The spacing can produce one start position too many; keep the first ones.
    return chunks[:num_chunk_samples]

In [7]:
# Need to determine how useful the text is likely to be for answering the key questions
def get_predicted_usefulness_of_text_prompt(key_question, sample_text_chunks):
    """Build the LLM prompt that predicts how useful a full source text will be.

    Note: this definition replaces the function of the same name imported
    from `util` at the top of the notebook.

    Args:
        key_question: Free-text question, inserted verbatim into the prompt.
        sample_text_chunks: Sample chunks of the source text; inserted into the
            prompt via their string representation.

    Returns:
        A prompt string asking for a JSON object with a "predicted_usefulness"
        rating on a 5-point scale.
    """
    # The example key is written as "predicted_usefulness": with both quotes
    # around the key, so the requested format is valid JSON.
    return f'''
Key question:
{key_question}

Task: 
Based on the key question and the sample text chunks of the source text, the goal is to identify how useful reading the full source text would be to extract direct quoted facts or references to determine the best answer to the key question. 

Deliverable:
Assign a predicted usefulness score of the full source text using a 5-point Likert scale, with 1 being very unlikely to be usefulness, 2 being unlikely to be useful, 3 being somewhat likely to be useful, 4 being likely to be useful, and 5 being very likely useful and containing facts or references that answer the key question.

Sample text chunks from the source text:
{sample_text_chunks}

The output should be of the following JSON format
{{
    "predicted_usefulness": <insert predicted usefulness rating>,
   etc.
}}


Respond only with the output, with no explanation or conversation.
'''

In [11]:
from rank_bm25 import BM25Okapi
import re

def get_most_relevant_chunks_with_bm25(key_question, text, CHUNK_SIZE, num_chunk_samples):
    """Return the chunks of `text` that BM25 scores highest for `key_question`.

    Args:
        key_question: Query text; tokenized into word tokens for scoring.
        text: Source string, split into consecutive chunks.
        CHUNK_SIZE: Number of characters per chunk.
        num_chunk_samples: Maximum number of chunks to return.

    Returns:
        Up to `num_chunk_samples` chunks, highest score first. The list is
        empty when `text` is empty or no samples are requested.
    """
    # With no text there are no chunks to build a BM25 index from, and with no
    # samples requested there is nothing to rank.
    if not text or num_chunk_samples <= 0:
        return []

    # 1. Split text into chunks
    chunks = [text[i:i+CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]

    # 2. Tokenize the chunks
    tokenized_chunks = [re.findall(r"\w+", chunk) for chunk in chunks]

    # 3. Initialize BM25
    bm25 = BM25Okapi(tokenized_chunks)

    # 4. Query BM25 with the key question
    tokenized_question = re.findall(r"\w+", key_question)
    scores = bm25.get_scores(tokenized_question)

    # 5. Sort chunks by BM25 scores (the sort is stable, so ties keep text order)
    ranked = sorted(zip(scores, chunks), key=lambda pair: pair[0], reverse=True)

    # 6. Return top num_chunk_samples chunks
    return [chunk for _, chunk in ranked[:num_chunk_samples]]

In [8]:
def find_title(url, web_search_info):
    """Return the title of the first search result whose "href" equals `url`.

    Returns None when no result matches.
    """
    matching_titles = (entry["title"] for entry in web_search_info if entry["href"] == url)
    return next(matching_titles, None)

def check_pdf(url, web_search_info):
    """Tell whether the search result for `url` is flagged as a PDF.

    Args:
        url: The href of the result to look up.
        web_search_info: List of search-result dicts.

    Returns:
        True when a result whose "href" equals `url` has a truthy "pdf" entry;
        otherwise prints "Not pdf" and returns False.
    """
    for item in web_search_info:
        # Match on the URL as well: previously any PDF result made this return
        # True regardless of which url was asked about.
        if item.get("href") == url and item.get("pdf"):
            return True
    print("Not pdf")
    return False

In [16]:
# Full texts go under the same per-query log folder (search_query_file_safe) as
# every other artefact, so the folder created here is the one written to below.
full_text_folder_path = f'autoscious_logs/{search_query_file_safe}/sources/full_text'
os.makedirs(full_text_folder_path, exist_ok=True)

# Skimming through each highly relevant paper from skimming
with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_queries.json', 'r') as f:
    key_question_initial_search_queries = json.load(f)

for query_idx, query in key_question_initial_search_queries.items():
    # open filtered search results
    with open(f'autoscious_logs/{search_query_file_safe}/sources/rated_web_results_query_{int(query_idx)}.json', 'r') as f:
        ratings_url_dict = json.load(f)

    # open web search info to extract metadata
    with open(f'autoscious_logs/{search_query_file_safe}/sources/initial_search_results_query_{int(query_idx)}.json', 'r') as f:
        web_search_info = json.load(f)

    for rating, urls in ratings_url_dict.items():
        # Only sources rated 3, 4 or 5 are scraped and skimmed.
        if rating not in ('5', '4', '3'):
            continue

        folder_path = f'autoscious_logs/{search_query_file_safe}/sources/predicted_usefulness_{rating}'
        os.makedirs(folder_path, exist_ok=True)

        for rating_source_idx, url in enumerate(urls):
            print("query ", query_idx, "rating_source_idx", rating_source_idx, "Skimming url:", url)

            # Ensure the url hasn't already been visited
            title = find_title(url, web_search_info)
            usefulness_path = f'{folder_path}/query_{query_idx}_url_index_{rating_source_idx}.json'
            full_text_path = f'{full_text_folder_path}/{sanitize_filename(title)}.txt' if title else None
            if not title or os.path.exists(full_text_path) or os.path.exists(usefulness_path):
                print("URL or text already visited!")
                continue

            # Check if it's a pdf or not
            if try_getting_pdf(url):
                print("PDF found!")
                text = try_getting_pdf_content(url)
            else:
                text = scrape_text_with_selenium_no_agent(url, None, search_engine='firefox')

            # Only evaluate websites you're able to scrape
            if not text or text == "No information found":
                continue

            total_chunks = len(text) / CHUNK_SIZE
            # At least one chunk: texts shorter than 1 / SAMPLING_FACTOR chunks
            # would otherwise be rated on an empty sample.
            num_chunk_samples = min(max(1, int(total_chunks * SAMPLING_FACTOR)), MAX_CHUNKS)
            # Using BM25 to search for keywords instead of general query
            sample_chunks = get_most_relevant_chunks_with_bm25(keywords_query, text, CHUNK_SIZE, num_chunk_samples)
            print("len(sample_chunks)", len(sample_chunks))

            # Get predicted usefulness based on sample chunks
            predicted_usefulness_results = json.loads(chat_openai(get_predicted_usefulness_of_text_prompt(search_query, sample_chunks), model="gpt-3.5-turbo")[0])

            # Collect the model's scores before title/url are added, so that
            # metadata can never be mistaken for a rating.
            pred_usefulness = list(predicted_usefulness_results.values())

            # save filtered search results
            predicted_usefulness_results['title'] = title
            predicted_usefulness_results['url'] = url
            with open(usefulness_path, 'w') as f:
                json.dump(predicted_usefulness_results, f, indent=2)

            # Save the full text when any score is 4 or 5 (number or string form)
            # TODO: perhaps make this more dynamic
            if any(score in pred_usefulness for score in (5, '5', 4, '4')):
                with open(full_text_path, 'w', encoding='utf-8') as f:
                    f.write(title + '\n')
                    f.write(url + '\n')
                    f.write(text)

query  1 rating_source_idx 0 Skimming url: https://climate.mit.edu/explainers/carbon-capture
URL or text already visited!
query  1 rating_source_idx 1 Skimming url: https://www.npr.org/2021/08/08/1025845745/what-is-carbon-capture-technology-it-has-a-complicated-place-in-the-infrastructu
URL or text already visited!
query  1 rating_source_idx 2 Skimming url: https://www.honeywell.com/us/en/news/2022/09/how-carbon-capture-works
URL or text already visited!
query  1 rating_source_idx 3 Skimming url: https://www.nationalgrid.com/stories/energy-explained/carbon-capture-technology-and-how-it-works
URL or text already visited!
query  1 rating_source_idx 0 Skimming url: https://www.nytimes.com/interactive/2023/03/19/us/carbon-capture.html
Could not get pdf
Going through url:  https://www.nytimes.com/interactive/2023/03/19/us/carbon-capture.html
select firefox options!
Driver is getting url
set timeout!
Page loaded within 15 seconds
Driver got url
Driver has found page source
Handing off to Bea

### Extract facts

In [None]:
# 2) create facts folder and subfolders
# Built from search_query_file_safe so it is the directory that
# extract_facts_from_website_text writes facts.txt and facts.json into.
facts_folder_path = f'autoscious_logs/{search_query_file_safe}/facts'
os.makedirs(facts_folder_path, exist_ok=True)

In [None]:
# Can be broken to prompts.py and util.py
import os

def extract_facts_from_website_text(search_query_file_safe, key_question, website_title, website_text, website_url):
    """Ask the LLM for the best answer to `key_question` in `website_text` and log it.

    Every (answer, quote) pair in the model's JSON reply that has a non-empty
    quote is appended to facts/facts.txt (answer only) and merged into
    facts/facts.json (answer -> quote followed by the source URL in brackets),
    both under autoscious_logs/<search_query_file_safe>/.

    Args:
        search_query_file_safe: Name of the current run's log folder.
        key_question: Free-text question, inserted verbatim into the prompt.
        website_title: Title of the source; not used by this function.
        website_text: Source text (or chunk) the answer must be quoted from.
        website_url: Source URL, stored after each quote.

    Returns:
        None.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    seed_initial_question_decomposition_prompt = f'''
Key question: 
{key_question}

Context:
Ideally, I'm looking for a numerical answer.

Task:
What's the best answer based on the source text? Give me as specific and correct of an answer as possible. Then, quote the section of the source text that supports your answer. 

The output should be in JSON format: 
```json
{{
  "<insert best answer>": "<insert quote>"
}}
```

Source text: {website_text}

Respond only with the output, with no explanation or conversation.
'''
    # Ask GPT the prompt
    print("seed_initial_question_decomposition_prompt", seed_initial_question_decomposition_prompt)
    res = chat_openai(seed_initial_question_decomposition_prompt, model="gpt-3.5-turbo")
    print("Extracted quotes: ", res[0])

    res_json = json.loads(res[0])

    # Make sure the target folder exists, so the function does not depend on
    # another cell having created exactly this path.
    facts_dir = f'autoscious_logs/{search_query_file_safe}/facts'
    os.makedirs(facts_dir, exist_ok=True)
    file_name = f'{facts_dir}/facts.txt'
    json_file_name = f'{facts_dir}/facts.json'

    for answer, quote in res_json.items():
        # Only log if there is a quote
        if not quote:
            continue

        with open(file_name, 'a', encoding='utf-8') as f:
            # '\n', not os.linesep: text mode already translates newlines, so
            # os.linesep would produce '\r\r\n' on Windows.
            f.write(answer + '\n')

        # Save the best answer and quote into a JSON for reference retrieval later
        if os.path.exists(json_file_name):
            with open(json_file_name, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
        else:
            data = {}

        # str() guards against the model returning a non-string quote.
        # NOTE(review): '/"' looks like it was meant to be an escaped quote
        # ('\\"'); the pattern is kept unchanged - confirm the intent.
        data[answer] = str(quote).replace('/"', '"') + f"[{website_url}]"

        with open(json_file_name, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=4)

    return

In [None]:
from rank_bm25 import BM25Okapi
import re

def chunk_text(text: str, key_question: str, chunk_size: int, overlap: int = 0) -> list[str]:
    """
    Splits a text into overlapping chunks and ranks them using BM25 based on relevance to a key question.

    Args:
    - text (str): The source text.
    - key_question (str): The key question to rank the chunks by.
    - chunk_size (int): The size of each chunk, in characters.
    - overlap (int): The size of the overlap between chunks. Default is 0. Must be smaller than chunk_size.

    Returns:
    - list[str]: All chunks, ordered from highest to lowest BM25 score. Empty when the text is empty.

    Raises:
    - ValueError: If chunk_size - overlap is not positive (the chunk window could not advance).
    """
    # Consecutive chunks start `step` characters apart; a non-positive step
    # would never move forward through the text.
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    # With no text there are no chunks to build a BM25 index from.
    if not text:
        return []

    # 1. Split text into overlapping chunks
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), step)]

    # 2. Tokenize the chunks
    tokenized_chunks = [re.findall(r"\w+", chunk) for chunk in chunks]

    # 3. Initialize BM25
    bm25 = BM25Okapi(tokenized_chunks)

    # 4. Query BM25 with the key question
    tokenized_question = re.findall(r"\w+", key_question)
    scores = bm25.get_scores(tokenized_question)

    # 5. Sort chunks by BM25 scores (stable sort: ties keep their text order)
    sorted_chunks = [chunk for _, chunk in sorted(zip(scores, chunks), key=lambda pair: pair[0], reverse=True)]

    # 6. Return every chunk, best first
    return sorted_chunks


In [None]:
# keywords_query.txt is written with json.dump, so it is decoded with json.load.
# Stripping the outer quotes by hand would leave JSON escape sequences in the text.
with open(f'autoscious_logs/{search_query_file_safe}/sources/keywords_query.txt', 'r') as f:
    keywords_query = json.load(f)

In [None]:
# This method can partially extract the answer, but not the exact table passage which is critical NOR does it prioritize the list of facts
# Go through each full text rated highly to extract facts from
# The folder is derived from search_query_file_safe, like every other log path.
full_text_folder_path = f'autoscious_logs/{search_query_file_safe}/sources/full_text'

# Loop through every saved full text; sorted() makes the order reproducible
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(full_text_folder_path)):
    if not filename.endswith('.txt'):
        continue

    # Characters sent to the model so far for this file (compared with MAX_TOKENS below).
    curr_tokens = 0
    file_path = os.path.join(full_text_folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        # The first two lines hold the title and the URL; the rest is the text.
        title = file.readline().strip()
        url = file.readline().strip()
        text = file.read()

    if not text:
        # Nothing to chunk or extract from.
        continue

    # Break the text into chunks, most relevant to the keywords first.
    # CHUNK_SIZE is the notebook-wide chunk size; the lowercase chunk_size and
    # overlap used here before were never defined, so the default overlap applies.
    chunks = chunk_text(text, keywords_query, CHUNK_SIZE)
    for i, chunk in enumerate(chunks):
        print(f"Chunk: {i} / {len(chunks)}")
        extract_facts_from_website_text(search_query_file_safe, search_query, title, chunk, url)

        curr_tokens += len(chunk)
        if curr_tokens > MAX_TOKENS:
            print("Max tokens reached in chunks!")
            break

#### Rerank the facts in facts.txt by relevance

In [None]:
# Read the collected answers as one string, ready to be placed in the rerank prompt.
file_name = f'autoscious_logs/{search_query_file_safe}/facts/facts.txt'

with open(file_name, mode='r', encoding='utf-8') as facts_file:
    facts_list = facts_file.read()

In [None]:
def get_rerank_facts_list_prompt(facts_list, key_question=None):
  """Build the LLM prompt that reranks extracted facts by relevance.

  Args:
    facts_list: The current facts, inserted verbatim into the prompt.
    key_question: Question to rank the facts against. When omitted, the
      notebook-level `search_query` is used, so the facts are ranked against
      the question they were extracted for rather than a fixed one.

  Returns:
    A prompt string asking for a JSON list of facts, best answer first.
  """
  if key_question is None:
    key_question = search_query
  # The format example uses double quotes because the reply is parsed with json.loads.
  return f'''
Key question: 
{key_question}

Context:
Ideally, I'm looking for a numerical answer.

Task:
Rerank the following facts based on how well they answer the key question. The more specific and correct, the better. The best answer should be at the top, and the worst answer should be at the bottom of the list. Use direct quotes and do not change the wording of the facts. Leave out facts that are not relevant to the key question.

Current facts list:
{facts_list}

The output should be a JSON list of facts:
```json
["<insert fact>", etc.]
```

Respond only with the output, with no explanation or conversation.
'''
# Send the rerank prompt to the model and parse the JSON list it returns.
rerank_prompt = get_rerank_facts_list_prompt(facts_list)
rerank_reply = chat_openai(rerank_prompt, model="gpt-3.5-turbo")
reranked_facts_list = json.loads(rerank_reply[0])
print(reranked_facts_list)

In [None]:
# Save the reranked facts, one per line, best answer first.
file_name = f'autoscious_logs/{search_query_file_safe}/facts/facts_reranked.txt'

with open(file_name, 'w', encoding='utf-8') as f:
    for answer in reranked_facts_list:
        # '\n', not os.linesep: text mode already translates newlines, so
        # os.linesep would produce '\r\r\n' on Windows. str() guards against
        # the model returning a non-string list item.
        f.write(str(answer) + '\n')