In [1]:
from tavily import TavilyClient

import os
import pandas as pd
import nest_asyncio


In [None]:
# This is a workaround for a bug in the nano_graphrag library
nest_asyncio.apply()

# Read the API keys from the environment instead of hard-coding them in the
# notebook: assigning placeholder strings here would overwrite real credentials
# that are already exported, and pasted-in secrets end up committed with the file.
# A key that is missing is requested interactively (input is not echoed).
from getpass import getpass

for key_name in ("OPENAI_API_KEY", "TAVILY_API_KEY"):
    if not os.environ.get(key_name):
        entered_key = getpass(f"Enter {key_name}: ").strip()
        if not entered_key:
            raise ValueError(f"{key_name} is required but was not provided.")
        os.environ[key_name] = entered_key

tavily_client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])


## Prepare data

In [5]:
import pandas as pd

# Load the train and test splits.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# Wrap each entry of the "question" column in a {"Question": ...} record.
# Only the first two training records are kept.
filtered_data_train = [{"Question": question} for question in train_df["question"]][:2]
filtered_data_test = [{"Question": question} for question in test_df["question"]]

In [6]:
from script.prompt_manager import prepare_input_prompts
from script.initialize_tokenizer import initialize_tokenizer

# The GPQA prompt is reused here because GPQA is a multiple-choice dataset as well.
dataset_name = "gpqa"

tokenizer = initialize_tokenizer(model_name="gpt-4o")

# Keyword arguments that are identical for the train and test prompt preparation.
shared_prompt_kwargs = {
    "model_path": None,
    "dataset_name": dataset_name,
    "tokenizer": tokenizer,
}

# Prepare input prompts for each split.
input_list_train, active_sequences_train = prepare_input_prompts(
    filtered_data=filtered_data_train, **shared_prompt_kwargs
)
input_list_test, active_sequences_test = prepare_input_prompts(
    filtered_data=filtered_data_test, **shared_prompt_kwargs
)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


## Multi-agent Model

In [7]:
from config import  END_SEARCH_QUERY, END_CODE_QUERY


# Settings handed to run_reasoning as `general_model_args`.
# `stop_tokens` holds the two query-end markers from config plus the tokenizer's EOS token.
model_args = dict(
    stop_tokens=[END_SEARCH_QUERY, END_CODE_QUERY, tokenizer.eos_token],
    model="gpt-4o",
    temperature=0.3,
    top_p=0.95,
    top_k=50,
    frequency_penalty=0.3,
)



## Training data inference

In [8]:
from script.run_reasoning import run_reasoning 


repetition = 5
final_results_train_complete = []
accuracy_train = 0

for _ in range(repetition):
    # Mark every sequence as unfinished before this pass.
    # NOTE(review): only 'finished' is reset here; any other per-sequence state
    # left by the previous pass is carried over -- confirm this is intended.
    for sequence in active_sequences_train:
        sequence['finished'] = False

    # Run reasoning; results are read back from active_sequences_train below.
    sequences_train = run_reasoning(
        filtered_data_train,
        active_sequences_train,
        forcing_research=True,
        code_mode_name="gpt-4o",
        general_model_args=model_args,
        tokenizer=tokenizer,
        tavily_client=tavily_client,
    )
    final_results_train = [sequence['final_result'] for sequence in active_sequences_train]
    print(f"Final results: {final_results_train}")

    # Per-pass accuracy: share of predictions equal to train_df['answer'] at the same position.
    num_correct = 0
    for position, prediction in enumerate(final_results_train):
        if prediction == train_df['answer'][position]:
            num_correct += 1
    accuracy_train += num_correct / len(final_results_train)
    final_results_train_complete.append(final_results_train)

# Average the per-pass accuracies over all passes.
accuracy_train /= repetition
print(f"Accuracy on training set: {accuracy_train}")
    


INFO:nano-graphrag:Creating working directory output/knowledge_graph-0
INFO:nano-graphrag:Load KV full_docs with 0 data
INFO:nano-graphrag:Load KV text_chunks with 0 data
INFO:nano-graphrag:Load KV llm_response_cache with 0 data
INFO:nano-graphrag:Load KV community_reports with 0 data
INFO:nano-vectordb:Init {'embedding_dim': 1536, 'metric': 'cosine', 'storage_file': 'output/knowledge_graph-0/vdb_entities.json'} 0 data


Initial Prompt: [Question]: 2. In general, reliability testing is performed for which of the following reasons?I. To detect unanticipated failure modes.II. To compare estimated failure rates to actual failure rates.III. To monitor reliability growth over time.IV. To meet or exceed customer expectations.

[Choices]: [a] I and III only | [b] II and IV only | [c] I, II and III only  | [d] I, II, III and IV
[ResearchAgent] Gathering information for query: [Question]: 2. In general, reliability testing is performed for which of the following reasons?I. To detect unanticipated failure modes.II. To compare estimated failure rates to actual failure rates.III. To monitor reliability growth over time.IV. To meet or exceed customer expectations.

[Choices]: [a] I and III only | [b] II and IV only | [c] I, II and III only  | [d] I, II, III and IV


INFO:nano-graphrag:[New Docs] inserting 3 docs


[ResearchAgent] Analyzing data:
['March 26, 2025 / 8:30-11:30am Pacific Time $300. This webinar covers a number of reliability demonstration techniques including success based testing, test-to failure, degradation analysis, and other tools utilized by reliability engineers in the industry to demonstrate the reliability during a product test and validation process.', '2025 Dependability Awards and Ratings | J.D. Power Cars for Sale Cars for Sale Best Cars and Trucks by Ratings Search Articles Car Ratings Car Ratings The 2025 Dependability ratings and awards measure reliability based on the responses from more than 80,000 verified owners of 3-year-old vehicles annually. Best Electric Cars Best Cars and Trucks by Ratings Best New Car Deals New Car Preview 2019 Vehicle Dependability: Most Dependable Luxury Cars Under $45,000 New Car Preview Popular Vehicles Popular Cars and Trucks by Ratings Best Electric Cars Best Hybrid Cars Best Sports Cars Popular Expert Reviews 2019 Mazda CX-9 Review 

INFO:nano-graphrag:[New Chunks] inserting 3 chunks
INFO:nano-graphrag:[Entity Extraction]...
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /chat/completions in 0.422302 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /chat/completions in 0.493152 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /chat/completions in 0.430396 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /chat/completions in 0.986000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to 

Research Agent failed


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /chat/completions in 0.427820 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /chat/completions in 0.790294 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.



RateLimitError: litellm.RateLimitError: RateLimitError: OpenAIException - Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"


## Test data inference

In [None]:
from script.run_reasoning import run_reasoning 


repetition = 5
final_results_test_complete = []

for _ in range(repetition):
    # Mark every sequence as unfinished before this pass.
    # NOTE(review): only 'finished' is reset here; any other per-sequence state
    # left by the previous pass is carried over -- confirm this is intended.
    for sequence in active_sequences_test:
        sequence['finished'] = False

    # Run reasoning; results are read back from active_sequences_test below.
    sequences_test = run_reasoning(
        filtered_data_test,
        active_sequences_test,
        forcing_research=True,
        code_mode_name="gpt-4o",
        general_model_args=model_args,
        tokenizer=tokenizer,
        tavily_client=tavily_client,
    )
    final_results_test = [sequence['final_result'] for sequence in active_sequences_test]
    print(f"Final results: {final_results_test}")

    # One list of predictions per pass.
    final_results_test_complete.append(final_results_test)


print("final_results_test_complete", final_results_test_complete)

In [None]:
import numpy as np
import csv

# Swap the two axes of the collected results (one list per pass -> one list per question).
# The name is rebound to its own transpose, so running this cell a second time
# swaps the axes back; run it exactly once per inference run.
final_results_test_complete = np.transpose(np.array(final_results_test_complete)).tolist()


# Save to CSV function
def save_predictions_to_csv(predictions_list, filename):
    """Save formatted predictions to CSV.

    The file starts with a header row ``question_id, prediction_1, ...,
    prediction_K`` followed by one row per entry of ``predictions_list``.
    Question ids are the 1-based positions of the entries.

    ``K`` is taken from the widest row so the header always matches the data
    (previously it was fixed at 5 regardless of how many predictions each
    question had). When ``predictions_list`` is empty, 5 columns are written,
    which keeps the earlier output for that case.

    Args:
        predictions_list: Iterable of per-question prediction sequences
            (lists or tuples).
        filename: Path of the CSV file to create or overwrite.
    """
    # Materialise rows once so their widths can be inspected before writing,
    # and so tuple rows can be concatenated with the id column.
    rows = [list(predictions) for predictions in predictions_list]
    num_predictions = max((len(row) for row in rows), default=5)
    header = ["question_id"] + [f"prediction_{k}" for k in range(1, num_predictions + 1)]

    # newline="" lets the csv module control line endings itself.
    with open(filename, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(header)  # Write header
        for question_id, row in enumerate(rows, start=1):
            writer.writerow([question_id] + row)

# Write the test-set predictions to ./output/predictions_test.csv.
save_predictions_to_csv(
    predictions_list=final_results_test_complete,
    filename="./output/predictions_test.csv",
)
