# Chatbot to Search for Papers on arXiv and Store Them

In [70]:
import arxiv
import json
import os
from typing import List
from dotenv import load_dotenv
import anthropic
from groq import Groq

In [71]:
# Root directory for stored results: search_papers() creates one sub-directory
# per topic here, each holding a papers_info.json that extract_info() scans.
PAPER_DIR = "papers"

In [72]:
def search_papers(topic: str, max_results: int = 5) -> List[str]:
    """
    Search for papers on arXiv based on a topic and store their information.

    The metadata of every hit is merged into
    ``<PAPER_DIR>/<topic directory>/papers_info.json`` (keyed by the paper's
    short ID), so repeated searches on the same topic accumulate results.

    Args:
        topic: The topic to search for
        max_results: Maximum number of results to retrieve (default: 5)

    Returns:
        List of paper IDs found in the search
    """

    # Use arxiv to find the papers
    client = arxiv.Client()

    # Search for the most relevant articles matching the queried topic
    search = arxiv.Search(
        query=topic,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )

    # Derive a single directory name from the topic. Path separators are
    # neutralised so that a topic such as "AI/ML" neither creates nested
    # folders (which extract_info would not scan) nor escapes PAPER_DIR.
    topic_dir = topic.lower().replace(" ", "_")
    for separator in ("/", "\\"):
        topic_dir = topic_dir.replace(separator, "_")
    if topic_dir in ("", ".", ".."):
        topic_dir = "_"

    # Create directory for this topic
    path = os.path.join(PAPER_DIR, topic_dir)
    os.makedirs(path, exist_ok=True)

    file_path = os.path.join(path, "papers_info.json")

    # Try to load existing papers info; start fresh if it is missing or corrupt
    try:
        with open(file_path, "r", encoding="utf-8") as json_file:
            papers_info = json.load(json_file)
    except (FileNotFoundError, json.JSONDecodeError):
        papers_info = {}
    if not isinstance(papers_info, dict):
        # A valid JSON file with an unexpected top-level type cannot be merged into
        papers_info = {}

    # Process each paper and add to papers_info
    paper_ids = []
    for paper in client.results(search):
        short_id = paper.get_short_id()
        paper_ids.append(short_id)
        papers_info[short_id] = {
            'title': paper.title,
            'authors': [author.name for author in paper.authors],
            'summary': paper.summary,
            'pdf_url': paper.pdf_url,
            'published': str(paper.published.date()),
        }

    # Save updated papers_info to json file
    with open(file_path, "w", encoding="utf-8") as json_file:
        json.dump(papers_info, json_file, indent=2)

    print(f"Results are saved in: {file_path}")

    return paper_ids

In [73]:
# Example run: search arXiv for "computers" with the default max_results (5)
# and persist the hits under PAPER_DIR.
search_papers("computers")

Results are saved in: papers/computers/papers_info.json


['1310.7911v2',
 'math/9711204v1',
 '2208.00733v1',
 '2504.07020v1',
 '2403.03925v1']

In [74]:
def extract_info(paper_id: str) -> str:
    """
    Search for information about a specific paper across all topic directories.

    Every immediate sub-directory of PAPER_DIR is checked for a
    ``papers_info.json`` file; the first one containing ``paper_id`` wins.

    Args:
        paper_id: The ID of the paper to look for

    Returns:
        JSON string with paper information if found, error message if not found
    """
    not_found = f"There's no saved information related to paper {paper_id}."

    # Nothing has been saved yet if the root directory does not exist; report
    # that as "not found" instead of letting os.listdir raise.
    try:
        topic_dirs = sorted(os.listdir(PAPER_DIR))
    except (FileNotFoundError, NotADirectoryError):
        return not_found

    for item in topic_dirs:
        file_path = os.path.join(PAPER_DIR, item, "papers_info.json")
        if not os.path.isfile(file_path):
            continue

        try:
            with open(file_path, "r", encoding="utf-8") as json_file:
                papers_info = json.load(json_file)
        except (OSError, json.JSONDecodeError) as e:
            # An unreadable or corrupt file must not hide results stored elsewhere
            print(f"Error reading {file_path}: {str(e)}")
            continue

        if isinstance(papers_info, dict) and paper_id in papers_info:
            return json.dumps(papers_info[paper_id], indent=2)

    return not_found

In [75]:
# Example run: look up one of the paper IDs returned by the search above.
extract_info('1310.7911v2')

'{\n  "title": "Compact manifolds with computable boundaries",\n  "authors": [\n    "Zvonko Iljazovic"\n  ],\n  "summary": "We investigate conditions under which a co-computably enumerable closed set\\nin a computable metric space is computable and prove that in each locally\\ncomputable computable metric space each co-computably enumerable compact\\nmanifold with computable boundary is computable. In fact, we examine the notion\\nof a semi-computable compact set and we prove a more general result: in any\\ncomputable metric space each semi-computable compact manifold with computable\\nboundary is computable. In particular, each semi-computable compact\\n(boundaryless) manifold is computable.",\n  "pdf_url": "http://arxiv.org/pdf/1310.7911v2",\n  "published": "2013-10-29"\n}'

## tools schema

In [76]:
# Function-calling schemas passed to the chat completion API, one per local tool.
# Each row below is: (name, description, JSON-schema properties, required args).
tools = [
    {
        "type": "function",
        "function": {
            "name": tool_name,
            "description": tool_description,
            "parameters": {
                "type": "object",
                "properties": tool_properties,
                "required": tool_required,
            },
        },
    }
    for tool_name, tool_description, tool_properties, tool_required in (
        (
            "search_papers",
            "Search for papers on arXiv based on a topic and store their information.",
            {
                "topic": {
                    "type": "string",
                    "description": "The topic to search for",
                },
                "max_results": {
                    "type": "integer",
                    "description": "Maximum number of results to retrieve",
                    "default": 5,
                },
            },
            ["topic"],
        ),
        (
            "extract_info",
            "Search for information about a specific paper across all topic directories.",
            {
                "paper_id": {
                    "type": "string",
                    "description": "The ID of the paper to look for",
                },
            },
            ["paper_id"],
        ),
    )
]

## tool mapping

In [77]:
# Dispatch table from tool name to the local function implementing it.
# Keys are taken from the functions' own names, which are the names
# advertised in the tool schemas.
mapping_tool_function = {
    tool_fn.__name__: tool_fn
    for tool_fn in (search_papers, extract_info)
}

def execute_tool(tool_name, tool_args):
    """
    Run a registered tool and normalise its result to a string.

    Args:
        tool_name: Key into ``mapping_tool_function`` naming the tool to run
        tool_args: Dict of keyword arguments forwarded to the tool

    Returns:
        The tool's result as text: a placeholder sentence for ``None``, a
        comma-separated string for lists, formatted JSON for dicts, and
        ``str(result)`` for anything else

    Raises:
        KeyError: If ``tool_name`` is not a registered tool
    """
    result = mapping_tool_function[tool_name](**tool_args)

    if result is None:
        return "The operation completed but didn't return any results."

    if isinstance(result, list):
        # Stringify each item first so lists of non-string values
        # (numbers, dicts, ...) do not make str.join raise TypeError.
        return ', '.join(map(str, result))

    if isinstance(result, dict):
        # Convert dictionaries to formatted JSON strings
        return json.dumps(result, indent=2)

    # For any other type, convert using str()
    return str(result)

In [78]:
# Load variables from a .env file into the process environment.
load_dotenv()

# The Groq API key is read from the 'Groq_Cloud' environment variable.
api_key = os.getenv('Groq_Cloud')

# Shared Groq client used for every chat completion request below.
client = Groq(api_key=api_key)

print(client)

<groq.Groq object at 0x715313d01940>


In [88]:
import json

def process_query(query, *, model='llama3-70b-8192', max_tokens=1024, max_turns=10):
    """
    Answer a single user query, letting the model call the registered tools.

    The model is queried repeatedly: any text it produces is printed, and any
    tool calls it requests are executed via ``execute_tool`` with the results
    fed back to it. The loop ends when the model replies without requesting
    a tool, or after ``max_turns`` model requests.

    Args:
        query: The user's question
        model: Name of the chat model to use
        max_tokens: Maximum number of tokens per model response
        max_turns: Upper bound on model requests for this query, so that a
            model which keeps requesting tools cannot loop forever

    Returns:
        None; all output is printed
    """
    messages = [{'role': 'user', 'content': query}]

    for _ in range(max_turns):
        response = client.chat.completions.create(
            max_tokens=max_tokens,
            model=model,
            tools=tools,
            messages=messages,
        )

        # Access the message from the first choice
        message = response.choices[0].message

        # Handle text response
        if message.content:
            print(message.content)

        # No tool requested: this is the final answer (or an empty reply),
        # so there is nothing left to do.
        if not message.tool_calls:
            return

        # Record the assistant turn together with its tool calls so that the
        # tool results appended below can be matched to them by id.
        messages.append({
            'role': 'assistant',
            'content': message.content,
            'tool_calls': [
                {
                    'id': tool_call.id,
                    'type': 'function',
                    'function': {
                        'name': tool_call.function.name,
                        'arguments': tool_call.function.arguments,
                    },
                }
                for tool_call in message.tool_calls
            ],
        })

        # Run every requested tool before asking the model again
        for tool_call in message.tool_calls:
            tool_name = tool_call.function.name
            try:
                tool_args = json.loads(tool_call.function.arguments)
                print(f"Calling tool {tool_name} with args {tool_args}")
                result = execute_tool(tool_name, tool_args)
            except Exception as exc:
                # Boundary with model-chosen input: report the failure back
                # to the model instead of aborting the whole query.
                result = f"Error while running tool {tool_name}: {exc}"
                print(result)

            messages.append({
                'role': 'tool',
                'tool_call_id': tool_call.id,
                'name': tool_name,
                'content': result,
            })

    print(f"Stopped after {max_turns} model turns without a final answer.")

In [89]:
def chat_loop():
    """
    Run an interactive prompt that forwards each query to ``process_query``.

    The loop ends when the user types 'quit' (case-insensitive), when input
    is exhausted (EOF) or when the prompt is interrupted. Blank lines are
    ignored. Errors raised while processing a query are printed and the
    loop continues with the next prompt.
    """
    print("Type your queries or 'quit' to exit.")
    while True:
        try:
            query = input("\nQuery: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin would otherwise raise EOFError on every iteration
            # and be swallowed by the generic handler below, looping forever.
            print()
            break

        if query.lower() == 'quit':
            break
        if not query:
            # Nothing to ask; prompt again instead of sending an empty query
            continue

        try:
            process_query(query)
            print("\n")
        except Exception as e:
            # Keep the session alive when a single query fails
            print(f"\nError: {str(e)}")

In [90]:
# Start the interactive session; type 'quit' at the prompt to leave it.
chat_loop()

Type your queries or 'quit' to exit.
Calling tool search_papers with args {'topic': 'Agentic AI', 'max_results': 5}
Results are saved in: papers/agentic_ai/papers_info.json
Calling tool extract_info with args {'paper_id': '2502.18359v1'}
Based on the search results and the extracted information from the paper "Responsible AI Agents", it seems that Agentic AI is an area of research focused on responsible AI agents. These agents are capable of executing tasks, such as booking trips or posting content on social media. However, there are concerns about the potential misuses of AI agents, including rogue commerce, manipulation, defamation, and intellectual property harms. The paper argues that core aspects of software interactions can discipline AI agents and prevent undesired actions. It also proposes a computer-science approach to align AI agents with user norms and values, which can mitigate perceived risks. Finally, the paper asserts that AI agents should not be given legal personhood s