In [None]:
import sqlite3


# Set up the database
def setup_database(db_path: str = 'literature.db'):
    """
    Create the SQLite database file (if needed) and ensure all tables exist.

    Every statement is ``CREATE TABLE IF NOT EXISTS``, so calling this
    function repeatedly on the same file is safe and leaves existing rows
    untouched.

    Args:
        db_path: Path of the SQLite database file. It is created when missing.

    Raises:
        sqlite3.Error: If the database cannot be opened or a statement fails.
            Once the connection has been opened it is always closed before
            the error propagates.
    """
    table_statements = (
        # Main table for papers
        """
    CREATE TABLE IF NOT EXISTS papers (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        doi TEXT UNIQUE NOT NULL,
        title TEXT,
        publication_year INTEGER,
        authors TEXT,
        venue TEXT,
        volume TEXT,
        publication_type TEXT,
        publication_source TEXT,
        processed BOOLEAN DEFAULT 0,
        file_path TEXT DEFAULT NULL
    )
    """,
        # Table for paper assessments (at most one row per paper, because
        # paper_id is the primary key)
        """
    CREATE TABLE IF NOT EXISTS paper_assessments (
        paper_id INTEGER PRIMARY KEY,
        is_neurosymbolic BOOLEAN,
        is_development BOOLEAN,
        paper_type TEXT,
        summary TEXT,
        takeaways TEXT,
        assessment_date TIMESTAMP,
        FOREIGN KEY (paper_id) REFERENCES papers (id)
    )
    """,
        # Table for keywords
        """
    CREATE TABLE IF NOT EXISTS keywords (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        keyword TEXT UNIQUE
    )
    """,
        # Relationship table for keywords and papers (many-to-many)
        """
    CREATE TABLE IF NOT EXISTS rel_keywords_papers (
        paper_id INTEGER,
        keyword_id INTEGER,
        FOREIGN KEY (paper_id) REFERENCES papers (id),
        FOREIGN KEY (keyword_id) REFERENCES keywords (id)
    )
    """,
    )

    # NOTE: SQLite only enforces the FOREIGN KEY clauses above on connections
    # that have run `PRAGMA foreign_keys = ON`; this function does not enable it.
    connector = sqlite3.connect(db_path)
    try:
        cursor = connector.cursor()
        for statement in table_statements:
            cursor.execute(statement)
        connector.commit()
    finally:
        # Release the file handle even when a statement fails.
        connector.close()

In [None]:
from PyPDF2 import PdfReader


def extract_text_from_pdf(file_path: str) -> str:
    """
    Return the text of every page of a PDF, joined with newlines.

    Pages for which PyPDF2 yields no text contribute an empty string.
    Any failure while opening or reading the file is reported on stdout
    and an empty string is returned instead of raising.
    """
    try:
        pdf = PdfReader(file_path)
        return "\n".join((pdf_page.extract_text() or "") for pdf_page in pdf.pages)
    except Exception as e:
        print(f"Error extracting text from {file_path}: {e}")
        return ""


def get_first_page_text(file_path: str) -> str:
    """
    Return the text of page one of a PDF.

    The result is an empty string when the document has no pages, when the
    first page yields no text, or when reading fails (the error is printed
    to stdout rather than raised).
    """
    try:
        pdf_pages = PdfReader(file_path).pages
        if len(pdf_pages) == 0:
            return ""
        return pdf_pages[0].extract_text() or ""
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""


# Prompt template that asks the agent to extract both the abstract and the
# keywords of a paper and to answer in JSON.
# - `{first_page_text}` is the only placeholder; it is filled in with
#   `str.format`, which is why the braces of the JSON example are doubled
#   (`{{` / `}}`) so they come out as literal braces.
# - If the first page doesn't contain the abstract or keywords, the agent is
#   told it may call the 'PaperRetriever' tool to query the rest of the paper.
ABSTRACT_KEYWORDS_PROMPT = """
You are extracting information from a research paper.

Below is the text of the first page:
{first_page_text}

Your goal:
1. Extract the paper's abstract.
2. Extract the paper's keywords (as a list of words or phrases).

If the first page does not contain the abstract, the full abstract, or the keywords, 
you have access to a 'PaperRetriever' tool that can retrieve more text from the paper.

Return valid JSON with the structure:
{{
  "abstract": "...",
  "keywords": ["...", "..."]
}}

- If no abstract is found, use an empty string: "abstract": ""
- If no keywords are found, use an empty list: "keywords": []
"""

In [None]:
from langchain.vectorstores import Chroma
from langchain.agents import Tool


def create_paper_retriever_tool(vectorstore: Chroma) -> Tool:
    """
    Wrap a Chroma vector store in an agent-callable tool named "PaperRetriever".

    The tool takes a free-text query, runs a similarity search limited to the
    two best matches (k=2), and returns their page contents separated by a
    blank line.
    """
    def retrieval_tool(query: str) -> str:
        matches = vectorstore.similarity_search(query, k=2)
        return "\n\n".join(match.page_content for match in matches)

    # The description is what the agent reads when deciding whether to call the tool.
    tool_description = (
        "Retrieves relevant text from the stored papers for the query. "
        "Use it if you need to find the abstract or the keywords that are not on the first page."
    )
    return Tool(name="PaperRetriever", func=retrieval_tool, description=tool_description)

In [None]:
import json
import os
import sqlite3
from datetime import datetime

from dotenv import load_dotenv, find_dotenv
from langchain.agents import initialize_agent, AgentType, Tool
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# Load environment variables from the nearest .env file before any client is
# constructed (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv(find_dotenv())

# Module-level connection to the literature database; the file name matches
# the default used by setup_database(). Closed at the end of the script.
db_path: str = 'literature.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# 1) LLM: chat model that drives the extraction agent. Temperature 0.0 is
#    requested to keep the output as repeatable as the model allows.
llm = ChatOpenAI(
    model="gpt-4o-mini", # other model name noted by the author: gpt-4-turbo
    temperature=0.0
)

# 2) Embeddings: OpenAI embedding client with its default settings.
embeddings = OpenAIEmbeddings()

# 3) Chroma vector store: a single collection holding the text of all papers,
#    backed by the local directory below.
persist_directory = "./chroma_store"
vectorstore = Chroma(
    collection_name="papers_collection",
    embedding_function=embeddings,
    persist_directory=persist_directory
)

# 4) Tools
def create_paper_retriever_tool(vectorstore: Chroma) -> Tool:
    """
    Build the "PaperRetriever" tool on top of the given Chroma vector store.

    The tool runs a similarity search for the query, keeps the two best
    matches (k=2) and returns their page contents joined by a blank line.

    NOTE: this redefines the function of the same name declared earlier in
    this file; this version carries a shorter tool description.
    """
    def retrieval_tool(query: str) -> str:
        hits = vectorstore.similarity_search(query, k=2)
        return "\n\n".join(hit.page_content for hit in hits)

    tool_description = "Retrieves relevant text from the stored papers for the query."
    return Tool(name="PaperRetriever", func=retrieval_tool, description=tool_description)

# The single tool offered to the agent: similarity search over the shared
# vector store defined above.
retriever_tool = create_paper_retriever_tool(vectorstore)

# 5) Agent: a chat zero-shot ReAct agent that can call the retriever tool.
#    verbose=True presumably echoes the agent's intermediate steps to stdout,
#    which is useful while debugging prompts — confirm against LangChain docs.
agent = initialize_agent(
    tools=[retriever_tool],
    llm=llm,
    agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

# 6) Select the papers to process and run the extraction agent on each one.

# Restricts the run to a single paper id. Set to None to process every paper
# that has a PDF path stored in the database.
TARGET_PAPER_ID = 7


def _parse_extraction(raw_response):
    """
    Turn the agent's reply into an ``(abstract, keywords)`` pair.

    The reply is expected to be a JSON object with the keys "abstract" and
    "keywords". Text surrounding the object (for example a Markdown code
    fence) is ignored. On any parsing problem a message is printed and
    ``("", [])`` is returned, so the caller never has to handle an exception.
    """
    text = raw_response if isinstance(raw_response, str) else str(raw_response)

    # Chat models often wrap JSON in a code fence or add prose around it, so
    # parse only the outermost {...} span when one exists.
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]

    try:
        extraction = json.loads(text)
    except json.JSONDecodeError as decode_err:
        print("Could not parse JSON from agent response:", decode_err)
        return "", []

    # Valid JSON that is not an object (a list, a bare string, ...) has no keys.
    if not isinstance(extraction, dict):
        print("Could not parse JSON from agent response: expected a JSON object")
        return "", []

    abstract = extraction.get("abstract") or ""
    keywords = extraction.get("keywords") or []
    if not isinstance(keywords, list):
        # Tolerate a single keyword given as a scalar instead of a list.
        keywords = [keywords]
    return str(abstract), [str(keyword) for keyword in keywords]


try:
    paper_query = """
        SELECT id, title, file_path
        FROM papers
        WHERE file_path IS NOT NULL
    """
    query_params = ()
    if TARGET_PAPER_ID is not None:
        paper_query += " AND id = ?"
        query_params = (TARGET_PAPER_ID,)
    cursor.execute(paper_query, query_params)
    paper_rows = cursor.fetchall()

    for paper_id, title, file_path in paper_rows:
        print(f"Processing Paper ID={paper_id}: {title}")

        # a) Extract the full text and the first page text
        full_text = extract_text_from_pdf(file_path)
        first_page = get_first_page_text(file_path)
        if not full_text.strip():
            # Nothing to embed or to show the agent (unreadable or image-only
            # PDF), so do not spend an embedding and LLM call on it.
            print(f"No text could be extracted for paper {paper_id}; skipping.")
            continue

        try:
            # b) Insert the entire PDF text into Chroma, so the agent can
            #    retrieve from it if needed. A stable id makes re-runs replace
            #    the stored document instead of adding a duplicate.
            #    The title column is nullable, so fall back to an empty string
            #    to keep the metadata values scalar.
            # NOTE(review): the paper is stored as one document, so the
            # retriever returns whole papers; consider chunking the text.
            metadata = {"paper_id": paper_id, "title": title or ""}
            doc = Document(page_content=full_text, metadata=metadata)
            vectorstore.add_documents([doc], ids=[f"paper-{paper_id}"])

            # c) Prepare the prompt
            prompt = ABSTRACT_KEYWORDS_PROMPT.format(first_page_text=first_page)

            # d) Run the agent
            agent_response = agent.run(prompt)
            print("Agent raw response:\n", agent_response)
        except Exception as e:
            # Per-paper boundary: report the failure and move on to the next
            # paper instead of aborting the whole run.
            print(f"Error extracting data for paper {paper_id}: {e}")
            continue

        # e) Parse JSON
        abstract, keywords = _parse_extraction(agent_response)

        # f) Results are only printed for now. To persist them, the 'papers'
        #    table would first need two extra columns, e.g.:
        #        ALTER TABLE papers ADD COLUMN extracted_abstract TEXT;
        #        ALTER TABLE papers ADD COLUMN extracted_keywords TEXT;
        #    followed by a parameterized update:
        #        cursor.execute(
        #            "UPDATE papers SET extracted_abstract = ?, "
        #            "extracted_keywords = ? WHERE id = ?",
        #            (abstract, ", ".join(keywords), paper_id),
        #        )
        #        conn.commit()
        print(f"Extracted Abstract:\n{abstract}")
        print(f"Extracted Keywords: {keywords}")
finally:
    # Close DB even when the query or the loop fails.
    cursor.close()
    conn.close()