In [None]:
import pymupdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.schema import Document
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain.memory import ConversationBufferMemory
import os
import re

from dotenv import load_dotenv

load_dotenv()

# Research Paper Analysis Agent

This notebook implements an LLM-based research agent that can analyze and answer questions about research papers. The agent extracts content from PDF research papers, creates a searchable knowledge base, and provides contextual answers with conversation memory.

In [None]:
def extract_paper_sections(pdf_path):
    """Extract sections from a research paper PDF.

    Every non-blank text line of every page is lower-cased and tested against
    a small set of heading patterns. A matching line starts a new section
    keyed by that lower-cased line; every other line is appended to the
    current section. Text that appears before the first heading is stored
    under ``"content"``.

    When the same heading text occurs more than once, the text is appended to
    the existing section instead of replacing it.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        dict mapping section name to its newline-joined text, ordered by when
        each section first received text. Headings with no body are omitted.
    """
    # Compiled once because they are tested against every line of the document.
    # NOTE: these are prefix matches, so a body line that merely starts with
    # one of the keywords (e.g. "results show ...") is also taken as a heading.
    header_patterns = [
        re.compile(r'^(abstract|introduction|related work|methodology|method|approach|implementation|results|discussion|conclusion|references|acknowledgments)'),
        re.compile(r'^\d+\.?\s+(abstract|introduction|related work|methodology|method|approach|implementation|results|discussion|conclusion|references|acknowledgments)'),
        re.compile(r'^\d+\.\d+\.?\s+.*'),  # Subsections like 2.1, 3.2
    ]

    section_lines = {}  # section name -> list of body lines
    current_section = "content"

    # The context manager closes the document even if text extraction fails.
    with pymupdf.open(pdf_path) as doc:
        print(f"Extracting content from: {pdf_path}")
        print(f"Total pages: {doc.page_count}")

        for page in doc:
            for raw_line in page.get_text().split('\n'):
                line = raw_line.strip()
                if not line:
                    continue

                lowered = line.lower()
                if any(pattern.match(lowered) for pattern in header_patterns):
                    # The heading line itself becomes the key, not content.
                    current_section = lowered
                    continue

                # setdefault keeps earlier text when a heading repeats.
                section_lines.setdefault(current_section, []).append(line)

    sections = {name: '\n'.join(body) for name, body in section_lines.items()}

    print(f"Extracted sections: {list(sections.keys())}")
    return sections

def extract_paper_metadata(pdf_path):
    """Extract basic metadata from a research paper PDF.

    The title is guessed as the first first-page line longer than 10
    characters; the authors as the first of the next three such lines that
    contains ``@``, ``university``, ``institute`` or ``college``. When a
    heuristic finds nothing, the PDF's embedded ``title`` / ``author``
    metadata is used if present.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        dict with keys ``title``, ``authors``, ``filename`` (base name of
        ``pdf_path``) and ``total_pages``.
    """
    # Read everything needed while the document is open: the page count is
    # not available once the document has been closed.
    with pymupdf.open(pdf_path) as doc:
        total_pages = doc.page_count
        embedded = doc.metadata or {}
        # A document with no pages has no first page to inspect.
        first_page = doc[0].get_text() if total_pages else ""

    title = "Unknown Title"
    authors = "Unknown Authors"

    # Simple heuristic: the title is usually the first substantial line.
    stripped = (line.strip() for line in first_page.split('\n'))
    substantial_lines = [line for line in stripped if len(line) > 10]
    if substantial_lines:
        title = substantial_lines[0]
        # Look for an author-looking line (emails, affiliations) just below it.
        for line in substantial_lines[1:4]:
            lowered = line.lower()
            if any(indicator in lowered for indicator in ['@', 'university', 'institute', 'college']):
                authors = line
                break

    # Fall back to the embedded document metadata when the heuristics fail.
    if title == "Unknown Title" and embedded.get('title'):
        title = embedded['title']
    if authors == "Unknown Authors" and embedded.get('author'):
        authors = embedded['author']

    return {
        'title': title,
        'authors': authors,
        'filename': os.path.basename(pdf_path),
        'total_pages': total_pages
    }

In [None]:
# URL Paper Retrieval Agent using LangChain
import requests
from urllib.parse import urlparse
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain_community.document_loaders import UnstructuredURLLoader
import tempfile
import urllib.request

def download_paper_from_url(url, save_path=None):
    """Download the resource at ``url`` to a local file.

    Args:
        url: Address of the paper to fetch.
        save_path: Destination path. When ``None``, the last path segment of
            the URL is used, with ``.pdf`` appended if it is missing, or
            ``downloaded_paper.pdf`` when the URL has no last segment.

    Returns:
        The path the file was written to, or ``None`` if anything failed
        (the error is printed, not raised).
    """
    try:
        # Derive a default file name from the last segment of the URL path.
        basename = urlparse(url).path.rsplit('/', 1)[-1]
        if not basename:
            basename = 'downloaded_paper.pdf'
        elif not basename.endswith('.pdf'):
            basename += '.pdf'

        target = basename if save_path is None else save_path

        print(f"Downloading paper from: {url}")
        print(f"Saving to: {target}")

        urllib.request.urlretrieve(url, target)

        print(f"Successfully downloaded: {target}")
        return target

    except Exception as e:
        print(f"Error downloading paper: {e}")
        return None

def extract_paper_from_url(url, use_temp_file=True):
    """Extract paper content directly from a URL using LangChain loaders.

    URLs containing ``pdf`` (case-insensitive) are loaded with
    ``PyPDFLoader``; anything else is loaded as a web page with
    ``WebBaseLoader``.

    Args:
        url: Address of the paper.
        use_temp_file: For PDF URLs, download to a temporary file first and
            parse that file. When ``False`` the URL is handed to
            ``PyPDFLoader`` directly.

    Returns:
        ``(full_text, metadata, documents)`` where ``full_text`` is the page
        contents joined by blank lines, ``metadata`` has ``source_url``,
        ``total_pages`` and ``content_length``, and ``documents`` is the
        loader output. On any failure the error is printed and
        ``(None, None, None)`` is returned.
    """
    try:
        print(f"Extracting paper content from URL: {url}")

        if 'pdf' in url.lower():
            if use_temp_file:
                # Create the file and close our handle right away so the
                # download and the loader can reopen it on every platform.
                fd, tmp_path = tempfile.mkstemp(suffix='.pdf')
                os.close(fd)
                try:
                    urllib.request.urlretrieve(url, tmp_path)
                    documents = PyPDFLoader(tmp_path).load()
                finally:
                    # Always remove the temp file, including when the
                    # download or the parse raised.
                    try:
                        os.unlink(tmp_path)
                    except OSError:
                        pass
            else:
                # Try direct URL loading (may not work for all PDFs)
                documents = PyPDFLoader(url).load()
        else:
            # For web pages, use WebBaseLoader
            documents = WebBaseLoader(url).load()

        print(f"Successfully extracted {len(documents)} pages/sections from URL")

        # Combine all document content
        full_text = "\n\n".join(doc.page_content for doc in documents)

        metadata = {
            'source_url': url,
            'total_pages': len(documents),
            'content_length': len(full_text)
        }

        return full_text, metadata, documents

    except Exception as e:
        print(f"Error extracting paper from URL: {e}")
        return None, None, None

def process_arxiv_url(arxiv_url):
    """Convert an arXiv abstract URL to the matching PDF URL.

    Any query string, fragment or trailing slash after the paper id is
    dropped so it does not end up inside the generated PDF URL.

    Args:
        arxiv_url: URL to convert.

    Returns:
        ``https://arxiv.org/pdf/<id>.pdf`` when ``arxiv_url`` contains
        ``arxiv.org/abs/`` followed by a paper id; otherwise ``arxiv_url``
        unchanged.
    """
    if 'arxiv.org/abs/' not in arxiv_url:
        return arxiv_url

    paper_id = arxiv_url.split('/abs/')[-1]
    # Strip "?query" and "#fragment" parts; they are not part of the id.
    for separator in ('?', '#'):
        paper_id = paper_id.split(separator, 1)[0]
    paper_id = paper_id.rstrip('/')

    if not paper_id:
        # Nothing to convert (e.g. ".../abs/" with no id).
        return arxiv_url
    return f"https://arxiv.org/pdf/{paper_id}.pdf"

def create_url_paper_agent(url):
    """Load a paper from ``url`` and publish it through the notebook globals.

    arXiv URLs are first passed through ``process_arxiv_url``. On success the
    module-level ``paper_metadata`` and ``paper_sections`` are replaced and a
    summary is printed. ``documents`` and ``vector_store`` are declared
    global but are not rebuilt here.

    Args:
        url: Paper URL (arXiv abstract/PDF URL, direct PDF URL or web page).

    Returns:
        ``True`` when the paper was loaded, ``False`` otherwise (the reason
        is printed).
    """
    global paper_sections, paper_metadata, documents, vector_store

    try:
        if 'arxiv.org' in url:
            url = process_arxiv_url(url)
            print(f"Converted to PDF URL: {url}")

        full_text, url_metadata, _loaded_pages = extract_paper_from_url(url)
        if full_text is None:
            print("Failed to extract paper content from URL")
            return False

        # Simplified section extraction for URL-based papers.
        parsed_sections = extract_sections_from_text(full_text)

        paper_metadata = dict(
            title=extract_title_from_text(full_text),
            authors=extract_authors_from_text(full_text),
            source_url=url,
            filename=url.split('/')[-1],
            total_pages=url_metadata['total_pages'],
            content_length=url_metadata['content_length'],
        )
        paper_sections = parsed_sections

        print("\n=== URL Paper Metadata ===")
        for field, field_value in paper_metadata.items():
            print(f"{field}: {field_value}")

        print("\n=== Sections Found ===")
        for name, body in paper_sections.items():
            print(f"- {name}: {len(body)} characters")

        return True

    except Exception as e:
        print(f"Error processing URL paper: {e}")
        return False

def extract_sections_from_text(text):
    """Split raw paper text into sections (simplified approach).

    The whole text is always returned under ``'full_content'``. If an
    abstract can be located (the word "abstract" followed by ``:``, ``-`` or
    a newline, up to the next "introduction", "keywords", numbered heading
    or end of text), it is added under ``'abstract'``.

    Args:
        text: Full text of the paper.

    Returns:
        dict with ``'full_content'`` and, when found, ``'abstract'``.
    """
    abstract_finder = re.compile(
        r'abstract\s*[:\-\n]\s*(.*?)(?=\n\s*(?:introduction|keywords|\d+\.|$))',
        re.IGNORECASE | re.DOTALL,
    )

    # No real section parsing yet: the full text is the only guaranteed entry.
    sections = {'full_content': text}

    found = abstract_finder.search(text)
    if found is not None:
        sections['abstract'] = found.group(1).strip()

    return sections

def extract_title_from_text(text):
    """Guess the paper title from its text (simple heuristic).

    Returns the first of the first ten lines that, once stripped, is longer
    than 20 characters and does not start with "abstract", "keywords" or
    "introduction" (case-insensitive); otherwise ``"Unknown Title from URL"``.
    """
    non_title_prefixes = ('abstract', 'keywords', 'introduction')
    head = (raw.strip() for raw in text.split('\n')[:10])
    return next(
        (
            candidate
            for candidate in head
            if len(candidate) > 20
            and not candidate.lower().startswith(non_title_prefixes)
        ),
        "Unknown Title from URL",
    )

def extract_authors_from_text(text):
    """Guess the author line from the paper text (simple heuristic).

    Scans lines 2 through 8 of the text and returns the first one (stripped)
    containing ``@``, "university", "institute", "college" or "department"
    (case-insensitive); otherwise ``"Unknown Authors from URL"``.
    """
    author_hints = ['@', 'university', 'institute', 'college', 'department']
    # The first line is skipped: it is treated as the title.
    for raw in text.split('\n')[1:8]:
        candidate = raw.strip()
        lowered = candidate.lower()
        for hint in author_hints:
            if hint in lowered:
                return candidate
    return "Unknown Authors from URL"

# Example usage functions
def load_paper_from_arxiv(arxiv_id):
    """Load a paper given only its arXiv identifier.

    Builds the ``https://arxiv.org/abs/<id>`` URL and returns whatever
    ``create_url_paper_agent`` returns for it.
    """
    return create_url_paper_agent(f"https://arxiv.org/abs/{arxiv_id}")

def load_paper_from_pdf_url(pdf_url):
    """Load a paper from a direct PDF URL.

    Thin alias: returns the result of ``create_url_paper_agent(pdf_url)``.
    """
    loaded = create_url_paper_agent(pdf_url)
    return loaded

# Announce that the URL helpers are defined and show how to call them.
print("\n".join((
    "URL Paper Retrieval Agent loaded successfully!",
    "Usage:",
    "- create_url_paper_agent('https://arxiv.org/abs/2301.xxxxx')",
    "- load_paper_from_arxiv('2301.xxxxx')",
    "- load_paper_from_pdf_url('https://example.com/paper.pdf')",
)))

## URL Paper Retrieval Agent

This section implements functionality to retrieve and process research papers directly from URLs using LangChain document loaders. 

**Supported Sources:**
- arXiv papers (both abstract and direct PDF URLs)
- Direct PDF URLs from any website
- Web pages containing research content

**Key Features:**
- Automatic arXiv URL conversion (abstract → PDF)
- Temporary file handling for secure downloads
- LangChain PyPDFLoader integration
- Automatic metadata extraction from URL content
- Seamless integration with the existing RAG pipeline

**Usage Examples:**
```python
# From arXiv (using paper ID)
load_paper_from_arxiv("2301.08727")

# From arXiv (using full URL)
create_url_paper_agent("https://arxiv.org/abs/2301.08727")

# From direct PDF URL
create_url_paper_agent("https://example.com/research_paper.pdf")
```

In [None]:
# Configure your research paper source here.
# The later cells read the module-level `paper_sections` and `paper_metadata`
# that this cell (or create_url_paper_agent) sets.
PAPER_SOURCE = "doc.pdf"  # Can be local file path or URL

# Anything starting with http:// or https:// is treated as a URL; everything
# else is treated as a local PDF path.
if PAPER_SOURCE.startswith(('http://', 'https://')):
    print("Loading paper from URL...")
    # On success create_url_paper_agent sets paper_sections / paper_metadata
    # itself (and prints its own summary before the one below).
    success = create_url_paper_agent(PAPER_SOURCE)
    if not success:
        print("Failed to load paper from URL. Please check the URL and try again.")
        # Placeholder values so the cells below still find the globals.
        # NOTE: the placeholder text is shorter than the 50-character minimum
        # used by the chunking cell, so no chunks are produced from it.
        paper_sections = {"error": "Failed to load from URL"}
        paper_metadata = {"title": "Error", "authors": "Unknown", "filename": "error"}
else:
    print("Loading paper from local file...")
    # Extract paper content and metadata from local file
    paper_sections = extract_paper_sections(PAPER_SOURCE)
    paper_metadata = extract_paper_metadata(PAPER_SOURCE)

# Summarize what was loaded.
print("\n=== Paper Metadata ===")
for key, value in paper_metadata.items():
    print(f"{key}: {value}")

print(f"\n=== Sections Found ===")
for section, content in paper_sections.items():
    print(f"- {section}: {len(content)} characters")

In [None]:
# Examples of URL paper loading.
# Uncomment and modify one of these examples to load a paper from a URL.
# As written, this cell only prints a reminder and the current PAPER_SOURCE.

# Example 1: Load from arXiv using paper ID
# load_paper_from_arxiv("2301.08727")  # Replace with actual arXiv ID

# Example 2: Load from arXiv using full URL
# create_url_paper_agent("https://arxiv.org/abs/2301.08727")  # Replace with actual arXiv URL

# Example 3: Load from direct PDF URL
# create_url_paper_agent("https://example.com/paper.pdf")  # Replace with actual PDF URL

# Example 4: Download and save paper locally first, then parse the local copy
# pdf_url = "https://arxiv.org/pdf/2301.08727.pdf"
# local_path = download_paper_from_url(pdf_url, "downloaded_paper.pdf")
# if local_path:
#     paper_sections = extract_paper_sections(local_path)
#     paper_metadata = extract_paper_metadata(local_path)

print("URL examples are ready to use!")
print("Uncomment and modify the examples above to load papers from URLs.")
print("Current source:", PAPER_SOURCE)

# Initialize LLM and Embeddings

In [None]:
# Chat model used to answer questions (last LLM step of research_chain).
# NOTE(review): presumably picks up the Groq API key from the environment
# populated by load_dotenv() at the top of the notebook — confirm.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0.1,  # low temperature for more consistent answers
    max_tokens=1000  # upper bound on the length of each generated answer
)

# Sentence-embedding model used to index and search the paper chunks.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"normalize_embeddings": True}
)

# Create Document Chunks and Vector Store

In [None]:
# Text splitter optimized for research papers
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Larger chunks for research content
    chunk_overlap=200,  # More overlap to preserve context
    length_function=len,
    separators=["\n\n", "\n", ". ", "! ", "? ", ", ", " ", ""]
)

# Create documents from paper sections: one Document per chunk, tagged with
# the paper's metadata and the section the chunk came from.
documents = []

for section_name, section_content in paper_sections.items():
    if len(section_content.strip()) < 50:  # Skip very short sections
        continue

    chunks = text_splitter.split_text(section_content)

    for i, chunk in enumerate(chunks):
        doc = Document(
            page_content=chunk,
            metadata={
                "source": paper_metadata['filename'],
                "paper_title": paper_metadata['title'],
                "authors": paper_metadata['authors'],
                "section": section_name,
                "chunk_id": i,  # position of the chunk within its section
                "chunk_size": len(chunk)
            }
        )
        documents.append(doc)

print(f"Created {len(documents)} document chunks from the research paper")
if documents:
    print(f"Average chunk size: {sum(len(doc.page_content) for doc in documents) // len(documents)} characters")
else:
    # Every section was shorter than 50 characters (e.g. the placeholder set
    # after a failed URL load); avoid dividing by zero and say so instead.
    print("Warning: no document chunks were created; check that the paper loaded correctly.")

### Vector Store Creation

In [None]:
# Create vector store from documents: embed every chunk with `embeddings`
# and index the vectors in FAISS.
# NOTE(review): `documents` is presumably expected to be non-empty here —
# confirm how FAISS.from_documents behaves when no chunks were created.
vector_store = FAISS.from_documents(
    documents=documents,
    embedding=embeddings
)

print("Vector store created successfully!")
print(f"Total documents indexed: {len(documents)}")

# Research Agent RAG Pipeline

In [None]:
# Create retriever with research-focused search over the FAISS index.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6}  # Retrieve top 6 most relevant chunks for research
)

# Enhanced prompt template for research paper analysis.
# The five placeholders are filled by research_parallel_chain: paper_title and
# authors come from paper_metadata, context from the retriever, and question /
# chat_history from the caller's input dict.
research_prompt = PromptTemplate(
    template="""You are an expert research assistant specializing in academic paper analysis.

PAPER INFORMATION:
Title: {paper_title}
Authors: {authors}

INSTRUCTIONS:
- Analyze the provided research paper content to answer questions accurately
- Reference specific sections, methodologies, results, and findings when relevant
- Maintain academic rigor and cite evidence from the paper
- Use the conversation history to provide coherent, contextual responses
- If the question requires information not in the provided context, clearly state the limitations
- For technical questions, explain concepts clearly while maintaining accuracy

CONVERSATION HISTORY:
{chat_history}

RELEVANT PAPER CONTENT:
{context}

RESEARCH QUESTION: {question}

DETAILED ANALYSIS:""",
    input_variables=['context', 'question', 'chat_history', 'paper_title', 'authors']
)

In [None]:
def format_research_docs(retrieved_docs):
    """Format retrieved documents with their section information.

    Each document becomes ``[Section: <name>]`` followed by its text, and
    consecutive documents are separated by a 50-character ``=`` rule
    surrounded by blank lines.

    Args:
        retrieved_docs: Iterable of objects exposing ``metadata`` (a dict)
            and ``page_content`` (a string).

    Returns:
        The formatted context string; ``""`` when there are no documents.
    """
    formatted_content = [
        f"[Section: {doc.metadata.get('section', 'Unknown Section')}]\n{doc.page_content}"
        for doc in retrieved_docs
    ]

    # Build the separator first: a method call binds tighter than `+`, so
    # `"\n\n" + "=" * 50 + "\n\n".join(...)` would join with "\n\n" only and
    # emit the rule just once, in front of the first document.
    separator = "\n\n" + "=" * 50 + "\n\n"
    return separator.join(formatted_content)

def extract_question(inputs):
    """Return the ``'question'`` entry of the chain input, or ``''`` if absent."""
    question = inputs.get('question', '')
    return question

def extract_chat_history(inputs):
    """Return the ``'chat_history'`` entry of the chain input, or ``''`` if absent."""
    history = inputs.get('chat_history', '')
    return history

def get_paper_metadata(inputs):
    """Return the current paper's title and authors for the prompt.

    ``inputs`` is accepted for compatibility with runnable callbacks but is
    not used; the values are read from the module-level ``paper_metadata``.
    """
    title = paper_metadata['title']
    authors = paper_metadata['authors']
    return dict(paper_title=title, authors=authors)

# Parallel processing chain for research agent.
# Builds the dict of variables that research_prompt expects from the caller's
# input dict ({'question': ..., 'chat_history': ...}).
research_parallel_chain = RunnableParallel({
    # question -> retriever -> formatted context string
    'context': RunnableLambda(extract_question) | retriever | RunnableLambda(format_research_docs),
    'question': RunnableLambda(extract_question),
    'chat_history': RunnableLambda(extract_chat_history),
    # The lambdas look up the module-level paper_metadata each time they are
    # called, so they reflect whichever paper is loaded at that moment.
    'paper_title': RunnableLambda(lambda x: paper_metadata['title']),
    'authors': RunnableLambda(lambda x: paper_metadata['authors'])
})

In [None]:
# Complete research agent chain:
# gather prompt variables -> fill the prompt -> call the LLM -> parse with
# StrOutputParser.
parser = StrOutputParser()
research_chain = research_parallel_chain | research_prompt | llm | parser

In [None]:
# Research agent with conversation memory.
# The keys match what research_qa_with_memory saves ("question" / "answer")
# and reads back ("chat_history").
# NOTE(review): return_messages=False presumably makes the history a single
# string rather than a list of messages — the helpers below call .strip() on
# it; confirm against the LangChain version in use.
research_memory = ConversationBufferMemory(
    memory_key="chat_history",
    input_key="question",
    output_key="answer",
    return_messages=False
)

def display_research_history():
    """Print the stored research conversation, or a notice when it is empty."""
    history = research_memory.load_memory_variables({})["chat_history"]

    # Nothing stored yet (empty or whitespace-only history).
    if not history or not history.strip():
        print("No research conversation history yet.")
        return

    print("=== RESEARCH CONVERSATION HISTORY ===")
    print(history)
    print("=== END HISTORY ===")

def clear_research_history():
    """Erase everything stored in ``research_memory`` and confirm on stdout."""
    research_memory.clear()
    confirmation = "Research conversation history cleared."
    print(confirmation)

def research_qa_with_memory(question):
    """Answer ``question`` with the research chain, using and updating memory.

    The stored conversation is passed to the chain as ``chat_history`` and
    the new question/answer pair is saved back to ``research_memory``.

    Args:
        question: The research question to ask.

    Returns:
        The chain's answer (the ``"answer"`` entry if the chain returns a
        dict containing one, otherwise the raw result).
    """
    stored = research_memory.load_memory_variables({})["chat_history"]

    # The prompt needs plain text: flatten a message list, map None to "".
    if isinstance(stored, list):
        history_text = "\n".join(str(item) for item in stored)
    elif stored is None:
        history_text = ""
    else:
        history_text = stored

    chain_input = {"question": question, "chat_history": history_text}
    result = research_chain.invoke(chain_input)

    # Unwrap dict-shaped results that carry the text under "answer".
    answer = result["answer"] if isinstance(result, dict) and "answer" in result else result

    research_memory.save_context({"question": question}, {"answer": answer})
    return answer

def interactive_research_chat():
    """Run an interactive question/answer session on the loaded paper.

    Reads lines from stdin until the user types ``quit`` or input ends
    (EOF / Ctrl-C). The commands ``history``, ``clear``, ``sections`` and
    ``metadata`` are handled locally; any other non-blank line is sent to
    ``research_qa_with_memory``. Commands are matched case-insensitively
    after surrounding whitespace is removed. Errors raised while answering
    are printed and the session continues.
    """
    print("=== Research Paper Analysis Agent ===")
    print(f"Analyzing: {paper_metadata['title']}")
    print(f"Authors: {paper_metadata['authors']}")
    print("\nCommands:")
    print("- Type your research questions normally")
    print("- 'quit' to exit")
    print("- 'history' to see conversation history")
    print("- 'clear' to clear history")
    print("- 'sections' to see available paper sections")
    print("- 'metadata' to see paper information")

    while True:
        try:
            user_question = input("\nResearcher: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C: leave the session cleanly instead of
            # surfacing a traceback.
            print("\nEnding research session.")
            break

        # Don't spend an LLM call (or a memory entry) on a blank line.
        if not user_question:
            continue

        command = user_question.lower()
        if command == 'quit':
            break
        elif command == 'history':
            display_research_history()
            continue
        elif command == 'clear':
            clear_research_history()
            continue
        elif command == 'sections':
            print("Available paper sections:")
            for section in paper_sections.keys():
                print(f"- {section}")
            continue
        elif command == 'metadata':
            print("Paper Information:")
            for key, value in paper_metadata.items():
                print(f"- {key}: {value}")
            continue

        try:
            response = research_qa_with_memory(user_question)
            print(f"\nAgent: {response}\n")
        except Exception as e:
            print(f"Error: {e}")

# Function for single question testing
def ask_research_question(question):
    """Ask one question, print the question and its answer, and return the answer."""
    answer = research_qa_with_memory(question)
    for label, text in (("Question", question), ("Answer", answer)):
        print(f"{label}: {text}")
    return answer

# Test the Research Agent

In [None]:
# Test with some sample research questions
print("=== Testing Research Agent ===\n")

# Test questions - modify these based on your research paper
test_questions = [
    "What is the main contribution of this paper?",
    "What methodology does this paper use?",
    "What are the key findings or results?",
    "What are the limitations mentioned in the paper?"
]

for question in test_questions:
    print(f"Q: {question}")
    try:
        # Call the QA function directly: ask_research_question() prints the
        # question and the answer itself, which made every pair appear twice
        # in this cell's output.
        answer = research_qa_with_memory(question)
        print(f"A: {answer}\n")
        print("-" * 80)
    except Exception as e:
        print(f"Error: {e}\n")

In [None]:
# Start interactive research session.
# NOTE: this call is active and blocks on input() until 'quit' is entered;
# comment it out to run the rest of the notebook without the chat.
interactive_research_chat()

In [None]:
# Additional utility functions for research analysis

def get_section_summary(section_name):
    """Ask the research agent for a concise summary of one section.

    The section must exist in ``paper_sections`` (compared in lower case)
    and have at least one chunk in ``documents``.

    NOTE: the chunk lookup is only an existence check. The question itself
    goes through ``research_qa_with_memory``, which uses the regular
    retriever rather than one restricted to this section.

    Args:
        section_name: Name of the section to summarize.

    Returns:
        The agent's answer, or an explanatory message when the section is
        unknown or has no chunks.
    """
    wanted = section_name.lower()

    if wanted not in paper_sections:
        known_sections = list(paper_sections.keys())
        return f"Section '{section_name}' not found. Available sections: {known_sections}"

    prompt_text = f"Please provide a concise summary of the {section_name} section"

    matching_chunks = [
        chunk for chunk in documents
        if chunk.metadata.get('section', '').lower() == wanted
    ]
    if not matching_chunks:
        return f"Section '{section_name}' not found or has no content."

    return research_qa_with_memory(prompt_text)

def compare_with_related_work():
    """Ask a fixed set of related-work questions and print each answer.

    A failure on one question is printed and does not stop the others.
    """
    print("=== Related Work Analysis ===")

    divider = "-" * 60
    prompts = (
        "What related work does this paper cite?",
        "How does this work differ from previous approaches?",
        "What gaps in existing research does this paper address?",
    )
    for prompt in prompts:
        try:
            reply = research_qa_with_memory(prompt)
            print(f"\nQ: {prompt}")
            print(f"A: {reply}")
            print(divider)
        except Exception as e:
            print(f"Error analyzing related work: {e}")

def analyze_methodology():
    """Ask a fixed set of methodology questions and print each answer.

    A failure on one question is printed and does not stop the others.
    """
    print("=== Methodology Analysis ===")

    divider = "-" * 60
    prompts = (
        "What is the experimental setup?",
        "What datasets are used?",
        "What evaluation metrics are employed?",
        "What are the implementation details?",
    )
    for prompt in prompts:
        try:
            reply = research_qa_with_memory(prompt)
            print(f"\nQ: {prompt}")
            print(f"A: {reply}")
            print(divider)
        except Exception as e:
            print(f"Error analyzing methodology: {e}")

# Example usage:
# print(get_section_summary("introduction"))
# compare_with_related_work()
# analyze_methodology()

# 🚀 Streamlit Dashboard

A complete web-based dashboard is provided in `research_dashboard.py`. It offers:

## Features:
- **📂 Paper Upload**: Upload PDF files or enter URLs (including arXiv)
- **💾 Automatic Saving**: Papers are saved to a `papers/` folder for future use
- **📚 Paper Library**: Browse and reload previously saved papers
- **💬 Interactive Chat**: Natural language conversation with your research papers
- **📋 Paper Information**: Display metadata and sections
- **🚀 Quick Actions**: Pre-built queries for common research questions
- **📑 Section Explorer**: Browse paper sections interactively
- **🗂️ Paper Management**: Delete unwanted papers to save space

## How to Run:

1. **Install Streamlit** (if not already installed):
   ```bash
   pip install streamlit
   ```

2. **Run the dashboard**:
   ```bash
   streamlit run research_dashboard.py
   ```

3. **Open your browser** and navigate to the displayed URL (usually `http://localhost:8501`)

## Dashboard Components:

### Sidebar:
- Paper upload/URL input
- Paper metadata display
- Saved papers library with reload functionality
- Paper management (view, load, delete)
- Clear paper option

### Main Area:
- Chat interface with conversation history
- Quick action buttons (Summarize, Methodology, Key Findings)
- Paper sections explorer

### File Organization:
- **`papers/` folder**: All downloaded and uploaded PDFs are saved here
- **Persistent storage**: Papers remain available between sessions
- **Smart naming**: arXiv papers are named with their ID for easy identification

### Supported Sources:
- **Local PDF files** - Upload directly (saved to papers folder)
- **arXiv papers** - Enter abstract or PDF URLs (auto-downloaded and saved)
- **Direct PDF URLs** - Any publicly accessible PDF (downloaded and saved)

## New File Management Features:

### Automatic Paper Saving:
- **Uploaded PDFs**: Saved to `papers/` folder with original filename
- **Downloaded papers**: Saved with descriptive names (e.g., `arxiv_1706.03762.pdf`)
- **Persistent storage**: Papers remain available for future sessions

### Paper Library:
- **Browse saved papers**: View all papers in the sidebar
- **Quick reload**: Select and load any previously saved paper
- **File management**: Delete papers you no longer need
- **Storage info**: See file sizes and manage disk space

The dashboard integrates all the functionality from this notebook into a user-friendly web interface with persistent paper storage!