In [10]:
# Quick connectivity check against the Anthropic API: list the models this
# account can use before wiring one into the RAG pipeline below.
import anthropic

# No api_key argument is passed here; presumably the SDK resolves credentials
# from the environment — TODO confirm ANTHROPIC_API_KEY is set before this runs.
client = anthropic.Anthropic()

# Ask for at most 20 entries. As the last expression in the cell, the returned
# page object is rendered as the cell output.
client.models.list(limit=20)

SyncPage[ModelInfo](data=[ModelInfo(id='claude-opus-4-1-20250805', created_at=datetime.datetime(2025, 8, 5, 0, 0, tzinfo=datetime.timezone.utc), display_name='Claude Opus 4.1', type='model'), ModelInfo(id='claude-opus-4-20250514', created_at=datetime.datetime(2025, 5, 22, 0, 0, tzinfo=datetime.timezone.utc), display_name='Claude Opus 4', type='model'), ModelInfo(id='claude-sonnet-4-20250514', created_at=datetime.datetime(2025, 5, 22, 0, 0, tzinfo=datetime.timezone.utc), display_name='Claude Sonnet 4', type='model'), ModelInfo(id='claude-3-7-sonnet-20250219', created_at=datetime.datetime(2025, 2, 24, 0, 0, tzinfo=datetime.timezone.utc), display_name='Claude Sonnet 3.7', type='model'), ModelInfo(id='claude-3-5-haiku-20241022', created_at=datetime.datetime(2024, 10, 22, 0, 0, tzinfo=datetime.timezone.utc), display_name='Claude Haiku 3.5', type='model'), ModelInfo(id='claude-3-haiku-20240307', created_at=datetime.datetime(2024, 3, 7, 0, 0, tzinfo=datetime.timezone.utc), display_name='Claud

In [28]:
# One-time environment setup. Every install line is commented out so that a
# normal "Run All" does not reinstall packages; uncomment the ones you need.
# NOTE(review): when re-enabling, prefer `%pip install ...` over `!pip install ...`
# so the install targets the running kernel's environment, and consider pinning
# versions for reproducibility.
#!pip install langchain_community
#!pip install langchain_anthropic
#!pip install langchain_huggingface langchain_chroma
#!pip install sentence_transformers
#!pip install pypdf

Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.0.0-py3-none-any.whl (310 kB)
Installing collected packages: pypdf
Successfully installed pypdf-6.0.0

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: pip install --upgrade pip


In [29]:
import hashlib
import os

from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_anthropic import ChatAnthropic
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import chromadb

In [15]:
# Load the .env file so that ANTHROPIC_API_KEY (read with os.getenv when a
# ResearchPaperRAG is constructed) is present in the process environment.
load_dotenv()

True

In [33]:
class ResearchPaperRAG:
    """Retrieval-augmented question answering over PDF documents.

    Pipeline: PDFs are loaded page by page, split into overlapping text
    chunks, embedded with a HuggingFace sentence-transformer on CPU, stored in
    a persistent Chroma collection, retrieved with Maximum Marginal Relevance
    (MMR) search, and finally passed to an Anthropic chat model together with
    the user's question.

    Typical use::

        rag = ResearchPaperRAG()
        docs = rag.load_single_paper("paper.pdf")
        rag.setup_persistent_storage(docs, "./research_db")
        answer, sources = rag.analyze_research("...", return_sources=True)
    """

    # Retriever settings shared by freshly built and reloaded stores, kept in
    # one place so both code paths cannot drift apart.
    _RETRIEVER_SEARCH_TYPE = "mmr"  # Maximum Marginal Relevance
    _RETRIEVER_SEARCH_KWARGS = {"k": 6, "fetch_k": 20, "lambda_mult": 0.5}

    def __init__(self,
                 model_name="claude-3-haiku-20240307",
                 embedding_model="sentence-transformers/all-MiniLM-L6-v2",
                 chunk_size=800,
                 chunk_overlap=150,
                 collection_name="research_papers"):
        """Configure the LLM, embeddings, splitter and prompt.

        No vector store is created here; call ``setup_persistent_storage`` or
        ``load_existing_store`` before asking questions.

        Args:
            model_name: Anthropic model identifier passed to ``ChatAnthropic``.
            embedding_model: HuggingFace model name used for embeddings.
            chunk_size: Target chunk length handed to the text splitter.
            chunk_overlap: Overlap between consecutive chunks.
            collection_name: Name of the Chroma collection to read/write.
        """
        # Initialize Anthropic Claude; the key comes from the environment
        # (e.g. populated by load_dotenv()).
        self.llm = ChatAnthropic(
            model=model_name,
            api_key=os.getenv("ANTHROPIC_API_KEY"),
            temperature=0.1
        )

        # Use HuggingFace embeddings (free, runs locally)
        self.embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model,
            model_kwargs={'device': 'cpu'}
        )

        # Text splitter optimized for academic papers: tries paragraph, line
        # and sentence boundaries before falling back to words/characters.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""],
            add_start_index=True
        )

        self.collection_name = collection_name
        self.vector_store = None
        self.retriever = None

        # Academic-focused prompt template
        self.prompt_template = ChatPromptTemplate.from_template("""
        You are a research assistant helping analyze documents. Based on the provided research documents, answer the question comprehensively but concisely.

        Guidelines:
        - Cite specific findings when possible
        - Mention methodologies if relevant
        - If uncertain, acknowledge limitations
        - Focus on factual information from the papers

        Research Context: {context}

        Question: {question}

        Analysis:
        """)

    def load_research_directory(self, directory_path):
        """Load all PDFs from a directory.

        Only files matching ``*.pdf`` directly under ``directory_path`` are
        requested from the loader. Returns the list of loaded page documents.
        """
        loader = DirectoryLoader(
            directory_path,
            glob="*.pdf",
            loader_cls=PyPDFLoader,
            show_progress=True
        )

        documents = loader.load()
        print(f"Loaded {len(documents)} document pages from {directory_path}")
        return documents

    def load_single_paper(self, pdf_path):
        """Load a single PDF research paper and return its page documents."""
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        print(f"Loaded {len(documents)} pages from {pdf_path}")
        return documents

    @staticmethod
    def _chunk_id(doc):
        """Derive a stable ID for a chunk from its origin and its text.

        The ID is a SHA-256 digest over the chunk's ``source``, ``page`` and
        ``start_index`` metadata (missing keys count as empty) plus its text,
        so the same chunk always maps to the same ID across runs.
        """
        meta = doc.metadata or {}
        key = "\x1f".join([
            str(meta.get("source", "")),
            str(meta.get("page", "")),
            str(meta.get("start_index", "")),
            doc.page_content,
        ])
        # errors="replace" keeps hashing robust if PDF extraction produced
        # characters that cannot be encoded as UTF-8.
        return hashlib.sha256(key.encode("utf-8", errors="replace")).hexdigest()

    def _build_retriever(self):
        """Create the MMR retriever over the current vector store."""
        return self.vector_store.as_retriever(
            search_type=self._RETRIEVER_SEARCH_TYPE,
            # Copy so a retriever can never mutate the shared class default.
            search_kwargs=dict(self._RETRIEVER_SEARCH_KWARGS),
        )

    def setup_persistent_storage(self, documents, persist_directory="./chroma_db"):
        """Split documents and index them in a persistent Chroma store.

        Chunks are written with content-derived IDs, so re-running this method
        on the same documents and directory is intended to overwrite the
        existing entries instead of inserting duplicates.

        Args:
            documents: Loaded documents (e.g. from ``load_single_paper``).
            persist_directory: Directory where Chroma keeps its data.

        Returns:
            Number of distinct chunks sent to the vector store.

        Raises:
            ValueError: If splitting yields no chunks (e.g. empty input or
                PDFs without extractable text).
        """
        # Split documents
        splits = self.text_splitter.split_documents(documents)
        print(f"Split into {len(splits)} chunks")

        if not splits:
            raise ValueError(
                "No text chunks were produced from the documents; nothing to index."
            )

        # De-duplicate by ID: identical chunks (same origin and text) would
        # otherwise collide when written in a single batch.
        unique_chunks = {}
        for chunk in splits:
            unique_chunks.setdefault(self._chunk_id(chunk), chunk)
        chunk_ids = list(unique_chunks)
        chunks = list(unique_chunks.values())

        # Create persistent Chroma vector store
        self.vector_store = Chroma.from_documents(
            documents=chunks,
            embedding=self.embeddings,
            ids=chunk_ids,
            collection_name=self.collection_name,
            persist_directory=persist_directory
        )

        # Create retriever with more sophisticated search
        self.retriever = self._build_retriever()

        print(f"Vector store created with {len(chunks)} chunks")
        return len(chunks)

    def load_existing_store(self, persist_directory="./chroma_db"):
        """Attach to an existing persistent vector store and build its retriever."""
        self.vector_store = Chroma(
            collection_name=self.collection_name,
            embedding_function=self.embeddings,
            persist_directory=persist_directory
        )

        self.retriever = self._build_retriever()

        print("Loaded existing vector store")

    def analyze_research(self, question, return_sources=False, retrieval_query=None):
        """Answer a question using retrieved document chunks as context.

        Retrieval runs exactly once; when ``return_sources`` is true the
        returned documents are the same ones that were placed in the prompt.

        Args:
            question: Text inserted into the prompt's ``{question}`` slot.
            return_sources: If true, also return the retrieved documents.
            retrieval_query: Optional text to search the vector store with.
                Defaults to ``question``. Useful when the question carries
                long instructions that should not influence similarity search.

        Returns:
            The answer string, or ``(answer, sources)`` if ``return_sources``.

        Raises:
            ValueError: If no vector store has been set up or loaded.
        """
        if self.retriever is None:
            raise ValueError("Vector store not initialized. Load documents first.")

        query = question if retrieval_query is None else retrieval_query
        sources = self.retriever.invoke(query)

        # Prompt -> model -> plain string
        chain = self.prompt_template | self.llm | StrOutputParser()
        response = chain.invoke({
            "context": self._format_research_context(sources),
            "question": question,
        })

        if return_sources:
            return response, sources

        return response

    def _format_research_context(self, docs):
        """Format retrieved documents with metadata.

        Each document becomes ``Source <n> (<source>, Page <page>):`` followed
        by its stripped text; entries are separated by a ``---`` divider.
        Missing metadata falls back to ``Unknown source`` / ``Unknown page``.
        """
        formatted = []
        for i, doc in enumerate(docs, 1):
            source = doc.metadata.get('source', 'Unknown source')
            page = doc.metadata.get('page', 'Unknown page')
            content = doc.page_content.strip()

            formatted.append(f"Source {i} ({source}, Page {page}):\n{content}")

        return "\n\n---\n\n".join(formatted)

    def find_similar_research(self, query, num_results=5):
        """Find similar research passages.

        Returns the result of the store's ``similarity_search_with_score`` for
        ``query`` limited to ``num_results`` entries.

        Raises:
            ValueError: If no vector store has been set up or loaded.
        """
        if self.vector_store is None:
            raise ValueError("Vector store not initialized.")

        results = self.vector_store.similarity_search_with_score(
            query,
            k=num_results
        )

        return results

    def get_research_summary(self, topic):
        """Get a structured summary of the indexed material on ``topic``.

        The vector store is searched with the bare topic, while the full
        instruction text below is what the model is asked to answer.
        """
        summary_prompt = f"""
        Based on the research papers in the database, provide a comprehensive summary about: {topic}

        Include:
        1. Key findings and conclusions
        2. Methodological approaches used
        3. Any conflicting results or debates
        4. Gaps in current research

        Keep the summary structured and evidence-based.
        """

        # Search on the topic only: the instruction boilerplate would otherwise
        # be embedded as part of the similarity query.
        return self.analyze_research(summary_prompt, retrieval_query=str(topic))

In [34]:
def example_usage(pdf_path="/home/ojas/Downloads/Course Enrolment Guide.pdf",
                  persist_directory="./research_db"):
    """Run an end-to-end demo: index one PDF, ask questions, print a summary.

    Args:
        pdf_path: PDF to index. The default is the author's local file; pass
            your own path to run the demo elsewhere.
        persist_directory: Directory for the persistent Chroma store.

    Any exception raised while loading, indexing or querying is printed along
    with troubleshooting hints rather than re-raised.
    """
    # Initialize the research RAG system
    rag = ResearchPaperRAG(
        model_name="claude-3-haiku-20240307",
        chunk_size=800,
        chunk_overlap=150
    )

    # Research questions
    questions = [
        "What are the main steps to enroll in a course?",
        "What are the key requirements for a minor?",
        "How does the grading change on a per code basis",
        #"What future research directions are suggested?"
    ]
    divider = '=' * 50

    try:
        # Option 1: Load papers from a directory
        # documents = rag.load_research_directory("./research_papers/")

        # Option 2: Load a single paper
        documents = rag.load_single_paper(pdf_path)

        # Create persistent vector store
        rag.setup_persistent_storage(documents, persist_directory)

        # Or load existing store
        # rag.load_existing_store(persist_directory)

        for question in questions:
            print(f"\n{divider}")
            print(f"Question: {question}")
            print(f"{divider}")

            answer, sources = rag.analyze_research(question, return_sources=True)
            print(f"Answer: {answer}")

            # The count covers every retrieved document; only the first two
            # are listed to keep the output short.
            print(f"\nSources used ({len(sources)} documents):")
            for i, source in enumerate(sources[:2], 1):
                print(f"{i}. {source.metadata.get('source', 'Unknown')} (Page {source.metadata.get('page', 'N/A')})")

        # Get topic summary
        print(f"\n{divider}")
        print("RESEARCH SUMMARY")
        print(f"{divider}")
        summary = rag.get_research_summary("ERP usage and course enrollment")
        print(summary)

    except Exception as e:
        # Deliberate best-effort for a demo: report the problem and the most
        # likely causes instead of surfacing a traceback.
        print(f"Error: {e}")
        print("Make sure you have the following:")
        print("1. Set ANTHROPIC_API_KEY in your .env file")
        print("2. Installed required packages: pip install langchain-anthropic langchain-huggingface langchain-chroma langchain-community sentence-transformers pypdf python-dotenv")
        print("3. Provided valid PDF file paths")

In [35]:
# Run the demo when this code executes as the main module; the recorded output
# below this cell shows the guard passing when the cell was run in the notebook.
if __name__ == "__main__":
    example_usage()

Loaded 25 pages from /home/ojas/Downloads/Course Enrolment Guide.pdf
Split into 51 chunks
Vector store created with 51 chunks

Question: What are the main steps to enroll in a course?
Answer: Based on the information provided in the research documents, the main steps to enroll in a course are:

1. Search for courses:
   - The Course Catalog provides a detailed listing of all courses offered at the university, including course codes, titles, prerequisites, credit hours, and departmental offerings (Source 2, Source 3).
   - You can use the ERP system to search for and view available courses (Source 1).

2. Add the course to your Wishlist:
   - On the course detail page, you can click "Next" to add the course to your Course Wishlist (Source 4).

3. Proceed with enrollment:
   - Access your Course Wishlist by clicking on the "Enroll" tab (Source 4).
   - Click "Proceed to Step 2 of 3" to review the course information (Source 4).
   - Click "Finish Enrolling" to attempt enrolling in the cou