In [1]:
import os
import re
from typing import List, Dict, Optional, Union
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings


class SmartPDFProcessor:
    """
    Production-grade multi-PDF Processor for Retrieval-Augmented Generation (RAG).
    - Uses hybrid chunking for better course outlines & regulations capture.
    - Enhances metadata (course_code, program, section_type) during ingestion.
    """

    OUTLINE_KEYWORDS = ["outline", "course content", "course outline", "syllabus", "topics covered"]
    COURSE_CODE_PATTERN = r"(COMP\s*\d+|CS\s*\d+|SE\s*\d+)"
    PROGRAM_IDENTIFIERS = {"MS": ["MS", "Master", "Graduate"], "BS": ["BS", "Bachelor", "Undergraduate"]}

    def __init__(
        self,
        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        min_char_threshold: int = 50,
        recursive: bool = True,
        min_chunk_size: int = 1200,  # best for outlines
        ):
        self.embedding_model = HuggingFaceEmbeddings(model_name=model_name)
        self.text_splitter = SemanticChunker(
        self.embedding_model,
        breakpoint_threshold_type="percentile",
        min_chunk_size=min_chunk_size
    )
        self.min_char_threshold = min_char_threshold
        self.recursive = recursive


    def process_directory(
        self,
        directory_path: str,
        custom_metadata: Optional[Dict[str, Union[str, int]]] = None
    ) -> List[Document]:
        """Process all PDFs in a directory and return chunks."""
        all_chunks = []
        pdf_files = self._get_pdf_files(directory_path)

        for pdf_path in pdf_files:
            print(f"üìÑ Processing: {pdf_path}")
            chunks = self._process_single_pdf(pdf_path, custom_metadata)
            all_chunks.extend(chunks)

        print(f"‚úÖ Completed processing {len(pdf_files)} PDFs. Total chunks created: {len(all_chunks)}")
        return all_chunks

    def _get_pdf_files(self, directory_path: str) -> List[str]:
        """Return list of PDF file paths from directory."""
        pdf_files = []
        for root, _, files in os.walk(directory_path):
            for file in files:
                if file.lower().endswith(".pdf"):
                    pdf_files.append(os.path.join(root, file))
            if not self.recursive:
                break
        return pdf_files

    def _extract_metadata_from_text(self, text: str) -> Dict[str, Union[str, int]]:
        """Extract course code, program type, outline sections based on patterns."""
        metadata = {}

        # Extract Course Code
        match = re.search(self.COURSE_CODE_PATTERN, text, re.IGNORECASE)
        if match:
            metadata["course_code"] = match.group(0).replace(" ", "").upper()

        # Detect Program Type
        for program, keywords in self.PROGRAM_IDENTIFIERS.items():
            if any(k.lower() in text.lower() for k in keywords):
                metadata["program"] = program

        # Section Detection
        if any(keyword in text.lower() for keyword in self.OUTLINE_KEYWORDS):
            metadata["section_type"] = "course_outline"
        elif any(word in text.lower() for word in ["credit", "hours", "structure"]):
            metadata["section_type"] = "course_structure"
        else:
            metadata["section_type"] = "general"

        return metadata

    def _process_single_pdf(
        self, pdf_path: str, custom_metadata: Optional[Dict[str, Union[str, int]]] = None
    ) -> List[Document]:
        """Private method to process a single PDF with semantic chunking and metadata enrichment."""
        try:
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()
        except Exception as e:
            print(f"‚ùå Error loading PDF {pdf_path}: {e}")
            return []

        processed_chunks = []

        for page_num, page in enumerate(pages):
            cleaned_text = self._clean_text(page.page_content)

            if len(cleaned_text.strip()) < self.min_char_threshold:
                continue

            metadata = {
                "source_file": os.path.basename(pdf_path),
                "source_path": pdf_path,
                "page": page_num + 1,
                "total_pages": len(pages),
                "char_count": len(cleaned_text),
            }

            # Merge extracted metadata
            extracted_meta = self._extract_metadata_from_text(cleaned_text)
            metadata.update(extracted_meta)

            if page.metadata:
                metadata.update(page.metadata)
            if custom_metadata:
                metadata.update(custom_metadata)

            try:
                chunks = self.text_splitter.create_documents(
                    texts=[cleaned_text],
                    metadatas=[metadata]
                )
                processed_chunks.extend(chunks)
            except Exception as e:
                print(f"‚ö†Ô∏è Chunking failed on file {pdf_path}, page {page_num+1}: {e}")

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """Advanced PDF text cleaning preserving structure for outlines."""
        # Preserve numbered/bulleted lists (convert to newline format)
        text = re.sub(r'(?<!\n)(\d+\.\s)', r'\n\1', text)  # Numbered lists
        text = re.sub(r'(?<!\n)([-‚Ä¢‚óè‚ñ™])', r'\n\1', text)    # Bullet lists

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)

        # Replace ligatures and quotes
        ligatures = {"Ô¨Å": "fi", "Ô¨Ç": "fl", "‚Äô": "'", "‚Äú": '"', "‚Äù": '"'}
        for k, v in ligatures.items():
            text = text.replace(k, v)

        # Clean page headers/footers
        text = re.sub(r'(Page \d+ of \d+|Page \d+|\d+/\d+)$', '', text, flags=re.IGNORECASE)

        # Remove unwanted characters
        text = ''.join(char for char in text if char.isprintable())
        return text.strip()


In [2]:
processor = SmartPDFProcessor()
chunks = processor.process_directory("/home/hammadali08/Downloads/CS/BS Data")

  self.embedding_model = HuggingFaceEmbeddings(model_name=model_name)
  from .autonotebook import tqdm as notebook_tqdm


üìÑ Processing: /home/hammadali08/Downloads/CS/BS Data/BS Mathematics (4 Years) (2024).pdf
üìÑ Processing: /home/hammadali08/Downloads/CS/BS Data/BS Physics (2023).pdf
üìÑ Processing: /home/hammadali08/Downloads/CS/BS Data/BS Zoology (2023).pdf
üìÑ Processing: /home/hammadali08/Downloads/CS/BS Data/BS Computer Science (2023).pdf
‚úÖ Completed processing 4 PDFs. Total chunks created: 772


In [3]:
chunks[120].page_content

'(2012). Calculus, 7E, published by Brooks/Cole Cengage Learning  Swokowski, E. W. (1983) Calculus with Analytic Geometry th  Thomas, (2010) Calculus 12 Edition, Addison -Wesley Annexure -10 -D,'

In [4]:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [5]:
from langchain_community.vectorstores import Chroma
persist_directory = 'chroma-db'
vectordb = Chroma.from_documents(documents=chunks, 
                                 embedding=embeddings,
                                 persist_directory=persist_directory)
vectordb.persist()

  vectordb.persist()


In [6]:
vectordb.similarity_search('Migration Rules')

[Document(metadata={'page': 107, 'producer': 'Microsoft¬Æ Word 2010', 'source': '/home/hammadali08/Downloads/CS/BS Data/BS Zoology (2023).pdf', 'author': 'abc', 'total_pages': 189, 'creationdate': '2023-10-31T10:10:54+05:00', 'section_type': 'course_outline', 'char_count': 1558, 'source_file': 'BS Zoology (2023).pdf', 'source_path': '/home/hammadali08/Downloads/CS/BS Data/BS Zoology (2023).pdf', 'page_label': '108', 'creator': 'Microsoft¬Æ Word 2010', 'program': 'MS', 'moddate': '2024-08-07T15:24:13+05:00'}, page_content='Field study trips on diversity with emphasis on their adaptation. Annexure -3 -N, Page # 108 of 189'),
 Document(metadata={'page': 146, 'creationdate': '2023-10-31T10:10:54+05:00', 'section_type': 'course_structure', 'page_label': '147', 'producer': 'Microsoft¬Æ Word 2010', 'char_count': 2646, 'program': 'MS', 'source_path': '/home/hammadali08/Downloads/CS/BS Data/BS Zoology (2023).pdf', 'creator': 'Microsoft¬Æ Word 2010', 'author': 'abc', 'moddate': '2024-08-07T15:24

# LLM Initialization

In [7]:
import os
from dotenv import load_dotenv
load_dotenv()
os.environ['api_key']=os.getenv('api_key')

In [8]:
from langchain_groq import ChatGroq
# LLM Initialization
llm = ChatGroq(model_name="openai/gpt-oss-20b", api_key=os.environ['api_key'])

In [9]:
from langchain.chat_models.base import init_chat_model
llm=init_chat_model("groq:openai/gpt-oss-20b", api_key=os.environ['api_key'])

In [10]:
llm.invoke('What is AI')

AIMessage(content='**Artificial Intelligence (AI)** is a branch of computer science that focuses on creating systems capable of performing tasks that normally require human intelligence. These tasks include reasoning, learning, perception, language understanding, planning, problem‚Äësolving, and even creativity.\n\n---\n\n## 1. Core Concepts\n\n| Concept | What it Means | Example |\n|---------|---------------|---------|\n| **Intelligence** | Ability to acquire knowledge and apply it to achieve goals. | A chess engine evaluating board positions. |\n| **Artificial** | Not naturally occurring; created by humans. | Software written by programmers. |\n| **Learning** | Adapting behavior based on experience. | A spam filter that improves as it processes more emails. |\n| **Reasoning** | Drawing conclusions from data. | A medical diagnosis system that infers disease from symptoms. |\n| **Perception** | Interpreting sensory data. | Image recognition in self‚Äëdriving cars. |\n| **Language Under

# RAG Pipeline

## Re-Ranker Retreiver

In [11]:
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

In [12]:
sparse_retriever = BM25Retriever.from_documents(chunks)
sparse_retriever.k=50

In [13]:
## Convert vector store to RAG chain
retriever = vectordb.as_retriever(
    search_kwargs={"k": 50})

In [14]:
def build_retriever_for_query(query):
    # üîç detect metadata inline
    import re
    filters = {}
    match = re.search(r'(COMP\s*\d+)', query, re.IGNORECASE)
    if match:
        filters["course_code"] = match.group(1).replace(" ", "").upper()

    if "ms" in query.lower():
        filters["program"] = "MS-CS"
        filters["source_file"] = "MS Comuter Science (2023).pdf"
    elif "bs" in query.lower():
        filters["program"] = "BS-CS"
        filters["source_file"] = "Scheme of Studies of BS Computer Science.pdf"

    # ‚úÖ build retriever dynamically
    filtered_retriever = EnsembleRetriever(
        retrievers=[sparse_retriever, retriever],
        weights=[0.5, 0.5],
        search_kwargs={"k": 15, "filter": filters}
    )
    return filtered_retriever


In [15]:
Combined_retriever = EnsembleRetriever(
    retrievers=[retriever, sparse_retriever],
    weights=[0.3, 0.7]
)

In [16]:
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers import ContextualCompressionRetriever

In [17]:
cross_encoder = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-2-v2")

# Wrap it as a reranker
reranker = CrossEncoderReranker(model=cross_encoder, top_n=10)

# Combine with your base retriever
reranking_retriever = ContextualCompressionRetriever(
    base_compressor=reranker,
    base_retriever=Combined_retriever
)

In [18]:
def build_retriever_for_query_(query):
    # üîç detect metadata inline
    import re
    filters = {}
    match = re.search(r'(COMP\s*\d+)', query, re.IGNORECASE)
    if match:
        filters["course_code"] = match.group(1).replace(" ", "").upper()

    if "ms" in query.lower():
        filters["program"] = "MS-CS"
        filters["source_file"] = "MS Comuter Science (2023).pdf"
    elif "bs" in query.lower():
        filters["program"] = "BS-CS"
        filters["source_file"] = "Scheme of Studies of BS Computer Science.pdf"

    # ‚úÖ build retriever dynamically
    filtered_retriever = EnsembleRetriever(
        retrievers=[reranking_retriever],
        weights=[0.5, 0.5],
        search_kwargs={"k": 15, "filter": filters}
    )
    return filtered_retriever


In [19]:
from langchain.prompts.chat import ChatPromptTemplate
# Creating a Prompt template
system_prompt = """
You are an intelligent academic assistant that answers questions strictly using ONLY the information present in the University Scheme of Studies and official Rules & Regulations documents.

Core Principles (DO NOT VIOLATE):
1. You must answer ONLY from the provided context.
2. If the required information is not explicitly present in the context, respond EXACTLY with: "Sorry! I don't know."
3. Never use external knowledge, assumptions, or generalized textbook content.
4. Maintain academic accuracy and integrity.

 When the user asks about a course:
- If they ask for **course outline**, you must:
   ‚Ä¢ Return the **complete list of topics/modules exactly as provided in the context**.
   ‚Ä¢ Preserve the structure and hierarchy (e.g., main topics with subtopics).
   ‚Ä¢ Do NOT summarize, rephrase, or omit any part.

- If they ask for **credit hours, prerequisites, objectives, or learning outcomes**, return ONLY what is explicitly listed in the context.

‚öñ When the user asks about rules or regulations:
- Provide the exact clause, rule number, and description from the official context.
- Do not modify or interpret beyond what is written.

 If the context contains partial or unrelated information:
- Do not fill in missing parts.
- Instead respond with: "Sorry! I don't know."

 Response Format:
- Be clear, structured, and formatted using bullet points or numbered lists ONLY if present in the original context.
- Do not add commentary, opinions, or explanations unless explicitly stated in the context.

Context:
{context}
"""

prompt=ChatPromptTemplate([
    
    ('system',system_prompt),
    ('human','{input}')
    ] )

In [20]:
from langchain.chains.combine_documents import create_stuff_documents_chain
documents_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

In [21]:
from langchain.chains import create_retrieval_chain
rag_chain = create_retrieval_chain(reranking_retriever,documents_chain)

In [22]:
response=rag_chain.invoke({"input":"Can i migrate to other campus if I have less marks"})
response

{'input': 'Can i migrate to other campus if I have less marks',
 'context': [Document(metadata={'source_file': 'BS Computer Science (2023).pdf', 'source_path': '/home/hammadali08/Downloads/CS/BS Data/BS Computer Science (2023).pdf', 'char_count': 2436, 'total_pages': 87, 'author': 'MNA', 'creator': 'Microsoft¬Æ Word 2010', 'page_label': '16', 'moddate': '2024-08-07T15:21:43+05:00', 'page': 15, 'producer': 'Microsoft¬Æ Word 2010', 'source': '/home/hammadali08/Downloads/CS/BS Data/BS Computer Science (2023).pdf', 'section_type': 'course_structure', 'program': 'BS', 'creationdate': '2023-12-01T08:54:50+05:00'}, page_content='Page 16 1 0. Eligibility Criteria, Duration of the Program and Award of Degree:  Minimum 50% marks in Intermediate/12 years schooling/A - Level (HSSC) or Equivalent with Mathematics are required for admission in BS Computer Science Program. *Equivalency certificate by IBCC will be required in case of education from some other country or system. FSc pre -medical studen

In [23]:
response1=rag_chain.invoke({"input":"Who is Competent Authority"})
print(response1['answer'])

Sorry! I don't know.


In [24]:
response2=rag_chain.invoke({"input":"what does course means?"})
print(response2['answer'])

Sorry! I don't know.


In [31]:
response3=rag_chain.invoke({"input":"What is the outline for Compiler Complition in the BS Computer Science?"})
response3

{'input': 'What is the outline for Compiler Complition in the BS Computer Science?',
 'context': [Document(metadata={'author': 'MNA', 'char_count': 2368, 'creationdate': '2023-12-01T08:54:50+05:00', 'source_path': '/home/hammadali08/Downloads/CS/BS Data/BS Computer Science (2023).pdf', 'program': 'MS', 'page': 75, 'producer': 'Microsoft¬Æ Word 2010', 'moddate': '2024-08-07T15:21:43+05:00', 'total_pages': 87, 'section_type': 'course_outline', 'source_file': 'BS Computer Science (2023).pdf', 'source': '/home/hammadali08/Downloads/CS/BS Data/BS Computer Science (2023).pdf', 'page_label': '76', 'creator': 'Microsoft¬Æ Word 2010'}, page_content='Page 76 Course Name: Professional Practices Course Code: ITEC4112 Credit Hours: 2 (2+0) Pre -requisites: None Course Introduction: A Computing graduate as professional has some responsibilities with respect to the society. This course develops student understanding about historical, social, economic, ethical, and professional issues related to the d

In [26]:
query = "Give a breif outline of Tools for Quantitative Reasoning"  # this comes from the UI or your input cell
retriever = build_retriever_for_query(query)

# Apply multi-query dynamically
from langchain.retrievers.multi_query import MultiQueryRetriever
multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=retriever,
    llm=llm,
    include_original=True
)

# Now feed this into your RAG chain
response = rag_chain.invoke({
    "input": query,
    "retriever": multi_query_retriever
})

In [27]:
response['answer']

'**Course Outline: Tools for Quantitative Reasoning**\n\nExploring Graphical Information: Functions, Graphing Tools, Investigating Relationships between Variables, Exploring Tools to find Relationship between Variables, Resources and Population Growth Dealing with Economical, Environmental and Social Issues.  \n\nBuilding Blocks of a Plane: Simultaneous Linear Equations in Two Variables, Graphical and Analytical Approaches to Solve a Problem, Applications of Graphical and Analytical Approaches in Social and Economic Problems.  \n\nExploring Inequalities: Absolute Value and Inequalities, Dealing with Practical Problems Involving Inequalities in Different Disciplines.  \n\nComparing Quantities: Ratio and Proportion and Sequences, Golden Ratio in Sculptures, Comparison of Statements and their use in Social and Economic Problems, Number Patterns and their Applications.  \n\nThinking Logically: Propositions and Truth Values, Applications of Logic.  \n\nUnderstanding Data: Exploring and Summ

In [None]:
print(response['answer'])

An Introduction to Artificial Intelligence and its applications towards Knowledge Based Systems; Introduction to Reasoning and Knowledge Representation, Problem Solving by Searching (Informed searching, Uninformed searching, Heuristics, Local searching, Min‚Äëmax algorithm, Alpha beta pruning, Game‚Äëplaying); Case Studies: General Problem Solver, Eliza, Student, Macsyma; Learning from examples; ANN and Natural Language Processing; Recent trends in AI and applications of AI algorithms.
