#### Import Necessary Libraries

In [1]:
import os
import shutil
from dotenv import load_dotenv

In [None]:
# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema import Document


# Embeddings & Vector Store
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Parent Document Retriever
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

# LLM
from langchain_groq import ChatGroq

# PDF Parsing
from llama_parse import LlamaParse

In [18]:
# Read API credentials from the local .env file into the process environment
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
LLAMA_API_KEY = os.getenv("LLAMA_PARSE_API_KEY")


# Working folders: one for uploaded PDFs, one for their parsed Markdown
UPLOAD_DIR = "documents/uploaded_file"
PARSED_DIR = "documents/parsed_md"

# exist_ok=True keeps this cell safe to re-run
for folder in (UPLOAD_DIR, PARSED_DIR):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Step 1: Save uploaded PDF (simulate here)
pdf_name = "test.pdf"   # Replace with user-uploaded filename
pdf_path = os.path.join(UPLOAD_DIR, pdf_name)

# Simulate the upload by copying the file of the same name from the working
# directory. The source is derived from pdf_name so that changing the filename
# above cannot silently store a different PDF's content under the new name.
# An existing copy in the upload folder is left untouched.
if not os.path.exists(pdf_path):
    shutil.copy(pdf_name, pdf_path)

print("✅ PDF stored at:", pdf_path)

✅ PDF stored at: documents/uploaded_file\pom.pdf


In [5]:
# Step 2: Parse PDF with LlamaParse and save as Markdown
# Instructions handed to LlamaParse describing the Markdown layout we want back.
parsing_instructions = """You are a document parser. Extract the content into clean, structured markdown format.
    - Preserve headings, subheadings, paragraphs clearly.
    - Convert tables into proper markdown table syntax.
    - Represent images with markdown image syntax ![Description](image_placeholder).
    - If image data is missing, describe the image briefly in place.
    - Keep lists, bullet points, and code blocks formatted.
    - Avoid extra line breaks or broken markdown syntax.
            """

parser = LlamaParse(
    result_type="markdown",
    system_prompt=parsing_instructions,
    api_key=LLAMA_API_KEY,
    verbose=True,
)

# load_data returns a sequence of parsed documents for the given file
docs = parser.load_data(pdf_path)

parsed_count = len(docs)
print(f"✅ PDF parsed into {parsed_count} markdown documents.")

Started parsing the file under job_id 3edc41a5-4594-443e-b2a9-8c888f26c520
✅ PDF parsed into 5 markdown documents.


In [6]:
# Save to Markdown
# Swap only the final extension: str.replace(".pdf", ".md") would also rewrite a
# ".pdf" occurring mid-name and would leave names such as "REPORT.PDF" unchanged.
md_name = os.path.splitext(pdf_name)[0] + ".md"
md_file_path = os.path.join(PARSED_DIR, md_name)

# Each parsed document is written followed by a blank line as a separator
with open(md_file_path, "w", encoding="utf-8") as f:
    f.writelines(doc.text + "\n\n" for doc in docs)

print("✅ Markdown stored at:", md_file_path)
if docs:
    print( "Sample Markdown Content:\n", docs[0].text[:500] )  # Print first 500 chars of first doc
else:
    # Avoid an IndexError when the parser returned nothing
    print("⚠️ Parser returned no documents; the Markdown file is empty.")

✅ Markdown stored at: documents/parsed_md\pom.md
Sample Markdown Content:
 # Perspectives in Management

## Introduction
- Management was developed along with the development of human civilization.
- In early times, people began to organize themselves in family and community units. These early formations laid the foundation for managing resources, time, and efforts.

Management today consists of many well-developed principles and perspectives such as:
- **Classical Theory**: Focuses on organizational structure, efficiency, and formal rules to improve productivity.
- **


In [9]:
# Step 3: Convert parsed docs into LangChain Document objects
# Each parsed item becomes one Document tagged with the source path and a
# 1-based position number stored under the "page" metadata key.
lc_docs = []
for position, parsed in enumerate(docs, start=1):
    lc_docs.append(
        Document(
            page_content=parsed.text,
            metadata={"source": pdf_path, "page": position},
        )
    )

converted_total = len(lc_docs)
print(f"✅ Converted {converted_total} docs to LangChain Document format")
first_metadata = lc_docs[0].metadata
print("Sample Document Metadata:", first_metadata)

✅ Converted 5 docs to LangChain Document format
Sample Document Metadata: {'source': 'documents/uploaded_file\\pom.pdf', 'page': 1}


In [10]:
# Step 4: Setup ParentDocumentRetriever
# NOTE(review): the vector store is given a persist_directory while the parent
# documents are kept in an InMemoryStore; after a kernel restart the two may no
# longer line up — confirm whether the vector store should be reset or the
# docstore persisted as well.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma(persist_directory="notebook/chroma_db", embedding_function=embeddings)
doc_store = InMemoryStore()

# Two splitter granularities: coarse for parents, fine for children
parent_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)

parent_retriever = ParentDocumentRetriever(
    docstore=doc_store,
    vectorstore=vectorstore,
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
)
# lets print a sample of the parent retriever
print("Sample ParentDocumentRetriever:", parent_retriever)


Sample ParentDocumentRetriever: vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000001306D62BEC0> docstore=<langchain_core.stores.InMemoryStore object at 0x000001306DBD5E20> search_kwargs={} child_splitter=<langchain_text_splitters.character.RecursiveCharacterTextSplitter object at 0x000001304FD6F140> parent_splitter=<langchain_text_splitters.character.RecursiveCharacterTextSplitter object at 0x000001304FD6DC40>


In [13]:
# Index documents
# NOTE(review): re-running this cell presumably indexes the same documents
# again — confirm, and reset the stores first if duplicates matter.
parent_retriever.add_documents(lc_docs)

print("✅ Documents added to ParentDocumentRetriever")

# Test retrieval
query = "What is the main purpose of this document?"
# invoke() is the Runnable entry point; get_relevant_documents() is deprecated
results = parent_retriever.invoke(query)

print(f"🔍 Retrieved {len(results)} parent documents")
if results:
    print("Sample result snippet:\n", results[0].page_content[:500])
else:
    # Guard against an IndexError when nothing matches
    print("⚠️ No parent documents were retrieved for this query.")

✅ Documents added to ParentDocumentRetriever
🔍 Retrieved 2 parent documents
Sample result snippet:
 # The Evolution of Management Thinking

As the industrial revolution began, factories grew, and the need for better ways to manage people and work increased. This led to the development of scientific approaches to management.

## Early Pioneers in Management

Even in early civilizations, management ideas were in practice. Here are three pioneers who made significant contributions to early management thinking:

1. **Robert Owen (1771-1855)**
- **Who he was:** A British industrialist and social re


In [19]:
# Step 5: Setup Groq LLM
# Use the GROQ_API_KEY constant loaded in the configuration cell instead of
# reading the environment a second time (mirrors how LLAMA_API_KEY is used).
llm = ChatGroq(model="llama-3.1-8b-instant", api_key=GROQ_API_KEY)

# check model connection 
response = llm.invoke("Hello! Can you confirm you’re active?")
print(response.content)

Hello! Yes, I'm active and ready to assist you with any questions or topics you'd like to discuss. How can I help you today?


In [20]:
# Prompt for grounded question answering: the model must answer from the
# supplied context only, or return a fixed "insufficient information" reply.
qa_prompt = PromptTemplate(
    template=(
        "You are a helpful and fact-based QA assistant.\n\n"
        "Use ONLY the information from the context below to answer the question.\n"
        "If the context does not clearly contain the answer, reply exactly with:\n"
        '"Insufficient information in the provided context."\n\n'
        "Be clear, concise, and strictly factual.\n"
        "Do not add assumptions or outside knowledge.\n\n"
        "Context:\n{context}\n\n"
        "Question:\n{question}\n\n"
        "Answer:"
    ),
    input_variables=["context", "question"],
)

# summarize_prompt = PromptTemplate(
#     input_variables=["context"],
#     template= "You are a professional summarizer.\n\n"
#         "Read the document below and summarize it into clear, concise, and complete bullet points.\n"
#         "Use only the information found in the document. Avoid assumptions or external details.\n\n"
#         "Document:\n{context}\n\n"
#         "Summary (in bullet points):"
# )

# Render the QA prompt with a toy context/question pair to eyeball the layout
sample_inputs = {
    "context": "The Nile is the longest river in Africa.",
    "question": "Which continent is the Nile located in?",
}
test_prompt = qa_prompt.format(**sample_inputs)

print(test_prompt)

You are a helpful and fact-based QA assistant.

Use ONLY the information from the context below to answer the question.
If the context does not clearly contain the answer, reply exactly with:
"Insufficient information in the provided context."

Be clear, concise, and strictly factual.
Do not add assumptions or outside knowledge.

Context:
The Nile is the longest river in Africa.

Question:
Which continent is the Nile located in?

Answer:


In [21]:
# Step 7: Preprocess retriever output
from langchain_core.runnables import RunnableLambda
def extract_text(docs):
    """Merge the ``page_content`` of each document into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The contents in their original order, separated by a blank line.
        An empty iterable yields an empty string.
    """
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

# Smoke-test extract_text on real retriever output.
# invoke() replaces the deprecated get_relevant_documents() call.
sample_docs = parent_retriever.invoke("test question")
print(extract_text(sample_docs)[:500])


# Principles of Workplace Relationships

## Principle 1: Friendly and Respectful Relationships
- There should be a friendly and respectful relationship between management and workers.
- Both sides must understand that they have common goals and can benefit together.
- Problems should be solved through open discussion, mutual understanding, and cooperation.

## Principle 2: Cooperation, Not Individualism
- Managers and workers should not work separately as different groups; they should work toget


In [22]:
# Maps {"question": ...} -> one context string.
# The retriever is piped in directly as a chain step rather than being wrapped
# in a lambda that calls the deprecated get_relevant_documents().
retriever_chain = (
    RunnableLambda(lambda x: x["question"])
    | parent_retriever
    | RunnableLambda(extract_text)
)

# debugging the retriever chain
test_input = {"question": "What is the main topic of this document?"}
context_text = retriever_chain.invoke(test_input)

print("✅ Retrieved context length:", len(context_text))
print("Sample context snippet:\n", context_text[:400])

✅ Retrieved context length: 5623
Sample context snippet:
 # A. Scientific Management Theory

**Developed by:** Frederick Winslow Taylor (1856–1915)
*(Worked as a foreman at the Midvale Steel Company, USA.)*
Frederick Winslow Taylor is known as the Father of Scientific Management. Scientific management is about finding the best and most efficient way to perform a task. It focuses on productivity, efficiency, and the systematic improvement of worker perfor


In [23]:
# Step 8: Build QA chain
from langchain_core.runnables import RunnableLambda, RunnableParallel, RunnableSequence

# Fan the incoming {"question": ...} dict out into the two prompt variables:
# "context" comes from the retriever chain, "question" is passed through.
qa_inputs = RunnableParallel({
    "question": RunnableLambda(lambda x: x["question"]),
    "context": retriever_chain,
})

# inputs -> prompt -> model; piping already yields a runnable sequence
qa_chain = qa_inputs | qa_prompt | llm


# Test the QA chain
query = "What is the key finding or topic of this document?"
result = qa_chain.invoke({"question": query})

answer_text = result.content
print("🧾 Answer:\n", answer_text)




🧾 Answer:
 The key finding or topic of this document is Frederick Winslow Taylor's Scientific Management Theory.


In [25]:
# Step 9: Build Summarization chain
# Split every page with the retriever's child splitter. The chunks are plain
# strings produced here from the full pages, not read back from the vector store.
child_docs = []
for parent in lc_docs:
    child_docs.extend(parent_retriever.child_splitter.split_text(parent.page_content))

# Use a very concise prompt for chunks
chunk_summarize_prompt = PromptTemplate(
    input_variables=["context"],
    template=(
        "You are a professional summarizer.\n\n"
        "Summarize the following text into concise bullet points and space.\n"
        "Each bullet point should start with a '-' and clearly highlight the topic.\n"
        "Ensure that each new topic is separated by a line break for clarity and space.\n"
        "Do NOT include any headers, repeated phrases, or unnecessary wording.\n"
        "Keep all information factual, concise, and clear.\n\n"
        "Text:\n{context}\n\n"
        "Summary (in clean bullet points):"
    ),
)

# Build the chain once. The prompt's single variable is supplied per call, so
# there is no need to rebuild the chain (or capture the chunk in a closure and
# pass a dummy input) on every iteration.
summarize_chain_chunk = chunk_summarize_prompt | llm

# One LLM call per chunk; summaries are kept in chunk order
chunk_summaries = []
for c in child_docs:
    result = summarize_chain_chunk.invoke({"context": c})
    chunk_summaries.append(result.content.strip())

# Flatten all chunk summaries into one text, dropping blank lines
final_summary = "\n".join(
    line
    for chunk in chunk_summaries
    for line in chunk.split("\n")
    if line.strip()
)



In [26]:
# Preview only the first 1000 characters of the combined summary
summary_preview = final_summary[:1000]
print("📄 Final Summary:\n", summary_preview)

📄 Final Summary:
 - Development of Management
- Management originated with the development of human civilization.
- Early Forms of Management
- People organized themselves in family and community units.
- These early formations managed resources, time, and efforts.
- Evolution of Management
- Management continued to evolve with the growth of civilizations.
-
- Classical Theory: Focuses on organizational structure, efficiency, and formal rules to improve productivity.
Management principles and perspectives continue to evolve.
- Human Relations and Behavioral Science: Emphasizes employee motivation, relationships, and the impact of human behavior on work.
- Decision Science: Uses data, models, and analysis to support and improve decision-making.
- Management Science: a field that combines mathematical and analytical techniques to address management issues and enhance operational efficiency.
- It focuses on applying scientific methods to solve complex business problems and optimize organi