In [1]:
# Lama Parser and Retriever Setup
from llama_parse import LlamaParse
from dotenv import load_dotenv
from pathlib import Path
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_groq import ChatGroq
import pandas as pd
from pptx import Presentation
from langchain_community.document_loaders import (
    UnstructuredPDFLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredPowerPointLoader,
    UnstructuredExcelLoader,
    UnstructuredMarkdownLoader,
)
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.retrievers import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnableSequence

import os

load_dotenv()
api_key = os.getenv("LLAMA_CLOUD_API_KEY")

In [2]:
print("Welcome to the MultiDoc Memory Chatbot Notebook!")

Welcome to the MultiDoc Memory Chatbot Notebook!


In [3]:
# Loading documents from a folder

PARSED_FOLDER = Path("Parsed_doc")
PARSED_FOLDER.mkdir(exist_ok=True)

In [4]:
SUPPORTED_EXTS = [".pdf", ".docx", ".pptx", ".md", ".txt", ".xlsx"]

In [5]:
llama_parser = LlamaParse(
    api_key=api_key,
    result_type="markdown",
    verbose=True,
    system_prompt="Extract structured content and preserve formatting as Markdown.",
)

In [6]:




def save_markdown(content: str, filename: str, folder=PARSED_FOLDER):
    save_path = folder / filename
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(content)
    print(f" Saved {save_path}")

def parse_pdf(file_path: Path):
    print(f"Parsing PDF: {file_path.name}")
    try:
        loader = UnstructuredPDFLoader(str(file_path))
        docs = loader.load()
        text = "\n\n".join(doc.page_content for doc in docs)
        if text.strip():
            return text
    except Exception as e:
        print(f"PDF parse error: {e}")
    return None

def parse_docx(file_path: Path):
    print(f"Parsing DOCX with LlamaParse: {file_path.name}")
    try:
        parsed = llama_parser.load_data(str(file_path))
        text = "\n\n".join(
            part.get("text", "") if isinstance(part, dict) else getattr(part, "text", getattr(part, "page_content", ""))
            for part in parsed
        )
        if text.strip():
            return text
    except Exception as e:
        print(f"DOCX parse error: {e}")
    return None

def parse_pptx(file_path: Path):
    print(f"Parsing PPTX: {file_path.name}")
    try:
        prs = Presentation(str(file_path))
        content = ""
        for i, slide in enumerate(prs.slides, 1):
            content += f"## Slide {i}\n"
            for shape in slide.shapes:
                if hasattr(shape, "text") and shape.text.strip():
                    content += shape.text.strip() + "\n\n"
        if content.strip():
            return content
    except Exception as e:
        print(f"PPTX parse error: {e}")
    return None

def parse_xlsx(file_path: Path):
    print(f"Parsing XLSX: {file_path.name}")
    try:
        import tabulate
    except ImportError:
        print("Missing 'tabulate' package. Run: pip install tabulate")
        return None

    try:
        xls = pd.ExcelFile(str(file_path))
        content = ""
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name)
            if df.empty:
                continue
            content += f"## Sheet: {sheet_name}\n\n"
            content += df.to_markdown(index=False) + "\n\n"
        if content.strip():
            return content
    except Exception as e:
        print(f"XLSX parse error: {e}")
    return None

def parse_md(file_path: Path):
    print(f"Reading Markdown: {file_path.name}")
    try:
        text = file_path.read_text(encoding="utf-8")
        if text.strip():
            return text
    except Exception as e:
        print(f"Markdown read error: {e}")
    return None

def parse_txt(file_path: Path):
    print(f"Reading TXT: {file_path.name}")
    try:
        text = file_path.read_text(encoding="utf-8")
        if text.strip():
            return f"```\n{text}\n```"
    except Exception as e:
        print(f"TXT read error: {e}")
    return None

def fallback_loader(file_path: Path):
    ext = file_path.suffix.lower()
    print(f"Attempting fallback loader for {file_path.name}")
    try:
        if ext == ".docx":
            loader = UnstructuredWordDocumentLoader(str(file_path))
        elif ext == ".pptx":
            loader = UnstructuredPowerPointLoader(str(file_path))
        elif ext == ".xlsx":
            loader = UnstructuredExcelLoader(str(file_path))
        elif ext == ".md":
            loader = UnstructuredMarkdownLoader(str(file_path))
        elif ext == ".txt":
            loader = TextLoader(str(file_path))
        else:
            print(f"No fallback loader available for {file_path.name}")
            return None

        docs = loader.load()
        text = "\n\n".join(doc.page_content for doc in docs)
        if text.strip():
            print(f"Fallback loader succeeded for {file_path.name}")
            return text
    except Exception as e:
        print(f"Fallback loader error: {e}")
    return None

def parse_and_save_file(file_path: Path):
    ext = file_path.suffix.lower()
    content = None

    if ext == ".pdf":
        content = parse_pdf(file_path)
    elif ext == ".docx":
        content = parse_docx(file_path)
    elif ext == ".pptx":
        content = parse_pptx(file_path)
    elif ext == ".xlsx":
        content = parse_xlsx(file_path)
    elif ext == ".md":
        content = parse_md(file_path)
    elif ext == ".txt":
        content = parse_txt(file_path)
    else:
        print(f"Unsupported file type: {file_path.name}")

    # If no content from primary parser, try fallback for all except pdf (which uses unstructured)
    if not content and ext != ".pdf":
        content = fallback_loader(file_path)

    if content:
        save_markdown(content, f"{file_path.stem}.md")
        return {"page_content": content, "source": file_path.name}
    else:
        print(f"Failed to parse {file_path.name}")
        return None



In [7]:
def load_and_parse_folder(folder_path: str):
    folder = Path(folder_path)
    docs = []
    for file_path in folder.glob("*"):
        if file_path.suffix.lower() in SUPPORTED_EXTS:
            doc = parse_and_save_file(file_path)
            if doc:
                docs.append(doc)
        else:
            print(f"Skipping unsupported file: {file_path.name}")
    print(f"\nTotal documents parsed and saved: {len(docs)}")
    return docs


In [8]:
CHUNKS_FOLDER = Path("Chunks_doc")
CHUNKS_FOLDER.mkdir(exist_ok=True)

In [9]:



def save_chunk(content: str, filename: str):
    save_path = CHUNKS_FOLDER / filename
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Saved chunk {save_path}")

def split_and_save_chunks(content: str, base_filename: str, chunk_size=1000, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = splitter.split_text(content)
    print(f"Split into {len(chunks)} chunks")

    for i, chunk in enumerate(chunks, 1):
        chunk_filename = f"{base_filename}_chunk{i}.md"
        save_chunk(chunk, chunk_filename)


In [10]:
# Chunking the documents using RecursiveCharacterTextSplitter
# 1. Parse all documents in folder and save full markdowns
docs = load_and_parse_folder("../data/")  # adjust path as needed
# Change your folder path accordingly

Parsing PPTX: day01_workshop.pptx
 Saved Parsed_doc\day01_workshop.md
Reading Markdown: markdownfile.md
 Saved Parsed_doc\markdownfile.md
Parsing DOCX with LlamaParse: msword_file.docx
Started parsing the file under job_id ccdee811-2f67-44cb-8db2-ffb991a3cdb2
 Saved Parsed_doc\msword_file.md
Parsing PDF: pdffile.pdf
 Saved Parsed_doc\pdffile.md
Reading TXT: random.txt
 Saved Parsed_doc\random.md
Parsing XLSX: student_data.xlsx
 Saved Parsed_doc\student_data.md

Total documents parsed and saved: 6


In [11]:
# 2. Split each parsed document into chunks and save chunks
for doc in docs:
    content = doc["page_content"]
    source_name = doc["source"]
    base_name = source_name.rsplit(".", 1)[0]  # Remove extension
    split_and_save_chunks(content, base_name)


Split into 3 chunks
Saved chunk Chunks_doc\day01_workshop_chunk1.md
Saved chunk Chunks_doc\day01_workshop_chunk2.md
Saved chunk Chunks_doc\day01_workshop_chunk3.md
Split into 3 chunks
Saved chunk Chunks_doc\markdownfile_chunk1.md
Saved chunk Chunks_doc\markdownfile_chunk2.md
Saved chunk Chunks_doc\markdownfile_chunk3.md
Split into 31 chunks
Saved chunk Chunks_doc\msword_file_chunk1.md
Saved chunk Chunks_doc\msword_file_chunk2.md
Saved chunk Chunks_doc\msword_file_chunk3.md
Saved chunk Chunks_doc\msword_file_chunk4.md
Saved chunk Chunks_doc\msword_file_chunk5.md
Saved chunk Chunks_doc\msword_file_chunk6.md
Saved chunk Chunks_doc\msword_file_chunk7.md
Saved chunk Chunks_doc\msword_file_chunk8.md
Saved chunk Chunks_doc\msword_file_chunk9.md
Saved chunk Chunks_doc\msword_file_chunk10.md
Saved chunk Chunks_doc\msword_file_chunk11.md
Saved chunk Chunks_doc\msword_file_chunk12.md
Saved chunk Chunks_doc\msword_file_chunk13.md
Saved chunk Chunks_doc\msword_file_chunk14.md
Saved chunk Chunks_doc

In [12]:

# Folder with chunk files
CHUNKS_FOLDER = Path("Chunks_doc")

# Initialize HuggingFace embedding model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)


  embeddings = HuggingFaceEmbeddings(model_name=model_name)


In [13]:


# Load chunk documents
chunk_docs = []
for chunk_file in CHUNKS_FOLDER.glob("*.md"):
    text = chunk_file.read_text(encoding="utf-8")
    chunk_docs.append(Document(page_content=text, metadata={"source": chunk_file.name}))

print(f"Loaded {len(chunk_docs)} chunks.")

# Initialize Chroma vectorstore (persists to ./chromadb/)
vectorstore = Chroma.from_documents(documents=chunk_docs, embedding=embeddings, collection_name="my_docs",persist_directory="./chromadb/")

print("Chroma vectorstore created and populated.")




Loaded 69 chunks.
Chroma vectorstore created and populated.


In [14]:
# Query example
query = "Explain management principles"

results = vectorstore.similarity_search(query, k=3)

for i, doc in enumerate(results, 1):
    print(f"Result {i} (source: {doc.metadata['source']}):\n{doc.page_content[:500]}...\n---\n")

Result 1 (source: msword_file_chunk32.md):
### How It Makes E-Commerce Better
- **Detailed Product Information**: E-commerce sites offer extensive details (e.g., specifications, reviews, comparisons) that help customers make informed decisions.
- **Price Transparency**: Customers can compare prices across multiple sellers instantly, ensuring they get the best deal.
- **Real-Time Updates**: Businesses can update stock, prices, or promotions instantly, keeping information current.
- **Customer Insights for Businesses**: E-commerce platform...
---

Result 2 (source: msword_file_chunk32.md):
### How It Makes E-Commerce Better
- **Detailed Product Information**: E-commerce sites offer extensive details (e.g., specifications, reviews, comparisons) that help customers make informed decisions.
- **Price Transparency**: Customers can compare prices across multiple sellers instantly, ensuring they get the best deal.
- **Real-Time Updates**: Businesses can update stock, prices, or promotions inst

In [15]:
metadata_field_info = [
    AttributeInfo(name="source", description="Name of the document file", type="string"),
    AttributeInfo(name="file_type", description="Type of file, like pdf, docx, pptx", type="string"),
]

In [16]:
llm = ChatGroq(
    api_key=os.getenv("GROQ_API_KEY"),
    model="llama3-8b-8192"
)


In [17]:
document_content_description = "Content of documents which may include PDFs, Word files, slides, and text containing varied topics."



retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

In [18]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema.runnable import RunnableSequence, RunnableLambda
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.memory.chat_message_histories import FileChatMessageHistory
import os

In [19]:

# %% Define chat prompt with conversation history placeholder
qa_prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        "You are a highly reliable assistant helping users understand documents. "
        "Always answer ONLY using the provided context and conversation history. "
        "If the answer is not present in the context, clearly state that you do not know or that the information is not available. "
        "Do NOT make up or infer facts that are not explicitly in the context."
    ),
    MessagesPlaceholder(variable_name="history"),
    HumanMessagePromptTemplate.from_template(
        """
Context:
{context}

Question:
{question}
"""
    ),
])

In [20]:
output_parser = StrOutputParser()

In [21]:
# %% Function to format retrieved docs into a single string
def format_docs(docs):
    if isinstance(docs, list):
        return "\n\n".join(doc.page_content for doc in docs)
    raise ValueError("Expected a list of Document objects.")

# %% Build the retrieval chain, now passing the 'history' along
retrieval_chain = RunnableSequence(
    {
        "context": RunnableLambda(lambda x: x["question"]) | retriever | RunnableLambda(format_docs),
        "question": RunnableLambda(lambda x: x["question"]),
        "history": RunnableLambda(lambda x: x.get("history", [])),  # Pass chat history
    }
    | qa_prompt
    | llm
    | output_parser
)

In [22]:
os.makedirs("memory", exist_ok=True)

def get_file_history(session_id: str) -> FileChatMessageHistory:
    file_path = f"memory/{session_id}.json"
    return FileChatMessageHistory(file_path=file_path)


# %% Wrap chain with memory support
retrieval_chain_with_memory = RunnableWithMessageHistory(
    runnable=retrieval_chain,
    get_message_history=get_file_history,
    get_session_history=get_file_history,  # add this!
    input_messages_key="question",
    history_messages_key="history",
)


In [23]:
# %% Helper function to run queries easily with session memory
def chat_with_memory(question: str, session_id: str = "default_session"):
    result = retrieval_chain_with_memory.invoke(
        {"question": question},
        config={"configurable": {"session_id": session_id}},
    )
    return result

# %% Run a sample question and print response
response = chat_with_memory("What is MicroController?", session_id="user123")
print(response)


According to the provided context, a MicroController is a highly integrated device that includes, on one chip, all or most of the parts needed to perform an application control function. It has bit manipulation instructions, easy and direct access to I/O, and quick and efficient interrupt processing.


In [24]:
# %% Run a sample question and print response
response = chat_with_memory("Explain it more?", session_id="user123")
print(response)


I apologize, but the provided context does not mention MicroController. It appears to be discussing Photo and Video Sharing Platforms, specifically Instagram, YouTube, and Snapchat, and their role in e-commerce.


In [25]:
# %% Run a sample question and print response
response = chat_with_memory("What is microcontroller explain?", session_id="user123")
print(response)


Based on the provided context, a microcontroller is a highly integrated device that combines multiple components onto a single chip. This includes processing, memory, and input/output (I/O) components. The text highlights the key features of a microcontroller, including:

1. Bit manipulation instructions: Microcontrollers can perform bit-level operations, which enables them to efficiently manipulate digital signals.
2. Easy and direct access to I/O: Microcontrollers have direct access to input/output components, making it easy to interact with external devices and sensors.
3. Quick and efficient interrupt processing: Microcontrollers can quickly respond to interrupts, which enables them to efficiently handle events and changes in the system.

The miniaturization process has made it possible to fit all the necessary components onto a single chip, which is much smaller and more cost-effective than traditional methods.


In [26]:
response= chat_with_memory("My name is Arunb", session_id="user124")
print(response)

I'm afraid I don't see your name, Arunb, in the provided document. It seems that the document only contains information about students with IDs 1001 to 1004, and their names are Alice Johnson, Bob Smith, Clara Davis, and David Wilson. I don't know your name or any other information about you as it's not present in the given context.


In [27]:
response= chat_with_memory("What is my name ", session_id="user124")
print(response)

I apologize, but I don't see your name in the provided context. The only name mentioned is Emma Moore. I don't know your name as it's not present in the given document.
