# In [None]:
import os

import chromadb
import langchain
import matplotlib.pyplot as plt
import pdfplumber
import torch
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from PIL import ImageGrab
from transformers import AutoModel, AutoTokenizer, pipeline

# Step 1: Load Document Using LangChain
# Read the sample PDF through LangChain's PyPDFLoader.
pdf_path = "sample_paper.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Preview the opening 1000 characters of the first loaded document, then grab
# a screenshot and store it as pdf_loader.png.
first_document = documents[0]
first_1000_chars = first_document.page_content[:1000]
print(first_1000_chars)
img = ImageGrab.grab()
img.save("pdf_loader.png")

# Step 2: Apply Text Splitting Techniques
# The sample is LaTeX source, so build the splitter with LangChain's
# LaTeX-aware separators instead of the generic prose separators; chunk size
# and overlap are unchanged.
latex_text = """\\documentclass{article} ... \\end{document}"""  # Full LaTeX text
splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.LATEX,
    chunk_size=300,
    chunk_overlap=50,
)
split_docs = splitter.create_documents([latex_text])

# Print the first chunk, then grab a screenshot and store it as
# code_splitter.png.
print(split_docs[0].page_content)
img = ImageGrab.grab()
img.save("code_splitter.png")

# Step 3: Embed Documents
# Create the HuggingFace embedding wrapper and embed one sample query with it.
query = "How are you?"
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
query_embedding = embedding_model.embed_query(query)

# Show the first five components of the embedding, then grab a screenshot and
# store it as embedding.png.
leading_values = query_embedding[:5]
print(leading_values)
img = ImageGrab.grab()
img.save("embedding.png")

# Step 4: Create and Configure Vector Databases
# Index the PDF loaded in Step 1 rather than the LaTeX placeholder from
# Step 2: the retriever and QA bot built on this store ask about the paper,
# so the store has to contain the paper's text. The placeholder would also
# yield fewer chunks than the k=5 requested below.
# A dedicated splitter is created here so this step does not depend on how the
# splitter for the LaTeX sample is configured.
pdf_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
pdf_chunks = pdf_splitter.split_documents(documents)
chroma_db = Chroma.from_documents(pdf_chunks, embedding_model)
query = "Smoking policy"
similar_results = chroma_db.similarity_search(query, k=5)

# Print the retrieved chunks, then grab a screenshot and store it as
# vectordb.png.
print(similar_results)
img = ImageGrab.grab()
img.save("vectordb.png")

# Step 5: Develop a Retriever
# Wrap the Chroma store in a similarity retriever that returns two results
# per query, then run one sample query through it.
query = "Email policy"
search_options = {"k": 2}
retriever = chroma_db.as_retriever(
    search_type="similarity",
    search_kwargs=search_options,
)
retrieved_docs = retriever.get_relevant_documents(query)

# Print the retrieved segments, then grab a screenshot and store it as
# retriever.png.
print(retrieved_docs)
img = ImageGrab.grab()
img.save("retriever.png")

# Step 6: Construct a QA Bot
# Build a local text-generation pipeline and wrap it in a RetrievalQA chain
# that answers from the chunks returned by `retriever`.
llm = HuggingFacePipeline.from_model_id(
    # Hugging Face Hub repository id. The earlier spelling ("...-instruct-v01")
    # is not a Hub repository (it looks like a hosted-service model alias), so
    # the model could not be downloaded.
    model_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    task="text-generation",
    # Give generation an explicit token budget; without it the pipeline uses
    # the library's short default length, which the retrieval prompt alone
    # already exceeds.
    pipeline_kwargs={"max_new_tokens": 256},
)
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
query = "What this paper is talking about?"
response = qa_chain.run(query)

# Print the answer, then grab a screenshot and store it as QA_bot.png.
print(response)
img = ImageGrab.grab()
img.save("QA_bot.png")
