In [None]:
# Attach the user's Google Drive to the Colab filesystem so that files stored
# in Drive can be read through ordinary paths under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Sanity check: list the top level of MyDrive (long format, human-readable
# sizes) to confirm that the Drive mount is visible and to see which files
# are available at this path.
!ls -lh "/content/drive/MyDrive"

total 11M
-rw------- 1 root root  73K Aug 19 09:11  basic-text.pdf
drwx------ 2 root root 4.0K Mar 27  2024 'Colab Notebooks'
-rw------- 1 root root 348K Aug  3  2023  _DSC2237---01.jpg
drwx------ 4 root root 4.0K Jul 12 05:52  ROS_LAB
-rw------- 1 root root 3.2M Mar 16  2024 'Translation 13-Mar-2024 11-42-16.pdf'
-rw------- 1 root root 6.9M Aug 12 10:46  Unit-1-ccnotes.pdf


In [None]:
!pip install PyPDF2



In [None]:
!pip install langchain langchain-community sentence-transformers faiss-cpu pypdf

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFaceHub
from transformers import pipeline

In [None]:
# Load the PDF from the mounted Drive and split it into overlapping chunks
# that are small enough to embed and retrieve individually.
loader = PyPDFLoader("/content/drive/MyDrive/basic-text.pdf")
documents = loader.load()

# chunk_size / chunk_overlap are measured by the splitter's length function
# (characters by default); the overlap keeps context across chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# A PDF with no extractable text (e.g. scanned images only) produces no chunks.
# Fail here with a clear message instead of an IndexError on chunks[0] below
# or an obscure failure later when the vector index is built.
if not chunks:
    raise ValueError(
        "No text chunks were produced from the PDF; "
        "it may be empty or contain only scanned images."
    )

print(f"Total Chunks Created: {len(chunks)}")
print("\n--- Sample Chunk ---\n")
# Preview only the first 300 characters of the first chunk.
print(chunks[0].page_content[:300])

Total Chunks Created: 2

--- Sample Chunk ---

Sample Document for PDF Testing
Introduction
This is a simple document created to test basic PDF functionality. It includes various text formatting
options to ensure proper rendering in PDF readers.
Text Formatting Examples
1. Bold text is used for emphasis.
2. Italic text can be used for titles or 


In [None]:
# Sentence-embedding model used to turn each text chunk (and, later, each
# query) into a vector.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [None]:
# Embed every chunk with the embedding model and store the vectors in a
# FAISS index.
db = FAISS.from_documents(
    documents=chunks,
    embedding=embedding_model,
)

In [None]:
# Expose the vector store as a retriever that performs a similarity search
# and asks for the 3 closest chunks per query.
retriever = db.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

In [None]:
# Import from langchain_community (like the other integrations in this
# notebook); the `langchain.llms` path is deprecated. The import is placed
# first so the cell's dependencies are visible at a glance.
from langchain_community.llms import HuggingFacePipeline

# Local text2text-generation pipeline backed by FLAN-T5 small; generation is
# capped at 200 new tokens per call.
local_llm = pipeline("text2text-generation", model="google/flan-t5-small", max_new_tokens=200)

# Wrap the transformers pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=local_llm)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [None]:
# Build the question-answering chain: the retriever supplies context chunks
# and the LLM produces the answer. The retrieved source documents are
# returned together with the answer so they can be displayed.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    return_source_documents=True,
    retriever=retriever,
)

In [None]:
# Ask a question against the indexed PDF and show the answer together with
# the chunks that were retrieved as context.
query = "What is the main topic discussed in this PDF?"

# Use .invoke(): calling the chain object directly is deprecated.
result = qa_chain.invoke({"query": query})

# The marker before "Answer:" was mis-encoded (mojibake); restored to the
# intended emoji.
print("\n🔹 Answer:")
print(result["result"])

print("\nSources:")
for doc in result["source_documents"]:
    # Show the page recorded in the chunk metadata and a 200-character preview.
    print(f"Page {doc.metadata['page']}:\n{doc.page_content[:200]}\n")


🔹 Answer:
Sample Document for PDF Testing Introduction This is a simple document created to test basic PDF functionality. It includes various text formatting options to ensure proper rendering in PDF readers. Text Formatting Examples 1. Bold text is used for emphasis. 2. Italic text is used for titles or subtle emphasis. 3. Strikethrough is used to show deleted text. Lists Here's an example of an ordered list: 1. First item 2. Second item 3. Third item Quote This is an example of a block quote. It can be used to highlight important information or citations. Table Header 1 Header 2 Header 3 Row 1, Col 1 Row 1, Col 2 Row 1, Col 3 Row 2, Col 1 Row 2, Col 2 Row 2, Col 3 This document demonstrates various formatting options that should translate well to PDF format. This sample PDF file is provided by Sample-Files.com. Visit us for more sample files and resources.

📄 Sources:
Page 0:
Sample Document for PDF Testing
Introduction
This is a simple document created to test basic PDF funct