In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install -q langchain sentence-transformers transformers accelerate faiss-cpu pypdf python-docx gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.5/323.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.30-py3-none-any.whl.metadata (3.0 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.3.30-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [18]:
import os, glob, json
import torch
from pathlib import Path

from langchain.document_loaders import DirectoryLoader, PyPDFLoader, TextLoader, CSVLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

from transformers import pipeline

print("Torch CUDA available:", torch.cuda.is_available())
DEVICE = 0 if torch.cuda.is_available() else -1

Torch CUDA available: True


In [10]:
pip install "unstructured[pdf]"

Collecting onnx>=1.17.0 (from unstructured[pdf])
  Downloading onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.0 kB)
Collecting onnxruntime>=1.19.0 (from unstructured[pdf])
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting pdf2image (from unstructured[pdf])
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pdfminer.six (from unstructured[pdf])
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pikepdf (from unstructured[pdf])
  Downloading pikepdf-9.11.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.2 kB)
Collecting pi-heif (from unstructured[pdf])
  Downloading pi_heif-1.1.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.5 kB)
Collecting google-cloud-vision (from unstructured[pdf])
  Downloading google_cloud_vision-3.10.2-py3-none-any.whl.metadata (9.6 kB)
Collecting effde

In [14]:
DATA_DIR = "/content/drive/MyDrive/AI projects/PDF"
# DirectoryLoader will pick up many file types automatically
loader = DirectoryLoader(DATA_DIR, glob="**/*")
raw_docs = loader.load()
print("Loaded documents:", len(raw_docs))

# If you want to inspect sources:
for d in raw_docs[:6]:
    print("----source:", d.metadata.get("source", "unknown")[:200])
    print("text sample:", d.page_content[:200].replace("\n", " "))


Loaded documents: 1
----source: /content/drive/MyDrive/AI projects/PDF/NLP with Transformer models.pdf
text sample: Revised Edition Natural Language Processing with Transformers  Building Language Applications with Hugging Face  v  o  T  n  u  &  W  n  s  W  e  t  a  r  o  r  l  a  l  l  f  ,  Lewis Tunstall, Leand


In [16]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=200)
docs = splitter.split_documents(raw_docs)
print("Number of Chunks:", len(docs))

for d in docs[:3]:
  print('meta:', d.metadata)
  print('len text:', len(d.page_content))

Number of Chunks: 802
meta: {'source': '/content/drive/MyDrive/AI projects/PDF/NLP with Transformer models.pdf'}
len text: 1101
meta: {'source': '/content/drive/MyDrive/AI projects/PDF/NLP with Transformer models.pdf'}
len text: 1138
meta: {'source': '/content/drive/MyDrive/AI projects/PDF/NLP with Transformer models.pdf'}
len text: 683


In [20]:
from langchain.embeddings import HuggingFaceEmbeddings
# Use a small, free embedding model
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

# Build an in-memory FAISS vector store
vectordb = FAISS.from_documents(docs, embeddings)
print("FAISS index created with", len(docs), "vectors")


  embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS index created with 802 vectors


In [21]:
PERSIST_DIR = "/content/drive/MyDrive/rag_index"
os.makedirs(PERSIST_DIR, exist_ok=True)
vectordb.save_local(PERSIST_DIR)
print("Saved index to", PERSIST_DIR)

# To load later (new session):
# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
# vectordb = FAISS.load_local(PERSIST_DIR, embeddings)


Saved index to /content/drive/MyDrive/rag_index


In [22]:
LLM_MODEL = "google/flan-t5-base"   # small, free, text2text
pipe = pipeline(
    "text2text-generation",
    model=LLM_MODEL,
    device=DEVICE,
    max_new_tokens=256,
    do_sample=False  # deterministic answers
)
llm = HuggingFacePipeline(pipeline=pipe)

# Build retrieval chain; ask it to return source_documents so we can show citations
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # simple approach: stuff retrieved docs into prompt
    retriever=vectordb.as_retriever(search_kwargs={"k": 4}),
    return_source_documents=True
)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


In [42]:
def ask(query):
    res = qa({"query": query})
    answer = res["result"]
    srcs = res.get("source_documents", [])
    print("→\nANSWER:\n", answer)
    print("\nSOURCES:")
    for s in srcs:
        print("-", s.metadata.get("source", "unknown"))

# Example:
ask("how many freely available models The Hugging Face Hub hosts ")


→
ANSWER:
 over 20,000 freely available models

SOURCES:
- /content/drive/MyDrive/AI projects/PDF/NLP with Transformer models.pdf
- /content/drive/MyDrive/AI projects/PDF/NLP with Transformer models.pdf
- /content/drive/MyDrive/AI projects/PDF/NLP with Transformer models.pdf
- /content/drive/MyDrive/AI projects/PDF/NLP with Transformer models.pdf


In [43]:
import gradio as gr

def respond(query):
    res = qa({"query": query})
    answer = res["result"]
    srcs = [d.metadata.get("source", "unknown") for d in res.get("source_documents", [])]
    return answer, "\n".join(srcs)

demo = gr.Interface(
    fn=respond,
    inputs=gr.Textbox(lines=2, placeholder="Ask about your documents..."),
    outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Sources")],
    title="CHATBOT for NoteBooks"
)
demo.launch(share=True)  # share=True gives a public temporary link


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://201771f0606de718ad.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


