In [18]:
from azure.storage.blob import BlobServiceClient
from langchain_community.document_loaders import PyPDFLoader
from docx import Document as DocxDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.docstore.document import Document
import faiss
import numpy as np
import tempfile, os, re, pickle
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [19]:
# Print the version of the LangChain package that the kernel imports.
import langchain
print(langchain.__version__)


1.2.0


In [20]:
# Core packages
!pip install streamlit pyPDF2

# LangChain packages
!pip install langchain openai faiss-cpu

# Azure Blob Storage
!pip install azure-storage-blob

# Optional for HuggingFace embeddings (if you want to remove OpenAI dependency later)
!pip install sentence-transformers




In [21]:
# ==========================
# Azure Blob Config
# ==========================
import os
from pathlib import Path  # Path is used below; without this import the cell raises NameError
from dotenv import load_dotenv

# Load settings from the .env file in the current working directory.
env_path = Path.cwd() / ".env"
load_dotenv(env_path)

AZURE_CONNECTION_STRING = os.getenv("AZURE_CONNECTION_STRING")
CONTAINER_NAME = os.getenv("CONTAINER_NAME")
# Either variable name is accepted for the Gemini key.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_GEMINI_API_KEY")

# Create the container client here so the document-loading cell does not rely
# on a variable left over from a previously executed cell.
blob_service_client = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
container_client = blob_service_client.get_container_client(CONTAINER_NAME)

In [23]:

# ==========================
# Text cleaning
# ==========================
def clean_text(text):
    """Normalize extracted document text into one whitespace-collapsed string.

    Control characters (C0 range, DEL and C1 range) that are not whitespace
    are removed. Every run of whitespace, including line breaks and tabs, is
    then collapsed to a single space and the result is stripped.

    Whitespace control characters are excluded from the removal step on
    purpose: deleting a line break outright would join the words on either
    side of it instead of leaving a separator between them.

    Args:
        text: Raw text extracted from a document.

    Returns:
        str: The cleaned text.
    """
    # (?!\s) keeps newline/tab/etc. out of the deletion so they survive until
    # the whitespace-collapsing step turns them into a single space.
    text = re.sub(r'(?!\s)[\x00-\x1f\x7f-\x9f]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [24]:
# ==========================
# Load documents (PDF + DOCX + TXT)
# ==========================
# Download every supported blob from the container and collect its cleaned text.
documents_text = []

for blob in container_client.list_blobs():
    # Derive the extension once so the filter and the dispatch below always
    # agree (a blob named exactly ".pdf" has no extension and is skipped).
    ext = os.path.splitext(blob.name)[1].lower()
    if ext not in (".pdf", ".docx", ".txt"):
        continue

    # The loaders read from a file path, so the blob is staged in a temporary
    # file. delete=False because the file is reopened by name after closing.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
    try:
        with tmp_file:
            tmp_file.write(container_client.get_blob_client(blob.name).download_blob().readall())

        if ext == ".pdf":
            text = "\n".join([p.page_content for p in PyPDFLoader(tmp_file.name).load()])

        elif ext == ".docx":
            doc = DocxDocument(tmp_file.name)
            text = "\n".join([p.text for p in doc.paragraphs])

        elif ext == ".txt":
            with open(tmp_file.name, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
    finally:
        # Remove the staged copy even when the download or parsing fails.
        os.remove(tmp_file.name)

    documents_text.append(clean_text(text))

In [25]:
# ==========================
# Chunking
# ==========================
# Split each cleaned document with the recursive splitter and gather all
# resulting chunks into one flat list.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
all_chunks = [
    chunk
    for txt in documents_text
    for chunk in splitter.split_text(txt)
]


In [26]:
# Report how many chunks the splitter produced.
print(f"Total chunks: {len(all_chunks)}")

Total chunks: 67


In [29]:
# ==========================
# Embeddings + FAISS
# ==========================
# Gemini embedding client, configured with the key loaded from the environment.
embedding_model = GoogleGenerativeAIEmbeddings(api_key=GEMINI_API_KEY, model="models/embedding-001")





Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.


In [30]:
# Wrap each chunk in a Document, embed the chunk texts, and stack the
# resulting vectors into a float32 array.
documents = [Document(page_content=chunk) for chunk in all_chunks]
embeddings = embedding_model.embed_documents(
    [document.page_content for document in documents]
)
embeddings_np = np.array(embeddings, dtype=np.float32)

In [31]:

# Fail with a clear message when there is nothing to index; an empty corpus
# yields a 1-D array, on which embeddings_np.shape[1] raises an opaque IndexError.
if embeddings_np.ndim != 2 or embeddings_np.shape[0] == 0:
    raise ValueError("No embeddings to index: no text chunks were produced.")

# Flat L2 index sized to the embedding dimension, filled with all chunk vectors.
faiss_index = faiss.IndexFlatL2(embeddings_np.shape[1])
faiss_index.add(embeddings_np)


In [32]:
# Persist the chunk texts and the FAISS index so both can be reloaded later.
with open("chunks.pkl", "wb") as f:
    pickle.dump(all_chunks, f)
faiss.write_index(faiss_index, "faiss_index.idx")

In [33]:
# ==========================
# Helpers
# ==========================
def load_faiss():
    """Load the persisted FAISS index and its chunk list from disk.

    Returns:
        tuple: ``(index, chunks)`` read from ``faiss_index.idx`` and
        ``chunks.pkl`` in the current working directory.
    """
    index = faiss.read_index("faiss_index.idx")
    # A context manager closes the file handle deterministically instead of
    # leaving it open until garbage collection.
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook itself wrote.
    with open("chunks.pkl", "rb") as f:
        chunks = pickle.load(f)
    return index, chunks

In [34]:

def embed_query(q):
    """Embed the query text ``q`` and return the vector as a float32 array."""
    vector = embedding_model.embed_query(q)
    return np.array(vector, dtype=np.float32)

In [35]:
def retrieve(q_emb, index, chunks, k=1):
    """Return the chunks closest to a query embedding.

    Args:
        q_emb: Query embedding array; reshaped to a single-row matrix.
        index: Object whose ``search(matrix, k)`` returns ``(distances, ids)``,
            such as a FAISS index.
        chunks: Sequence of chunk texts addressed by the returned ids.
        k: Number of neighbours to request.

    Returns:
        list: Chunks for the valid ids, in the order the index returned them.
    """
    _, ids = index.search(q_emb.reshape(1, -1), k)
    # FAISS fills unused result slots with -1 when fewer than k vectors exist.
    # chunks[-1] would silently return the LAST chunk, so those slots are skipped.
    return [chunks[i] for i in ids[0] if i >= 0]


In [36]:
def retrieve(q_emb, index, chunks, k=1):
    """Return the chunks closest to a query embedding.

    Args:
        q_emb: Query embedding array; reshaped to a single-row matrix.
        index: Object whose ``search(matrix, k)`` returns ``(distances, ids)``,
            such as a FAISS index.
        chunks: Sequence of chunk texts addressed by the returned ids.
        k: Number of neighbours to request.

    Returns:
        list: Chunks for the valid ids, in the order the index returned them.
    """
    _, ids = index.search(q_emb.reshape(1, -1), k)
    # FAISS fills unused result slots with -1 when fewer than k vectors exist.
    # chunks[-1] would silently return the LAST chunk, so those slots are skipped.
    return [chunks[i] for i in ids[0] if i >= 0]

def build_prompt(context, question):
    """Assemble the prompt text from the retrieved context and the question.

    The prompt starts with the instruction line, then the context section,
    then the question, and ends with an open "Answer:" line.
    """
    return (
        "\nUse ONLY the context to answer.\n"
        "\nContext:\n"
        f"{context}\n"
        "\n"
        f"Question: {question}\n"
        "Answer:\n"
    )

In [37]:
def safe_post(url, headers, json_data):
    """POST a JSON body through a session configured with retries and a timeout.

    Args:
        url: Target URL.
        headers: HTTP headers to send with the request.
        json_data: JSON-serializable request body.

    Returns:
        requests.Response: The response to the POST request.
    """
    retry = Retry(total=3, backoff_factor=0.5)
    # The session is used as a context manager so its connection pool is
    # released after the call instead of leaking one pool per request.
    with requests.Session() as session:
        session.mount("https://", HTTPAdapter(max_retries=retry))
        # The request is not streamed, so the body is already read when post()
        # returns and the response stays usable after the session closes.
        return session.post(url, headers=headers, json=json_data, timeout=30)

In [41]:
def generate_answer(context, question):
    """Ask the Gemini generateContent endpoint to answer from the given context.

    Args:
        context: Retrieved text the answer must be based on.
        question: The user's question.

    Returns:
        str: Text of the first part of the first candidate in the response.

    Raises:
        requests.HTTPError: If the API responds with an error status code.
    """
    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
    data = {
        "contents": [{"parts": [{"text": build_prompt(context, question)}]}]
    }
    # The key travels in a header rather than the query string so that it does
    # not appear in URLs shown by logs, tracebacks, or proxies.
    headers = {"Content-Type": "application/json", "x-goog-api-key": GEMINI_API_KEY}
    res = safe_post(url, headers, data)
    # Surface HTTP failures directly instead of an unrelated KeyError below.
    res.raise_for_status()
    return res.json()["candidates"][0]["content"]["parts"][0]["text"]

In [42]:
# ==========================
# QUESTIONS & ANSWERS
# ==========================
# Reload the persisted index and chunk list, then answer every sample
# question from its single best-matching chunk.
faiss_index, chunks = load_faiss()

questions = [
    "What is React JS?",
    "What is supervised learning?",
    "What is Python?",
    "What is Virtual DOM?",
    "Write a program to check palindrome?",
    "What is DBSCAN?",
]

for q in questions:
    top_chunk = retrieve(embed_query(q), faiss_index, chunks, k=1)
    ans = generate_answer("\n\n".join(top_chunk), q)
    print(f"\nQ: {q}", f"A: {ans}", sep="\n")


Q: What is React JS?
A: React JS is an open-source JavaScript library developed by Facebook for building fast and interactive user interfaces, especially single-page applications.

Q: What is supervised learning?
A: Supervised learning is a type of machine learning where the model learns from labelled data. Each training example consists of input pairs and expected outputs (labels). The algorithms aim to map inputs to outputs.

Q: What is Python?
A: Python is a high-level, interpreted, object-oriented programming language known for its simplicity and readability.

Q: What is Virtual DOM?
A: Virtual DOM is a lightweight copy of the real DOM. React updates the Virtual DOM first and then efficiently updates only the changed parts in the real DOM.

Q: Write a program to check palindrome?
A: ```python
s = input("Enter string: ")
if s == s[::-1]: 
    print("Palindrome")
else: 
    print("Not Palindrome")
```

Q: What is DBSCAN?
A: DBSCAN is a density-based clustering algorithm that groups 

In [17]:
from azure.storage.blob import BlobServiceClient
from langchain_community.document_loaders import PyPDFLoader
from docx import Document as DocxDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.docstore.document import Document
import faiss
import numpy as np
import tempfile, os, re, pickle
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# ==========================
# Azure Blob Config
# ==========================
from dotenv import load_dotenv
from pathlib import Path
import os
from azure.storage.blob import BlobServiceClient



# Load settings from the .env file in the current working directory.
env_path = Path.cwd() / ".env"
load_dotenv(env_path)

AZURE_CONNECTION_STRING = os.getenv("AZURE_CONNECTION_STRING")
CONTAINER_NAME = os.getenv("CONTAINER_NAME")
# Either variable name is accepted for the Gemini key.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_GEMINI_API_KEY")

# Create the container client in this cell; the document loop below uses it
# and would otherwise depend on a variable left over from another cell.
blob_service_client = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
container_client = blob_service_client.get_container_client(CONTAINER_NAME)

# ==========================
# Text cleaning
# ==========================
def clean_text(text):
    """Normalize extracted document text into one whitespace-collapsed string.

    Control characters (C0 range, DEL and C1 range) that are not whitespace
    are removed. Every run of whitespace, including line breaks and tabs, is
    then collapsed to a single space and the result is stripped.

    Whitespace control characters are excluded from the removal step on
    purpose: deleting a line break outright would join the words on either
    side of it instead of leaving a separator between them.

    Args:
        text: Raw text extracted from a document.

    Returns:
        str: The cleaned text.
    """
    # (?!\s) keeps newline/tab/etc. out of the deletion so they survive until
    # the whitespace-collapsing step turns them into a single space.
    text = re.sub(r'(?!\s)[\x00-\x1f\x7f-\x9f]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# ==========================
# Load documents (PDF + DOCX + TXT)
# ==========================
# Download every supported blob from the container and collect its cleaned text.
documents_text = []

for blob in container_client.list_blobs():
    # Derive the extension once so the filter and the dispatch below always
    # agree (a blob named exactly ".pdf" has no extension and is skipped).
    ext = os.path.splitext(blob.name)[1].lower()
    if ext not in (".pdf", ".docx", ".txt"):
        continue

    # The loaders read from a file path, so the blob is staged in a temporary
    # file. delete=False because the file is reopened by name after closing.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
    try:
        with tmp_file:
            tmp_file.write(container_client.get_blob_client(blob.name).download_blob().readall())

        if ext == ".pdf":
            text = "\n".join([p.page_content for p in PyPDFLoader(tmp_file.name).load()])

        elif ext == ".docx":
            doc = DocxDocument(tmp_file.name)
            text = "\n".join([p.text for p in doc.paragraphs])

        elif ext == ".txt":
            with open(tmp_file.name, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
    finally:
        # Remove the staged copy even when the download or parsing fails.
        os.remove(tmp_file.name)

    documents_text.append(clean_text(text))

# ==========================
# Chunking
# ==========================
# Split each cleaned document with the recursive splitter and gather all
# resulting chunks into one flat list.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
all_chunks = [
    chunk
    for txt in documents_text
    for chunk in splitter.split_text(txt)
]

# ==========================
# Embeddings + FAISS
# ==========================
# Gemini embedding client, used for the chunks here and for queries later.
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    api_key=GEMINI_API_KEY
)

# Stop with a clear message when nothing was loaded; otherwise
# embeddings_np.shape[1] below fails with an opaque IndexError.
if not all_chunks:
    raise ValueError("No text chunks to index: check the container contents and file types.")

# Embed every chunk and stack the vectors into a float32 array.
documents = [Document(page_content=c) for c in all_chunks]
embeddings = embedding_model.embed_documents([d.page_content for d in documents])
embeddings_np = np.array(embeddings, dtype="float32")

# Flat L2 index sized to the embedding dimension.
faiss_index = faiss.IndexFlatL2(embeddings_np.shape[1])
faiss_index.add(embeddings_np)

# Persist the index and the chunk texts so they can be reloaded together.
faiss.write_index(faiss_index, "faiss_index.idx")
with open("chunks.pkl", "wb") as f:
    pickle.dump(all_chunks, f)
# ==========================
# Helpers
# ==========================
def load_faiss():
    """Load the persisted FAISS index and its chunk list from disk.

    Returns:
        tuple: ``(index, chunks)`` read from ``faiss_index.idx`` and
        ``chunks.pkl`` in the current working directory.
    """
    index = faiss.read_index("faiss_index.idx")
    # A context manager closes the file handle deterministically instead of
    # leaving it open until garbage collection.
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook itself wrote.
    with open("chunks.pkl", "rb") as f:
        chunks = pickle.load(f)
    return index, chunks

def embed_query(q):
    """Embed the query text ``q`` and return the vector as a float32 array."""
    vector = embedding_model.embed_query(q)
    return np.array(vector, dtype=np.float32)

def retrieve(q_emb, index, chunks, k=1):
    """Return the chunks closest to a query embedding.

    Args:
        q_emb: Query embedding array; reshaped to a single-row matrix.
        index: Object whose ``search(matrix, k)`` returns ``(distances, ids)``,
            such as a FAISS index.
        chunks: Sequence of chunk texts addressed by the returned ids.
        k: Number of neighbours to request.

    Returns:
        list: Chunks for the valid ids, in the order the index returned them.
    """
    _, ids = index.search(q_emb.reshape(1, -1), k)
    # FAISS fills unused result slots with -1 when fewer than k vectors exist.
    # chunks[-1] would silently return the LAST chunk, so those slots are skipped.
    return [chunks[i] for i in ids[0] if i >= 0]

def build_prompt(context, question):
    """Assemble the prompt text from the retrieved context and the question.

    The prompt starts with the instruction line, then the context section,
    then the question, and ends with an open "Answer:" line.
    """
    return (
        "\nUse ONLY the context to answer.\n"
        "\nContext:\n"
        f"{context}\n"
        "\n"
        f"Question: {question}\n"
        "Answer:\n"
    )

def safe_post(url, headers, json_data):
    """POST a JSON body through a session configured with retries and a timeout.

    Args:
        url: Target URL.
        headers: HTTP headers to send with the request.
        json_data: JSON-serializable request body.

    Returns:
        requests.Response: The response to the POST request.
    """
    retry = Retry(total=3, backoff_factor=0.5)
    # The session is used as a context manager so its connection pool is
    # released after the call instead of leaking one pool per request.
    with requests.Session() as session:
        session.mount("https://", HTTPAdapter(max_retries=retry))
        # The request is not streamed, so the body is already read when post()
        # returns and the response stays usable after the session closes.
        return session.post(url, headers=headers, json=json_data, timeout=30)

def generate_answer(context, question):
    """Ask the Gemini generateContent endpoint to answer from the given context.

    Args:
        context: Retrieved text the answer must be based on.
        question: The user's question.

    Returns:
        str: Text of the first part of the first candidate in the response.

    Raises:
        requests.HTTPError: If the API responds with an error status code.
    """
    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
    data = {
        "contents": [{"parts": [{"text": build_prompt(context, question)}]}]
    }
    # The key travels in a header rather than the query string so that it does
    # not appear in URLs shown by logs, tracebacks, or proxies.
    headers = {"Content-Type": "application/json", "x-goog-api-key": GEMINI_API_KEY}
    res = safe_post(url, headers, data)
    # Surface HTTP failures directly instead of an unrelated KeyError below.
    res.raise_for_status()
    return res.json()["candidates"][0]["content"]["parts"][0]["text"]

# ==========================
# QUESTIONS & ANSWERS
# ==========================
# Reload the persisted index and chunk list, then answer every sample
# question from its single best-matching chunk.
faiss_index, chunks = load_faiss()

questions = [
    "What is React JS?",
    "What is supervised learning?",
    "What is Python?",
]
for q in questions:
    top_chunk = retrieve(embed_query(q), faiss_index, chunks, k=1)
    ans = generate_answer("\n\n".join(top_chunk), q)
    print(f"\nQ: {q}", f"A: {ans}", sep="\n")


Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.



Q: What is React JS?
A: React JS is an open-source JavaScript library developed by Facebook for building fast and interactive user interfaces, especially single-page applications.

Q: What is supervised learning?
A: Supervised learning is a type of machine learning where the model learns from labelled data. Each training example consists of input pairs and expected outputs (labels). The algorithms aim to map inputs to outputs.

Q: What is Python?
A: Python is a high-level, interpreted, object-oriented programming language known for its simplicity and readability.


In [16]:
# ==========================
# IMPORTS
# ==========================
import os
import re
import tempfile
import pickle
import numpy as np
import faiss
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from pathlib import Path
from dotenv import load_dotenv

from azure.storage.blob import BlobServiceClient
from langchain_community.document_loaders import PyPDFLoader
from docx import Document as DocxDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.docstore.document import Document

# ==========================
# LOAD ENV
# ==========================
# Force load .env in the notebook folder
env_path = Path.cwd().joinpath(".env")
load_dotenv(env_path)

# Settings come from the environment populated by the .env file above.
AZURE_CONNECTION_STRING = os.environ.get("AZURE_CONNECTION_STRING")
CONTAINER_NAME = os.environ.get("CONTAINER_NAME")
# GEMINI_API_KEY wins; GOOGLE_GEMINI_API_KEY is the fallback when it is unset or empty.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_GEMINI_API_KEY")


# ==========================
# CONNECT TO AZURE BLOB
# ==========================
# Build the service client from the connection string, then the client for
# the configured container.
blob_service_client = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
container_client = blob_service_client.get_container_client(CONTAINER_NAME)
print(f"Connected to container: {CONTAINER_NAME}")

# ==========================
# TEXT CLEANING FUNCTION
# ==========================
def clean_text(text):
    """Normalize extracted document text into one whitespace-collapsed string.

    Control characters (C0 range, DEL and C1 range) that are not whitespace
    are removed. Every run of whitespace, including line breaks and tabs, is
    then collapsed to a single space and the result is stripped.

    Whitespace control characters are excluded from the removal step on
    purpose: deleting a line break outright would join the words on either
    side of it instead of leaving a separator between them.

    Args:
        text: Raw text extracted from a document.

    Returns:
        str: The cleaned text.
    """
    # (?!\s) keeps newline/tab/etc. out of the deletion so they survive until
    # the whitespace-collapsing step turns them into a single space.
    text = re.sub(r'(?!\s)[\x00-\x1f\x7f-\x9f]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# ==========================
# LOAD DOCUMENTS
# ==========================
# Download every supported blob from the container and collect its cleaned text.
documents_text = []

for blob in container_client.list_blobs():
    # Derive the extension once so the filter and the dispatch below always
    # agree (a blob named exactly ".pdf" has no extension and is skipped).
    ext = os.path.splitext(blob.name)[1].lower()
    if ext not in (".pdf", ".docx", ".txt"):
        continue

    # The loaders read from a file path, so the blob is staged in a temporary
    # file. delete=False because the file is reopened by name after closing.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
    try:
        with tmp_file:
            tmp_file.write(container_client.get_blob_client(blob.name).download_blob().readall())

        if ext == ".pdf":
            text = "\n".join([p.page_content for p in PyPDFLoader(tmp_file.name).load()])
        elif ext == ".docx":
            doc = DocxDocument(tmp_file.name)
            text = "\n".join([p.text for p in doc.paragraphs])
        elif ext == ".txt":
            with open(tmp_file.name, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
    finally:
        # Remove the staged copy even when the download or parsing fails.
        os.remove(tmp_file.name)

    documents_text.append(clean_text(text))

# ==========================
# CHUNKING
# ==========================
# Split each cleaned document with the recursive splitter and gather all
# resulting chunks into one flat list.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
all_chunks = [
    chunk
    for txt in documents_text
    for chunk in splitter.split_text(txt)
]

# ==========================
# EMBEDDINGS + FAISS
# ==========================
# Gemini embedding client, used for the chunks here and for queries later.
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    api_key=GEMINI_API_KEY
)

# Stop with a clear message when nothing was loaded; otherwise
# embeddings_np.shape[1] below fails with an opaque IndexError.
if not all_chunks:
    raise ValueError("No text chunks to index: check the container contents and file types.")

# Embed every chunk and stack the vectors into a float32 array.
documents = [Document(page_content=c) for c in all_chunks]
embeddings = embedding_model.embed_documents([d.page_content for d in documents])
embeddings_np = np.array(embeddings, dtype="float32")

# Flat L2 index sized to the embedding dimension.
faiss_index = faiss.IndexFlatL2(embeddings_np.shape[1])
faiss_index.add(embeddings_np)

# Persist the index and the chunk texts so they can be reloaded together.
faiss.write_index(faiss_index, "faiss_index.idx")
with open("chunks.pkl", "wb") as f:
    pickle.dump(all_chunks, f)

# ==========================
# HELPER FUNCTIONS
# ==========================
def load_faiss():
    """Load the persisted FAISS index and its chunk list from disk.

    Returns:
        tuple: ``(index, chunks)`` read from ``faiss_index.idx`` and
        ``chunks.pkl`` in the current working directory.
    """
    index = faiss.read_index("faiss_index.idx")
    # A context manager closes the file handle deterministically instead of
    # leaving it open until garbage collection.
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook itself wrote.
    with open("chunks.pkl", "rb") as f:
        chunks = pickle.load(f)
    return index, chunks

def embed_query(q):
    """Embed the query text ``q`` and return the vector as a float32 array."""
    vector = embedding_model.embed_query(q)
    return np.array(vector, dtype=np.float32)

def retrieve(q_emb, index, chunks, k=1):
    """Return the chunks closest to a query embedding.

    Args:
        q_emb: Query embedding array; reshaped to a single-row matrix.
        index: Object whose ``search(matrix, k)`` returns ``(distances, ids)``,
            such as a FAISS index.
        chunks: Sequence of chunk texts addressed by the returned ids.
        k: Number of neighbours to request.

    Returns:
        list: Chunks for the valid ids, in the order the index returned them.
    """
    _, ids = index.search(q_emb.reshape(1, -1), k)
    # FAISS fills unused result slots with -1 when fewer than k vectors exist.
    # chunks[-1] would silently return the LAST chunk, so those slots are skipped.
    return [chunks[i] for i in ids[0] if i >= 0]

def build_prompt(context, question):
    """Assemble the prompt text from the retrieved context and the question.

    The three sections (instruction, context, question with an open
    "Answer:" line) are separated by blank lines.
    """
    sections = (
        "Use ONLY the context to answer.",
        f"Context:\n{context}",
        f"Question: {question}\nAnswer:",
    )
    return "\n\n".join(sections)

def safe_post(url, headers, json_data):
    """POST a JSON body through a session configured with retries and a timeout.

    Args:
        url: Target URL.
        headers: HTTP headers to send with the request.
        json_data: JSON-serializable request body.

    Returns:
        requests.Response: The response to the POST request.
    """
    retry = Retry(total=3, backoff_factor=0.5)
    # The session is used as a context manager so its connection pool is
    # released after the call instead of leaking one pool per request.
    with requests.Session() as session:
        session.mount("https://", HTTPAdapter(max_retries=retry))
        # The request is not streamed, so the body is already read when post()
        # returns and the response stays usable after the session closes.
        return session.post(url, headers=headers, json=json_data, timeout=30)

def generate_answer(context, question):
    """Ask the Gemini generateContent endpoint to answer from the given context.

    Args:
        context: Retrieved text the answer must be based on.
        question: The user's question.

    Returns:
        str: Text of the first part of the first candidate in the response.

    Raises:
        requests.HTTPError: If the API responds with an error status code.
    """
    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
    data = {"contents": [{"parts": [{"text": build_prompt(context, question)}]}]}
    # The key travels in a header rather than the query string so that it does
    # not appear in URLs shown by logs, tracebacks, or proxies.
    headers = {"Content-Type": "application/json", "x-goog-api-key": GEMINI_API_KEY}
    res = safe_post(url, headers, data)
    # Surface HTTP failures directly instead of an unrelated KeyError below.
    res.raise_for_status()
    return res.json()["candidates"][0]["content"]["parts"][0]["text"]

# ==========================
# EXAMPLE QUESTIONS
# ==========================
# Reload the persisted index and chunk list, then answer every sample
# question from its single best-matching chunk.
faiss_index, chunks = load_faiss()

questions = [
    "What is React JS?",
    "What is supervised learning?",
    "What is Python?",
]

for q in questions:
    top_chunk = retrieve(embed_query(q), faiss_index, chunks, k=1)
    ans = generate_answer("\n\n".join(top_chunk), q)
    print(f"\nQ: {q}", f"A: {ans}", sep="\n")


Connected to container: kalyanpdf


Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.



Q: What is React JS?
A: React JS is an open-source JavaScript library developed by Facebook for building fast and interactive user interfaces, especially single-page applications.

Q: What is supervised learning?
A: Supervised learning is a type of machine learning where the model learns from labelled data. Each training example consists of input pairs and expected outputs (labels). The algorithms aim to map inputs to outputs. Example: Classification (e.g., spam detection), Regression (e.g., house price prediction).

Q: What is Python?
A: Python is a high-level, interpreted, object-oriented programming language known for its simplicity and readability.


In [10]:
from dotenv import load_dotenv
from pathlib import Path
import os

load_dotenv(Path.cwd() / ".env")

# Report only whether each secret is set. Printing the value itself would
# store the credential in the saved notebook output.
print("GEMINI_API_KEY:", os.getenv("GEMINI_API_KEY") is not None)
print("AZURE_CONNECTION_STRING:", os.getenv("AZURE_CONNECTION_STRING") is not None)
print("CONTAINER_NAME:", os.getenv("CONTAINER_NAME"))


GEMINI_API_KEY: [REDACTED - this key was exposed in saved notebook output and should be revoked and rotated]
AZURE_CONNECTION_STRING: True
CONTAINER_NAME: kalyanpdf
