In [None]:
import pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import os
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModel, BitsAndBytesConfig
from langchain.llms import HuggingFacePipeline
from bangla_pdf_ocr import process_pdf
from langchain_community.document_loaders import DirectoryLoader
import torch, re
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate

In [None]:
# print(torch.cuda.is_available())  
# print(torch.cuda.get_device_name(0)) 

In [None]:
# Load variables from a local .env file into the process environment.
# Later cells read PINECONE_API_KEY and GOOGLE_API_KEY through os.getenv.
from dotenv import load_dotenv
load_dotenv()

In [None]:
# # Lets Read the document
# def read_doc(directory):
#     file_loader=PyPDFDirectoryLoader(directory)
#     documents=file_loader.load()
#     return documents

# doc=read_doc('data/')
# len(doc)
# type(doc)
# print(doc[1])

In [None]:
# Input PDF and output text-file paths handed to process_pdf.
path = "data/HSC26-Bangla1st-Paper.pdf"
output_file = "data/cleaned_data.txt"
# NOTE(review): presumably runs OCR on the PDF and writes the extracted text
# to output_file — confirm against the bangla_pdf_ocr documentation.
# The loading cell reads every .txt under data/, so this file is expected to
# be among the documents it picks up.
process_pdf(path, output_file)

In [None]:
# Load the files under data/ that match the glob **/*.txt as documents.
# The loader itself is only needed for this one call, so it is not kept around.
doc = DirectoryLoader("data", glob="**/*.txt").load()
type(doc)

In [None]:
def _extract_marked_section(content):
    """Return the text between the 'Page 6' and 'Page 20' markers.

    Returns None when the markers are not both present, so the caller can
    decide how to handle the document instead of hitting an AttributeError
    on `match.group(1)`.
    """
    match = re.search(r'Page 6(.*?)Page 20', content, re.DOTALL)
    return match.group(1) if match else None


def _clean_text(text):
    """Strip ASCII letters, digits, hyphens and backslashes, then collapse whitespace."""
    # In a raw string the class `\\` is a literal backslash (it is NOT a
    # newline escape); newlines are handled by the whitespace step below.
    cleaned = re.sub(r'[a-zA-Z0-9\-\\]', '', text)
    # Collapse every run of whitespace (including newlines) into one space.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()


# Clean each document in place.
for document in doc:
    section = _extract_marked_section(document.page_content)
    if section is None:
        # The markers are absent when this cell is re-run (the first run
        # strips them) or when an unrelated .txt file was loaded. Cleaning
        # the whole text keeps the cell idempotent instead of crashing.
        print(
            "Warning: 'Page 6'/'Page 20' markers not found in "
            f"{document.metadata.get('source', '<unknown>')}; cleaning the whole text."
        )
        section = document.page_content
    document.page_content = _clean_text(section)

print(doc)

In [None]:
## Divide the docs into chunks
def chunk_data(docs, chunk_size=1000, chunk_overlap=100):
    """Split documents into overlapping chunks.

    Args:
        docs: Documents to split; handed to the splitter's `split_documents`.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as `chunk_size`.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as `chunk_overlap`.

    Returns:
        The result of `split_documents(docs)`.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)



In [None]:
# Replace the loaded documents with their chunked form using chunk_data's
# default chunk_size/chunk_overlap; from here on `doc` holds the chunks.
doc=chunk_data(docs=doc)
# Show how many chunks were produced.
len(doc)


In [None]:
# import fitz  

# def extract_bangla_text_from_pdf(pdf_path):
#     doc = fitz.open(pdf_path)
#     text = ""
#     for page in doc:
#         text += page.get_text()
#     return text

# text = extract_bangla_text_from_pdf("data/HSC26-Bangla1st-Paper.pdf")

# def clean_bangla_text(text):
#     import re
#     text = re.sub(r'\s+', ' ', text)
#     text = text.strip()
#     return text

# text = clean_bangla_text(text)
# print(text[:100])

In [None]:
## Embedding model
# Earlier alternatives, kept for reference only (not executed):
# embeddings=OpenAIEmbeddings(model="text-embedding-3-small", api_key=os.environ['OPENAI_API_KEY'])
# embeddings
# model = SentenceTransformer("distiluse-base-multilingual-cased-v1")  # supports Bangla

# embeddings = model.encode([text])

# Active embedding model: a Hugging Face model selected by name.
# NOTE(review): the Pinecone index in this notebook is created with
# dimension=768, so the vectors produced here must have that size —
# TODO confirm for this model.
embeddings = HuggingFaceEmbeddings(
    model_name="l3cube-pune/bengali-sentence-similarity-sbert"
)

In [None]:
# vectors=embeddings.embed_query("How are you?")
# len(vectors)

In [None]:
# ## Vector Search DB In Pinecone
# pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))  # hard-coded key redacted: never commit secrets; rotate the key that was exposed here
# index_name = pc.Index("10mins")
# PineconeVectorStore.from_documents(doc, embeddings, index_name="10mins")

In [None]:
# Connect to Pinecone with the API key taken from the environment.
pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "10mins"

# Create the serverless index only when no index with this name exists yet.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = pinecone.ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        vector_type="dense",
        dimension=768,
        metric="cosine",
        spec=serverless_spec,
    )

index = pc.Index(index_name)

# Upload the chunks only while the index reports zero vectors, so running
# this cell again does not insert the same chunks a second time.
index_stats = index.describe_index_stats()
if index_stats["total_vector_count"] == 0:
    PineconeVectorStore.from_documents(doc, embeddings, index_name=index_name)



In [None]:
# Chat model used by the QA chain; the API key comes from the environment.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro",
    temperature=0,
    google_api_key=os.getenv("GOOGLE_API_KEY"),
)

# Expose the existing Pinecone index to LangChain as a vector store.
vectorstore = PineconeVectorStore(index=index, embedding=embeddings, text_key="text")

# Map step: short answer produced for each individual chunk.
question_template = """
You are a helpful assistant. Answer the following question based on the context below.
Keep your answer very short — a single line only.

Context: {context}
Question: {question}
Answer:
"""
question_prompt = PromptTemplate(
    template=question_template,
    input_variables=["context", "question"],
)

# Reduce step: merge the partial answers into one short final answer.
combine_template = """
Based on the following partial answers, generate a single short one-line answer.

Answers:
{summaries}

Question: {question}
Answer:
"""
combine_prompt = PromptTemplate(
    template=combine_template,
    input_variables=["summaries", "question"],
)

# Retriever configured with search_type="mmr" and k=10.
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 10})

# Retrieval QA chain of type "map_reduce" wired to the two prompts above.
prompt_overrides = {
    "question_prompt": question_prompt,
    "combine_prompt": combine_prompt,
}
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=retriever,
    chain_type_kwargs=prompt_overrides,
)



In [None]:
# ## Cosine Similarity Retreive Results from VectorDB
# def retrieve_query(query, k=1):
#     matching_results=vectorstore.similarity_search(query,k=k)
#     return matching_results

In [None]:
# # Load tokenizer and model (after conversion)
# llm = AutoModel.from_pretrained("asif00/bangla-llama-1B-gguf-16bit", torch_dtype="auto"),

# chain = RetrievalQA.from_chain_type(
#     llm=llm,
#     retriever=index.as_retriever(),  
#     chain_type="map_reduce",

# )

In [None]:
## Search answers from VectorDB
def retrieve_answers(query):
    """Run `query` through the notebook-level retrieval QA `chain`.

    Args:
        query: Question text; passed to the chain under the "query" key.

    Returns:
        Whatever `chain.invoke` returns for that input.
    """
    return chain.invoke({"query": query})

In [None]:
# Query (English gloss): "What was Kalyani's actual age at the time of the wedding?"
our_query = "বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?"
# Print the value returned by retrieve_answers as-is.
answer = retrieve_answers(our_query)
print(answer)

In [None]:
# Query (English gloss): "Who is called a handsome man in Anupam's words?"
our_query = "অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?"
# Print the value returned by retrieve_answers as-is.
answer = retrieve_answers(our_query)
print(answer)

In [None]:
# Query (English gloss): "Who is referred to as Anupam's god of fortune?"
our_query = "কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?"
# Print the value returned by retrieve_answers as-is.
answer = retrieve_answers(our_query)
print(answer)