In [1]:
# # #https://github.com/Future-House/paper-qa.git
# from google.genai import types
# from google import genai

# # # Only run this block for Vertex AI API
# client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))  # never paste a literal key here; revoke any key that was committed

# !pip install py-spy
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment.
load_dotenv()

# Groq API key, later passed to RAGPipeline. os.getenv returns None when the
# variable is missing, so an absent key is not detected at this point.
API_KEY_GROQ = os.getenv("API_KEY_GROQ")


In [2]:
# !pip install -qU langchain-community arxiv pymupdf langchain pypdf
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !pip install -U transformers
# !pip install -qU langchain_google_genai
# !pip install -qU langchain-groq

In [3]:
import requests 
from langchain_community.document_loaders import PyPDFLoader


import tempfile
import requests

def get_text_from_pdf(file_path=None, url=None, timeout=30):
    """Load a PDF from a local path or a URL and return it as split documents.

    Args:
        file_path: Path to a PDF on disk. Ignored when ``url`` is given.
        url: HTTP(S) address of a PDF. It is downloaded to a temporary file
            that is removed again once parsing has finished.
        timeout: Seconds to wait for the download before giving up.

    Returns:
        The list produced by ``PyPDFLoader(...).load_and_split()``.

    Raises:
        ValueError: If neither ``file_path`` nor ``url`` is provided.
        requests.HTTPError: If the download answers with an error status.
    """
    import os  # local import so this cell does not depend on another cell's imports

    if not url and not file_path:
        raise ValueError("Provide either file_path or url.")

    if not url:
        return PyPDFLoader(file_path).load_and_split()

    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of handing an HTML error page to the PDF parser.
    response.raise_for_status()

    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        # Close the handle before parsing so the loader can reopen the file.
        with temp_pdf:
            temp_pdf.write(response.content)
        return PyPDFLoader(temp_pdf.name).load_and_split()
    finally:
        # delete=False means nobody else cleans this up.
        os.unlink(temp_pdf.name)


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [4]:
# Download and parse the paper. Note that `docs` is rebound to the chunked
# documents in a later cell, so re-running cells out of order changes its meaning.
docs = get_text_from_pdf(url="https://arxiv.org/pdf/2510.18234")

## Chunking techniques

### 1. Sliding Window Chunking

In [5]:
# Keep only the raw text of each loaded document.
pages = [ doc.page_content for doc in docs ]


In [6]:
import re
def clean_text(text:str) :
    """Strip the table of contents and the reference list from ``text``.

    Everything between "Contents" and "1. Introduction" is collapsed to
    "Contents" plus a newline, and everything from the first
    "References"/"REFERENCES" to the end of the string is removed.
    """
    toc_pattern = re.compile(r"(Contents).*?1\. Introduction", flags=re.DOTALL)
    references_pattern = re.compile(r"(References|REFERENCES).*?$", flags=re.DOTALL)

    without_toc = toc_pattern.sub(r"\1\n", text)
    return references_pattern.sub("", without_toc)
        

In [7]:
# cleaned_text = clean_text(' '.join(pages))
# display(Markdown(cleaned_text))

In [8]:
from collections import Counter
import math
MAX_TEXT_ENTROPY = 8.0

import spacy
nlp = spacy.load("en_core_web_sm")

def split_sentences(text: str) -> list[str]:
    """Segment ``text`` into sentences using the module-level spaCy pipeline ``nlp``.

    Each sentence is returned with its surrounding whitespace stripped.
    """
    stripped_sentences = []
    for span in nlp(text).sents:
        stripped_sentences.append(span.text.strip())
    return stripped_sentences

def maybe_is_text(s: str, thresh: float = 2.5) -> bool:
    """
    Decide whether ``s`` looks like natural text, based on its character entropy.

    Space characters are removed before counting, because PDF parsing can
    represent horizontal gaps (title pages, tables) as runs of spaces.
    Returns True only when the entropy lies strictly between ``thresh`` and
    ``MAX_TEXT_ENTROPY``; empty or space-only input returns False.
    """
    compact = s.replace(" ", "") if s else ""
    total = len(compact)
    if total == 0:
        return False

    # Shannon entropy (in bits) of the character distribution.
    entropy = 0.0
    for occurrences in Counter(compact).values():
        p = occurrences / total
        entropy += -p * math.log2(p)

    return thresh < entropy < MAX_TEXT_ENTROPY


# Join all pages, strip the table of contents and references, split into sentences,
# then drop sentences whose character entropy falls outside the accepted range.
sentences = split_sentences(clean_text(' '.join(pages)))
text_sentences = [s for s in sentences if maybe_is_text(s)]

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore

from langchain_google_genai import GoogleGenerativeAIEmbeddings

# The key is read from the environment (populated from .env by load_dotenv in the
# first cell). A literal key must never live in the notebook; the one that used
# to be here should be revoked.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your .env file or environment."
    )

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001",
    google_api_key=GOOGLE_API_KEY,
)



splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=150,
)

# Join with a space: the sentences were stripped when they were split, so
# concatenating them directly would fuse the last word of one sentence with the
# first word of the next and corrupt the text that gets embedded.
text = ' '.join(text_sentences)
docs = splitter.create_documents([text])
print(len(docs))
class MemoryStore():
    """Thin wrapper exposing add/search on an ``InMemoryVectorStore``."""

    def __init__(self, embeddings):
        """Create the underlying store with the given embedding model."""
        self.store = InMemoryVectorStore(embedding=embeddings)

    def add_documents(self, documents):
        """Add ``documents`` to the store and return the store's result unchanged."""
        added = self.store.add_documents(documents)
        return added

    def similarity_search(self, query, k=4):
        """Forward ``query`` and ``k`` to the store's similarity search."""
        matches = self.store.similarity_search(query, k)
        return matches
    

# Shared vector store instance; documents are added in the next cell and it is
# later handed to RAGPipeline as its vectorstore.
memory_store = MemoryStore(embeddings)

24


In [10]:
# Index the chunks; the length of the returned list is reported as the number
# of documents added.
ids = memory_store.add_documents(docs)
print(f"Added {len(ids)} documents to memory store")


Added 24 documents to memory store


In [11]:
def print_state(state, node_name):
    """Print every message in ``state["messages"]`` under a ``node_name`` banner.

    For each message the index, class name and content are shown; a non-empty
    ``tool_calls`` attribute is printed on an extra line.
    """
    print(f"\n=== {node_name} ===")
    position = 0
    for message in state["messages"]:
        kind = message.__class__.__name__
        print(f"[{position}] {kind}: {message.content}")
        calls = getattr(message, "tool_calls", None)
        if calls:
            print("   tool_calls:", calls)
        position += 1
    print("==============\n")


In [12]:
import asyncio
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.tools import tool
from langchain_groq import ChatGroq

class LLMProvider:
    """Owns the chat model and hands out a plain or a tool-bound view of it.

    Args:
        api_key: Groq API key passed to ``ChatGroq``.
        model: Groq model identifier.
        temperature: Sampling temperature.
    """

    def __init__(self, api_key: str, model: str = "qwen/qwen3-32b", temperature: float = 0):
        # Previously used alternative, kept for reference:
        # self.llm = ChatGoogleGenerativeAI(
        #     model="gemini-2.5-flash",
        #     temperature=0,
        #     api_key=api_key,
        # )
        self.llm = ChatGroq(
            model=model,
            temperature=temperature,
            # NOTE(review): presumably keeps the model's reasoning text out of
            # the response content - confirm against the Groq documentation.
            reasoning_format="hidden",
            api_key=api_key,
        )

    def base(self):
        """Return the chat model without any tools bound."""
        return self.llm

    def with_tools(self, tools, tool_choice="retrieve"):
        """Return the chat model with ``tools`` bound.

        Args:
            tools: Tools to expose to the model.
            tool_choice: Forwarded to ``bind_tools``. The default names the
                ``retrieve`` tool, which is what this method always passed
                before the parameter existed.
        """
        return self.llm.bind_tools(tools, tool_choice=tool_choice)
    

class RetrievalTool:
    """Retrieval helpers and LangChain tools built on top of a vector store.

    Attributes:
        vectorstore: Object exposing ``similarity_search(query, k)``.
        llm_provider: Provider whose ``base()`` model is used for query expansion.
        tool: Multi-query retrieval tool (named ``retrieve``).
        step_back_tool: Tool retrieving for the original and the step-back question.
    """

    def __init__(self, vectorstore, llm_provider: "LLMProvider"):
        self.vectorstore = vectorstore
        self.llm_provider = llm_provider
        # Assigned before the tools are built so the closures created below
        # never observe an instance without its prompt.
        self.MULTI_QUERY_PROMPT =  """You are an AI language model assistant. Your task is to generate three 
                    different versions of the given user question to retrieve relevant documents from a vector 
                    database. By generating multiple perspectives on the user question, your goal is to help
                    the user overcome some of the limitations of the distance-based similarity search. 
                    Provide these alternative questions separated by newlines. Original question: {question}"""
        self.tool = self._build_tool()
        self.step_back_tool = self._step_back_build_tool()

    @staticmethod
    def _page_text(doc):
        """Return the text of ``doc``, unwrapping ``(document, score)`` tuples."""
        if isinstance(doc, tuple):
            doc = doc[0]
        return doc.page_content

    def get_unique_union(self, results):
        """Return the distinct page texts across all result lists, in first-seen order.

        Args:
            results: Iterable of result lists; items are documents or
                ``(document, score)`` tuples.
        """
        seen = set()
        unique_docs = []
        for docs in results:
            for doc in docs:
                text = self._page_text(doc)
                if text not in seen:
                    seen.add(text)
                    # Append only on first sight; appending unconditionally
                    # would return duplicates.
                    unique_docs.append(text)
        return unique_docs

    def reciprocal_rank_fusion(self, results: list[list], k=60):
        """Fuse several ranked result lists with reciprocal rank fusion.

        Each occurrence of a document at zero-based position ``rank`` adds
        ``1 / (rank + k)`` to that document's score.

        Args:
            results: Ranked lists of documents (or ``(document, score)`` tuples).
            k: Smoothing constant of the RRF formula.

        Returns:
            List of ``(page_text, fused_score)`` tuples, best score first.
        """
        fused_scores = {}
        for docs in results:
            print(f"len of docs : {len(docs)}")
            for rank, doc in enumerate(docs):
                doc_str = self._page_text(doc)
                fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

        reranked_results = sorted(
            fused_scores.items(), key=lambda item: item[1], reverse=True
        )
        for doc, score in reranked_results:
            print ( "==================")
            print( f" Score: {score}  === DOC: {doc}")
            print ( "==================")
        return reranked_results

    async def retrieve(self, query: str):
        """Return the two best matches for ``query`` joined by newlines."""
        # similarity_search is a blocking call, so run it in a worker thread.
        docs = await asyncio.to_thread(
            self.vectorstore.similarity_search, query, 2
        )
        return "\n".join(d.page_content for d in docs)

    def _step_back_build_tool(self):
        """Build the tool that retrieves for the step-back and the original question."""

        @tool(description="Retrieve documents using original and step-back query.")
        async def retrieve(state: dict):
            original_q = state["messages"][0].content
            # Fall back to the original question when no step-back question is stored.
            step_back_q = state.get("step_back_question") or original_q

            step_back_docs = await asyncio.to_thread(
                self.vectorstore.similarity_search, step_back_q, k=3
            )
            original_q_docs = await asyncio.to_thread(
                self.vectorstore.similarity_search, original_q, k=3
            )

            serialized1 = "\n".join(d.page_content for d in step_back_docs)
            serialized2 = "\n".join(d.page_content for d in original_q_docs)
            return serialized1+"\n"+serialized2

        return retrieve

    def _build_tool(self):
        """Build the multi-query retrieval tool (expansion, retrieval, rank fusion)."""

        @tool(description="Retrieve documents using multi-query expansion.")
        async def retrieve(query: str):
            # 1) Ask the LLM for alternative phrasings of the question.
            expansion_response = await self.llm_provider.base().ainvoke(
                self.MULTI_QUERY_PROMPT.format(question=query)
            )
            print("expansion_response: ",expansion_response)
            queries = [
                q.strip() for q in expansion_response.content.split("\n")
                if q.strip()
            ]
            queries.append(query)

            # 2) Retrieve for every phrasing. The search call is blocking, so
            #    it runs in a worker thread to keep the event loop free.
            all_docs = []
            for q in queries:
                docs = await asyncio.to_thread(
                    self.vectorstore.similarity_search, q, 2
                )
                print(f"Retrieved {len(docs)} documents for query: {q}")
                all_docs.append(docs)

            # 3) Fuse the rankings and keep the three best chunks.
            fused_docs = self.reciprocal_rank_fusion(all_docs)
            serialized = "\n".join([doc[0] for doc in fused_docs[:3]])
            print("serialized: ",serialized)
            return serialized

        return retrieve

    

In [13]:
from typing import TypedDict, Annotated, List
from langgraph.graph.message import  add_messages
from langchain_core.messages import BaseMessage
from langgraph.graph import StateGraph, START, END
from langchain_core.messages import (
    SystemMessage, AIMessage, ToolMessage, HumanMessage
)
from langgraph.prebuilt import ToolNode


class GraphState(TypedDict):
    """State dictionary passed between the graph's nodes."""

    # Conversation history; add_messages is the reducer attached to this key.
    messages: Annotated[List[BaseMessage], add_messages]

    # Decomposition: the generated sub-questions and the index of the one
    # currently being processed.
    sub_questions: List[str]
    current_subq_index: int

    # Memory across sub-questions: formatted question/answer strings.
    qa_pairs: List[str]

    # Serialized text of the most recent retrieval step.
    retrieved_context:str

    # More generic "step-back" rephrasing of the user question.
    step_back_question:str

In [14]:
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate

# Prompt used by RAGNodes.route; it is filled with str.format(question=...) and the
# model is asked to answer with the single word RETRIEVE or DIRECT.
ROUTER_PROMPT = """
You are a router for a question-answering system.

Decide whether the user's question requires looking up
information from documents or can be answered directly.

Answer with ONLY one word:
RETRIEVE or DIRECT

Question:


{question}
"""
class RAGNodes:
    """Node and routing callables for the RAG graph.

    Every node receives the graph state and returns a dict containing only the
    keys it wants to update; mutating ``state`` in place is not relied upon.
    """

    def __init__(self, llm_provider : LLMProvider, retrieve_tool:RetrievalTool):
        self.llm_provider = llm_provider
        self.retrieve_tool = retrieve_tool

    @staticmethod
    def _current_question(state):
        """Return the text of the most recent human message.

        With a checkpointer the message list accumulates across runs on the
        same thread, so the first message is not necessarily the question being
        asked now. Falls back to the first message if no HumanMessage exists.
        """
        for message in reversed(state["messages"]):
            if isinstance(message, HumanMessage):
                return message.content
        return state["messages"][0].content

    async def query_or_respond(self, state: GraphState):
        """Invoke the tool-bound model on the conversation and append its reply."""
        llm = self.llm_provider.with_tools([self.retrieve_tool.tool])

        messages = list(state["messages"])
        if not any(isinstance(m, SystemMessage) for m in messages):
            messages.insert(0, SystemMessage(
                "Use the retrieve tool if external information is required."
            ))
        print_state(state, "query_or_respond")
        print( "state in query_or_respond: ",state , messages)
        response =  await llm.ainvoke(messages)
        print("Response llm: ", response)
        return {"messages": [response]}

    async def route(self, state: GraphState):
        """Ask the model whether retrieval is needed and return the next node name.

        Returns ``"generate"`` only for an explicit DIRECT verdict; any other
        answer routes to ``"stepBackq"`` as the safe fallback.
        """
        question = self._current_question(state)
        print(state['messages'], question)
        response = await self.llm_provider.base().ainvoke(
            ROUTER_PROMPT.format(question=question)
        )

        print("route decision: ",response.content)
        decision = response.content.strip().upper()
        if decision == "DIRECT":
            return "generate"
        return "stepBackq"

    async def decompose(self, state: GraphState):
        """Split the question into sub-questions and reset the decomposition state."""
        question = self._current_question(state)
        template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
            The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
            Generate multiple search queries related to: {question} \n
            Output (3 queries):"""

        response =  await self.llm_provider.base().ainvoke(
            template.format(question= question)
        )
        # One sub-question per non-empty line of the model's answer.
        subqs = [q.strip() for q in response.content.split("\n") if q.strip()]

        return  {
            "sub_questions": subqs,
            "current_subq_index": 0,
            "qa_pairs": []
        }

    async def step_back_retrieve(self, state: GraphState):
        """Retrieve context for the original question and for its step-back version."""
        original_q = self._current_question(state)
        step_back_q = state.get("step_back_question") or original_q

        docs1 = await self.retrieve_tool.retrieve(original_q)
        print(f"[step_back_retrieve] Original: {docs1}")

        if step_back_q == original_q:
            # Same query twice would only duplicate the context.
            context = docs1
        else:
            docs2 = await self.retrieve_tool.retrieve(step_back_q)
            print(f"[step_back_retrieve] Step-back: {docs2}")
            context =  docs1 +"\n"+ docs2

        return {
            "retrieved_context": context
        }

    async def step_back__q_generate(self, state: GraphState):
        """
        Generates a step-back version of the question using few-shot examples.
        Returns it as a state update so that later nodes can read it.
        """
        # Step-back few-shot examples
        examples = [
            {
                "input": "Could the members of The Police perform lawful arrests?",
                "output": "what can the members of The Police do?",
            },
            {
                "input": "Jan Sindel’s was born in what country?",
                "output": "what is Jan Sindel’s personal history?",
            },
        ]
        example_prompt = ChatPromptTemplate.from_messages(
            [
                ("human", "{input}"),
                ("ai", "{output}"),
            ]
        )
        few_shot_prompt = FewShotChatMessagePromptTemplate(
            example_prompt=example_prompt,
            examples=examples,
        )

        step_back_prompt = ChatPromptTemplate.from_messages(
            [
                ("system",
                 "You are an expert at world knowledge. Your task is to step back and paraphrase a question "
                 "to a more generic step-back question, which is easier to answer. Here are a few examples:"),
                few_shot_prompt,
                ("user", "{question}"),
            ]
        )

        question = self._current_question(state)

        llm = self.llm_provider.base()
        response = await llm.ainvoke(step_back_prompt.format_messages(question=question))

        step_back_question = response.content.strip()

        print(f"[Step-back Node] Original: {question}")
        print(f"[Step-back Node] Step-back: {step_back_question}")

        # Return the update instead of assigning into `state`: an in-place
        # assignment is not part of the node's returned update and the value
        # would not reach step_back_retrieve.
        return {"step_back_question": step_back_question}

    async def step_back__ans_generate(self, state: GraphState):
        """Answer the question using the context stored by the retrieval step."""
        # A missing context becomes an empty string rather than the text "None".
        combined_context = state.get("retrieved_context") or ""
        response_prompt_template = """You are an expert of world knowledge. I am going to ask you a question. Your response should be comprehensive and not contradicted with the following context if they are relevant. Otherwise, ignore them if they are not relevant.

            # {combined_context}
            # Original Question: {question}
            # Answer:"""
        question = self._current_question(state)

        response =  await self.llm_provider.base().ainvoke(
           response_prompt_template.format(combined_context=combined_context,question=question)
        )
        return {"messages": [response]}

    async def generate(self, state: GraphState):
        """Answer from the conversation, using trailing tool messages as context."""
        tool_messages = []
        print("Generating with retrieved context...")
        print( "state in generate: ",state )
        # Collect the run of ToolMessages at the very end of the history.
        for msg in reversed(state["messages"]):
            if isinstance(msg, ToolMessage):
                tool_messages.append(msg)
            else:
                break

        docs_context = "\n".join(m.content for m in reversed(tool_messages))

        system = SystemMessage(f"""
            You are a helpful assistant.
            Use the retrieved context if relevant.
                               
            Context:
            ---------
            {docs_context}
            ---------
            """)

        # Only plain human/AI turns are replayed; tool-call requests are dropped.
        convo = [
            m for m in state["messages"]
            if isinstance(m, (HumanMessage, AIMessage))
            and not getattr(m, "tool_calls", None)
        ]

        response =  await self.llm_provider.base().ainvoke(
            [system, *convo]
        )

        return {"messages": [response]}

    async def retrieve(self, state: GraphState):
        """Retrieve context for the sub-question at ``current_subq_index``."""
        idx = state["current_subq_index"]
        subq = state["sub_questions"][idx]

        print(
          f"idx: {idx} => subq: {subq} ==> qa_pairs: {state['qa_pairs'][:idx]}"
       )

        serialized = await self.retrieve_tool.retrieve(query=subq)

        return {
           "retrieved_context": serialized
         }

    async def generate_decompose(self, state: GraphState):
        """Answer the current sub-question and record the resulting Q/A pair."""
        idx = state["current_subq_index"]
        subq = state["sub_questions"][idx]
        # Each stored pair already ends with a blank line, so plain
        # concatenation gives readable background text.
        q_a_pairs  = "".join(state.get("qa_pairs", [])[:idx])

        context = state.get("retrieved_context", "")

        template = """Here is the question you need to answer:

            \n --- \n {question} \n --- \n

            Here is any available background question + answer pairs:

            \n --- \n {q_a_pairs} \n --- \n

            Here is additional context relevant to the question: 

            \n --- \n {context} \n --- \n

            Use the above context and any background question + answer pairs to answer the question: \n {question}
            """

        answer = await self.llm_provider.base().ainvoke(template.format(question=subq,q_a_pairs=q_a_pairs, context= context ))
        print(f"AI Answer: {answer}")
        # Pair the answer text with the sub-question it actually answers.
        new_pair = f"Question: {subq}\nAnswer: {answer.content}\n\n"

        return {
          "qa_pairs": state["qa_pairs"] + [new_pair],
          "current_subq_index": idx + 1,
          "messages":[answer]
         }
    
    

In [15]:
from langgraph.types import Command


class RAGGraphBuilder:
    """Assembles and compiles the LangGraph state machine from the node callables."""

    def __init__(self, nodes: RAGNodes, retrieve_tool: RetrievalTool, checkpointer):
        self.checkpointer = checkpointer
        self.retrieve_tool = retrieve_tool
        self.nodes = nodes

    @staticmethod
    def tools_condition(state: GraphState):
        """Return ``"tools"`` when the newest message carries tool calls, else ``END``."""
        newest = state["messages"][-1]
        print_state(state, "tools_condition")
        print( "state in tools_condition: ",state )
        return "tools" if getattr(newest, "tool_calls", None) else END

    @staticmethod
    def should_continue(state: GraphState):
        """Return ``"retrieve"`` while sub-questions remain, otherwise ``END``."""
        pending = state["current_subq_index"] < len(state["sub_questions"])
        return "retrieve" if pending else END

    def build(self):
        """Create the graph, wire its nodes and edges, and compile it.

        Active flow: START -> queryOrResponse -> (route) -> either
        stepBackq -> step_back_retrieve -> stepBackgenerate -> END, or
        generate -> END.

        Not wired in at the moment: the ToolNode path ("tools" -> "generate")
        and the decomposition loop (decompose -> retrieve -> generateDecompose,
        repeated via should_continue).
        """
        builder = StateGraph(GraphState)

        node_table = (
            ("queryOrResponse", self.nodes.query_or_respond),
            ("step_back_retrieve", self.nodes.step_back_retrieve),
            ("generate", self.nodes.generate),
            ("stepBackgenerate", self.nodes.step_back__ans_generate),
            ("stepBackq", self.nodes.step_back__q_generate),
        )
        for node_name, node_fn in node_table:
            builder.add_node(node_name, node_fn)

        builder.add_edge(START, "queryOrResponse")
        builder.add_conditional_edges(
            "queryOrResponse",
            self.nodes.route,
            {"stepBackq": "stepBackq", "generate": "generate"},
        )

        edge_table = (
            ("stepBackq", "step_back_retrieve"),
            ("step_back_retrieve", "stepBackgenerate"),
            ("stepBackgenerate", END),
            ("generate", END),
        )
        for source, target in edge_table:
            builder.add_edge(source, target)

        return builder.compile(checkpointer=self.checkpointer)


In [16]:
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.runnables import RunnableConfig
from IPython.display import display, Markdown


# Checkpointer instance passed to RAGPipeline and used when the graph is compiled.
checkpointer = MemorySaver()
class RAGPipeline:
    """Entry point that builds the RAG graph and runs a single query through it."""

    def __init__(self, api_key, checkpointer,vectorstore):
        self.checkpointer = checkpointer
        self.vectorstore = vectorstore
        self.llm_provider = LLMProvider(api_key)

    async def run(self, query, thread_id, user_id, pdf_id):
        """Run ``query`` through a freshly built graph and return the answer text.

        ``thread_id`` selects the checkpointer thread. ``user_id`` and
        ``pdf_id`` are accepted but not used by this method. Returns the
        content of the last AIMessage in the result, or "" if there is none.
        """
        retrieve_tool = RetrievalTool(self.vectorstore, self.llm_provider)
        nodes = RAGNodes(self.llm_provider, retrieve_tool)
        builder = RAGGraphBuilder(nodes, retrieve_tool, self.checkpointer)
        graph = builder.build()

        print("Running RAG graph...", graph)
        config: RunnableConfig = {"configurable": {"thread_id": thread_id}}
        initial_state = {"messages": [HumanMessage(query)]}
        result = await graph.ainvoke(initial_state, config)

        print(result)

        # Walk backwards to find the most recent AI reply.
        for message in reversed(result["messages"]):
            if isinstance(message, AIMessage):
                return message.content
        return ""


In [17]:
# Wire the pipeline to the Groq key from the environment, the shared checkpointer
# and the in-memory vector store populated earlier.
pipeline = RAGPipeline(
    api_key=API_KEY_GROQ,
    checkpointer=checkpointer,
    vectorstore=memory_store
)
# Top-level await works here because the notebook already runs an event loop.
# user_id and pdf_id are accepted by run() but not used by it.
result =  await pipeline.run("describe  table 2?", thread_id="thread1", user_id="user1", pdf_id="pdf1" )
result

Running RAG graph... <langgraph.graph.state.CompiledStateGraph object at 0x71c5cba91d90>

=== query_or_respond ===
[0] HumanMessage: describe  table 2?

state in query_or_respond:  {'messages': [HumanMessage(content='describe  table 2?', additional_kwargs={}, response_metadata={}, id='d03a1429-444b-4c49-a3ff-6f0cdc4ac73d')]} [SystemMessage(content='Use the retrieve tool if external information is required.', additional_kwargs={}, response_metadata={}), HumanMessage(content='describe  table 2?', additional_kwargs={}, response_metadata={}, id='d03a1429-444b-4c49-a3ff-6f0cdc4ac73d')]
Response llm:  content='' additional_kwargs={'tool_calls': [{'id': 'tt6w6t3rv', 'function': {'arguments': '{"query":"describe table 2"}', 'name': 'retrieve'}, 'type': 'function'}]} response_metadata={'token_usage': {'completion_tokens': 316, 'prompt_tokens': 161, 'total_tokens': 477, 'completion_time': 0.676022956, 'completion_tokens_details': {'reasoning_tokens': 291}, 'prompt_time': 0.006896377, 'prompt_tok

'The provided context does not explicitly describe or reference **Table 2**. The text focuses on topics such as OCR technologies (e.g., DeepSeek-OCR, GOT-OCR2.0), deep parsing capabilities (e.g., handling chemical formulas, geometric figures, charts), multilingual recognition, and data sources (e.g., LAION, Wukong). While figures like **Figure 6**, **Figure 8**, **Figure 9**, and **Figure 10** are discussed in detail, there is no mention of **Table 2** in the given content. If Table 2 exists in an associated document or image not included here, additional context would be required to describe it.'

In [18]:
from IPython.display import display, Markdown
# Render the answer as Markdown so the emphasis markers in the model's reply are formatted.
display(Markdown(f"**Answer:** {result}"))

**Answer:** The provided context does not explicitly describe or reference **Table 2**. The text focuses on topics such as OCR technologies (e.g., DeepSeek-OCR, GOT-OCR2.0), deep parsing capabilities (e.g., handling chemical formulas, geometric figures, charts), multilingual recognition, and data sources (e.g., LAION, Wukong). While figures like **Figure 6**, **Figure 8**, **Figure 9**, and **Figure 10** are discussed in detail, there is no mention of **Table 2** in the given content. If Table 2 exists in an associated document or image not included here, additional context would be required to describe it.

In [19]:
# llm = ChatGoogleGenerativeAI(
#             model="gemini-2.5-flash",
#             temperature=0,
#             api_key=api_key1,
#         )

# llm.invoke(ROUTER_PROMPT.format(question="What is the contribution of the paper ?"))

In [20]:
# import getpass
# import os

# if "GOOGLE_API_KEY" not in os.environ:
#     os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

In [21]:
# results = memory_store.similarity_search("ocr", k=3)

# print(len(results))


In [22]:
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("EuroEval/gemma-3-tokenizer")


In [23]:
# import re
# def get_chunks_with_ids(text):
#     cleaned_text = re.sub(r"(Contents).*?1\. Introduction", r"\1\n", text, flags=re.DOTALL)
#     cleaned_text = re.sub(r"(References|REFERENCES).*?$", "", cleaned_text, flags=re.DOTALL)
#     sentences = split_sentences(cleaned_text)
#     text_sentences = [s for s in sentences if maybe_is_text(s)]

#     currentIndex = 0
#     chunks_with_ids = []
#     for i, s in enumerate(text_sentences):
#         start = currentIndex
#         end = start + len(s)

#         chunks_with_ids.append((i, start, end, s))
#         currentIndex = end+1  # +1 for the space/newline between sentences
    

#     return chunks_with_ids
        


# def createChunksPreservingId (chunks_with_ids, tokenizer, maxTokens, buffer):
#     chunks = []
#     currentTokens =0
#     currentChunk = []

#     for chunk in chunks_with_ids:
#         tokenCount = len(tokenizer.encode(chunk[3], add_special_tokens=False))
#         if currentTokens + tokenCount +buffer > maxTokens:
#             if  len(currentChunk) > 0:
#                chunks.append([currentChunk])
#             currentChunk = [chunk]
#             currentTokens = tokenCount

#         else:
#             currentChunk.append(chunk)
#             currentTokens += tokenCount
#     if  len(currentChunk) > 0:
#                chunks.append([currentChunk])
#     return chunks



In [24]:
# sentences_ids = get_chunks_with_ids(cleaned_text)
# chunks = createChunksPreservingId(sentences_ids, tokenizer, maxTokens=500, buffer=10)