In [1]:
from langchain_community.document_loaders import PyPDFLoader
from concurrent.futures import ThreadPoolExecutor

In [14]:
pdf_files = [
    "Abbreviations.pdf",
    "EthnicandCrimeInsurgencyNexusinIndia.pdf",
    "GreyzoneWarfare.pdf",
    "IndiaAdhocArsenal.pdf",
    "IndiaCapablebutconstrained.pdf",
    "IndiaChinaBorder.pdf",
    "Indian Space Research Orgtanizations ISRO.pdf",
    "IndianCapability.pdf",
    "IndianGeographyPhysical.pdf",
    "Indus-water-treaty-an-appraisal(8).pdf",
    "ISRO space programme.pdf",
    "MasteringtheGreyzone.pdf",
    "MilitaryProfessionalGeography.pdf",
    "Pakistan India and the Indus River Basin.pdf",
    "PsiographyofIndia.pdf",
    "RussiaWaragainstUkraine.pdf",
    "SinoIndianBorder.pdf",
    "SinoIndianWar1962.pdf",
    "UkraineObservations.pdf",
    "VoilenceinManipur.pdf",
]

In [15]:
def load_pdf(file_path):
    loader = PyPDFLoader(file_path)
    return loader.load()

In [16]:
data = []
with ThreadPoolExecutor() as executor:
     results = executor.map(load_pdf, pdf_files)
     for result in results:
         data.extend(result)

In [17]:
print(f"Total documents loaded: {len(data)}")

Total documents loaded: 2260


In [21]:
data[1]

Document(metadata={'source': 'Abbreviations.pdf', 'page': 1}, page_content='39 Deployment,Deployed Depl \n40 Department  Dept \n41 Development  Dev  \n42 Development, develop Devp  \n43 Different diff \n44 Direction, Directions Dir \n45 Discipline, Disciplines Discp \n46 Distance Dist \n47 Distribution Distr \n48 District Distt \n49 Document Docu \n50 Economic Ec \n51 Effect eff \n52 Electric, Electrical, Electronic Elec \n53 Elements elms \n54 Employ, Employed, Employee  Emp \n55 Environment Env \n56 Environment, Environments,  Environmental Envmt \n57 Equipment Epqt \n58 Estimate, Estimated  est \n59 Exercise  Ex \n60 Executed   Exec \n61 Expressed Expd \n62 Flight Flt \n63 Follow, Followed, Following  Fol \n64 Frequencies   Freq \n65 Geographical Geo \n66 Government, governments Govt \n67 Group gp \n68 Ground gr \n69 Geosynchronous satellite launch vehicle  GSLV \n70 Geosynchronous transfer orbit GTO \n71 Identification Ident \n72 Important  Imp \n73 Important, importance Imp  \n74 

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# split data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
docs = text_splitter.split_documents(data)


print("Total number of documents: ",len(docs))

Total number of documents:  5996


In [24]:
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from dotenv import load_dotenv
load_dotenv() 

#Create Embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector = embeddings.embed_query("Developing a Strategy for the Gray Zone")
vector[:5]

[0.03717122599482536,
 -0.062356337904930115,
 -0.017505498602986336,
 -0.017510710284113884,
 0.03319702669978142]

In [25]:
from tqdm import tqdm

# Process in smaller chunks
chunk_size = 100
all_docs = []

for i in range(0, len(docs), chunk_size):
    chunk = docs[i:i + chunk_size]
    try:
        vectorstore = Chroma.from_documents(
            documents=chunk,
            embedding=embeddings,
            persist_directory="./vector_db_store"
        )
    except Exception as e:
        print(f"Error processing chunk {i}-{i+chunk_size}: {str(e)}")

In [114]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
query="why design of a sat structure becomes more complex"
retrieved_docs = retriever.invoke(query)

In [115]:
len(retrieved_docs)

3

In [116]:
print(retrieved_docs)

[Document(metadata={'page': 8, 'source': 'ISRO space programme.pdf'}, page_content='the dev of extensive data-integrity checks, detailed sub-sys \nmodelling and dev of computer-aided design packages for \ninteractive design. Inherent difficulties in handling large-sized \ndynamic problems, often infeasible by direct methods, have \nnecessitated work on dynamic sub structuring which is also useful \nin providing inputs to testing of the units of a large structure, \nwhich cannot be tested as a whole. Emphasis on mass \nreduction needs optimization routines. Efforts on IRS and INSAT \nII reflect much of this higher lvl sophistication in analysis. \nIn a nutshell, the evolution of structural design, analysis, \nand testing methods have led to the realization of optimal \nstructural hardware for sats like IRS and INSAT II. Further, such \nimproved and sophisticated apch have also resulted in a reduced \nnumber of models to be built for design validation through testing \nin view of the hig

In [52]:
print(retrieved_docs)
print(retrieved_docs[1].metadata['page'])
print(retrieved_docs[1].metadata['source'])

[Document(metadata={'page': 0, 'source': 'Abbreviations.pdf'}, page_content='\\Ser Word Abbreviation \n1 Gps aided geo augmented navigation GAGAN \n2 Account Acct \n3 Additional, additionally Addl  \n4 Addition Addn  \n5 Administration admin \n6 armed forces AF \n7 Armed forces special power act AFSPA \n8 Approach, Aproaches Apch \n9 Application  Appl \n10 Attack Atk \n11 Attention  Attn \n12 Advanced technology vehicle ATV \n13 Automatic auto \n14 Available Avail \n15 Between b/w \n16 Border Bdr \n17 Borders Bdrs \n18 Boundary Bfry \n19 Building Bldg \n20 Calculation  calc \n21 Capabilities Cap \n22 Combatants Cbt \n23 Commander Cdr \n24 Characterized Char \n25 Comments Cm \n26 Commands Cmd \n27 Collaboration Colb \n28 Community, communities Com \n29 Command, Commands, Commanding comd \n30 Communication Comm \n31 Control Con \n32 Conclude, conclusion  Concl \n33 Consultation Conslt \n34 Continuous, Continuously cont \n35 Cooperation coop \n36 Decrease, Decreases, Decreasing dec \n37 D

In [131]:
#separate from code

In [117]:
formatted_context = []

for i, chunk in enumerate(retrieved_docs):
    context_entry = (
        f"Text Snippet {i+1}: {chunk.page_content}\n"
        f"Source: {chunk.metadata['source']}, Page: {chunk.metadata['page']}"
    )
    formatted_context.append(context_entry)

# Join all entries with double newlines
final_context = "\n\n".join(formatted_context)

In [130]:
from langchain.prompts import PromptTemplate

In [132]:
document_prompt = PromptTemplate.from_template(
    "Exact text: {page_content}\nSource: {source}\nPage: {page}"
)

In [29]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash",temperature=0.3, max_tokens=1000)

In [149]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
   """
  You are a factual assistant. Follow these steps:
1. Provide a direct answer.
2. Extract EXACT text snippets with sources from the provided context.
3. Structure the response as a JSON object with the following format:
format: {{ 
    "answer": [Direct factual answer or null if not found], 
    "references": [ {{ 
        "exact_text": [Extracted text or null if not found], 
        "source": [Filename or null], 
        "page": [Page number or null], 
        "figure": [Figure number if mentioned, otherwise null] 
    }}, 
    ... 
    ] 
}}

**Rules:** 
- If any information is missing, set it to `null` instead of omitting it. 
- Ensure `references` is an array, even if it has zero or one entries.
- If a reference contains a **figure number**, extract it and include it in the `"figure"` field. 
- return only a valid json not mention it on above
- If no relevant information is found in the context, return: 
{{ "answer": null, "references": [] }}

**Context:** 
{context}
  
        """
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [150]:
question_answer_chain = create_stuff_documents_chain(llm, prompt ,document_prompt=document_prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [151]:
import time 
start_time = time.time()
response = rag_chain.invoke({"input": "give me some Russian Supplies as a Percentage of Total Gas Imports in Europe"})
print(response["answer"])
end_time = time.time()
total_time = end_time - start_time
print("Response Time : " , total_time)

```json
{
  "answer": "The provided text mentions that Figure 2.1 shows Russian gas supplies as a percentage of total gas imports in Europe, with categories above 90%, 50-89%, 10-49%, below 10%, and no natural gas infrastructure.  However, the figure itself is not included in this text.",
  "references": [
    {
      "exact_text": "Figure2.1\nRussian Supplies as a Percentage of Total Gas Imports in Europe\n• Above 90%\nD 50%-89%\n• 10%-49%\nD Below 10%\nD No natural gas\ninfrastru ctu re",
      "source": "GreyzoneWarfare.pdf",
      "page": "23",
      "figure": "2.1"
    }
  ]
}
```
Response Time :  3.834144353866577
