### RAG on academic data

In [39]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone.grpc import PineconeGRPC as Pinecone
from sentence_transformers import SentenceTransformer

In [40]:
# Reading document
# Load the PDF files found under ../data/academic_data through
# PyPDFDirectoryLoader. `document` is the list returned by load()
# (presumably one entry per PDF page — confirm against the loader docs);
# its length is shown as the cell output.
file_loader = PyPDFDirectoryLoader("../data/academic_data")
document = file_loader.load()
len(document)

30

In [41]:
# Split the loaded documents into overlapping chunks for embedding.
# chunk_size / chunk_overlap are passed straight to the splitter.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(document)
print(len(chunks))


57


In [42]:
# Encode the text of every chunk with the sentence-transformers model.
# `embedding` holds one encoded vector per chunk, in the same order as `chunks`.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
chunk_texts = [chunk.page_content for chunk in chunks]
embedding = model.encode(chunk_texts)

In [43]:
# Pinecone configuration
# load_dotenv() is called before the os.getenv(...) lookups in the following
# cells so that the API keys and host can be supplied from the environment
# (presumably via a local .env file — confirm it exists and is not committed).
import os
from dotenv import load_dotenv
load_dotenv()

True

In [44]:
# Read the Pinecone credentials from the environment and create the client.
# os.getenv returns None when a variable is unset, so a missing key only
# surfaces once the client is used.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
# NOTE(review): pinecone_env is not referenced by any cell visible here.
pinecone_env = os.getenv("PINECONE_ENVIRONMENT")
pc = Pinecone(api_key=pinecone_api_key)
# NOTE(review): index_name is not referenced by the visible cells; the index
# is opened by host rather than by name.
index_name = "academic-rag"

In [45]:
# Open the target index by its host URL (taken from the environment).
host = os.getenv("PINECONE_HOST_ACAD")
index = pc.Index(host= host)

# Build one upsert record per chunk: a stable id, the embedding as a plain
# list, and the chunk text plus its source/page as metadata.
#
# The per-chunk vector gets its own name (`chunk_vector`). The previous loop
# used `embedding` as the loop variable, which rebound the global `embedding`
# array to its last row, so re-running this cell iterated over scalars and
# produced invalid records.
vectors = [
    {
        "id": f"chunk-{i}",
        "values": chunk_vector.tolist(),
        "metadata": {
            "text": chunk.page_content,
            # NOTE(review): .get() yields None when the key is missing —
            # confirm the index accepts null metadata values.
            "source": chunk.metadata.get("source"),
            "page": chunk.metadata.get("page"),
        },
    }
    for i, (chunk, chunk_vector) in enumerate(zip(chunks, embedding))
]

In [46]:
def batch_upsert(index, vectors, namespace, batch_size=500):
    """Upsert ``vectors`` into ``index`` in consecutive slices.

    Args:
        index: Object exposing ``upsert(vectors=..., namespace=...)``.
        vectors: Sequence of records to send; it is sliced, not modified.
        namespace: Namespace forwarded unchanged to every ``upsert`` call.
        batch_size: Maximum number of records per ``upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative step would
            otherwise produce an empty range and silently upload nothing.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size], namespace=namespace)


In [47]:
# Send every prepared record to the "rag-docs" namespace, 500 per request.
batch_upsert(index, vectors, namespace="rag-docs", batch_size=500)


In [48]:
# cosine similarity search
def retrieve_query(query,k=3):
    """Embed ``query`` and return the top-``k`` matches from the index.

    Args:
        query: Text to search for.
        k: Number of matches requested (passed as ``top_k``).

    Returns:
        The response object returned by ``index.query`` for the "rag-docs"
        namespace, with metadata included.
    """
    # Reuse the notebook-level `model` created in the embedding cell instead
    # of constructing a new SentenceTransformer on every call: same model,
    # same vectors, without the repeated load cost.
    query_embedding = model.encode(query).tolist()
    return index.query(
        vector=query_embedding,
        top_k=k,
        namespace="rag-docs",
        include_metadata=True,
    )

In [69]:
# Run a sample retrieval (default k=3) and print the raw matches so the
# retrieved chunk text and metadata can be inspected.
answer = retrieve_query("credits of Network and System Security")
print(answer.matches)

[{'id': 'chunk-37',
 'metadata': {'page': 20.0,
              'source': '..\\data\\academic_data\\6TH-Sem-Curriculum-and-Syllabus.pdf',
              'text': 'Text Books: \n'
                      'Authors Name Title Edition Publisher, \n'
                      'Country \n'
                      'Year \n'
                      'W. Stallings Network \n'
                      'Security \n'
                      'Essentials \n'
                      '6th Edition Prentice Hall , \n'
                      'International \n'
                      '2017 \n'
                      ' \n'
                      'Reference Books: \n'
                      'Authors \n'
                      'Name \n'
                      'Title Edition Publisher, Country Year \n'
                      'Shari \n'
                      'Lawrence \n'
                      'Pfleeger \n'
                      'Security in \n'
                      'Computing \n'
                      '4th Edition Prentice Hall , \n'
   

In [70]:
# LLM integration: llama-3.1-8b-instant
from langchain_groq import ChatGroq

# Groq chat client; the API key is read from the GROQ_API_KEY environment variable.
groq_api_key = os.getenv("GROQ_API_KEY")

llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="llama-3.1-8b-instant",
    temperature=0.1,
    max_tokens=1024,
)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query,llm,top_k=3):
    """Answer ``query`` with the LLM, grounded in passages retrieved for it.

    Args:
        query: The user's question; used both for retrieval and in the prompt.
        llm: Chat model exposing ``invoke(prompt)`` whose result has ``.content``.
        top_k: Number of passages to retrieve and place in the prompt.

    Returns:
        The text content of the LLM response, or a fixed message when no
        passage text was retrieved.
    """
    # Retrieve for *this* query. Reading a notebook-level variable here would
    # tie the answer to whatever query was run last and break on re-runs.
    results = retrieve_query(query, k=top_k)

    # Keep only matches that actually carry passage text.
    context = []
    for match in results.matches:
        text = (match.metadata or {}).get("text")
        if text:
            context.append(text)
    if not context:
        return "No relevant context found to answer the question."

    # Join passages as plain text rather than embedding a Python list repr.
    context_text = "\n\n".join(context)

    # The f-string already interpolates everything; a further .format() call
    # would raise on any "{" or "}" inside the retrieved text or the query.
    prompt=f"""Use the following context to answer the question concisely.
        Context:
        {context_text}

        Question: {query}

        Answer:"""

    response=llm.invoke(prompt)
    return response.content


In [71]:
# Store the generated text under its own name. Rebinding `answer` to a string
# here would replace the retrieval result assigned in the earlier query cell,
# so re-running any cell that reads `answer.matches` would fail.
rag_answer = rag_simple("credits of Network and System Security", llm, top_k=3)
print(rag_answer)

50
