In [181]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.llms import Ollama
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.prompts import (SystemMessagePromptTemplate,HumanMessagePromptTemplate,ChatPromptTemplate,MessagesPlaceholder)
from qdrant_client.http.models import VectorParams, PointStruct, HnswConfig, OptimizersConfig, WalConfig
import streamlit as st
from streamlit_chat import message
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams, HnswConfig
import numpy as np



### Loading the PDF

In [144]:
# Loading PDF Data
def load_documents(filename):
    file_path = f"./data/{filename}.pdf"
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    return docs
    # pages = loader.load_and_split()
    # print(pages[0])
    # print(len(docs))

docs=load_documents('sample_pdf')
print(docs[0].page_content[0:])
print(docs[0].metadata)

Information security, cybersecurity 
and privacy protection  — Information 
security management systems — 
Requirements
Sécurité de l'information, cybersécurité et protection de la vie 
privée  — Systèmes de management de la sécurité de l'information — 
ExigencesINTERNATIONAL 
STANDARDISO/IEC 
27001
Third edition  
2022-10
Reference number 
ISO/IEC 27001:2022(E)
© ISO/IEC 2022
--``,,,,,``````,,,,,`,`,`,`,,`,-`-`,,`,,`,`,,`---
{'source': './data/sample_pdf.pdf', 'page': 0}


### Chunking, embedding and inserting documents into Qdrant Database

In [146]:

# Initialize Qdrant client
client = QdrantClient(url="http://localhost:6333")  # Change to actual host for a persistent DB

# Function to split documents
def split_documents(documents, chunk_size=1000, chunk_overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=['\n\n', '\n', ' ', '', "(?<=\.)"]
    )
    split_docs = text_splitter.split_documents(documents)
    return split_docs

# Function to generate embeddings using BGE-Large
def create_embeddings(documents):
    # Initialize BGE-Large embedding model via Langchain's OllamaEmbeddings
    embedding_model = OllamaEmbeddings(model="bge-large")
    embeddings = embedding_model.embed_documents([doc.page_content for doc in documents])
    return embeddings

# Function to create or update the collection in Qdrant
def create_or_update_collection(client, collection_name, embedding_size):
    # Check if the collection exists
    if not client.collection_exists(collection_name=collection_name):
        # If it doesn't exist, create the collection with HNSW config
        print(f"Collection '{collection_name}' does not exist. Creating...")
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=embedding_size,  # Dimension of embeddings
                distance="Cosine"     # Use the distance metric of your choice
            ),
            hnsw_config=HnswConfig(
                m=16,              # HNSW graph configuration
                ef_construct=200   # Trade-off between recall and index time
            )
        )
    else:
        print(f"Collection '{collection_name}' already exists. Using the existing collection.")

# Function to insert embeddings into Qdrant in batches
def insert_embeddings_in_qdrant(client, embeddings, documents, collection_name="rag_embeddings", batch_size=1000):
    # Create or update the collection
    create_or_update_collection(client, collection_name, len(embeddings[0]))
    
    # Prepare points for insertion
    points = [
        PointStruct(
            id=i,
            vector=embeddings[i],
            payload={"text": documents[i].page_content, "metadata": documents[i].metadata}
        ) for i in range(len(embeddings))
    ]
    
    # Insert points in batches
    for i in range(0, len(points), batch_size):
        print(f"Inserting batch {i // batch_size + 1} of size {batch_size}")
        client.upsert(
            collection_name=collection_name,
            points=points[i:i + batch_size]
        )

    print("Embeddings inserted into Qdrant successfully!")

# Load documents (Ensure that the load_documents function is defined elsewhere in your code)
docs = load_documents('sample_pdf')
print(f"First doc content: {docs[0].page_content[:200]}")
print(f"Metadata: {docs[0].metadata}")

# Split documents
split_docs = split_documents(docs)
print(f"Number of split documents: {len(split_docs)}")

# Generate embeddings
embeddings = create_embeddings(split_docs)
print(f"First embedding: {embeddings[0][:5]}...")
print(f'First embedding size: {len(embeddings[0])}')

# Insert embeddings into Qdrant
insert_embeddings_in_qdrant(client, embeddings, split_docs)


First doc content: Information security, cybersecurity 
and privacy protection  — Information 
security management systems — 
Requirements
Sécurité de l'information, cybersécurité et protection de la vie 
privée  — Syst
Metadata: {'source': './data/sample_pdf.pdf', 'page': 0}
Number of split documents: 74
First embedding: [0.22365008294582367, -0.44660961627960205, 0.15050801634788513, 0.4327608644962311, -0.5780003666877747]...
First embedding size: 1024
Collection 'rag_embeddings' already exists. Using the existing collection.
Inserting batch 1 of size 1000
Embeddings inserted into Qdrant successfully!


In [183]:
# len(embeddings)
print(embeddings[1])
print(len(embeddings[1]))


[0.016203749924898148, -0.40745842456817627, -0.19279807806015015, 0.5539073348045349, -0.44253844022750854, -0.7595949172973633, 0.0035794414579868317, -0.08587189763784409, 0.32236623764038086, 1.200115442276001, 0.3055936098098755, 0.6204445958137512, 0.625378429889679, -0.4723284840583801, -0.8864867091178894, -0.036433808505535126, 0.09278106689453125, -0.01830032840371132, -0.33960801362991333, 0.16005003452301025, -0.05351147800683975, -0.5331597328186035, -1.2931432723999023, 0.2474759817123413, -0.7222786545753479, 0.6608253717422485, 0.48409122228622437, 0.4766383171081543, 1.058655858039856, 0.8256269693374634, 0.020189806818962097, 0.1841597855091095, -0.14292854070663452, -0.5162132978439331, 0.027521681040525436, -0.44853517413139343, 1.6636630296707153, -1.0238641500473022, 0.09996160864830017, -0.4378630816936493, -0.18477962911128998, -0.12469814717769623, 1.2667795419692993, -0.8814969062805176, -0.8540010452270508, -0.6265676021575928, -0.06474756449460983, -0.046856

In [182]:
len(split_docs)

74

In [184]:
#Checking if collection is created in our database 
collections = client.get_collections()
print(collections)


collections=[CollectionDescription(name='rag_embeddings')]


In [163]:
# Check how many vectors are stored in the collection
point_count = client.count(collection_name='rag_embeddings')
print(f"Number of points in collection: {point_count.count}")


Number of points in collection: 76


### Test vector search by querying an embedding

In [186]:
def search_qdrant(client, query_embedding, collection_name="rag_embeddings", top_k=5):
    results = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k
    )
    return results

# Use the first embedding (or any query embedding) as a test query
query_embedding = embeddings[0]  # Example query vector
results = search_qdrant(client, query_embedding)

# Inspect results
for result in results:
    print(f"Result ID: {result.id}, Score: {result.score}, Payload: {result.payload}")


Result ID: 25, Score: 0.7391363, Payload: {'text': "in industry where you're trying to get a syst em to work using a learning algorithm.  \nTo those of you that are not currently doing re search, one great way to do a project would \nbe if you apply learning algorithms to just pick a problem that you care about. Pick a \nproblem that you find interesting, and apply lear ning algorithms to that  and play with the \nideas and see what happens.", 'metadata': {'source': '/var/folders/gn/l39lfpd57m1flbk7d8cqb7yc0000gn/T/tmpeuc5ohkk.pdf', 'page': 6}}
Result ID: 32, Score: 0.7306084, Payload: {'text': "Later in this quarter, we'll also use the disc ussion sections to go over extensions for the \nmaterial that I'm teaching in the main lectur es. So machine learning is a huge field, and \nthere are a few extensions that we really want  to teach but didn't have time in the main \nlectures for.", 'metadata': {'source': '/var/folders/gn/l39lfpd57m1flbk7d8cqb7yc0000gn/T/tmpeuc5ohkk.pdf', 'page': 8}

### Retrieval 

In [152]:
def get_relevant_context(query_embedding,client, collection_name="rag_embeddings", top_k=5):
    # Search the Qdrant collection for top-k relevant documents based on the query embedding
    # query_embedding=get_query_embeddings()
    search_results = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k
    )
    # Extract the content from the search results
    relevant_texts = [result.payload['text'] for result in search_results]
    print(relevant_texts)
    return relevant_texts

# relevant_text =get_relevant_context(client)


In [153]:
def prepare_context(relevant_text):

    context = "\n\n".join(relevant_text)
    return context


In [185]:

def generate_response_with_llama(query, context):
    # Initialize the Llama 3.2 model via Ollama
    llama_model = Ollama(model="llama3.2:1b")
    
    # Creating prompt
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    # Generate a response from LLM
    response = llama_model(prompt)
    return response

# generate_response_with_llama(query='tell me about information security risk management',context=context)

In [154]:
def rag_pipeline(client, collection_name="rag_embeddings"):
    # Step 1: Vectorize the query using Ollama Embeddings (BGE-Large)
    query_text=input("enter your query")
    query_embedding = OllamaEmbeddings(model="bge-large").embed_query(query_text)
    
    # Step 2: Retrieve relevant context from Qdrant
    relevant_texts = get_relevant_context(query_embedding, client)
    
    # Step 3: Prepare context by concatenating relevant texts
    context = prepare_context(relevant_texts)
    
    # Step 4: Generate response using Llama 3.2:8b
    response = generate_response_with_llama(query_text, context)
    return response

rag_pipeline(client)

['Documented information required by the information security management system and by this \ndocument shall be controlled to ensure:\na) it is available and suitable for use, where and when it is needed; and\nb) it is adequately protected (e.g. from loss of confidentiality, improper use, or loss of integrity).\nFor the control of documented information, the organization shall address the following activities, as \napplicable:\nc) distribution, access, retrieval and use;\nd) storage and preservation, including the preservation of legibility;\ne) control of changes (e.g. version control); and\nf) retention and disposition.\nDocumented information of external origin, determined by the organization to be necessary for \nthe planning and operation of the information security management system, shall be identified as \nappropriate, and controlled.\nNOTE Access can imply a decision regarding the permission to view the documented information only, or', 'ISO/IEC 27001:2022(E)\nb) documented in

'The document appears to be a section of an Information Security Management System (ISMS) policy or procedure manual, specifically addressing the control of documented information. It outlines the procedures for managing and maintaining documentation related to the ISMS, ensuring that it is available, suitable, protected, and meets specific requirements.'