# RAG - PDF Search in multiple documents


## Installing and importing dependencies

In [None]:
!pip install -q transformers sentence_transformers faiss-cpu torch PyPDF2 nltk

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
import PyPDF2
import os
import nltk

nltk.download("punkt")
from nltk.tokenize import sent_tokenize
from google.colab import userdata

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Setting up the model and tokenizer

In [None]:
# The Hugging Face access token is read from Colab's `userdata` under "HF_TOKEN".
HUGGING_FACE_ACCESS_TOKEN = userdata.get("HF_TOKEN")

# Checkpoint used for answer generation.
model_name = "google/gemma-2-2b-it"

# Load the weights in float16, then move the model to the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    token=HUGGING_FACE_ACCESS_TOKEN,
)
model = model.to("cuda")

# Tokenizer belonging to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=HUGGING_FACE_ACCESS_TOKEN,
)

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

## Extracting and tokenizing info from the PDF files

The `extract_text_from_pdf()` function reads a single PDF file and returns the text of all its pages as one string (an empty string if the file cannot be read).

The `split_text_into_chunks()` function splits the text into sentences and groups consecutive sentences into chunks of roughly `max_chunk_size` characters (1000 by default).

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of one PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined without a separator, or an empty
        string when the file cannot be opened or parsed.
    """
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # A page without extractable text may yield None instead of a
            # string (NOTE(review): depends on the PyPDF2 version - confirm).
            # Treat it as empty so one such page does not make the join fail
            # and discard the whole document.
            page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        # Deliberately best-effort: an unreadable file is reported and
        # treated as empty so the remaining documents can still be processed.
        print(f"Error reading {pdf_path}: {e}")
        return ""
    return "".join(page_texts)


def split_text_into_chunks(text, max_chunk_size=1000):
    """Group the sentences of *text* into chunks of bounded length.

    The text is split into sentences with ``sent_tokenize`` and consecutive
    sentences are packed greedily into the current chunk while it stays
    within ``max_chunk_size`` characters. Sentences are never split, so a
    single sentence longer than ``max_chunk_size`` becomes a chunk of its own.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings, in document order. Empty when
        the text contains no sentences.
    """
    chunks = []
    current_chunk = ""

    for sentence in sent_tokenize(text):
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + " "
            continue

        # The sentence does not fit: close the current chunk. It is still
        # empty when the very first sentence is already too long, and an
        # empty chunk must not be emitted in that case.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = sentence + " "

    last_chunk = current_chunk.strip()
    if last_chunk:
        chunks.append(last_chunk)

    return chunks

## Extracting info from the PDFs

Set the variable `pdf_directory` with the path where your PDF files are.

A Pandas DataFrame is created with one row per PDF, containing the file path, its text chunks, and the embedding vectors of those chunks.

In [None]:
# Download the "punkt_tab" tokenizer data in addition to the "punkt" package
# fetched in the imports cell.
# NOTE(review): presumably needed by sent_tokenize on newer NLTK releases - confirm.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Sentence-embedding model used for the document chunks and for the queries.
encoder = SentenceTransformer("all-MiniLM-L6-v2")

# Process PDF files
pdf_directory = "/content/"
document_columns = ["path", "text_chunks", "embeddings"]
document_rows = []

# Sorted so the row order (and therefore the order of the vectors in the
# index built from this DataFrame) is the same on every run.
for filename in sorted(os.listdir(pdf_directory)):
    # Case-insensitive match so files named "*.PDF" are picked up as well.
    if not filename.lower().endswith(".pdf"):
        continue

    print(filename)
    pdf_path = os.path.join(pdf_directory, filename)
    text = extract_text_from_pdf(pdf_path)
    chunks = split_text_into_chunks(text)

    # A PDF without extractable text yields no chunks; encoding an empty list
    # would store an empty embedding array that cannot be stacked with the
    # other documents' embeddings, so such files are skipped.
    if not chunks:
        print(f"Skipping {pdf_path}: no text could be extracted")
        continue

    document_embeddings = encoder.encode(chunks)
    document_rows.append(
        pd.DataFrame(
            {
                "path": [pdf_path],
                "text_chunks": [chunks],
                "embeddings": [document_embeddings],
            }
        )
    )

# Concatenate once at the end instead of growing the DataFrame row by row.
if document_rows:
    df_documents = pd.concat(document_rows, ignore_index=True)
else:
    df_documents = pd.DataFrame(columns=document_columns)

df_documents

GDPR-GenAI.pdf


Unnamed: 0,path,text_chunks,embeddings
0,/content/GDPR-GenAI.pdf,[1GDPR & Generative AI\nA Guide for Customers\...,"[[-0.05703715, 0.032348257, 0.008656179, -0.05..."


## Creating a FAISS index from all document embeddings

Faiss is a library for efficient similarity search and clustering of vectors. The IndexFlatL2 algorithm will be applied to all chunk embedding vectors.

In [None]:
# Stack the per-document embedding arrays row-wise into one matrix; the row
# order follows the document order in df_documents, then the chunk order
# inside each document.
all_embeddings = np.vstack(list(df_documents["embeddings"]))
dimension = all_embeddings.shape[1]

# Flat L2 index over all chunk embeddings.
index = faiss.IndexFlatL2(dimension)
index.add(all_embeddings)

## Calculating the embedding distance and generating an answer

The `find_most_similar_chunks()` function creates an embedding vector for your query and compares it against the embeddings of all chunks extracted from the PDF files, returning the `top_k` most similar chunks (3 by default), which are used as the context for the next function.

The `generate_response()` function generates an answer with the selected model, based on the context built from the most similar chunks.

In [None]:
def find_most_similar_chunks(query, top_k=3):
    """Return the chunks whose embeddings are closest to the query embedding.

    The query is embedded with the module-level ``encoder`` and searched in
    the module-level ``index``. Each hit is a position in the stacked
    embedding matrix; it is mapped back to a (document, chunk) pair using
    the number of chunks stored per row of ``df_documents``.

    Args:
        query: The question text.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of dicts with the keys ``"document"`` (PDF path), ``"chunk"``
        (chunk text) and ``"distance"`` (distance reported by the index), in
        the order returned by the index search. May contain fewer than
        ``top_k`` entries.
    """
    query_embedding = encoder.encode([query])
    distances, indices = index.search(query_embedding, top_k)

    chunk_lists = df_documents["text_chunks"].tolist()
    doc_paths = df_documents["path"].tolist()

    # doc_ends[d] is the flat index one past the last chunk of document d.
    doc_ends = np.cumsum([len(chunks) for chunks in chunk_lists])
    total_chunks = int(doc_ends[-1]) if len(doc_ends) else 0

    results = []
    for distance, flat_idx in zip(distances[0], indices[0]):
        flat_idx = int(flat_idx)
        # The index pads missing neighbours with -1 when it holds fewer than
        # top_k vectors; a negative value must not be used as a position.
        if not 0 <= flat_idx < total_chunks:
            continue

        # First document whose end lies beyond flat_idx (documents without
        # chunks are skipped automatically).
        doc_idx = int(np.searchsorted(doc_ends, flat_idx, side="right"))
        doc_start = int(doc_ends[doc_idx - 1]) if doc_idx else 0

        results.append(
            {
                "document": doc_paths[doc_idx],
                "chunk": chunk_lists[doc_idx][flat_idx - doc_start],
                "distance": distance,
            }
        )
    return results


def generate_response(query, context, max_length=1000):
    """Generate an answer to *query* from *context* with the loaded model.

    Args:
        query: The user's question.
        context: Text placed before the question in the prompt.
        max_length: Maximum number of new tokens to generate (passed to
            ``generate`` as ``max_new_tokens``).

    Returns:
        The generated answer text with surrounding whitespace removed.
    """
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"

    # Keep the attention mask together with the input ids and place both on
    # the device the model lives on instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output = model.generate(
            **inputs, max_new_tokens=max_length, num_return_sequences=1
        )

    # The returned sequence starts with the prompt tokens. Decode only what
    # comes after them rather than searching the decoded text for "Answer:",
    # which would match too early if the context or the question already
    # contains that marker.
    generated_ids = output[0][prompt_length:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return answer


def query_documents(query):
    """Answer *query* using the most similar PDF chunks as context.

    Args:
        query: The question to answer.

    Returns:
        A tuple ``(response, similar_chunks)``: the generated answer and the
        list of retrieved chunk dicts the context was built from.
    """
    similar_chunks = find_most_similar_chunks(query)
    # Replace line breaks with a space (not with nothing) so the words on
    # either side of a line break are not fused together in the prompt.
    context = " ".join(
        result["chunk"].replace("\n", " ") for result in similar_chunks
    )
    response = generate_response(query, context)
    return response, similar_chunks

## Looking for info in the PDFs

The variable `query` contains the information you want to retrieve from the PDF files.

In [None]:
# Question to ask about the indexed PDF files.
query = "Tell me about how Azure OpenAI Service use personal data"
answer, relevant_chunks = query_documents(query)

# Report header: the query followed by the generated answer.
for line in (
    "\n\n--Response---\n",
    f"Query: {query}\n\n-----\n",
    f"Generated answer: {answer}\n\n-----\n",
    "Relevant chunks:",
):
    print(line)

# One entry per retrieved chunk; newlines are stripped from the chunk line
# so each chunk is printed on a single line.
for chunk in relevant_chunks:
    print(f"Document: {chunk['document']}")
    print(f"Chunk: {chunk['chunk']}".replace("\n", ""))
    print(f"Distance: {chunk['distance']}")
    print()



--Response---

Query: Tell me about how Azure OpenAI Service use personal data

-----

Generated answer: The Azure OpenAI Service uses personal data in a few key ways:

**1. Prompt Engineering:** When you use the service, you provide prompts, and the service uses its AI to generate responses. This process involves analyzing your prompts and the context of your requests to generate relevant and accurate outputs.

**2. "On Your Data" Feature:** This feature allows you to use your own data to augment prompts and generate responses. The service retrieves relevant data from a configured Customer Data store and uses it to enhance the generated content. This feature enables you to run supported LLMs on your organization's data without needing to train or fine-tune models.

**3. Security and Privacy:** The Azure OpenAI Service prioritizes security and privacy. It implements technical and organizational measures to ensure the protection of personal data. This includes measures to prevent unau