# "Create your own Q&A System Using Python: Build a RAG Model for Answering Questions from ePubs with Gemini" at GDG London Google I/O Extended 2024

This is a workshop .ipynb designed to teach you the concepts of RAG. It is not optimally formatted, and it does not follow best practice for working with Python or building for production! It is designed to help you learn the concepts and experience the code!

Some code exists only to demonstrate a concept and serves no purpose to the overall project

0.   Play with embeddings for a second
1.   Read in our EPUB -> We're using Pride and Prejudice
2.   Parse the documents
3.   Use Recursive Character Text Splitting to CHUNK it
4.   Embed the chunks

# Set-Up
Pip install and imports

In [None]:
!pip install ebooklib

In [None]:
!pip install langchain-text-splitters

In [None]:
!pip install chromadb

In [None]:
!pip install google.generativeai

In [None]:
###############------------------ GEN AI tools
import google.generativeai as genai
from langchain_text_splitters import RecursiveCharacterTextSplitter

###############------------------ Vector database tools
import chromadb
from chromadb import Client
from chromadb.config import Settings
from chromadb import Documents, EmbeddingFunction, Embeddings

###############------------------ Google Colab
from google.colab import userdata
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

###############------------------ Ebook tools
from ebooklib import epub
import ebooklib
from bs4 import BeautifulSoup
import html

###############------------------ General tools
import pickle
import requests
from typing import List, Dict
import os

In [None]:
## If you have a Google API key for AI, insert as a secret called 'GOOGLE_API_KEY'!
## If you have no key, don't worry -> we have some pre-built things for you =)

# Sets two globals used by later cells:
#   GOOGLE_API_KEY -> the secret's value, or None when it could not be loaded
#   with_key       -> True when the key was loaded and genai was configured
try:
  GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
  genai.configure(api_key=GOOGLE_API_KEY)
except Exception:
  # Catch Exception rather than using a bare `except:` so that KeyboardInterrupt /
  # SystemExit still propagate. Define GOOGLE_API_KEY on this path too, so later
  # cells that reference it do not fail with a NameError.
  GOOGLE_API_KEY = None
  with_key = False
  print("We didn't find a key -> we'll use the pickle files of what we did earlier! ")
else:
  with_key = True
  print("Success, have loaded your key!")

# Part Zero -> Let's see an embedding!

In [None]:
# Produce `result`, a mapping with an 'embedding' entry, either live from the
# Gemini API or from a pre-computed pickle when no API key is available.
if with_key:
  result = genai.embed_content(
      model="models/text-embedding-004",
      content="Workshops are fun!!!",
      task_type="retrieval_document",
  )
  print('embedded content live')
else:
  print('Found No key, using the pickled version')
  # SECURITY NOTE: pickle.loads can execute arbitrary code. This is only
  # acceptable because the file comes from the workshop's own repository;
  # never unpickle data from a source you do not trust.
  pickled_response = requests.get(
      'https://github.com/LCarpenter87/GeminiRAG/raw/main/result.pkl',
      timeout=30,
  )
  # Fail with a clear HTTP error instead of trying to unpickle an error page.
  pickled_response.raise_for_status()
  result = pickle.loads(pickled_response.content)

In [None]:
print(result['embedding'])

# Part One -> Read in the Epub


In [None]:
## Download the book from GitHub
book_url = 'https://github.com/LCarpenter87/GeminiRAG/raw/main/pap.epub'
response = requests.get(book_url, timeout=60)
# Stop here on a failed download; otherwise an HTML error page would be written
# to temp.epub and the EPUB reader would fail with a confusing error.
response.raise_for_status()

with open('temp.epub', 'wb') as temp_file:
    temp_file.write(response.content)

# Read the EPUB book (the 'ignore_ncx' reader option is passed through as in the original)
book = epub.read_epub('temp.epub', {'ignore_ncx': True})

# Collect every item of type ITEM_DOCUMENT from the book into a list
items = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))


In [None]:
## We can explore a little of the content -> it's HTML!
items[0].get_body_content()[:500]

# Part Two -> Parse the content from the ebook

In [None]:
def document_to_str(document):
    """Return the text of every <p> element in an ebook document as one clean line.

    `document` must provide `get_body_content()` returning HTML. The paragraphs
    are joined with single spaces, HTML entities are decoded, and all runs of
    whitespace are collapsed to a single space.
    """
    parsed = BeautifulSoup(document.get_body_content(), 'html.parser')

    paragraphs = []
    for tag in parsed.find_all('p'):
        paragraphs.append(tag.get_text(separator=' ', strip=True))

    flattened = ' '.join(paragraphs).replace('\n', ' ')
    decoded = html.unescape(flattened)
    # Apostrophe entities that are still present after unescaping
    for entity in ("&#x27;", "&#39;"):
        decoded = decoded.replace(entity, "'")

    # split()/join collapses any remaining runs of whitespace
    return ' '.join(decoded.split())

example = document_to_str(items[1])
print(example)


In [None]:
## Apply the function to every document in our items

whole_book = [document_to_str(x) for x in items]

# Part Three -> Separate our contents into Chunks!

In [None]:
# https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/

# Configure the splitter: chunks of up to 1000 units with 50 units of overlap,
# where length is measured with len() -- i.e. characters, since whole_book holds strings.
# NOTE(review): is_separator_regex=False presumably means the separators are
# treated as literal strings rather than regex patterns -- confirm in the docs above.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

# Split every chapter string; each resulting element exposes its text as .page_content
texts = text_splitter.create_documents(whole_book)

In [None]:
print(texts[2])



> Big improvement opportunity -> We lost A LOT of metadata by stripping our HTML! Recursive splitting is very basic. We could use an HTML-based parser instead!



In [None]:
texts = {i:x.page_content for i,x in enumerate(texts)}

# Part Four -> Create a local database using Chroma, to hold our embeddings and also return the content!

In [None]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that embeds text with Gemini's text-embedding-004 model."""

    def __call__(self, input: Documents) -> Embeddings:
        """Embed `input` and return the "embedding" entry of the API response.

        Configures genai with the global GOOGLE_API_KEY on every call.
        NOTE(review): the same "retrieval_document" task type is used whether
        Chroma is embedding stored documents or query texts -- confirm this is intended.
        """
        genai.configure(api_key=GOOGLE_API_KEY)
        response = genai.embed_content(
            model="models/text-embedding-004",
            content=input,
            task_type="retrieval_document",
            title="Custom query",
        )
        return response["embedding"]

def set_up_chroma_db(path: str, name: str):
    """Open (or create) a persistent Chroma collection that embeds with Gemini.

    Args:
        path: Directory where the persistent Chroma client stores its data.
        name: Name of the collection to load or create.

    Returns:
        The Chroma collection, using GeminiEmbeddingFunction for embeddings.

    Prints "DB loaded" when the collection already existed, "DB created" otherwise.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    embedding_function = GeminiEmbeddingFunction()

    # Decide loaded-vs-created by looking the name up, instead of catching the
    # "collection not found" exception: the exception type raised by
    # get_collection differs between chromadb releases, and chromadb is
    # installed unpinned in this notebook.
    # list_collections() yields plain names or Collection objects depending on
    # the release, so handle both.
    existing_names = {
        entry if isinstance(entry, str) else entry.name
        for entry in chroma_client.list_collections()
    }
    already_exists = name in existing_names

    db = chroma_client.get_or_create_collection(name=name, embedding_function=embedding_function)
    print("DB loaded" if already_exists else "DB created")
    return db

path = "/content/drive/MyDrive/Extended_Workshop"
name = "Pride_and_prejudice_QA"
db = set_up_chroma_db(path, name)

## Let's Check Costs and Tokens!

In [None]:
if with_key:
  model_info = genai.get_model('models/text-embedding-004')
  print(model_info.input_token_limit)

In [None]:
if with_key:
  model = genai.GenerativeModel('models/gemini-1.5-flash')
  print(model.count_tokens(example))

In [None]:
## This is the price per 1,000 characters from the Vertex AI pricing info site
price_per_1000_characters = 0.000025

# Cost of embedding the single example chapter
price = len(example) / 1000 * price_per_1000_characters

# Cost of the whole book, from the real character count of every parsed chapter
# (rather than assuming every chapter is as long as the example one)
whole_book_characters = sum(len(chapter) for chapter in whole_book)
whole_book_price = whole_book_characters / 1000 * price_per_1000_characters

# Six decimals: at this price, three decimals would round everything to $0.000
print(f'Price for the chapter ${price:.6f}')
print(f'Price for the whole book approx ${whole_book_price:.6f}')

# Part Five -> Add the embeddings to the database in batches!

In [None]:
def add_to_chroma_db_in_batches(db: "chromadb.Collection", texts: Dict[int, str], book_title: str, batch_size: int, start_index: int = 0):
    """Add documents to a Chroma collection in batches, with book-title metadata.

    Args:
        db: Collection to add to; only its ``add(documents=, ids=, metadatas=)``
            method is used.
        texts: Mapping of chunk key -> chunk text. Keys are stringified to form
            the document IDs.
        book_title: Stored as the ``book_title`` metadata of every document.
        batch_size: Number of documents sent per ``db.add`` call.
        start_index: Position (in ``texts`` iteration order) to start from, so an
            interrupted run can be resumed.

    A batch that raises is reported and skipped; later batches are still
    attempted. Progress is printed only for batches that were actually added.
    """
    # Materialise the key order once instead of rebuilding the list per batch
    keys = list(texts)
    num_docs = len(keys)

    for i in range(start_index, num_docs, batch_size):
        batch_keys = keys[i:i + batch_size]
        batch_docs = [texts[key] for key in batch_keys]

        # Prepare IDs and metadata for the batch
        ids = [str(key) for key in batch_keys]
        metadatas = [{"book_title": book_title} for _ in batch_keys]

        try:
            # Add to Chroma DB (embedding is handled by the collection's embedding function)
            db.add(
                documents=batch_docs,
                ids=ids,
                metadatas=metadatas,
            )
        except Exception as e:
            # Best effort: report the failed batch and carry on with the next one
            print(f"Error adding batch {i}-{i + batch_size}: {e}")
        else:
            # Only claim progress when the batch really went in
            print(f"Added {min(i + batch_size, num_docs)} out of {num_docs} documents")


In [None]:
short_texts = {k:v for k,v in texts.items() if k in range(0,10)}

In [None]:
## Add the small sample (short_texts) to the DB in batches of up to 100.
## To add the whole book instead, pass `texts` in place of `short_texts`.
add_to_chroma_db_in_batches(db, short_texts, "Pride and Prejudice", 100, 0)

# Part Six -> Querying the database!

In [None]:
## Lets see what passages we get back!

db.query(query_texts=["What are the names of Mrs Bennet's daughters?"])

In [None]:
def get_relevant_passage(query, db, n_results):
  """Query `db` for `query` and return all returned documents as one space-joined string.

  `db.query` is expected to return a mapping whose 'documents' entry is a list
  of lists of strings; each inner list is joined with spaces, then the results
  are joined with spaces again.
  """
  hits = db.query(query_texts=[query], n_results=n_results)
  return ' '.join(' '.join(group) for group in hits['documents'])

In [None]:
def make_rag_prompt(query, relevant_passage):
  """Build the RAG prompt sent to the model.

  Args:
    query: The user's question, inserted verbatim after "QUESTION:".
    relevant_passage: Retrieved text. Single quotes and double quotes are
      removed and newlines become spaces before it is inserted after "PASSAGES:".

  Returns:
    The complete prompt string, ending with an "ANSWER:" cue.
  """
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  # The trailing backslash on the first line is a deliberate line continuation.
  # The stray backslash that used to precede "You can paraphrase" was an invalid
  # escape sequence that leaked a literal backslash into the prompt; it is removed.
  prompt = (f"""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
  Only use information given in the passage information, answering as fully as possible using the information provided.
  You can paraphrase the passage or extrapolate if necessary. If the information is not given at all in the passage say "i do not know".

  The answer should be well written, and be straight to the point.
  It should not include any of the passage given.
  QUESTION: '{query}'
  PASSAGES: '{escaped}'

  ANSWER:
  """)
  return prompt

In [None]:
def generate_answer_genai(prompt):
    """Send `prompt` to the 'gemini-1.5-flash' model and return the response text.

    Reads the 'GOOGLE_API_KEY' secret from Colab userdata and configures genai
    with it on every call.
    """
    genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

    # Generation settings passed to the model
    sampling_options = dict(
        temperature=1,
        top_p=0.95,
        top_k=64,
        max_output_tokens=8192,
        response_mime_type="text/plain",
    )
    llm = genai.GenerativeModel('gemini-1.5-flash', generation_config=sampling_options)
    return llm.generate_content(prompt).text

In [None]:
def generate_answer(db, query, n_results=5):
    """Answer `query` with retrieval-augmented generation over `db`.

    Args:
        db: Collection passed to get_relevant_passage for retrieval.
        query: The user's question.
        n_results: How many passages to retrieve (defaults to 5, the value
            that was previously hard-coded).

    Returns:
        The model's answer with surrounding whitespace stripped.
    """
    # get_relevant_passage already returns one joined string, so it is passed
    # straight through (a further "".join() on a str would be a no-op).
    relevant_text = get_relevant_passage(query, db, n_results=n_results)
    prompt = make_rag_prompt(query, relevant_passage=relevant_text)
    answer = generate_answer_genai(prompt)

    return answer.strip()

In [None]:
query = "What are the names of Mrs Bennet's daughters?"
generate_answer(db,query)

In [None]:
query = "Who is considered to be very proud?"
generate_answer(db,query)

In [None]:
query = "Who does Elizabeth dislike from the first time they meet?"
generate_answer(db,query)

In [None]:
query = "Who elopes with Wickham?"
generate_answer(db,query)

In [None]:
query = "Who saves Lydia from Mr.Wickham?"
generate_answer(db,query)

In [None]:
query = "Who is the younger sibling of Mr.Darcy"
generate_answer(db,query)

In [None]:
query = "Who is Mr.Darcy"
generate_answer(db,query)

In [None]:
query = "What is a truth universally acknowledged"
generate_answer(db,query)