# Install and Imports

In [2]:
!pip install pypdf
!pip install google-generativeai
!pip install --upgrade chromadb
!pip install typing



In [13]:
import requests
from pypdf import PdfReader
import os
import re
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import chromadb
from chromadb.config import Settings
from typing import List
from google.colab import userdata
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Download and load PDF

In [12]:
def download_pdf(url, save_path, timeout=30):
    """Download the file at `url` and write its bytes to `save_path`.

    Args:
        url: Address fetched with an HTTP GET request.
        save_path: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of saving an error page as a PDF.
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)

def load_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Pages for which `extract_text()` returns nothing are skipped. Page texts
    are joined with a newline so the last word of one page is not fused to
    the first word of the next.

    Args:
        file_path: Path of the PDF file handed to `PdfReader`.

    Returns:
        The concatenated page texts, or an empty string when no page yields text.
    """
    reader = PdfReader(file_path)
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(page_text for page_text in page_texts if page_text)

# ToDo:
- Text splitting
- ChromaDB
- Prompt Construction

In [14]:
# TODO: Students implement text splitting function
def split_text(text):
    """
    Break the input text into sentence-level chunks.

    Delegates to NLTK's `sent_tokenize` and hands back its result unchanged.
    """
    return sent_tokenize(text)

# Custom embedding function using Gemini API
class GeminiEmbeddingFunction(EmbeddingFunction):
    """ChromaDB embedding function that delegates to `genai.embed_content`."""

    def __call__(self, input: Documents) -> Embeddings:
        """Embed `input` with the `models/embedding-001` model.

        The API key is read from the Colab secret `GOOGLE_API_KEY` and
        `genai` is configured with it on every call.

        Returns:
            The value under the "embedding" key of the `embed_content` result.

        Raises:
            ValueError: If the `GOOGLE_API_KEY` secret is empty.
        """
        gemini_api_key = userdata.get('GOOGLE_API_KEY')
        if not gemini_api_key:
            # The message names the source the key is actually read from.
            raise ValueError("Gemini API Key not provided. Please add GOOGLE_API_KEY to the Colab secrets")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        # NOTE(review): the same "retrieval_document" task type is presumably
        # also applied when Chroma embeds query text through this function;
        # confirm that is intended.
        return genai.embed_content(model=model, content=input, task_type="retrieval_document", title=title)["embedding"]

# TODO: Students implement ChromaDB creation and querying
def create_chroma_db(documents: List[str], path: str, name: str):
    """
    Open (or create) a persistent ChromaDB collection and fill it with `documents`.

    The collection lives under `path` and is bound to `GeminiEmbeddingFunction`.
    Documents are embedded and added only when the collection is empty, so
    calling this again for an already populated collection does not embed or
    insert anything.

    Args:
        documents: Text chunks to store; each id is the chunk's list position
            rendered as a string.
        path: Directory of the persistent ChromaDB store.
        name: Name of the collection.

    Returns:
        A `(collection, name)` tuple.
    """
    client = chromadb.PersistentClient(path=path)
    embedding_function = GeminiEmbeddingFunction()

    # get_or_create_collection avoids using a catch-all exception handler to
    # detect a missing collection, which would also hide unrelated errors.
    collection = client.get_or_create_collection(
        name=name,
        embedding_function=embedding_function)

    # Populate based on emptiness rather than on creation: a collection left
    # empty by a run whose embedding call failed is filled on the next run.
    if documents and collection.count() == 0:
        # Generate embeddings for documents
        embeddings = embedding_function(documents)

        # Save documents into db
        collection.add(
            documents=documents,
            ids=[str(i) for i in range(len(documents))],
            embeddings=embeddings)

    return collection, name


def get_relevant_passage(query: str, db, n_results: int):
    """
    Look up the passages in `db` that best match `query`.

    Calls `db.query` with the query text and the requested number of results,
    then returns whatever is stored under the result's "documents" key
    (None when that key is missing).
    """
    return db.query(n_results=n_results, query_texts=query).get("documents")

# TODO: Students implement prompt construction
def make_rag_prompt(query: str, relevant_passage: str):
    """
    Build the prompt sent to the generation model.

    The prompt consists of a fixed instruction line followed by the question
    and the retrieved passage, each on its own indented line.
    """
    indent = "    "
    prompt_lines = [
        "",
        indent + "Please repeat the question and give me the results base on the relevant passage.",
        indent + "Question: " + f"{query}",
        indent + "Relevant passage: " + f"{relevant_passage}",
        indent,
    ]
    return "\n".join(prompt_lines)


# LLM Response Generation

In [6]:
def generate_answer(prompt: str, model_name: str = 'gemini-pro'):
    """Generate an answer for `prompt` with a Gemini generative model.

    Args:
        prompt: Full prompt text sent to the model.
        model_name: Model name passed to `genai.GenerativeModel`; the default
            keeps the model this function has always used.

    Returns:
        The `text` attribute of the model's response.

    Raises:
        ValueError: If the Colab secret `GOOGLE_API_KEY` is empty.
    """
    gemini_api_key = userdata.get('GOOGLE_API_KEY')
    if not gemini_api_key:
        # The message names the source the key is actually read from.
        raise ValueError("Gemini API Key not provided. Please add GOOGLE_API_KEY to the Colab secrets")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel(model_name)
    result = model.generate_content(prompt)
    return result.text

# Main execution
## ToDo:
 - Chat history
 - Multiple file ingest

In [15]:
def main():
    """Run the RAG demo end to end.

    Downloads the whitepaper PDF, extracts and chunks its text, stores the
    chunks in a persistent ChromaDB collection, then asks the user for a
    query, retrieves the three closest chunks and prints the generated answer.
    """
    # Set up configurations
    pdf_url = "https://services.google.com/fh/files/misc/ai_adoption_framework_whitepaper.pdf"
    pdf_path = "ai_adoption_framework_whitepaper.pdf"
    db_folder = "chroma_db"
    db_name = "rag_experiment"

    # Create database directory
    db_path = os.path.join(os.getcwd(), db_folder)
    os.makedirs(db_path, exist_ok=True)

    # Download and process PDF
    download_pdf(pdf_url, pdf_path)
    pdf_text = load_pdf(pdf_path)

    # Split text into chunks
    chunked_text = split_text(pdf_text)

    # Create and set up database
    db, db_name = create_chroma_db(chunked_text, db_path, db_name)

    # Process user query
    query = input("Please enter your query: ")
    relevant_text = get_relevant_passage(query, db, n_results=3)
    print(relevant_text)

    # The result is a list of passage lists, so `[[]]` is truthy even when
    # nothing matched; flatten first and test the flattened list instead.
    passages = [item for sublist in (relevant_text or []) for item in sublist]

    # Generate and display answer
    if passages:
        # Join with a space so neighbouring passages are not fused together.
        final_prompt = make_rag_prompt(query, " ".join(passages))
        answer = generate_answer(final_prompt)
        print("\nGenerated Answer:", answer)
    else:
        print("No relevant information found for the given query.")

if __name__ == "__main__":
    main()

Please enter your query: Explain the power of AI
[['How do you harness the power inherent in AI, while avoiding any \npotential missteps?', 'But at every step along the way, \nadding in effective AI capabilities brings benefits.', 'In addition, your AI capabilities are supported by clear governance \nand decision-making responsibilities.']]

Generated Answer: **Repeated Question:** Explain the power of AI

**Results based on the relevant passage:**

The passage does not provide a comprehensive explanation of the power of AI. However, it does mention some potential benefits of adding AI capabilities:

* **Increased efficiency:** AI can automate tasks and improve productivity.
* **Improved decision-making:** AI can provide insights and make recommendations based on data analysis.
* **Reduced costs:** AI can help businesses save money by automating processes and improving efficiency.
* **Enhanced customer experience:** AI can be used to provide personalized customer service and support.

