# Install and Imports

In [1]:
!pip install pypdf
!pip install google-generativeai
!pip install chromadb
!pip install --upgrade chromadb
!pip install typing



Exception in thread Thread-5 (attachment_entry):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/debugpy/server/api.py", line 237, in listen
    sock, _ = endpoints_listener.accept()
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 294, in accept
    fd, addr = self._accept()
               ^^^^^^^^^^^^^^
TimeoutError: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.11/dist-packages/google/colab/_debugpy.py", line 52, in attachment_entry
    debugpy.listen(_dap_port)
  File "/usr/local/lib/python3.11/dist-packages/debugpy/public_api.py", line 31, in wrapper
    return wrapped(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^



In [50]:
import requests
from pypdf import PdfReader
import os
import re
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import chromadb
from chromadb.config import Settings
from typing import List
from google.colab import userdata
import nltk
from nltk.tokenize import sent_tokenize

# Download and load PDF

In [3]:
def download_pdf(url, save_path):
    response = requests.get(url)
    with open(save_path, 'wb') as f:
        f.write(response.content)

def load_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text
    return text

# ToDo:
- Text splitting
- ChromaDB
- Prompt Construction

In [51]:
# TODO: Students implement text splitting function
# Kiki
nltk.download('punkt_tab')
# TODO: Students implement text splitting function
def split_text(text):
    """
    Split the input text into meaningful chunks.
    Returns a list of text chunks.
    """
    chunks = sent_tokenize(text)
    # print(chunks)
    return chunks

# Custom embedding function using Gemini API
class GeminiEmbeddingFunction(EmbeddingFunction):
    def __call__(self, input: Documents) -> Embeddings:
        # gemini_api_key = os.getenv("GEMINI_API_KEY")
        gemini_api_key = userdata.get('GOOGLE_API_KEY')
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        return genai.embed_content(model=model, content=input, task_type="retrieval_document", title=title)["embedding"]

# TODO: Students implement ChromaDB creation and querying
# Kun
def create_chroma_db(documents: List[str], path: str, name: str):
    """
    Create a ChromaDB collection with the provided documents.
    Returns the database instance and name.

    Hint: Use the following to create the client:
    client = chromadb.Client(Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=path
    ))
    """

    client = chromadb.PersistentClient(path=path)
    embedding_functions = GeminiEmbeddingFunction()

    # collection存在就get or create
    # Create a collection for storing data
    try:
        collection = client.get_collection(name=name, embedding_function=embedding_functions)
    except chromadb.errors.InvalidCollectionException:
        collection = client.get_or_create_collection(name=name, embedding_function=embedding_functions)
    embeddings = embedding_functions(documents)

    ids = [str(i) for i in range(len(documents))]
    metadatas = [{"index": i} for i in range(len(documents))]

    collection.add(
        ids=ids,
        documents=documents,
        metadatas=metadatas,
        embeddings=embeddings
    )

    return collection, name

def get_relevant_passage(query: str, db, n_results: int):
    """
    Retrieve the most relevant passages for the given query.
    Returns a list of relevant text passages.
    """
    results = db.query(query_texts=[query], n_results=n_results)
    print("Results:", results)

    return " ".join([doc for sublist in results['documents'] for doc in sublist])


# TODO: Students implement prompt construction
# FL
def make_rag_prompt(query: str, relevant_passage: str):
    """
    Construct a prompt for the generation model using the query and retrieved passage.
    Returns the formatted prompt string.
    """
    prompt = f"""
    You are an advanced AI assistant tasked with answering user queries.
    Here is the relevant context retrieved from the database:
    "{relevant_passage}"

    User's question:
    "{query}"

    Based on the provided context, answer the user's query concisely and accurately.
    """
    return prompt
    # pass

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# LLM Response Generation

In [9]:
def generate_answer(prompt: str):
    """Generate answer using Gemini Pro API"""
    # gemini_api_key = os.getenv("GEMINI_API_KEY")
    gemini_api_key = userdata.get('GOOGLE_API_KEY')
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel('gemini-pro')
    result = model.generate_content(prompt)
    return result.text

# Main execution
## ToDo:
 - Chat history
 - Multiple file injest

In [52]:
def main():
    # Set up configurations
    pdf_url = "https://services.google.com/fh/files/misc/ai_adoption_framework_whitepaper.pdf"
    pdf_path = "ai_adoption_framework_whitepaper.pdf"
    db_folder = "chroma_db"
    db_name = "rag_experiment"

    # Create database directory
    if not os.path.exists(db_folder):
        os.makedirs(db_folder)

    # Download and process PDF
    download_pdf(pdf_url, pdf_path)
    pdf_text = load_pdf(pdf_path)

    # Split text into chunks
    chunked_text = split_text(pdf_text)
    # Create and set up database
    db_path = os.path.join(os.getcwd(), db_folder)
    db, name = create_chroma_db(chunked_text, db_path, db_name)

    # Process user query
    query = input("Please enter your query: ")
    relevant_text = get_relevant_passage(query, db, n_results=3)

    # Generate and display answer
    if relevant_text:
        final_prompt = make_rag_prompt(query, "".join(relevant_text))
        answer = generate_answer(final_prompt)
        print("\nGenerated Answer:", answer)
    else:
        print("No relevant information found for the given query.")

if __name__ == "__main__":
    main()



Please enter your query: What's the power of AI
Results: {'ids': [['11', '88', '5']], 'embeddings': None, 'documents': [['How do you harness the power inherent in AI, while avoiding any \npotential missteps?', 'But at every step along the way, \nadding in effective AI capabilities brings benefits.', 'But the path to building an effective AI capability is not an easy one.']], 'uris': None, 'data': None, 'metadatas': [[None, None, None]], 'distances': [[0.20150502026081085, 0.2513818144798279, 0.2521023750305176]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}

Generated Answer: AI's power lies in its ability to enhance capabilities at every step.
