In [5]:
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.embeddings.bedrock import BedrockEmbeddings


def get_embedding_function():
    """Return the embedding model used for both indexing and querying.

    Builds an ``OllamaEmbeddings`` instance for the ``nomic-embed-text``
    model. Every Chroma handle in this notebook is created with the object
    returned here, so stored chunks and incoming queries are embedded the
    same way.
    """
    # Alternative backend, kept for reference (not active):
    #   BedrockEmbeddings(credentials_profile_name="default", region_name="us-east-1")
    return OllamaEmbeddings(model="nomic-embed-text")

In [39]:
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
import hashlib
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

In [33]:
# Directory handed to Chroma as its persist_directory.
# NOTE(review): DATA_PATH below is a sub-directory of CHROMA_PATH, so removing
# the CHROMA_PATH tree would remove the source PDFs as well. Consider a
# dedicated sub-directory for the vector store, and a path that is not tied to
# one user's machine.
CHROMA_PATH = "/Users/joudi/Measuring_RAG_Effectiveness/"
# Directory that the PDF loader reads source documents from.
DATA_PATH = "/Users/joudi/Measuring_RAG_Effectiveness/data"

In [34]:
def load_documents():
    """Load the contents of ``DATA_PATH`` with ``PyPDFDirectoryLoader``.

    Returns:
        Whatever the loader's ``load()`` call produces for that directory.
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

def split_documents(documents):
    """Split ``documents`` into overlapping chunks.

    Length is measured with ``len`` (characters): chunks are capped at 800
    with an overlap of 80, and separators are treated as literal strings
    rather than regular expressions.

    Args:
        documents: The documents to split.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 80,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

def calculate_chunk_ids(chunks):
    """Attach an ID to each chunk, in place, and return ``chunks``.

    The ID has the form ``"<source>:<page>:<md5 of page_content>"`` and is
    written to ``chunk.metadata["id"]``. Missing ``source`` / ``page``
    metadata fall back to ``"unknown_source"`` / ``"unknown_page"``.

    Args:
        chunks: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content`` (a str).

    Returns:
        The same ``chunks`` object that was passed in.
    """
    for piece in chunks:
        meta = piece.metadata
        origin = meta.get("source", "unknown_source")
        page_no = meta.get("page", "unknown_page")
        # The content digest tells apart chunks that share a source and page.
        digest = hashlib.md5(piece.page_content.encode("utf-8")).hexdigest()
        meta["id"] = "{}:{}:{}".format(origin, page_no, digest)

    return chunks

def clear_database():
    """Delete the persisted Chroma directory at ``CHROMA_PATH`` if it exists.

    Does nothing when the directory is absent.

    Raises:
        RuntimeError: If ``DATA_PATH`` is ``CHROMA_PATH`` itself or lives
            inside it, because removing the tree would also delete the
            source documents.
    """
    if not os.path.exists(CHROMA_PATH):
        return

    chroma_root = os.path.realpath(CHROMA_PATH)
    data_root = os.path.realpath(DATA_PATH)
    try:
        data_inside_db = os.path.commonpath([chroma_root, data_root]) == chroma_root
    except ValueError:
        # commonpath raises when the paths cannot share a prefix
        # (e.g. different drives), in which case they are not nested.
        data_inside_db = False

    if data_inside_db:
        raise RuntimeError(
            f"Refusing to delete {CHROMA_PATH!r}: it contains the data "
            f"directory {DATA_PATH!r}. Point CHROMA_PATH at a dedicated "
            "directory before clearing the database."
        )

    shutil.rmtree(CHROMA_PATH)

def add_to_chroma(chunks):
    """Add chunks that are not yet stored to the Chroma DB at ``CHROMA_PATH``.

    Each chunk is given an ID via ``calculate_chunk_ids``. Chunks whose ID is
    already in the store are skipped, and the remaining ones are added once
    and persisted. Progress is reported with ``print``.

    Args:
        chunks: Chunks as accepted by ``calculate_chunk_ids``; their
            ``metadata`` is updated in place with an ``"id"`` entry.
    """
    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )
    chunks_with_ids = calculate_chunk_ids(chunks)

    # include=[] asks for no payload fields; the IDs are still returned.
    existing_ids = set(db.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Keep the first chunk seen per ID. IDs are content-derived, so repeated
    # text on one page yields the same ID, and a batch must not repeat an ID.
    new_chunks_by_id = {}
    for chunk in chunks_with_ids:
        chunk_id = chunk.metadata["id"]
        if chunk_id not in existing_ids:
            new_chunks_by_id.setdefault(chunk_id, chunk)

    if not new_chunks_by_id:
        print("✅ No new documents to add")
        return

    new_chunk_ids = list(new_chunks_by_id)
    new_chunks = list(new_chunks_by_id.values())
    print(f"👉 Adding new documents: {len(new_chunks)}")
    # Add exactly once: a second add_documents call for the same batch would
    # embed and write every chunk again.
    db.add_documents(new_chunks, ids=new_chunk_ids)
    db.persist()
    print("✅ Documents successfully added and persisted.")

def check_num_in_DB(path):
    """Print and return how many IDs are stored in the Chroma DB at ``path``.

    Best-effort: any exception raised while opening or reading the store is
    printed and reported as a count of 0.

    Args:
        path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The number of stored IDs, or 0 if an error occurred.
    """
    try:
        store = Chroma(
            persist_directory=path,
            embedding_function=get_embedding_function(),
        )
        # include=[] asks for no payload fields; the IDs are still returned.
        stored_ids = store.get(include=[])["ids"]
        id_count = len(stored_ids)
        print(f"📊 Number of existing documents in DB: {id_count}")
        return id_count
    except Exception as e:
        print(f"❌ Error checking existing documents: {e}")
        return 0

In [37]:
# Build/refresh the index: load the PDFs, split them into chunks, and add the
# chunks that are not already stored.
documents = load_documents()
chunks = split_documents(documents)
add_to_chroma(chunks)

# Optional sanity check: prints and returns the number of IDs in the store.
# NOTE(review): the saved output of this cell includes the check's line, so it
# was produced while the call below was enabled.
#check_num_in_DB(CHROMA_PATH)

Number of existing documents in DB: 2226
✅ No new documents to add
📊 Number of existing documents in DB: 2226


2226

In [38]:

# Prompt used by query_rag. {context} is filled with the retrieved chunk texts
# and {question} with the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""


In [43]:
def query_rag(query_text: str, k: int = 5, model_name: str = "llama3"):
    """Answer ``query_text`` using chunks retrieved from the Chroma DB.

    Retrieves the ``k`` most similar chunks from the store at ``CHROMA_PATH``,
    fills ``PROMPT_TEMPLATE`` with their text and the question, and sends the
    prompt to an Ollama model. The response and the IDs of the retrieved
    chunks are printed.

    Args:
        query_text: The question to answer.
        k: Number of chunks to retrieve as context (default 5).
        model_name: Ollama model used to generate the answer
            (default ``"llama3"``).

    Returns:
        The model's response text.
    """
    # Open the store with the same embedding function used for indexing.
    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    # Each result is a (document, score) pair.
    results = db.similarity_search_with_score(query_text, k=k)
    retrieved_docs = [doc for doc, _score in results]

    # Join the retrieved chunk texts, divided by a visible separator.
    context_text = "\n\n---\n\n".join(doc.page_content for doc in retrieved_docs)

    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = Ollama(model=model_name)
    response_text = model.invoke(prompt)

    # Chunk IDs written at indexing time; None for any chunk without one.
    sources = [doc.metadata.get("id", None) for doc in retrieved_docs]

    print(f"Response: {response_text}\nSources: {sources}")

    return response_text

def query_without_context(query_text: str, model_name: str = "llama3"):
    """Send ``query_text`` straight to an Ollama model, with no retrieved context.

    Args:
        query_text: The question to ask.
        model_name: Ollama model to query (default ``"llama3"``).

    Returns:
        The model's response text.
    """
    model = Ollama(model=model_name)
    return model.invoke(query_text)

In [49]:
# Ask the retrieval-augmented pipeline. query_rag prints the response and the
# source chunk IDs, and returns the response text.
query_text = "for what reasons are understanding Apple’s mobile ecosystem is essential?"
response = query_rag(query_text)

Response: According to the text, understanding Apple's mobile ecosystem is essential because:

i) Apple heavily promotes its platform as privacy-oriented using phrases such as “Privacy. That’s Apple” [5], which may lead iOS users to be less concerned about the privacy implications of default apps than Android users.

ii) Due to the closed nature of the ecosystem, verifying data handling practices can be challenging.

iii) In general, Apple's ecosystem is severely understudied in relevant literature, yet it is very popular.
Sources: ['/Users/joudi/Measuring_RAG_Effectiveness/data/Apple privacy of default apps.pdf:3:d9ecc5563b16edf707443f12a490185a', '/Users/joudi/Measuring_RAG_Effectiveness/data/Apple privacy of default apps.pdf:3:ea8014b46b470c41b0a45ecdbd4b9cb4', '/Users/joudi/Measuring_RAG_Effectiveness/data/Apple privacy of default apps.pdf:0:48b208ca16f13234b9bc242f67ae13a1', '/Users/joudi/Measuring_RAG_Effectiveness/data/Apple privacy of default apps.pdf:3:fb9f42e52a1225d6182ab7ab

In [50]:
# Same question sent to the model with no retrieved context, for comparison.
print(query_without_context(query_text))

Understanding Apple's mobile ecosystem is essential for several reasons:

1. **Dominant Market Share**: Apple has a significant market share in the smartphone industry, with over 50% of the global market share as of 2022. As such, it's crucial to comprehend how their ecosystem functions and how it affects users.
2. **Integration and Interoperability**: Apple devices are designed to work seamlessly together, creating an integrated experience across products like iPhones, iPads, MacBooks, and Apple Watches. Understanding this integration is vital for developers and businesses seeking to create compatible solutions.
3. **Closed System**: Apple's mobile ecosystem is a closed system, meaning that third-party apps must comply with strict guidelines and policies to ensure quality and security. Familiarity with these requirements can help developers and businesses create compliant products and services.
4. **Security and Compliance**: Apple's focus on security and compliance makes it essential