# [STARTER] Udaplay Project

## Part 01 - Offline RAG

In this part of the project, you'll build your VectorDB using Chroma.

The data is in the `project/starter/games` folder. Each file will become a document in the collection you'll create.
Example:
```json
{
  "Name": "Gran Turismo",
  "Platform": "PlayStation 1",
  "Genre": "Racing",
  "Publisher": "Sony Computer Entertainment",
  "Description": "A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.",
  "YearOfRelease": 1997
}
```


### Setup

In [None]:
# Only needed for Udacity workspace

import importlib.util
import sys

# Check if 'pysqlite3' is available before importing. find_spec() returns
# None when the package is not installed, in which case this cell does nothing.
if importlib.util.find_spec("pysqlite3") is not None:
    import pysqlite3
    # Re-register the freshly imported module under the name 'sqlite3' (and
    # drop the 'pysqlite3' entry), so any `import sqlite3` executed after this
    # line resolves to pysqlite3. Only imports that happen later are affected.
    # NOTE(review): presumably needed because the workspace's bundled SQLite
    # is too old for Chroma - confirm.
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [None]:
import os
import json
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv

# For Embeddings:
import numpy as np

In [None]:
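# TODO: Create a .env file with the following variables
# OPENAI_API_KEY="YOUR_KEY"
# CHROMA_OPENAI_API_KEY="YOUR_KEY"   (optional: OPENAI_API_KEY is used when this is unset)
# TAVILY_API_KEY="YOUR_KEY"
# VOC_OPENAI_API_KEY="YOUR_KEY"      (read by VocareumEmbeddingFunction; typically preset in Vocareum)
# OPENAI_BASE_URL="YOUR_BASE_URL"    (optional)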

In [None]:
# Load variables from a local .env file (if present) into the environment.
load_dotenv()


def _mask_secret(value, visible=4):
    """Return a display-safe form of a secret.

    Args:
        value: The secret string; may be None or empty.
        visible: Number of trailing characters left readable.

    Returns:
        "<not set>" for a missing value, "****" for a value too short to
        reveal any part of, otherwise "****" followed by the last
        ``visible`` characters.
    """
    if not value:
        return "<not set>"
    if len(value) <= visible * 2:
        return "****"
    return f"****{value[-visible:]}"


# Validate required API keys with helpful error messages.
# Keys are echoed in masked form only: cell output is stored inside the
# notebook file, so printing a full key leaks it to anyone who sees the file.
openai_api_key = os.getenv('OPENAI_API_KEY')
openai_base_url = os.getenv('OPENAI_BASE_URL')
if not openai_api_key:
    raise ValueError(
        'OPENAI_API_KEY not found in environment variables. '
        'Please create a .env file with OPENAI_API_KEY="your_key"'
    )
print(f"OPEN AI KEY: {_mask_secret(openai_api_key)}")
print(f"OPEN AI URL: {openai_base_url}")

tavily_api_key = os.getenv('TAVILY_API_KEY')
if not tavily_api_key:
    raise ValueError(
        'TAVILY_API_KEY not found in environment variables. '
        'Please create a .env file with TAVILY_API_KEY="your_key"'
    )
print(f"TAVILY API KEY: {_mask_secret(tavily_api_key)}")

# CHROMA_OPENAI_API_KEY is optional and falls back to OPENAI_API_KEY, which
# has already been validated above, so this value can never be empty here.
chroma_api_key = os.getenv('CHROMA_OPENAI_API_KEY') or openai_api_key
print(f"CHROMA API KEY: {_mask_secret(chroma_api_key)}")

print('✅ API keys loaded successfully!')

In [None]:
import os
from typing import List, Dict, Any
from chromadb.api.types import EmbeddingFunction
from openai import OpenAI


class VocareumEmbeddingFunction(EmbeddingFunction):
    """
    Embedding function that sends texts to the OpenAI embeddings endpoint
    exposed at https://openai.vocareum.com/v1.

    The API key is read from the VOC_OPENAI_API_KEY environment variable when
    the instance is created.
    """

    def __init__(self, model_name: str = "text-embedding-ada-002", **kwargs):
        """
        Build the underlying OpenAI client.
        Args:
            model_name: Name of the embedding model requested on every call.
            **kwargs: Accepted for call-site compatibility; not used.
        """
        self._model_name = model_name
        voc_key = os.environ.get("VOC_OPENAI_API_KEY")
        self._client = OpenAI(
            api_key=voc_key,
            base_url="https://openai.vocareum.com/v1",
        )

    def __call__(self, texts: List[str]) -> List[List[float]]:
        """
        Embed a batch of texts.
        Args:
            texts: The strings to embed.
        Returns:
            One embedding (a list of floats) per entry of the response data;
            an empty list when `texts` is empty.
        """
        if not texts:
            return []

        # The whole batch goes out in a single embeddings request.
        response = self._client.embeddings.create(
            model=self._model_name,
            input=texts,
        )

        vectors = []
        for item in response.data:
            vectors.append(item.embedding)
        return vectors

In [None]:
# Smoke test for VocareumEmbeddingFunction (for testing purposes).
voc_api_key = os.getenv('VOC_OPENAI_API_KEY')

if __name__ == '__main__':
    # Requires the VOC_OPENAI_API_KEY environment variable.
    # In Vocareum, this is typically set for you.
    if voc_api_key:
        embedding_function = VocareumEmbeddingFunction()
        sample_texts = ["Hello, world!", "This is a test."]
        embeddings = embedding_function(sample_texts)
        print(f"Successfully generated {len(embeddings)} embeddings.")
        print(f"Dimension of first embedding: {len(embeddings[0])}")
    else:
        print("Please set the VOC_OPENAI_API_KEY environment variable to test this script.")

### VectorDB Instance

In [None]:
# Instantiate the ChromaDB client.
# The path is relative, so it resolves against the current working directory;
# any path may be chosen, but later cells/notebooks that reopen this database
# must pass the same one.
chroma_client = chromadb.PersistentClient(path="chromadb")

### Collection

In [None]:
# Embedding function used by the collection (default model of
# VocareumEmbeddingFunction). Whatever is picked here must also be used
# when the collection is loaded again, otherwise query embeddings will not
# be comparable with the stored ones.
embedding_fn = VocareumEmbeddingFunction()

In [None]:

# Rebuild the "udaplay" collection from scratch so that re-running the
# notebook does not collide with documents left over from an earlier run.
try:
    chroma_client.delete_collection(name="udaplay")
    print("Old collection deleted successfully.")
except Exception as e:
    # Best-effort: on a first run there is nothing to delete.
    print(f"Collection did not exist or could not be deleted: {e}")

# Select cosine distance explicitly. Chroma's default space is squared L2,
# for which the `1 - distance` similarity shown by the search cell below is
# not a valid similarity score.
collection = chroma_client.get_or_create_collection(
    name="udaplay",
    embedding_function=embedding_fn,
    metadata={"hnsw:space": "cosine"},
)

print(f"SUCCESS: Collection '{collection.name}' created")

### Add documents

In [None]:
# Folder holding one JSON file per game. The path is relative, so the
# notebook is expected to run from the folder that contains "games"
# (project/starter in the course layout).
data_dir = "games"


# isdir (rather than exists) so a stray file named "games" is reported as a
# missing directory instead of crashing os.listdir below.
if os.path.isdir(data_dir):
    documents_added = 0

    # Sorted for a deterministic insertion order across runs.
    for file_name in sorted(os.listdir(data_dir)):
        if not file_name.endswith(".json"):
            continue

        file_path = os.path.join(data_dir, file_name)

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                game = json.load(f)

            # Text that gets embedded: platform, name, year and description.
            content = f"[{game['Platform']}] {game['Name']} ({game['YearOfRelease']}) - {game['Description']}"

            # Use filename (without extension) as document ID
            doc_id = os.path.splitext(file_name)[0]

            # The full JSON record is stored as metadata so search results
            # can display the individual fields.
            collection.add(
                ids=[doc_id],
                documents=[content],
                metadatas=[game]
            )

            documents_added += 1

        except Exception as e:
            # Best-effort ingestion: report the failing file and carry on
            # with the remaining ones.
            print(f"Error processing {file_name}: {e}")

    print(f"Successfully added {documents_added} documents to ChromaDB")
    print(f"Total documents in collection: {collection.count()}")
else:
    print("Games directory not found. Cannot load data.")



In [None]:
# Test semantic search functionality
def test_semantic_search(collection, query, n_results=3):
    """Run one semantic query against a collection and print the hits.

    Args:
        collection: Object exposing a Chroma-style ``query`` method; its
            metadatas must contain 'Name', 'YearOfRelease', 'Platform' and
            'Description' ('Genre' and 'Publisher' are optional).
        query: Free-text search string.
        n_results: Maximum number of hits to request.

    Returns:
        The raw result mapping returned by ``collection.query``.
    """
    print(f"\n=== Searching for: '{query}' ===")

    results = collection.query(
        query_texts=[query],
        n_results=n_results,
        include=['documents', 'metadatas', 'distances']
    )

    # A single query text was sent, so each result field holds one inner list.
    documents = results['documents'][0]
    if not documents:
        print("No results found.")
        return results

    print(f"Found {len(documents)} results:")

    hits = zip(documents, results['metadatas'][0], results['distances'][0])
    for rank, (_doc, metadata, distance) in enumerate(hits, start=1):
        # NOTE(review): `1 - distance` is a similarity only when the
        # collection uses cosine distance - confirm its `hnsw:space` setting.
        similarity = 1 - distance
        print(f"\n{rank}. {metadata['Name']} ({metadata['YearOfRelease']})")
        print(f"   Platform: {metadata['Platform']}")
        print(f"   Genre: {metadata.get('Genre', 'N/A')}")
        print(f"   Publisher: {metadata.get('Publisher', 'N/A')}")
        print(f"   Similarity: {similarity:.3f}")
        print(f"   Description: {metadata['Description'][:100]}...")

    return results

# Sample queries covering different angles on the data: publisher, genre,
# platform plus era, gameplay mechanics, and target audience.
test_queries = [
    "Sony published games",
    "RPG games",
    "PlayStation games from the 1990s",
    "Action games with shooting mechanics",
    "Games suitable for families"
]

# Run each query and print its top two hits.
for query in test_queries:
    test_semantic_search(collection, query, n_results=2)

In [None]:
# Summarise the collection: its name and how many documents it holds.
print("=== ChromaDB Collection Statistics ===")
print(f"Collection name: {collection.name}")
print(f"Total documents: {collection.count()}")

# Fetch up to three stored records to eyeball the data structure.
sample_docs = collection.get(limit=3, include=['documents', 'metadatas'])

print("\n=== Sample Documents ===")
sample_rows = zip(
    sample_docs['ids'],
    sample_docs['documents'],
    sample_docs['metadatas'],
)
for position, (sample_id, doc, metadata) in enumerate(sample_rows, start=1):
    print(f"\n{position}. Document ID: {sample_id}")
    print(f"   Game: {metadata['Name']}")
    print(f"   Platform: {metadata['Platform']}")
    print(f"   Year: {metadata['YearOfRelease']}")
    print(f"   Content: {doc[:100]}...")