# [STARTER] Udaplay Project

## Part 01 - Offline RAG

In this part of the project, you'll build your VectorDB using Chroma.

The data is inside folder `project/starter/games`. Each file will become a document in the collection you'll create.
Example.:
```json
{
  "Name": "Gran Turismo",
  "Platform": "PlayStation 1",
  "Genre": "Racing",
  "Publisher": "Sony Computer Entertainment",
  "Description": "A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.",
  "YearOfRelease": 1997
}
```


### Setup

In [1]:
# Only needed for Udacity workspace

import importlib.util
import sys

# Check if 'pysqlite3' is available before importing
if importlib.util.find_spec("pysqlite3") is not None:
    import pysqlite3
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [2]:
import os
import json
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv

# For Embeddings:
import numpy as np

In [3]:
# TODO: Create a .env file with the following variables
# OPENAI_API_KEY="YOUR_KEY"
# CHROMA_OPENAI_API_KEY="YOUR_KEY"
# TAVILY_API_KEY="YOUR_KEY"

In [4]:
# TODO: Load environment variables
load_dotenv()

# Validate required API keys with helpful error messages
openai_api_key = os.getenv('OPENAI_API_KEY')
openai_base_url = os.getenv('OPENAI_BASE_URL')
if not openai_api_key:
    raise ValueError(
        'OPENAI_API_KEY not found in environment variables. '
        'Please create a .env file with OPENAI_API_KEY="your_key"'
    )
else:
    print(f"OPEN AI KEY: {openai_api_key}")
    print(f"OPEN AI URL: { openai_base_url}")

tavily_api_key = os.getenv('TAVILY_API_KEY')
if not tavily_api_key:
    raise ValueError(
        'TAVILY_API_KEY not found in environment variables. '
        'Please create a .env file with TAVILY_API_KEY="your_key"'
    )
else:
    print(f"TAVILY API KEY: {tavily_api_key}")

chroma_api_key = os.getenv('CHROMA_OPENAI_API_KEY') or openai_api_key
if not chroma_api_key:
    raise ValueError(
        'CHROMA_OPENAI_API_KEY not found in environment variables. '
        'Please create a .env file with CHROMA_OPENAI_API_KEY="your_key"'
    )
else:
    print(f"CHROMA API KEY: {chroma_api_key}")

print('✅ API keys loaded successfully!')

OPEN AI KEY: voc-503484675168865479810469873adb026af0.07347493
OPEN AI URL: https://openai.vocareum.com/v1/
TAVILY API KEY: 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
CHROMA API KEY: YOUR_KEY
✅ API keys loaded successfully!


### VectorDB Instance

In [5]:
# TODO: Instantiate your ChromaDB Client
# Choose any path you want
chroma_client = chromadb.PersistentClient(path="chromadb")

### Collection

In [6]:
# TODO: Pick one embedding function
# If picking something different than openai, 
# make sure you use the same when loading it
embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    api_base=openai_base_url,   # IMPORTANT for Vocareum
    model_name="text-embedding-3-small"         # or "text-embedding-3-large"
)


import hashlib

class DeterministicHashEmbedding(embedding_functions.EmbeddingFunction):
    def __init__(self, dim: int = 768):
        self.dim = dim

    def _seed_from_text(self, text: str) -> int:
        digest = hashlib.sha256(text.encode("utf-8")).digest()
        return int.from_bytes(digest[:8], "big", signed=False)

    def __call__(self, inputs: list[str]) -> list[list[float]]:
        if isinstance(inputs, str):
            inputs = [inputs]
        out = []
        for t in inputs:
            seed = self._seed_from_text(t)
            rng = np.random.default_rng(seed)
            v = rng.random(self.dim)
            # (optional) normalize for cosine metric:
            v = v / (np.linalg.norm(v) + 1e-12)
            out.append(v.tolist())
        return out


# ---- Step 1: Create a custom local embedding function ----
class FakeEmbeddingFunction(embedding_functions.EmbeddingFunction):
    def __init__(self, dim: int = 768):
        """
        dim: Dimension of the embedding vector.
        """
        self.dim = dim

    def __call__(self, input: list[str]) -> list[list[float]]:
        # Ensure input is a list of strings
        if isinstance(input, str):
            input = [input]
        return [np.random.rand(self.dim).tolist() for _ in input]




In [7]:

# Start Local Embedder
try:
    chroma_client.delete_collection(name="udaplay")
    print("Old collection deleted successfully.")
except Exception as e:
    print(f"Collection did not exist or could not be deleted: {e}")

collection = chroma_client.get_or_create_collection(
    name="udaplay",
    embedding_function=DeterministicHashEmbedding(dim=768)
    # embedding_function=FakeEmbeddingFunction()
)
    
print(f"SUCCESS: Collection '{collection.name}' created")

Old collection deleted successfully.
SUCCESS: Collection 'udaplay' created


### Add documents

In [8]:
# Make sure you have a directory "project/starter/games"
data_dir = "games"


if os.path.exists(data_dir):
    documents_added = 0
    
    for file_name in sorted(os.listdir(data_dir)):
        if not file_name.endswith(".json"):
            continue
            
        file_path = os.path.join(data_dir, file_name)
        
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                game = json.load(f)
            
            # Create content string for embedding
            content = f"[{game['Platform']}] {game['Name']} ({game['YearOfRelease']}) - {game['Description']}"
            
            # Use filename (without extension) as document ID
            doc_id = os.path.splitext(file_name)[0]

            # embedding = FakeEmbeddingFunction()

            # print(f"Docid: {doc_id}, Content:{content}, Meta:{game}")
            # Add to collection
            collection.add(
                ids=[doc_id],
                documents=[content],
                metadatas=[game]
            )
            
            
            documents_added += 1
            
        except Exception as e:
            print(f"Error processing {file_name}: {e}")
    
    print(f"Successfully added {documents_added} documents to ChromaDB")
    print(f"Total documents in collection: {collection.count()}")
else:
    print("Games directory not found. Cannot load data.")



Successfully added 15 documents to ChromaDB
Total documents in collection: 15


In [9]:
# Test semantic search functionality
def test_semantic_search(collection, query, n_results=3):
    """Test semantic search and display results"""
    print(f"\n=== Searching for: '{query}' ===")
    results = "none"

    results = collection.query(
        query_texts=[query],
        n_results=n_results,
        include=['documents', 'metadatas', 'distances']
    )
    #query_texts=[query],

    if results['documents'][0]:
        print(f"Found {len(results['documents'][0])} results:")
        
        for i, (doc, metadata, distance) in enumerate(zip(
            results['documents'][0], 
            results['metadatas'][0], 
            results['distances'][0]
        )):
            similarity = 1 - distance  # Convert distance to similarity
            print(f"\n{i+1}. {metadata['Name']} ({metadata['YearOfRelease']})")
            print(f"   Platform: {metadata['Platform']}")
            print(f"   Genre: {metadata.get('Genre', 'N/A')}")
            print(f"   Publisher: {metadata.get('Publisher', 'N/A')}")
            print(f"   Similarity: {similarity:.3f}")
            print(f"   Description: {metadata['Description'][:100]}...")
    else:
        print("No results found.")
    
    return results

# Test with different types of queries
test_queries = [
    "Sony published games",
    "RPG games",
    "PlayStation games from the 1990s",
    "Action games with shooting mechanics",
    "Games suitable for families"
]

for query in test_queries:
    test_semantic_search(collection, query, n_results=2)


=== Searching for: 'Sony published games' ===
Found 2 results:

1. Minecraft (2014)
   Platform: Xbox One
   Genre: Sandbox, Survival
   Publisher: Mojang Studios
   Similarity: 0.531
   Description: A sandbox game that allows players to build and explore infinite worlds, fostering creativity and ad...

2. Super Mario World (1990)
   Platform: Super Nintendo Entertainment System (SNES)
   Genre: Platformer
   Publisher: Nintendo
   Similarity: 0.528
   Description: A classic platformer where Mario embarks on a quest to save Princess Toadstool and Dinosaur Land fro...

=== Searching for: 'RPG games' ===
Found 2 results:

1. Kinect Adventures! (2010)
   Platform: Xbox 360
   Genre: Party
   Publisher: Microsoft Game Studios
   Similarity: 0.514
   Description: A collection of mini-games designed to showcase the capabilities of the Kinect motion sensor....

2. Pokémon Gold and Silver (1999)
   Platform: Game Boy Color
   Genre: Role-playing
   Publisher: Nintendo
   Similarity: 0.507
   

In [21]:
# Display collection statistics
print("=== ChromaDB Collection Statistics ===")
print(f"Collection name: {collection.name}")
print(f"Total documents: {collection.count()}")

# Get a sample of documents to verify data structure
sample_docs = collection.get(limit=3, include=['documents', 'metadatas'])

print("\n=== Sample Documents ===")
for i, (doc, metadata) in enumerate(zip(sample_docs['documents'], sample_docs['metadatas'])):
    print(f"\n{i+1}. Document ID: {sample_docs['ids'][i]}")
    print(f"   Game: {metadata['Name']}")
    print(f"   Platform: {metadata['Platform']}")
    print(f"   Year: {metadata['YearOfRelease']}")
    print(f"   Content: {doc[:100]}...")

=== ChromaDB Collection Statistics ===
Collection name: udaplay
Total documents: 15

=== Sample Documents ===

1. Document ID: 001
   Game: Gran Turismo
   Platform: PlayStation 1
   Year: 1997
   Content: [PlayStation 1] Gran Turismo (1997) - A realistic racing simulator featuring a wide array of cars an...

2. Document ID: 002
   Game: Grand Theft Auto: San Andreas
   Platform: PlayStation 2
   Year: 2004
   Content: [PlayStation 2] Grand Theft Auto: San Andreas (2004) - An expansive open-world game set in the ficti...

3. Document ID: 003
   Game: Gran Turismo 5
   Platform: PlayStation 3
   Year: 2010
   Content: [PlayStation 3] Gran Turismo 5 (2010) - A comprehensive racing simulator featuring a vast selection ...
