# [STARTER] Udaplay Project

## Part 01 - Offline RAG

In this part of the project, you'll build your VectorDB using Chroma.

The data is in the `project/starter/games` folder. Each JSON file in that folder becomes one document in the collection you'll create.
Example:
```json
{
  "Name": "Gran Turismo",
  "Platform": "PlayStation 1",
  "Genre": "Racing",
  "Publisher": "Sony Computer Entertainment",
  "Description": "A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.",
  "YearOfRelease": 1997
}
```


### Setup

In [None]:
# Only needed for Udacity workspace

import importlib.util
import sys

# Look up pysqlite3 without importing it, so environments that lack the
# package skip the swap entirely.
pysqlite3_spec = importlib.util.find_spec("pysqlite3")
if pysqlite3_spec is not None:
    import pysqlite3
    # Re-register the loaded pysqlite3 module under the name 'sqlite3', so any
    # later `import sqlite3` receives pysqlite3 instead of the stdlib module.
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [None]:
import os
import json
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv

In [None]:
# Load environment variables from a local .env file, if one is present
load_dotenv()

# Read the API keys from the environment instead of hard-coding them: a key
# written into the notebook is exposed to everyone who can read the file.
# For the Udacity/Vocareum workspace, put the key in .env as OPENAI_API_KEY=voc-...
# NOTE(review): the key that was previously committed here should be treated as
# compromised and rotated.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Chroma may use its own key; fall back to the OpenAI key when it is unset or empty
CHROMA_OPENAI_API_KEY = os.getenv("CHROMA_OPENAI_API_KEY") or OPENAI_API_KEY

# Fail fast with a real exception: `assert` is stripped under `python -O`, and
# this check also rejects an empty string, which would fail later at request time.
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY not found in environment variables")
print("✓ Environment variables loaded successfully")

### VectorDB Instance

In [None]:
# Instantiate ChromaDB Client with persistent storage.
# The store is opened at the relative path "chromadb" (presumably resolved
# against the notebook's working directory — confirm), so cells that open the
# same path are expected to see the same collections.
chroma_client = chromadb.PersistentClient(path="chromadb")
print("✓ ChromaDB client initialized")

### Collection

In [None]:
# Create OpenAI embedding function with Vocareum configuration.
# Requests are authenticated with CHROMA_OPENAI_API_KEY and sent to the
# Vocareum endpoint given as api_base instead of the library's default URL.
# No model name is passed, so the library's default embedding model applies.
# Note: If using a different embedding function, make sure to use the same when loading the collection
embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key=CHROMA_OPENAI_API_KEY,
    api_base="https://api.labs.vocareum.com/inference/openai/v1"
)
print("✓ Embedding function created")

In [None]:
# Create or get collection
# If collection already exists, get it; otherwise create a new one.
# The embedding function is passed on BOTH paths so that documents and queries
# are always embedded with the same model; without it, a collection that is
# merely loaded may (depending on the chromadb version) use a different
# embedding function than the one it was created with.
try:
    collection = chroma_client.get_collection(
        name="udaplay",
        embedding_function=embedding_fn
    )
except Exception:
    # The "collection does not exist" exception type has varied between
    # chromadb releases, so Exception is caught here instead of a bare
    # `except:`, which would also swallow KeyboardInterrupt and SystemExit.
    # An unrelated failure is expected to resurface from create_collection.
    collection = chroma_client.create_collection(
        name="udaplay",
        embedding_function=embedding_fn
    )
    print("✓ Created new collection 'udaplay'")
else:
    print("✓ Using existing collection 'udaplay'")

### Add documents

In [None]:
# Directory with one JSON file per game, relative to the current working directory
data_dir = "games"

# Fail early with a clear message when the path is missing or is not a directory
if not os.path.isdir(data_dir):
    raise FileNotFoundError(f"Directory '{data_dir}' not found. Make sure you're running from the starter/ directory.")

# Read every game first, then write them in ONE batch: a single call means a
# single embedding request instead of one request per file.
doc_ids = []
contents = []
games = []
for file_name in sorted(os.listdir(data_dir)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(data_dir, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        game = json.load(f)

    # Format content string for embedding
    # You can change what text you want to index
    content = f"[{game['Platform']}] {game['Name']} ({game['YearOfRelease']}) - {game['Description']}"

    # Use file name (like 001) as ID
    doc_id = os.path.splitext(file_name)[0]

    doc_ids.append(doc_id)
    contents.append(content)
    games.append(game)

# upsert (not add) keeps this cell idempotent: re-running it against an existing
# collection overwrites the same IDs instead of re-submitting duplicates.
# The guard skips the call when no JSON files were found, since there is nothing to write.
if doc_ids:
    collection.upsert(
        ids=doc_ids,
        documents=contents,
        metadatas=games
    )
    for game in games:
        print(f"✓ Added game: {game['Name']} ({game['Platform']})")

game_count = len(doc_ids)
print(f"\n✓ Successfully added {game_count} games to the collection")

In [None]:
# Check collection count: ask the collection how many documents it holds and
# print the number as a quick check that the ingestion cell wrote something.
count = collection.count()
print(f"Total documents in collection: {count}")


In [None]:
# Smoke-test retrieval: run one natural-language query and print the top hits
test_query = "racing games on PlayStation"

results = collection.query(
    query_texts=[test_query],
    n_results=3,
    include=['documents', 'metadatas', 'distances']
)

# Each result field holds one list per query text; a single query was sent,
# so index 0 is used throughout.
print(f"\nQuery: '{test_query}'")
print(f"\nFound {len(results['documents'][0])} results:\n")

hits = zip(
    results['documents'][0],
    results['metadatas'][0],
    results['distances'][0]
)
for rank, (doc, metadata, distance) in enumerate(hits, start=1):
    report_lines = [
        f"Result {rank} (distance: {distance:.4f}):",
        f"  Game: {metadata['Name']}",
        f"  Platform: {metadata['Platform']}",
        f"  Year: {metadata['YearOfRelease']}",
        f"  Description: {metadata['Description']}",
    ]
    print("\n".join(report_lines))
    print()


In [None]:
# Only needed for Udacity workspace

import importlib.util
import sys

# Look up pysqlite3 without importing it, so environments that lack the
# package skip the swap entirely.
pysqlite3_spec = importlib.util.find_spec("pysqlite3")
if pysqlite3_spec is not None:
    import pysqlite3
    # Re-register the loaded pysqlite3 module under the name 'sqlite3', so any
    # later `import sqlite3` receives pysqlite3 instead of the stdlib module.
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [None]:
import os
import json
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv

In [None]:
# Load environment variables from a local .env file, if one is present
load_dotenv()

# Read the API keys from the environment instead of hard-coding them: a key
# written into the notebook is exposed to everyone who can read the file.
# For the Udacity/Vocareum workspace, put the key in .env as OPENAI_API_KEY=voc-...
# NOTE(review): the key that was previously committed here should be treated as
# compromised and rotated.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Chroma may use its own key; fall back to the OpenAI key when it is unset or empty
CHROMA_OPENAI_API_KEY = os.getenv("CHROMA_OPENAI_API_KEY") or OPENAI_API_KEY

# Fail fast with a real exception: `assert` is stripped under `python -O`, and
# this check also rejects an empty string, which would fail later at request time.
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY not found in environment variables")
print("✓ Environment variables loaded successfully")

In [None]:
# Instantiate ChromaDB Client with persistent storage.
# The store is opened at the relative path "chromadb" (presumably resolved
# against the notebook's working directory — confirm), so cells that open the
# same path are expected to see the same collections.
chroma_client = chromadb.PersistentClient(path="chromadb")
print("✓ ChromaDB client initialized")

In [None]:
# Create OpenAI embedding function with Vocareum configuration.
# Requests are authenticated with CHROMA_OPENAI_API_KEY and sent to the
# Vocareum endpoint given as api_base instead of the library's default URL.
# No model name is passed, so the library's default embedding model applies.
# Note: If using a different embedding function, make sure to use the same when loading the collection
embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key=CHROMA_OPENAI_API_KEY,
    api_base="https://api.labs.vocareum.com/inference/openai/v1"
)
print("✓ Embedding function created")

In [None]:
# Create or get collection
# If collection already exists, get it; otherwise create a new one.
# The embedding function is passed on BOTH paths so that documents and queries
# are always embedded with the same model; without it, a collection that is
# merely loaded may (depending on the chromadb version) use a different
# embedding function than the one it was created with.
try:
    collection = chroma_client.get_collection(
        name="udaplay",
        embedding_function=embedding_fn
    )
except Exception:
    # The "collection does not exist" exception type has varied between
    # chromadb releases, so Exception is caught here instead of a bare
    # `except:`, which would also swallow KeyboardInterrupt and SystemExit.
    # An unrelated failure is expected to resurface from create_collection.
    collection = chroma_client.create_collection(
        name="udaplay",
        embedding_function=embedding_fn
    )
    print("✓ Created new collection 'udaplay'")
else:
    print("✓ Using existing collection 'udaplay'")

In [None]:
# Directory with one JSON file per game, relative to the current working directory
data_dir = "games"

# Fail early with a clear message when the path is missing or is not a directory
if not os.path.isdir(data_dir):
    raise FileNotFoundError(f"Directory '{data_dir}' not found. Make sure you're running from the starter/ directory.")

# Read every game first, then write them in ONE batch: a single call means a
# single embedding request instead of one request per file.
doc_ids = []
contents = []
games = []
for file_name in sorted(os.listdir(data_dir)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(data_dir, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        game = json.load(f)

    # Format content string for embedding
    # You can change what text you want to index
    content = f"[{game['Platform']}] {game['Name']} ({game['YearOfRelease']}) - {game['Description']}"

    # Use file name (like 001) as ID
    doc_id = os.path.splitext(file_name)[0]

    doc_ids.append(doc_id)
    contents.append(content)
    games.append(game)

# upsert (not add) keeps this cell idempotent: re-running it against an existing
# collection overwrites the same IDs instead of re-submitting duplicates.
# The guard skips the call when no JSON files were found, since there is nothing to write.
if doc_ids:
    collection.upsert(
        ids=doc_ids,
        documents=contents,
        metadatas=games
    )
    for game in games:
        print(f"✓ Added game: {game['Name']} ({game['Platform']})")

game_count = len(doc_ids)
print(f"\n✓ Successfully added {game_count} games to the collection")

In [None]:
# Check available embedding functions in chromadb
import chromadb.utils.embedding_functions as ef
import inspect

# Gather the public names exposed by the embedding_functions module that
# contain "Embedding", in the order dir() reports them.
available_functions = []
for attr_name in dir(ef):
    if attr_name.startswith('_') or 'Embedding' not in attr_name:
        continue
    available_functions.append(attr_name)

print("Available ChromaDB Embedding Functions:")
for fn_name in available_functions:
    print(f"  - {fn_name}")