In [None]:
# Install required dependencies
%pip install -q chromadb openai python-dotenv pydantic pdfplumber

# Setup and imports
import os
import sys
import json
from pathlib import Path
from typing import Dict, List
from dotenv import load_dotenv

# Handle Google Colab vs local environment
if 'google.colab' in str(get_ipython()):
    print("Running in Google Colab - cloning repository...")
    !git clone https://github.com/Imsharad/udaplay-market-research-agent.git
    os.chdir('/content/udaplay-market-research-agent/projects/building-agents/src/project/starter')
    print("Changed to project directory")
else:
    print("Running locally - navigating to project directory...")
    os.chdir('../projects/building-agents/src/project/starter')
    print("Changed to project directory")

# Now simple imports work from the correct directory
from lib.documents import Document, Corpus
from lib.vector_db import VectorStore

# Load environment variables (load_dotenv reads a .env file when one is present)
load_dotenv()

# Verify API keys are loaded
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
CHROMA_OPENAI_API_KEY = os.getenv('CHROMA_OPENAI_API_KEY')

assert OPENAI_API_KEY, "OPENAI_API_KEY not found in environment"
assert CHROMA_OPENAI_API_KEY, "CHROMA_OPENAI_API_KEY not found in environment"


def _mask_secret(secret: str) -> str:
    """Return a display-safe form of `secret` exposing at most its last 4 characters."""
    # Very short values are fully hidden so the "suffix" cannot reveal the whole secret
    return "****" + (secret[-4:] if len(secret) > 8 else "")


# Configure for Vocareum if using voc- keys
if OPENAI_API_KEY.startswith('voc-'):
    print("Detected Vocareum OpenAI API key - configuring for Vocareum endpoint")
    os.environ['OPENAI_API_BASE'] = 'https://openai.vocareum.com/v1'
    # The 1.x OpenAI SDK reads OPENAI_BASE_URL rather than OPENAI_API_BASE;
    # setdefault keeps any value the user configured explicitly.
    os.environ.setdefault('OPENAI_BASE_URL', 'https://openai.vocareum.com/v1')

if CHROMA_OPENAI_API_KEY.startswith('voc-'):
    # Informational only: no environment change is made for this key here
    print("Detected Vocareum ChromaDB key - configuring for Vocareum endpoint")

print("Environment setup complete!")
# Notebook output is saved and shared with the file, so only a masked form of
# each key is printed.
print(f"OpenAI API key: {_mask_secret(OPENAI_API_KEY)}")
print(f"ChromaDB API key: {_mask_secret(CHROMA_OPENAI_API_KEY)}")

In [None]:
# Load game data
def load_games(games_dir: "str | Path" = "games") -> List[Dict]:
    """Load all JSON game files from a directory into memory.

    Args:
        games_dir: Directory holding the ``*.json`` game files. Defaults to
            ``"games"``, resolved relative to the current working directory.

    Returns:
        The parsed JSON payloads, ordered by sorted file path. The list is
        empty when no ``*.json`` files are found.

    Raises:
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    games_dir = Path(games_dir)
    games: List[Dict] = []
    for json_file in sorted(games_dir.glob("*.json")):
        with open(json_file, "r", encoding="utf-8") as fp:
            games.append(json.load(fp))
    print(f"Loaded {len(games)} game files from {games_dir}")
    # Only preview keys when the first payload is actually a JSON object
    if games and isinstance(games[0], dict):
        print("Example game keys:", list(games[0].keys()))
    return games

# Load all game data
games_data = load_games()

# Fail with an actionable message instead of a bare IndexError when nothing
# was loaded (typically the kernel is not in the project directory).
if not games_data:
    raise FileNotFoundError(
        f"No game JSON files were loaded (current directory: {os.getcwd()})"
    )

# Let's examine the structure of the first game
print("\nExample game structure:")
# ensure_ascii=False keeps accented titles readable instead of \uXXXX escapes
print(json.dumps(games_data[0], indent=2, ensure_ascii=False))

In [3]:
def create_game_document(game_data: Dict, index: int) -> Document:
    """Convert a single game dictionary into a Document.

    Args:
        game_data: Parsed game record. The keys read are "Name", "Platform",
            "Genre", "Publisher", "YearOfRelease" and "Description".
        index: Position of the game in the source list; it is zero-padded to
            three digits and embedded in the document ID.

    Returns:
        A Document whose content is a labelled, newline-separated summary of
        the game, whose metadata carries the individual fields, and whose ID
        has the form ``game_<index>_<slug>``.
    """

    def field(key: str, default: str = "Unknown"):
        # dict.get(key, default) only covers *missing* keys; a JSON null would
        # come back as None and break the string handling below.
        value = game_data.get(key)
        return default if value is None else value

    name = field("Name")
    platform = field("Platform")
    genre = field("Genre")
    publisher = field("Publisher")
    release_year = field("YearOfRelease")
    description = field("Description", "No description available")

    content = "\n".join(
        [
            f"Game: {name}",
            f"Platform: {platform}",
            f"Genre: {genre}",
            f"Publisher: {publisher}",
            f"Release Year: {release_year}",
            f"Description: {description}",
        ]
    )

    metadata = {
        "name": name,
        "platform": platform,
        "genre": genre,
        "publisher": publisher,
        "release_year": str(release_year),
        "description": description,
    }

    # str() guards against non-string names (e.g. a purely numeric title
    # stored as a JSON number); for string names the slug is unchanged.
    clean_name = (
        str(name)
        .lower()
        .replace(" ", "_")
        .replace(":", "")
        .replace("-", "_")
        .replace("'", "")
    )
    doc_id = f"game_{index:03d}_{clean_name}"

    return Document(id=doc_id, content=content, metadata=metadata)

def build_corpus(games: List[Dict]) -> Corpus:
    """Wrap every game record in a Document and collect them into a Corpus.

    Each game's position in `games` is passed to create_game_document as its
    index. The size of the resulting corpus is printed before it is returned.
    """
    documents = []
    for position, record in enumerate(games):
        documents.append(create_game_document(record, position))

    corpus = Corpus(documents)
    print(f"Created {len(corpus)} Document objects (all IDs unique ✔️)")
    return corpus

# Build the corpus from every loaded game, then preview its first document
game_corpus = build_corpus(games_data)

print(
    "\nExample document:",
    f"ID: {game_corpus[0].id}",
    f"Content preview: {game_corpus[0].content[:200]}...",
    f"Metadata: {game_corpus[0].metadata}",
    sep="\n",
)


Created 15 Document objects (all IDs unique ✔️)

Example document:
ID: game_000_gran_turismo
Content preview: Game: Gran Turismo
Platform: PlayStation 1
Genre: Racing
Publisher: Sony Computer Entertainment
Release Year: 1997
Description: A realistic racing simulator featuring a wide array of cars and tracks, ...
Metadata: {'name': 'Gran Turismo', 'platform': 'PlayStation 1', 'genre': 'Racing', 'publisher': 'Sony Computer Entertainment', 'release_year': '1997', 'description': 'A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.'}


In [4]:
# Custom VectorStoreManager for Vocareum compatibility
import chromadb
from chromadb.utils import embedding_functions

class VocareumVectorStoreManager:
    """Thin wrapper around ChromaDB to support Vocareum endpoints.

    Owns a persistent Chroma client plus a single OpenAI embedding function,
    and hands out VectorStore wrappers around named collections.
    """

    # Endpoint used for the embedding function when the key is a Vocareum ("voc-") key
    VOCAREUM_API_BASE = "https://openai.vocareum.com/v1"

    def __init__(self, openai_api_key: str, persist_path: str = "./chroma_db"):
        """Create the client and embedding function.

        Args:
            openai_api_key: Key used for embeddings; a "voc-" prefix selects
                the Vocareum endpoint.
            persist_path: Directory for the persistent Chroma client.
                Defaults to "./chroma_db".
        """
        # Use persistent client so data survives between script runs
        self.client = chromadb.PersistentClient(path=persist_path)
        self.embedding_function = self._create_embedding_function(openai_api_key)

    def _create_embedding_function(self, api_key: str):
        """Build the OpenAI embedding function, routed to Vocareum for "voc-" keys."""
        if api_key.startswith("voc-"):
            return embedding_functions.OpenAIEmbeddingFunction(
                api_key=api_key, api_base=self.VOCAREUM_API_BASE
            )
        return embedding_functions.OpenAIEmbeddingFunction(api_key=api_key)

    def create_store(self, name: str, force: bool = False) -> VectorStore:
        """Return a VectorStore for collection `name`, creating it if needed.

        Args:
            name: Collection name.
            force: When True, any existing collection of that name is deleted
                first, so the returned store starts empty.
        """
        if force:
            try:
                self.client.delete_collection(name=name)
            except Exception:
                # Best effort: the collection may simply not exist yet.
                # NOTE(review): kept broad because the exception type raised
                # for a missing collection is not pinned down here — confirm
                # against the installed chromadb version before narrowing.
                pass
        collection = self.client.get_or_create_collection(
            name=name, embedding_function=self.embedding_function
        )
        return VectorStore(collection)

    def get_store(self, name: str) -> VectorStore | None:
        """Return a VectorStore for an existing collection, or None if it cannot be obtained."""
        try:
            # Pass the same embedding function used in create_store; without it
            # the returned collection may embed query text with Chroma's
            # default embedder rather than the OpenAI one the data was indexed with.
            return VectorStore(
                self.client.get_collection(
                    name=name, embedding_function=self.embedding_function
                )
            )
        except Exception:
            return None

# Create the manager, keyed with the ChromaDB/OpenAI credential loaded during setup
vector_manager = VocareumVectorStoreManager(CHROMA_OPENAI_API_KEY)

print(
    "Vector Store Manager initialized successfully!",
    "Ready to create persistent vector store with OpenAI embeddings",
    sep="\n",
)


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


Vector Store Manager initialized successfully!
Ready to create persistent vector store with OpenAI embeddings


In [5]:
# Index documents into ChromaDB
def index_documents(corpus: Corpus, store_name: str = "udaplay_games") -> VectorStore:
    """Recreate the named vector store and add every document in `corpus` to it.

    The store is requested from the module-level `vector_manager` with
    ``force=True``, so the collection is rebuilt on each call. Progress
    messages are printed before and after the add.

    Returns:
        The VectorStore that now holds the corpus.
    """
    fresh_store = vector_manager.create_store(store_name, force=True)

    print("Adding documents to vector store – this may take a moment…")
    fresh_store.add(corpus)

    indexed_count = len(corpus)
    print(f"Successfully indexed {indexed_count} documents into '{store_name}'")
    return fresh_store

# Create the vector store and index our documents
vector_store = index_documents(game_corpus)

# Verify the documents were added by retrieving a few
test_retrieval = vector_store.get(limit=3)
print(f"\nVerification - Retrieved {len(test_retrieval['ids'])} documents:")
# Iterate the IDs directly; the positional index was never used
for doc_id in test_retrieval['ids']:
    print(f"- {doc_id}")


Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Adding documents to vector store – this may take a moment…


Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given


Successfully indexed 15 documents into 'udaplay_games'

Verification - Retrieved 3 documents:
- game_000_gran_turismo
- game_001_grand_theft_auto_san_andreas
- game_002_gran_turismo_5


In [6]:
# Helper function to display search results  
def display_search_results(query: str, results: Dict, max_description_chars: int = 120):
    """Pretty-print the hits of a single-query vector search.

    Args:
        query: The query text, echoed in the header.
        results: Query result mapping. Only the first entry of each of the
            "documents", "distances" and "metadatas" lists is read. Each
            metadata dict must provide "name", "release_year", "platform",
            "genre", "publisher" and "description".
        max_description_chars: Longest description printed in full; longer
            descriptions are cut to this length and suffixed with "…".
    """
    banner = "=" * 70
    print(banner)
    print(f"Query → {query}")
    print(banner)

    # Treat a missing or empty "documents" entry as "no hits" instead of raising
    documents = results.get("documents") or []
    if not documents or not documents[0]:
        print("No results found.\n")
        return

    hits = zip(documents[0], results["distances"][0], results["metadatas"][0])
    for rank, (_doc, distance, meta) in enumerate(hits, start=1):
        # NOTE(review): 1 - distance is only a true similarity score when the
        # collection uses cosine distance — confirm the collection's metric.
        similarity = 1 - distance
        print(
            f"[{rank}] {meta['name']} ({meta['release_year']}, {meta['platform']}) – "
            f"sim={similarity:.3f}"
        )
        print(f"    Genre: {meta['genre']} | Publisher: {meta['publisher']}")

        # Only add the ellipsis when text was actually cut off
        description = meta['description']
        if len(description) > max_description_chars:
            description = description[:max_description_chars] + "…"
        print(f"    Description: {description}\n")

# Run demo searches (semantic + metadata filtering)
def run_demo_searches(store: VectorStore):
    """Exercise `store` with three kinds of lookup and print the outcomes.

    1. Five free-text queries (top 3 hits each), shown via display_search_results.
    2. A metadata-only `get` for publisher "Nintendo" (up to 5 records).
    3. A free-text query restricted by metadata to platform "Nintendo 64".
    """
    # 1) Free-text queries
    for question in (
        "Pokemon games from the 90s",
        "First 3D Mario platformer",
        "Mortal Kombat fighting game",
        "RPG games by Nintendo",
        "Games released in 1999",
    ):
        hits = store.query(query_texts=[question], n_results=3)
        display_search_results(question, hits)

    # 2) Metadata-only lookup
    nintendo_records = store.get(where={"publisher": "Nintendo"}, limit=5)
    print("\nNintendo-published titles (metadata filter):")
    for position, record in enumerate(nintendo_records["metadatas"]):
        print(
            f"  {position + 1}. {record['name']} ({record['release_year']}) – {record['platform']}"
        )

    # 3) Free-text query combined with a metadata filter
    n64_adventures = store.query(
        query_texts=["adventure game"],
        n_results=3,
        where={"platform": "Nintendo 64"},
    )
    print("\nAdventure games on Nintendo 64:")
    for record in n64_adventures["metadatas"][0]:
        print(f"  – {record['name']} ({record['genre']})")

# Run the demonstration against `vector_store`, which is assigned in the
# indexing cell (the call to index_documents); that cell must have been
# executed in this kernel before this one.
run_demo_searches(vector_store)

# Final status line marking the end of this notebook's workflow
print("\nAll done – vector database ready for Part 2! ✔️")


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


Query → Pokemon games from the 90s
[1] Pokémon Gold and Silver (1999, Game Boy Color) – sim=0.732
    Genre: Role-playing | Publisher: Nintendo
    Description: Second-generation Pokémon games introducing new regions, Pokémon, and gameplay mechanics.…

[2] Pokémon Ruby and Sapphire (2002, Game Boy Advance) – sim=0.726
    Genre: Role-playing | Publisher: Nintendo
    Description: Third-generation Pokémon games set in the Hoenn region, featuring new Pokémon and double battles.…

[3] Super Mario 64 (1996, Nintendo 64) – sim=0.612
    Genre: Platformer | Publisher: Nintendo
    Description: A groundbreaking 3D platformer that set new standards for the genre, featuring Mario's quest to rescue Princess Peach.…

Query → First 3D Mario platformer
[1] Super Mario 64 (1996, Nintendo 64) – sim=0.780
    Genre: Platformer | Publisher: Nintendo
    Description: A groundbreaking 3D platformer that set new standards for the genre, featuring Mario's quest to rescue Princess Peach.…

[2] Super Mario W

Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given


Query → Games released in 1999
[1] Pokémon Gold and Silver (1999, Game Boy Color) – sim=0.652
    Genre: Role-playing | Publisher: Nintendo
    Description: Second-generation Pokémon games introducing new regions, Pokémon, and gameplay mechanics.…

[2] Gran Turismo (1997, PlayStation 1) – sim=0.641
    Genre: Racing | Publisher: Sony Computer Entertainment
    Description: A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.…

[3] Super Mario 64 (1996, Nintendo 64) – sim=0.634
    Genre: Platformer | Publisher: Nintendo
    Description: A groundbreaking 3D platformer that set new standards for the genre, featuring Mario's quest to rescue Princess Peach.…


Nintendo-published titles (metadata filter):
  1. Pokémon Gold and Silver (1999) – Game Boy Color
  2. Pokémon Ruby and Sapphire (2002) – Game Boy Advance
  3. Super Mario World (1990) – Super Nintendo Entertainment System (SNES)
  4. Super Mario 64 (1996) – Nintendo 64
  5. Su