# [STARTER] Udaplay Project

## Part 01 - Offline RAG

In this part of the project, you'll build your VectorDB using Chroma.

The data is inside folder `project/starter/games`. Each file will become a document in the collection you'll create.
Example.:
```json
{
  "Name": "Gran Turismo",
  "Platform": "PlayStation 1",
  "Genre": "Racing",
  "Publisher": "Sony Computer Entertainment",
  "Description": "A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.",
  "YearOfRelease": 1997
}
```


### Setup

In [41]:
# Only needed for Udacity workspace

import importlib.util
import sys

# Check if 'pysqlite3' is available before importing
if importlib.util.find_spec("pysqlite3") is not None:
    import pysqlite3
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [42]:
import os
import json
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv

# For Embeddings:
import numpy as np

In [43]:
# TODO: Create a .env file with the following variables
# OPENAI_API_KEY="YOUR_KEY"
# CHROMA_OPENAI_API_KEY="YOUR_KEY"
# TAVILY_API_KEY="YOUR_KEY"


In [69]:
# TODO: Load environment variables
load_dotenv()

# Validate required API keys with helpful error messages
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise ValueError(
        'OPENAI_API_KEY not found in environment variables. '
        'Please create a .env file with OPENAI_API_KEY="your_key"'
    )

chroma_api_key = os.getenv('CHROMA_OPENAI_API_KEY') or openai_api_key
if not chroma_api_key:
    raise ValueError(
        'CHROMA_OPENAI_API_KEY not found in environment variables. '
        'Please create a .env file with CHROMA_OPENAI_API_KEY="your_key"'
    )

print('✅ API keys loaded successfully!')
print(os.getenv('OPENAI_API_KEY'))

✅ API keys loaded successfully!
voc-503484675168865479810469873adb026af0.07347493


### VectorDB Instance

In [68]:
# TODO: Instantiate your ChromaDB Client
# Choose any path you want
chroma_client = chromadb.PersistentClient(path="chromadb")

### Collection

In [80]:
# TODO: Pick one embedding function
# If picking something different than openai, 
# make sure you use the same when loading it


# embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
#     api_key=os.getenv("OPENAI_API_KEY"),
#     api_base="https://openai.vocareum.com/v1",  # <-- Add this line for Vocareum
#     model_name="text-embedding-3-small"
# )



import openai


openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = "https://openai.vocareum.com/v1"

def udacity_embed(texts):
    response = openai.Embedding.create(
        model="text-embedding-3-small",
        input=texts
    )
    return [item["embedding"] for item in response["data"]]


# embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
#     api_key=os.getenv("OPENAI_API_KEY"),
#     api_base="https://openai.vocareum.com/v1",   # IMPORTANT for Vocareum
#     model_name="text-embedding-3-small"         # or "text-embedding-3-large"
# )


#model_name="text-embedding-ada-002"


# # Local embedding function (same model used for add + query)
# ef = embedding_functions.SentenceTransformerEmbeddingFunction(
#     model_name="all-MiniLM-L6-v2"  # 384-dim vectors
# )


In [85]:
print(f"Deleting collection '{collection.name}'.")
chroma_client.delete_collection(name="udaplay")
print(f"Successfully deleted.")



# TODO: Create a collection
# Choose any name you want
collection = chroma_client.get_or_create_collection(
    name="udaplay",
    embedding_function=None
)
# embedding_function=embedding_fn

print(f"Successfully loaded or created collection '{collection.name}'.")



Deleting collection 'udaplay'.
Successfully deleted.
Successfully loaded or created collection 'udaplay'.


In [67]:
pip install -q sentence_transformers



^C

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


### Add documents

In [86]:

# Create or get collection
collection_name = "udaplay"

try:
    # Try to get existing collection first
    collection = chroma_client.get_collection(
        name=collection_name,
        embedding_function=embedding_fn
    )
    print(f"Retrieved existing collection: {collection_name}")
    print(f"Current document count: {collection.count()}")
except:
    # Create new collection if it doesn't exist
    collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=embedding_fn
    )
    print(f"Created new collection: {collection_name}")


Retrieved existing collection: udaplay
Current document count: 0


In [87]:
# Load game data from JSON files
data_dir = "games"

# Check if directory exists
if not os.path.exists(data_dir):
    print(f"Directory '{data_dir}' not found!")
    print("Please ensure the games directory exists with JSON files.")
else:
    json_files = [f for f in os.listdir(data_dir) if f.endswith('.json')]
    print(f"Found {len(json_files)} JSON files in '{data_dir}' directory")
    
    # Show first few files
    for i, filename in enumerate(sorted(json_files)[:5]):
        print(f"   {i+1}. {filename}")
    
    if len(json_files) > 5:
        print(f"   ... and {len(json_files) - 5} more files")

Found 15 JSON files in 'games' directory
   1. 001.json
   2. 002.json
   3. 003.json
   4. 004.json
   5. 005.json
   ... and 10 more files


In [89]:
# Add documents to ChromaDB

if os.path.exists(data_dir):
    documents_added = 0

    # embeddings = [np.random.rand(768).tolist() for _ in sorted(os.listdir(data_dir))]

    # print(f"Embeddings: {embeddings}")
    
    for file_name in sorted(os.listdir(data_dir)):
        if not file_name.endswith(".json"):
            continue
            
        file_path = os.path.join(data_dir, file_name)
        
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                game = json.load(f)
            
            # Create content string for embedding
            content = f"[{game['Platform']}] {game['Name']} ({game['YearOfRelease']}) - {game['Description']}"
            
            # Use filename (without extension) as document ID
            doc_id = os.path.splitext(file_name)[0]


            embedding = np.random.rand(768).tolist()

            # Add to collection
            collection.add(
                ids=[doc_id],
                documents=[content],
                metadatas=[game],
                embeddings=[embedding]
            )
            
            # embeddings=[embedding]
            
            documents_added += 1
            
        except Exception as e:
            print(f"Error processing {file_name}: {e}")
    
    print(f"Successfully added {documents_added} documents to ChromaDB")
    print(f"Total documents in collection: {collection.count()}")
else:
    print("Games directory not found. Cannot load data.")


try:
    print(collection.metadata)

except Exception as e:
    print("Could not read collection metadata:", e)



Successfully added 15 documents to ChromaDB
Total documents in collection: 15
None


In [90]:

try:
    print(collection.metadata)


_IncompleteInputError: incomplete input (4293555288.py, line 2)

In [95]:

def fake_query_embedding(text):
    return np.random.rand(768).tolist()

# Test semantic search functionality
def test_semantic_search(collection, query, n_results=3):
    """Test semantic search and display results"""
    print(f"\n=== Searching for: '{query}' ===")
    results = "none"

    q_emb = fake_query_embedding(query)
    results = collection.query(
        query_embeddings=[q_emb],
        # query_texts=[query],
        n_results=n_results,
        include=['documents', 'metadatas', 'distances']
    )
    #query_texts=[query],

    if results['documents'][0]:
        print(f"Found {len(results['documents'][0])} results:")
        
        for i, (doc, metadata, distance) in enumerate(zip(
            results['documents'][0], 
            results['metadatas'][0], 
            results['distances'][0]
        )):
            similarity = 1 - distance  # Convert distance to similarity
            print(f"\n{i+1}. {metadata['Name']} ({metadata['YearOfRelease']})")
            print(f"   Platform: {metadata['Platform']}")
            print(f"   Genre: {metadata.get('Genre', 'N/A')}")
            print(f"   Publisher: {metadata.get('Publisher', 'N/A')}")
            print(f"   Similarity: {similarity:.3f}")
            print(f"   Description: {metadata['Description'][:100]}...")
    else:
        print("No results found.")
    
    return results

# Test with different types of queries
test_queries = [
    "Nintendo racing games",
    "RPG games with fantasy themes",
    "PlayStation games from the 1990s",
    "Action games with shooting mechanics",
    "Games suitable for families"
]

for query in test_queries:
    test_semantic_search(collection, query, n_results=2)


=== Searching for: 'Nintendo racing games' ===
Found 2 results:

1. Pokémon Ruby and Sapphire (2002)
   Platform: Game Boy Advance
   Genre: Role-playing
   Publisher: Nintendo
   Similarity: -117.602
   Description: Third-generation Pokémon games set in the Hoenn region, featuring new Pokémon and double battles....

2. Halo Infinite (2021)
   Platform: Xbox Series X|S
   Genre: First-person shooter
   Publisher: Xbox Game Studios
   Similarity: -122.525
   Description: The latest installment in the Halo franchise, featuring Master Chief's return in a new open-world se...

=== Searching for: 'RPG games with fantasy themes' ===
Found 2 results:

1. Super Smash Bros. Melee (2001)
   Platform: GameCube
   Genre: Fighting
   Publisher: Nintendo
   Similarity: -122.982
   Description: A crossover fighting game featuring characters from various Nintendo franchises battling it out in d...

2. Gran Turismo (1997)
   Platform: PlayStation 1
   Genre: Racing
   Publisher: Sony Computer Entertai

In [96]:
# Display collection statistics
print("=== ChromaDB Collection Statistics ===")
print(f"Collection name: {collection.name}")
print(f"Total documents: {collection.count()}")

# Get a sample of documents to verify data structure
sample_docs = collection.get(limit=3, include=['documents', 'metadatas'])

print("\n=== Sample Documents ===")
for i, (doc, metadata) in enumerate(zip(sample_docs['documents'], sample_docs['metadatas'])):
    print(f"\n{i+1}. Document ID: {sample_docs['ids'][i]}")
    print(f"   Game: {metadata['Name']}")
    print(f"   Platform: {metadata['Platform']}")
    print(f"   Year: {metadata['YearOfRelease']}")
    print(f"   Content: {doc[:100]}...")

=== ChromaDB Collection Statistics ===
Collection name: udaplay
Total documents: 15

=== Sample Documents ===

1. Document ID: 001
   Game: Gran Turismo
   Platform: PlayStation 1
   Year: 1997
   Content: [PlayStation 1] Gran Turismo (1997) - A realistic racing simulator featuring a wide array of cars an...

2. Document ID: 002
   Game: Grand Theft Auto: San Andreas
   Platform: PlayStation 2
   Year: 2004
   Content: [PlayStation 2] Grand Theft Auto: San Andreas (2004) - An expansive open-world game set in the ficti...

3. Document ID: 003
   Game: Gran Turismo 5
   Platform: PlayStation 3
   Year: 2010
   Content: [PlayStation 3] Gran Turismo 5 (2010) - A comprehensive racing simulator featuring a vast selection ...
