# Udaplay 01 – RAG Setup with ChromaDB

This notebook loads game JSON files from `games/`, builds a ChromaDB vector store, and demonstrates semantic search.

In [1]:
import os, json
import chromadb
from chromadb.utils import embedding_functions

# Sanity check: confirm where the kernel is running and that the data folder is visible.
print(f"Current working directory: {os.getcwd()}")
print(f"Files in games/: {sorted(os.listdir('games'))}")


Current working directory: /workspace/Code/project/starter
Files in games/: ['001.json', '002.json', '003.json', '004.json', '005.json', '006.json', '007.json', '008.json', '009.json', '010.json', '011.json', '012.json', '013.json', '014.json', '015.json']


In [2]:
# Peek at the alphabetically first file in the data folder to learn the record schema.
data_dir = "games"

available_files = sorted(os.listdir(data_dir))
file_name = available_files[0]
file_path = os.path.join(data_dir, file_name)

with open(file_path, "r", encoding="utf-8") as f:
    game = json.load(f)

# Show the file name, the field names, and the full record.
print(f"Sample file: {file_name}")
print(f"Game keys: {list(game)}")
print(f"Sample game record: {game}")


Sample file: 001.json
Game keys: ['Name', 'Platform', 'Genre', 'Publisher', 'Description', 'YearOfRelease']
Sample game record: {'Name': 'Gran Turismo', 'Platform': 'PlayStation 1', 'Genre': 'Racing', 'Publisher': 'Sony Computer Entertainment', 'Description': 'A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.', 'YearOfRelease': 1997}


In [3]:
# Open (or create) the on-disk ChromaDB store under ./chroma_db and fetch the
# "games" collection, creating it on first use.
client = chromadb.PersistentClient(path="chroma_db")

# Embedding function handed to the collection for this notebook.
embedding_fn = embedding_functions.DefaultEmbeddingFunction()

collection = client.get_or_create_collection(
    name="games",
    embedding_function=embedding_fn,
)

print(f"ChromaDB client initialized. Collection name: {collection.name}")


ChromaDB client initialized. Collection name: games


In [4]:
# Index all game JSON files into the collection.
# Each file yields one record:
#   - id:       the file name without its extension (e.g. "001")
#   - document: a one-line summary built from Platform, Name, YearOfRelease, Description
#   - metadata: the parsed JSON object, stored unchanged
data_dir = "games"
ids, documents, metadatas = [], [], []

# Only *.json files are indexed; sorting keeps the insertion order deterministic.
json_files = sorted(name for name in os.listdir(data_dir) if name.endswith(".json"))

for file_name in json_files:
    file_path = os.path.join(data_dir, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        game = json.load(f)

    content = f"[{game['Platform']}] {game['Name']} ({game['YearOfRelease']}) - {game['Description']}"
    doc_id = os.path.splitext(file_name)[0]

    ids.append(doc_id)
    documents.append(content)
    metadatas.append(game)

# The collection lives in a persistent store, so on a re-run these IDs already
# exist. `upsert` inserts new IDs and overwrites existing ones, which keeps the
# cell idempotent and picks up edits to the JSON files; a plain `add` would not
# refresh records that are already present.
# Skip the call entirely when no files were found rather than sending empty lists.
if ids:
    collection.upsert(ids=ids, documents=documents, metadatas=metadatas)
print(f"Indexed {len(ids)} games into the '{collection.name}' collection.")


Indexed 15 games into the 'games' collection.


In [5]:
# Demonstrate semantic search over the vector database
query = "Who developed FIFA 21?"
snippet_chars = 200  # maximum number of document characters shown per hit

results = collection.query(
    query_texts=[query],
    n_results=5
)

print("Query:", query)
# Each result field is a list with one entry per query text; we sent a single
# query, so take index 0. `or` also covers a field that is present but None.
docs = (results.get("documents") or [[]])[0]
metas = (results.get("metadatas") or [[]])[0]
distances = (results.get("distances") or [[None] * len(docs)])[0]

for i, (doc, meta, dist) in enumerate(zip(docs, metas, distances), start=1):
    meta = meta or {}
    doc = doc or ""
    # Add an ellipsis only when the document was actually cut short.
    snippet = doc if len(doc) <= snippet_chars else doc[:snippet_chars] + "..."

    print(f"\nResult {i}:")
    print("  Name          :", meta.get("Name"))
    print("  Platform      :", meta.get("Platform"))
    print("  YearOfRelease :", meta.get("YearOfRelease"))
    print("  Publisher     :", meta.get("Publisher"))
    # The query returns a distance, not a similarity: smaller means a closer match.
    print("  Distance      :", dist)
    print("  Snippet       :", snippet)


Query: Who developed FIFA 21?

Result 1:
  Name          : FIFA 21
  Platform      : PlayStation 4
  YearOfRelease : 2020
  Publisher     : Electronic Arts
  Similarity    : 0.12
  Snippet       : [PlayStation 4] FIFA 21 (2020) - A football simulation game featuring realistic gameplay and updated squads....

Result 2:
  Name          : Sample Game 6
  Platform      : Multi-platform
  YearOfRelease : 2006
  Publisher     : Sample Publisher
  Similarity    : 0.34
  Snippet       : [Multi-platform] Sample Game 6 (2006) - A sample description for game 6 used for testing the RAG pipeline....
