In [4]:
from git import Repo
import os

wiki_repo_path = "https://github.com/malware-dev/MDK-SE.wiki.git"
local_repo_path = "documentation"

if not os.path.exists(local_repo_path):
    print(f"Cloning {wiki_repo_path} into {local_repo_path}...")
    Repo.clone_from(wiki_repo_path, local_repo_path)
else:
    print(f"Updating {local_repo_path}...")
    repo = Repo(local_repo_path)
    repo.remotes.origin.pull()

Cloning https://github.com/malware-dev/MDK-SE.wiki.git into documentation...


In [7]:
import markdown
from bs4 import BeautifulSoup
import nltk 

nltk.download('punkt')
nltk.download('punkt_tab')

#we are now going to convert the markdown files to html and then extract the text from them. Chunking the text with meta data whether it is text or code.
def load_and_chunk_wiki_mdkse_code_aware(wiki_folder, chunk_size=500, chunk_overlap=50):
    chunks = []

    # Process files in the root wiki folder
    for filename in os.listdir(wiki_folder):
        if filename.endswith(".md"):
            filepath = os.path.join(wiki_folder, filename)
            page_chunks = process_markdown_file(filepath, filename, chunk_size, chunk_overlap)
            chunks.extend(page_chunks)

    # Process files in the "api" subfolder (if it exists)
    api_folder_path = os.path.join(wiki_folder, "api")
    if os.path.exists(api_folder_path) and os.path.isdir(api_folder_path):
        for filename in os.listdir(api_folder_path):
            if filename.endswith(".md"): # Or other file types you want to process in "api"
                filepath = os.path.join(api_folder_path, filename)
                page_chunks = process_markdown_file(filepath, os.path.join("api", filename), chunk_size, chunk_overlap) # Source includes "api/" prefix
                chunks.extend(page_chunks)

    return chunks


def process_markdown_file(filepath, source_filename, chunk_size, chunk_overlap):
    """Processes a single Markdown file to extract text and code chunks."""
    chunks = []
    chunk_count = 0
    current_chunk = ""

    with open(filepath, "r", encoding="utf-8") as f:
        markdown_content = f.read()
        html_content = markdown.markdown(markdown_content)
        soup = BeautifulSoup(html_content, "html.parser")

        for element in soup.descendants:
            if element.name == 'code':
                code_text = element.get_text(separator='\n', strip=True)
                if code_text.strip():
                    code_chunk = {
                        "content": code_text,
                        "source": source_filename, # Use the provided source_filename
                        "chunk_id": f"{source_filename}_chunk_{chunk_count}_code", # Source in chunk ID too
                        "block_type": "code"
                    }
                    chunks.append(code_chunk)
                    chunk_count += 1
            elif element.name not in ['pre', 'code']:
                text = element.get_text(separator='\n', strip=True)
                if text.strip():
                    sentences = nltk.sent_tokenize(text)
                    for sentence in sentences:
                        if len(current_chunk) + len(sentence) + 1 <= chunk_size:
                            current_chunk += sentence + " "
                        else:
                            if current_chunk.strip():
                                text_chunk = {
                                    "content": current_chunk.strip(),
                                    "source": source_filename, # Use the provided source_filename
                                    "chunk_id": f"{source_filename}_chunk_{chunk_count}_text", # Source in chunk ID too
                                    "block_type": "text"
                                }
                                chunks.append(text_chunk)
                                chunk_count += 1
                            current_chunk = sentence + " "

        if current_chunk.strip():
            text_chunk = {
                "content": current_chunk.strip(),
                "source": source_filename, # Use the provided source_filename
                "chunk_id": f"{source_filename}_chunk_{chunk_count}_text", # Source in chunk ID too
                "block_type": "text"
            }
            chunks.append(text_chunk)

    return chunks


mdk_se_wiki_folder = "documentation" # Your cloned wiki folder
mdk_se_wiki_chunks = load_and_chunk_wiki_mdkse_code_aware(mdk_se_wiki_folder)
print(f"Generated {len(mdk_se_wiki_chunks)} chunks for MDK-SE Wiki (including 'api' folder).")

[nltk_data] Downloading package punkt to C:\Users\James
[nltk_data]     Labadorf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\James
[nltk_data]     Labadorf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Generated 26767 chunks for MDK-SE Wiki (including 'api' folder).


In [13]:
from sentence_transformers import SentenceTransformer

def embed_chunks(chunks, embedding_model):
    """Embeds text chunks using a Sentence Transformer model."""
    embeddings_data = []
    for chunk in chunks:
        embedding_vector = embedding_model.encode(chunk["content"])
        embeddings_data.append({
            "chunk_id": chunk["chunk_id"],
            "embedding": embedding_vector.tolist(),  # Store as list for JSON serialization if needed
            "content": chunk["content"],
            "source": chunk["source"],
            "block_type": chunk["block_type"] # Keep block_type metadata
        })
    return embeddings_data

# --- Initialize Embedding Model ---
embedding_model_name = "all-mpnet-base-v2"  # Or "all-MiniLM-L6-v2" for faster, smaller embeddings
embedding_model = SentenceTransformer(embedding_model_name)

# --- Embed the Chunks ---
embedded_chunks = embed_chunks(mdk_se_wiki_chunks, embedding_model) # Assuming 'mdk_se_wiki_chunks' is from Step 2
print(f"Generated embeddings for {len(embedded_chunks)} chunks.")


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Generated embeddings for 26767 chunks.


In [21]:
import chromadb
from tqdm.notebook import tqdm

#initialize the chroma client
chroma_client = chromadb.PersistentClient(path="vectordb")

collection_name = 'space_engineers_api_docs'
#if the collection already exists, delete it
if chroma_client.get_collection(collection_name):
    chroma_client.delete_collection(collection_name)
collection = chroma_client.create_collection(name=collection_name)
print(f"Created collection: {collection_name}")

def index_embeddings_chroma(embedded_chunks, collection, batch_size=5000):  # Added batch_size parameter
    """Indexes embeddings in ChromaDB in batches to avoid exceeding batch size limits."""
    num_chunks = len(embedded_chunks)
    for i in range(0, num_chunks, batch_size):
        batch_end = min(i + batch_size, num_chunks)  # Ensure we don't go out of bounds
        batch_chunks = embedded_chunks[i:batch_end]

        ids = [chunk_data["chunk_id"] for chunk_data in batch_chunks]
        embeddings = [chunk_data["embedding"] for chunk_data in batch_chunks]
        metadatas = [{"source": chunk_data["source"], "block_type": chunk_data["block_type"]} for chunk_data in batch_chunks]
        documents = [chunk_data["content"] for chunk_data in batch_chunks]

        collection.add(
            ids=ids,
            embeddings=embeddings,
            metadatas=metadatas,
            documents=documents
        )
        print(f"Indexed batch {i//batch_size + 1}/{ (num_chunks + batch_size - 1) // batch_size}, current collection count: {collection.count()}")  # Added batch progress info

    print(f"Finished indexing all {num_chunks} documents in ChromaDB collection '{collection_name}'. Final collection count: {collection.count()}.")

# --- Index the Embedded Chunks in ChromaDB ---
index_embeddings_chroma(embedded_chunks, collection)


Created collection: space_engineers_api_docs
Indexed batch 1/6, current collection count: 5000
Indexed batch 2/6, current collection count: 10000
Indexed batch 3/6, current collection count: 15000
Indexed batch 4/6, current collection count: 20000
Indexed batch 5/6, current collection count: 25000
Indexed batch 6/6, current collection count: 26767
Finished indexing all 26767 documents in ChromaDB collection 'space_engineers_api_docs'. Final collection count: 26767.


In [None]:
def retrieve_relevant_chunks(query, embedding_model, collection, top_k=15):
    """
    Retrieves relevant chunks from ChromaDB based on a user query.

    Args:
        query (str): The user query.
        embedding_model: The Sentence Transformer embedding model.
        collection: The ChromaDB collection.
        top_k (int, optional): The number of top chunks to retrieve. Defaults to 5.

    Returns:
        list: A list of dictionaries, where each dictionary represents a retrieved chunk
              and contains 'content' and 'source' keys.
    """
    query_embedding = embedding_model.encode(query)
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=top_k,
        include=["metadatas", "documents", "distances"]  # ADD "distances" HERE
    )

    retrieved_chunks = []
    if results and results["ids"] and results["documents"]: # Check if results are not empty
        for i in range(len(results["ids"][0])): # Iterate through results for the first query (we only sent one)
            retrieved_chunks.append({
                "content": results["documents"][0][i],
                "source": results["metadatas"][0][i]["source"],
                "block_type": results["metadatas"][0][i]["block_type"], # Include block_type if you want to see it
                "distance": results["distances"][0][i] # Include distance score to see relevance
            })
    else:
        print("No relevant chunks found.")

    return retrieved_chunks


def test_retrieval(query, embedding_model, collection):
    """
    Tests the retrieval function and prints the retrieved chunks.
    """
    print(f"\n--- Retrieval Test ---")
    print(f"Query: '{query}'")

    retrieved_chunks = retrieve_relevant_chunks(query, embedding_model, collection)

    if retrieved_chunks:
        print(f"\nRetrieved Chunks (Top {len(retrieved_chunks)}):")
        for i, chunk in enumerate(retrieved_chunks):
            print(f"\nChunk {i+1}:")
            print(f"  Source: {chunk['source']}")
            print(f"  Block Type: {chunk['block_type']}") # Print block_type metadata
            print(f"  Distance: {chunk['distance']:.4f}") # Print distance score
            print(f"  Content:\n{chunk['content']}")
    else:
        print("No relevant chunks retrieved for this query.")


# --- Example Usage (assuming you have 'embedding_model' and 'collection' already defined) ---

# Example Query - Change this to a question relevant to MDK-SE Wiki
test_query = "program a script that will open a door every 5 seconds in space engineers"

test_retrieval(test_query, embedding_model, collection) # Run the retrieval test

# You can test with more queries:
# test_query_2 = "What is the purpose of the Ingame API in Space Engineers?"
# test_retrieval(test_query_2, embedding_model, collection)


--- Retrieval Test ---
Query: 'program a script that will get the current ores in a given cargo container'

Retrieved Chunks (Top 15):

Chunk 1:
  Source: api\Sandbox.Game.Localization.MySpaceTexts.md
  Block Type: text
  Distance: 1.0009
  Content:
Use a survival kit or Refinery to process ore and obtain ingots. static MyStringId IngameHelp_RefiningOre_Title static MyStringId IngameHelp_RefiningOre_Title Good.bot: Refining Ore Good.bot: Refining Ore Good.bot: Refining Ore static MyStringId IngameHelp_Rifle_Detail1 static MyStringId IngameHelp_Rifle_Detail1 A basic weapon to protect yourself; requires MR-20 Rifle Magazine ammunition. A basic weapon to protect yourself; requires MR-20 Rifle Magazine ammunition.

Chunk 2:
  Source: Type-Definition-Listing.md
  Block Type: text
  Distance: 1.0334
  Content:
Cobalt Ore MyObjectBuilder_Ore/Cobalt Gold Ore
MyObjectBuilder_Ore/Gold Gold Ore Gold Ore Gold Ore MyObjectBuilder_Ore/Gold Ice
MyObjectBuilder_Ore/Ice Ice Ice Ice MyObjectBuilder_Ore

In [34]:
import importlib 
import gemini_helper
importlib.reload(gemini_helper)
from gemini_helper import generate_answer_gemini

test_query = "program a script that will open a door every 5 seconds in space engineers"


# 1. Retrieve relevant chunks (using your existing function)
retrieved_context = retrieve_relevant_chunks(test_query, embedding_model, collection)

# 2. Generate answer using Gemini API
if retrieved_context:
    gemini_answer = generate_answer_gemini(test_query, retrieved_context)

    print(f"\n--- Gemini API Answer ---")
    print(f"Question: {test_query}")
    print(f"Answer: {gemini_answer}")
    print("\nRetrieved Context Sources:")
    for chunk in retrieved_context:
        print(f"- {chunk['source']}")
else:
    print("No relevant context retrieved, cannot generate answer.")


--- Gemini API Answer ---
Question: program a script that will open a door every 5 seconds in space engineers
Answer: Answer:
```csharp
public void Main(string argument, UpdateType updateSource)
{
    TimeSpan interval = TimeSpan.FromSeconds(5); // Set the interval to 5 seconds
    DateTime lastRunTime;

    if (Storage != null && Storage != "") // Load last run time from storage if available
    {
        if (DateTime.TryParse(Storage, out lastRunTime))
        {
            // Successfully parsed last run time.
        }
        else
        {
            lastRunTime = DateTime.Now; // If parsing fails, initialize to now
        }
    }
    else
    {
        lastRunTime = DateTime.Now; // Initialize last run time if no storage
    }

    if (DateTime.Now - lastRunTime >= interval)
    {
        List<IMyDoor> doorList = new List<IMyDoor>();
        GridTerminalSystem.GetBlocksOfType<IMyDoor>(doorList);

        if (doorList.Count > 0)
        {
            IMyDoor door = doorList[0]