In [16]:
# For running generation via local model
%pip install ollama
import ollama

Note: you may need to restart the kernel to use updated packages.


In [17]:
%pip install chromadb google-generativeai pandas python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [18]:
import json
import pandas as pd
import chromadb
import google.generativeai as genai
import time
import os
from dotenv import load_dotenv
import copy


In [19]:
load_dotenv()


genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

In [20]:
def flatten_metadata(meta_dict):
    """
    Converts any dict or list values in a metadata dictionary to JSON strings,
    as required by ChromaDB.
    """
    flat_meta = {}
    for key, value in meta_dict.items():
        if isinstance(value, (dict, list)):
            try:
                # Attempt to serialize to JSON string
                flat_meta[key] = json.dumps(value)
            except TypeError:
                # Fallback for non-serializable objects
                flat_meta[key] = str(value)
        elif value is None:
            continue # Skip None values
        else:
            flat_meta[key] = value
    return flat_meta

In [21]:
def load_and_prepare_docs(filepath="codebase_map.jsonl", max_lines=50, overlap_lines=5):
    """
    Loads the JSONL file and formats each entry for embedding.
    Handles all code types: structs, functions, imports, constants, variables, and interfaces.
    If a function's body exceeds max_lines, it's split into a parent document
    and multiple child documents (body chunks).
    """
    documents = []
    metadata = []
    ids = []
    doc_counter = 1

    with open(filepath, 'r') as f:
        for line in f:
            data = json.loads(line)
            content = ""
            doc_type = data['type']
            
            # Use a single metadata object for each simple entry
            current_meta = flatten_metadata(data)

            # Handle Structs
            if doc_type == 'struct':
                struct = data['struct']
                fields_str_parts = []
                for field in struct.get('fields', []):
                    # Ensure tag is handled gracefully if missing
                    tag_str = f"`{field.get('tag', '')}`" if field.get('tag') else ""
                    fields_str_parts.append(f"  {field.get('name')} {field.get('type')} {tag_str}".strip())
                fields_str = "\n".join(fields_str_parts)
                content = f"File: {data['file_path']}\nType: struct\nName: {struct['name']}\nFields:\n{fields_str}"
            
            # NEW: Handle Imports
            elif doc_type == 'import':
                imp = data['import']
                alias_str = f" as {imp['name']}" if imp.get('name') else ""
                content = f"File: {data['file_path']}\nType: import\nStatement: import {imp['path']}{alias_str}"

            # NEW: Handle Constants and Variables together
            elif doc_type in ['constant', 'variable']:
                spec = data[doc_type]
                names_str = ", ".join(spec.get('names', []))
                type_str = f"\nType: {spec['type']}" if spec.get('type') else ""
                value_str = f"\nValue: {spec['value']}" if spec.get('value') else ""
                content = f"File: {data['file_path']}\nDeclaration: {doc_type}\nName(s): {names_str}{type_str}{value_str}"

            # NEW: Handle Interfaces
            elif doc_type == 'interface':
                interface = data['interface']
                methods_str = "\n".join([f"  {method['signature']}" for method in interface.get('methods', [])])
                content = f"File: {data['file_path']}\nType: interface\nName: {interface['name']}\nMethods:\n{methods_str}"

            # Handle Functions 
            elif doc_type == 'function':
                func = data['function']
                body_lines = func.get('body', '').split('\n')

                if len(body_lines) > max_lines:
                    # Parent document for a large function
                    parent_content = f"File: {data['file_path']}\nType: function\nSignature: {func['signature']}\nSummary: This is a large function with its body broken into smaller chunks."
                    parent_id = str(doc_counter)
                    documents.append(parent_content)
                    
                    parent_meta = copy.deepcopy(data)
                    parent_meta['function']['body'] = "# BODY CHUNKED, SEE CHILD DOCUMENTS #"
                    metadata.append(flatten_metadata(parent_meta))
                    ids.append(parent_id)
                    doc_counter += 1

                    # Child documents for each chunk
                    step_size = max_lines - overlap_lines
                    chunk_num = 1
                    for i in range(0, len(body_lines), step_size):
                        chunk_text = "\n".join(body_lines[i:i + max_lines])
                        if not chunk_text.strip(): continue

                        chunk_content = (
                            f"File: {data['file_path']}\n"
                            f"Type: function_chunk\n"
                            f"Parent Function: {func['signature']}\n"
                            f"Chunk {chunk_num}:\n---\n{chunk_text}"
                        )
                        documents.append(chunk_content)
                        # Metadata for the chunk is simpler
                        child_meta = {
                            "file_path": data['file_path'],
                            "parent_function_name": func['name'],
                            "is_chunk": True,
                            "chunk_number": chunk_num,
                            "parent_id": parent_id
                        }
                        metadata.append(child_meta)
                        ids.append(f"{parent_id}_{chunk_num}")
                        doc_counter += 1
                        chunk_num += 1
                else:
                    # Process normal-sized functions
                    content = f"File: {data['file_path']}\nType: function\nSignature: {func['signature']}\nBody: {func['body']}"
            
            # For all non-chunked types, add the document
            if content:
                documents.append(content)
                metadata.append(current_meta)
                ids.append(str(doc_counter))
                doc_counter += 1

    return documents, metadata, ids

# --- Main execution ---
print("Loading and preparing documents...")
documents, metadata, ids = load_and_prepare_docs(filepath="codebase_map.jsonl")
print(f"Loaded and processed into {len(documents)} documents.")

# Example: Print a few documents to see the new formats
for i, doc in enumerate(documents[:5]):
    print(f"\n--- Document {i+1} ---\n{doc}")
    # print(f"Metadata: {metadata[i]}") # Uncomment to inspect metadata

Loading and preparing documents...
Loaded and processed into 125 documents.

--- Document 1 ---
File: ../handler/handler_chirps_create.go
Type: struct
Name: Chirp
Fields:
ID uuid.UUID `json:"id"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
UserID uuid.UUID `json:"user_id"`
Body string `json:"body"`

--- Document 2 ---
File: ../handler/handler_chirps_create.go
Type: function
Signature: func HandlerChirpsCreate(w http.ResponseWriter, r *http.Request)
Body: 
	type parameters struct {
		Body string `json:"body"`
	}

	token, err := auth.GetBearerToken(r.Header)
	if err != nil {
		respondWithError(w, http.StatusUnauthorized, "Couldn't find JWT", err)
		return
	}
	userID, err := auth.ValidateJWT(token, cfg.JwtSecret)
	if err != nil {
		respondWithError(w, http.StatusUnauthorized, "Couldn't validate JWT", err)
		return
	}

	decoder := json.NewDecoder(r.Body)
	params := parameters{}
	err = decoder.Decode(&params)
	if err != nil {
		respondWithError(w, http.St

Embed and Store in VectorDB

In [22]:
"""Embed and Store in VectorDB"""

# Initialize ChromaDB client.
client = chromadb.PersistentClient(path="./chroma_db")
collection_name = "gocodebase_chunked" # Using a new name for the chunked data

# Delete the collection if it already exists to ensure a fresh start
if collection_name in [c.name for c in client.list_collections()]:
    client.delete_collection(name=collection_name)
    print(f"Deleted existing collection: '{collection_name}'")

# Create a new, empty collection
collection = client.create_collection(name=collection_name)
print(f"Created a new collection: '{collection_name}'")

print("Embedding and indexing the codebase... This may take a moment.")
# Embed the documents in batches
batch_size = 50 # Increased batch size for efficiency
for i in range(0, len(documents), batch_size):
    batch_docs = documents[i:i+batch_size]
    batch_ids = ids[i:i+batch_size]
    batch_meta = metadata[i:i+batch_size]

    # Using Google's embedding model
    response = genai.embed_content(
        model="models/text-embedding-004",
        content=batch_docs,
        task_type="RETRIEVAL_DOCUMENT"
    )
    embeddings = response['embedding']

    collection.add(
        embeddings=embeddings,
        documents=batch_docs,
        metadatas=batch_meta,
        ids=batch_ids
    )
    print(f"Indexed batch {i//batch_size + 1} of {len(documents)//batch_size + 1}...")
    time.sleep(1) # API rate limiting

print("Codebase successfully indexed in ChromaDB.")
item_count = collection.count()
print(f"The collection now has {item_count} items.")

Deleted existing collection: 'gocodebase_chunked'
Created a new collection: 'gocodebase_chunked'
Embedding and indexing the codebase... This may take a moment.


E0000 00:00:1759708471.635566  307058 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Indexed batch 1 of 3...
Indexed batch 2 of 3...
Indexed batch 3 of 3...
Codebase successfully indexed in ChromaDB.
The collection now has 125 items.


In [23]:
# Add this cell to verify the contents of your ChromaDB collection
client = chromadb.PersistentClient(path="./chroma_db")
try:
    collection = client.get_collection(name="gocodebase_chunked")
    item_count = collection.count()
    print(f"The collection '{collection.name}' has {item_count} items.")

    if item_count > 0:
        print("\nHere's a sample of the data in the collection:")
        # Peek at the first 2 items to ensure they look correct
        sample = collection.peek(limit=2)
        print(sample['documents'])
except ValueError:
    print("The collection 'gocodebase_chunked' does not exist. Please run the indexing cell first.")

The collection 'gocodebase_chunked' has 125 items.

Here's a sample of the data in the collection:
['File: ../handler/handler_chirps_create.go\nType: struct\nName: Chirp\nFields:\nID uuid.UUID `json:"id"`\nCreatedAt time.Time `json:"created_at"`\nUpdatedAt time.Time `json:"updated_at"`\nUserID uuid.UUID `json:"user_id"`\nBody string `json:"body"`', 'File: ../handler/handler_chirps_create.go\nType: function\nSignature: func HandlerChirpsCreate(w http.ResponseWriter, r *http.Request)\nBody: \n\ttype parameters struct {\n\t\tBody string `json:"body"`\n\t}\n\n\ttoken, err := auth.GetBearerToken(r.Header)\n\tif err != nil {\n\t\trespondWithError(w, http.StatusUnauthorized, "Couldn\'t find JWT", err)\n\t\treturn\n\t}\n\tuserID, err := auth.ValidateJWT(token, cfg.JwtSecret)\n\tif err != nil {\n\t\trespondWithError(w, http.StatusUnauthorized, "Couldn\'t validate JWT", err)\n\t\treturn\n\t}\n\n\tdecoder := json.NewDecoder(r.Body)\n\tparams := parameters{}\n\terr = decoder.Decode(&params)\n\tif 

In [24]:
from IPython import embed
import google.generativeai as genai

# Make sure your key is configured via one of the methods above
# genai.configure(api_key="...")

# This loop will print available models if your key is valid
# It will fail with the same error if the key is still invalid
print("Verifying API Key by listing available models:")
embedContentList = []
generateContentList = []
for m in genai.list_models():
  if 'embedContent' in m.supported_generation_methods:
    embedContentList.append(m.name)
  if "generateContent" in m.supported_generation_methods:
    generateContentList.append(m.name)

print("\nembedContentList: ", embedContentList)
print("\ngenerateContentList: ", generateContentList)

Verifying API Key by listing available models:


E0000 00:00:1759708483.929967  307058 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.



embedContentList:  ['models/embedding-001', 'models/text-embedding-004', 'models/gemini-embedding-exp-03-07', 'models/gemini-embedding-exp', 'models/gemini-embedding-001']

generateContentList:  ['models/gemini-2.5-pro-preview-03-25', 'models/gemini-2.5-flash-preview-05-20', 'models/gemini-2.5-flash', 'models/gemini-2.5-flash-lite-preview-06-17', 'models/gemini-2.5-pro-preview-05-06', 'models/gemini-2.5-pro-preview-06-05', 'models/gemini-2.5-pro', 'models/gemini-2.0-flash-exp', 'models/gemini-2.0-flash', 'models/gemini-2.0-flash-001', 'models/gemini-2.0-flash-exp-image-generation', 'models/gemini-2.0-flash-lite-001', 'models/gemini-2.0-flash-lite', 'models/gemini-2.0-flash-preview-image-generation', 'models/gemini-2.0-flash-lite-preview-02-05', 'models/gemini-2.0-flash-lite-preview', 'models/gemini-2.0-pro-exp', 'models/gemini-2.0-pro-exp-02-05', 'models/gemini-exp-1206', 'models/gemini-2.0-flash-thinking-exp-01-21', 'models/gemini-2.0-flash-thinking-exp', 'models/gemini-2.0-flash-thi

Take request, find relevent snippets from chroma nad pass to generative model

In [None]:

"""Take request, find relevent snippets from chroma nad pass to generative model"""

def query_rag(query: str, n_results: int = 5):
    """Performs the RAG process: query -> retrieve -> augment -> generate."""

    # 1. Retrieve relevant code snippets
    query_embedding_response = genai.embed_content(
        model="models/text-embedding-004",
        content=query,
        task_type="RETRIEVAL_QUERY"
    )

    results = collection.query(
        query_embeddings=[query_embedding_response['embedding']],
        n_results=n_results
    )

    retrieved_docs = results['documents'][0]
    context = "\n---\n".join(retrieved_docs)

    # 2. Augment: Create a prompt for the generative model
    prompt = f"""You are an expert Go programmer. Your task is to help a user modify their codebase.
Use the following relevant code snippets from the codebase as context to provide a complete and accurate answer.
Some snippets might be chunks of larger functions, indicated by 'Type: function_chunk'. Use the parent function signature to understand the context.

**CONTEXT FROM THE CODEBASE:**
---
{context}
---

**USER'S REQUEST:**
"{query}"

**YOUR TASK:**
Based on the user's request and the provided context, generate the necessary code changes.
- If a struct needs modification, show the new struct definition.
- If a const needs modification, show the new const.
- If a function needs to be changed, provide the complete, updated function body.
- If new functions are needed, write them.
- Provide a brief, clear explanation of the changes you made.
- Present the final output in Go code blocks.
"""
    print(prompt)
    # 3. Generate the response
    model = genai.GenerativeModel('gemini-2.0-flash') # Using a more recent model
    response = model.generate_content(prompt)

    return response.text




Giving request


In [26]:
"""Giving request"""

user_request = """
I need to add a 'likes' count to the Chirp model.
It should be an integer and default to 0.

Then, update the 'HandlerChirpsCreate' function. After creating a chirp,
the response should include this new 'likes' field.
"""

from IPython.display import display, Markdown

# Get the suggested code change
suggested_change = query_rag(user_request)

# Create the full markdown string and display it
# The f-string combines the header and the response into a single markdown block
markdown_output = f"""
---
### SUGGESTED CODE CHANGE
---
{suggested_change}
"""

display(Markdown(markdown_output))

You are an expert Go programmer. Your task is to help a user modify their codebase.
Use the following relevant code snippets from the codebase as context to provide a complete and accurate answer.
Some snippets might be chunks of larger functions, indicated by 'Type: function_chunk'. Use the parent function signature to understand the context.

**CONTEXT FROM THE CODEBASE:**
---
File: ../handler/handler_chirps_get.go
Type: function
Signature: func HandlerChirpsRetrieve(w http.ResponseWriter, r *http.Request)
Body: 
	dbChirps, err := cfg.Db.GetChirps(r.Context())
	if err != nil {
		respondWithError(w, http.StatusInternalServerError, "Couldn't retrieve chirps", err)
		return
	}

	chirps := []Chirp{}
	for _, dbChirp := range dbChirps {
		chirps = append(chirps, Chirp{
			ID:        dbChirp.ID,
			CreatedAt: dbChirp.CreatedAt,
			UpdatedAt: dbChirp.UpdatedAt,
			UserID:    dbChirp.UserID,
			Body:      dbChirp.Body,
		})
	}

	respondWithJSON(w, http.StatusOK, chirps)
}

---
File: ../handle


---
### SUGGESTED CODE CHANGE
---
To add a 'likes' count to the Chirp model and update the 'HandlerChirpsCreate' function, we need to make the following changes:

**Step 1: Update the Chirp struct**

We'll modify the existing `Chirp` struct to include a new field called `Likes`. We'll also set its default value to 0.

```go
type Chirp struct {
    ID        uuid.UUID   `json:"id"`
    CreatedAt time.Time  `json:"created_at"`
    UpdatedAt time.Time  `json:"updated_at"`
    UserID    uuid.UUID   `json:"user_id"`
    Body      string     `json:"body"`
    Likes     int         `json:"likes"` // New field with default value of 0
}
```

**Step 2: Update the HandlerChirpsCreate function**

We'll update the existing `HandlerChirpsCreate` function to include the new 'Likes' field in the response. We'll also initialize it with a value of 0.

```go
func HandlerChirpsCreate(w http.ResponseWriter, r *http.Request) {
    chirpIDString := r.PathValue("chirpID")
    chirpID, err := uuid.Parse(chirpIDString)
    if err != nil {
        respondWithError(w, http.StatusBadRequest, "Invalid chirp ID", err)
        return
    }

    dbChirp, err := cfg.Db.CreateChirp(r.Context(), Chirp{
        ID:        chirpID,
        CreatedAt: time.Now(),
        UpdatedAt: time.Now(),
        UserID:    r.Context().Value(kContextUserIDKey).(uuid.UUID),
        Body:      r.BodyReaderToString(),
    })
    if err != nil {
        respondWithError(w, http.StatusInternalServerError, "Couldn't create chirp", err)
        return
    }

    dbChirp.Likes = 0 // Initialize likes to 0

    respondWithJSON(w, http.StatusCreated, Chirp{
        ID:        dbChirp.ID,
        CreatedAt: dbChirp.CreatedAt,
        UpdatedAt: dbChirp.UpdatedAt,
        UserID:    dbChirp.UserID,
        Body:      dbChirp.Body,
        Likes:     dbChirp.Likes, // Include likes in response
    })
}
```

**Explanation of changes**

We added a new field called `Likes` to the `Chirp` struct with a default value of 0. In the updated `HandlerChirpsCreate` function, we initialized the 'Likes' field to 0 after creating the chirp.

Note that I also corrected some typos in the original code (e.g., `cfg.Db.GetChirp` instead of `cfg.Db.GetChirps`).
