In [3]:
%pip install chromadb google-generativeai pandas python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [16]:
import json
import pandas as pd
import chromadb
import google.generativeai as genai
import time
import os
from dotenv import load_dotenv
import copy


In [17]:
# Load environment variables via python-dotenv, then pass the Gemini API key
# to the google-generativeai SDK. The key is read from the environment so it
# never appears in the notebook itself.
load_dotenv()
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

In [18]:
def flatten_metadata(meta_dict):
    """
    Build a flat copy of a metadata dictionary for storage in ChromaDB.

    Any value that is a ``dict`` or a ``list`` is replaced by its JSON string
    (``json.dumps``); every other value is copied across untouched. The input
    dictionary itself is not modified.
    """
    return {
        name: json.dumps(entry) if isinstance(entry, (dict, list)) else entry
        for name, entry in meta_dict.items()
    }

In [32]:
def _iter_overlapping_chunks(lines, max_lines, overlap_lines):
    """Yield windows of at most ``max_lines`` lines; consecutive windows share ``overlap_lines`` lines."""
    step_size = max_lines - overlap_lines
    for start in range(0, len(lines), step_size):
        yield lines[start:start + max_lines]
        # Once a window reaches the end of the body, any further window would
        # only repeat lines that are already fully covered by this one.
        if start + max_lines >= len(lines):
            break


def load_and_prepare_docs(filepath="codebase_map.jsonl", max_lines=50, overlap_lines=5):
    """
    Load a JSONL code map and turn each entry into a text document for embedding.

    Each non-blank line of the file is parsed as one JSON object. Entries with
    ``type == 'struct'`` become one document listing the struct's fields.
    Entries with ``type == 'function'`` become one document containing the
    signature and body, unless the body has more than ``max_lines`` lines. In
    that case the function is split into a parent document (signature only)
    followed by child documents holding overlapping chunks of the body.
    Entries of any other type are ignored.

    Args:
        filepath: Path to the JSONL file (read as UTF-8).
        max_lines: Maximum number of body lines per document/chunk; must be >= 1.
        overlap_lines: Number of lines shared between consecutive chunks; must
            satisfy ``0 <= overlap_lines < max_lines``.

    Returns:
        A tuple ``(documents, metadata, ids)`` of three parallel lists: the
        document texts, one metadata dict per document, and one string id per
        document. Child chunks get ids of the form ``"<parent_id>_<n>"``.

    Raises:
        ValueError: If ``max_lines`` / ``overlap_lines`` would make the chunking
            window non-advancing.
    """
    # A non-positive step would either crash inside range() or silently drop
    # every chunk of a large function, so reject it up front.
    if max_lines < 1:
        raise ValueError(f"max_lines must be at least 1, got {max_lines}")
    if not 0 <= overlap_lines < max_lines:
        raise ValueError(
            f"overlap_lines must satisfy 0 <= overlap_lines < max_lines, "
            f"got overlap_lines={overlap_lines}, max_lines={max_lines}"
        )

    documents = []
    metadata = []
    ids = []
    doc_counter = 1

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            data = json.loads(line)

            # Handle Structs
            if data['type'] == 'struct':
                struct = data['struct']
                # `fields` may be null in the JSON (an empty Go slice); treat it as no fields.
                fields = struct.get('fields') or []
                fields_str = "\n".join(
                    f"  {field['name']} {field['type']} `{field.get('tag', '')}`" for field in fields
                )
                content = f"File: {data['file_path']}\nType: struct\nName: {struct['name']}\nFields:\n{fields_str}"
                documents.append(content)
                metadata.append(flatten_metadata(data))
                ids.append(str(doc_counter))
                doc_counter += 1

            # Handle Functions
            elif data['type'] == 'function':
                func = data['function']
                # A missing (or null) body is treated as an empty body in both branches below.
                body = func.get('body') or ''
                body_lines = body.split('\n')

                if len(body_lines) > max_lines:
                    # Parent document: signature only, so a search hit on the
                    # function as a whole is still possible.
                    parent_content = f"File: {data['file_path']}\nType: function\nSignature: {func['signature']}\nSummary: This is a large function with its body broken into smaller chunks."
                    parent_id = str(doc_counter)
                    documents.append(parent_content)

                    # Deep copy so replacing the body does not mutate `data`.
                    parent_meta = copy.deepcopy(data)
                    parent_meta['function']['body'] = "# BODY CHUNKED, SEE CHILD DOCUMENTS #"
                    metadata.append(flatten_metadata(parent_meta))
                    ids.append(parent_id)
                    doc_counter += 1

                    # Child documents: overlapping windows over the body.
                    chunk_num = 1
                    for chunk_lines in _iter_overlapping_chunks(body_lines, max_lines, overlap_lines):
                        chunk_text = "\n".join(chunk_lines)
                        if not chunk_text.strip():
                            continue  # Nothing worth embedding in an empty/whitespace-only chunk

                        chunk_content = (
                            f"File: {data['file_path']}\n"
                            f"Type: function_chunk\n"
                            f"Parent Function: {func['signature']}\n"
                            f"Chunk {chunk_num}:\n---\n{chunk_text}"
                        )
                        documents.append(chunk_content)
                        metadata.append({
                            "file_path": data['file_path'],
                            "parent_function_name": func['name'],
                            "is_chunk": True
                        })
                        ids.append(f"{parent_id}_{chunk_num-1}")
                        # The global counter advances for chunks too, so later
                        # top-level ids never collide with a parent id.
                        doc_counter += 1
                        chunk_num += 1
                else:
                    # Short function: signature and full body in one document.
                    content = f"File: {data['file_path']}\nType: function\nSignature: {func['signature']}\nBody: {body}"
                    documents.append(content)
                    metadata.append(flatten_metadata(data))
                    ids.append(str(doc_counter))
                    doc_counter += 1

    return documents, metadata, ids

print("Loading and preparing documents with hierarchical chunking...")
documents, metadata, ids = load_and_prepare_docs()
print(f"Loaded and processed into {len(documents)} documents.")

# Spot-check one prepared document. The index is arbitrary, so guard it:
# a smaller code map must not make this cell fail with an IndexError.
sample_index = 68
if sample_index < len(documents):
    print(documents[sample_index])
else:
    print(f"No document at index {sample_index}; only {len(documents)} documents were loaded.")

Loading and preparing documents with hierarchical chunking...
Loaded and processed into 107 documents.
File: ../json.go
Type: struct
Name: errorResponse
Fields:
  Error string `json:"error"`


Embed and Store in VectorDB

In [33]:

"""Embed and Store in VectorDB"""

# Initialize ChromaDB client (persists to ./chroma_db on disk).
client = chromadb.PersistentClient(path="./chroma_db")
collection_name = "gocodebase_chunked" # Using a new name for the chunked data

# Delete the collection if it already exists to ensure a fresh start.
# Depending on the chromadb version, list_collections() yields either
# collection objects (with a .name) or plain name strings; accept both.
existing_names = [getattr(c, "name", c) for c in client.list_collections()]
if collection_name in existing_names:
    client.delete_collection(name=collection_name)
    print(f"Deleted existing collection: '{collection_name}'")

# Create a new, empty collection
collection = client.create_collection(name=collection_name)
print(f"Created a new collection: '{collection_name}'")

print("Embedding and indexing the codebase... This may take a moment.")
# Embed the documents in batches
batch_size = 50 # Increased batch size for efficiency
# Ceiling division: an exact multiple of batch_size must not report an extra batch.
total_batches = (len(documents) + batch_size - 1) // batch_size
for batch_number, start in enumerate(range(0, len(documents), batch_size), start=1):
    batch_docs = documents[start:start + batch_size]
    batch_ids = ids[start:start + batch_size]
    batch_meta = metadata[start:start + batch_size]

    # Using Google's embedding model; with a list as `content`, the response
    # carries one embedding per document under the 'embedding' key.
    response = genai.embed_content(
        model="models/text-embedding-004",
        content=batch_docs,
        task_type="RETRIEVAL_DOCUMENT"
    )
    embeddings = response['embedding']

    collection.add(
        embeddings=embeddings,
        documents=batch_docs,
        metadatas=batch_meta,
        ids=batch_ids
    )
    print(f"Indexed batch {batch_number} of {total_batches}...")
    if batch_number < total_batches:
        time.sleep(1) # API rate limiting (no need to wait after the last batch)

print("Codebase successfully indexed in ChromaDB.")
item_count = collection.count()
print(f"The collection now has {item_count} items.")

Deleted existing collection: 'gocodebase_chunked'
Created a new collection: 'gocodebase_chunked'
Embedding and indexing the codebase... This may take a moment.
Indexed batch 1 of 3...
Indexed batch 2 of 3...
Indexed batch 3 of 3...
Codebase successfully indexed in ChromaDB.
The collection now has 107 items.


In [34]:
# Verify the contents of the ChromaDB collection.
client = chromadb.PersistentClient(path="./chroma_db")

# Check for the collection explicitly instead of relying on the exception type
# raised by get_collection(), which is not the same across chromadb versions.
# list_collections() may yield collection objects or plain names; accept both.
existing_names = [getattr(c, "name", c) for c in client.list_collections()]
if "gocodebase_chunked" not in existing_names:
    print("The collection 'gocodebase_chunked' does not exist. Please run the indexing cell first.")
else:
    collection = client.get_collection(name="gocodebase_chunked")
    item_count = collection.count()
    print(f"The collection '{collection.name}' has {item_count} items.")

    if item_count > 0:
        print("\nHere's a sample of the data in the collection:")
        # Peek at the first 2 items to ensure they look correct
        sample = collection.peek(limit=2)
        print(sample['documents'])

The collection 'gocodebase_chunked' has 107 items.

Here's a sample of the data in the collection:
['File: ../codebase/runMain.go\nType: struct\nName: Field\nFields:\n  Name string `json:"name"`\n  Type string `json:"type"`\n  Tag string `json:"tag,omitempty"`', 'File: ../codebase/runMain.go\nType: struct\nName: FunctionMetadata\nFields:\n  Name string `json:"name"`\n  Signature string `json:"signature"`\n  Parameters []Field `json:"parameters"`\n  Returns []Field `json:"returns"`\n  Body string `json:"body"`']


In [35]:
from IPython import embed
import google.generativeai as genai

# genai.configure(api_key=...) must already have run in an earlier cell.
# Listing the models doubles as an API-key check: per the original note,
# the call fails if the key is invalid.
print("Verifying API Key by listing available models:")
embedContentList = []
generateContentList = []
for model_info in genai.list_models():
    methods = model_info.supported_generation_methods
    # A model can support both methods, so the two checks are independent.
    if 'embedContent' in methods:
        embedContentList.append(model_info.name)
    if "generateContent" in methods:
        generateContentList.append(model_info.name)

print("\nembedContentList: ", embedContentList)
print("\ngenerateContentList: ", generateContentList)

Verifying API Key by listing available models:

embedContentList:  ['models/embedding-001', 'models/text-embedding-004', 'models/gemini-embedding-exp-03-07', 'models/gemini-embedding-exp', 'models/gemini-embedding-001']

generateContentList:  ['models/gemini-2.5-pro-preview-03-25', 'models/gemini-2.5-flash-preview-05-20', 'models/gemini-2.5-flash', 'models/gemini-2.5-flash-lite-preview-06-17', 'models/gemini-2.5-pro-preview-05-06', 'models/gemini-2.5-pro-preview-06-05', 'models/gemini-2.5-pro', 'models/gemini-2.0-flash-exp', 'models/gemini-2.0-flash', 'models/gemini-2.0-flash-001', 'models/gemini-2.0-flash-exp-image-generation', 'models/gemini-2.0-flash-lite-001', 'models/gemini-2.0-flash-lite', 'models/gemini-2.0-flash-preview-image-generation', 'models/gemini-2.0-flash-lite-preview-02-05', 'models/gemini-2.0-flash-lite-preview', 'models/gemini-2.0-pro-exp', 'models/gemini-2.0-pro-exp-02-05', 'models/gemini-exp-1206', 'models/gemini-2.0-flash-thinking-exp-01-21', 'models/gemini-2.0-fl

Take the request, find relevant snippets from Chroma, and pass them to the generative model

In [36]:

"""Take request, find relevent snippets from chroma nad pass to generative model"""

def query_rag(
    query: str,
    n_results: int = 5,
    embedding_model: str = "models/text-embedding-004",
    generation_model: str = "gemini-2.0-flash",
) -> str:
    """Performs the RAG process: query -> retrieve -> augment -> generate.

    Args:
        query: The user's natural-language change request; must not be blank.
        n_results: How many documents to retrieve from the collection (>= 1).
        embedding_model: Model used to embed the query. It should be the same
            model that was used when the documents were indexed.
        generation_model: Model used to generate the final answer.

    Returns:
        The text of the generated response.

    Raises:
        ValueError: If ``query`` is blank, ``n_results`` is below 1, or the
            model response contains no text.

    Note:
        Reads the module-level ``collection`` (ChromaDB) and the configured
        ``genai`` client, so the indexing/verification cells must run first.
    """
    # Fail fast, before spending any API calls, on input that cannot work.
    if not query or not query.strip():
        raise ValueError("query must be a non-empty string")
    if n_results < 1:
        raise ValueError(f"n_results must be at least 1, got {n_results}")

    # 1. Retrieve relevant code snippets
    query_embedding_response = genai.embed_content(
        model=embedding_model,
        content=query,
        task_type="RETRIEVAL_QUERY"
    )

    results = collection.query(
        query_embeddings=[query_embedding_response['embedding']],
        n_results=n_results
    )

    # One query embedding was sent, so the matches are in the first result list.
    retrieved_docs = results['documents'][0]
    context = "\n---\n".join(retrieved_docs)

    # 2. Augment: Create a prompt for the generative model
    prompt = f"""You are an expert Go programmer. Your task is to help a user modify their codebase.
Use the following relevant code snippets from the codebase as context to provide a complete and accurate answer.
Some snippets might be chunks of larger functions, indicated by 'Type: function_chunk'. Use the parent function signature to understand the context.

**CONTEXT FROM THE CODEBASE:**
---
{context}
---

**USER'S REQUEST:**
"{query}"

**YOUR TASK:**
Based on the user's request and the provided context, generate the necessary code changes.
- If a struct needs modification, show the new struct definition.
- If a function needs to be changed, provide the complete, updated function body.
- If new functions are needed, write them.
- Provide a brief, clear explanation of the changes you made.
- Present the final output in Go code blocks.
"""

    # 3. Generate the response
    model = genai.GenerativeModel(generation_model)
    response = model.generate_content(prompt)

    # `response.text` raises ValueError when the response holds no text part
    # (for example when the output was blocked); re-raise with more context.
    try:
        return response.text
    except ValueError as exc:
        feedback = getattr(response, "prompt_feedback", None)
        raise ValueError(
            f"The model returned no usable text (prompt feedback: {feedback})"
        ) from exc

Giving request


In [28]:
"""Giving request"""

from IPython.display import display, Markdown

# The change request, in plain English, that drives retrieval and generation.
user_request = """
I need to add a 'likes' count to the Chirp model.
It should be an integer and default to 0.

Then, update the 'handlerChirpsCreate' function. After creating a chirp,
the response should include this new 'likes' field.
"""

# Run the full retrieve -> augment -> generate pipeline for the request.
suggested_change = query_rag(user_request)

# Wrap the model's answer under a heading so it renders as one markdown block.
markdown_output = f"""
---
### SUGGESTED CODE CHANGE
---
{suggested_change}
"""

display(Markdown(markdown_output))


---
### SUGGESTED CODE CHANGE
---
```go
// File: ../internal/database/models.go
type Chirp struct {
	ID        uuid.UUID
	CreatedAt time.Time
	UpdatedAt time.Time
	Body      string
	UserID    uuid.UUID
	Likes     int // Added Likes field
}
```

```go
// File: ../handler_chirps_create.go
type Chirp struct {
	ID        uuid.UUID `json:"id"`
	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
	UserID    uuid.UUID `json:"user_id"`
	Body      string    `json:"body"`
	Likes     int       `json:"likes"` // Added Likes field
}
```

```go
// File: ../handler_chirps_get.go
Type: function
Signature: func handlerChirpsRetrieve(w http.ResponseWriter, r *http.Request)
Body: 
	dbChirps, err := cfg.db.GetChirps(r.Context())
	if err != nil {
		respondWithError(w, http.StatusInternalServerError, "Couldn't retrieve chirps", err)
		return
	}

	chirps := []Chirp{}
	for _, dbChirp := range dbChirps {
		chirps = append(chirps, Chirp{
			ID:        dbChirp.ID,
			CreatedAt: dbChirp.CreatedAt,
			UpdatedAt: dbChirp.UpdatedAt,
			UserID:    dbChirp.UserID,
			Body:      dbChirp.Body,
			Likes:     dbChirp.Likes,
		})
	}

	respondWithJSON(w, http.StatusOK, chirps)
}
```

```go
// File: ../handler_chirps_get.go
Type: function
Signature: func handlerChirpsGet(w http.ResponseWriter, r *http.Request)
Body: 
	chirpIDString := r.PathValue("chirpID")
	chirpID, err := uuid.Parse(chirpIDString)
	if err != nil {
		respondWithError(w, http.StatusBadRequest, "Invalid chirp ID", err)
		return
	}

	dbChirp, err := cfg.db.GetChirp(r.Context(), chirpID)
	if err != nil {
		respondWithError(w, http.StatusNotFound, "Couldn't get chirp", err)
		return
	}

	respondWithJSON(w, http.StatusOK, Chirp{
		ID:        dbChirp.ID,
		CreatedAt: dbChirp.CreatedAt,
		UpdatedAt: dbChirp.UpdatedAt,
		UserID:    dbChirp.UserID,
		Body:      dbChirp.Body,
		Likes:     dbChirp.Likes,
	})
}
```

```go
// File: ../handler_chirps_create.go
Type: function_chunk
Parent Function: func handlerChirpsCreate(w http.ResponseWriter, r *http.Request)
Chunk 2:
---
		respondWithError(w, http.StatusInternalServerError, "Couldn't create chirp", err)
		return
	}

	respondWithJSON(w, http.StatusCreated, Chirp{
		ID:        chirp.ID,
		CreatedAt: chirp.CreatedAt,
		UpdatedAt: chirp.UpdatedAt,
		Body:      chirp.Body,
		UserID:    chirp.UserID,
		Likes:     chirp.Likes, // Include the likes field in the response
	})
}
```

**Explanation:**

1.  **Added `Likes` field to `Chirp` struct in `internal/database/models.go`:** This adds the `Likes` field (an integer) to the database model. It will default to 0 upon creation.
2.  **Added `Likes` field to `Chirp` struct in `handler_chirps_create.go`:** This adds the `Likes` field to the response struct, ensuring it is included in the JSON response.
3.  **Modified `handlerChirpsCreate` to include `Likes` in response:** This makes sure that when a chirp is created, the response includes the (default 0) value of `Likes`.
4.  **Modified `handlerChirpsRetrieve` and `handlerChirpsGet` to include `Likes` in response:** This adds the `Likes` field when retrieving single or multiple chirps.



---
### SUGGESTED CODE CHANGE
---
```go
// File: ../internal/database/models.go
type Chirp struct {
	ID        uuid.UUID
	CreatedAt time.Time
	UpdatedAt time.Time
	Body      string
	UserID    uuid.UUID
	Likes     int // Added Likes field
}
```

```go
// File: ../handler_chirps_create.go
type Chirp struct {
	ID        uuid.UUID `json:"id"`
	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
	UserID    uuid.UUID `json:"user_id"`
	Body      string `json:"body"`
	Likes     int       `json:"likes"` // Added Likes field
}
```

```go
// File: ../handler_chirps_get.go
func handlerChirpsRetrieve(w http.ResponseWriter, r *http.Request) {
	dbChirps, err := cfg.db.GetChirps(r.Context())
	if err != nil {
		respondWithError(w, http.StatusInternalServerError, "Couldn't retrieve chirps", err)
		return
	}

	chirps := []Chirp{}
	for _, dbChirp := range dbChirps {
		chirps = append(chirps, Chirp{
			ID:        dbChirp.ID,
			CreatedAt: dbChirp.CreatedAt,
			UpdatedAt: dbChirp.UpdatedAt,
			UserID:    dbChirp.UserID,
			Body:      dbChirp.Body,
			Likes:     dbChirp.Likes, // Populating the Likes field
		})
	}

	respondWithJSON(w, http.StatusOK, chirps)
}
```

```go
// File: ../handler_chirps_get.go
func handlerChirpsGet(w http.ResponseWriter, r *http.Request) {
	chirpIDString := r.PathValue("chirpID")
	chirpID, err := uuid.Parse(chirpIDString)
	if err != nil {
		respondWithError(w, http.StatusBadRequest, "Invalid chirp ID", err)
		return
	}

	dbChirp, err := cfg.db.GetChirp(r.Context(), chirpID)
	if err != nil {
		respondWithError(w, http.StatusNotFound, "Couldn't get chirp", err)
		return
	}

	respondWithJSON(w, http.StatusOK, Chirp{
		ID:        dbChirp.ID,
		CreatedAt: dbChirp.CreatedAt,
		UpdatedAt: dbChirp.UpdatedAt,
		UserID:    dbChirp.UserID,
		Body:      dbChirp.Body,
		Likes:     dbChirp.Likes, // Populating the Likes field
	})
}
```

**Explanation:**

1.  **Modified `Chirp` struct in `internal/database/models.go`:** Added the `Likes int` field to the `Chirp` struct. This field will store the number of likes for a chirp. The default value for `int` will be 0.
2.  **Modified `Chirp` struct in `handler_chirps_create.go`:** Added the `Likes int \`json:"likes"\`` field to the `Chirp` struct. This ensures the Likes field is also returned when creating a new chirp.
3.  **Modified `handlerChirpsRetrieve` and `handlerChirpsGet` functions:** The handler functions now also return the Likes field.

