In [1]:
import os

# Fill the environmental variable with your own OpenAI API key
# See: https://platform.openai.com/account/api-keys
os.environ["OPENAI_API_KEY"] = "x"

In [2]:
import qdrant_client

client = qdrant_client.QdrantClient("http://localhost:6333", prefer_grpc=True)
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='repo-embeddings')])

In [3]:
import pandas as pd

df = pd.read_parquet("../output.parquet")

In [4]:
print(df.describe())

                  filename                                            message  \
count              7737054                                            7737054   
unique                8005                                               1000   
top     b'.browserslistrc'  b'Update dependency doorkeeper to v5.6.8 (#281...   
freq                  1000                                               7824   

                                             author_email  \
count                                             7737054   
unique                                                 72   
top     b'29139614+renovate[bot]@users.noreply.github....   
freq                                              1764394   

                                               hash bug_spot_likelihood  \
count                                       7737054             7737054   
unique                                         1000                 212   
top     b'456597dae5251af841e46ab0608e0d44a7de1197'            

In [5]:
COLLECTION_NAME="repo-embeddings"

In [6]:
from qdrant_client.http import models as rest

vector_size = 1536

client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={
        "content": rest.VectorParams(
            distance=rest.Distance.COSINE,
            size=vector_size,
        ),
    }
)

True

In [7]:
chunk_size = 100

In [8]:
from openai import OpenAI
oaclient = OpenAI()

In [9]:
import asyncio
from tqdm.asyncio import tqdm
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, Payload, Filter
import time
import uuid

def get_embeddings(text, model="text-embedding-ada-002", encoding_format="float"):
   text = text.replace("\n", " ")
   time.sleep(0.5) # Prevent getting rate limited 
   return oaclient.embeddings.create(input = [text], model=model, encoding_format=encoding_format).data

async def add_to_client(sdf):
    for _, row in sdf.iterrows():
        formatted_output = f"""
        filename: {row['filename']},
        message: {row['message']},
        author_email: {row['author_email']},
        hash: {row['hash']},
        bug_spot_likelihood: {row['bug_spot_likelihood']},
        commit_timestamp: {row['commit_timestamp']}
        """
        embeddings = get_embeddings(formatted_output)
        points=[
            rest.PointStruct(
                id=str(uuid.uuid4()),
                vector={"content": embedding.embedding},
                payload={f"#{str(row['hash'])}_#{str(row['filename'])}": formatted_output}
            ) 
            for embedding in embeddings
        ]
        client.upsert(collection_name=COLLECTION_NAME, points=points)
    print(f"processed:#{sdf.head(1).index} to: #{sdf.tail(1).index}")

async def process():
    tasks = [add_to_client(df[i:i+chunk_size]) for i in range(0, len(df), chunk_size)]
    for task in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
        await task


In [10]:
#await process()

In [11]:
TOP_N_RESULTS = 100
SCORE_THRESHOLD = 0.700
def rag(question: str, n_points: int = 30) -> str:
    embeddings = get_embeddings(question)
    embedded_query = embeddings[0].embedding

    results = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=(
            "content", embedded_query
        ),
        limit=n_points,
    )
    results = [result for result in results if round(result.score, 3) >= SCORE_THRESHOLD]

    context = "\n".join(str(r) for r in results[:TOP_N_RESULTS])

    metaprompt = f"""
    You are a software architect. 
    Answer the following question using the provided context. 
    If you can't find the answer, do not pretend you know it, but answer "I don't know".
    
    Question: {question.strip()}
    
    Context: you have the following list of Git Commits: {context.strip()}.
    
    Answer:
    """
    completion = oaclient.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": metaprompt},
        ],
        timeout=10.0,
    )
    return completion.choices[0].message.content

In [13]:
rag("Give me a summary about the context", n_points=1000)

'The context does not provide any specific information or details about a list of Git commits.'