In [1]:
import os

# Fill the environmental variable with your own OpenAI API key
# See: https://platform.openai.com/account/api-keys
os.environ["OPENAI_API_KEY"] = "x"

In [2]:
import qdrant_client

client = qdrant_client.QdrantClient("http://localhost:6333", prefer_grpc=True)
client.get_collections()

CollectionsResponse(collections=[])

In [3]:
COLLECTION_NAME="repo-embeddings"

In [4]:
from qdrant_client.http import models as rest

vector_size = 1536

client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={
        "content": rest.VectorParams(
            distance=rest.Distance.COSINE,
            size=vector_size,
        ),
    }
)

True

In [5]:
import pandas as pd

df = pd.read_parquet("../output.parquet")

In [6]:
from langchain_openai import OpenAIEmbeddings
import asyncio

In [7]:
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
async def generate_embeddings_batch(texts):
    return embeddings.embed_documents(texts)

In [8]:

import numpy as np

# Split the dataframe into batches
chunk_size = 20

In [9]:
import tqdm
import asyncio
from tqdm.asyncio import tqdm
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, Payload, Filter
import time
import uuid
import json

In [10]:
async def add_to_client(sdf):
    for _, row in sdf.iterrows():
        row = row.to_dict()
        formatted_output = {
            "message": str(row.get("message", "")),
            "author_email": str(row.get("author_email", "")),
            "hash": str(row.get("hash", "")),
            "bug_spot_likelihood": str(row.get("bug_spot_likelihood", "")),
            "commit_timestamp": str(row.get("commit_timestamp", "")),
            "filename": str(row.get("filename", "")),
        }
        formatted_output = json.dumps(formatted_output)
        embeddings = await generate_embeddings_batch([formatted_output])
        points=[
            rest.PointStruct(
                id=str(uuid.uuid4()),
                vector={"content": embedding},
                payload={"output": formatted_output, "hash": str(row.get("hash", ""))}
            ) 
            for embedding in embeddings
        ]
        client.upsert(collection_name=COLLECTION_NAME, points=points)
    print(f"processed:#{sdf.head(1).index} to: #{sdf.tail(1).index}")

async def process():
    semaphore = asyncio.Semaphore(3)  # Limit to 3 concurrent tasks
    tasks = []

    async def sem_task(task):
        async with semaphore:
            return await task

    for i in range(0, len(df), chunk_size):
        task = add_to_client(df[i:i+chunk_size])
        tasks.append(sem_task(task))

    for task in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
        await task

In [11]:
await process()

  0%|                                                                                                                                                               | 0/50 [00:00<?, ?it/s]

processed:#RangeIndex(start=520, stop=521, step=1) to: #RangeIndex(start=539, stop=540, step=1)
processed:#RangeIndex(start=180, stop=181, step=1) to: #RangeIndex(start=199, stop=200, step=1)
processed:#RangeIndex(start=860, stop=861, step=1) to: #RangeIndex(start=879, stop=880, step=1)
processed:#RangeIndex(start=700, stop=701, step=1) to: #RangeIndex(start=719, stop=720, step=1)
processed:#RangeIndex(start=360, stop=361, step=1) to: #RangeIndex(start=379, stop=380, step=1)
