# Data embedding and vector uploading
`create a queue of the file names -> spawn 10 threads -> each thread takes a file from the queue -> builds the proper data structure/dataframe from the JSON file -> sends it to the inference API or runs the transformer locally to get embeddings -> takes the resulting embeddings -> builds the data structure required to upload to a vector database (Pinecone), uploading in chunks of 100 vectors as per the Pinecone documentation -> when finished with a file, writes an entry to the log.`

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# API credentials read from the environment; os.getenv returns None when a
# variable is not set.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

In [None]:
import pinecone

# Configure the Pinecone client with the API key read from the environment.
# NOTE(review): str() turns a missing key (None) into the literal string
# 'None', so an unset PINECONE_API_KEY is not caught here — confirm the
# variable is set before running this cell.
pinecone.init(
    api_key=str(PINECONE_API_KEY),
    environment='gcp-starter'
)

#### In case we need to delete and recreate the index to start fresh

In [None]:
# pinecone.delete_index('review-owl')
# pinecone.create_index('review-owl', dimension=384, metric='euclidean', pods=1, pod_type='starter')

In [None]:
# Value passed to the Pinecone index client as `pool_threads`.
PINECONE_POOL_THREADS = 30
# Handle to the 'review-owl' index; the worker threads upsert through this
# module-level object.
index = pinecone.Index('review-owl', pool_threads=PINECONE_POOL_THREADS)
# Last expression of the cell, so its result is shown as the cell output.
index.describe_index_stats()

In [None]:
from sentence_transformers import SentenceTransformer
# Local embedding model used by embed_file().
# NOTE(review): the commented-out create_index call in this notebook uses
# dimension=384 — confirm that matches the output size of whichever model is
# enabled here before switching models.
# model = SentenceTransformer('BAAI/bge-large-en-v1.5')
model = SentenceTransformer('BAAI/bge-small-en-v1.5')

# Alternative embedding model to consider:
# model = SentenceTransformer('BAAI/llm-embedder')

In [None]:
# importing a specific repo dataset by finding all the files starting with the index number and a dash
import glob

# getting the index number from the file name
def get_index_number(file):
    """Return the integer index that prefixes a dataset file name.

    Files are expected to be named ``<index>-<rest>`` (for example
    ``12-some-repo.json``). Only the last path component is inspected, and
    both forward slashes and backslashes are treated as separators, so the
    helper works for Windows and POSIX style paths at any directory depth.

    Args:
        file: path (or bare file name) of a dataset file.

    Returns:
        The leading index as an ``int``.

    Raises:
        ValueError: if the file name does not start with an integer followed
            by a dash.
    """
    # Normalise separators by hand rather than relying on os.sep: glob can
    # return mixed separators on Windows (pattern uses '/', match uses '\').
    file_name = file.replace('\\', '/').rsplit('/', 1)[-1]
    return int(file_name.split('-')[0])

def import_filepaths(folder_path: str):
    """Return the paths matching a glob pattern, ordered by index number.

    `folder_path` is handed straight to `glob.glob`, so it is a glob pattern
    (e.g. ``some/dir/*.json``). The matches are ordered by the integer that
    `get_index_number` extracts from each path.
    """
    matches = glob.glob(folder_path)
    # Order numerically by the index prefix instead of lexicographically.
    matches.sort(key=get_index_number)
    return matches

In [None]:
from tqdm.auto import tqdm
import pandas as pd
import json
import requests
import time
import random
import itertools

# Hugging Face Inference API endpoint used by huggingface_inference_api_request().
# NOTE(review): the active endpoint is bge-large while the locally loaded
# `model` is bge-small — confirm embeddings from the two are never mixed in
# the same index.
# API_URL = "https://api-inference.huggingface.co/models/BAAI/bge-base-en-v1.5"
API_URL = "https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5"
# API_URL = "https://api-inference.huggingface.co/models/BAAI/bge-small-en-v1.5"
# API_URL = "https://api-inference.huggingface.co/models/BAAI/llm-embedder"
# Bearer-token header sent with every inference request.
headers = {"Authorization": f"Bearer {HUGGINGFACE_API_KEY}"}

def file_to_embedding_inputs(file_path: str):
    """Build one embedding input string per entry of a JSON file.

    The file is read with ``orient='index'`` and the first column is walked
    entry by entry; each entry's ``path`` and ``diff_hunk`` fields are joined
    into a single labelled text block.

    Returns:
        A list of strings of the form ``"path: <path>\\ndiff_hunk: <hunk>"``.
    """
    frame = pd.read_json(file_path, orient='index')
    first_column = frame.iloc[:, 0]
    progress = tqdm(first_column, 'Splitting file into payloads', position=2)

    return [
        'path: ' + entry['path'] + '\n' + 'diff_hunk: ' + entry['diff_hunk']
        for entry in progress
    ]

def huggingface_inference_api_request(payload: list[str]):
    """POST `payload` to the Hugging Face Inference API and return the decoded JSON.

    While the API answers with a non-200 status whose body contains
    'is currently loading', the body is printed and the request is repeated
    after a 20 second pause.

    Failures are raised rather than returned, so callers (such as the retry
    wrapper) can tell a failed request apart from a real result.

    Args:
        payload: the texts to embed; serialised to JSON as the request body.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: if the final response has a 4xx/5xx status.
        requests.RequestException: on connection-level failures.
        ValueError: if the response body is not valid JSON.
    """
    data = json.dumps(payload)
    response = requests.request("POST", API_URL, headers=headers, data=data)

    # The hosted model may still be loading; poll until it is ready.
    while response.status_code != 200 and 'is currently loading' in response.content.decode("utf-8"):
        print(response.content.decode("utf-8"))
        time.sleep(20)
        response = requests.request("POST", API_URL, headers=headers, data=data)

    # Turn HTTP error statuses into exceptions instead of returning the
    # error body as if it were a list of embeddings.
    response.raise_for_status()
    return json.loads(response.content.decode("utf-8"))

def make_huggingface_request_with_backoff(payload: list[str]):
    """Call the Hugging Face Inference API, retrying on transient failures.

    Up to 10 attempts are made. An HTTP 503 is retried after an exponential
    backoff with a small random jitter; any other HTTP error is re-raised
    immediately; other exceptions are printed and retried straight away.

    Args:
        payload: the texts to embed.

    Returns:
        Whatever `huggingface_inference_api_request` returns on success.

    Raises:
        requests.HTTPError: for HTTP errors other than 503.
        Exception: "Max retries exceeded" once all attempts have failed; the
            last underlying error is attached as its cause.
    """
    max_retries = 10
    retries = 0
    last_error = None

    while retries < max_retries:
        try:
            response = huggingface_inference_api_request(payload)
            # Guard against the helper handing back an exception object
            # instead of raising it: that is a failure, not a result.
            if isinstance(response, Exception):
                raise response
            return response
        except requests.HTTPError as e:
            # e.response can be None when the error was built without one.
            status_code = e.response.status_code if e.response is not None else None
            print('HTTP error: ' + str(status_code))
            if status_code != 503:
                print('Error: ' + str(e))
                # If it's not a 503 error, re-raise the exception
                raise
            # Handle rate-limiting error
            last_error = e
            wait_time = (2 ** retries) + (random.uniform(0, 1) * 0.1)  # Exponential backoff with random jitter
            time.sleep(wait_time)
            retries += 1
        except Exception as e:
            # Handle other exceptions
            print(e)
            last_error = e
            retries += 1

    # If all retries fail
    raise Exception("Max retries exceeded") from last_error

In [None]:
def split_into_chunks(iterable: list, batch_size=100):
    """Break an iterable into consecutive lists of at most `batch_size` items.

    Works for any iterable, including generators and other inputs without a
    length; order is preserved and only the final chunk may be shorter.

    Args:
        iterable: the items to split.
        batch_size: maximum number of items per chunk (default 100).

    Returns:
        A list of lists; empty when `iterable` yields nothing.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    it = iter(iterable)
    chunks = []
    # islice returns an empty slice once the iterator is exhausted, which
    # ends the loop without needing len(iterable).
    while chunk := list(itertools.islice(it, batch_size)):
        chunks.append(chunk)

    return chunks

def pinecone_upsert_request(index, payload):
    """Upsert `payload` into `index` and return the client's response.

    The previous body posted the payload to the Hugging Face inference
    endpoint and never touched `index`; the upsert now goes to the index that
    was passed in.

    Args:
        index: an object exposing ``upsert(vectors)``, such as a Pinecone
            index handle.
        payload: the vectors to upsert, in the form accepted by
            ``index.upsert``.

    Returns:
        Whatever ``index.upsert`` returns.

    Raises:
        Any exception raised by ``index.upsert`` propagates unchanged so that
        callers can decide whether to retry.
    """
    return index.upsert(payload)

def make_pinecone_upsert_with_backoff(index, payload):
    """Upsert `payload` through `pinecone_upsert_request`, retrying on failure.

    Up to 3 attempts are made. An HTTP 503 is retried after an exponential
    backoff with a small random jitter; any other HTTP error is re-raised
    immediately; other exceptions are printed and retried straight away.

    Args:
        index: the index handle forwarded to `pinecone_upsert_request`.
        payload: the vectors to upsert.

    Returns:
        Whatever `pinecone_upsert_request` returns on success.

    Raises:
        requests.HTTPError: for HTTP errors other than 503.
        Exception: "Max retries exceeded" once all attempts have failed; the
            last underlying error is attached as its cause.
    """
    max_retries = 3
    retries = 0
    last_error = None

    while retries < max_retries:
        try:
            pinecone_responses = pinecone_upsert_request(index, payload)
            # Guard against the helper handing back an exception object
            # instead of raising it: that is a failure, not a result.
            if isinstance(pinecone_responses, Exception):
                raise pinecone_responses
            return pinecone_responses
        except requests.HTTPError as e:
            # e.response can be None when the error was built without one.
            status_code = e.response.status_code if e.response is not None else None
            if status_code != 503:
                # If it's not a 503 error, re-raise the exception
                raise
            # Handle rate-limiting error
            last_error = e
            wait_time = (2 ** retries) + (random.uniform(0, 1) * 0.1)  # Exponential backoff with random jitter
            time.sleep(wait_time)
            retries += 1
        except Exception as e:
            # Report the failure instead of dropping it silently, then retry.
            print(e)
            last_error = e
            retries += 1

    # If all retries fail
    raise Exception("Max retries exceeded") from last_error

In [None]:
import threading
import queue
import hashlib

def hash_id(id):
    """Return the hexadecimal SHA-256 digest of `id` encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(id.encode('utf-8'))
    return digest.hexdigest()

def embed_file(file_path: str):
    """Embed every input string derived from `file_path` with the local model.

    The inputs come from `file_to_embedding_inputs`; each one is encoded on
    its own with the module-level `model` and converted to a plain list.

    Returns:
        A list with one embedding (as a list) per input string, in input order.
    """
    texts = file_to_embedding_inputs(file_path)
    progress = tqdm(texts, 'Generating vector embeddings', position=1)

    # Remote alternative: make_huggingface_request_with_backoff(text)
    return [model.encode(text).tolist() for text in progress]

def generate_metadata(file_path: str):
    """Build the metadata dict stored alongside each vector of a file.

    The file is read with ``orient='index'``; the label of the first column
    is used as the ``repo`` value, and each entry of that column supplies
    ``path``, ``diff`` (from ``diff_hunk``) and ``body``.

    Returns:
        A list of dicts with the keys ``repo``, ``path``, ``diff`` and ``body``.
    """
    frame = pd.read_json(file_path, orient='index')
    entries = frame.iloc[:, 0]
    repo = frame.columns[0]

    return [
        {'repo': repo, 'path': entry['path'], 'diff': entry['diff_hunk'], 'body': entry['body']}
        for entry in tqdm(entries, 'Splitting file into payloads', position=2)
    ]

def generate_vector_ids(metadata_list: list[dict]):
    """Create one vector id per metadata entry.

    Each id is the entry's ``repo`` value, a dash, and the SHA-256 hex digest
    of the entry's position in the list (as a decimal string).
    """
    positions = tqdm(range(len(metadata_list)), 'Generating vector ids', position=2)
    return [
        metadata_list[position]['repo'] + '-' + hash_id(str(position))
        for position in positions
    ]

def generate_upsert_data(file_path: str):
    """Assemble the ``(id, embedding, metadata)`` tuples for one file.

    Embeddings, metadata and ids are produced in that order and then zipped
    together position by position.

    Returns:
        A list of ``(vector_id, embedding, metadata)`` tuples.
    """
    vectors = embed_file(file_path)
    metadata = generate_metadata(file_path)
    ids = generate_vector_ids(metadata)

    return [*zip(ids, vectors, metadata)]

def worker(file_queue: queue.Queue):
    """Process file paths from `file_queue` until a ``None`` sentinel arrives.

    For each path the upsert data is generated, split into chunks and
    uploaded chunk by chunk through the module-level `index`.

    Every item taken from the queue, including the sentinel and items whose
    processing fails, is acknowledged with ``task_done()``. A failure is
    printed and the worker moves on to the next file. Without that
    acknowledgement a single error would end the thread and leave any
    ``file_queue.join()`` waiting forever.

    Args:
        file_queue: queue of file paths, terminated by ``None``.
    """
    while True:
        file_path = file_queue.get()
        try:
            if file_path is None:
                break

            upsert_data = generate_upsert_data(file_path)

            upsert_data_chunks = split_into_chunks(upsert_data)

            for chunk in tqdm(upsert_data_chunks, 'Uploading data to Pinecone', position=1):
                index.upsert(chunk)
            print(f"Logging: {file_path} processed")
        except Exception as e:
            # Keep the thread alive so the remaining files still get processed.
            print(f"Logging: {file_path} failed: {e}")
        finally:
            # Runs on success, on failure and on the sentinel 'break'.
            file_queue.task_done()

# Function to create and manage worker threads
def create_worker_threads(num_threads: int, file_queue: queue.Queue):
    """Start `num_threads` threads running `worker` on `file_queue`.

    Returns:
        The list of started `threading.Thread` objects, in creation order.
    """
    def _start_worker() -> threading.Thread:
        # Create one worker thread and start it right away.
        spawned = threading.Thread(target=worker, args=(file_queue,))
        spawned.start()
        return spawned

    return [_start_worker() for _ in range(num_threads)]

# Main function to process files using multiple threads
def process_files_with_threads(file_names: list[str], num_threads: int, start: int = 0, end=None):
    """Process ``file_names[start:end]`` with a pool of worker threads.

    The selected file names are placed on a queue, `num_threads` workers are
    started via `create_worker_threads`, and the call blocks until every
    queued file has been acknowledged. Each worker is then sent a ``None``
    sentinel and joined.

    Args:
        file_names: all candidate file paths.
        num_threads: number of worker threads to start; must be at least 1.
        start: index of the first file to process (default 0).
        end: index one past the last file to process; ``None`` means through
            the end of the list.

    Raises:
        ValueError: if `num_threads` is smaller than 1. With no workers,
            nothing would ever drain the queue and the wait below would
            never return.
    """
    if num_threads < 1:
        raise ValueError("num_threads must be at least 1")

    # Create a thread-safe queue
    file_queue = queue.Queue()

    # Populate the queue with file names
    for file_name in file_names[start:end]:
        file_queue.put(file_name)

    # Create worker threads
    threads = create_worker_threads(num_threads, file_queue)

    # Wait for all file processing to be completed
    file_queue.join()

    # Stop the worker threads: one sentinel per worker
    for _ in range(num_threads):
        file_queue.put(None)

    for thread in threads:
        thread.join()

    print("All files processed.")

In [None]:
# For testing
# file_paths = import_filepaths('dataset/mined-comments-25stars-25prs-JavaScript.json/repo-split/*.json')
# upsert_data = generate_upsert_data(file_paths[3500])
# upsert_data_chunks = split_into_chunks(upsert_data)
# len(upsert_data)

In [None]:
# Collect the JSON files matching the pattern, ordered by their numeric
# prefix, then process the files from position 6000 onwards with two worker
# threads.
file_paths = import_filepaths('dataset/mined-comments-25stars-25prs-JavaScript.json/repo-split/*.json')
process_files_with_threads(file_names=file_paths, num_threads=2, start=6000)