# Data embedding and vector uploading
`Create a queue of the file names -> spawn 10 threads -> each thread takes a file from the queue -> builds the proper data structure/DataFrame from the JSON file -> sends it to the inference API or runs the transformer locally to get the embeddings -> takes the resulting embeddings -> builds the proper data structure to upload to a vector database (Pinecone), uploading in chunks of 100 vectors as per the Pinecone documentation -> when finished with a file, records it in a log.`

In [2]:
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv before reading the keys below
load_dotenv()

# API credentials read from the process environment (os.getenv yields None when a variable is unset)
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

In [3]:
import pinecone

# Configure the Pinecone client with the API key and the target environment.
# NOTE(review): str() turns an unset key (None) into the literal string 'None',
# so a missing PINECONE_API_KEY is not detected at this point — confirm this is intended.
pinecone.init(
    api_key=str(PINECONE_API_KEY),
    environment='gcp-starter'
)

#### In case we need to delete and recreate the index to start fresh

In [4]:
# pinecone.delete_index('review-owl')
# pinecone.create_index('review-owl', dimension=384, metric='euclidean', pods=1, pod_type='starter')

In [5]:
# Value handed to the Pinecone index client as its pool_threads setting
PINECONE_POOL_THREADS = 30
# Handle to the 'review-owl' index; the upload worker calls index.upsert on it
index = pinecone.Index('review-owl', pool_threads=PINECONE_POOL_THREADS)
# Display the current index statistics
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [6]:
from sentence_transformers import SentenceTransformer
# Local sentence-transformers model (small variant active, large variant kept for reference).
# NOTE(review): the only use of `model` visible in this notebook is a commented-out
# path in embed_file — confirm whether loading it is still needed.
# model = SentenceTransformer('BAAI/bge-large-en-v1.5')
model = SentenceTransformer('BAAI/bge-small-en-v1.5')

# alternative embedding model to consider
# model = SentenceTransformer('BAAI/llm-embedder')

In [7]:
# importing a specific repo dataset by finding all the files starting with the index number and a dash
import glob

# getting the index number from the file name
def get_index_number(file):
    """Return the numeric index prefix of a dataset file name.

    The file name (the last path component) is expected to look like
    ``<index>-<rest>``. Forward slashes and backslashes are both treated as
    directory separators, so paths produced by glob on Windows and on POSIX
    systems are handled the same way.

    Args:
        file: Path of a dataset file.

    Returns:
        The integer that precedes the first dash of the file name.

    Raises:
        ValueError: If the file name does not start with an integer.
    """
    # Normalise separators first, then keep only the last path component;
    # directory names may themselves contain dashes.
    file_name = file.replace('\\', '/').rsplit('/', 1)[-1]
    return int(file_name.split('-', 1)[0])

def import_filepaths(folder_path: str):
    """Return the paths matching a glob pattern, ordered by index number.

    Args:
        folder_path: Glob pattern selecting the dataset files.

    Returns:
        The matching paths sorted in ascending order of the value returned
        by get_index_number for each path.
    """
    matched_paths = glob.glob(folder_path)
    # glob gives no ordering guarantee, so order explicitly by the numeric prefix
    matched_paths.sort(key=get_index_number)
    return matched_paths

In [8]:
from tqdm.auto import tqdm
import pandas as pd
import json
import requests
import time
import random
import itertools
import logging

# Send log records of level INFO and above to embedding_progress.log, timestamped
logging.basicConfig(filename='embedding_progress.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Inference endpoint used for embedding requests; the commented-out URLs are alternative models
# API_URL = "https://api-inference.huggingface.co/models/BAAI/bge-base-en-v1.5"
# API_URL = "https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5"
API_URL = "https://api-inference.huggingface.co/models/BAAI/bge-small-en-v1.5"
# API_URL = "https://api-inference.huggingface.co/models/BAAI/llm-embedder"
# Bearer-token authorization header sent with every inference request
headers = {"Authorization": f"Bearer {HUGGINGFACE_API_KEY}"}

def file_to_embedding_inputs(file_path: str):
    """Build one embedding input string per entry of a dataset file.

    The file is read with pandas using orient='index'; every cell of the
    first column is indexed with the 'path' and 'diff_hunk' keys.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list of strings of the form
        'path: <path>' + newline + 'diff_hunk: <diff_hunk>', in file order.
    """
    frame = pd.read_json(file_path, orient='index')
    first_column = frame.iloc[:, 0]

    return [
        'path: ' + entry['path'] + '\n' + 'diff_hunk: ' + entry['diff_hunk']
        for entry in tqdm(first_column, 'Splitting file into payloads')
    ]

def huggingface_inference_api_request(payload: list[str], payload_size: int = 1000):
    """Embed texts through the HuggingFace inference API, slice by slice.

    The payload is sent in slices of ``payload_size`` texts. While the API
    answers that the model "is currently loading", the same slice is re-sent
    every 30 seconds.

    Args:
        payload: Texts to embed.
        payload_size: Maximum number of texts sent per request.

    Returns:
        The embeddings of all slices concatenated in payload order. On ANY
        failure the exception instance is printed and *returned* (not
        raised), so callers must check ``isinstance(result, Exception)``.
        Possible failures include a rate-limit response
        (``Exception('Rate limit reached')``), a non-success HTTP status
        (``requests.HTTPError``) and a response body that is not a JSON list.
    """
    try:
        rejoined_list = []
        for i in tqdm(range(0, len(payload), payload_size), 'Making requests to HuggingFace API'):
            payload_slice = payload[i:i + payload_size]
            data = json.dumps(payload_slice)

            while True:
                response = requests.request("POST", API_URL, headers=headers, data=data)
                body = response.content.decode("utf-8")

                # Checked on every attempt, including retries made after the loading wait
                if 'Rate limit reached' in body:
                    raise Exception('Rate limit reached')
                if response.status_code != 200 and 'is currently loading' in body:
                    logging.info(f'Waiting for 30 seconds. Reponse: {body}')
                    time.sleep(30)
                    continue
                break

            # Any other error status must not be parsed as embeddings: an error
            # payload is a JSON object and iterating it would append its keys.
            response.raise_for_status()

            embeddings = json.loads(body)
            if not isinstance(embeddings, list):
                raise ValueError(f'Unexpected response from the inference API: {body[:200]}')
            rejoined_list.extend(embeddings)

        return rejoined_list
    except Exception as e:
        print(e)
        return e

def make_huggingface_request_with_backoff(payload: list[str]):
    """Request embeddings for a payload, retrying on transient failures.

    The payload is sent through huggingface_inference_api_request in slices
    of 200 texts. Failures are handled as follows:

    * HTTP 503: wait with exponential backoff plus jitter, then retry.
    * Any other HTTP error: printed and re-raised immediately.
    * 'Rate limit reached': wait 10 minutes and restart the retry budget
      (so a persistent rate limit is waited out indefinitely).
    * Anything else: logged and retried immediately.

    Args:
        payload: Texts to embed.

    Returns:
        The list of embeddings returned by the inference request.

    Raises:
        requests.HTTPError: For HTTP errors other than 503.
        Exception: "Max retries exceeded" after 10 failed attempts.
    """
    max_retries = 10
    retries = 0

    while retries < max_retries:
        try:
            response = huggingface_inference_api_request(payload, 200)
            logging.debug(type(response))
            # The request helper reports failures by returning the exception
            # instance, so it must be detected with isinstance and re-raised
            # to reach the handlers below.
            if isinstance(response, Exception):
                raise response
            return response
        except requests.HTTPError as e:
            # e.response may be None when the error was not built from a response
            status_code = e.response.status_code if e.response is not None else None
            print('HTTP error: ' + str(status_code))
            if status_code == 503:
                wait_time = (2 ** retries) + (random.uniform(0, 1) * 0.1)  # Exponential backoff with random jitter
                time.sleep(wait_time)
                retries += 1
            else:
                print('Error: ' + str(e))
                raise
        except Exception as e:
            # Compare the message, not the exception object, with the marker text
            if str(e) == 'Rate limit reached':
                print('Rate limit reached, waiting for 10 minutes')
                time.sleep(60 * 10)  # wait for 10 minutes to try again
                retries = 0
                continue
            logging.error(e)
            retries += 1

    logging.error('Max retries exceeded')
    raise Exception("Max retries exceeded")

In [9]:
def split_into_chunks(iterable: list, batch_size=100):
    """Split a sized iterable into consecutive lists of at most batch_size items.

    Args:
        iterable: Items to split; its length must be available through len().
        batch_size: Maximum number of items per chunk.

    Returns:
        A list of lists that together contain every item in the original
        order; only the last chunk may be shorter than batch_size.
    """
    source = iter(iterable)
    chunk_starts = range(0, len(iterable), batch_size)
    # Each islice call consumes the next batch_size items from the shared iterator
    return [list(itertools.islice(source, batch_size)) for _ in chunk_starts]

In [26]:
import threading
import queue
import hashlib

def hash_id(id):
    """Return the hexadecimal SHA-256 digest of a string encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(id.encode('utf-8'))
    return digest.hexdigest()

def embed_file(file_path: str):
    """Compute the embeddings for every entry of a dataset file.

    Args:
        file_path: Path of the JSON file to embed.

    Returns:
        Whatever make_huggingface_request_with_backoff returns for the
        embedding inputs built from the file.
    """
    texts = file_to_embedding_inputs(file_path)

    # Disabled alternative: embed locally, one payload at a time, with
    # model.encode(payload).tolist() instead of calling the inference API.
    return make_huggingface_request_with_backoff(texts)

def measure_metadata_size(metadata_list: list[dict]):
    """Return, for each metadata dict, the length of its str() rendering."""
    rendered = map(str, metadata_list)
    return list(map(len, rendered))

def truncate_metadata(metadata: dict, size: int):
    """Shrink a metadata dict so that its field lengths sum to at most size.

    Lengths are measured as ``len(str(value))`` of the 'repo', 'path', 'diff'
    and 'body' fields. Exactly one of three strategies is applied:

    1. If everything except the diff fits, the diff is cut to the remaining
       budget.
    2. Otherwise, if everything except the body fits, the body is cut to the
       remaining budget.
    3. Otherwise neither cut is enough on its own: both 'diff' and 'body'
       are dropped and an error is logged.

    Args:
        metadata: Dict with the keys 'repo', 'path', 'diff' and 'body'.
        size: Maximum allowed sum of the field lengths.

    Returns:
        A new dict; the input dict is not modified.
    """
    repo_len = len(str(metadata['repo']))
    path_len = len(str(metadata['path']))
    diff_len = len(str(metadata['diff']))
    body_len = len(str(metadata['body']))
    total_len = repo_len + path_len + diff_len + body_len

    # Room left for a field once all the other fields are accounted for
    diff_budget = size - (total_len - diff_len)
    body_budget = size - (total_len - body_len)

    if diff_budget > 0:
        truncated_metadata = {'repo': metadata['repo'], 'path': metadata['path'], 'diff': metadata['diff'][:diff_budget], 'body': metadata['body']}
        logging.warning(f"Truncated diff from {diff_len} to {min(diff_len, diff_budget)} bytes.")
    elif body_budget > 0:
        truncated_metadata = {'repo': metadata['repo'], 'path': metadata['path'], 'diff': metadata['diff'], 'body': metadata['body'][:body_budget]}
        logging.warning(f"Truncated body from {body_len} to {min(body_len, body_budget)} bytes.")
    else:
        truncated_metadata = {'repo': metadata['repo'], 'path': metadata['path']}
        logging.error(f"Neither the diff nor the body can be truncated to fit the size of {size} bytes.")

    return truncated_metadata

def generate_metadata(file_path: str):
    """Build the metadata dict stored with each vector of a dataset file.

    The file is read with pandas using orient='index'. The label of the first
    column is used as the 'repo' value, and every cell of that column supplies
    'path', 'diff' (from 'diff_hunk') and 'body'.

    Entries whose size, as reported by measure_metadata_size, exceeds 40960
    are logged and replaced by the result of truncate_metadata.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list with one metadata dict per entry, in file order.
    """
    # presumably the vector database's per-vector metadata limit — TODO confirm
    max_metadata_size = 40960

    data = pd.read_json(file_path, orient='index')
    repo = data.columns[0]
    metadata_list = [
        {'repo': repo, 'path': row['path'], 'diff': row['diff_hunk'], 'body': row['body']}
        for row in data.iloc[:, 0]
    ]

    # Walk sizes together with their position: looking the position up by value
    # would hit the first entry of that size and skip later entries of equal size.
    for vector_index, metadata_size in enumerate(measure_metadata_size(metadata_list)):
        if metadata_size > max_metadata_size:
            logging.warning(f"Warning: Metadata size is greater than 40960 bytes at {metadata_size} bytes at vector index: {vector_index}.\nfile: {file_path}.")
            metadata_list[vector_index] = truncate_metadata(metadata_list[vector_index], max_metadata_size)

    return metadata_list

def generate_vector_ids(metadata_list: list[dict]):
    """Build one vector id per metadata entry.

    Each id is '<repo>-<hash>', where the hash is hash_id applied to the
    repo name followed by the entry's position in the list.
    """
    return [
        entry['repo'] + '-' + hash_id(entry['repo'] + str(position))
        for position, entry in enumerate(metadata_list)
    ]

def generate_upsert_data(file_path: str):
    """Assemble the (id, embedding, metadata) tuples to upsert for one file.

    Args:
        file_path: Path of the JSON file to process.

    Returns:
        A list of (vector_id, embedding, metadata) tuples. The three
        sequences are combined with zip, so the result is as long as the
        shortest of them.
    """
    # The embeddings are requested first, then the metadata and ids are derived
    embeddings = embed_file(file_path)
    metadata = generate_metadata(file_path)
    ids = generate_vector_ids(metadata)

    return list(zip(ids, embeddings, metadata))

def worker(file_queue: queue.Queue):
    """Consume file paths from the queue and upsert their vectors.

    For every path taken from the queue, the upsert data is generated, split
    into chunks and passed chunk by chunk to index.upsert; a log record is
    written once the file is done. A None item tells the worker to stop.

    Every item taken from the queue, including the None sentinel and files
    whose processing failed, is marked with task_done(). A failure is logged
    with its traceback and the worker moves on to the next file, so a single
    bad file cannot leave Queue.join() blocked forever.

    Args:
        file_queue: Queue of file paths, terminated by None.
    """
    while True:
        file_path = file_queue.get()
        try:
            if file_path is None:
                break

            upsert_data = generate_upsert_data(file_path)
            upsert_data_chunks = split_into_chunks(upsert_data)

            for chunk in tqdm(upsert_data_chunks, 'Uploading data to Pinecone'):
                index.upsert(chunk)

            logging.info(f"Logging: {file_path} processed")
        except Exception:
            # Keep the thread alive; the traceback goes to the log
            logging.exception(f"Failed to process {file_path}")
        finally:
            # Runs for success, failure and the sentinel alike
            file_queue.task_done()

# Function to create and manage worker threads
def create_worker_threads(num_threads: int, file_queue: queue.Queue):
    """Start num_threads threads running worker on the given queue.

    Args:
        num_threads: Number of threads to start.
        file_queue: Queue handed to every worker.

    Returns:
        The list of started threading.Thread objects.
    """
    started: list[threading.Thread] = []

    for _ in range(num_threads):
        consumer = threading.Thread(target=worker, args=(file_queue,))
        started.append(consumer)
        consumer.start()

    return started

# Main function to process files using multiple threads
def process_files_with_threads(file_names: list[str], num_threads: int, start: int = 0, end=None):
    """Process a slice of the file list with a pool of worker threads.

    Args:
        file_names: All candidate file paths.
        num_threads: Number of worker threads to start.
        start: Index of the first file to process.
        end: Index one past the last file to process; None means to the end.
    """
    file_queue = queue.Queue()

    # Only the requested slice of files is queued
    selected_files = file_names[start:end]
    for path in selected_files:
        file_queue.put(path)

    threads = create_worker_threads(num_threads, file_queue)

    # Block until every queued file has been marked as done
    file_queue.join()

    # One None per worker tells each of them to leave its loop
    stop_signal = None
    for _ in range(num_threads):
        file_queue.put(stop_signal)

    for running_thread in threads:
        running_thread.join()

    logging.info("All files processed.")

In [11]:
# # For testing
# file_paths = import_filepaths('dataset/mined-comments-25stars-25prs-JavaScript.json/repo-split/*.json')
# e0 = file_to_embedding_inputs(file_paths[500])


Splitting file into payloads:   0%|          | 0/393 [00:00<?, ?it/s]

In [12]:
# e1 = huggingface_inference_api_request(e0, 200)

Making requests to HuggingFace API:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
# m1 = generate_metadata(file_paths[500])

In [25]:
# m1[0]['repo']

'node-fetch@node-fetch'

In [27]:
# v1 = generate_vector_ids(m1)

In [28]:
# z1 = list(zip(v1, e1, m1))

In [21]:
# len(z1)

393

In [29]:
# hash_id(m1[0]['repo'] + '0')

'00ba6353de9f4aa6efb0ed83c38790acae2a2319ac4bb85d142aaacc88e34ea9'

In [30]:
# z1

[('node-fetch@node-fetch-00ba6353de9f4aa6efb0ed83c38790acae2a2319ac4bb85d142aaacc88e34ea9',
  [-0.07506393641233444,
   -0.019878899678587914,
   -0.02450016513466835,
   -0.015582806430757046,
   0.06509425491094589,
   -0.030761074274778366,
   -0.07639472931623459,
   -0.009630845859646797,
   -0.028613490983843803,
   -0.027913769707083702,
   0.04799017310142517,
   -0.06161446124315262,
   0.030297275632619858,
   0.0025403203908354044,
   0.03118194453418255,
   -0.0001860154006863013,
   -0.00307579361833632,
   -0.027675483375787735,
   0.001085397438146174,
   0.04185732081532478,
   0.05891271308064461,
   0.03723185509443283,
   0.04698672518134117,
   -0.048856623470783234,
   -0.018216200172901154,
   0.02545543573796749,
   0.041416823863983154,
   -0.00034583432716317475,
   -0.030689667910337448,
   -0.1296376883983612,
   0.008982096798717976,
   -0.0632685199379921,
   -0.06899777054786682,
   0.00011311168782413006,
   0.030351396650075912,
   -0.04873471334576607,


In [31]:
# upsert_data_chunks = split_into_chunks(z1)

# for chunk in tqdm(upsert_data_chunks, 'Uploading data to Pinecone'):
#     index.upsert(chunk)

Uploading data to Pinecone:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
# upsert_data = generate_upsert_data(file_paths[0])
# upsert_data_chunks = split_into_chunks(upsert_data)

Splitting file into payloads:   0%|          | 0/9224 [00:00<?, ?it/s]

Making requests to HuggingFace API:   0%|          | 0/47 [00:00<?, ?it/s]

<class 'list'>


In [32]:
# Display the index statistics again to check the current vector count
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.00393,
 'namespaces': {'': {'vector_count': 393}},
 'total_vector_count': 393}

In [33]:
# Collect the JSON files under repo-split/, ordered by the numeric prefix of their file names
file_paths = import_filepaths('dataset/mined-comments-25stars-25prs-JavaScript.json/repo-split/*.json')
# Process the first 500 files with a single worker thread
process_files_with_threads(file_names=file_paths, num_threads=1, end=500)

Splitting file into payloads:   0%|          | 0/9224 [00:00<?, ?it/s]

Making requests to HuggingFace API:   0%|          | 0/47 [00:00<?, ?it/s]

<class 'list'>


Uploading data to Pinecone:   0%|          | 0/93 [00:00<?, ?it/s]

Splitting file into payloads:   0%|          | 0/9083 [00:00<?, ?it/s]

Making requests to HuggingFace API:   0%|          | 0/46 [00:00<?, ?it/s]