In [None]:
from datasets import load_dataset
import json
import re
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
# Load the APPS dataset ("codeparrot/apps") through the Hugging Face `datasets` library.
# NOTE(review): only the "test" split is requested here — confirm this is the
# split that is meant to be indexed.
ds = load_dataset("codeparrot/apps", split="test")

In [None]:
# Compiled once at import time instead of on every call.
#   header: ``def name(...)`` optionally followed by ``-> annotation`` before the colon
#   body:   everything up to (not including) the next line that begins a new
#           ``def`` statement, or the end of the text
# The lookahead requires whitespace after ``def`` so that body lines starting
# with identifiers such as ``default`` or ``defaultdict`` do not end the match.
_FUNCTION_PATTERN = re.compile(
    r"def\s+\w+\(.*?\)(?:\s*->[^:\n]*)?:.*?(?=\n\s*def\s|\Z)",
    re.DOTALL,
)


def extract_functions(code_snippet):
    """Return the source text of every ``def`` found in *code_snippet*.

    The extraction is purely textual (regex based, no parsing): each match
    starts at a ``def`` keyword and runs until the line before the next
    ``def`` statement or the end of the snippet. Consequences of that rule:

    * nested functions and methods are returned as separate entries, and the
      enclosing function is cut off where the nested ``def`` begins;
    * decorators and any code preceding the first ``def`` are not included;
    * trailing module-level code after the last function is part of the last
      match.

    Args:
        code_snippet: Python source code as a single string.

    Returns:
        A list of strings, one per matched function, in order of appearance.
        Empty when the snippet contains no function definition.
    """
    return _FUNCTION_PATTERN.findall(code_snippet)

def extract_relevant_info(sample):
    """Turn one dataset sample into a list of per-function records.

    Every solution of the sample is scanned with ``extract_functions`` and
    each function found becomes one record that also carries the sample's
    question and its (unparsed) ``input_output`` field.

    The input mapping is not modified, so the function can safely be called
    more than once on the same sample.

    Args:
        sample: Mapping with the keys ``"question"``, ``"solutions"`` and
            ``"input_output"``. ``"solutions"`` is either a JSON-encoded list
            of source-code strings or an already decoded list. An empty or
            missing value is treated as "no solutions" (presumably how APPS
            marks problems without reference solutions — verify against the
            dataset card).

    Returns:
        A list of dicts with the keys ``"question"``, ``"function_content"``
        and ``"input_output"``; empty when the sample has no solutions or no
        solution contains a function definition.

    Raises:
        json.JSONDecodeError: If ``"solutions"`` is a non-empty string that is
            not valid JSON.
    """
    raw_solutions = sample["solutions"]
    if isinstance(raw_solutions, str):
        # json.loads("") raises, so an empty field must be short-circuited.
        if not raw_solutions.strip():
            return []
        solutions = json.loads(raw_solutions)
    else:
        # Already decoded (or None): use as-is instead of decoding again.
        solutions = raw_solutions or []

    question = sample["question"]
    input_output = sample["input_output"]

    function_blocks = []
    for solution in solutions:
        function_blocks.extend(
            {
                "question": question,
                "function_content": func,
                "input_output": input_output,
            }
            for func in extract_functions(solution)
        )
    return function_blocks

# Flatten the dataset into one record per extracted function
# (question, function source, input/output spec).
relevant_info = []
for sample in ds:
    relevant_info.extend(extract_relevant_info(sample))

# Local directory containing the sentence-embedding model files.
embedding_path = r"C:\Users\ZOE\Desktop\Langchain_framework\all-MiniLM-L6-v2"
# Load the model once and hand the same instance to the vector store.
embedding_function = HuggingFaceEmbeddings(model_name=embedding_path)

vectorstore = Chroma(
    persist_directory="Chroma_databases/",
    embedding_function=embedding_function,
)

# Chroma.add_texts embeds the texts itself with the store's embedding function
# and has no `embeddings` parameter, so vectors computed up front would be
# thrown away and every record embedded twice. Serialise each record once and
# let the store embed it, in batches rather than one call per record.
texts = [json.dumps(info) for info in relevant_info]
batch_size = 1000
for start in range(0, len(texts), batch_size):
    vectorstore.add_texts(texts[start:start + batch_size])

# NOTE(review): depending on the installed chromadb version an explicit
# `vectorstore.persist()` may be needed before the data reaches disk — confirm.
# NOTE(review): the message below says "training set", but `ds` is loaded with
# split="test" earlier in this file — confirm which one is intended.
print("The training set of the APPS dataset has been successfully imported into the Chroma vector database and permanently saved in the Chroma_databases directory")