In [1]:
import chromadb
import re
import os
import hashlib
from datetime import datetime
from chromadb import Settings
from typing import List, Dict
from IPython.display import clear_output
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# Root folder that handle_folders_upload walks looking for Markdown (.md) source documents.
DATA_PATH = "./raw_database/"

In [3]:
# Chroma client created with path="./persistent_db" (relative to the notebook's working directory).
# allow_reset=True is forwarded through the client settings
# NOTE(review): presumably so the store can be wiped with chroma_client.reset() - confirm against the Chroma docs.
chroma_client = chromadb.PersistentClient(
    path="./persistent_db",
    settings= chromadb.config.Settings(allow_reset=True)
)

In [4]:
# Open the "cv_doc" collection, creating it if it does not exist yet.
# The "hnsw:space": "cosine" entry is passed as collection metadata
# NOTE(review): per the Chroma docs this should select cosine distance for the index - confirm for the installed version.
collection = chroma_client.get_or_create_collection(
    name="cv_doc",
    metadata= {"hnsw:space" :"cosine"}
)

In [5]:
# Sanity check: show which collection object the rest of the notebook is bound to.
print(collection)

Collection(name=cv_doc)


In [6]:
# Ad-hoc retrieval check: ask for the 6 closest chunks to a sample question, returning only the documents.
# NOTE(review): this cell runs BEFORE the ingestion cells further down, so on a fresh
# database it queries an empty collection; `results` is also not used by any later
# cell visible in this notebook (only by a commented-out print).
results = collection.query(
    query_texts=["Que proyectos ha desarrollado Martin?"],
    n_results=6,
    include = ["documents"]
)

In [7]:
#print(results)

In [8]:
def parse_markdown_text(document_name: str, document_data: str) -> Dict:
    """Parse a document annotated with ``% key: value`` headers into a dictionary.

    Recognised headers are ``title``, ``document``, ``url_source``,
    ``web_source`` and ``date_publication``; everything following
    ``% content:`` (possibly spanning many lines) becomes the content.

    Args:
        document_name: Name stored verbatim under the ``document_name`` key.
        document_data: Full raw text of the annotated document.

    Returns:
        A dict with the keys ``title``, ``document``, ``document_name``,
        ``url_source``, ``web_source``, ``date_publication`` and ``content``.
        A header that is missing or has an empty value is replaced by its
        default; missing content becomes an empty string.
    """

    def header_value(field: str, default: str) -> str:
        # Only spaces/tabs are skipped after the colon (not ``\s*``): ``\s`` also
        # matches newlines, which let an empty header swallow the following
        # header line as its value.
        match = re.search(rf"% {field}:[ \t]*(.*)", document_data)
        value = match.group(1).strip() if match else ""
        return value or default

    # The content body is the only multi-line section, hence DOTALL here only.
    content_match = re.search(r"% content:\s*(.*)", document_data, re.DOTALL)

    # Build the flat record that is later attached to every chunk as metadata
    parsed_document = {
        "title" : header_value("title", "Unknow title document"),
        "document" : header_value("document", "No document data"),
        "document_name" : document_name,
        "url_source" : header_value("url_source", "https://github.com/Martingago"),
        "web_source" : header_value("web_source", "https://martingago.dev/"),
        "date_publication" : header_value("date_publication", "No date"),
        "content": content_match.group(1).strip() if content_match else ""
    }
    return parsed_document

### Splits the document into small chunks

In [9]:
def split_text_from_string(document: str, chunk_size: int = 350, chunk_overlap: int = 30) -> List[str]:
    """Split a text into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        document: Text to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 350, the value previously hard-coded.
        chunk_overlap: Number of characters shared between consecutive
            chunks. Defaults to 30, the value previously hard-coded.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_text(document)

In [10]:
def create_chunks_from_string(file_path: str) -> List[Dict] :
    """Read a file and turn its content section into uploadable chunk records.

    The file is read as UTF-8, parsed with ``parse_markdown_text`` and its
    content is split with ``split_text_from_string``.

    Args:
        file_path: Path of the document to read.

    Returns:
        One dict per chunk with the keys ``id`` ("ID" + SHA-256 hex digest of
        the chunk text), ``document`` (the chunk text) and ``metadata`` (the
        document-level fields copied from the parsed document).
    """
    with open(file_path, 'r', encoding='utf-8') as source_file:
        raw_text = source_file.read()
    # File name without directory or extension identifies the source document
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    parsed = parse_markdown_text(base_name, raw_text)

    # Document-level fields replicated onto every chunk, in this order
    metadata_fields = (
        "document",
        "document_name",
        "title",
        "url_source",
        "web_source",
        "date_publication",
    )

    records = []
    for piece in split_text_from_string(parsed["content"]):
        # Content-derived id: identical chunk text always yields the same id
        digest = hashlib.sha256(piece.encode('utf-8')).hexdigest()
        records.append({
            "id": "ID" + digest,
            "document": piece,
            "metadata": {field: parsed[field] for field in metadata_fields},
        })
    return records

In [11]:
def upload_chunks_to_collection(document_chunks, collection):
    """Add each chunk to the collection unless its id is already stored.

    Args:
        document_chunks: Iterable of dicts with the keys ``id``, ``document``
            and ``metadata``.
        collection: Object exposing ``get(ids=...)`` (returning a mapping with
            an ``"ids"`` list) and ``add(ids=..., documents=..., metadatas=...)``.

    Prints one line per chunk reporting whether it was added or skipped.
    """
    for entry in document_chunks:
        entry_id = entry["id"]

        # Look the id up first so an existing entry is left untouched
        stored_ids = collection.get(ids=[entry_id])["ids"]
        if len(stored_ids):
            print("Duplicate entry, skipping")
            continue

        collection.add(
            ids=[entry_id],
            documents=[entry["document"]],
            metadatas=[entry["metadata"]],
        )
        print(f"    >Chunk: {entry_id} successfully added")

### Inserts a single file into the database

In [12]:
def insert_file_to_database(file_path: str, target_collection=None) :
    """Chunk a single file and upload its chunks to a collection.

    Args:
        file_path: Path of the document to ingest.
        target_collection: Collection that receives the chunks. When omitted
            (or ``None``) the notebook-level ``collection`` is used, which is
            what this function always did before the parameter existed.
    """
    print("executing")
    # Resolve the destination lazily so the global is only needed for the legacy one-argument call
    destination = collection if target_collection is None else target_collection
    document_chunks = create_chunks_from_string(file_path) # Generates an array of chunks from a file path
    upload_chunks_to_collection(document_chunks, destination) # upload the chunks to the resolved collection

## Insert all files from a dir into the selected collection

In [13]:
def handle_folders_upload(input_directory : str, collection) :
    """Ingest every Markdown file below a directory into the given collection.

    Hidden directories and hidden files (names starting with ".") are skipped.
    Directories and files are visited in sorted order so repeated runs process
    documents in the same sequence.

    Args:
        input_directory: Root directory to walk recursively.
        collection: Collection that receives the chunks; its ``name`` is also
            used in the per-file log line.
    """
    for root, dirs, files in os.walk(input_directory):
        # Assigning to dirs[:] prunes the walk in place, so hidden folders are never entered
        dirs[:] = sorted(d for d in dirs if not d.startswith('.'))
        for file in sorted(files):
            if not file.endswith('.md') or file.startswith('.'):
                continue
            file_path = os.path.join(root, file)
            time_start = datetime.now()
            # Chunk and upload here with the collection passed by the caller:
            # insert_file_to_database(file_path) would write to the notebook-level
            # `collection` instead, ignoring this function's argument.
            print("executing")
            document_chunks = create_chunks_from_string(file_path)
            upload_chunks_to_collection(document_chunks, collection)
            time_end = datetime.now()
            execution_time = (time_end - time_start).total_seconds() * 1000
            print(f"[{execution_time:.0f}ms]: {file} transformed into chunks and inserted into:{collection.name}")
In [14]:
# Run the ingestion: walk DATA_PATH and load every .md file found into the collection.
handle_folders_upload(DATA_PATH, collection)

executing
    >Chunk: ID7d97a620623d7a455003238a83e3e4ac0d4da933a37b63c0e98b2b39264ffee4 successfully added
    >Chunk: ID98028029b548ce806a3a24a877556502128689308b34dcd81d8dfbdb6652930d successfully added
    >Chunk: ID4ca74782ec1456599a5be3e9e8d0c54b3beba303111c62de85a11216afd22fd0 successfully added
    >Chunk: IDf888865d2213a6c85c1b89ca0a40842b83ea9ef8101ca1258d6516fe18a626ee successfully added
    >Chunk: ID018e1e14b6613f3879627cb7a2b1b8f0ede0ba211e2ac82baac066c261fcd68f successfully added
    >Chunk: ID3771cc3f5ed1e9e255d3bb71f6b955ab6fa5d0b3538ea128a3b2ef036091f370 successfully added
    >Chunk: ID02125f52abc0bc2a0b530394bb075095b8a9bcad009de7217b75ce13f5ba566d successfully added
[256ms]: bravus.md transformed into chunks and inserted into:cv_doc
executing
    >Chunk: ID57a320c5bc9caf05199871aef8e0b4c49b6c477cfaf5f86d35ed26378b654525 successfully added
    >Chunk: ID4ae6d97a4d4a2a545098801f2121f454ceffa9b91525fe008d76f6210a867e08 successfully added
    >Chunk: ID8c4055abeeb31769

In [15]:
#chroma_client.delete_collection("cv_doc")