## This notebook splits each submission into chunks, grabs an embedding, and uploads the chunks to the db


In [2]:
import re
import os
from tqdm.notebook import tqdm
import json
from db.chunks import ChunkManager
from db.db_instance import DBClient
from concurrent.futures import ThreadPoolExecutor

# `client` is not referenced again in this cell, so it is presumably created for
# its side effects (e.g. bringing up / connecting to the database) — TODO confirm
# against db.db_instance.
client = DBClient()
# Chunk store: chunk_submission() below writes every chunk through db.new_chunk().
db = ChunkManager()

# Matches a line break plus any indentation that follows it; replaced by one space
# to flatten a paragraph onto a single line.
_LINE_BREAK = re.compile(r'\r?\n\s*')


def _strip_image_lines(paragraph):
    """Return ``paragraph`` flattened to one line with its image lines removed.

    A line is dropped when it contains "png]" (any case) or when it has two or
    fewer characters after trimming. The remaining lines are trimmed and joined
    with single spaces so words on adjacent lines are not fused together.
    """
    kept = [
        line.strip()
        for line in paragraph.split("\n")
        if len(line.strip()) > 2 and "png]" not in line.lower()
    ]
    return " ".join(kept)


def _append_chunks(chunks, pending, text, max_chunk_chars, split_size):
    """Append ``pending`` + ``text`` to ``chunks``, splitting oversized ``text``.

    When ``text`` is longer than ``max_chunk_chars`` it is cut into
    ``split_size``-character pieces; ``pending`` is prepended to the first piece
    only. Chunks that are empty after trimming are skipped.
    """
    if len(text) > max_chunk_chars:
        pieces = [text[start:start + split_size] for start in range(0, len(text), split_size)]
    else:
        pieces = [text]

    prefix = pending.strip()
    for piece in pieces:
        chunk = f"{prefix}\n{piece}".strip()
        if chunk:
            chunks.append(chunk)
        # Only the first piece carries the accumulated text.
        prefix = ""


def chunk_text(md_text, min_chunk_chars=200, max_chunk_chars=6000, split_size=3000):
    """Split markdown text into a list of chunk strings.

    The text is split into paragraphs on blank lines. Before splitting, a blank
    line directly after ":" or directly before a "-" item is removed, so a
    lead-in sentence and its list stay in one paragraph. Each paragraph is
    flattened to a single line. Paragraphs are then accumulated (newline
    separated) until adding the next one would exceed ``min_chunk_chars``; the
    accumulated text plus that paragraph is emitted as one chunk. Whatever is
    still accumulated at the end becomes the last chunk.

    Paragraphs containing a markdown image reference ("png](", any case) have
    their image lines removed before being measured and emitted.

    Differences from the previous implementation:
      * lines kept from an image paragraph are joined with a space instead of
        being concatenated directly (which glued words together);
      * the image check is case-insensitive for long paragraphs too, matching
        what the short-paragraph path already did;
      * empty chunks are never returned (empty or image-only input gives ``[]``
        rather than ``[""]``);
      * plain paragraphs are trimmed before being cut into pieces, as image
        paragraphs already were.

    Args:
        md_text: Markdown source. A falsy value (``""`` / ``None``) yields ``[]``.
        min_chunk_chars: Accumulated length above which a chunk is emitted.
        max_chunk_chars: Paragraphs longer than this are cut into pieces.
        split_size: Length of each piece when a paragraph is cut.

    Returns:
        list[str]: The chunks, in document order.

    Raises:
        ValueError: If ``split_size`` is not positive.
    """
    if split_size <= 0:
        raise ValueError("split_size must be a positive integer")
    if not md_text:
        return []

    paragraphs = md_text.replace(":\n\n", ":\n").replace("\n\n-", "\n-").split("\n\n")
    chunks = []
    pending = ""
    for paragraph in paragraphs:
        if "png](" in paragraph.lower():
            text = _strip_image_lines(paragraph)
            # Image paragraphs are measured after the image lines are removed.
            measured = text
        else:
            text = _LINE_BREAK.sub(" ", paragraph).strip()
            # Plain paragraphs are measured before flattening, as before, so
            # existing chunk boundaries do not move.
            measured = paragraph.strip()

        if len(f"{pending}\n{measured}") > min_chunk_chars:
            _append_chunks(chunks, pending, text, max_chunk_chars, split_size)
            pending = ""
        else:
            pending += f"{text}\n"

    # Flush whatever is still accumulated as the final chunk.
    tail = pending.strip()
    if tail:
        chunks.append(tail)
    return chunks

# Data to process.
# The parsed JSON is bound to `submission_index` rather than `list`: rebinding
# `list` would shadow the builtin for the rest of the kernel session and make
# any later `list(...)` call fail. JSON files are read as UTF-8 explicitly so
# the result does not depend on the platform's default encoding.
with open('./data/step1/list.json', 'r', encoding='utf-8') as f:
    submission_index = json.load(f)

# The "data" entry holds the items handed to chunk_submission() one by one.
data = submission_index["data"]

def get_file_path(doc_id, folder_path = './data/files'):
    """Return the path of the file in ``folder_path`` that belongs to ``doc_id``.

    A file belongs to ``doc_id`` when the part of its name before the first "-"
    equals ``doc_id``. The first match in ``os.listdir`` order is returned, joined
    onto ``folder_path``; ``None`` is returned when nothing matches.
    """
    candidates = (
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        if entry.split('-')[0] == doc_id
    )
    return next(candidates, None)

# Submissions whose processing raised an exception: chunk_submission() appends
# the failing item here and the list is printed at the end of the cell.
error_items = []

# To show number of chunks being added
# No `total` is given, so this bar is a plain counter; chunk_submission()
# advances it once per stored chunk.
up_counter_bar = tqdm(desc="Creating items", unit="item")

def chunk_submission(item, pbar):
    """Chunk one submission's markdown file and store each chunk via ``db.new_chunk``.

    The markdown file is located with ``get_file_path(item["uniqueId"])``, split
    with ``chunk_text`` and every chunk is written together with metadata copied
    from ``item``. ``up_counter_bar`` advances once per stored chunk and ``pbar``
    advances exactly once per submission, whether it succeeded or failed.

    Any exception is printed and ``item`` is appended to ``error_items``; nothing
    is re-raised, so one bad submission does not stop the other workers.

    Args:
        item: Submission record (dict) with at least "uniqueId", "submitter" and
            "group"; the optional "step_1" section supplies "support",
            "regulation_type" and "regulator_trust" (``None`` when absent).
        pbar: Progress bar to advance once this submission is finished.
    """
    try:
        unique_id = item["uniqueId"]
        md_file_path = get_file_path(unique_id)
        if md_file_path is None:
            # get_file_path returns None when no file matches; fail with a clear
            # message instead of the TypeError that open(None) would raise.
            raise FileNotFoundError(f"No markdown file found for submission {unique_id}")
        # Explicit encoding so reading does not depend on the platform default.
        # NOTE(review): assumes the markdown files are UTF-8 — confirm.
        with open(md_file_path, 'r', encoding='utf-8') as file:
            submission = file.read()

        # `or {}` also tolerates a "step_1" (or a section in it) that is present but null.
        step_1 = item.get('step_1') or {}

        def step_1_value(section, key):
            return (step_1.get(section) or {}).get(key)

        for idx, chunk in enumerate(chunk_text(submission)):
            chunk_obj = {
                "submission_uniqueId": unique_id,
                "chunk_text": chunk,
                "submitter": item['submitter'],
                "group": item['group'],
                "support": step_1_value('support', 'support'),
                "regulation_type": step_1_value('regulation', 'regulation_type'),
                "regulator_trust": step_1_value('regulator_trust', 'regulator_trust'),
                "chunk_index": idx
            }
            # The meaning of the second positional argument is defined in
            # db.chunks and is not visible from this cell.
            db.new_chunk(chunk_obj, True)
            up_counter_bar.update(1)
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        error_items.append(item)
    finally:
        # Single place that ticks the per-submission bar for both outcomes.
        pbar.update(1)

# Run chunk_submission for every submission on a pool of 10 worker threads.
# main_pbar is shared with the workers, each of which advances it once.
with tqdm(total=len(data), desc="Overall Progress") as main_pbar, \
        ThreadPoolExecutor(max_workers=10) as executor:
    futures = {
        executor.submit(chunk_submission, item, main_pbar): item
        for item in data
    }
    # Wait for every job in submission order.
    for future in futures.keys():
        future.result()

up_counter_bar.close()

# Report the submissions that failed
print("!" * 30, error_items, "!" * 30, sep="\n")

Started /Users/k/.cache/weaviate-embedded: process ID 94136


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-06-16T11:52:52+10:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-06-16T11:52:52+10:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-06-16T11:52:52+10:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50050","time":"2024-06-16T11:52:52+10:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-06-16T11:52:52+10:00"}


Creating items: 0item [00:00, ?item/s]

Overall Progress:   0%|          | 0/2267 [00:00<?, ?it/s]

chunk client initialized


{"level":"info","msg":"Completed loading shard submission_mQ2WBUFfXTPI in 8.60875ms","time":"2024-06-16T11:52:53+10:00"}
{"action":"hnsw_vector_cache_prefill","count":5000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-06-16T11:52:54+10:00","took":23675208}
{"level":"info","msg":"Completed loading shard chunk_LynOu2iRu2n4 in 57.460167ms","time":"2024-06-16T11:52:54+10:00"}
{"action":"hnsw_vector_cache_prefill","count":29463,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-06-16T11:52:54+10:00","took":180415791}


deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding new one
deleting chunk, then adding 