# Storing Data with LangChain and ChromaDB

In [2]:
import chromadb
import os
from collections import defaultdict
from tqdm import tqdm
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from unstructured.partition.docx import partition_docx
from unstructured.partition.doc import partition_doc
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import Image

# Load the variables of the .env file located by find_dotenv() before any
# os.getenv call below reads them.
load_dotenv(find_dotenv())

# Configuration read from the environment. os.getenv returns None for every
# variable that is not set, so each of these may be None.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
POPPLER_PATH = os.getenv("POPPLER_PATH")
TESSERACT_PATH = os.getenv("TESSERACT_PATH")
DATA_PATH = os.getenv("DATA_PATH")                                          # contains 108 files
# NOTE(review): os.path.join raises TypeError when DATA_PATH is None, i.e. when
# the DATA_PATH variable is missing from the environment/.env file.
SUB_DATA_SET_PATH = os.path.join(DATA_PATH, "aktive_leistungen", "ark")     # contains 17 files

In [3]:
def search_for_double_file_names(curr_file_paths: list[str], target: str) -> bool:
    """Return True if any path in ``curr_file_paths`` has the file name ``target``.

    The file name of a path is its last component. Both the backslash and the
    forward slash are treated as separators, so the check gives the same
    answer for Windows-style and POSIX-style paths.

    Args:
        curr_file_paths: File paths to search. An empty list (or None) yields
            False.
        target: Bare file name (without any directory part) to look for.

    Returns:
        True if at least one path ends in the file name ``target``, else False.
    """
    return any(
        # Normalise the separator first so one rsplit handles both path styles.
        file_path.replace("\\", "/").rsplit("/", 1)[-1] == target
        for file_path in curr_file_paths or ()
    )


def delete_double_file_path_from_file_path(file_paths: list[str]) -> tuple[list[str], list[str]]:
    """Split file paths into first occurrences and repeats of a file name.

    Paths are processed in iteration order. The first path seen for a given
    file name is kept; every later path with the same file name (in whatever
    directory) is reported as a duplicate. If an unordered collection such as
    a set is passed, which of the equally named paths is kept therefore
    depends on that collection's iteration order.

    Both the backslash and the forward slash are treated as path separators.

    Args:
        file_paths: File paths to de-duplicate by file name.

    Returns:
        A tuple ``(unique_file_paths, double_file_paths)``: the kept paths and
        the paths dropped because their file name had already been seen.
    """
    seen_file_names = set()     # O(1) membership test instead of rescanning the kept paths
    unique_file_paths = []
    double_file_paths = []
    for file_path in file_paths:
        file_name = file_path.replace("\\", "/").rsplit("/", 1)[-1]
        if file_name in seen_file_names:
            double_file_paths.append(file_path)
        else:
            seen_file_names.add(file_name)
            unique_file_paths.append(file_path)

    return unique_file_paths, double_file_paths

In [4]:
# Every file below DATA_PATH whose name contains a dot. rglob("*.*") also
# matches directories with a dot in their name, so those are filtered out.
ALL_FILE_PATHS = {str(f) for f in Path(DATA_PATH).rglob("*.*") if f.is_file()}
# Keep one path per file name. The paths are sorted first because a set has no
# stable order, and the order decides which of several equally named files is kept.
RED_FILE_PATHS, DOUBLE_FILE_PATHS = delete_double_file_path_from_file_path(sorted(ALL_FILE_PATHS))
# Distinct file names (last path component) across all discovered files.
ALL_FILE_NAMES = {Path(file_path).name for file_path in ALL_FILE_PATHS}
DATABASE_PATH = "../../Database/"
EMBEDDING_MODEL = "text-embedding-ada-002"

# Cell output: number of paths, of de-duplicated paths, and of distinct file names.
len(ALL_FILE_PATHS), len(RED_FILE_PATHS), len(ALL_FILE_NAMES)

(111, 108, 108)

## Creating RCTS Chunks

In [5]:
# Parameters

CHUNK_SIZE = 1000
CHUNK_OVERLAP = CHUNK_SIZE // 5     # one fifth of the chunk size
# Regex separators, tried in this order. Raw strings are required here:
# "\s" in a normal string literal is an invalid escape sequence.
SEPARATORS = [
    r"\n{2,}",                      # 1. two or more consecutive newlines
    r"(?<=[.?!])\s*\n|\n\s*",       # 2. a newline after punctuation, or any newline plus following whitespace
    r"[.!?]",                       # 3. sentence punctuation
]
IS_SEPARATOR_REGEX = True
STRIP_WHITESPACE = True

In [6]:
# creation of splitters

# Recursive character splitter configured from the parameter cell above.
# SEPARATORS are interpreted as regular expressions because
# IS_SEPARATOR_REGEX is True.
# NOTE(review): CHUNK_SIZE is reassigned by later cells of this notebook
# (1500 and 1800), so the value used here depends on cell execution order.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=SEPARATORS,
    is_separator_regex=IS_SEPARATOR_REGEX,
    strip_whitespace=STRIP_WHITESPACE,
)

In [None]:
# Persistent Chroma client for the RCTS collections; its storage path is
# <DATABASE_PATH>/RCTS/<EMBEDDING_MODEL>, so each embedding model gets its own
# directory.
client = chromadb.PersistentClient(
    path=os.path.join(DATABASE_PATH, "RCTS", f"{EMBEDDING_MODEL}"),
)

In [None]:
collection_already_exists = True

# One embedding client is enough for the whole run; it does not depend on the file.
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model=EMBEDDING_MODEL)

# Metadata is only passed when the collection still has to be created, because
# metadata is already set on an existing collection and its space cannot be
# changed or overwritten.
collection_metadata = None if collection_already_exists else {
    "hnsw:space": "cosine",
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": CHUNK_OVERLAP,
    "separators": str(SEPARATORS),
    "is_separator_regex": IS_SEPARATOR_REGEX,
    "strip_whitespace": STRIP_WHITESPACE,
}

# The file name is derived from each path itself. ALL_FILE_PATHS and
# ALL_FILE_NAMES are two independent sets, so zipping them pairs a path with an
# unrelated name and the file-type check then inspects the wrong file.
# NOTE(review): this iterates the de-duplicated RED_FILE_PATHS, like the
# Unstructured section of this notebook does. Switch to ALL_FILE_PATHS if
# equally named files from different folders should all be indexed.
for file_path in RED_FILE_PATHS:
    file_name = Path(file_path).name
    suffix = Path(file_path).suffix.lower()     # also accept ".PDF" / ".DOCX"

    if suffix == ".pdf":
        loader = PyPDFLoader(file_path)
    elif suffix == ".docx":
        loader = Docx2txtLoader(file_path)
    else:
        # Nothing to index for this file; do not hand chunks=None to Chroma.
        print(f"File type not supported: {file_name}")
        continue

    chunks = splitter.split_documents(loader.load())
    if not chunks:
        # A file without extractable text yields no chunks to embed.
        continue

    Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        client=client,
        collection_name=f"collection_{CHUNK_SIZE}",
        collection_metadata=collection_metadata,
    )
    # After the first write the collection exists, so its metadata must not be
    # sent again (same reason as for collection_already_exists above).
    collection_metadata = None

## Creating Unstructured Chunks

In [5]:
# Parameters

PAR_STRATEGY = "hi_res"             # strategy passed to the unstructured partition_* calls
PAR_LANGUAGES = ["deu"]             # languages passed to the partition_* calls
# Element types dropped after partitioning. The trailing comma is needed:
# "(Image)" is just the class Image in parentheses, not a one-element tuple.
REMOVABLE_ELEMENTS = (Image,)

In [6]:
def get_elements(file_path: str, delete_element_types=(Image,)) -> list:
    """Partition a .pdf, .docx or .doc file into unstructured elements.

    The partition function is chosen from the file extension (matched
    case-insensitively) and called with the module-level PAR_STRATEGY and
    PAR_LANGUAGES settings.

    Args:
        file_path: Path of the file to partition.
        delete_element_types: Element class, or tuple of element classes,
            whose instances are removed from the result. Pass a falsy value
            (e.g. None or an empty tuple) to keep every element.

    Returns:
        The partitioned elements without the removed types. An empty list is
        returned for unsupported file types, after printing a message.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        partition = partition_pdf
    elif lowered_path.endswith(".docx"):
        partition = partition_docx
    elif lowered_path.endswith(".doc"):
        partition = partition_doc
    else:
        # Return early: there is nothing to partition or filter, and filtering
        # a None result would raise a TypeError.
        print(f"File type not supported: {file_path}")
        return []

    elements = partition(
        filename=file_path,
        strategy=PAR_STRATEGY,
        languages=PAR_LANGUAGES,
    )

    if delete_element_types:
        elements = [element for element in elements if not isinstance(element, delete_element_types)]

    return elements


def create_documents(chunk_elements) -> list:
    """Convert chunk elements into LangChain ``Document`` objects.

    Note: the parameter name shadows the imported ``chunk_elements`` function
    inside this function body only; it is kept so keyword callers still work.

    Args:
        chunk_elements: Iterable of chunk elements. Each must provide ``text``
            and a ``metadata`` object with ``file_directory``, ``filename``
            and ``page_number`` attributes.

    Returns:
        One ``Document`` per chunk, in input order. Its metadata holds
        ``source`` (directory joined with file name) and ``page_number``
        (-1 when the chunk carries no page number).
    """
    documents = []

    for chunk_element in chunk_elements:
        metadata = chunk_element.metadata
        # Either part may be None (e.g. a file given without a directory);
        # os.path.join would raise a TypeError on None.
        source = os.path.join(metadata.file_directory or "", metadata.filename or "")
        # Explicit None check so that a page number of 0 would be kept.
        page_number = metadata.page_number if metadata.page_number is not None else -1

        documents.append(
            Document(
                page_content=chunk_element.text,
                metadata={
                    "source": source,
                    "page_number": page_number,
                },
            )
        )

    return documents

In [7]:
# Maps file name -> {"elements": [...]}; defaultdict(dict) creates the inner
# dict on first access.
elements_dict = defaultdict(dict)

# Partition every de-duplicated file once and cache the elements, so the
# chunking cells below can reuse them without partitioning again.
with tqdm(RED_FILE_PATHS) as iterator:
    for file_path in iterator:
        # Last backslash-separated component of the path. The chunking cells
        # derive their lookup key the same way, so keep both in sync.
        file_name = file_path.split("\\")[-1]
        iterator.set_postfix_str(f"Processing: {file_name}")
        
        elements = get_elements(file_path, delete_element_types=REMOVABLE_ELEMENTS)
        
        elements_dict[file_name]["elements"] = elements

100%|██████████| 108/108 [36:31<00:00, 20.29s/it, Processing: ark_045_-_einstiegsqualifizierung.pdf]                                 


### `basic`

In [10]:
# Parameters

METHOD = "basic"
CHUNK_SIZE = 1500
OVERLAP = CHUNK_SIZE // 5                   # one fifth of the chunk size
MAX_CHARACTERS = CHUNK_SIZE * 5 // 3        # chunk size + 2/3 chunk size

# One database directory per chunking method and embedding model.
DB_PATH = os.path.join(DATABASE_PATH, "Unstructured", METHOD, EMBEDDING_MODEL)
CLIENT = chromadb.PersistentClient(path=DB_PATH)

# Cell output: the derived sizes.
MAX_CHARACTERS, CHUNK_SIZE, OVERLAP

(2500, 1500, 300)

In [11]:
# Chunk the cached elements of every file with the "basic" strategy and gather
# the resulting documents of all files in one flat list.
all_documents = []

# Iterating the dict yields its keys, which are the cached file names.
with tqdm(elements_dict) as iterator:
    for file_path in iterator:
        file_name = file_path.split("\\")[-1]
        iterator.set_postfix_str(f"Processing: {file_name}")

        elements = elements_dict[file_name]["elements"]
        chunks = chunk_elements(
            elements=elements,
            max_characters=MAX_CHARACTERS,
            new_after_n_chars=CHUNK_SIZE,
            overlap=OVERLAP,
            overlap_all=True,
        )

        documents = create_documents(chunks)
        all_documents += documents

# Cell output: total number of documents.
len(all_documents)

100%|██████████| 108/108 [00:00<00:00, 545.04it/s, Processing: ark_045_-_einstiegsqualifizierung.pdf]                                 


1583

In [12]:
# collection_already_exists = False

# Chroma.from_documents(
#     documents=all_documents,
#     embedding=OpenAIEmbeddings(api_key=OPENAI_API_KEY, model=EMBEDDING_MODEL),
#     client=CLIENT,
#     collection_name=f"collection_{CHUNK_SIZE}",
#     collection_metadata={
#         "hnsw:space": "cosine",
#         "chunk_size": CHUNK_SIZE,
#         "max_characters": MAX_CHARACTERS,
#         "chunk_overlap": OVERLAP,
#         "method": METHOD,
#         "embedding_model": EMBEDDING_MODEL,
#     } if not collection_already_exists else None,   # because metadata is already set and i can't change space or overwrite it
# )

<langchain_chroma.vectorstores.Chroma at 0x2964f1a0110>

### `by_title`

In [13]:
# Parameters

METHOD = "by_title"
CHUNK_SIZE = 1800
MAX_CHARACTERS = CHUNK_SIZE * 5 // 3                # chunk size + 2/3 chunk size
COMBINE_TEXT_UNDER_N_CHARS = CHUNK_SIZE * 2 // 3    # two thirds of the chunk size

# One database directory per chunking method and embedding model.
DB_PATH = os.path.join(DATABASE_PATH, "Unstructured", METHOD, EMBEDDING_MODEL)
CLIENT = chromadb.PersistentClient(path=DB_PATH)

# Cell output: the derived sizes.
CHUNK_SIZE, MAX_CHARACTERS, COMBINE_TEXT_UNDER_N_CHARS

(1800, 3000, 1200)

In [14]:
# Chunk the cached elements of every file with the "by_title" strategy and
# gather the resulting documents of all files in one flat list.
all_documents = []

# Iterate over a copy of the de-duplicated path list.
with tqdm(list(RED_FILE_PATHS)) as iterator:
    for file_path in iterator:
        file_name = file_path.split("\\")[-1]
        iterator.set_postfix_str(f"Processing: {file_name}")

        elements = elements_dict[file_name]["elements"]
        chunks = chunk_by_title(
            elements=elements,
            max_characters=MAX_CHARACTERS,
            new_after_n_chars=CHUNK_SIZE,
            combine_text_under_n_chars=COMBINE_TEXT_UNDER_N_CHARS,
        )

        documents = create_documents(chunks)
        all_documents += documents

# Cell output: total number of documents.
len(all_documents)

100%|██████████| 108/108 [00:00<00:00, 476.70it/s, Processing: ark_045_-_einstiegsqualifizierung.pdf]                                 


1197

In [15]:
# collection_already_exists = False

# Chroma.from_documents(
#     documents=all_documents,
#     embedding=OpenAIEmbeddings(api_key=OPENAI_API_KEY, model=EMBEDDING_MODEL),
#     client=CLIENT,
#     collection_name=f"collection_{CHUNK_SIZE}",
#     collection_metadata={
#         "hnsw:space": "cosine",
#         "chunk_size": CHUNK_SIZE,
#         "max_characters": MAX_CHARACTERS,
#         "combine_text_under_n_chars": COMBINE_TEXT_UNDER_N_CHARS,
#         "method": METHOD,
#         "embedding_model": EMBEDDING_MODEL, 
#     } if not collection_already_exists else None,   # because metadata is already set and i can't change space or overwrite it
# )

<langchain_chroma.vectorstores.Chroma at 0x2964f0d4190>