In [1]:
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import (
    DirectoryLoader,
    UnstructuredMarkdownLoader,
)
from langchain_community.vectorstores import StarRocks
from langchain_community.vectorstores.starrocks import StarRocksSettings
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain_text_splitters import TokenTextSplitter,RecursiveCharacterTextSplitter,MarkdownHeaderTextSplitter

import os
import re
import spacy

In [2]:
from langchain_community.document_loaders import (
    DirectoryLoader,
    UnstructuredMarkdownLoader,
)

def load_md_directory(dir):
    """Load every file directly inside ``dir`` as a single Markdown document.

    Each regular file is read with ``UnstructuredMarkdownLoader`` in
    ``mode='single'`` and the first document it returns is collected.
    Sub-directories (for example Jupyter's ``.ipynb_checkpoints``) are
    skipped instead of being handed to the loader.

    Args:
        dir: Path of the directory to scan (not recursive).

    Returns:
        List of the loaded documents, ordered by file name.
    """
    documents_md = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order stable between runs.
    for file in sorted(os.listdir(dir)):
        # Build the path from the `dir` argument rather than from a
        # notebook-level global, so the function works for any directory.
        file_r = os.path.join(dir, file)
        if not os.path.isfile(file_r):
            continue
        # print(f'reading file {file}')
        loader = UnstructuredMarkdownLoader(file_r, mode='single')
        doc = loader.load()
        documents_md.append(doc[0])

    print(f'{len(documents_md)} Markdown documents were loaded.')

    return documents_md

# Folder holding the Markdown sources, relative to the working directory.
dir_md = 'files/'
documents = load_md_directory(dir_md)

# Break the loaded documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=250,
    chunk_size=1000,
)
splits_md = text_splitter.split_documents(documents)

print(f'Splits {len(splits_md)}')


1 Markdown documents were loaded.
Splits 11


In [3]:
# Display the chunks produced by the splitter (shows the whole list).
splits_md

[Document(page_content='The 50/30/20 rule: how to budget your money more efficiently\n\nBudgeting doesn’t need to be complicated, nor should it take hours out of your day. In fact, the best ways to budget are often the simplest. Take, for example, the 50/30/20 rule. The 50/30/20 rule is a straightforward monthly budgeting method that tells you exactly how much to put towards your savings and your living costs each month.\nWith a clear big-picture overview of your budget for the month, you can confidently avoid overspending and build up your\nsavings over time—all without painstakingly recording every\nsingle transaction.\n\xa0\nSo, if you’ve ever downloaded a budgeting app only to abandon\nit by the third day, you might want to give the 50/30/20 method\na try. It’s one of the best budgeting tips we’ve found, and\nhere’s how it works.\n\nWhat is the 50/30/20 rule?', metadata={'source': 'files/503020budgetrule.md'}),
 Document(page_content='The 50/30/20 rule is an easy budgeting method t

In [4]:
from getpass import getpass

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
import chromadb

# Chroma client whose data lives under the local "storage" path.
client = chromadb.PersistentClient(path="storage")

# Do not hardcode the key or clobber one that is already configured:
# only prompt (without echo) when the environment does not provide it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
EMBEDDING_MODEL = 'text-embedding-3-small'

# Embedding function handed to the collection below.
embedding_function = OpenAIEmbeddingFunction(api_key=os.environ.get('OPENAI_API_KEY'), model_name=EMBEDDING_MODEL)

# Reuses the "rag_system" collection when it already exists in the store.
col = client.get_or_create_collection("rag_system",embedding_function = embedding_function)




In [5]:
# The collection is persistent and obtained via get_or_create, so re-running
# this cell meets ids that already exist. `upsert` overwrites those entries
# with the current chunks, whereas `add` would not refresh them
# (behaviour per the Chroma docs — confirm for the installed version).
# NOTE: ids are positional, so entries left over from a previous, longer run
# (index >= len(splits_md)) are not removed here.
col.upsert(
    ids=[f"id_{i}" for i in range(len(splits_md))],
    documents=[doc.page_content for doc in splits_md],
    metadatas=[doc.metadata for doc in splits_md],
)

print(f'registers {col.count()}')

registers 11


In [7]:
import pandas as pd

def query_collection(collection, query, max_results):
    """Run a text query against ``collection`` and return its raw response.

    Args:
        collection: Object exposing
            ``query(query_texts=..., n_results=..., include=...)``.
        query: Query text, forwarded unchanged as ``query_texts``.
        max_results: Forwarded as ``n_results``.

    Returns:
        Whatever ``collection.query`` returns, requested with metadatas,
        documents and distances included.
    """
    requested_fields = ['metadatas', 'documents', 'distances']
    return collection.query(
        query_texts=query,
        n_results=max_results,
        include=requested_fields,
    )

In [8]:
# Ask for up to 10 matches for an English test question and display the raw response.
query_collection(col,'How can I budget easily?',max_results=10)

{'ids': [['id_0',
   'id_1',
   'id_2',
   'id_10',
   'id_9',
   'id_8',
   'id_4',
   'id_3',
   'id_7',
   'id_5']],
 'distances': [[0.7511122090804999,
   0.9017633848982403,
   0.9068933903935067,
   0.9351686947866521,
   0.9472874484522491,
   0.9660680816542415,
   0.9671269928241254,
   1.0239775192027702,
   1.045755434427398,
   1.0989073254178041]],
 'metadatas': [[{'source': 'files/503020budgetrule.md'},
   {'source': 'files/503020budgetrule.md'},
   {'source': 'files/503020budgetrule.md'},
   {'source': 'files/503020budgetrule.md'},
   {'source': 'files/503020budgetrule.md'},
   {'source': 'files/503020budgetrule.md'},
   {'source': 'files/503020budgetrule.md'},
   {'source': 'files/503020budgetrule.md'},
   {'source': 'files/503020budgetrule.md'},
   {'source': 'files/503020budgetrule.md'}]],
 'embeddings': None,
 'documents': [['The 50/30/20 rule: how to budget your money more efficiently\n\nBudgeting doesn’t need to be complicated, nor should it take hours out of your 

In [103]:
# Same lookup with a German question; the response is kept in `r` for the cells below.
# NOTE(review): the saved output lists sources under md_processed_files/ and ids such as
# id_537, while the loading cell in this notebook reads 'files/' — presumably the persistent
# collection was populated in another session; confirm before relying on these results.
r =query_collection(col,'Was sind Effizienzhäuser?',max_results=10)

In [111]:
# Print each field of the first query's hits, one field per line.
for field in ('distances', 'metadatas', 'ids', 'documents'):
    print(r[field][0])

[0.6326306462287903, 0.7478874921798706, 0.7492695450782776, 0.8539144396781921, 0.8739721179008484, 0.8822094202041626, 0.8912213444709778, 0.9006096720695496, 0.9024194478988647, 0.9025322794914246]
[{'source': 'md_processed_files/Richtlinie_BEG_EM_2023-12-21.md'}, {'source': 'md_processed_files/Richtlinie_BEG_EM_2023-12-21.md'}, {'source': 'md_processed_files/Richtlinie_BEG_EM_2023-12-21.md'}, {'source': 'md_processed_files/faq_bundesfoerderung_fuer_effiziente_gebaeude.md'}, {'source': 'md_processed_files/Technische_FAQ_BEG_EM_5_2023-03-13_full.md'}, {'source': 'md_processed_files/faq_bundesfoerderung_fuer_effiziente_gebaeude.md'}, {'source': 'md_processed_files/Technische_FAQ_BEG_EM_5_2023-03-13_full.md'}, {'source': 'md_processed_files/Richtlinie_BEG_EM_2023-12-21.md'}, {'source': 'md_processed_files/Richtlinie_BEG_EM_2023-12-21.md'}, {'source': 'md_processed_files/Richtlinie_BEG_EM_2023-12-21.md'}]
['id_537', 'id_722', 'id_538', 'id_491', 'id_150', 'id_493', 'id_55', 'id_761', 'i

In [129]:
# Echo every returned distance in the order the query delivered them.
for i in r['distances'][0]:
    print(f'distance {i}')

# The response stores nested lists; index 0 selects the hits of the single query.
# NOTE: this rebinds the notebook-level name `documents` to the list of hit texts.
ids, distances, documents, metadatas = (
    r[field][0] for field in ('ids', 'distances', 'documents', 'metadatas')
)

# Field names of one flattened hit record.
keys = ['id', 'distance', 'text', 'metadata']

# One dict per hit, pairing each key with the value at the same position.
results2 = [
    dict(zip(keys, row))
    for row in zip(ids, distances, documents, metadatas)
]

# Hits at or above this distance are discarded.
distance_threshold = 0.8

filtered_data = [hit for hit in results2 if hit['distance'] < distance_threshold]

for j in filtered_data:
    print(j)

distance 0.6326306462287903
distance 0.7478874921798706
distance 0.7492695450782776
distance 0.8539144396781921
distance 0.8739721179008484
distance 0.8822094202041626
distance 0.8912213444709778
distance 0.9006096720695496
distance 0.9024194478988647
distance 0.9025322794914246
{'id': 'id_537', 'distance': 0.6326306462287903, 'text': 'Anlagen zur Heizungsunterstützung, die erneuerbare Energien nutzen (zum Beispiel Umweltwärme, Geothermie),\nder Einbau von Geräten zur digitalen Energieverbrauchsoptimierung, oder die Errichtung eines Wärmespeichers\nneben dem Gebäude;\nf) „Effizienzhäuser“: Wohngebäude und Nichtwohngebäude, die sich durch eine energetisch optimierte Bauweise\nund Anlagentechnik auszeichnen und die die mit der BEG-Förderrichtlinie festgelegten technischen Mindestanfor-\nderungen an die Gesamtenergieeffizienz (Bezugsgröße: Primärenergiebedarf QP) und an die Energieeffizienz der\nGebäudehülle (Bezugsgröße Nichtwohngebäude: Mittelwerte der Wärmedurchgangskoeffizienten Ū, Be