In [1]:
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from pathlib import Path

In [2]:
# 1. Embeddings vorbereiten
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# 2. Verzeichnis definieren
root_dir = Path("../knowledge_base")

# 3. Unterstützte Dateitypen definieren
supported_extensions = {
    ".xlsx": TextLoader,
    ".xml": TextLoader,
    ".json": TextLoader,
    ".csv": TextLoader,
}

# 4. Alle unterstützten Dateien rekursiv finden
all_documents = []
for file_path in root_dir.rglob("*"):
    if file_path.suffix.lower() in supported_extensions:
        loader_cls = supported_extensions[file_path.suffix.lower()]
        try:
            loader = loader_cls(str(file_path), encoding="utf-8")
            all_documents.extend(loader.load())
        except Exception as e:
            print(f"Fehler beim Laden von {file_path}: {e}")

In [4]:
# 5. Text aufteilen
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(all_documents)

# 6. In Vector Store speichern
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db"
)
vector_store.add_documents(texts)

['b62917e1-a9a3-4900-abf7-68cfbfbf1994',
 '23bd2ca9-d9ed-403f-8a3a-1f7c3a1bf043']

In [9]:
query = "Daten uebr musiker und produzenten und verkaufte Alben"
similar_docs = vector_store.similarity_search(query, k=1)

In [10]:
similar_docs



In [12]:
# LLM über Ollama
llm = Ollama(model="mistral")

In [13]:
# Retriever aus Chroma
retriever = vector_store.as_retriever()

rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

In [20]:
frage = "Kannst du mir ein DBML schema das die daten bzws das format sowie die relationen wiederspiegelt?"
antwort = rag_chain({"query": frage})

print("Antwort:", antwort["result"])
print("Quellen:", [doc.metadata for doc in antwort["source_documents"]])

KeyboardInterrupt: 

In [None]:
# retrieved docs aus der Antwort (Source Documents)
retrieved_docs = antwort["source_documents"]
# Text der Dokumente zusammenfügen
retrieved_context = "\n".join([doc.page_content for doc in retrieved_docs])

# Gesamte Eingabe an das Modell (z.B. Query + Kontext)
input_text = frage + "\n" + retrieved_context

# Tokens zählen

print(f"Input Tokens an das Modell: {input_text}")

NameError: name 'antwort' is not defined

In [None]:
import tiktoken

# Passenden Tokenizer für dein Modell wählen, z.B. "gpt-4" oder "gpt-3.5-turbo"
encoding = tiktoken.encoding_for_model("gpt-4")

# Tokenize und Tokens zählen
tokens = encoding.encode(input_text)
print(f"Anzahl Tokens: {len(tokens)}")


NameError: name 'input_text' is not defined

In [31]:
import json
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path

import gradio as gr
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

KB_DIR = Path("../knowledge_base")
SUPPORTED_EXTS = [".csv", ".json", ".xlsx", ".xml"]

prompt = PromptTemplate.from_template("""
Here is the content of the following files:

{data}

Please answer the following question as specifically as possible:
{question}
""")

llm = Ollama(model="llama3")
chain = LLMChain(llm=llm, prompt=prompt)

def list_kb_files():
    return [str(p.name) for p in KB_DIR.glob("*") if p.suffix.lower() in SUPPORTED_EXTS]

def parse_file(file_path):
    ext = file_path.suffix.lower()
    try:
        if ext == ".csv":
            df = pd.read_csv(file_path)
            return df.to_markdown()
        elif ext == ".xlsx":
            df = pd.read_excel(file_path)
            return df.to_markdown()
        elif ext == ".json":
            with open(file_path, "r", encoding="utf-8") as f:
                return json.dumps(json.load(f), indent=2)
        elif ext == ".xml":
            tree = ET.parse(file_path)
            root = tree.getroot()
            return ET.tostring(root, encoding="unicode")
        else:
            return f"(Unsupported format: {ext})"
    except Exception as e:
        return f"(Error parsing {file_path.name}: {e})"

def respond(message, chat_history, selected_files, uploaded_files):
    selected_paths = [KB_DIR / name for name in selected_files]
    uploaded_paths = [Path(f.name) for f in uploaded_files or []]

    texts = []
    for p in selected_paths + uploaded_paths:
        texts.append(f"=== {p.name} ===\n{parse_file(p)}")

    full_context = "\n\n".join(texts)
    response = chain.run(data=full_context, question=message)

    chat_history = chat_history or []
    chat_history.append((message, response))
    return chat_history

with gr.Blocks() as demo:
    gr.Markdown("# 🧠 File Chatbot")

    # CheckboxGroup + Upload inside an Accordion as additional inputs
    with gr.Accordion("📂 Select or upload files", open=False) as inputs_accordion:
        kb_files = gr.CheckboxGroup(
            label="Select files from knowledge_base",
            choices=list_kb_files()
        )
        uploads = gr.File(
            label="Upload your own files",
            file_types=SUPPORTED_EXTS,
            file_count="multiple"
        )

    chatbot = gr.ChatInterface(
        fn=respond,
        title="File Chatbot",
        type='messages',
        additional_inputs=[kb_files, uploads],
        additional_inputs_accordion=inputs_accordion,
        submit_btn=True,
        autofocus=True,
        autoscroll=True,
        fill_width=True,
        save_history=True,
    )

demo.launch()


* Running on local URL:  http://127.0.0.1:7868
* To create a public link, set `share=True` in `launch()`.


