In [3]:
# from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
PATH = "data/11.010-1-1.de.pdf"
def load_documents():
    # loader = DirectoryLoader(PATH, glob="*.md")
    loader = UnstructuredPDFLoader(PATH, language="de")
    documents = loader.load()
    return documents

In [4]:
def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks

In [5]:
docs = load_documents()
chunks = split_text(docs)

Split 1 documents into 164 chunks.
und kaum eindeutig vorzunehmen. Das Kirchengesetz zählt die «inneren» Angelegenheiten der Kirchen nicht abschliessend auf, sondern umschreibt sie in der Fassung vom 12. September 1995, in Kraft seit dem 1. Juli 1996, wie folgt: «Alles, was sich auf die Wortverkündigung, die Lehre, die Seelsorge,
{'source': 'data/11.010-1-1.de.pdf', 'start_index': 1573}


In [40]:
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name=r"C:\bjsChatBot\distiluse-base-multilingual-cased-v2"
)
chunk_embeddings = [embeddings.embed_query(chunk.page_content) for chunk in chunks]


No sentence-transformers model found with name C:\bjsChatBot\distiluse-base-multilingual-cased-v2. Creating a new one with mean pooling.


In [48]:
import re
def is_dense_text(text, min_word_ratio=0.7):
    words = text.split()
    num_words = len(words)
    num_alpha_words = sum(1 for w in words if re.search(r'[A-Za-zÄÖÜäöüß]', w))
    return num_words > 0 and (num_alpha_words / num_words) > min_word_ratio

In [53]:
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name = r"C:\bjsChatBot\multilingual-e5-base"
)
filter_chunks = [chunk for chunk in chunks if len(chunk.page_content.split()) > 20]
# chunk_embeddings = [embeddings.embed_query(chunk.page_content) for chunk in chunks]

dense = [c for c in filter_chunks if is_dense_text(c.page_content)]
chunk_embeddings = [embeddings.embed_query(chunk.page_content) for chunk in dense]


No sentence-transformers model found with name C:\bjsChatBot\multilingual-e5-base. Creating a new one with mean pooling.


In [54]:
import faiss
import numpy as np

embedding_matrix = np.array(chunk_embeddings).astype("float32")

faiss.normalize_L2(embedding_matrix)

d = embedding_matrix.shape[1]

index = faiss.IndexFlatL2(d)  # L2 distance; for cosine, you can normalize first

index.add(embedding_matrix)

faiss.write_index(index, "chunk_index.faiss")

In [55]:
query = "Pffarer"
query_embedding = embeddings.embed_query(query)
query_embedding = np.array([query_embedding]).astype("float32")

faiss.normalize_L2(query_embedding)

k = 50  # number of nearest neighbors
distances, indices = index.search(query_embedding, k)

# print(indices)
# print(distances)

ind_list = indices[0].tolist()
dist_list = distances[0].tolist()
print(ind_list)
print(dist_list)

[134, 137, 102, 138, 2, 49, 104, 1, 90, 136, 10, 63, 22, 78, 106, 108, 75, 71, 19, 29, 95, 25, 56, 86, 89, 11, 43, 21, 115, 103, 114, 16, 46, 133, 105, 116, 28, 70, 66, 5, 34, 39, 87, 80, 110, 35, 97, 31, 130, 100]
[0.4884769022464752, 0.48901665210723877, 0.4910545349121094, 0.49464672803878784, 0.49649009108543396, 0.5044476389884949, 0.5062118172645569, 0.5089321136474609, 0.5127937197685242, 0.5130429267883301, 0.514473557472229, 0.5147382020950317, 0.5168616771697998, 0.5180409550666809, 0.5194836854934692, 0.5197793841362, 0.5203147530555725, 0.5242317914962769, 0.5242651700973511, 0.524651288986206, 0.5250217914581299, 0.5251355171203613, 0.5263976454734802, 0.5282731056213379, 0.528329610824585, 0.5284526348114014, 0.5293096303939819, 0.5298016667366028, 0.5313051342964172, 0.5317108631134033, 0.5324579477310181, 0.533250093460083, 0.5334582924842834, 0.5341223478317261, 0.5352600812911987, 0.5357095003128052, 0.5368797779083252, 0.5379868745803833, 0.5392143130302429, 0.539607

In [56]:
# Suppose these are the results from FAISS
# indices = [116, 150, 117, 153, 128, 129, 130, 131, 120,  44]
# distances = [ 88.89868,   91.77553,   95.75629,   96.94322,   99.195854, 100.25284,
#   100.646454, 100.95151,  101.32193,  101.53297 ]

# Map indices back to your original text chunks and metadata
for idx, dist in zip(ind_list, dist_list):
    chunk = chunks[idx]  # get the Document object
    print(f"Index: {idx}, Distance: {dist}")
    print(chunk.page_content)
    print("Metadata:")
    print(chunk.metadata)
    print("-" * 50)


Index: 134, Distance: 0.4884769022464752
15

11.010

Reformierte Kirchen Bern-Jura-Solothurn

2 Sie bekennt sich damit zur Verpflichtung, für die Ausbreitung des Evangeli- ums in der weiten Welt einzustehen.

4 Der Geldhaushalt der Kirche

Art. B4

Lukas 16,10
Metadata:
{'source': 'data/11.010-1-1.de.pdf', 'start_index': 27331}
--------------------------------------------------
Index: 137, Distance: 0.48901665210723877
a) b) c) d)

aus den Leistungen des Staates aus dem Ertrag der Kirchengüter und Stiftungen aus freiwilligen Zuwendungen und Vermächtnissen aus den Kirchensteuern, die von den Kirchgemeinden beschlossen und erhoben werden
Metadata:
{'source': 'data/11.010-1-1.de.pdf', 'start_index': 27820}
--------------------------------------------------
Index: 102, Distance: 0.4910545349121094
4 Das Verfahren wird im Einzelnen durch eine Verordnung geregelt.

Art. 24

Vorschlagsrecht

1 Das Vorschlagsrecht nach Art. 67 des Kirchengesetzes kann ausgeübt werden:

a) b)

von mindestens 10

In [14]:
from gpt4all import GPT4All

# No local path, just model name:
model = GPT4All("gpt4all-lora-quantized")  
print(model.generate("Hallo! Erkläre mir, was ein Pfarrer macht."))


SSLError: HTTPSConnectionPool(host='gpt4all.io', port=443): Max retries exceeded with url: /models/models3.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1007)')))

In [None]:
import requests
import json

# API endpoint for Ollama
url = "http://localhost:11434/api/generate"

# Data payload for the request
payload = {
    "model": "llama3.2:3b",
    "prompt": "Write me a short poem about the ocean",
    "stream": False  # set to True if you want streaming responses
}

# Send POST request to Ollama
response = requests.post(url, json=payload)

# Parse and print the output
data = response.json()
print(data["response"])
