In [1]:
from pathlib import Path

def load_documents(folder_path):
    """Read every ``*.txt`` file located directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan. Only its immediate children are
            considered (the glob pattern is not recursive).

    Returns:
        list[str]: The UTF-8 decoded contents of each matching file, ordered
        by path. The list is empty when nothing matches.
    """
    # Path.glob yields entries in filesystem-dependent order; sorting makes the
    # document order (and anything derived from it) reproducible across runs.
    txt_paths = sorted(Path(folder_path).glob("*.txt"))
    return [
        txt_path.read_text(encoding="utf-8")
        for txt_path in txt_paths
        # A directory that happens to be named "*.txt" cannot be read as text.
        if txt_path.is_file()
    ]


In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load a sentence transformer model (the 'all-MiniLM-L6-v2' checkpoint).
# This one instance encodes both the document chunks and each incoming query,
# so the vectors that get compared always come from the same embedding model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def load_documents(file_path):
    """Load text documents from a single file or from a folder of ``*.txt`` files.

    An earlier cell defines ``load_documents(folder_path)``; this definition
    shadows it, so the folder case is handled here as well instead of being
    silently lost.

    Args:
        file_path: Path to one text file, or to a directory whose immediate
            ``*.txt`` children should all be loaded.

    Returns:
        list[str]: One UTF-8 decoded string per document. A single file yields
        a one-item list; a directory yields its files' contents ordered by path.
    """
    # Local import keeps this cell runnable on its own.
    from pathlib import Path

    source = Path(file_path)
    if source.is_dir():
        # Sorted so the document order does not depend on the filesystem.
        return [
            txt_path.read_text(encoding="utf-8")
            for txt_path in sorted(source.glob("*.txt"))
            if txt_path.is_file()
        ]

    with open(source, "r", encoding="utf-8") as f:
        return [f.read()]  # Return as a list with one item


# Read the source text and flatten every document into one list of chunks.
# NOTE(review): chunk_text is not defined in the cells visible here; presumably
# it lives in a cell that is not shown. Confirm it runs before this one.
docs = load_documents("pg24440.txt")
chunks = [chunk for doc in docs for chunk in chunk_text(doc)]

# Compute embeddings
embeddings = embedding_model.encode(chunks, convert_to_numpy=True)

# Reset first: if this run produces no embeddings, later cells must not keep
# using an index left over from a previous run whose ids no longer match `chunks`.
index = None

# Build FAISS index only if embeddings are available.
# ndim is checked before shape[0] so an unexpected array rank cannot raise here.
if embeddings.ndim == 2 and embeddings.shape[0] > 0:
    index = faiss.IndexFlatL2(embeddings.shape[1])
    # FAISS works on C-contiguous float32 matrices; this is a no-op when the
    # encoder already returned that layout, and avoids the extra np.array copy.
    index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
else:
    print("No embeddings to index. Please check your documents and chunks.")


In [6]:
def retrieve(query, k=3):
    """Return up to ``k`` chunks whose embeddings are closest to ``query``.

    Args:
        query: Text to embed with ``embedding_model`` and search for.
        k: Maximum number of chunks to return.

    Returns:
        list: Entries of ``chunks`` in the order returned by ``index.search``.
        Fewer than ``k`` items are returned when the index holds fewer vectors;
        an empty list is returned when ``k`` is not positive or the index is empty.

    Raises:
        RuntimeError: If no index has been built.
    """
    # The indexing cell only prints a message when there are no embeddings, so
    # `index` may be unbound or None here; fail with an actionable error.
    faiss_index = globals().get("index")
    if faiss_index is None:
        raise RuntimeError("FAISS index has not been built; run the indexing cell first.")

    # Never request more neighbours than the index actually stores.
    k = min(k, faiss_index.ntotal)
    if k <= 0:
        return []

    query_embedding = np.asarray(embedding_model.encode([query]), dtype=np.float32)
    _, indices = faiss_index.search(query_embedding, k)
    # FAISS pads missing neighbours with -1, which would otherwise alias
    # chunks[-1] and return the last chunk as a bogus hit.
    return [chunks[i] for i in indices[0] if i >= 0]


In [None]:
from transformers import pipeline

# Text-generation pipeline backed by the "gpt2" checkpoint; generate_answer
# below calls it with the retrieval-augmented prompt.
generator = pipeline("text-generation", model="gpt2")

def generate_answer(query, max_new_tokens=200):
    """Answer ``query`` using retrieved chunks as context for the generator.

    Args:
        query: The user's question.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        str: The sampled continuation following "Answer:", with surrounding
        whitespace stripped. Sampling is enabled, so the text varies per call.
    """
    context = "\n".join(retrieve(query))
    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
    # NOTE(review): the prompt length depends on the retrieved chunk sizes and is
    # not bounded here; confirm it fits the model's context window.
    outputs = generator(
        prompt,
        # Budget only the generated tokens. A total-length cap also counts the
        # prompt, so a long retrieved context would leave no room to answer.
        max_new_tokens=max_new_tokens,
        do_sample=True,
        # Ask for the continuation only. Splitting the full text on "Answer:"
        # loses output whenever the model emits that marker again.
        return_full_text=False,
        # Set explicitly so generation does not fall back to eos with a warning.
        pad_token_id=generator.tokenizer.eos_token_id,
    )
    return outputs[0]["generated_text"].strip()


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


In [None]:
# Example run: pass one question through generate_answer and print the result.
# Generation samples (do_sample=True), so the printed text differs between runs.
query = "what is aneurism"
answer = generate_answer(query)
print("Answer:", answer)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: that is, for instance, or that it does not induce any of its

changes, or also from a prolapse to

the epididymis, and, if it does, or if so did

be by itself and it cannot be

cautiously

came to be. This is more a matter of principle, I think, than any question

of how or by whom it should be. For, I cannot get rid of an Aneurist
