In [1]:
from pathlib import Path

from llama_index.core import SimpleDirectoryReader
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Presumably a similarity-score cutoff; not referenced in the cells shown here.
CUTOFF = 0.6
# Model identifier handed to HuggingFaceEmbedding as `model_name`.
MODEL_NAME = 'KBLab/sentence-bert-swedish-cased'
# Folder the documents are read from.
DATA_FOLDER = Path("bp2025")
# Sub-folder of DATA_FOLDER where the vector index is persisted.
INDEX_FOLDER = DATA_FOLDER.joinpath("indexes")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def get_metadata_from_filename(filename):
    """Derive per-file metadata from a file path.

    Args:
        filename: Path to the file, as a string or path-like object.

    Returns:
        A dict with two keys, in this order: ``"document_type"`` (the name
        of the file's immediate parent directory) and ``"file_name"`` (the
        final path component, extension included).
    """
    file_path = Path(filename)
    metadata = {}
    metadata["document_type"] = file_path.parent.name
    metadata["file_name"] = str(file_path.name)
    return metadata

# Read the PDF files located directly in DATA_FOLDER (recursive=False, so
# sub-folders are not visited). The file_metadata hook supplies metadata
# derived from each file's path.
loader = SimpleDirectoryReader(
    input_dir=DATA_FOLDER,
    required_exts=[".pdf"],  # only PDF files are read
    recursive=False,
    file_metadata=get_metadata_from_filename,
)
docs = loader.load_data()

print(f"Number of documents: {len(docs)}")

Number of documents: 6066


In [3]:

from torch import backends

# Run the embedding model on Apple's Metal backend (MPS) when PyTorch reports
# it as available; otherwise fall back to the CPU.
if backends.mps.is_available():
    device = "mps"
    print("Using MPS")
else:
    device = "cpu"

# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to be executed on load -- confirm this model actually needs it.
embed_model = HuggingFaceEmbedding(model_name=MODEL_NAME, trust_remote_code=True, device=device)

# Ingestion pipeline: split documents into sentence-aware chunks
# (chunk_size=256, chunk_overlap=20), then embed every chunk.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=256, chunk_overlap=20),
        embed_model,
    ]
)

# Embedding the whole corpus is slow, so the index is persisted in
# INDEX_FOLDER and reused on later runs. A missing or empty folder means
# there is nothing to reuse and the index has to be built.
if not INDEX_FOLDER.exists() or not any(INDEX_FOLDER.iterdir()):
    print("No index found. Running pipeline and saving index.")
    # Chunk and embed the documents.
    nodes = pipeline.run(documents=docs, show_progress=True)

    # Build the vector index from the embedded nodes.
    index = VectorStoreIndex(nodes, embed_model=embed_model)

    # Persist the index so the next run can load it instead of re-embedding.
    index.storage_context.persist(persist_dir=str(INDEX_FOLDER))
else:
    # Rebuild the index from the persisted stores. Constructing an empty
    # VectorStoreIndex and calling `storage_context.load(...)` does not work:
    # the constructor requires nodes or an index struct, and StorageContext
    # has no `load` method. The embedding model is passed again so queries
    # are embedded with the same model that produced the stored vectors.
    storage_context = StorageContext.from_defaults(persist_dir=str(INDEX_FOLDER))
    index = load_index_from_storage(storage_context, embed_model=embed_model)
    print("Index loaded.")



No index found. Running pipeline and saving index.


Parsing nodes: 100%|██████████| 6066/6066 [00:21<00:00, 285.40it/s]
Generating embeddings:   0%|          | 70/32255 [00:14<1:15:17,  7.12it/s]

KeyboardInterrupt: 

In [None]:
# Build a retriever on top of the index. similarity_top_k=2 is passed so that
# each query is expected to return at most the two best-matching nodes.
retriever = index.as_retriever(similarity_top_k=2)

In [None]:

# Example query, in Swedish: "How are young people doing in the labour market?"
target_text="""Hur går det för unga på arbetsmarknaden?"""

# Run the query text through the retriever and keep the returned results.
retrieved_results = retriever.retrieve(target_text)
            #print(f"{target_id} found: {len(retrieved_results)} results")
            for r in retrieved_results: