In [1]:
from pathlib import Path

from llama_index.core import SimpleDirectoryReader
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# --- Notebook configuration -------------------------------------------------
# NOTE(review): CUTOFF is not referenced in the visible cells; presumably a
# similarity cutoff for retrieved nodes — confirm where it is used.
CUTOFF = 0.6
# Model identifier passed to HuggingFaceEmbedding.
MODEL_NAME = 'KBLab/sentence-bert-swedish-cased'
# Folder the PDF reader scans for input documents.
DATA_FOLDER = Path("bp2025")
# Folder (inside DATA_FOLDER) where the vector index is persisted.
INDEX_FOLDER = DATA_FOLDER / "indexes"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def get_metadata_from_filename(filename):
    """Derive per-file metadata from a file path.

    Args:
        filename: Path to the file, as a string or path-like object.

    Returns:
        A dict with two keys: ``document_type`` is the name of the file's
        immediate parent directory, and ``file_name`` is the final path
        component (including its extension).
    """
    file_path = Path(filename)
    parent_name = file_path.parent.name
    # Path.name is already a str, so no extra conversion is needed.
    return {"document_type": parent_name, "file_name": file_path.name}

# Configure the document reader: PDFs located directly in DATA_FOLDER
# (recursive=False), with metadata supplied by get_metadata_from_filename.
loader = SimpleDirectoryReader(
    input_dir=DATA_FOLDER,
    file_metadata=get_metadata_from_filename,
    required_exts=[".pdf"],  # only read PDF files
    recursive=False,
)


Number of documents: 6066


In [3]:

from torch import backends, cuda
# Choose the torch device for the embedding model.
# Preference order: Apple MPS, then CUDA, then plain CPU.
if backends.mps.is_available():
    device, banner = "mps", "Using MPS"
elif cuda.is_available():
    device, banner = "cuda", "Using CUDA"
else:
    device, banner = "cpu", "Using CPU"
print(banner)
        
# Embedding model, loaded onto the selected `device`.
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# model repository run locally — confirm this model actually requires it.
embed_model = HuggingFaceEmbedding(
    model_name=MODEL_NAME,
    device=device,
    trust_remote_code=True,
)

# Ingestion pipeline: first split documents with SentenceSplitter
# (chunk_size=256, chunk_overlap=20), then run the embedding model.
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=20)
pipeline = IngestionPipeline(transformations=[text_splitter, embed_model])

# Build the vector index once and reuse it on later runs.
# INDEX_FOLDER acts as the cache: when it is missing or empty, the documents
# are loaded, chunked, embedded and the resulting index is persisted there;
# otherwise the previously persisted index is loaded from disk.
if not INDEX_FOLDER.exists() or not any(INDEX_FOLDER.iterdir()):

    # Make sure the target folder exists before the (slow) pipeline runs.
    INDEX_FOLDER.mkdir(parents=True, exist_ok=True)

    print("No index found. Running pipeline and saving index.")

    docs = loader.load_data()
    print('Number of documents:', len(docs))

    # Run the splitter + embedding transformations over all documents.
    nodes = pipeline.run(documents=docs, show_progress=True)

    # Build the index from the processed nodes; embed_model is passed so that
    # queries are embedded with the same model.
    index = VectorStoreIndex(nodes, embed_model=embed_model)

    # Write the index to disk so the next run can skip the pipeline.
    index.storage_context.persist(persist_dir=str(INDEX_FOLDER))
else:
    # Rebuild the index object from the files written by persist().
    # A storage context pointing at the persisted folder is created first,
    # then load_index_from_storage reconstructs the index from it; embed_model
    # is forwarded so queries use the same model as at build time.
    storage_context = StorageContext.from_defaults(persist_dir=str(INDEX_FOLDER))
    index = load_index_from_storage(storage_context, embed_model=embed_model)
    print("Index loaded.")

Using MPS




No index found. Running pipeline and saving index.


Parsing nodes: 100%|██████████| 6066/6066 [00:12<00:00, 478.12it/s]
Generating embeddings: 100%|██████████| 32255/32255 [07:46<00:00, 69.09it/s]


In [12]:
# Retriever over the index, configured with similarity_top_k=5
# (i.e. up to five nodes are requested per query).
retriever = index.as_retriever(
    similarity_top_k=5,
)

In [13]:
from llama_index.core.response.notebook_utils import (
    display_source_node,
    display_response,
)
# Example query (Swedish): "How are young people doing in the labour market?"
target_text = """Hur går det för unga på arbetsmarknaden?"""

# Retrieve the matching nodes, then render each one in the notebook.
retrieved_results = retriever.retrieve(target_text)
for source_node in retrieved_results:
    display_source_node(source_node)

**Node ID:** 7430944e-07cd-407c-a710-a835ee5410e5<br>**Similarity:** 0.739268431083924<br>**Text:** Prop.  2024/25:1  Utgiftsområde  17 
150 Ungas förutsättningar för att etablera sig på arbetsmark...<br>

**Node ID:** 90034015-8201-4b0e-86a9-21d6b72f61de<br>**Similarity:** 0.7333949943853504<br>**Text:** Prop.  2024/25:1  Utgiftsområde  17 
150 Ungas förutsättningar för att etablera sig på arbetsmark...<br>

**Node ID:** cfe0bd51-1256-4ff8-8353-34b3a91176f4<br>**Similarity:** 0.7200013485255454<br>**Text:** Om de heltidsstuderande som söker 
arbete exkluderas sjunker dock arbetslöshetsnivån under 2023 t...<br>

**Node ID:** 78c45ea6-da17-403e-95d1-a964417fb673<br>**Similarity:** 0.7199509784394222<br>**Text:** Om de heltidsstuderande som söker 
arbete exkluderas sjunker dock arbetslöshetsnivån under 2023 t...<br>

**Node ID:** 83557bae-3c10-4919-b72b-e14d24ed3ad5<br>**Similarity:** 0.6983238213498363<br>**Text:** Andelen unga inskrivna på Arbetsförmedlingen  minskar i alla 
grupper och är den lägsta på 10 år ...<br>