#### Importing Libs

In [19]:
# Python basics
import json
import re
from pprint import pprint
from typing import Dict, List, Tuple, Union

# NLP & Embeddings
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

import nltk       
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/gerzem1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Constants

In [24]:
from constants import (
    CLEANED_DATA_DIR, 
    SITES_ELEMENTS_JSON_PATH,
    URL_PATTERN, 
    REPLACED_URL_MAP_PATH, 
    MAX_TOKENS, 
    OVERLAP
)

#### Tokenizer & Embedding Model Setup

In [6]:
# Load the tokenizer and the embedding model from one checkpoint name so the
# two cannot drift apart when the model is swapped.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
# No explicit device: sentence-transformers then selects a GPU when one is
# available and falls back to CPU otherwise, instead of failing on machines
# without CUDA.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

#### Handle URLs

In [25]:
# Inline sample input for the URL-handling cell below: a list of dicts, each
# with the page text under 'text' (this one contains a bracketed URL near the
# end) and the page address under 'source'.
# NOTE(review): presumably mirrors the records stored at
# SITES_ELEMENTS_JSON_PATH — confirm.
sites_elements = [
    {
        'text': "Toll-free hotline 0 800 22922 Consultations will not be available on 17 April • Housing and declaration of residence • Social security and healthcare • Exchanging a driver's license and other questions • Job search and navigation in the labor market • Preparation of CV, motivation letters, job interviews • Individual activity and other questions • Coping with cultural adjustment • Managing stress and anxiety • Emotional regulation • Learn to navigate bureaucracy • Understand local traditions • Gain skills and information for daily life • Residence permits in Lithuania • Job contracts and protection of employee rights • Rental agreements and other • Build a sense of belonging and connection • Navigate challenges and develop resilience • Gain peer support in a safe environment The International Organization for Migration (IOM) is part of the United Nations System as the leading inter-governmental organization promoting since 1951 humane and orderly migration for the benefit of all, with 175 member states and a presence in over 100 countries. IOM has had a presence in Lithuania since 1998. IOM has established a Migration Information Center in Lithuania that provides information and services to migrants to facilitate their integration. 0 800 22922 (free line) +370 525 14352 Jasinskio st. 16, II floor [https://www.google.com/maps/dir/54.7157558,25.266244/migration+information+center+lithuania/@54.7019533,25.2448175,14z/data=!3m1!4b1!4m9!4m8!1m1!4e1!1m5!1m1!1s0x46dd952291c16a0d:0x5ab716e3b0066f76!2m2!1d25.259492!2d54.6870158?entry=ttu&g_ep=EgoyMDI0MDkyNC4wIKXMDSoASAFQAw%3D%3D] ",
        'source': "https://micenter.lt/en"
    }
]

In [None]:
def is_url_saved(saved_urls: Union[Dict[str, str], List[Dict[str, str]]], url: str) -> Union[str, bool]:
    """
    Look up the placeholder label under which `url` was already stored.

    Args:
        saved_urls: Either a single ``{label: url}`` mapping or a list of such
            mappings (the notebook initialises it as a list).
        url: The URL to search for.

    Returns:
        The label of the first entry whose value equals `url`, or ``False``
        when the URL is not stored. Callers distinguish the two cases with
        ``isinstance(result, str)``.
    """
    # Normalise to a list of mappings so both supported shapes share one loop.
    mappings = [saved_urls] if isinstance(saved_urls, dict) else saved_urls
    for mapping in mappings:
        for saved_url_label, saved_url in mapping.items():
            if saved_url == url:
                return saved_url_label
    return False

def extract_and_replace_urls(text: str, last_url_number: int, saved_urls: List[Dict[str, str]], url_pattern: str = URL_PATTERN) -> Tuple[str, Dict[str, str]]:
    """
    Replace every URL found in `text` with a ``[URL_<n>]`` placeholder.

    URLs that `is_url_saved` already knows keep their stored label; unseen
    URLs are numbered consecutively starting at ``last_url_number + 1``.
    Neither `saved_urls` nor `last_url_number` is modified: the caller is
    expected to merge the returned map into its own bookkeeping.

    Args:
        text: Text to scan.
        last_url_number: Highest placeholder number handed out so far.
        saved_urls: Previously stored ``{placeholder: url}`` mappings.
        url_pattern: Regular expression passed to ``re.findall``.
            NOTE(review): assumes the pattern has at most one capture group,
            so every match is a plain string — confirm against URL_PATTERN.

    Returns:
        A tuple ``(new_text, url_map)`` where `url_map` maps each placeholder
        used in `new_text` to the URL it replaced.
    """
    # dict.fromkeys drops repeated URLs while keeping first-appearance order,
    # so a URL that occurs twice gets a single placeholder.
    urls = list(dict.fromkeys(re.findall(url_pattern, text)))
    url_map = {}
    next_number = last_url_number
    for url in urls:
        if not url:
            # An empty match would make str.replace insert the placeholder
            # between every character.
            continue
        saved_url_label = is_url_saved(saved_urls, url)
        if isinstance(saved_url_label, str):
            placeholder = saved_url_label
        else:
            next_number += 1
            placeholder = f"[URL_{next_number}]"
        url_map[placeholder] = url

    # Substitute the longest URLs first so a URL that is a prefix of another
    # one cannot corrupt the longer match.
    for placeholder, url in sorted(url_map.items(), key=lambda item: len(item[1]), reverse=True):
        text = text.replace(url, placeholder)
    return text, url_map

In [None]:
replaced_url_map = []  # one {placeholder: url} dict per processed element
saved_urls = []  # every {placeholder: url} mapping stored so far (also saved file)
last_url_number = 0
sites_elements_replaced = []  # copies of the elements with URLs swapped for placeholders

# sites_elements is a list of dicts, so iterate over the elements directly.
for element in sites_elements:
    replaced_text, url_map = extract_and_replace_urls(
        element['text'], last_url_number, saved_urls, URL_PATTERN
    )
    # Build new records instead of mutating the input so the cell can be re-run.
    sites_elements_replaced.append({**element, 'text': replaced_text})
    replaced_url_map.append(url_map)

    # Only placeholders not stored yet are new; they advance the counter.
    known_labels = {label for saved in saved_urls for label in saved}
    new_entries = {label: url for label, url in url_map.items() if label not in known_labels}
    if new_entries:
        saved_urls.append(new_entries)
        last_url_number += len(new_entries)

#### Chunking

In [23]:
def split_into_chunks(text, max_tokens=400, overlap=50):
    """
    Splits `text` into token-based chunks with overlap.

    The text is tokenized with the notebook-level `tokenizer`; consecutive
    chunks share `overlap` tokens. Each chunk is rebuilt from its tokens with
    `tokenizer.convert_tokens_to_string`, so the returned text is the
    tokenizer's rendering of the input rather than the original substring.

    Args:
        text: Text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Tokens shared between neighbouring chunks; must satisfy
            ``0 <= overlap < max_tokens``.

    Returns:
        A list of plain-text chunks (empty when `text` yields no tokens).

    Raises:
        ValueError: If `max_tokens` or `overlap` is out of range. An overlap
            of `max_tokens` or more would never advance the window.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    tokens = tokenizer.tokenize(text)
    step = max_tokens - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        end = start + max_tokens
        chunk_text = tokenizer.convert_tokens_to_string(tokens[start:end])
        chunks.append(chunk_text.strip())
        # Once a chunk reaches the last token, any further window would hold
        # only overlap tokens that are already covered.
        if end >= len(tokens):
            break
    return chunks

In [18]:
# Chunk the first sample element defined earlier in the notebook; the previous
# name `test_json` is not defined anywhere, so the cell failed on a fresh kernel.
chunks = split_into_chunks(sites_elements[0]['text'], max_tokens=MAX_TOKENS, overlap=OVERLAP)
print(len(chunks))
for c in chunks:
    print(f"{c}\n\n")

1
toll - free hotline 0 800 22922 consultations will not be available on 17 april • housing and declaration of residence • social security and healthcare • exchanging a driver ' s license and other questions • job search and navigation in the labor market • preparation of cv, motivation letters, job interviews • individual activity and other questions • coping with cultural adjustment • managing stress and anxiety • emotional regulation • learn to navigate bureaucracy • understand local traditions • gain skills and information for daily life • residence permits in lithuania • job contracts and protection of employee rights • rental agreements and other • build a sense of belonging and connection • navigate challenges and develop resilience • gain peer support in a safe environment the international organization for migration ( iom ) is part of the united nations system as the leading inter - governmental organization promoting since 1951 humane and orderly migration for the benefit of 