#### Importing Libraries

In [2]:
import sys; print(sys.executable)

/opt/conda/envs/chatbot/bin/python


In [1]:
# Python basics
from pprint import pprint
import re
from typing import List, Dict, Optional, Tuple
import json
import os
import numpy as np

# NLP & Embeddings
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

import spacy
from spacy.cli import download

# Load the small English spaCy pipeline into the module-level `nlp` object.
# If loading fails with OSError (the case handled here as "model not present"),
# download the model package once and retry the load.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

import faiss
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sanity check that the faiss import works; prints the value of faiss.get_num_gpus().
print("✅ FAISS working. GPU count:", faiss.get_num_gpus())

✅ FAISS working. GPU count: 2


#### Constants

In [3]:
from constants import (
    CLEANED_DATA_DIR, 
    SITES_ELEMENTS_JSON_PATH,
    URL_PATTERN, 
    REPLACED_URL_MAP_PATH, 
    SITES_ELEMENTS_WITH_REPLACED_URLS_PATH,
    TOKENIZING_MODEL,
    EMBEDDING_MODEL,
    MAX_TOKENS, 
    OVERLAP,
    FAISS_INDEX_PATH,
    FAISS_METADATA_PATH
)

#### Tokenizer & Embedding Model Setup

In [4]:
# Tokenizer named by TOKENIZING_MODEL; the chunking code below uses it to count tokens.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZING_MODEL)
# Embedding model named by EMBEDDING_MODEL, explicitly placed on the "cuda" device.
# NOTE(review): presumably this fails on a machine without CUDA — confirm before running elsewhere.
embedding_model = SentenceTransformer(EMBEDDING_MODEL, device="cuda")

#### Handle URLs

In [4]:
def is_url_saved(saved_urls: Dict[str, str], url: str):
    """Look up the placeholder label under which `url` is already stored.

    Args:
        saved_urls: Mapping of placeholder label -> original URL.
        url: URL to search for among the mapping's values.

    Returns:
        The first label (in the mapping's iteration order) whose stored value
        equals `url`, or ``False`` when no value matches.
    """
    matching_labels = (label for label, stored in saved_urls.items() if stored == url)
    return next(matching_labels, False)

def extract_and_replace_urls(
    text: str, last_url_number: int,
    replaced_url_map: Dict[str, str],
    url_pattern: str = URL_PATTERN
) -> Tuple[str, int]:
    """Replace bracketed URLs in `text` with stable `[URL_<n>]` placeholders.

    Every match of `url_pattern` is looked up in `replaced_url_map`; a URL that
    is already stored reuses its existing placeholder, otherwise a new
    placeholder is minted and recorded. Only occurrences written as `[<url>]`
    (the URL wrapped in square brackets) are substituted in the text.

    Args:
        text: Text to process.
        last_url_number: Next candidate number for a new placeholder.
        replaced_url_map: Mapping of placeholder -> original URL. It is
            mutated in place when new URLs are encountered.
        url_pattern: Regular expression passed to `re.findall` to locate URLs.

    Returns:
        A `(text, last_url_number)` tuple: the text with placeholders
        substituted and the updated counter to pass to the next call.
    """
    urls = re.findall(url_pattern, text)
    for url in urls:
        saved_url_label = is_url_saved(replaced_url_map, url)
        if isinstance(saved_url_label, str):
            placeholder = saved_url_label
        else:
            # Never reuse a placeholder that is already a key of the map (e.g. a
            # map loaded from disk while the counter restarted at 0): that would
            # silently overwrite the URL stored under it.
            while f"[URL_{last_url_number}]" in replaced_url_map:
                last_url_number += 1
            placeholder = f"[URL_{last_url_number}]"
            last_url_number += 1
            replaced_url_map[placeholder] = url

        text = text.replace(f"[{url}]", placeholder)
    return text, last_url_number

In [5]:
sites_elements_with_replaced_urls = []

# Reuse the placeholder -> URL map from a previous run when it exists, so that
# URLs seen before keep their placeholder.
if os.path.exists(REPLACED_URL_MAP_PATH):
    with open(REPLACED_URL_MAP_PATH, 'r', encoding='utf-8') as file_m:
        replaced_url_map = json.load(file_m)
else:
    replaced_url_map = {}

# Resume numbering after the highest "[URL_<n>]" label already in the map.
# Starting from 0 with a non-empty map would hand out labels that are already
# taken and overwrite the URLs stored under them.
used_url_numbers = []
for saved_label in replaced_url_map:
    label_match = re.fullmatch(r"\[URL_(\d+)\]", saved_label)
    if label_match:
        used_url_numbers.append(int(label_match.group(1)))
last_url_number = max(used_url_numbers, default=-1) + 1

if os.path.exists(SITES_ELEMENTS_JSON_PATH):
    with open(SITES_ELEMENTS_JSON_PATH, 'r', encoding='utf-8') as file_s:
        sites_elements = json.load(file_s)
else:
    sites_elements = []

for site_element in sites_elements:
    # The counter is threaded through the calls so placeholders stay unique
    # across all site elements.
    text_with_replaced_urls, last_url_number = extract_and_replace_urls(
        site_element['text'], last_url_number, replaced_url_map, URL_PATTERN
    )
    site_element_with_replaced_urls = {
        'text': text_with_replaced_urls,
        'source': site_element['source']
    }
    sites_elements_with_replaced_urls.append(site_element_with_replaced_urls)

# Persist both the rewritten elements and the (possibly extended) URL map.
with open(SITES_ELEMENTS_WITH_REPLACED_URLS_PATH, "w", encoding="utf-8") as f:
    json.dump(sites_elements_with_replaced_urls, f, ensure_ascii=False, indent=4)

with open(REPLACED_URL_MAP_PATH, "w", encoding="utf-8") as f:
    json.dump(replaced_url_map, f, ensure_ascii=False, indent=4)

#### Chunking and Tokenizing

In [None]:
def sentence_aware_chunks(
    text: str,
    max_tokens: int = 400,
    overlap: int = 50,
    *,
    nlp_pipeline=None,
    chunk_tokenizer=None,
) -> List[str]:
    """
    Splits `text` into sentence-aware chunks targeting at most `max_tokens` tokens.

    Sentences are accumulated into a chunk until adding the next one (plus the
    `overlap` budget) would exceed `max_tokens`; the next chunk then starts with
    the last `overlap` tokens of the previous one. A single sentence that does
    not fit is split with a sliding token window of size `max_tokens` and
    stride `max_tokens - overlap`. Chunks are returned in document order.

    Overlap text and sliding-window chunks are produced with the tokenizer's
    `decode`, so they are the tokenizer's rendering of the tokens rather than a
    verbatim slice of `text`. Because the overlap is decoded and re-encoded, a
    chunk can end up slightly above `max_tokens`; a warning is printed then.

    Args:
        text: Text to split.
        max_tokens: Target maximum number of tokens per chunk. Must be > 0.
        overlap: Number of tokens shared between consecutive chunks.
            Must satisfy 0 <= overlap < max_tokens.
        nlp_pipeline: Callable returning an object with a `.sents` iterable of
            items exposing `.text`. Defaults to the module-level `nlp`.
        chunk_tokenizer: Object with `encode(text, add_special_tokens=False)`
            and `decode(ids)`. Defaults to the module-level `tokenizer`.

    Returns:
        List[str]: List of plain-text chunks.

    Raises:
        ValueError: If `max_tokens` or `overlap` is out of range.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if overlap < 0 or overlap >= max_tokens:
        # A stride of max_tokens - overlap <= 0 cannot advance the sliding
        # window; a negative overlap would leave gaps between windows.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got overlap={overlap}, max_tokens={max_tokens}"
        )

    # Fall back to the notebook-level models only when none were injected.
    if nlp_pipeline is None:
        nlp_pipeline = nlp
    if chunk_tokenizer is None:
        chunk_tokenizer = tokenizer

    sentences = [sent.text for sent in nlp_pipeline(text).sents]
    chunks: List[str] = []
    current_chunk = ""
    current_tokens = 0
    stride = max_tokens - overlap

    for sent in sentences:
        sent_token_ids = chunk_tokenizer.encode(sent, add_special_tokens=False)
        sent_tokens = len(sent_token_ids)

        if sent_tokens + overlap > max_tokens:
            # Flush what was accumulated so far first, otherwise the windows of
            # this sentence would be emitted before the text that precedes it
            # and the pending chunk would later absorb text that follows it.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = ""
            current_tokens = 0

            # Use sliding window with overlap
            for start in range(0, sent_tokens, stride):
                chunk_ids = sent_token_ids[start:start + max_tokens]
                chunks.append(chunk_tokenizer.decode(chunk_ids).strip())
                if start + max_tokens >= sent_tokens:
                    # This window already reached the end of the sentence; any
                    # further window would only repeat its tail.
                    break
            continue  # skip adding to current_chunk

        if current_tokens + sent_tokens + overlap > max_tokens:
            if current_chunk.strip():
                chunks.append(current_chunk.strip())

            # Overlap part: seed the next chunk with the tail of the previous one.
            current_chunk_tokens = chunk_tokenizer.encode(current_chunk, add_special_tokens=False)
            # `[-0:]` would select the whole list, hence the explicit guard.
            overlap_chunk_tokens = current_chunk_tokens[-overlap:] if overlap > 0 else []
            overlap_chunk = chunk_tokenizer.decode(overlap_chunk_tokens)
            current_chunk = overlap_chunk + " " + sent
            # Re-count: the decoded overlap may tokenize differently.
            current_tokens = len(chunk_tokenizer.encode(current_chunk, add_special_tokens=False))
            if current_tokens > max_tokens:
                print(f"⚠️ current_tokens is too long: {current_tokens} tokens")
        else:
            current_chunk += " " + sent
            current_tokens += sent_tokens

    # Append last chunk
    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

In [6]:
# The chunker needs the URL-replaced elements written earlier; stop if they are missing.
if not os.path.exists(SITES_ELEMENTS_WITH_REPLACED_URLS_PATH):
    raise ValueError(f"Failed to open {SITES_ELEMENTS_WITH_REPLACED_URLS_PATH}")

with open(SITES_ELEMENTS_WITH_REPLACED_URLS_PATH, 'r', encoding='utf-8') as file_s:
    sites_elements = json.load(file_s)

# One source text may yield several chunks; each chunk record keeps its source.
sites_chunks = []
for site_element in sites_elements:
    sites_chunks.extend(
        {"text": chunk_text, "source": site_element["source"]}
        for chunk_text in sentence_aware_chunks(
            site_element['text'], max_tokens=MAX_TOKENS, overlap=OVERLAP
        )
    )

Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors


#### Embedding Vectors

In [8]:
# Extract texts for embeddings
texts = [item["text"] for item in sites_chunks]
metadata = sites_chunks  # same order as `texts`: row i of the index describes metadata[i]

# With no chunks there is nothing to index, and `embeddings.shape[1]` below
# would fail with an opaque IndexError; fail early with a clear message.
if not texts:
    raise ValueError("No chunks to embed: `sites_chunks` is empty, so there is nothing to index.")

# Generate embeddings
embeddings = embedding_model.encode(texts, convert_to_numpy=True)
# FAISS expects a C-contiguous float32 matrix of shape (n_vectors, dimension).
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Store in FAISS
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
assert index.ntotal == len(metadata), "FAISS index and metadata are out of sync"
faiss.write_index(index, FAISS_INDEX_PATH)

# Save metadata
with open(FAISS_METADATA_PATH, "wb") as f:
    pickle.dump(metadata, f)

In [16]:
# Spot check: find the chunks that contain a known passage verbatim.
# Alternative probe passage:
# query = "all the questions in the form, and submit the declaration online. You can also fill out a paper declaration and submit it at your"
query = "Each month, your employer takes out an estimated portion of your salary for taxes. At the end of the year, this advance is adjusted"

# Plain substring test against every chunk text.
matches = list(filter(lambda chunk_text: query in chunk_text, texts))

In [17]:
# Display the first chunk that contains the probe passage.
matches[0]

'## How to receive a tax refund?\n Each month, your employer takes out an estimated portion of your salary for taxes. At the end of the year, this advance is adjusted based on your actual tax liability. If you’ve paid more in taxes than you owe, you’ll get a refund.\xa0\xa0\n After completing your income declaration, you will be able to see if you have overpaid your taxes. Refunds are only possible from personal income tax (GPM). If you have overpaid your taxes, you can request a refund of the excess amount.'

In [15]:
# Number of chunks containing the probe passage.
len(matches)

1

## Generating Answers

In [None]:
# Load everything
index = faiss.read_index(FAISS_INDEX_PATH)
# SECURITY: pickle.load can execute arbitrary code from the file. Only load a
# metadata file that this notebook produced itself, never one from an untrusted source.
with open(FAISS_METADATA_PATH, "rb") as f:
    metadata = pickle.load(f)

# Search query
query = "How do I pay taxes in Lithuania?"
query_embedding = embedding_model.encode([query])

# Search top 3 results.
# D holds the distances and I the row indices into `metadata`, one row per query.
# IndexFlatL2 reports squared Euclidean distances (smaller = closer).
D, I = index.search(query_embedding, k=3)

for rank, (i, distance) in enumerate(zip(I[0], D[0]), start=1):
    if i < 0:
        # FAISS pads the result with -1 when the index holds fewer than k
        # vectors; metadata[-1] would silently show the last chunk instead.
        continue
    print(f"Result {rank}:")
    print("🔹 Chunk:", metadata[i]["text"])
    print("🔗 Source:", metadata[i]["source"])
    print("📏 Distance:", distance)
    print()


Result 1:
🔹 Chunk: to worry about paying taxes directly — your employer handles this on your behalf. however, if you are self - employed, you are responsible for paying taxes yourself. # # other taxes corporate income tax ( known as pelno mokestis ) • Rate: 16% on the taxable profits.  
 • Contributors: Lithuanian entities and foreign entities' permanent establishments. 
 • Exemptions: Entities with fewer than 10 employees and income under €300,000 are taxed at 0% during their first tax period.  
 Value Added Tax - VAT (knows as pridėtinės vertės mokestnis - PVM) 
• Rate: Standard rate is 21%. Reduced rates of 0%, 5%, and 9% apply to certain goods and services. 
 • Application: Applies to most goods and services bought and sold for use or consumption.  
 Property Tax (known as nekilnojamo turto mokestis) 
• Rate: 0.5% (for property valued €150,000- €300,000); 1% (for property valued €300,000 - €500,000); 2% (for property valued over €500,000). 
 • Contributors:  natural and legal perso