#### Importing Libraries

In [85]:
# Python basics
from pprint import pprint
import re
from typing import List, Dict, Optional, Tuple
import json
import os

# NLP & Embeddings
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

import nltk       
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/gerzem1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Constants

In [2]:
from constants import (
    CLEANED_DATA_DIR, 
    SITES_ELEMENTS_JSON_PATH,
    URL_PATTERN, 
    REPLACED_URL_MAP_PATH, 
    SITES_ELEMENTS_WITH_REPLACED_URLS_PATH,
    MAX_TOKENS, 
    OVERLAP
)

#### Tokenizer & Embedding Model Setup

In [38]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda")

#### Handle URLs

In [32]:
def is_url_saved(saved_urls: Dict[str, str], url: str):
    """Look up the placeholder label under which `url` is already stored.

    `saved_urls` maps placeholder label -> original URL, so this is a
    reverse (value -> key) lookup performed in insertion order.

    Returns the first label whose stored URL equals `url`, or `False`
    when no entry matches.
    """
    matching_labels = (
        label for label, stored_url in saved_urls.items() if stored_url == url
    )
    return next(matching_labels, False)

def extract_and_replace_urls(
    text: str, last_url_number: int,
    replaced_url_map: Dict[str, str],
    url_pattern: str = URL_PATTERN
) -> Tuple[str, int]:
    """Replace bracketed URLs in `text` with `[URL_n]` placeholders.

    Every match of `url_pattern` found in `text` is looked up in
    `replaced_url_map` (placeholder -> URL). A URL that is already stored
    reuses its existing placeholder; a new URL gets the next free
    `[URL_n]` label and is recorded in `replaced_url_map`, which is
    mutated in place.

    Only occurrences of the URL wrapped in square brackets (`[<url>]`) are
    substituted in the text.

    Args:
        text: Text to scan.
        last_url_number: Next candidate number for a new placeholder.
        replaced_url_map: Placeholder -> URL mapping, updated in place.
        url_pattern: Regular expression passed to `re.findall`.
            NOTE(review): assumes the pattern yields plain strings (at most
            one capture group) -- confirm against `URL_PATTERN`.

    Returns:
        The rewritten text and the updated next placeholder number.
    """
    for url in re.findall(url_pattern, text):
        saved_url_label = is_url_saved(replaced_url_map, url)
        if isinstance(saved_url_label, str):
            placeholder = saved_url_label
        else:
            # Never reuse a label that is already taken (e.g. the map was
            # loaded from disk but the counter was restarted), otherwise an
            # existing placeholder would silently point to a different URL.
            while f"[URL_{last_url_number}]" in replaced_url_map:
                last_url_number += 1
            placeholder = f"[URL_{last_url_number}]"
            last_url_number += 1
            replaced_url_map[placeholder] = url

        text = text.replace(f"[{url}]", placeholder)
    return text, last_url_number

In [150]:
# Replace the URLs of every scraped site element with placeholders and
# persist both the rewritten elements and the placeholder -> URL map.
sites_elements_with_replaced_urls = []

# Reuse the map from a previous run so known URLs keep their placeholder.
if os.path.exists(REPLACED_URL_MAP_PATH):
    with open(REPLACED_URL_MAP_PATH, 'r', encoding='utf-8') as file_m:
        replaced_url_map = json.load(file_m)
else:
    replaced_url_map = {}

# Continue numbering after the highest placeholder already stored; starting
# again at 0 would overwrite existing map entries with different URLs.
last_url_number = 0
for saved_label in replaced_url_map:
    label_match = re.fullmatch(r"\[URL_(\d+)\]", saved_label)
    if label_match:
        last_url_number = max(last_url_number, int(label_match.group(1)) + 1)

if os.path.exists(SITES_ELEMENTS_JSON_PATH):
    with open(SITES_ELEMENTS_JSON_PATH, 'r', encoding='utf-8') as file_s:
        sites_elements = json.load(file_s)
else:
    # No input file: fall through with nothing to process.
    sites_elements = []

for site_element in sites_elements:
    text_with_replaced_urls, last_url_number = extract_and_replace_urls(
        site_element['text'], last_url_number, replaced_url_map, URL_PATTERN
    )
    site_element_with_replaced_urls = {
        'text': text_with_replaced_urls,
        'source': site_element['source']
    }
    sites_elements_with_replaced_urls.append(site_element_with_replaced_urls)

with open(SITES_ELEMENTS_WITH_REPLACED_URLS_PATH, "w", encoding="utf-8") as f:
    json.dump(sites_elements_with_replaced_urls, f, ensure_ascii=False, indent=4)

with open(REPLACED_URL_MAP_PATH, "w", encoding="utf-8") as f:
    json.dump(replaced_url_map, f, ensure_ascii=False, indent=4)

#### Chunking

In [151]:
def tokenize(text: str, max_tokens: Optional[int] = 400, overlap: Optional[int] = 50):
    """
    Split `text` into token-based chunks with overlap and tokenize it.

    The text is encoded once with the module-level `tokenizer`; a window of
    at most `max_tokens` tokens is then slid over the token sequence,
    advancing by `max_tokens - overlap` tokens each step. The character
    offsets of the first and last token of a window are used to slice the
    chunk out of the original text, so chunk text is never re-decoded.

    Args:
        text: Text to chunk.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must be
            smaller than `max_tokens`.

    Returns:
        A tuple `(chunks, tokens)`: `chunks` is a list of plain-text chunks
        and `tokens[i]` is the list of token strings of `chunks[i]`.
        Both lists are empty when the text produces no tokens.

    Raises:
        ValueError: If `max_tokens` is not positive or `overlap` is not
            smaller than `max_tokens` (the window would never advance).
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    stride = max_tokens - overlap
    if stride <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_tokens ({max_tokens})"
        )

    encoding = tokenizer(
        text,
        # tells the tokenizer to return the start and end character positions
        # of each token relative to the original text.
        return_offsets_mapping=True,

        # disables automatic insertion of tokens like [CLS], [SEP], etc.,
        # which can interfere with chunking.
        add_special_tokens=False,

        # The whole text must be encoded: truncating here would silently drop
        # everything past the model's maximum length before it is chunked.
        truncation=False,
        # silences the "sequence length is longer than the maximum" warning
        # that is expected when encoding an un-truncated long text.
        verbose=False,
    )

    # a list of integers representing the tokens.
    input_ids = encoding["input_ids"]
    if not input_ids:
        return [], []

    tokens_list = tokenizer.convert_ids_to_tokens(input_ids)

    # offsets is a list of tuples: each tuple is (start_char_index, end_char_index)
    # — where each token lives in the original text.
    offsets = encoding["offset_mapping"]

    total_tokens = len(input_ids)
    chunks = []
    tokens = []
    start_token = 0
    while start_token < total_tokens:
        end_token = min(start_token + max_tokens, total_tokens)

        chunk_start_char = offsets[start_token][0]
        chunk_end_char = offsets[end_token - 1][1]

        chunks.append(text[chunk_start_char:chunk_end_char])
        tokens.append(tokens_list[start_token:end_token])

        # This window already reached the last token; another step would
        # only emit a chunk fully contained in the one just produced.
        if end_token == total_tokens:
            break
        start_token += stride

    return chunks, tokens

In [152]:
# sites_elements = [
#     {
#         'text': 'This is some text with a link [URL_0] and more explanation. Another [URL_1] appears.',
#         'source': ""
#     },
#     {
#         'text': 'This is some text with a link [URL_0] and more explanation. Another [URL_1] appears.',
#         'source': ""
#     }
# ]

In [153]:
# Load the URL-replaced site elements and chunk the text of each one.
if not os.path.exists(SITES_ELEMENTS_WITH_REPLACED_URLS_PATH):
    raise ValueError(f"Failed to open {SITES_ELEMENTS_WITH_REPLACED_URLS_PATH}")

with open(SITES_ELEMENTS_WITH_REPLACED_URLS_PATH, 'r', encoding='utf-8') as file_s:
    sites_elements = json.load(file_s)

chunk_list = []
token_list = []
for site_element in sites_elements:
    chunks, tokens = tokenize(
        site_element['text'],
        max_tokens=MAX_TOKENS,
        overlap=OVERLAP,
    )
    # chunk_list is flat: one entry per chunk across all site elements.
    chunk_list += chunks
    # token_list gets one entry per site element, holding that element's
    # per-chunk token lists.
    # NOTE(review): this makes token_list nested while chunk_list is flat, so
    # the two are not index-aligned -- confirm whether `extend` was intended.
    token_list.append(tokens)