### Import libraries

In [1]:
# Core Python
import os
import json
import numpy as np
from ipynbname import path as nb_path
from pathlib import Path
from typing import TypedDict, List, Optional, Dict, Any
from dataclasses import dataclass, field
from typing import List
from pprint import pprint

# HTML Cleanup
from bs4 import BeautifulSoup, NavigableString
from bs4.element import Tag

# Json cleanup
from difflib import SequenceMatcher
import re

### Constants

In [2]:
from constants import (
    CHATBOT_SERVICE_ROOT,
    RAW_DATA_DIR,
    CLEANED_DATA_DIR, 
    SITES_DIRS,
    URL_MAP_FILENAME,
    SITE_ELEMENTS_JSON_FILENAME,
    SITES_ELEMENTS_JSON_PATH,
    SAVED_TEXTS_PATH,
    MERGED_SITES_PATH,
    ALLOWED_TAGS,
    SIMILAR_TEXT_THRESHOLD
)

### Cleaning HTMLs

#### HTML File Proccessing

In [3]:
# Helpers
def remove_links(text: str) -> str:
        # Removes [anything inside brackets]
        return re.sub(r"\[[^\[\]]*\]", "", text)

def normalize_for_comparison(text: str) -> str:
    text_no_links = remove_links(text)
    text_cleaned = text_no_links.lower().strip().replace('\xa0', ' ')
    return ' '.join(text_cleaned.split())


In [4]:
class HTMLCleaner:
    '''
    Cleans single HTML file
    '''
    def __init__(
        self,
        html_path: str,
        config: Optional[Dict[str, Any]] = None
    ):
        config = config or {}
        self.html_path: str = html_path
        self.allowed_tags = config.get("allowed_tags", [])
        self.debug: bool = config.get("debug", False)
        self.indent_for_saving_json = config.get("indent_for_saving_json", 4)
        self.SIMILAR_TEXT_THRESHOLD = config.get("SIMILAR_TEXT_THRESHOLD", 0.85)

    def load_html(self):
        with open(self.html_path, "r", encoding="utf-8") as f:
            self.soup = BeautifulSoup(f, "html.parser")
            
    def get_raw_html(self) -> BeautifulSoup:
        self.load_html()
        return self.soup

    def clean_html(self) -> BeautifulSoup:
        """
        Remove all tags except given in 'allowed_tags'
        Handle <a> only when inside a <p>: inline its text + href.
        Remove empty layout <div> but unwrap any <div> that has real text.
        """
        # ensure we have a parsed soup
        if not hasattr(self, "soup"):
            self.load_html()

        # First handle all <a> tags
        for a in list(self.soup.find_all("a")):
            # only keep if anchor is inside a paragraph
            if a.find_parent("p") or a.find_parent("li") or a.find_parent("ol") or a.find_parent("ul"):
                href = a.get("href", "").strip()
                text = a.get_text(strip=False)
                if text == "":
                    a.decompose()
                    continue
                
                # replace <a> with "text [href]"
                replacement = f"{text} [{href}]" if href else text
                a.replace_with(NavigableString(replacement))
            else:
                # drop menu or navigation links
                a.decompose()

        # Next process all other tags
        for tag in list(self.soup.find_all(True)):
            # guard against non‐Tags or tags with no name
            if not isinstance(tag, Tag) or tag.name is None:
                continue            
            
            name = tag.name.lower()
            if name in self.allowed_tags and not tag.get_text(strip=False) and not tag.find_all("br"):
                tag.decompose()
                continue
            if name in self.allowed_tags:
                continue

            if name == "div":
                # if there's no user‑visible text, drop it entirely
                if not tag.get_text(strip=False):
                    tag.decompose()
                else:
                    # unwrap to keep its children
                    tag.unwrap()
            else:
                # for any other disallowed tag, just unwrap it
                tag.unwrap()

        return self.soup

    def get_clean_soup(self) -> BeautifulSoup:
        """Load, clean, and return the BeautifulSoup tree in one call."""
        if not hasattr(self, "soup"):
            self.load_html()
        return self.clean_html()

    def convert_list_item_into_clean_text(self, li: Tag) -> str:
        # Convert <li> content into clean plain text
        return "• " + li.get_text(separator=" ", strip=False)

    def convert_br_into_clean_text(self, br: Tag) -> str:
        return "\n"

    def convert_p_into_clean_text(self, tag: Tag) -> str:
        return f"{tag.get_text(strip=False)}\n"
    
    def convert_header_into_clean_text(self, tag: Tag) -> str:
        name = tag.name.lower()
        text = tag.get_text(strip=False)
        
        header_level = int(name[1]) # Assuming that header number is single digit
        prefix = "#" * header_level
        return f"{prefix} {tag.get_text(strip=True)}\n"
        
    def convert_soup_to_clean_text(self, saved_texts: Optional[List[str]] = None) -> str:
        saved_texts = saved_texts if saved_texts is not None else []
        clean_soup = self.get_clean_soup()
        merged_text = ""
        
        tags = clean_soup.find_all(True)
        for tag in tags:
            name = tag.name.lower()

            # Potential text to be merged
            new_text = ""
            
            if name == "li":                
                new_text = self.convert_list_item_into_clean_text(tag) + "\n"

            elif name.startswith("h") and name[1:].isdigit():
                new_text = self.convert_header_into_clean_text(tag)

            elif name == 'p':
                new_text = self.convert_p_into_clean_text(tag)

            elif name == 'br':
                new_text = self.convert_br_into_clean_text(tag)

            if not new_text:
                continue

            is_new_el_different = True
            for saved_text in saved_texts:
                if abs(len(new_text) - len(saved_text)) > 500:
                    continue
                
                normalized_saved = normalize_for_comparison(saved_text)
                normalized_new = normalize_for_comparison(new_text)

                if abs(len(normalized_new) - len(normalized_saved)) > 20:
                    continue
                
                if SequenceMatcher(None, normalized_saved, normalized_new).ratio() > self.SIMILAR_TEXT_THRESHOLD:
                    is_new_el_different = False
                    break
            if is_new_el_different:
                merged_text += new_text
                saved_texts.append(new_text)
                with open(SAVED_TEXTS_PATH, "w", encoding="utf-8") as f:
                    json.dump(saved_texts, f, ensure_ascii=False, indent=4)
                            
        return merged_text


In [None]:
from pprint import pprint

config_html_cleaner = {
    "allowed_tags": ALLOWED_TAGS,
    "debug": True,
    "indent_for_saving_json": 4
}

html_filename = os.path.join(RAW_DATA_DIR, 'micenter_site', 'micenter.lt_en_change-of-employer.html')
cleaner = HTMLCleaner(html_filename, config_html_cleaner)

base, _ = os.path.splitext(os.path.join(CLEANED_DATA_DIR, 'micenter_site', 'micenter.lt_en_change-of-employer.html'))
saving_path = base + ".json"
print(saving_path)

text = cleaner.convert_soup_to_clean_text()
pprint(text)

/home/gerzem1/PDP/DCIP/migrant-info-platform/backend/chatbot_service/data/cleaned/micenter_site/micenter.lt_en_change-of-employer.json
('Toll-free hotline\n'
 '0 800 22922 \n'
 'Consultations will not \n'
 'be available on 17 April \n'
 '\n'
 '\n'
 'If you’re a foreigner working in Lithuania with a residence permit and want '
 'to change employers, in most cased you’ll need to get permission '
 '[https://www.migracija.lt/changing-employer] first. Your new employer will '
 'need to send a mediation letter '
 '[https://www.migracija.lt/app/form-wizard/mediation-letter] to the Migration '
 'Department, and you’ll need to submit a request for approval. You can start '
 'your new job only after receiving this permission.\n'
 '## How to change employer in Lithuania?\n'
 'For Non-Highly Skilled Occupations: If your residence permit isn’t for a '
 'high-skilled job [/en/blue-card], you’ll need permission to change your '
 'employer. You can apply for this change no sooner than 6 months after y

#### Cleaning of All HTMLs

## Merging All Data into a Single Json File

In [5]:
def make_text_source_entry(text: str = "", source: str = "") -> dict[str, str]:
    return {
        "text": text,
        "source": source
    }

class AllCleanedSitesData:
    """
    Cleaned data of all scraped sites in the same file.
    """
    def __init__(
        self, 
        sites_dirs: List[str],
        saved_texts: List[str],
        config: Dict[str, Any]
    ):
        if not isinstance(sites_dirs, list):
            raise TypeError("sites_dirs must be a list of strings.")
        self.sites_dirs: List[str] = sites_dirs

        REQUIRED_KEYS: Dict[str, type] = {
            "ALLOWED_TAGS": (list, set),
            "RAW_DATA_DIR": str,
            "CLEANED_DATA_DIR": str,
            "URL_MAP_FILENAME": str,
            "SITE_ELEMENTS_JSON_FILENAME": str,
            "SITES_ELEMENTS_JSON_PATH": str,
            "SIMILAR_TEXT_THRESHOLD": float
        }
        
        for key, expected_type in REQUIRED_KEYS.items():
            if key not in config:
                raise ValueError(f"Missing required config key: {key}")
            if not isinstance(config[key], expected_type):
                raise TypeError(
                    f"Config key '{key}' must be of type {expected_type.__name__}, "
                    f"got {type(config[key]).__name__}"
                )

        self.ALLOWED_TAGS = config["ALLOWED_TAGS"]
        self.RAW_DATA_DIR = config["RAW_DATA_DIR"]
        self.CLEANED_DATA_DIR = config["CLEANED_DATA_DIR"]
        self.URL_MAP_FILENAME = config["URL_MAP_FILENAME"]
        self.SITE_ELEMENTS_JSON_FILENAME = config["SITE_ELEMENTS_JSON_FILENAME"]
        self.SITES_ELEMENTS_JSON_PATH = config["SITES_ELEMENTS_JSON_PATH"]
        self.SIMILAR_TEXT_THRESHOLD = config["SIMILAR_TEXT_THRESHOLD"]

        self.html_cleaner_config = {
            "allowed_tags": self.ALLOWED_TAGS,
            "SIMILAR_TEXT_THRESHOLD": self.SIMILAR_TEXT_THRESHOLD,
            "debug": config.get("debug", False)
        }

        self.saved_texts = saved_texts # text chunks from every file

    def merge_cleaned_site_jsons(self, site_dir: str, use_existing_site_data: bool, save_site: bool) -> List[Dict[str, str]]:
        url_map_path = os.path.join(self.RAW_DATA_DIR, site_dir, self.URL_MAP_FILENAME)
        with open(url_map_path, 'r', encoding='utf-8') as file:
            url_map = json.load(file)

        data_path = os.path.join(self.CLEANED_DATA_DIR, site_dir, self.SITE_ELEMENTS_JSON_FILENAME)
        if use_existing_site_data and os.path.isfile(data_path):
            with open(data_path, "r") as f:
                site_elements_in_json = json.load(f)
        else:
            site_elements_in_json = []
            
        total_files = len(url_map)
        last_printed_percent = -10  # to ensure 0% prints immediately
        print(f"Processing: {site_dir}")
        
        for i, (html_filename, page_link) in enumerate(url_map.items(), start=1):
            print(f"[{site_dir}] Processing file {i}/{total_files}: {html_filename}")
            percent_complete = int((i / total_files) * 100)
            if percent_complete >= last_printed_percent + 10:
                print(f"Progress: {percent_complete}% ({i}/{total_files})")
                last_printed_percent = percent_complete

            path_with_base, extension = os.path.splitext(os.path.join(self.CLEANED_DATA_DIR, site_dir, html_filename))
            if extension != ".html":
                continue       
            
            html_path = os.path.join(self.RAW_DATA_DIR, site_dir, html_filename)
            if not os.path.isfile(html_path):
                print(f"Warning: File not found: {html_path}")
                continue
                
            if len(site_elements_in_json) > 0:
                resource_found = any(entry["source"] == page_link for entry in site_elements_in_json)
                if resource_found:
                    print(f"[{site_dir}] file {html_filename} was already processed. Going to the next one.")
                    continue
            
            html_cleaner = HTMLCleaner(html_path=html_path, config=self.html_cleaner_config)
            merged_clean_text = html_cleaner.convert_soup_to_clean_text(self.saved_texts)
            text_source_entry = make_text_source_entry(text=merged_clean_text, source=page_link)
            site_elements_in_json.append(text_source_entry)

            if not save_site:
                continue
            with open(data_path, "w", encoding="utf-8") as f:
                json.dump(site_elements_in_json, f, indent=4, ensure_ascii=False)
        
        return site_elements_in_json

    def site_elements_file_exists(self, site_dir: str) -> bool:
        filepath = os.path.join(self.CLEANED_DATA_DIR, site_dir, self.SITE_ELEMENTS_JSON_FILENAME)        
        return os.path.exists(filepath)
    
    def convert_sites_data_to_json_list(
        self, 
        merged_data: List[Dict[str, str]],
        merged_sites: List[str],
        append_to_existing: bool, 
        use_site_elements_file_if_exists: bool,
        save: bool = True
    ) -> List[Dict[str, str]]:   
        """
        Merges all sites data to single json list
        """
        merged_data = merged_data if append_to_existing else []

        for site_dir in self.sites_dirs:
            if site_dir in merged_sites:
                continue
            
            if use_site_elements_file_if_exists and self.site_elements_file_exists(site_dir):
                # use that file           
                site_data = self.merge_cleaned_site_jsons(site_dir=site_dir,
                                                          use_existing_site_data=True,
                                                          save_site=save)
            else:
                site_data = self.merge_cleaned_site_jsons(site_dir=site_dir,
                                                          use_existing_site_data=False,
                                                          save_site=save)
                
            merged_data.extend(site_data)
            merged_sites.append(site_dir)
            print(f"Processed: {site_dir}")

        return merged_data

In [6]:
config_all_sites_data = {
    "ALLOWED_TAGS": ALLOWED_TAGS,
    "RAW_DATA_DIR": RAW_DATA_DIR,
    "CLEANED_DATA_DIR": CLEANED_DATA_DIR,
    "URL_MAP_FILENAME": URL_MAP_FILENAME,
    "SITE_ELEMENTS_JSON_FILENAME": SITE_ELEMENTS_JSON_FILENAME,
    "SITES_ELEMENTS_JSON_PATH": SITES_ELEMENTS_JSON_PATH,
    "SIMILAR_TEXT_THRESHOLD": SIMILAR_TEXT_THRESHOLD
}

SAVED_TEXTS_PATH = os.path.join(CLEANED_DATA_DIR, 'saved_texts.json')
if os.path.exists(SAVED_TEXTS_PATH):
    with open(SAVED_TEXTS_PATH, 'r', encoding='utf-8') as file:
        saved_texts = json.load(file)
else:
    saved_texts = []

if os.path.exists(MERGED_SITES_PATH):
    with open(MERGED_SITES_PATH, 'r', encoding='utf-8') as file_m:
        merged_sites = json.load(file_m)
else:
    merged_sites = []

if os.path.exists(SITES_ELEMENTS_JSON_PATH):
    with open(SITES_ELEMENTS_JSON_PATH, 'r', encoding='utf-8') as file_s:
        sites_data = json.load(file_s)
else:
    sites_data = []
    
all_sites_data_handler = AllCleanedSitesData(sites_dirs=SITES_DIRS,
                                             saved_texts=saved_texts,
                                             config=config_all_sites_data)

sites_data = all_sites_data_handler.convert_sites_data_to_json_list(merged_data=sites_data,
                                                                    merged_sites=merged_sites, 
                                                                    append_to_existing=True,
                                                                    use_site_elements_file_if_exists=True,
                                                                    save=True)

with open(SAVED_TEXTS_PATH, "w", encoding="utf-8") as f: # TODO: go to every site_dir, get each text and append it to save_texts
    json.dump(saved_texts, f, ensure_ascii=False, indent=4)
with open(MERGED_SITES_PATH, "w", encoding="utf-8") as f:
    json.dump(merged_sites, f, ensure_ascii=False, indent=4)

print(f"✅ Elements from all cleaned sites merged and saved to {SITES_ELEMENTS_JSON_PATH}")

Processing: vmi_site
[vmi_site] Processing file 1/1781: www.vmi.lt_evmi.html
Progress: 0% (1/1781)
[vmi_site] file www.vmi.lt_evmi.html was already processed. Going to the next one.
[vmi_site] Processing file 2/1781: www.vmi.lt_evmi_svetaines-medis.html
[vmi_site] file www.vmi.lt_evmi_svetaines-medis.html was already processed. Going to the next one.
[vmi_site] Processing file 3/1781: www.vmi.lt_evmi_informacija-gestu-kalba.html
[vmi_site] file www.vmi.lt_evmi_informacija-gestu-kalba.html was already processed. Going to the next one.
[vmi_site] Processing file 4/1781: www.vmi.lt_evmi_lengvai-suprantama-kalba.html
[vmi_site] file www.vmi.lt_evmi_lengvai-suprantama-kalba.html was already processed. Going to the next one.
[vmi_site] Processing file 5/1781: sso.vmi.lt_sso_login.html
[vmi_site] file sso.vmi.lt_sso_login.html was already processed. Going to the next one.
[vmi_site] Processing file 6/1781: www.vmi.lt_evmi_mokesciu-moketoju-informacija.html
[vmi_site] file www.vmi.lt_evmi_moke


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  self.soup = BeautifulSoup(f, "html.parser")


[sodra_site] Processing file 1984/6290: www.sodra.lt_uploads_documents_files_1SD_klasifik_real(3).xls.html
[sodra_site] Processing file 1985/6290: www.sodra.lt_lt_dokumentai_download_id.3688.html
[sodra_site] Processing file 1986/6290: www.sodra.lt_lt_dokumentai_download_id.3689.html
[sodra_site] Processing file 1987/6290: www.sodra.lt_lt_dokumentai_download_id.4641.html
[sodra_site] Processing file 1988/6290: www.sodra.lt_lt_dokumentai_download_id.2626.html
[sodra_site] Processing file 1989/6290: www.sodra.lt_lt_dokumentai_download_id.2627.html
[sodra_site] Processing file 1990/6290: www.sodra.lt_lt_sd-pranesimai.html
[sodra_site] Processing file 1991/6290: www.sodra.lt_uploads_documents_files_1-SD%20formos%20duomen%C5%B3%20strukt%C5%ABros%20apra%C5%A1as%2C%20v11.pdf.html
[sodra_site] Processing file 1992/6290: www.sodra.lt_uploads_documents_files_1-SD%20v_11%20forma.pdf.html
[sodra_site] Processing file 1993/6290: www.sodra.lt_uploads_documents_files_1-SD-v11(4).zip.html
[sodra_site]

In [7]:
config_all_sites_data = {
    "ALLOWED_TAGS": ALLOWED_TAGS,
    "RAW_DATA_DIR": RAW_DATA_DIR,
    "CLEANED_DATA_DIR": CLEANED_DATA_DIR,
    "URL_MAP_FILENAME": URL_MAP_FILENAME,
    "SITE_ELEMENTS_JSON_FILENAME": SITE_ELEMENTS_JSON_FILENAME,
    "SITES_ELEMENTS_JSON_PATH": SITES_ELEMENTS_JSON_PATH,
    "SIMILAR_TEXT_THRESHOLD": SIMILAR_TEXT_THRESHOLD
}

all_sites_data_handler = AllCleanedSitesData(sites_dirs=SITES_DIRS, config=config_all_sites_data)
sites_data = all_sites_data_handler.convert_sites_data_to_json_list()
print(f"✅ Elements from {SITES_DIRS} sites merged and saved to {SITES_ELEMENTS_JSON_PATH}")

Processing: micenter_site
Progressing micenter.lt_en.html (1)
Progress: 0% (1/228)
Progressing micenter.lt_lt.html (2)
Progressing micenter.lt_ru.html (3)
Progressing micenter.lt_en_learn-lithuanian.html (4)
Progressing micenter.lt_en_about-us.html (5)
Progressing micenter.lt_index.html (6)
Progressing micenter.lt_en_main-information.html (7)
Progressing micenter.lt_en_interesting-facts.html (8)
Progressing micenter.lt_en_migration-statistics.html (9)
Progressing micenter.lt_en_travel-to-lithuania.html (10)
Progressing micenter.lt_en_schengen-visa.html (11)
Progressing micenter.lt_en_visa-d.html (12)
Progressing micenter.lt_en_temporary-residence-permit.html (13)
Progressing micenter.lt_en_blue-card.html (14)
Progressing micenter.lt_en_permanent-residence-permit.html (15)
Progressing micenter.lt_en_temporary-protection.html (16)
Progressing micenter.lt_en_legalization-of-documents.html (17)
Progressing micenter.lt_en_job-search.html (18)
Progressing micenter.lt_en_working-conditions.ht