In [1]:
import gzip
import json
from dataclasses import dataclass
from pathlib import Path
from tqdm import tqdm

from ftlangdetect import detect

from success_prediction.rag_components.embeddings import EmbeddingCreator
from success_prediction.rag_components.cleanup import MarkdownCleaner
from success_prediction.vector_db.utils import DatabaseClient

from success_prediction.config import RAW_DATA_DIR

[32m2025-05-07 17:36:55.458[0m | [1mINFO    [0m | [36msuccess_prediction.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/manuelbolz/Documents/git/for_work/company_success_prediction[0m


In [None]:
@dataclass
class Clients:
    """Bundle of the service objects the pipeline needs, passed around as a single argument."""
    # Markdown cleaner; the pipeline calls clean, remove_nested_brackets and normalize_whitespace on it.
    md_cleaner: MarkdownCleaner
    # Embedding helper; the pipeline calls chunk and embed on it.
    embedding_creator: EmbeddingCreator
    # Vector database client; the notebook calls setup_database and insert_data on it.
    db_client: DatabaseClient


def load_raw_file(file_path: Path):
    """
    Read a gzip-compressed JSON file and return the decoded object.

    Args:
        file_path: Location of the `.json.gz` file.

    Returns:
        Whatever the JSON document decodes to.
    """
    # Decompress the whole payload first, then parse the raw bytes in one go.
    with gzip.open(file_path, mode='rb') as compressed:
        payload = compressed.read()
    return json.loads(payload)


def store_links(file_path: Path, data: dict):
    """
    Write `data` to `file_path` as UTF-8 encoded JSON, indented by four spaces.

    Non-ASCII characters are written verbatim instead of being escaped.

    Args:
        file_path: Destination file; it is created or overwritten.
        data: JSON-serializable mapping to persist.

    Returns:
        None.
    """
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)
    return None


# Maps a link's base domain to the key it is collected under in `social_media`.
_SOCIAL_MEDIA_DOMAINS = {
    'linkedin.com': 'linkedin',
    'instagram.com': 'instagram',
    'facebook.com': 'facebook',
    'tiktok.com': 'tiktok',
    'youtube.com': 'youtube',
    'x.com': 'x',
    'twitter.com': 'x',
}


def structure_links(
    ehraid: int,
    links: list[dict],
    email_addresses: dict,
    social_media: dict
) -> tuple[dict, dict]:
    """
    Sort a page's links into e-mail addresses and social media profiles.

    A link whose text contains '@' is recorded as an e-mail address (the link
    text is stored). Otherwise the link's `base_domain` decides which social
    media bucket its `href` is added to; links on any other domain are ignored.
    Both dicts are updated in place.

    Args:
        ehraid: Key of the company entry inside both dicts.
        links: Link dicts; the keys read are 'text', 'base_domain' and 'href'.
        email_addresses: Mapping with a set at `email_addresses[ehraid]['emails']`.
        social_media: Mapping with one set per platform key ('linkedin',
            'instagram', 'facebook', 'tiktok', 'youtube', 'x') at
            `social_media[ehraid]`.

    Returns:
        The same `email_addresses` and `social_media` objects, as a tuple.

    Raises:
        KeyError: If `ehraid` or the expected bucket is missing from either dict.
    """
    for link in links:
        # A link may have no text at all (missing key or None); treat that as
        # empty so the membership test below cannot raise a TypeError.
        text = link.get('text') or ''
        if '@' in text:
            email_addresses[ehraid]['emails'].add(text)
            continue

        platform = _SOCIAL_MEDIA_DOMAINS.get(link.get('base_domain'))
        href = link.get('href')
        # Skip unknown domains and links without a target instead of failing.
        if platform is not None and href is not None:
            social_media[ehraid][platform].add(href)
    return email_addresses, social_media


def run_pipeline(clients: Clients, idx: int, file_path: Path, min_markdown_chars: int = 300):
    """
    Process one raw website dump end to end.

    For every company (ehraid) and every page that has markdown, the page is
    cleaned, its language is detected, the text is chunked and embedded, and
    one record per chunk is collected. After all companies are processed the
    records are inserted into the database in a single call, and the e-mail
    addresses and social media links found on the pages are written to
    `emails_{idx}.json` and `social_media_{idx}.json` inside RAW_DATA_DIR.

    Args:
        clients: Cleaner, embedding creator and database client to use.
        idx: Index of the raw file; used as the suffix of the two link files.
        file_path: Gzip-compressed JSON file mapping each ehraid to a
            `{url: attributes}` dict. For pages with markdown, `attributes`
            must provide 'date' and 'links' (with 'internal' and 'external'
            lists of link dicts that have an 'href').
        min_markdown_chars: Pages whose cleaned markdown is not longer than
            this many characters are skipped. Defaults to 300, the threshold
            that used to be hard-coded.

    Raises:
        KeyError: If a page with markdown lacks 'date', 'links' or a link 'href'.
    """
    raw_json = load_raw_file(file_path)
    processed_files = []
    email_addresses, social_media = {}, {}

    for ehraid, urls2attributes in tqdm(raw_json.items()):
        # Sets de-duplicate links that show up on several pages of one company.
        email_addresses[ehraid] = {'emails': set()}
        social_media[ehraid] = {k: set() for k in ['linkedin', 'instagram', 'facebook', 'tiktok', 'youtube', 'x']}

        for url, attributes in urls2attributes.items():
            markdown = attributes.get('markdown')
            if not markdown:
                # Pages without markdown are skipped entirely, including their links.
                continue

            date = attributes['date']
            internal_links = [link['href'] for link in attributes['links']['internal']]
            external_links = [link['href'] for link in attributes['links']['external']]

            # Only external links are searched for e-mail addresses and social profiles.
            email_addresses, social_media = structure_links(
                ehraid, attributes['links']['external'], email_addresses, social_media)

            markdown_clean = clients.md_cleaner.clean(markdown, internal_links, external_links)
            if len(markdown_clean) <= min_markdown_chars:
                continue

            # Detect language using the text without bracket content, since it includes
            # English tokens such as INTERNAL_LINKS that might confuse the model
            language = detect(text=clients.md_cleaner.remove_nested_brackets(markdown_clean).replace('\n', ' '))

            # Split the text into smaller chunks to fit into the model context + normalize whitespace per chunk
            markdown_chunks = clients.embedding_creator.chunk(markdown_clean)
            markdown_chunks_clean = [
                clients.md_cleaner.normalize_whitespace(doc.page_content)
                for doc in markdown_chunks
            ]

            # NOTE(review): stored chunks are embedded with the 'query:' prefix —
            # confirm this is the intended prefix for indexed documents.
            query_embeddings = clients.embedding_creator.embed(
                markdown_chunks_clean, prefix='query:')

            processed_files.extend([
                {
                    'ehraid': int(ehraid),
                    'url': str(url),
                    'date': date,
                    # `detect` returns a dict; only its 'lang' entry is stored because
                    # the database rejects a dict for the `language` field.
                    'language': language.get('lang'),
                    'text': md,
                    'embedding': q_emb
                }
                for md, q_emb in zip(markdown_chunks_clean, query_embeddings)
            ])

        # Sets are not JSON serializable, so convert them to lists before they are stored.
        email_addresses[ehraid] = {k: list(v) for k, v in email_addresses[ehraid].items()}
        social_media[ehraid] = {k: list(v) for k, v in social_media[ehraid].items()}

    # All records of this file are inserted with one call at the very end.
    clients.db_client.insert_data(data=processed_files)

    store_links(RAW_DATA_DIR / f'emails_{idx}.json', email_addresses)
    store_links(RAW_DATA_DIR / f'social_media_{idx}.json', social_media)


def main(clients: Clients, raw_files: list[Path]):
    """
    Run the pipeline once per raw file, in the order given.

    Each file's zero-based position in `raw_files` is passed on as its index.
    """
    position = 0
    for raw_file in raw_files:
        run_pipeline(clients, position, raw_file)
        position += 1

In [None]:
# Entry point: build the clients, set up the database and run the pipeline on the selected files.
if __name__ == '__main__':

    # Keyword arguments forwarded to DatabaseClient.
    # NOTE(review): 768 presumably has to match the embedding size of the model below — confirm.
    init_args = {'dim': 768}

    clients = Clients(
        md_cleaner=MarkdownCleaner(),
        embedding_creator=EmbeddingCreator(model_name='intfloat/multilingual-e5-base'),
        db_client=DatabaseClient(**init_args)
    )
    # NOTE(review): replace=True looks like it rebuilds the database and discards
    # existing data — confirm before re-running this cell.
    clients.db_client.setup_database(replace=True)

    # Only the first dump is processed for now; the disabled line below was meant to pick up every dump.
    # NOTE(review): pathlib.Path has no endswith(); use file.name.endswith('.json.gz') if it is re-enabled.
    # raw_files = [file for file in Path(RAW_DATA_DIR).iterdir() if file.endswith('.json.gz')]
    raw_files = [RAW_DATA_DIR / 'company_websites' / 'current' / '0_websites.json.gz']
    main(clients, raw_files)

[EmbeddingCreator] Using model on `mps`.


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 498/498 [13:27<00:00,  1.62s/it]
2025-05-07 17:50:29,732 [ERROR][handler]: RPC error: [insert_rows], <DataNotMatchException: (code=1, message=The Input data type is inconsistent with defined schema, {language} field should be a varchar, but got a {<class 'dict'>} instead.)>, <Time:{'RPC start': '2025-05-07 17:50:29.731459', 'RPC error': '2025-05-07 17:50:29.732173'}> (decorators.py:140)


DataNotMatchException: <DataNotMatchException: (code=1, message=The Input data type is inconsistent with defined schema, {language} field should be a varchar, but got a {<class 'dict'>} instead.)>

In [10]:
# Open a separate database client and query the entries stored for a single company.
# NOTE(review): unlike the pipeline cell, no `dim` is passed here — confirm the default is compatible.
db_client = DatabaseClient()
company_data = db_client.query_by_ehraid(filter="ehraid == 1252082")

In [11]:
# Display the result of the query above.
company_data

data: []