In [1]:
import json
import re
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel

from config import RAW_DATA_DIR, PROCESSED_DATA_DIR

  from .autonotebook import tqdm as notebook_tqdm
[32m2025-02-07 16:18:48.535[0m | [1mINFO    [0m | [36mconfig[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/mabolz/company_success_prediction[0m


In [2]:
uid = 'CHE152876230'
with open(RAW_DATA_DIR / f'{uid}.json') as f:
    data = {url: content['markdown'] for url, content in json.load(f)[uid].items()}

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
def remove_links_from_markdown(markdown: str) -> str:
    """Removes all markdown links from the text."""
    markdown = re.sub(r'(\[[^\]]+\])\((https?:\/\/[^\)]+)\)', ' ', markdown)  # Remove markdown-style links
    markdown = re.sub(r'\((https?:\/\/[^\)]+)\)', '', markdown)  # Remove raw links in parentheses
    return markdown

In [5]:
headers_to_split_on = [
    ("#", "Header 1")
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=50)

chunked_data = [[], []]
for url, markdown in data.items():
    markdown_clean = remove_links_from_markdown(markdown)
    md_header_splits = markdown_splitter.split_text(markdown_clean)
    splits = recursive_splitter.split_documents(md_header_splits)
    for chunk in splits:
        if len(chunk.page_content) >= 20:  # minimum of 20 words
            chunked_data[0].append(url)
            chunked_data[1].append(chunk)

assert len(chunked_data[0]) == len(chunked_data[1])

In [9]:
from sentence_transformers import SentenceTransformer
from transformers import BitsAndBytesConfig
torch.cuda.empty_cache()
torch.cuda.synchronize()

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

model = SentenceTransformer(
    "Linq-AI-Research/Linq-Embed-Mistral",
    model_kwargs={
        'quantization_config': quantization_config,
        'torch_dtype': torch.float16,  # Half precision to reduce VRAM
        'device_map': 'auto',  # Automatic device allocation
        'offload_folder': 'offload',  # Offload to RAM if needed
    }
)

# model.max_seq_length = 256

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a question, retrieve Website passages that answer the question'
prompt = f"Instruct: {task}\nQuery: "
queries = [
    "What are the names and descriptions of the companies' main products?",
    "What are the names of the people wording at the company?"
]
# No need to add instruction for retrieval documents
passages = [doc.page_content.replace('\n', ' ') for doc in chunked_data[1]]
urls = chunked_data[0]

# Load model and tokenizer
with torch.no_grad():
    query_embeddings = model.encode(
        queries,
        prompt=prompt,
        batch_size=4,
        convert_to_tensor=True,
        normalize_embeddings=True
    )

    passage_embeddings = model.encode(
        passages,
        batch_size=4,
        convert_to_tensor=True,
        normalize_embeddings=True
    )

# Compute the (cosine) similarity scores
scores = model.similarity(query_embeddings, passage_embeddings) * 100

torch.cuda.empty_cache()
torch.cuda.synchronize()

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.15s/it]


In [14]:
scores

tensor([[30.0312, 30.0312, 24.2031, 26.3438, 26.9531, 26.3438, 27.4844, 28.8281,
         25.5625, 28.0938, 25.9688, 28.7188, 29.0938, 28.1562, 28.9062, 26.8750,
         23.6562, 28.7344, 32.1875, 27.4219, 35.0000, 31.5000, 26.9219, 24.9375,
         27.3906, 26.5156, 25.5625, 26.8750, 32.1875, 33.4375, 33.3125, 27.2188,
         31.2500, 33.0938, 29.6094, 28.0938, 30.8594, 30.5156, 31.1719, 29.2188,
         30.8594, 26.9062, 31.0781, 31.2812, 30.8594, 28.8750, 31.1719, 30.3906,
         31.1719, 30.9062, 31.0781, 26.9531, 28.5938, 28.8281, 28.0938, 30.2969,
         37.9062, 36.0000, 29.6094, 34.5625, 32.9375, 20.3750, 25.4844, 24.8594,
         27.4219, 26.0000, 27.4219, 27.8594, 29.0312, 30.0312, 31.2500],
        [29.0469, 30.2500, 26.3750, 28.7188, 28.0000, 28.7188, 28.9375, 27.9062,
         27.9531, 31.5625, 27.0312, 28.3750, 29.4375, 31.1250, 27.7344, 29.3438,
         25.9219, 28.3750, 29.9062, 29.5000, 41.1562, 39.9375, 29.3906, 32.6875,
         29.4688, 29.7656, 27.8281, 

In [15]:
np_scores = np.array(scores[0].cpu())
sorted_scores = np.argsort(np_scores)[::-1]

sorted_docs = {}
for i, idx in enumerate(sorted_scores):
    sorted_docs[i] = (passages[idx], urls[idx])

  np_scores = np.array(scores[0].cpu())


In [16]:
sorted_docs

{0: ('# Digital solutions by digital natives ![] ] Chiron Services was founded with the goal of supporting businesses in their digital transformation. Our focus lies on process automation, aimed at minimizing manual tasks through the use of software.With an interdisciplinary team of data scientists, software developers, full-stack web developers, and business analysts, Chiron Services has built a skill set that is unique in the market. By combining the latest academic knowledge and proven industry methods, we support clients from various sectors in their digitalization. Our internationally positioned team, consisting of highly qualified specialists from the best European universities, provides the necessary expertise.Our mission is to generate added value for our clients. To achieve this, we first evaluate the return on investment generated by the implementation of digital solutions. ## Team ![] Fredi holds a master in marketing and strategic management and is responsible for the compa

In [None]:
# Each query must come with a one-sentence instruction that describes the task
task = 'Given a question, retrieve Website passages that answer the question'
prompt = f"Instruct: {task}\nQuery: "
queries = [
    "What are the names and descriptions of the companies' main products?"
]
# No need to add instruction for retrieval documents
passages = [doc.page_content.replace('\n', ' ') for doc in chunked_data[1]]
urls = chunked_data[0]

# Load model and tokenizer
query_embeddings = model.encode(queries, prompt=prompt)
passage_embeddings = model.encode(passages)

# Compute the (cosine) similarity scores
scores = model.similarity(query_embeddings, passage_embeddings) * 100

OutOfMemoryError: CUDA out of memory. Tried to allocate 684.00 MiB. GPU 0 has a total capacity of 14.57 GiB of which 522.75 MiB is free. Including non-PyTorch memory, this process has 14.05 GiB memory in use. Of the allocated memory 12.86 GiB is allocated by PyTorch, and 1.06 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [14]:
model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a search query, retrieve relevant passages that answer the query'
queries = [
    get_detailed_instruct(task, "What are the names and descriptions of the companies' main products?"),
]

# No need to add instruction for retrieval documents
documents = [doc.page_content.replace('\n', ' ') for doc in chunked_data[1]]
urls = chunked_data[0]

input_texts = queries + documents

In [15]:
embeddings = model.encode(input_texts, convert_to_tensor=True, normalize_embeddings=True)
scores = (embeddings[:len(queries)] @ embeddings[len(queries):].T) * 100

np_scores = np.array(scores[0].cpu())
sorted_scores = np.argsort(np_scores)[::-1]

sorted_docs = {}
for i, idx in enumerate(sorted_scores):
    sorted_docs[i] = (documents[idx], urls[idx])

  np_scores = np.array(scores[0].cpu())


In [16]:
sorted_docs

{0: ('# Build Measure Learn ! From idea to a ready-to-use product in the shortest possible time. With our innovative approach to agile working, we break away from traditional customer service and product development methods. By incorporating customer feedback, we reduce development costs and shorten the time to delivery. ] MODERN PRODUCT DEVELOPMENT ## Agile development using the Build-Measure-Learn approach We work in sprints - focused, short, and intensive work intervals. Rapid prototyping and continuous feedback enable us to avoid misguided developments. Whenever possible, we create synergies by leveraging existing IT infrastructures. ![] ## LinkMed LinkMed is your digital AI assistant, taking care of master data maintenance for your doctor\'s data. Update and expand your data at the push of a button! The product is aimed at hospitals, clinics, and practices.   ## Synk Synk is a process automation tool for foundations. Synchronize your expenses, commitments, and projects with just o

In [135]:
np.argsort(np_scores)[::-1]

array([33, 56, 28, 57, 34, 52,  1, 29, 32, 39, 20, 47, 30, 35, 60,  0, 15,
       27,  3,  5, 22, 45, 41, 31, 21, 25, 37, 70,  9, 13, 58, 26, 49,  6,
       43, 16, 18, 53, 12, 23, 63, 55,  2, 10, 62, 65, 40, 36, 44, 67, 50,
       42, 51, 54, 48, 38, 46, 64, 66, 24, 19, 59,  4, 14, 61,  7,  8, 17,
       11, 69, 68])

In [136]:
np_scores

array([83.09274 , 84.54229 , 78.511154, 82.315575, 73.16283 , 82.315575,
       80.565285, 72.25804 , 71.66676 , 81.47917 , 78.418495, 70.90068 ,
       79.306206, 81.28316 , 72.76632 , 82.32267 , 80.15391 , 70.90068 ,
       79.80944 , 73.55938 , 83.6295  , 82.19152 , 82.2595  , 79.304085,
       73.55938 , 82.1071  , 81.04856 , 82.32267 , 86.07127 , 84.3755  ,
       83.46146 , 82.207085, 83.94556 , 86.56631 , 84.814835, 83.317276,
       76.78014 , 81.999084, 75.422745, 83.709404, 76.78014 , 82.23286 ,
       75.9801  , 80.33271 , 76.78014 , 82.23775 , 75.422745, 83.56724 ,
       75.422745, 80.9039  , 75.9801  , 75.94322 , 84.735   , 79.46084 ,
       75.44986 , 78.542885, 86.496086, 86.01206 , 81.15595 , 73.45154 ,
       83.1053  , 72.54138 , 77.684364, 78.572464, 73.55938 , 76.92735 ,
       73.55938 , 76.02347 , 70.76567 , 70.77973 , 81.83685 ],
      dtype=float32)

In [105]:
len(documents)

71

In [117]:
documents

['# Unlocking the Value of Data  !Digital solutions by digital natives## Beratung und Entwicklung für Daten, Software und KI LösungenAls Data Science und Software Unternehmen bieten wir umfassende Beratungs- und Entwicklungsdienstleistungen an, die darauf abzielen, Ihnen einen echten Mehrwert zu bieten. Wir setzen auf innovative Datenlösungen und Softwareentwicklung, um Ihre Anforderungen zu erfüllen. Dafür evaluieren wir jeweils zuerst den Return on Investment, der durch den Einsatz von digitalen Lösungen entsteht.  ## Unsere Kernkompetenzen###  ]###  ]###  ]###  ]###  ]###  ]###  ]###  ]### DatenerhebungAutomatisches Sammeln von Daten]### DatenextraktionUmwandlung von unstrukturierten Daten in strukturierte]### Daten-managementSpeicherung und Zugriff auf Daten]### Daten-aufbereitungStandardisierung und Organisation]### KI BeratungEinsatz künstlicher Intelligenz für Ihr Unternehmen]### DatenanalyseErkennung von Muster, Beziehungen und Prognosen]### Prozess-automatisierungReduktion man

In [109]:
scores_array = np.array(scores)

# Get indices of top k highest scoring documents
top_indices = np.argsort(scores_array)[:][::-1]

In [119]:
scores_array

array([83.09274292, 84.54228973, 78.51115417, 82.31557465, 73.16282654,
       82.31557465, 80.56528473, 72.25804138, 71.66676331, 81.47917175,
       78.41849518, 70.90068054, 79.30620575, 81.28315735, 72.76631927,
       82.32266998, 80.15390778, 70.90068054, 79.80944061, 73.55937958,
       83.62950134, 82.19152069, 82.2594986 , 79.30408478, 73.55937958,
       82.10710144, 81.0485611 , 82.32266998, 86.0712738 , 84.37550354,
       83.4614563 , 82.20708466, 83.94555664, 86.56630707, 84.81483459,
       83.317276  , 76.78014374, 81.99908447, 75.42274475, 83.70940399,
       76.78014374, 82.23285675, 75.98010254, 80.33271027, 76.78014374,
       82.23774719, 75.42274475, 83.56723785, 75.42274475, 80.90390015,
       75.98010254, 75.94322205, 84.73500061, 79.46083832, 75.44985962,
       78.54288483, 86.49608612, 86.01206207, 81.15595245, 73.45153809,
       83.1053009 , 72.54138184, 77.68436432, 78.57246399, 73.55937958,
       76.92735291, 73.55937958, 76.02346802, 70.76567078, 70.77

In [118]:
fortop_indices

array([33, 56, 28, 57, 34, 52,  1, 29, 32, 39, 20, 47, 30, 35, 60,  0, 15,
       27,  3,  5, 22, 45, 41, 31, 21, 25, 37, 70,  9, 13, 58, 26, 49,  6,
       43, 16, 18, 53, 12, 23, 63, 55,  2, 10, 62, 65, 40, 36, 44, 67, 50,
       42, 51, 54, 48, 38, 46, 64, 66, 24, 19, 59,  4, 14, 61,  7,  8, 17,
       11, 69, 68])

In [111]:
len(top_indices)

71

In [None]:
class InstructEmbeddings:
    def __init__(self):
        pass

def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'


