In [None]:
# https://python.langchain.com/docs/how_to/custom_embeddings/

from typing import List, Iterable, Union

from langchain_core.embeddings import Embeddings

import torch
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig

def _last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

class MyEmbeddings(Embeddings):
    def __init__(
        self,
        model: str = "Qwen/Qwen3-Embedding-8B",
        *,
        max_length: int = 8192,
        device = None
    ):
        self.model_name = model
        self.max_length = max_length

        self._tokenizer = tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding_side='left')

        bnb_config = BitsAndBytesConfig(load_in_4bit=True)
        self._model = AutoModel.from_pretrained(self.model_name,quantization_config=bnb_config,).eval()

        if device is None:
            if torch.cuda.is_available():
                device = "cuda"
            elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
                device = "mps"
            else:
                device = "cpu"
        self._device = torch.device(device)
        self._model.to(self._device)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self._embed_texts(texts)

    def embed_query(self, text: str) -> List[float]:
        return self._embed_texts([text])[0]

    @torch.inference_mode()
    def _embed_texts(self, texts: Union[List[str], Iterable[str]]) -> List[List[float]]:
        batch_dict = self._tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        batch_dict.to(self._device)
        outputs = self._model(**batch_dict)
        

        embeddings = _last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

        out = embeddings.tolist()
        return out

In [2]:
import json
from langchain_core.documents import Document

with open("../../../data/parsed/pdfplumber.json", "r", encoding="utf-8") as f:
    docs_dict = json.load(f) 

docs = [Document(**d) for d in docs_dict]

In [None]:
from langchain_experimental.text_splitter import SemanticChunker

text_splitter = SemanticChunker(MyEmbeddings())

temp = text_splitter.split_documents(docs)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.50 GiB. GPU 0 has a total capacity of 47.53 GiB of which 2.66 GiB is free. Process 3642344 has 44.87 GiB memory in use. Of the allocated memory 36.58 GiB is allocated by PyTorch, and 7.98 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)