In [None]:
!pip install -q sentence-transformers langchain-text-splitters

In [None]:
!pip install -q tqdm

In [None]:
import json
import os
from typing import List, Dict, Optional
from uuid import uuid4

import numpy as np
import torch
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

In [None]:
class RecursiveChunker:
    """Split text into chunks by delegating to ``RecursiveCharacterTextSplitter``."""

    def __init__(self,
                 chunk_size: int = 1000,
                 chunk_overlap: int = 200,
                 separators: Optional[List[str]] = None):
        """Build the underlying splitter.

        Args:
            chunk_size: Forwarded to the splitter as ``chunk_size``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            separators: Separator strings handed to the splitter. When ``None``,
                the list ``["\\n\\n", "\\n", ".", " ", ""]`` is used.
        """
        # A fresh list is built on every call, so the default is never shared
        # between instances.
        effective_separators = (
            ["\n\n", "\n", ".", " ", ""] if separators is None else separators
        )
        self.splitter = RecursiveCharacterTextSplitter(
            separators=effective_separators,
            chunk_overlap=chunk_overlap,
            chunk_size=chunk_size,
        )

    def split_text(self, text: str) -> List[str]:
        """Return the chunks for ``text``; a falsy ``text`` yields an empty list."""
        return self.splitter.split_text(text) if text else []

In [None]:
class SentenceTransformerEmbeddingService:
    """Thin wrapper that embeds lists of texts with a ``SentenceTransformer`` model."""

    def __init__(self, model_name: str = "paraphrase-multilingual-MiniLM-L12-v2", device: str = "cpu"):
        """Load the model.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
            device: Device string passed to ``SentenceTransformer`` (e.g. ``"cpu"``).
        """
        self.model_name = model_name
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    def embed(self, texts: List[str], batch_size: int = 64, show_progress: bool = True) -> np.ndarray:
        """Encode ``texts`` into a NumPy array of (non-normalized) embeddings.

        Args:
            texts: Texts to encode.
            batch_size: Batch size forwarded to ``model.encode``.
            show_progress: Whether ``model.encode`` shows its progress bar.

        Returns:
            Whatever ``model.encode`` returns with ``convert_to_numpy=True``.
            For empty ``texts`` the model is not called and an empty
            ``(0, dim)`` float32 array is returned instead.
        """
        if not texts:
            # The model may report no fixed dimension (None); fall back to 0
            # columns instead of letting np.zeros raise a TypeError.
            dim = self.model.get_sentence_embedding_dimension() or 0
            # float32 matches the default precision of encode(), so empty and
            # non-empty results can be concatenated/saved without a dtype change.
            return np.zeros((0, dim), dtype=np.float32)
        return self.model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=show_progress,
            convert_to_numpy=True,
            normalize_embeddings=False
        )

In [None]:
class Preprocessor:
    """Read a text file, chunk it, embed the chunks and write the results to disk."""

    def __init__(self,
                 chunk_size: int = 1000,
                 chunk_overlap: int = 200,
                 model_name: str = "paraphrase-multilingual-MiniLM-L12-v2",
                 device: str = "cpu"):
        """Create the chunker and the embedding service used by ``prepare``.

        Args:
            chunk_size: Passed to ``RecursiveChunker``.
            chunk_overlap: Passed to ``RecursiveChunker``.
            model_name: Passed to ``SentenceTransformerEmbeddingService``.
            device: Passed to ``SentenceTransformerEmbeddingService``.
        """
        self.chunker = RecursiveChunker(chunk_overlap=chunk_overlap, chunk_size=chunk_size)
        self.embedder = SentenceTransformerEmbeddingService(device=device, model_name=model_name)

    def read_text(self, txt_path: str) -> str:
        """Return the whole content of ``txt_path`` decoded as UTF-8."""
        with open(txt_path, "r", encoding="utf-8") as handle:
            contents = handle.read()
        return contents

    def prepare(self,
                txt_path: str,
                output_dir: str,
                batch_size: int = 64,
                show_progress: bool = True) -> Dict:
        """Chunk and embed ``txt_path`` and persist everything under ``output_dir``.

        Three files are written, in this order: ``chunks.json`` (a list of
        ``{"id", "text"}`` records with a random UUID per chunk),
        ``embeddings.npy`` (the array returned by the embedder) and
        ``meta.json`` (file names, chunk count, embedding dimension, absolute
        source path and model name).

        Args:
            txt_path: Path of the UTF-8 text file to process.
            output_dir: Directory for the output files; created if missing.
            batch_size: Forwarded to the embedder.
            show_progress: Forwarded to the embedder.

        Returns:
            Dict with ``chunks_path``, ``embeddings_path``, ``meta_path`` and
            ``n_chunks``.
        """
        os.makedirs(output_dir, exist_ok=True)

        pieces = self.chunker.split_text(self.read_text(txt_path))
        n_pieces = len(pieces)

        # IDs are generated before embedding, one random UUID per chunk.
        piece_ids = [str(uuid4()) for _ in pieces]
        vectors = self.embedder.embed(pieces, batch_size=batch_size, show_progress=show_progress)

        # 1) chunk records
        records = []
        for piece_id, piece_text in zip(piece_ids, pieces):
            records.append({"id": piece_id, "text": piece_text})
        chunks_path = os.path.join(output_dir, "chunks.json")
        with open(chunks_path, "w", encoding="utf-8") as chunks_file:
            json.dump(records, chunks_file, ensure_ascii=False, indent=2)

        # 2) embedding matrix
        embeddings_path = os.path.join(output_dir, "embeddings.npy")
        np.save(embeddings_path, vectors)

        # 3) metadata; the dimension is recorded only for a non-empty array
        if vectors.size:
            embedding_dim = int(vectors.shape[1])
        else:
            embedding_dim = None
        meta = {
            "chunks_path": os.path.basename(chunks_path),
            "embeddings_path": os.path.basename(embeddings_path),
            "n_chunks": n_pieces,
            "embedding_dim": embedding_dim,
            "source_text_path": os.path.abspath(txt_path),
            "model_name": self.embedder.model_name,
        }
        meta_path = os.path.join(output_dir, "meta.json")
        with open(meta_path, "w", encoding="utf-8") as meta_file:
            json.dump(meta, meta_file, ensure_ascii=False, indent=2)

        return {
            "chunks_path": chunks_path,
            "embeddings_path": embeddings_path,
            "meta_path": meta_path,
            "n_chunks": n_pieces,
        }

In [None]:
# Pick the GPU only when one is actually available so the notebook still runs
# top to bottom on CPU-only runtimes instead of depending on a hard-coded "cuda".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
pre = Preprocessor(chunk_size=1000, chunk_overlap=200, model_name="paraphrase-multilingual-MiniLM-L12-v2", device=DEVICE)

In [None]:
# Chunk + embed the source text and write chunks.json / embeddings.npy / meta.json
# into the output directory; the returned dict holds the written paths.
result = pre.prepare(
    txt_path="/content/texto.txt",
    output_dir="/content/data",
    batch_size=64,
    show_progress=True,
)
print(result)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'chunks_path': '/content/data/chunks.json', 'embeddings_path': '/content/data/embeddings.npy', 'meta_path': '/content/data/meta.json', 'n_chunks': 62}
