In [1]:
# Install core dependencies
!pip install sentence-transformers

# Install Pathway nightly with xpacks from official index
!pip install pathway


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [2]:
from google.colab import files

# Open the Colab upload widget and keep whatever files.upload() returns.
uploaded = files.upload()
# The CSV-reading cell below opens "iiti_data_merged_final.csv" by relative
# path, so the file has to be uploaded under exactly that name.


Saving iiti_data_merged_final.csv to iiti_data_merged_final.csv


In [3]:
import pathway as pw
from pathway.xpacks.llm.splitters import RecursiveSplitter

In [4]:
# ✅ Step 1: Define schema for your CSV
class IITIWebSchema(pw.Schema):
    """Column names and types used when reading the uploaded CSV with pw.io.csv.read."""

    # NOTE(review): Pathway tables also expose a built-in row identifier as
    # `.id`; confirm that `pw.this.id` in the select step resolves to this
    # CSV column and not to the built-in one.
    id: int
    url: str
    title: str
    body_text: str  # filtered for non-empty values and then split into chunks
    source_domain: str  # not referenced by the later cells visible here
    metadata: str  # passed through unchanged to the output tables

In [5]:
# ✅ Step 2: Read the uploaded CSV into a Pathway table, typed by IITIWebSchema
# NOTE(review): autocommit_duration_ms presumably only matters for streaming
# reads; it is kept here unchanged.
init_table = pw.io.csv.read(
    "iiti_data_merged_final.csv",
    mode="static",
    schema=IITIWebSchema,
    autocommit_duration_ms=1000,
)

In [6]:
# Keep only the rows whose body_text is not the empty string; the other
# columns (including metadata) are not checked.
final_table = init_table.filter(init_table.body_text != "")

In [7]:
# ✅ Step 3: Configure the RecursiveSplitter that will chunk body_text
# NOTE(review): "\n#" is a prefix of "\n##", so the second separator may never
# be the one that matches first — confirm the intended order.
splitter = RecursiveSplitter(
    model_name="gpt-4o-mini",
    separators=[
        "\n#",
        "\n##",
        "\n\n",
        "\n",
        ".",
    ],
    chunk_overlap=150,
    chunk_size=500,
)

In [8]:
# ✅ Step 4: Apply the splitter to body_text and carry the descriptive columns along.
# The identifier is emitted as `row_id` rather than `id`.
# NOTE(review): Pathway tables have a built-in row id exposed as `.id`;
# confirm that `pw.this.id` yields the integer `id` column from the CSV and
# not the built-in row identifier.
chunked = final_table.select(
    row_id=pw.this.id,
    url=pw.this.url,
    title=pw.this.title,
    metadata=pw.this.metadata,
    # One value per source row holding all chunks produced for that row.
    chunks=splitter(pw.this.body_text)
)

In [9]:
# ✅ Step 5: Expand the `chunks` column so every chunk gets its own row
flattened = chunked.flatten(chunked.chunks)

In [10]:
# ✅ Step 6: Register a CSV sink so the flattened chunks can be inspected
pw.io.csv.write(flattened, "output_chunks.csv")

    https://beartype.readthedocs.io/en/latest/api_roar/#pep-585-deprecations
  warn(


In [None]:
# ✅ Step 7: Run the pipeline
# The cells above only declare the computation; this call executes it, which
# is when output_chunks.csv is written.
pw.run()

Output()



In [16]:
from sentence_transformers import SentenceTransformer
import numpy as np
from pathway.xpacks.llm.splitters import RecursiveSplitter

In [12]:
# Instantiate the sentence-embedding model used by the embedding UDF
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
# UDF that embeds one chunk per call
@pw.udf
def batch_embedding(texts) -> list[float]:
    """Return the embedding vector of a single chunk.

    Despite its name, this UDF is invoked by Pathway once per row, so
    ``texts`` is one chunk, not a batch of them.

    Args:
        texts: Either the chunk text itself, or a chunk as produced by a
            Pathway splitter, i.e. a ``(text, metadata)`` pair whose first
            element is the text.

    Returns:
        The embedding of the chunk text as a flat list of floats.
    """
    # Embed only the text: handing the whole (text, metadata) pair to
    # model.encode would treat the metadata as a second sentence and return
    # two vectors instead of one.
    text = texts if isinstance(texts, str) else texts[0]
    return model.encode(text).tolist()

In [18]:
# Add an embedding to each chunk row.
# After flattening, every `chunks` value is a (text, metadata) pair produced by
# the splitter, so the text is element 0.
embedded = flattened.select(
    row_id=pw.this.row_id,
    chunk=pw.this.chunks[0],  # store the chunk text, not the whole pair
    # The UDF is applied row by row (no batching) and receives the full chunk value.
    embedding=batch_embedding(pw.this.chunks),
    url=pw.this.url,
    title=pw.this.title,
    metadata=pw.this.metadata
)

In [19]:
# Register a CSV sink for the chunk rows together with their embeddings
pw.io.csv.write(embedded, "final_embedded_chunks.csv")


In [20]:
# Execute the pipeline again so the embedding sink registered above is written.
# NOTE(review): the output_chunks.csv sink registered earlier in this session
# is presumably executed again by this call as well — confirm.
pw.run()

Output()