## Step 0. Preparing the environment

In [11]:
import sys

PROJECT_ROOT = "/Users/theother_archee/CursorProjects/SmartClause/" # Use your path to SmartClause repository
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import pandas as pd
import contextlib
import numpy as np
import itertools
from tqdm.auto import tqdm

from analyzer.app.api.routes import retrieval_metrics
from analyzer.app.services.retrieval_service import retrieval_service


## Step 1. Loading the source dataset

In [12]:
DATA_PATH = PROJECT_ROOT + "parser/dataset/dataset_codes_rf.csv"
df_raw = pd.read_csv(DATA_PATH)

print(f"Строк в исходном датасете: {len(df_raw):,}")
df_raw.head(2)

Строк в исходном датасете: 8,267


Unnamed: 0,rule_id,file,rule_number,rule_title,rule_text,section_title,chapter_title,start_char,end_char,text_length
0,0,Арбитражный процессуальный кодекс Российской Ф...,1,Осуществление правосудия арбитражными судами,Статья 1. Осуществление правосудия арбитражным...,ОБЩИЕ ПОЛОЖЕНИЯ,ОСНОВНЫЕ ПОЛОЖЕНИЯ,2943,3531,588
1,1,Арбитражный процессуальный кодекс Российской Ф...,2,Задачи судопроизводства в арбитражных судах,Статья 2. Задачи судопроизводства в арбитражны...,ОБЩИЕ ПОЛОЖЕНИЯ,ОСНОВНЫЕ ПОЛОЖЕНИЯ,3531,4806,1275


## Step 2. Chunking Helper

In [13]:
def chunk_text(text: str, chunk_size: int, overlap: int):
    chunks = []
    if len(text) <= chunk_size:
        return [(text, 0, len(text))]
    step = chunk_size - overlap
    for start in range(0, len(text), step):
        end = min(start + chunk_size, len(text))
        chunks.append((text[start:end], start, end))
        if end == len(text):
            break
    return chunks


## Step 3. Embedding function

In [33]:
# We are trying to take the production model
try:
    from analyzer.app.services.embedding_service import embedding_service
    encode = embedding_service.encode_to_list
    print("All is good")
except Exception as e:
    print("⚠️ embedding_service not found, using SentenceTransformer demo model")
    from sentence_transformers import SentenceTransformer
    _model = SentenceTransformer("all-MiniLM-L6-v2")
    encode = lambda txt: _model.encode([txt])[0].tolist()

All is good


## Step 4. Parameter grid and subsampling

In [34]:
PARAM_GRID = list(itertools.product(
    [600, 700, 800, 900, 1000, 1500, 2000, 3000],   # chunk_size
    [0, 200, 300, 400, 500]          # overlap
))
print("Комбинаций:", len(PARAM_GRID))

SAMPLE_SIZE = 150          # if we want to do it faster
df_sample = df_raw.sample(SAMPLE_SIZE, random_state=42)
# df_sample = df_raw


Комбинаций: 40


## Step 5. Context manager for “substituting” metrics

In [35]:
@contextlib.contextmanager
def patch_get_embeddings(embeddings, labels):
    """
    Temporarily overrides retrieval_service.get_all_embeddings_and_labels
    so that retrieval_metrics returns metrics for our data.
    """

    orig_fn = retrieval_service.get_all_embeddings_and_labels
    retrieval_service.get_all_embeddings_and_labels = lambda db: (embeddings, labels)
    try:
        yield
    finally:
        retrieval_service.get_all_embeddings_and_labels = orig_fn


async def calc_metrics_via_endpoint(embeddings, labels):
    """
    Asynchronously calls your /metrics/retrieval endpoint, 
    but with patch data instead of the DB.
    """

    with patch_get_embeddings(embeddings, labels):
        resp = await retrieval_metrics(db=None) # db is not needed, we redefined it
    return resp.dict()


## Step 6. Enumeration and calculation of metrics

In [36]:
import nest_asyncio; 
nest_asyncio.apply()

In [37]:
async def run_grid():
    results = []
    for chunk_sz, ov in tqdm(PARAM_GRID, desc="Grid"):
        embs, labs = [], []
        for _, row in df_sample.iterrows():
            text = str(row["rule_text"])
            label = row.get("file") or str(row.get("rule_id"))
            for chunk_txt, _, _ in chunk_text(text, chunk_sz, ov):
                embs.append(encode(chunk_txt))
                labs.append(label)
        if len(embs) < 2:
            continue
        metrics = await calc_metrics_via_endpoint(embs, labs)
        results.append({
            "chunk_size":   chunk_sz,
            "overlap":      ov,
            "n_chunks":     len(embs),
            **metrics
        })
    return results

results = await run_grid()

Grid:   0%|          | 0/40 [00:00<?, ?it/s]/var/folders/5r/r5srq9nx6v3bld2_whlhpj3m0000gn/T/ipykernel_69340/47765139.py:24: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return resp.dict()
Grid:   2%|▎         | 1/40 [00:31<20:19, 31.26s/it]/var/folders/5r/r5srq9nx6v3bld2_whlhpj3m0000gn/T/ipykernel_69340/47765139.py:24: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return resp.dict()
Grid:   5%|▌         | 2/40 [01:10<22:55, 36.19s/it]/var/folders/5r/r5srq9nx6v3bld2_whlhpj3m0000gn/T/ipykernel_69340/47765139.py:24: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0

In [39]:
df_res = (pd.DataFrame(results)
            .sort_values("silhouette_score", ascending=False)
            .reset_index(drop=True))

display(df_res)

Unnamed: 0,chunk_size,overlap,n_chunks,total_variance,silhouette_score,eid,dr
0,700,500,1439,0.480269,0.122352,920.782588,0.100798
1,800,500,989,0.48121,0.121506,920.085548,0.101479
2,600,500,2788,0.480529,0.121148,920.62481,0.100952
3,900,500,770,0.482864,0.119152,919.13107,0.102411
4,1000,500,627,0.485525,0.117014,919.130334,0.102412
5,700,400,1029,0.484067,0.114984,920.510511,0.101064
6,800,400,791,0.484592,0.111785,919.998107,0.101564
7,900,400,660,0.486596,0.111704,919.660613,0.101894
8,600,400,1499,0.483756,0.1097,920.837041,0.100745
9,1000,400,557,0.486195,0.109258,919.093916,0.102447


In [None]:
best = df_res.iloc[0]
print("Best configuration:", best.to_dict())

We did the tuning parameter on 150 random articles with an emphasis on the silhouette_score metric. The best results were shown by **overlap=500 and chunk_size from 600 to 900.** 

We chose **chunk_size=800 and overlap=500** as the final configuration because the dataset is large, and this setup offers a good trade-off between retrieval quality and token efficiency.

## Step 7. Chunking the entire dataset with chosen configuration

In [16]:
CHUNK_SZ, OV = 800, 500

chunk_rows = []
chunk_id_counter = 0  # глобальный уникальный ID для чанков

for _, row in df_raw.iterrows():
    rule_id = row["rule_id"]
    full_text = str(row["rule_text"])

    chunks = chunk_text(full_text, CHUNK_SZ, OV)

    for i, (chunk_txt, st, en) in enumerate(chunks):
        chunk_rows.append({
            "chunk_id": chunk_id_counter,
            "rule_id": rule_id,
            "chunk_number": i,
            "chunk_text": chunk_txt,
            "chunk_char_start": st,
            "chunk_char_end": en
            # "embedding": encode(chunk_txt)  # можно раскомментировать, если хочешь сразу
        })
        chunk_id_counter += 1

df_chunks = pd.DataFrame(chunk_rows)
print(f"Total chunks generated: {len(df_chunks):,}")


Total chunks generated: 48,517


In [17]:
df_chunks

Unnamed: 0,chunk_id,rule_id,chunk_number,chunk_text,chunk_char_start,chunk_char_end
0,0,0,0,Статья 1. Осуществление правосудия арбитражным...,0,588
1,1,1,0,Статья 2. Задачи судопроизводства в арбитражны...,0,800
2,2,1,1,"рации, субъектов Российской Федерации, муницип...",300,1100
3,3,1,2,ых лиц в указанной сфере;\n2) обеспечение дост...,600,1275
4,4,2,0,Статья 3. Законодательство о судопроизводстве ...,0,800
...,...,...,...,...,...,...
48512,48512,8265,1,"еждународной защитой, на служебные или жилые п...",300,986
48513,48513,8266,0,Статья 361. Акт международного терроризма\n \n...,0,800
48514,48514,8266,1,либо направленных против интересов Российской...,300,1100
48515,48515,8266,2,"усмотренных частью первой настоящей статьи, ск...",600,1400


## Step 8. Save csv with chunking.

In [19]:
OUT = PROJECT_ROOT + "experements/chunking/dataset_codes_rf_chunking_800chunksize_500overlap.csv"
df_chunks.to_csv(OUT, index=False)
print("File saved:", OUT)

File saved: /Users/theother_archee/CursorProjects/SmartClause/experements/chunking/dataset_codes_rf_chunking_800chunksize_500overlap.csv
