In [10]:
import os 
from Utils.chunk_vis import visualize_chunks
import sys, os

# Move to project root
root = os.path.abspath("..")
print(f"Root path: {root}")
sys.path.append(root)

ModuleNotFoundError: No module named 'Utils'

In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import re


class SafeMarkdownSplitter(RecursiveCharacterTextSplitter):
    """
    Prevents splitting inside <table>...</table> and <img ...>.
    Repairs broken chunks by merging small, incomplete, or context-dependent pieces
    forward into the next chunk.
    """

    TABLE_PATTERN = r"(<table.*?</table>)"
    IMG_PATTERN = r"(<img[^>]*>)"

    def split_text(self, text):
        # -------- STEP 1: Protect entire <table>...</table> blocks --------
        blocks = re.split(self.TABLE_PATTERN, text, flags=re.DOTALL)
        raw_chunks = []

        for block in blocks:
            if block.strip().startswith("<table"):
                raw_chunks.append(block)  # keep table whole
                continue

            # -------- STEP 2: Protect <img ...> groups --------
            pieces = re.split(self.IMG_PATTERN, block)

            for p in pieces:
                if p.strip().startswith("<img"):
                    raw_chunks.append(p)  # full <img> tag stays atomic
                else:
                    # Apply normal splitting to text-only
                    if p.strip():
                        raw_chunks.extend(super().split_text(p))

        # -------- STEP 3: Merge fragments to ensure high-quality chunks --------
        final_chunks = self.merge_forward(raw_chunks)
        return final_chunks

    # ---------------------------------------------------------------------
    # LOGIC TO MERGE SMALL / BROKEN / INCOMPLETE CHUNKS FORWARD
    # ---------------------------------------------------------------------
    def merge_forward(self, chunks):
        merged = []
        i = 0

        while i < len(chunks):
            ch = chunks[i].rstrip()

            # ---------- RULE 1: Very small chunks ----------
            is_small = len(ch) < 60 or ch.count("\n") <= 0

            # ---------- RULE 2: Broken <img> tag ----------
            broken_img = ("<img" in ch and ">" not in ch)

            # ---------- RULE 3: Next chunk is a table ----------
            next_is_table = (
                i + 1 < len(chunks) and "<table" in chunks[i + 1]
            )

            # If this chunk requires merging forward
            if is_small or broken_img or next_is_table:
                if i + 1 < len(chunks):
                    # merge with next chunk
                    merged_chunk = ch + "\n" + chunks[i + 1]
                    merged.append(merged_chunk)
                    i += 2
                    continue

            # Otherwise keep as-is
            merged.append(chunks[i])
            i += 1

        return merged


# ---------------------------------------------------------------------
# INITIALIZE YOUR SPLITTER
# ---------------------------------------------------------------------

splitter = SafeMarkdownSplitter(
    chunk_size=1200,
    chunk_overlap=150,
    separators=[
        r"(?=\n#{1,6} )",   # split before headings but keep heading attached
        r"\n\n+",
        r"\n",
        " ",
        ""
    ],
    keep_separator=True
)


In [2]:
import os
path = os.path.join(os.getcwd() ,"../Pipeline/Cleaned")
list = os.listdir(path)
print(list)

['History Class 10_cleaned.md', 'History Class 11 (Part 1)_cleaned.md', 'History Class 12 (Part 1.1)_cleaned.md', 'History Class 12 (Part 2.1)_cleaned.md', 'History Class 12 (Part 3.1)_cleaned.md', 'History Class 6 (Part 1)_cleaned.md', 'History Class 7_cleaned.md', 'History Class 8_cleaned.md', 'History Class 9_cleaned.md']


In [3]:
for idx , file in enumerate(list):
    print(idx, file)

selected_book = None
input_idx = int(input("Enter file index to process: "))
selected_book = list[input_idx]

print("\n Selected file:", selected_book)

0 History Class 10_cleaned.md
1 History Class 11 (Part 1)_cleaned.md
2 History Class 12 (Part 1.1)_cleaned.md
3 History Class 12 (Part 2.1)_cleaned.md
4 History Class 12 (Part 3.1)_cleaned.md
5 History Class 6 (Part 1)_cleaned.md
6 History Class 7_cleaned.md
7 History Class 8_cleaned.md
8 History Class 9_cleaned.md

 Selected file: History Class 10_cleaned.md


In [5]:
import os
# clean = os.path.exists("Cleaned/History Class 11 (Part 1)_cleaned")
book_path = os.path.join(path , selected_book)
with open(book_path, "r", encoding="utf-8") as f:
    text = f.read()

chunks = splitter.split_text(text)


In [None]:


visualize_chunks(chunks[:15])

ModuleNotFoundError: No module named 'Utils'

In [38]:
maxi = int(-1e9)
idx1 = -1
idx2 = -1
mini = int(1e9)
tables = []

for chunk in chunks:
    l = len(chunk)
    if "<table" in chunk:
       tables.append(chunk)
       continue
    if l > maxi:
        maxi = l
        idx1 = chunks.index(chunk)
    if l < mini:
        mini = l
        idx2 = chunks.index(chunk)

print("Max length:", maxi, "at index", idx1)
print("Min length:", mini, "at index", idx2)

Max length: 1242 at index 4
Min length: 385 at index 186


In [39]:
group = {chunks[idx1] , chunks[idx2]}
visualize_chunks(group)

In [40]:
print("Number of tables:", len(tables))
mx = 0 
idx = -1 
for i , table in enumerate(tables):
    if len(table) > mx:
        mx = len(table)
        idx = i
print("Max table length:", mx)
print("at index:", idx)

Number of tables: 0
Max table length: 0
at index: -1


In [32]:
visualize_chunks(tables[idx : idx + 1])

In [41]:
g = [chunks[idx2 - 1], chunks[idx2], chunks[idx2 + 1],chunks[idx2 + 2]]
visualize_chunks(g)