In [None]:
# import environment, IO and tokenization helpers
import os
from dotenv import find_dotenv, load_dotenv
import json
import cohere
from urllib.parse import urlparse
from tqdm.notebook import tqdm_notebook

In [None]:
# find the .env file, then load its entries so the API keys are readable from the environment
dotenv_path = find_dotenv(".env")
load_dotenv(dotenv_path)

True

In [None]:
# pull the Cohere API key out of the process environment (None when the variable is unset)
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

In [None]:
# build the Cohere v2 client that the tokenize/detokenize helpers call
cohere_client = cohere.ClientV2(api_key=COHERE_API_KEY)

In [None]:
# load scraped documents from data.json
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying on
# the platform default, which can mis-read non-ASCII page text on some systems.
with open("data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [None]:
# helper to derive category/page/subpage from URL path
def parse_url(url):
    """Derive category / subpage / page name metadata from a URL's path.

    The path is split on "/" and empty segments (from leading, trailing or
    doubled slashes) are discarded.

    Args:
        url: The URL to inspect.

    Returns:
        A dict with the keys:
          * "category"  - first path segment, or None when the path is empty.
          * "subpage"   - the segments between the first and last one joined
                          with "/", or None when there are fewer than three
                          segments.
          * "page_name" - last path segment, or None when the path is empty.
                          For a single-segment path this equals "category".
          * "url"       - the input URL, unchanged.
    """
    # Remove domain & leading/trailing slashes
    path = urlparse(url).path.strip("/")
    # "".split("/") yields [""], never an empty list, so filter out empty
    # segments; otherwise the empty-path guard below could never trigger.
    parts = [segment for segment in path.split("/") if segment]

    if not parts:
        return {"category": None, "page_name": None, "subpage": None, "url": url}

    category = parts[0]
    page_name = parts[-1]
    subpage = "/".join(parts[1:-1]) if len(parts) > 2 else None

    return {
        "category": category,
        "subpage": subpage,
        "page_name": page_name,
        "url": url
    }

In [None]:
# tokenize and detokenize helpers using Cohere
def tokenize(content, model="embed-v4.0"):
    """Tokenize text with the shared Cohere client.

    Args:
        content: The text to tokenize.
        model: Name of the model whose tokenizer is used; defaults to
            "embed-v4.0".

    Returns:
        Whatever ``cohere_client.tokenize`` returns; callers in this notebook
        read its ``.tokens`` attribute.
    """
    return cohere_client.tokenize(text=content, model=model)

def detokenize(content, model="embed-v4.0"):
    """Turn a sequence of tokens back into text with the shared Cohere client.

    Args:
        content: The tokens to convert (despite the name, this is the token
            sequence, not text).
        model: Name of the model whose tokenizer is used; defaults to
            "embed-v4.0". It should match the model used to tokenize.

    Returns:
        Whatever ``cohere_client.detokenize`` returns; callers in this
        notebook read its ``.text`` attribute.
    """
    return cohere_client.detokenize(tokens=content, model=model)

In [None]:
# start with an empty list that will collect one dict per generated chunk
chunks = list()

In [None]:
# split long documents into overlapping token chunks
MAX_TOKENS = 2048      # upper bound on the number of tokens in one chunk
OVERLAP_TOKENS = 150   # tokens shared by two consecutive chunks
# Distance between the starting offsets of two consecutive windows.
STEP_TOKENS = MAX_TOKENS - OVERLAP_TOKENS

# Rebuild the list from scratch so that re-running this cell does not append
# a second copy of every chunk.
chunks = []

for pages in tqdm_notebook(data.values(), "Chunking"):
    for page in pages:
        metadata = parse_url(page['url'])
        tokens = tokenize(page['content']).tokens
        total_tokens = len(tokens)

        if total_tokens <= MAX_TOKENS:
            # Fits in a single chunk: keep the original text untouched instead
            # of round-tripping it through detokenize.
            chunks.append({**metadata, "chunk_id": f"{metadata['page_name']}_1", "content": page['content']})
            continue

        # Slide a MAX_TOKENS-wide window over the document; chunk ids are
        # numbered from 1 within each page.
        for idx, start in enumerate(range(0, total_tokens, STEP_TOKENS), start=1):
            chunk_tokens = tokens[start:start + MAX_TOKENS]
            chunk_text = detokenize(chunk_tokens).text
            chunk_id = f"{metadata['page_name']}_{idx}"
            chunks.append({**metadata, "chunk_id": chunk_id, "content": chunk_text})

            # Once a window reaches the end of the document, any further
            # window would contain only already-emitted overlap tokens.
            if start + MAX_TOKENS >= total_tokens:
                break

Chunking:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# report how many chunks have been collected (bare expression, so the notebook displays the value)
len(chunks)

81

In [None]:
# serialise the collected chunks as 4-space-indented JSON and write them to chunks.json
with open("chunks.json", "w") as out_file:
    out_file.write(json.dumps(chunks, indent=4))

In [None]:
# inspect a sample chunk for verification
# Use the last chunk rather than a fixed position so this cell keeps working
# when the number of generated chunks changes.
chunks[-1]

{'category': 'contact-us',
 'subpage': None,
 'page_name': 'work-with-us',
 'url': 'https://www.sunmarke.com/contact-us/work-with-us/',
 'chunk_id': 'work-with-us_1',
 'content': 'Life at Sunmarke\nSunmarke - Where Growth is a Way of Life\nLife at Sunmarke is an exciting journey in an organisation that believes in transforming children’s lives.\nOur teachers strongly believe in our education philosophy and play the role of nurturing adults who help spark imagination, build self-esteem in a child and help them discover their potential.\nAt Sunmarke, you will join a large family of some of the best teaching talent in the region, handpicked for their inspirational teaching.\nYou will be given the support, dedicated time, and world-class coaching and professional development to allow you to develop your skills and expertise to the highest levels, and keep abreast of the latest developments in teaching.\nAs we grow our schools, you will be given multiple opportunities to take on more senior