In [None]:
# Hybrid Chunking Implementation (Codespaces Ready)


import json
import os
import pickle
import re
from collections import Counter
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

# -------------------------------------------------------------------
# Utility: Auto-detect dataset path (works inside /notebooks folder)
# -------------------------------------------------------------------
def get_dataset_path(filename="wikipedia_knowledge_base_balanced.csv"):
    """Locate *filename* relative to the current working directory.

    The working directory is searched first, then its parent. Under each of
    those roots the sub-folders ``data/raw``, ``data`` and ``.`` are tried in
    that order. The first candidate that exists wins: its resolved location
    is printed and the candidate path is returned as a string.

    Raises:
        FileNotFoundError: if none of the candidate paths exists; the message
            lists every path that was checked.
    """
    here = Path.cwd()
    candidates = []
    for root in (here, here.parent):
        for subdir in ("data/raw", "data", "."):
            candidates.append(root / subdir / filename)

    found = next((candidate for candidate in candidates if candidate.exists()), None)
    if found is None:
        raise FileNotFoundError(f" Dataset '{filename}' not found. Checked: {[str(p) for p in candidates]}")

    print(f" Dataset found: {found.resolve()}")
    return str(found)


In [7]:
# -------------------------------------------------------------------
# Hybrid Chunker Class
# -------------------------------------------------------------------
class HybridChunker:
    """Production-ready hybrid chunking implementation (Codespaces version).

    For every article three kinds of chunks are produced:

    * one ``title_beginning`` chunk (priority ``HIGH``) holding the title,
      the domain and the opening of the article;
    * at most one ``definitions`` chunk (priority ``HIGH``) built from
      "<title> is a ...", "<title> refers to ..." style sentences found in
      the first paragraph;
    * zero or more ``content`` chunks (priority ``MEDIUM``) of roughly 400
      words each, with a two-sentence overlap between consecutive chunks.

    Attributes set by ``process_knowledge_base``:
        chunks: list of all chunk dicts from the last run.
        article_stats: aggregate statistics dict from the last run.
    """

    def __init__(self):
        self.chunks = []
        self.article_stats = {}

    @staticmethod
    def _as_text(value) -> str:
        """Return *value* as a string, mapping ``None``/NaN to ``""``.

        CSV cells that are empty are loaded by pandas as float NaN; slicing
        or regex-escaping such a value would raise ``TypeError``.
        """
        if isinstance(value, str):
            return value
        # ``value != value`` is only true for NaN.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    # -----------------------------
    # Main chunking logic
    # -----------------------------
    def create_hybrid_chunks(self, article_row: pd.Series) -> List[Dict]:
        """Build all chunks for one article.

        Args:
            article_row: row with ``title``, ``content`` and ``domain``
                entries and an optional ``url`` entry.

        Returns:
            The title/intro chunk, then the definitions chunk (if any
            definition was found), then the content chunks.
        """
        title = self._as_text(article_row["title"])
        content = self._as_text(article_row["content"])
        domain = article_row["domain"]
        url = article_row.get("url", "")

        article_chunks = [self._create_title_beginning_chunk(title, content, domain, url)]

        definition_chunk = self._create_definitions_chunk(title, content, domain)
        if definition_chunk:
            article_chunks.append(definition_chunk)

        article_chunks.extend(self._create_content_chunks(title, content, domain, url))
        return article_chunks

    # --- title + intro chunk ---
    def _create_title_beginning_chunk(self, title: str, content: str, domain: str, url: str) -> Dict:
        """Create the HIGH-priority chunk with title, domain and article opening.

        The opening is the first 1000 characters of *content*. When the
        content is longer than that, the trailing (possibly cut-off) sentence
        fragment is removed so the chunk ends on a sentence boundary.
        """
        intro_limit = 1000
        beginning_text = content[:intro_limit]

        # Only trim when the slice actually cut the text; otherwise the last
        # sentence is complete and must be kept.
        if len(content) > intro_limit:
            sentences = re.split(r"(?<=[.!?])\s+", beginning_text)
            if len(sentences) > 1:
                # Each sentence keeps its own terminal punctuation, so join
                # with a plain space (joining with ". " produced "..").
                beginning_text = " ".join(sentences[:-1])

        chunk_text = f"Title: {title}\nDomain: {domain}\n\n{beginning_text}"

        return {
            "text": chunk_text,
            "chunk_type": "title_beginning",
            "priority": "HIGH",
            "metadata": {
                "title": title,
                "domain": domain,
                "url": url,
                "word_count": len(chunk_text.split()),
                "char_count": len(chunk_text),
            },
        }

    # --- definitions chunk ---
    def _create_definitions_chunk(self, title: str, content: str, domain: str) -> Optional[Dict]:
        """Create the HIGH-priority definitions chunk, or ``None`` if none found.

        Only the first paragraph is searched (text up to the first blank
        line, or the first 800 characters when there is no blank line).
        Matching is case-insensitive. At most three definitions are written
        into the chunk text; ``definitions_count`` in the metadata is the
        total number of matches.
        """
        first_paragraph = content.split("\n\n")[0] if "\n\n" in content else content[:800]
        escaped_title = re.escape(title)
        patterns = [
            rf"{escaped_title} is (?:the|a|an) (.+?)(?:\.|,|;)",
            rf"{escaped_title} refers to (.+?)(?:\.|,|;)",
            rf"{escaped_title} (?:means|denotes|represents) (.+?)(?:\.|,|;)",
        ]

        definitions = [
            (title, match.strip())
            for pattern in patterns
            for match in re.findall(pattern, first_paragraph, re.IGNORECASE | re.DOTALL)
        ]

        if not definitions:
            return None

        header = f"Key definitions for {title} ({domain}):\n\n"
        def_text = header + "".join(
            f"• {term}: {definition}\n" for term, definition in definitions[:3]
        )

        return {
            "text": def_text,
            "chunk_type": "definitions",
            "priority": "HIGH",
            "metadata": {
                "title": title,
                "domain": domain,
                "definitions_count": len(definitions),
                "word_count": len(def_text.split()),
                "char_count": len(def_text),
            },
        }

    # --- content chunks ---
    @staticmethod
    def _build_content_chunk(title: str, domain: str, url: str, chunk_sentences: List[str],
                             sentence_start: int, sentence_end: int, word_count: int) -> Dict:
        """Assemble one MEDIUM-priority content chunk dict."""
        chunk_text = " ".join(chunk_sentences)
        return {
            "text": chunk_text,
            "chunk_type": "content",
            "priority": "MEDIUM",
            "metadata": {
                "title": title,
                "domain": domain,
                "url": url,
                "sentence_start": sentence_start,
                "sentence_end": sentence_end,
                "word_count": word_count,
                "char_count": len(chunk_text),
            },
        }

    def _create_content_chunks(self, title: str, content: str, domain: str, url: str) -> List[Dict]:
        """Split *content* into overlapping chunks of about 400 words.

        Sentences of 20 characters or fewer (after stripping) are ignored.
        A chunk is closed when adding the next sentence would exceed the
        target size; the last two sentences of the closed chunk start the
        next one. ``sentence_start``/``sentence_end`` index into the filtered
        sentence list, with ``sentence_end`` exclusive. A trailing chunk is
        emitted only if it holds more than 50 words.
        """
        sentences = re.split(r"(?<=[.!?])\s+", content)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 20]

        chunks, current_chunk, current_wc = [], [], 0
        target_size, overlap = 400, 2

        for i, sentence in enumerate(sentences):
            sw = len(sentence.split())
            if current_wc + sw > target_size and current_chunk:
                chunks.append(self._build_content_chunk(
                    title, domain, url, current_chunk,
                    i - len(current_chunk), i, current_wc,
                ))
                # Carry the last sentences over so neighbouring chunks overlap.
                current_chunk = current_chunk[-overlap:]
                current_wc = sum(len(s.split()) for s in current_chunk)
            current_chunk.append(sentence)
            current_wc += sw

        if current_chunk and current_wc > 50:
            chunks.append(self._build_content_chunk(
                title, domain, url, current_chunk,
                len(sentences) - len(current_chunk), len(sentences), current_wc,
            ))
        return chunks

    # -----------------------------
    # Process all articles
    # -----------------------------
    def process_knowledge_base(self, df: pd.DataFrame) -> Dict:
        """Chunk every row of *df* and collect statistics.

        Each chunk gets ``article_idx`` (the row's index label in *df*) and a
        sequential ``chunk_id``. The results are also stored on the instance
        as ``self.chunks`` and ``self.article_stats``.

        Returns:
            ``{"chunks": [...], "stats": {...}}``.
        """
        print("🚀 Processing Knowledge Base with Hybrid Chunking")
        print("=" * 70)

        all_chunks = []
        total = len(df)
        stats = {
            "total_articles": total,
            "total_chunks": 0,
            "chunk_types": Counter(),
            "priority_distribution": Counter(),
            "domain_stats": {},
        }

        # Progress uses the row position, not the index label: labels may be
        # non-integer or non-contiguous (e.g. after filtering).
        for position, (idx, article) in enumerate(df.iterrows(), start=1):
            print(f" [{position}/{total}] {article['title']}")
            article_chunks = self.create_hybrid_chunks(article)

            for chunk in article_chunks:
                chunk["article_idx"] = idx
                chunk["chunk_id"] = len(all_chunks)
                all_chunks.append(chunk)
                stats["chunk_types"][chunk["chunk_type"]] += 1
                stats["priority_distribution"][chunk["priority"]] += 1

            domain_entry = stats["domain_stats"].setdefault(
                article["domain"], {"articles": 0, "chunks": 0}
            )
            domain_entry["articles"] += 1
            domain_entry["chunks"] += len(article_chunks)

        stats["total_chunks"] = len(all_chunks)
        self.chunks = all_chunks
        self.article_stats = stats
        return {"chunks": all_chunks, "stats": stats}

    # -----------------------------
    # Analysis & save utilities
    # -----------------------------
    def analyze_chunking_results(self):
        """Print a summary of the statistics from the last processing run.

        Prints a short notice and returns if nothing has been processed yet.
        """
        s = self.article_stats
        if not s:
            print(" No chunking results to analyze. Run process_knowledge_base() first.")
            return

        total_articles = s["total_articles"]
        # Guard against an empty knowledge base (0 articles).
        avg_chunks = s["total_chunks"] / total_articles if total_articles else 0.0

        print("\n Hybrid Chunking Results Summary")
        print("=" * 60)
        print(f" Total articles: {total_articles}")
        print(f" Total chunks: {s['total_chunks']}")
        print(f"📏 Avg chunks/article: {avg_chunks:.1f}")
        print("\n🎯 Chunk Type Distribution:")
        for t, c in s["chunk_types"].items():
            print(f"   {t}: {c}")
        print("\n Priority Distribution:")
        for p, c in s["priority_distribution"].items():
            print(f"   {p}: {c}")

    def save_chunks(self, out_path="data/processed/hybrid_chunks.pkl"):
        """Pickle the chunks and statistics to *out_path*.

        Parent directories are created when missing. The pickled object is a
        dict with keys ``chunks``, ``stats`` and ``method`` (``"hybrid"``).

        Returns:
            The output path as a string.
        """
        out_file = Path(out_path)
        out_file.parent.mkdir(parents=True, exist_ok=True)

        data = {
            "chunks": self.chunks,
            "stats": self.article_stats,
            "method": "hybrid",
        }
        with open(out_file, "wb") as f:
            pickle.dump(data, f)
        print(f" Saved hybrid chunks → {out_file.resolve()}")
        return str(out_file)

# -------------------------------------------------------------------
# Runner Function (Codespaces)
# -------------------------------------------------------------------
def run_hybrid_chunking():
    """Run the full step-2 pipeline: load the CSV, chunk it, report, and save.

    The dataset is located with ``get_dataset_path()``, read with pandas,
    processed by a fresh ``HybridChunker``, summarised on stdout and pickled
    via ``save_chunks()`` using its default output path.

    Returns:
        A ``(results, chunker)`` tuple, where ``results`` is the dict
        returned by ``HybridChunker.process_knowledge_base``.
    """
    print(" Step 2: Hybrid Chunking Implementation (Codespaces)")
    print("=" * 70)

    dataset_file = get_dataset_path()
    articles = pd.read_csv(dataset_file)
    print(f" Loaded {len(articles)} articles from {dataset_file}")

    chunker = HybridChunker()
    results = chunker.process_knowledge_base(articles)
    chunker.analyze_chunking_results()
    output_file = chunker.save_chunks()

    closing_lines = (
        f"\n Step 2 Complete! {len(results['chunks'])} chunks created.",
        f" File saved: {output_file}",
        " Ready for Step 3: Embedding Creation",
    )
    for line in closing_lines:
        print(line)

    return results, chunker

# Entry point: run the whole chunking pipeline when this module is the
# top-level script, and keep `results` and `chunker` available as
# module-level names afterwards.
if __name__ == "__main__":
    results, chunker = run_hybrid_chunking()

 Step 2: Hybrid Chunking Implementation (Codespaces)
📂 Dataset found: /workspaces/Rag-Knowledge-Assiatant/data/wikipedia_knowledge_base_balanced.csv
 Loaded 150 articles from /workspaces/Rag-Knowledge-Assiatant/data/wikipedia_knowledge_base_balanced.csv
🚀 Processing Knowledge Base with Hybrid Chunking
 [1/150] Physics
 [2/150] Chemistry
 [3/150] Biology
 [4/150] Quantum mechanics
 [5/150] Thermodynamics
 [6/150] Organic chemistry
 [7/150] Molecular biology
 [8/150] Genetics
 [9/150] Evolution
 [10/150] Mechanical engineering
 [11/150] Electrical engineering
 [12/150] Materials science
 [13/150] Artificial intelligence
 [14/150] Machine learning
 [15/150] Deep learning
 [16/150] Computer science
 [17/150] Software engineering
 [18/150] Data science
 [19/150] Computer security
 [20/150] Blockchain
 [21/150] Python (programming language)
 [22/150] Cloud computing
 [23/150] Quantum computing
 [24/150] Robotics
 [25/150] Mathematics
 [26/150] Calculus
 [27/150] Linear algebra
 [28/150] Stat