# Data Ingestion Module Verification

This notebook tests the `src.ingestion` module, which handles:
- **Loading mtRAG data** from JSONL files
- **Parent-Child Chunking** for optimal retrieval (large chunks for context, small for search)
- **Vector Store Creation** using Qdrant + BGE-M3 embeddings

Uses a **small subset (50 docs)** for fast testing.

In [None]:
import sys
import os
import json
import zipfile

# Resolve the project root once and derive every other path from it.
# Assumes the kernel's working directory sits one level below the root.
PROJECT_ROOT = os.path.abspath("..")

# Make the `src` package importable. The membership check keeps the cell
# idempotent: re-running it must not pile duplicate entries onto sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# On-disk Qdrant directory used only by this notebook (deleted in Cleanup).
QDRANT_PATH = os.path.join(PROJECT_ROOT, "qdrant_ingestion_test")
# Maximum number of corpus lines copied into the test subset.
MAX_DOCS = 50

print(f"Project root: {PROJECT_ROOT}")
print(f"Test subset size: {MAX_DOCS} documents")

## Step 0: Prepare Test Data
Extract the corpus and create a small subset for fast testing.

In [None]:
# Locate the passage-level corpus; unpack it if only the zipped form exists.
corpus_dir = os.path.join(PROJECT_ROOT, "dataset/corpora/passage_level")
jsonl_file = os.path.join(corpus_dir, "govt.jsonl")
zip_file = os.path.join(corpus_dir, "govt.jsonl.zip")

if os.path.exists(jsonl_file):
    print(f"Corpus ready: govt.jsonl")
elif os.path.exists(zip_file):
    print("Extracting corpus...")
    with zipfile.ZipFile(zip_file, 'r') as zf:
        zf.extractall(corpus_dir)
    # The archive is expected to contain govt.jsonl at its top level; fail
    # here with a precise message if it did not.
    if not os.path.exists(jsonl_file):
        raise FileNotFoundError(
            f"Extracted {zip_file} but {jsonl_file} was not produced"
        )
    print("Corpus extracted")
else:
    # Previously this case fell through to "Corpus ready" and only failed
    # later on open(); report the real problem up front instead.
    raise FileNotFoundError(
        f"Corpus not found: expected {jsonl_file} or {zip_file}"
    )

# Create test subset
test_file = os.path.join(PROJECT_ROOT, "data/test_subset.jsonl")
os.makedirs(os.path.dirname(test_file), exist_ok=True)

print(f"Creating test subset with {MAX_DOCS} documents...")
n_written = 0
# Copy in binary mode so the subset is a byte-exact prefix of the corpus and
# the copy does not depend on the platform's default text encoding.
with open(jsonl_file, 'rb') as f_in, open(test_file, 'wb') as f_out:
    for line in f_in:
        if n_written >= MAX_DOCS:
            break
        f_out.write(line)
        n_written += 1

if n_written < MAX_DOCS:
    # Surface a short corpus instead of silently producing a smaller subset.
    print(f"Warning: corpus only had {n_written} lines (requested {MAX_DOCS})")
print(f"Test file created: {test_file}")

## Step 1: Test `load_and_chunk_data()`

This function:
1. Loads documents from JSONL
2. Applies **Parent-Child Chunking**:
   - Parent chunks: 1200 chars (full context for LLM)
   - Child chunks: 400 chars (indexed for search)
3. Stores parent text in child metadata for retrieval

In [None]:
from src.ingestion import load_and_chunk_data

print("Loading and chunking data...")
docs = load_and_chunk_data(test_file)

# Fail with a readable message rather than an IndexError on docs[0] below.
assert len(docs) > 0, f"load_and_chunk_data() returned no chunks for {test_file}"

# Use the number of non-blank lines actually present in the subset for the
# average: the corpus may hold fewer than MAX_DOCS lines, in which case
# dividing by MAX_DOCS would under-report chunks per document.
with open(test_file, 'rb') as subset_fh:
    n_source_docs = sum(1 for raw_line in subset_fh if raw_line.strip())

print(f"\nResults:")
print(f"   • Total chunks created: {len(docs)}")
print(f"   • Avg chunks per document: {len(docs) / max(n_source_docs, 1):.1f}")

print(f"\n📄 Sample chunk:")
sample = docs[0]
print(f"   • Child content (indexed): {sample.page_content[:100]}...")
# 'parent_text' is read defensively; a missing key reports a length of 0.
print(f"   • Parent text length: {len(sample.metadata.get('parent_text', ''))} chars")
print(f"   • Metadata keys: {list(sample.metadata.keys())}")

print("\nload_and_chunk_data() working correctly!")

## Step 2: Test `build_vector_store()`

This function:
1. Creates HuggingFace embeddings (BGE-M3)
2. Initializes Qdrant local database
3. Indexes all chunks with their embeddings

In [None]:
from src.ingestion import build_vector_store

# Only a leading slice of the chunks is indexed, to keep this step quick.
chunk_cap = 30
docs_subset = docs[:chunk_cap]

progress_lines = (
    f"Building vector store with {len(docs_subset)} chunks...",
    "   (Using subset for faster testing)",
)
print(*progress_lines, sep="\n")

# Build the store under the notebook's temporary Qdrant directory.
vectorstore = build_vector_store(docs_subset, persist_dir=QDRANT_PATH)

print("\nbuild_vector_store() working correctly!")

## Step 3: Verify Qdrant Collection

In [None]:
# Query collection metadata through the vector store's underlying client.
# NOTE(review): "mtrag_collection" is presumably the name build_vector_store()
# creates — confirm against src.ingestion.
info = vectorstore.client.get_collection("mtrag_collection")

print(f"Collection Statistics:")
print(f"   • Points (vectors): {info.points_count}")
print(f"   • Status: {info.status}")

# Check something before claiming success: a missing or zero point count
# means the indexing step stored nothing.
assert info.points_count, "mtrag_collection exists but contains no points"

print("\nCollection created and verified!")

## Step 4: Test Similarity Search

Verify that the vector store can find relevant documents.

In [None]:
query = "government regulations"
print(f"Testing search with query: '{query}'")

# Ask for the three nearest chunks together with their scores.
results = vectorstore.similarity_search_with_score(query, k=3)

# An empty result list means the search did not work; do not report success.
assert results, f"similarity search returned no results for query '{query}'"

print(f"\nFound {len(results)} results:")
for rank, (doc, score) in enumerate(results, start=1):
    print(f"\n   Result {rank} (similarity: {score:.4f}):")
    print(f"   • Child chunk: {doc.page_content[:80]}...")
    # Parent text is optional here; a chunk without it is reported, not failed.
    parent = doc.metadata.get('parent_text', '')
    print(f"   • Parent context: {parent[:80]}..." if parent else "   • No parent")

print("\nSimilarity search working correctly!")

## Cleanup
Remove test files after verification.

In [None]:
import shutil

# The client is closed before anything on disk is deleted
# (presumably it keeps the local store open — TODO confirm).
vectorstore.client.close()

# Delete the temporary Qdrant directory if it exists.
qdrant_dir_present = os.path.exists(QDRANT_PATH)
if qdrant_dir_present:
    shutil.rmtree(QDRANT_PATH)
    print(f"Removed test database: {QDRANT_PATH}")

# Delete the JSONL test subset if it is still on disk.
subset_present = os.path.exists(test_file)
if subset_present:
    os.remove(test_file)
    print(f"Removed test subset: {test_file}")

print("\nAll ingestion tests passed!")