# MTRAGEval - Data Ingestion

This notebook handles data loading and vector store creation for the SemEval 2026 Task 8 system.

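## Steps:
1. Install dependencies and check the environment
2. Configure paths, the embedding model, and chunking parameters
3. Load the mtRAG dataset and apply Parent-Child Chunking
4. Build and persist the Chroma vector store
5. Verify the vector store with a test query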

## 1. Environment Setup

In [None]:
# Install dependencies (run once)
!pip install -q langchain==0.1.10 langchain-community==0.0.25 chromadb==0.4.24 sentence-transformers==2.5.1

In [None]:
import sys
import os

# Put the parent directory first on the import search path. The relative
# entry '../' is resolved against the current working directory.
# NOTE(review): presumably this is the project root, so that the project's
# `src` package can be imported from this notebook — confirm the repo layout.
sys.path.insert(0, '../')

# Verify GPU availability: report whether PyTorch can see a CUDA device and,
# if it can, print the name of device 0.
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Configuration

In [None]:
# Configuration
# All relative paths below are relative to the notebook's working directory.
DATA_PATH = "../data/mtrag_dataset.json"  # Path to your mtRAG JSON file
# Directory handed to build_chroma_vectorstore() as `persist_dir` in the
# (currently commented-out) build cell.
CHROMA_PERSIST_DIR = "../chromadb"
# Embedding model identifier handed to build_chroma_vectorstore() as
# `embedding_model`.
# NOTE(review): presumably a Hugging Face model name — confirm.
EMBEDDING_MODEL = "BAAI/bge-m3"

# Chunking parameters
# Parent chunks are the larger context units; child chunks are the smaller
# retrieval units (see apply_parent_child_chunking).
# NOTE(review): units are not stated here; presumably characters, since
# RecursiveCharacterTextSplitter is imported for chunking — confirm once the
# chunking logic is implemented.
PARENT_CHUNK_SIZE = 1200
PARENT_CHUNK_OVERLAP = 100
CHILD_CHUNK_SIZE = 400
CHILD_CHUNK_OVERLAP = 50

## 3. Load and Chunk Data

In [None]:
import json
import uuid
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def load_mtrag_data(json_path: str):
    """
    Placeholder for reading the mtRAG dataset from a JSON file.

    The loader has not been written yet, so every call fails immediately.

    Args:
        json_path: Location of the mtRAG JSON file (currently unused).

    Raises:
        NotImplementedError: Always, until the loading logic is added.

    TODO: Implement data loading logic
    """
    pending_work = "Implement data loading from JSON"
    raise NotImplementedError(pending_work)


def apply_parent_child_chunking(raw_docs):
    """
    Placeholder for the Parent-Child chunking step.

    Intended strategy (not implemented yet):

    - Split documents into large parent chunks that carry context.
    - Split further into small child chunks for precise retrieval.
    - Store the parent content in each child's metadata.

    Args:
        raw_docs: Documents to be chunked (currently unused).

    Raises:
        NotImplementedError: Always, until the chunking logic is added.

    TODO: Implement chunking logic
    """
    pending_work = "Implement parent-child chunking"
    raise NotImplementedError(pending_work)

In [None]:
# Execute data loading and chunking
# raw_docs = load_mtrag_data(DATA_PATH)
# chunked_docs = apply_parent_child_chunking(raw_docs)
# print(f"Created {len(chunked_docs)} child chunks")

## 4. Build Vector Store

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

def build_chroma_vectorstore(docs, persist_dir, embedding_model):
    """
    Placeholder for building and persisting the Chroma vector store.

    The implementation is still missing, so every call fails immediately.

    Args:
        docs: Documents to index (currently unused).
        persist_dir: Where the store should be persisted (currently unused).
        embedding_model: Embedding model to use (currently unused).

    Raises:
        NotImplementedError: Always, until the vector store creation is added.

    TODO: Implement vector store creation
    """
    pending_work = "Implement Chroma vector store creation"
    raise NotImplementedError(pending_work)

In [None]:
# Build vector store
# vectorstore = build_chroma_vectorstore(chunked_docs, CHROMA_PERSIST_DIR, EMBEDDING_MODEL)
# print("Vector store built and persisted successfully!")

## 5. Verification

In [None]:
# Verify vector store was created
# test_query = "test query"
# results = vectorstore.similarity_search(test_query, k=3)
# print(f"Found {len(results)} results for test query")
# for i, doc in enumerate(results):
#     print(f"\nResult {i+1}:")
#     print(f"  Content: {doc.page_content[:200]}...")
#     print(f"  Has parent: {'parent_content' in doc.metadata}")